@@ -15,6 +15,8 @@
/* Status bits in the gmap segment entry. */
#define _SEGMENT_ENTRY_GMAP_SPLIT 0x0001 /* split huge pmd */
+/* Status bits only for huge segment entries */
+#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* user dirty (migration) */
/**
* struct gmap_struct - guest address space
@@ -151,4 +153,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
int gmap_mprotect_notify(struct gmap *, unsigned long start,
unsigned long len, int prot);
+void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
+ unsigned long gaddr, unsigned long vmaddr);
#endif /* _ASM_S390_GMAP_H */
@@ -1106,6 +1106,10 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
pte_t *sptep, pte_t *tptep, pte_t pte);
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
+void ptep_remove_dirty_protection_split(struct mm_struct *mm, pte_t *ptep,
+ unsigned long vmaddr);
+bool test_and_clear_guest_dirty_split(struct mm_struct *mm, pmd_t *pmdp,
+ unsigned long vmaddr);
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
@@ -511,19 +511,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
}
static void kvm_s390_sync_dirty_log(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ struct kvm_memory_slot *memslot)
{
gfn_t cur_gfn, last_gfn;
- unsigned long address;
+ unsigned long gaddr, vmaddr;
+ unsigned long *dirty = memslot->dirty_bitmap;
struct gmap *gmap = kvm->arch.gmap;
- /* Loop over all guest pages */
+ /* Loop over all guest segments */
last_gfn = memslot->base_gfn + memslot->npages;
- for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
- address = gfn_to_hva_memslot(memslot, cur_gfn);
+ for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES, dirty += 4) {
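+ /*
+ * A segment covers _PAGE_ENTRIES (256) pages and hence
+ * 4 longs of the dirty bitmap.
+ */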
+ gaddr = gfn_to_gpa(cur_gfn);
+ vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
+ if (kvm_is_error_hva(vmaddr))
+ continue;
+
+ gmap_sync_dirty_log_pmd(gmap, dirty, gaddr, vmaddr);
- if (test_and_clear_guest_dirty(gmap->mm, address))
- mark_page_dirty(kvm, cur_gfn);
if (fatal_signal_pending(current))
return;
cond_resched();
@@ -15,6 +15,7 @@
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
+#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -549,6 +550,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pmd_t unprot;
+ pte_t *ptep;
int rc;
BUG_ON(gmap_is_shadow(gmap));
@@ -606,12 +609,29 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
vmaddr >> PMD_SHIFT, table);
if (!rc) {
if (pmd_large(*pmd)) {
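+ /*
+ * Newly linked huge segments start out as user-dirty
+ * so they are reported on the next dirty log sync.
+ */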
- *table = pmd_val(*pmd) &
- _SEGMENT_ENTRY_HARDWARE_BITS_LARGE;
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
+ | _SEGMENT_ENTRY_GMAP_UC;
} else
*table = pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS;
}
+ } else if ((*table & _SEGMENT_ENTRY_PROTECT) &&
+ !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
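+ /*
+ * The host pmd is no longer write-protected, but the
+ * gmap entry still is: drop the protection and flag
+ * the segment as user-dirty.
+ */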
+ unprot = __pmd((*table & (_SEGMENT_ENTRY_HARDWARE_BITS_LARGE
+ & ~_SEGMENT_ENTRY_PROTECT))
+ | _SEGMENT_ENTRY_GMAP_UC);
+ gmap_pmdp_xchg(gmap, (pmd_t *)table, unprot, gaddr);
+ } else if (gmap_pmd_is_split((pmd_t *)table)) {
+ /*
+ * Split pmds are somewhere in-between a normal and a
+ * large pmd. As we don't share the page table, the
+ * host does not remove protection on a fault and we
+ * have to do it ourselves for the guest mapping.
+ */
+ ptep = pte_offset_map((pmd_t *)table, gaddr);
+ if (pte_val(*ptep) & _PAGE_PROTECT)
+ ptep_remove_dirty_protection_split(mm, ptep, vmaddr);
}
spin_unlock(&gmap->guest_table_lock);
spin_unlock(ptl);
@@ -989,6 +1009,113 @@ static int gmap_pmd_split(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp)
return 0;
}
+/**
+ * gmap_pmdp_force_prot - change access rights of a locked pmd
+ * @gmap: pointer to guest address space
+ * @addr: virtual address in the guest address space
+ * @pmdp: pointer to the pmd to be protected
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+static int gmap_pmdp_force_prot(struct gmap *gmap, unsigned long addr,
+ pmd_t *pmdp, int prot)
+{
+ int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
+ int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
+ pmd_t new = *pmdp;
+
+ /* Fixup needed */
+ if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
+ return -EAGAIN;
+
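+ /* Invalidate the entry for PROT_NONE, write-protect it for PROT_READ. */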
+ if (prot == PROT_NONE && !pmd_i) {
+ pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ gmap_pmdp_xchg(gmap, pmdp, new, addr);
+ }
+
+ if (prot == PROT_READ && !pmd_p) {
+ pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
+ pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ gmap_pmdp_xchg(gmap, pmdp, new, addr);
+ }
+ return 0;
+}
+
+/**
+ * gmap_pmdp_transfer_prot - transfer protection of guest pmd to host pmd
+ * @mm: the memory context
+ * @addr: the affected host virtual address
+ * @gpmdp: guest pmd ptr
+ * @hpmdp: host pmd ptr
+ *
+ * Transfers the protection from a guest pmd to the associated host
+ * pmd. This has to be done with a plain idte to circumvent the gmap
+ * invalidation hooks in the standard invalidation functions provided
+ * by pgtable.c.
+ */
+static void gmap_pmdp_transfer_prot(struct mm_struct *mm, unsigned long addr,
+ pmd_t *gpmdp, pmd_t *hpmdp)
+{
+ const int gpmd_i = pmd_val(*gpmdp) & _SEGMENT_ENTRY_INVALID;
+ const int gpmd_p = pmd_val(*gpmdp) & _SEGMENT_ENTRY_PROTECT;
+ const int hpmd_i = pmd_val(*hpmdp) & _SEGMENT_ENTRY_INVALID;
+ const int hpmd_p = pmd_val(*hpmdp) & _SEGMENT_ENTRY_PROTECT;
+ pmd_t new = *hpmdp;
+
+ /* Fastpath, change not needed. */
+ if (hpmd_i || (hpmd_p && gpmd_p) || (!gpmd_i && !gpmd_p))
+ return;
+
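+ /* Fold the guest protection state into the new host pmd value. */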
+ if (gpmd_p && !hpmd_p)
+ pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ if (!gpmd_i && !hpmd_i)
+ pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
+
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(addr, hpmdp,
+ IDTE_NODAT | IDTE_GUEST_ASCE,
+ mm->context.asce, IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(addr, hpmdp, 0, 0,
+ IDTE_GLOBAL);
+ else
+ __pmdp_csp(hpmdp);
+ *hpmdp = new;
+}
+
+/*
+ * gmap_protect_pmd - change access rights of a gmap pmd and the host pmd
+ * @gmap: pointer to guest address space
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: virtual address in the host address space
+ * @pmdp: pointer to the gmap pmd to be protected
+ * @hpmdp: pointer to the associated host pmd
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if successfully protected and -EAGAIN if a fixup is needed.
+ *
+ * Expected to be called with gmap->mm->mmap_sem in read and
+ * guest_table_lock held.
+ */
+static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, pmd_t *pmdp, pmd_t *hpmdp,
+ int prot)
+{
+ int ret = 0;
+
+ /* Protect gmap pmd for dirty tracking. */
+ ret = gmap_pmdp_force_prot(gmap, gaddr, pmdp, prot);
+ /*
+ * Transfer the protection back to the host pmd, so userspace
+ * never has more access rights than the VM.
+ */
+ if (!ret)
+ gmap_pmdp_transfer_prot(gmap->mm, vmaddr, pmdp, hpmdp);
+ return ret;
+}
+
/*
* gmap_protect_pte - remove access rights to memory and set pgste bits
* @gmap: pointer to guest mapping meta data structure
@@ -2477,6 +2604,87 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
}
EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
+/**
+ * gmap_test_and_clear_dirty_segment - test and reset segment dirty status
+ * @gmap: pointer to guest address space
+ * @pmdp: pointer to the gmap pmd to be tested
+ * @hpmdp: pointer to the associated host pmd
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: virtual address in the host address space
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+bool gmap_test_and_clear_dirty_segment(struct gmap *gmap, pmd_t *pmdp,
+ pmd_t *hpmdp, unsigned long gaddr,
+ unsigned long vmaddr)
+{
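+ /* Invalid entries are not mapped for the guest and count as clean. */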
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+ return false;
+
+ /* Already protected memory that has not changed is clean */
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
+ !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
+ return false;
+
+ /* Clear UC indication and reset protection */
+ pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ gmap_protect_pmd(gmap, gaddr, vmaddr, pmdp, hpmdp, PROT_READ);
+ return true;
+}
+
+/**
+ * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
+ * @gmap: pointer to guest address space
+ * @bitmap: dirty bitmap for this pmd
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: virtual address in the host address space
+ *
+ * This function takes the host pmd lock and the guest_table_lock
+ * itself.
+ */
+void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
+ unsigned long gaddr, unsigned long vmaddr)
+{
+ int i = 0;
+ pmd_t *pmdp, *hpmdp, fpmd;
+ spinlock_t *ptl;
+
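+ /* Look up and lock the host pmd before walking to the gmap pmd. */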
+ hpmdp = (pmd_t *)huge_pte_offset(gmap->mm, vmaddr, HPAGE_SIZE);
+ if (!hpmdp)
+ return;
+ ptl = pmd_lock(gmap->mm, hpmdp);
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp) {
+ spin_unlock(ptl);
+ return;
+ }
+
+ if (pmd_large(*pmdp)) {
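+ /*
+ * A dirty huge segment dirties all 256 pages covered
+ * by it, i.e. the full 32 byte bitmap.
+ */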
+ if (gmap_test_and_clear_dirty_segment(gmap, pmdp, hpmdp,
+ gaddr, vmaddr))
+ memset(bitmap, 0xff, 32);
+ } else {
+ /*
+ * Not a huge gmap segment: the dirty state is tracked per
+ * page in the pgstes of the mm page tables.
+ */
+ if (unlikely(gmap_pmd_is_split(pmdp))) {
+ for (; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
+ if (test_and_clear_guest_dirty_split(gmap->mm, pmdp, vmaddr))
+ set_bit_le(i, bitmap);
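+ /*
+ * Re-apply the protection to the host pmd so userspace
+ * never has more access rights than the guest mapping.
+ */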
+ fpmd = *hpmdp;
+ pmd_val(fpmd) |= _SEGMENT_ENTRY_PROTECT;
+ gmap_pmdp_transfer_prot(gmap->mm, vmaddr,
+ &fpmd, hpmdp);
+ }
+ } else {
+ for (; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
+ if (test_and_clear_guest_dirty(gmap->mm, vmaddr))
+ set_bit_le(i, bitmap);
+ }
+ }
+ }
+ gmap_pmd_op_end(gmap, pmdp);
+ spin_unlock(ptl);
+}
+EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
+
static inline void thp_split_mm(struct mm_struct *mm)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -705,6 +705,57 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
preempt_enable();
}
+void ptep_remove_dirty_protection_split(struct mm_struct *mm,
+ pte_t *ptep, unsigned long vmaddr)
+{
+ pte_t unprot = __pte(pte_val(*ptep) & ~_PAGE_PROTECT);
+ pgste_t pgste;
+ unsigned long bits;
+
+ pgste = pgste_get_lock(ptep);
+ pgste_val(pgste) |= PGSTE_UC_BIT;
+
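+ /* Notify the gmap and flush the old pte before installing the writable one. */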
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ pgste_val(pgste) ^= bits;
+ ptep_notify_gmap(mm, vmaddr, ptep, bits);
+ ptep_ipte_global(mm, vmaddr, ptep, 0);
+
+ *ptep = unprot;
+ pgste_set_unlock(ptep, pgste);
+}
+EXPORT_SYMBOL_GPL(ptep_remove_dirty_protection_split);
+
+bool test_and_clear_guest_dirty_split(struct mm_struct *mm, pmd_t *pmdp,
+ unsigned long vmaddr)
+{
+ bool dirty;
+ pte_t *ptep, pte;
+ pgste_t pgste;
+ unsigned long bits;
+
+ ptep = pte_offset_map(pmdp, vmaddr);
+ pgste = pgste_get_lock(ptep);
+ dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+ pgste_val(pgste) &= ~PGSTE_UC_BIT;
+ pte = *ptep;
+ if (dirty) {
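+ /*
+ * Notify the gmap, flush the pte and protect or
+ * invalidate it so the next write is caught again.
+ */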
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify_gmap(mm, vmaddr, ptep, bits);
+ }
+ ptep_ipte_global(mm, vmaddr, ptep, 0);
+ if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+ pte_val(pte) |= _PAGE_PROTECT;
+ else
+ pte_val(pte) |= _PAGE_INVALID;
+ *ptep = pte;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return dirty;
+}
+EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty_split);
+
/*
* Test and reset if a guest page is dirty
*/
@@ -731,12 +782,6 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return false;
- /* We can't run guests backed by huge pages, but userspace can
- * still set them up and then try to migrate them without any
- * migration support.
- */
- if (pmd_large(*pmd))
- return true;
ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (unlikely(!ptep))