@@ -1595,6 +1595,63 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
+static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
+ unsigned long hva)
+{
+ gpa_t gpa_start, gpa_end;
+ hva_t uaddr_start, uaddr_end;
+ size_t size;
+
+ size = memslot->npages * PAGE_SIZE;
+
+ gpa_start = memslot->base_gfn << PAGE_SHIFT;
+ gpa_end = gpa_start + size;
+
+ uaddr_start = memslot->userspace_addr;
+ uaddr_end = uaddr_start + size;
+
+ /*
+ * Pages belonging to memslots that don't have the same alignment
+ * within a PMD for userspace and IPA cannot be mapped with stage-2
+ * PMD entries, because we'll end up mapping the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SIZE:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 PMDs, we'll end up with this incorrect
+ * mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+ return false;
+
+ /*
+ * Next, let's make sure we're not trying to map anything not covered
+ * by the memslot. This means we have to prohibit PMD size mappings
+ * for the beginning and end of a non-PMD aligned and non-PMD sized
+ * memory slot (illustrated by the head and tail parts of the
+ * userspace view above containing pages 'abcde' and 'xyz',
+ * respectively).
+ *
+ * Note that it doesn't matter if we do the check using the
+ * userspace_addr or the base_gfn, as both are equally aligned (per
+ * the check above) and equally sized.
+ */
+ return (hva & S2_PMD_MASK) >= uaddr_start &&
+ (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
@@ -1621,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
+ if (!fault_supports_stage2_pmd_mappings(memslot, hva))
+ force_pte = true;
+
+ if (logging_active)
+ force_pte = true;
+
/* Let's check if we will get back a huge page backed by hugetlbfs */
down_read(¤t->mm->mmap_sem);
vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1637,28 +1700,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
*/
if ((vma_pagesize == PMD_SIZE ||
(vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) &&
- !logging_active) {
+ !force_pte) {
gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
- } else {
- /*
- * Fallback to PTE if it's not one of the Stage 2
- * supported hugepage sizes or the corresponding level
- * doesn't exist
- */
- vma_pagesize = PAGE_SIZE;
-
- /*
- * Pages belonging to memslots that don't have the same
- * alignment for userspace and IPA cannot be mapped using
- * block descriptors even if the pages belong to a THP for
- * the process, because the stage-2 block descriptor will
- * cover more than a single THP and we loose atomicity for
- * unmapping, updates, and splits of the THP or other pages
- * in the stage-2 block range.
- */
- if ((memslot->userspace_addr & ~PMD_MASK) !=
- ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
- force_pte = true;
}
up_read(¤t->mm->mmap_sem);
@@ -1697,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* should not be mapped with huge pages (it introduces churn
* and performance degradation), so force a pte mapping.
*/
- force_pte = true;
flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
/*