@@ -900,6 +900,18 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
}
/*
+ * PTE_LIST_SPTE_SKIP is a placeholder that is set on the pte-list after
+ * removing a spte, so that the other sptes on this pte_list are not moved
+ * and the pte-list-descs on the pte-list are not freed.
+ *
+ * If vcpu can not add spte to the pte-list (e.g. the rmap on invalid
+ * memslot) and spte can not be freed during pte-list walk, we can
+ * concurrently clear sptes on the pte-list; the worst case is that we
+ * double zap a spte, which is safe.
+ */
+#define PTE_LIST_SPTE_SKIP (u64 *)((~0x0ul) & (~1))
+
+/*
* Pte mapping structures:
*
* If pte_list bit zero is zero, then pte_list point to the spte.
@@ -1003,6 +1015,40 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
}
}
+static void pte_list_clear_concurrently(u64 *spte, unsigned long *pte_list)
+{
+ struct pte_list_desc *desc;
+ unsigned long pte_value = *pte_list;
+ int i;
+
+ /* Empty pte list stores nothing. */
+ WARN_ON(!pte_value);
+
+ if (!(pte_value & 1)) {
+ if ((u64 *)pte_value == spte) {
+ *pte_list = (unsigned long)PTE_LIST_SPTE_SKIP;
+ return;
+ }
+
+ /* someone has already cleared it. */
+ WARN_ON(pte_value != (unsigned long)PTE_LIST_SPTE_SKIP);
+ return;
+ }
+
+ desc = (struct pte_list_desc *)(pte_value & ~1ul);
+ while (desc) {
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+ if (desc->sptes[i] == spte) {
+ desc->sptes[i] = PTE_LIST_SPTE_SKIP;
+ return;
+ }
+
+ desc = desc->more;
+ }
+
+ return;
+}
+
typedef void (*pte_list_walk_fn) (u64 *spte);
static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
@@ -1214,6 +1260,12 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
return false;
}
+/* PTE_LIST_SPTE_SKIP is only used on invalid rmap. */
+static void check_valid_sptep(u64 *sptep)
+{
+ WARN_ON(sptep == PTE_LIST_SPTE_SKIP || !is_rmap_spte(*sptep));
+}
+
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
bool pt_protect)
{
@@ -1222,7 +1274,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
bool flush = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ check_valid_sptep(sptep);
if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
sptep = rmap_get_first(*rmapp, &iter);
continue;
@@ -1293,7 +1345,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
int need_tlb_flush = 0;
while ((sptep = rmap_get_first(*rmapp, &iter))) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ check_valid_sptep(sptep);
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
drop_spte(kvm, sptep);
@@ -1322,7 +1374,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_pfn = pte_pfn(*ptep);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+ check_valid_sptep(sptep);
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
need_flush = 1;
@@ -1455,7 +1507,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+ check_valid_sptep(sptep);
if (*sptep & shadow_accessed_mask) {
young = 1;
@@ -1493,7 +1545,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+ check_valid_sptep(sptep);
if (*sptep & shadow_accessed_mask) {
young = 1;
This patch introduces PTE_LIST_SPTE_SKIP, a placeholder that is set on the pte-list after removing a spte, so that the other sptes on this pte_list are not moved and the pte-list-descs on the pte-list are not freed. If a vcpu can not add a spte to the pte-list (e.g. the rmap on an invalid memslot) and a spte can not be freed during a pte-list walk, we can concurrently clear sptes on the pte-list; the worst case is that we double zap a spte, which is safe. This patch only ensures that concurrently zapping a pte-list is safe; we will keep sptes available during concurrent clearing in later patches. Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> --- arch/x86/kvm/mmu.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 57 insertions(+), 5 deletions(-)