From patchwork Mon Aug 30 23:59:09 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466177 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 71944C4320A for ; Tue, 31 Aug 2021 00:01:20 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id C222760698 for ; Tue, 31 Aug 2021 00:01:19 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org C222760698 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 6077 invoked by uid 550); 31 Aug 2021 00:00:24 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 5970 invoked from network); 31 Aug 2021 00:00:22 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933703" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933703" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712796" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 01/19] list: Support getting most recent element in list_lru Date: Mon, 30 Aug 2021 16:59:09 -0700 Message-Id: <20210830235927.6443-2-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> In future patches, some functionality will use list_lru that also needs to keep track of the most recently used element on a node. Since this information is already contained within list_lru, add a function to get it so that an additional list is not needed in the caller. Do not support memcg aware list_lru's since it is not needed by the intended caller. Signed-off-by: Rick Edgecombe --- include/linux/list_lru.h | 13 +++++++++++++ mm/list_lru.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 1b5fceb565df..08e07c19fd13 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -103,6 +103,19 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); */ bool list_lru_del(struct list_lru *lru, struct list_head *item); +/** + * list_lru_get_mru: gets and removes the tail from one of the node lists + * @list_lru: the lru pointer + * @nid: the node id + * + * This function removes the most recently added item from one of the node + * id specified. This function should not be used if the list_lru is memcg + * aware. 
+ * + * Return value: The element removed + */ +struct list_head *list_lru_get_mru(struct list_lru *lru, int nid); + /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. diff --git a/mm/list_lru.c b/mm/list_lru.c index cd58790d0fb3..c1bec58168e1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -156,6 +156,34 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +struct list_head *list_lru_get_mru(struct list_lru *lru, int nid) +{ + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l = &nlru->lru; + struct list_head *ret; + + /* This function does not attempt to search through the memcg lists */ + if (list_lru_memcg_aware(lru)) { + WARN_ONCE(1, "list_lru: %s not supported on memcg aware list_lrus", __func__); + return NULL; + } + + spin_lock(&nlru->lock); + if (list_empty(&l->list)) { + ret = NULL; + } else { + /* Get tail */ + ret = l->list.prev; + list_del_init(ret); + + l->nr_items--; + nlru->nr_items--; + } + spin_unlock(&nlru->lock); + + return ret; +} + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); From patchwork Mon Aug 30 23:59:10 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466175 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 55460C43214 for ; Tue, 31 Aug 2021 00:01:12 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id A7A1F60E98 for ; Tue, 31 Aug 2021 00:01:11 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org A7A1F60E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 6010 invoked by uid 550); 31 Aug 2021 00:00:23 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 5940 invoked from network); 31 Aug 2021 00:00:21 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933705" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933705" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712805" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 02/19] list: Support list head not in object for list_lru Date: Mon, 30 Aug 2021 16:59:10 -0700 Message-Id: <20210830235927.6443-3-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: 
<20210830235927.6443-1-rick.p.edgecombe@intel.com> In future patches, there will be a need to keep track of objects with list_lru where the list_head is not in the object (will be in struct page). Since list_lru automatically determines the node id from the list_head, this will fail when using struct page. So create a new function in list_lru, list_lru_add_node(), that allows the node id of the item to be passed in. Otherwise it behaves exactly like list_lru_add(). Signed-off-by: Rick Edgecombe --- include/linux/list_lru.h | 13 +++++++++++++ mm/list_lru.c | 10 ++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 08e07c19fd13..42c22322058b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -90,6 +90,19 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg); */ bool list_lru_add(struct list_lru *lru, struct list_head *item); +/** + * list_lru_add_node: add an element to the lru list's tail + * @list_lru: the lru pointer + * @item: the item to be added. + * @nid: the node id of the item + * + * Like list_lru_add, but takes the node id as parameter instead of + * calculating it from the list_head passed in. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add_node(struct list_lru *lru, struct list_head *item, int nid); + /** * list_lru_del: delete an element to the lru list * @list_lru: the lru pointer diff --git a/mm/list_lru.c b/mm/list_lru.c index c1bec58168e1..f35f11ada8a1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -112,9 +112,8 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, } #endif /* CONFIG_MEMCG_KMEM */ -bool list_lru_add(struct list_lru *lru, struct list_head *item) +bool list_lru_add_node(struct list_lru *lru, struct list_head *item, int nid) { - int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; struct mem_cgroup *memcg; struct list_lru_one *l; @@ -134,6 +133,13 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) spin_unlock(&nlru->lock); return false; } + +bool list_lru_add(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + + return list_lru_add_node(lru, item, nid); +} EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_del(struct list_lru *lru, struct list_head *item) From patchwork Mon Aug 30 23:59:11 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466185 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8004DC432BE for ; Tue, 31 Aug 2021 00:01:59 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id AA87560E98 for ; Tue, 31 Aug 2021 00:01:58 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org AA87560E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com 
Received: (qmail 7399 invoked by uid 550); 31 Aug 2021 00:00:32 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7220 invoked from network); 31 Aug 2021 00:00:28 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933707" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933707" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712809" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 03/19] x86/mm/cpa: Add grouped page allocations Date: Mon, 30 Aug 2021 16:59:11 -0700 Message-Id: <20210830235927.6443-4-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> For x86, setting memory permissions on the direct map results in fracturing large pages. Direct map fracturing can be reduced by locating pages that will have their permissions set close together. Create a simple page cache that allocates pages from huge page size blocks. Don't guarantee that a page will come from a huge page grouping, instead fallback to non-grouped pages to fulfill the allocation if needed. Also, register a shrinker such that the system can ask for the pages back if needed. Since this is only needed when there is a direct map, compile it out on highmem systems. Free pages in the cache are kept track of in per-node list inside a list_lru. NUMA_NO_NODE requests are serviced by checking each per-node list in a round robin fashion. If pages are requested for a certain node but the cache is empty for that node, a whole additional huge page size page is allocated. 
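As a quick orientation before the diff, here is how a caller would use the interface this patch adds. The API names (struct grouped_page_cache, init_grouped_page_cache(), get_grouped_page(), free_grouped_page()) are the ones introduced below; the demo_* identifiers are hypothetical and the sketch assumes <asm/set_memory.h> is included.

static struct grouped_page_cache demo_cache;

static int __init demo_cache_init(void)
{
	/* Also registers a shrinker so the cached pages can be reclaimed. */
	return init_grouped_page_cache(&demo_cache, GFP_KERNEL);
}

static struct page *demo_alloc(int nid)
{
	/*
	 * NUMA_NO_NODE walks the per-node lists round robin; a specific node
	 * uses that node's list and replenishes it from a huge page sized
	 * block (falling back to a single page) when the list is empty.
	 */
	return get_grouped_page(nid, &demo_cache);
}

static void demo_free(struct page *page)
{
	/* Returns the page to the per-node list keyed by page_to_nid(). */
	free_grouped_page(&demo_cache, page);
}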
Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/set_memory.h | 14 +++ arch/x86/mm/pat/set_memory.c | 156 ++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 43fa081a1adb..6e897ab91b77 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -4,6 +4,9 @@ #include #include +#include +#include +#include /* * The set_memory_* API can be used to change various attributes of a virtual @@ -135,4 +138,15 @@ static inline int clear_mce_nospec(unsigned long pfn) */ #endif +struct grouped_page_cache { + struct shrinker shrinker; + struct list_lru lru; + gfp_t gfp; + atomic_t nid_round_robin; +}; + +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp); +struct page *get_grouped_page(int node, struct grouped_page_cache *gpc); +void free_grouped_page(struct grouped_page_cache *gpc, struct page *page); + #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ad8a5c586a35..e9527811f476 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2314,6 +2314,162 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, return retval; } +#ifndef HIGHMEM +static struct page *__alloc_page_order(int node, gfp_t gfp_mask, int order) +{ + if (node == NUMA_NO_NODE) + return alloc_pages(gfp_mask, order); + + return alloc_pages_node(node, gfp_mask, order); +} + +static struct grouped_page_cache *__get_gpc_from_sc(struct shrinker *shrinker) +{ + return container_of(shrinker, struct grouped_page_cache, shrinker); +} + +static unsigned long grouped_shrink_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct grouped_page_cache *gpc = __get_gpc_from_sc(shrinker); + unsigned long page_cnt = list_lru_shrink_count(&gpc->lru, sc); + + return page_cnt ? 
page_cnt : SHRINK_EMPTY; +} + +static enum lru_status grouped_isolate(struct list_head *item, + struct list_lru_one *list, + spinlock_t *lock, void *cb_arg) +{ + struct list_head *dispose = cb_arg; + + list_lru_isolate_move(list, item, dispose); + + return LRU_REMOVED; +} + +static void __dispose_pages(struct grouped_page_cache *gpc, struct list_head *head) +{ + struct list_head *cur, *next; + + list_for_each_safe(cur, next, head) { + struct page *page = list_entry(head, struct page, lru); + + list_del(cur); + + __free_pages(page, 0); + } +} + +static unsigned long grouped_shrink_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct grouped_page_cache *gpc = __get_gpc_from_sc(shrinker); + unsigned long isolated; + LIST_HEAD(freeable); + + if (!(sc->gfp_mask & gpc->gfp)) + return SHRINK_STOP; + + isolated = list_lru_shrink_walk(&gpc->lru, sc, grouped_isolate, + &freeable); + __dispose_pages(gpc, &freeable); + + /* Every item walked gets isolated */ + sc->nr_scanned += isolated; + + return isolated; +} + +static struct page *__remove_first_page(struct grouped_page_cache *gpc, int node) +{ + unsigned int start_nid, i; + struct list_head *head; + + if (node != NUMA_NO_NODE) { + head = list_lru_get_mru(&gpc->lru, node); + if (head) + return list_entry(head, struct page, lru); + return NULL; + } + + /* If NUMA_NO_NODE, search the nodes in round robin for a page */ + start_nid = (unsigned int)atomic_fetch_inc(&gpc->nid_round_robin) % nr_node_ids; + for (i = 0; i < nr_node_ids; i++) { + int cur_nid = (start_nid + i) % nr_node_ids; + + head = list_lru_get_mru(&gpc->lru, cur_nid); + if (head) + return list_entry(head, struct page, lru); + } + + return NULL; +} + +/* Get and add some new pages to the cache to be used by VM_GROUP_PAGES */ +static struct page *__replenish_grouped_pages(struct grouped_page_cache *gpc, int node) +{ + const unsigned int hpage_cnt = HPAGE_SIZE >> PAGE_SHIFT; + struct page *page; + int i; + + page = __alloc_page_order(node, gpc->gfp, HUGETLB_PAGE_ORDER); + if (!page) + return __alloc_page_order(node, gpc->gfp, 0); + + split_page(page, HUGETLB_PAGE_ORDER); + + for (i = 1; i < hpage_cnt; i++) + free_grouped_page(gpc, &page[i]); + + return &page[0]; +} + +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp) +{ + int err = 0; + + memset(gpc, 0, sizeof(struct grouped_page_cache)); + + if (list_lru_init(&gpc->lru)) + goto out; + + gpc->shrinker.count_objects = grouped_shrink_count; + gpc->shrinker.scan_objects = grouped_shrink_scan; + gpc->shrinker.seeks = DEFAULT_SEEKS; + gpc->shrinker.flags = SHRINKER_NUMA_AWARE; + + err = register_shrinker(&gpc->shrinker); + if (err) + list_lru_destroy(&gpc->lru); + +out: + return err; +} + +struct page *get_grouped_page(int node, struct grouped_page_cache *gpc) +{ + struct page *page; + + if (in_interrupt()) { + pr_warn_once("grouped pages: Cannot allocate grouped page in interrupt"); + return NULL; + } + + page = __remove_first_page(gpc, node); + + if (page) + return page; + + return __replenish_grouped_pages(gpc, node); +} + +void free_grouped_page(struct grouped_page_cache *gpc, struct page *page) +{ + INIT_LIST_HEAD(&page->lru); + list_lru_add_node(&gpc->lru, &page->lru, page_to_nid(page)); +} +#endif /* !HIGHMEM */ /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. 
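One nit in the hunks above: __dispose_pages() takes list_entry() on the list head rather than on the cursor, so every iteration computes the same bogus page pointer. Assuming the intent is to free each page that grouped_isolate() moved onto the dispose list, a corrected sketch would be:

static void __dispose_pages(struct grouped_page_cache *gpc, struct list_head *head)
{
	struct list_head *cur, *next;

	list_for_each_safe(cur, next, head) {
		/* Derive the page from the current entry, not the list head. */
		struct page *page = list_entry(cur, struct page, lru);

		list_del(cur);

		__free_pages(page, 0);
	}
}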
From patchwork Mon Aug 30 23:59:12 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466183 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id C8D69C432BE for ; Tue, 31 Aug 2021 00:01:47 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 2496560E98 for ; Tue, 31 Aug 2021 00:01:46 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 2496560E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7339 invoked by uid 550); 31 Aug 2021 00:00:31 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 6126 invoked from network); 31 Aug 2021 00:00:26 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933709" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933709" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712816" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 04/19] mm: Explicitly zero page table lock ptr Date: Mon, 30 Aug 2021 16:59:12 -0700 Message-Id: <20210830235927.6443-5-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> In ptlock_init() there is a VM_BUG_ON_PAGE() check on the page table lock pointer. Explicitly zero the lock in ptlock_free() so a page table lock can be re-initialized without triggering the BUG_ON(). It appears this doesn't normally trigger because the private field shares the same space in struct page as ptl and page tables always return to the buddy allocator before being re-initialized as new page tables. When the page returns to the buddy allocator, private gets used to store the page order, so it inadvertently clears ptl as well. In future patches, pages will get re-initialized as page tables without returning to the buddy allocator so this is needed. 
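For reference, this is the check being discussed, roughly as it reads in include/linux/mm.h at the time of this series (USE_SPLIT_PTE_PTLOCKS with a kmem_cache-backed ptl); it is what would fire if a recycled page still carried a stale ptl pointer:

static inline bool ptlock_init(struct page *page)
{
	/* Fires if page->ptl (which shares its word with page->private) is non-zero. */
	VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
	if (!ptlock_alloc(page))
		return false;
	spin_lock_init(ptlock_ptr(page));
	return true;
}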
Signed-off-by: Rick Edgecombe --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory.c b/mm/memory.c index 25fc46e87214..e6d630463c7f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5465,5 +5465,6 @@ bool ptlock_alloc(struct page *page) void ptlock_free(struct page *page) { kmem_cache_free(page_ptl_cachep, page->ptl); + page->ptl = 0; } #endif From patchwork Mon Aug 30 23:59:13 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466187 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3E35EC432BE for ; Tue, 31 Aug 2021 00:02:11 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 2B29060F4B for ; Tue, 31 Aug 2021 00:02:09 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 2B29060F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7514 invoked by uid 550); 31 Aug 2021 00:00:35 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7271 invoked from network); 31 Aug 2021 00:00:29 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933710" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933710" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712827" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 05/19] x86, mm: Use cache of page tables Date: Mon, 30 Aug 2021 16:59:13 -0700 Message-Id: <20210830235927.6443-6-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Change the page table allocation functions defined in pgalloc.h to use a cache of physically grouped pages. This will let the page tables be set with PKS permissions later. For userspace page tables, they are gathered up using mmu gather, and freed along with other types of pages in swap.c. Move setting/clearing of the PageTable page flag to the allocators so that swap can know to return this page to the cache of page tables, and not free it to the page allocator. Where it currently is, in the ctor/dtors, causes it to be cleared before the page gets to swap. Do not set PKS permissions on the page tables, because the page table setting functions cannot handle it yet. This will be done in later patches. 
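The net effect on a single PTE page is easier to see as a round trip. The function below is a hypothetical illustration (assuming x86-64, CONFIG_PKS_PG_TABLES and an initialized cache), but the calls it makes are the real entry points this patch reroutes.

static int table_cache_roundtrip(struct mm_struct *mm)
{
	/*
	 * __pte_alloc_one() -> alloc_table(): the page comes from
	 * get_grouped_page() and is marked PageTable by the allocator,
	 * no longer by pgtable_pte_page_ctor().
	 */
	pgtable_t pte = pte_alloc_one(mm);

	if (!pte)
		return -ENOMEM;

	/*
	 * pgtable_pte_page_dtor() no longer clears PageTable; free_table()
	 * does, and then hands the page back to the grouped cache rather
	 * than the buddy allocator.
	 */
	pte_free(mm, pte);

	return 0;
}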
Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/pgalloc.h | 6 ++- arch/x86/include/asm/pgtable.h | 6 +++ arch/x86/mm/pgtable.c | 79 ++++++++++++++++++++++++++++++++++ include/asm-generic/pgalloc.h | 44 ++++++++++++++----- include/linux/mm.h | 11 +++-- mm/swap.c | 6 +++ mm/swap_state.c | 5 +++ 7 files changed, 142 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334..1ff308ea76cd 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -7,6 +7,10 @@ #include #define __HAVE_ARCH_PTE_ALLOC_ONE +#ifdef CONFIG_PKS_PG_TABLES +#define __HAVE_ARCH_FREE_TABLE +#define __HAVE_ARCH_ALLOC_TABLE +#endif #define __HAVE_ARCH_PGD_FREE #include @@ -162,7 +166,7 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) return; BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); + free_table(virt_to_page(p4d)); } extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..3c119ef49062 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -117,6 +117,12 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +#ifdef CONFIG_PKS_PG_TABLES +bool pks_tables_inited(void); +#else /* CONFIG_PKS_PG_TABLES */ +#define pks_tables_inited() 0 +#endif /* CONFIG_PKS_PG_TABLES */ + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..81b767a5d6ef 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; @@ -33,6 +35,55 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return __pte_alloc_one(mm, __userpte_alloc_gfp); } +#ifdef CONFIG_PKS_PG_TABLES +static struct grouped_page_cache gpc_pks; +static bool __ro_after_init pks_tables_inited_val; + + +struct page *alloc_table(gfp_t gfp) +{ + struct page *table; + + if (!pks_tables_inited()) { + table = alloc_page(gfp); + if (table) + __SetPageTable(table); + return table; + } + + table = get_grouped_page(numa_node_id(), &gpc_pks); + if (!table) + return NULL; + __SetPageTable(table); + + if (gfp & __GFP_ZERO) + memset(page_address(table), 0, PAGE_SIZE); + + if (memcg_kmem_enabled() && + gfp & __GFP_ACCOUNT && + !__memcg_kmem_charge_page(table, gfp, 0)) { + free_table(table); + return NULL; + } + + return table; +} + +void free_table(struct page *table_page) +{ + __ClearPageTable(table_page); + + if (!pks_tables_inited()) { + __free_pages(table_page, 0); + return; + } + + if (memcg_kmem_enabled() && PageMemcgKmem(table_page)) + __memcg_kmem_uncharge_page(table_page, 0); + free_grouped_page(&gpc_pks, table_page); +} +#endif /* CONFIG_PKS_PG_TABLES */ + static int __init setup_userpte(char *arg) { if (!arg) @@ -411,12 +462,24 @@ static inline void _pgd_free(pgd_t *pgd) static inline pgd_t *_pgd_alloc(void) { + if (pks_tables_inited()) { + struct page *page = alloc_table(GFP_PGTABLE_USER); + + if (!page) + return NULL; + return page_address(page); + } + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { + if (pks_tables_inited()) { + free_table(virt_to_page(pgd)); + return; + } 
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ @@ -851,6 +914,22 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } +#ifdef CONFIG_PKS_PG_TABLES +bool pks_tables_inited(void) +{ + return pks_tables_inited_val; +} + +static int __init pks_page_init(void) +{ + pks_tables_inited_val = !init_grouped_page_cache(&gpc_pks, GFP_KERNEL | PGTABLE_HIGHMEM); + +out: + return !pks_tables_inited_val; +} + +device_initcall(pks_page_init); +#endif /* CONFIG_PKS_PG_TABLES */ #else /* !CONFIG_X86_64 */ /* diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 02932efad3ab..e576c19abc8c 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -2,11 +2,26 @@ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H +#include + #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) +#ifndef __HAVE_ARCH_ALLOC_TABLE +static inline struct page *alloc_table(gfp_t gfp) +{ + return alloc_page(gfp); +} +#else /* __HAVE_ARCH_ALLOC_TABLE */ +extern struct page *alloc_table(gfp_t gfp); +#endif /* __HAVE_ARCH_ALLOC_TABLE */ + +#ifdef __HAVE_ARCH_FREE_TABLE +extern void free_table(struct page *); +#endif /* __HAVE_ARCH_FREE_TABLE */ + /** * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table * @mm: the mm_struct of the current context @@ -18,7 +33,12 @@ */ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL); + struct page *page = alloc_table(GFP_PGTABLE_KERNEL); + + if (!page) + return NULL; + + return (pte_t *)page_address(page); } #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL @@ -41,7 +61,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long)pte); + free_table(virt_to_page(pte)); } /** @@ -60,11 +80,11 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) { struct page *pte; - pte = alloc_page(gfp); + pte = alloc_table(gfp); if (!pte) return NULL; if (!pgtable_pte_page_ctor(pte)) { - __free_page(pte); + free_table(pte); return NULL; } @@ -99,7 +119,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { pgtable_pte_page_dtor(pte_page); - __free_page(pte_page); + free_table(pte_page); } @@ -123,11 +143,11 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - page = alloc_pages(gfp, 0); + page = alloc_table(gfp); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + free_table(page); return NULL; } return (pmd_t *)page_address(page); @@ -139,7 +159,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pgtable_pmd_page_dtor(virt_to_page(pmd)); - free_page((unsigned long)pmd); + free_table(virt_to_page(pmd)); } #endif @@ -160,17 +180,21 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; + struct page *table; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - return (pud_t *)get_zeroed_page(gfp); + table = alloc_table(gfp); + if (!table) + return NULL; + return (pud_t *)page_address(table); } #endif static inline void pud_free(struct mm_struct *mm, pud_t 
*pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); + free_table(virt_to_page(pud)); } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/include/linux/mm.h b/include/linux/mm.h index c13c7af7cad3..ab63d5a201cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2327,6 +2327,13 @@ static inline bool ptlock_init(struct page *page) { return true; } static inline void ptlock_free(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ +#ifndef CONFIG_PKS_PG_TABLES +static inline void free_table(struct page *table_page) +{ + __free_pages(table_page, 0); +} +#endif /* CONFIG_PKS_PG_TABLES */ + static inline void pgtable_init(void) { ptlock_cache_init(); @@ -2337,7 +2344,6 @@ static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; - __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2345,7 +2351,6 @@ static inline bool pgtable_pte_page_ctor(struct page *page) static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); - __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } @@ -2432,7 +2437,6 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) { if (!pmd_ptlock_init(page)) return false; - __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2440,7 +2444,6 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) static inline void pgtable_pmd_page_dtor(struct page *page) { pmd_ptlock_free(page); - __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } diff --git a/mm/swap.c b/mm/swap.c index 19600430e536..234bb339ad57 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" @@ -937,6 +938,11 @@ void release_pages(struct page **pages, int nr) continue; } + if (PageTable(page)) { + free_table(page); + continue; + } + if (!put_page_testzero(page)) continue; diff --git a/mm/swap_state.c b/mm/swap_state.c index c56aa9ac050d..49f267a5f05c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" /* @@ -301,6 +302,10 @@ void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); + if (PageTable(page)) { + free_table(page); + return; + } if (!is_huge_zero_page(page)) put_page(page); } From patchwork Mon Aug 30 23:59:14 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466171 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id AE3E8C432BE for ; Tue, 31 Aug 2021 00:00:57 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id DDCE160FC0 for ; Tue, 31 Aug 2021 00:00:56 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org DDCE160FC0 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass 
smtp.mailfrom=lists.openwall.com Received: (qmail 5945 invoked by uid 550); 31 Aug 2021 00:00:21 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 5896 invoked from network); 31 Aug 2021 00:00:19 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933712" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933712" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712846" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 06/19] x86/mm/cpa: Add perm callbacks to grouped pages Date: Mon, 30 Aug 2021 16:59:14 -0700 Message-Id: <20210830235927.6443-7-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Future patches will need to set permissions on pages in the cache, so add some callbacks that let gouped page cache callers provide callbacks the grouped pages can use when replenishing the cache or freeing pages via the shrinker. Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/set_memory.h | 8 ++++++- arch/x86/mm/pat/set_memory.c | 38 ++++++++++++++++++++++++++++--- arch/x86/mm/pgtable.c | 3 ++- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 6e897ab91b77..eaac7e3e08bf 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -138,14 +138,20 @@ static inline int clear_mce_nospec(unsigned long pfn) */ #endif +typedef int (*gpc_callback)(struct page*, unsigned int); + struct grouped_page_cache { struct shrinker shrinker; struct list_lru lru; gfp_t gfp; + gpc_callback pre_add_to_cache; + gpc_callback pre_shrink_free; atomic_t nid_round_robin; }; -int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp); +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp, + gpc_callback pre_add_to_cache, + gpc_callback pre_shrink_free); struct page *get_grouped_page(int node, struct grouped_page_cache *gpc); void free_grouped_page(struct grouped_page_cache *gpc, struct page *page); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9527811f476..72a465e37648 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2357,6 +2357,9 @@ static void __dispose_pages(struct grouped_page_cache *gpc, struct list_head *he list_del(cur); + if (gpc->pre_shrink_free) + gpc->pre_shrink_free(page, 1); + __free_pages(page, 0); } } @@ -2406,6 +2409,21 @@ static struct page *__remove_first_page(struct grouped_page_cache *gpc, int node return NULL; } +/* Helper to try to convert the pages, or clean up and free if it fails */ +static int __try_convert(struct grouped_page_cache *gpc, struct page *page, int cnt) +{ + int i; + + if (gpc->pre_add_to_cache && gpc->pre_add_to_cache(page, cnt)) { + if (gpc->pre_shrink_free) + gpc->pre_shrink_free(page, cnt); + for (i = 0; i < cnt; i++) + __free_pages(&page[i], 0); + return 1; + } + return 0; +} + /* Get and add some new 
pages to the cache to be used by VM_GROUP_PAGES */ static struct page *__replenish_grouped_pages(struct grouped_page_cache *gpc, int node) { @@ -2414,18 +2432,30 @@ static struct page *__replenish_grouped_pages(struct grouped_page_cache *gpc, in int i; page = __alloc_page_order(node, gpc->gfp, HUGETLB_PAGE_ORDER); - if (!page) - return __alloc_page_order(node, gpc->gfp, 0); + if (!page) { + page = __alloc_page_order(node, gpc->gfp, 0); + if (__try_convert(gpc, page, 1)) + return NULL; + + return page; + } split_page(page, HUGETLB_PAGE_ORDER); + /* If fail to convert to be added, try to clean up and free */ + if (__try_convert(gpc, page, 1)) + return NULL; + + /* Add the rest to the cache except for the one returned below */ for (i = 1; i < hpage_cnt; i++) free_grouped_page(gpc, &page[i]); return &page[0]; } -int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp) +int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp, + gpc_callback pre_add_to_cache, + gpc_callback pre_shrink_free) { int err = 0; @@ -2443,6 +2473,8 @@ int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp) if (err) list_lru_destroy(&gpc->lru); + gpc->pre_add_to_cache = pre_add_to_cache; + gpc->pre_shrink_free = pre_shrink_free; out: return err; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 81b767a5d6ef..4b929fa1a0ac 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -922,7 +922,8 @@ bool pks_tables_inited(void) static int __init pks_page_init(void) { - pks_tables_inited_val = !init_grouped_page_cache(&gpc_pks, GFP_KERNEL | PGTABLE_HIGHMEM); + pks_tables_inited_val = !init_grouped_page_cache(&gpc_pks, GFP_KERNEL | PGTABLE_HIGHMEM, + NULL, NULL); out: return !pks_tables_inited_val; From patchwork Mon Aug 30 23:59:15 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466179 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id D9D00C432BE for ; Tue, 31 Aug 2021 00:01:28 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 3092A60F4B for ; Tue, 31 Aug 2021 00:01:28 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 3092A60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7188 invoked by uid 550); 31 Aug 2021 00:00:27 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 6030 invoked from network); 31 Aug 2021 00:00:24 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933716" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933716" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712861" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, 
akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 07/19] x86/cpufeatures: Add feature for pks tables Date: Mon, 30 Aug 2021 16:59:15 -0700 Message-Id: <20210830235927.6443-8-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Add a feature for PKS tables and a kernel parameter to disable it if desired. Check this boot parameter early in boot such that initialization functionality that happens earlier in boot can be skipped if the disable boot parameter is present. Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/mm/init.c | 1 + arch/x86/mm/pgtable.c | 19 +++++++++++++++++++ .../arch/x86/include/asm/disabled-features.h | 8 +++++++- 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 80c357f638fd..8d2c4e9d32ec 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -/* FREE! ( 7*32+10) */ +#define X86_FEATURE_PKS_TABLES ( 3*32+10) /* "" Write protected page tables */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3c119ef49062..3505e3b1f40b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -118,8 +118,14 @@ extern pmdval_t early_pmd_flags; #endif /* CONFIG_PARAVIRT_XXL */ #ifdef CONFIG_PKS_PG_TABLES +void pks_tables_check_boottime_disable(void); +void enable_pgtable_write(void); +void disable_pgtable_write(void); bool pks_tables_inited(void); #else /* CONFIG_PKS_PG_TABLES */ +static inline void pks_tables_check_boottime_disable(void) { } +static void enable_pgtable_write(void) { } +static void disable_pgtable_write(void) { } #define pks_tables_inited() 0 #endif /* CONFIG_PKS_PG_TABLES */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 75ef19aa8903..c8933c6d5efd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -719,6 +719,7 @@ void __init init_mem_mapping(void) unsigned long end; pti_check_boottime_disable(); + pks_tables_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 4b929fa1a0ac..ef0b4ce95522 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK @@ -930,6 +931,24 @@ static int __init pks_page_init(void) } device_initcall(pks_page_init); + +__init void pks_tables_check_boottime_disable(void) +{ + if (cmdline_find_option_bool(boot_command_line, "nopkstables")) + return; + + /* + * PTI will want to allocate higher 
order page table pages, which the + * PKS table allocator doesn't support. So don't attempt to enable PKS + * tables in this case. + */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) { + pr_info("PTI enabled, not enabling PKS tables"); + return; + } + setup_force_cpu_cap(X86_FEATURE_PKS_TABLES); +} + #endif /* CONFIG_PKS_PG_TABLES */ #else /* !CONFIG_X86_64 */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 8f28fafa98b3..d98bdfa72170 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,12 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif +#ifdef CONFIG_PKS_TABLES +# define DISABLE_PKS_TABLES 0 +#else +# define DISABLE_PKS_TABLES (1 << (X86_FEATURE_PKS_TABLES & 31)) +#endif + #ifdef CONFIG_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else @@ -75,7 +81,7 @@ #define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) +#define DISABLED_MASK7 (DISABLE_PTI|DISABLE_PKS_TABLES) #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 From patchwork Mon Aug 30 23:59:16 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466169 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 4B6E1C4320E for ; Tue, 31 Aug 2021 00:00:50 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 797D960E98 for ; Tue, 31 Aug 2021 00:00:49 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 797D960E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 5854 invoked by uid 550); 31 Aug 2021 00:00:18 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 5810 invoked from network); 31 Aug 2021 00:00:17 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933720" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933720" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712885" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 08/19] x86/mm/cpa: Add get_grouped_page_atomic() Date: Mon, 30 Aug 2021 16:59:16 -0700 Message-Id: <20210830235927.6443-9-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: 
<20210830235927.6443-1-rick.p.edgecombe@intel.com> For PKS tables, some page table allocations are made with GFP_ATOMIC, which up to now has not been supported. Grouped pages may often have pages in the cache, which can be retrieved without sleeping; however, if the cache is empty, it will need to try to replenish from the normal page allocator. Just passing GFP_ATOMIC in addition to the configured GFP is not ideal because it may try to grab a whole huge page size page, which will stress the atomic reserves. So instead create new logic that will only try to allocate a single page and convert it. Expose this atomic logic in a new function get_grouped_page_atomic(), instead of allowing a flag to be passed in. This is so it doesn't look like any GFP flag can get passed in and result in behavior like normal allocations. As for converting the atomically allocated page in the PKS tables case: set_memory() calls cannot usually be made in atomic context because pages may need to be allocated, but PKS tables has its own reserve of direct map tables, so pages can be converted to PKS protected in atomic context. Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/pat/set_memory.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaac7e3e08bf..e0516651698a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -153,6 +153,7 @@ int init_grouped_page_cache(struct grouped_page_cache *gpc, gfp_t gfp, gpc_callback pre_add_to_cache, gpc_callback pre_shrink_free); struct page *get_grouped_page(int node, struct grouped_page_cache *gpc); +struct page *get_grouped_page_atomic(int node, struct grouped_page_cache *gpc); void free_grouped_page(struct grouped_page_cache *gpc, struct page *page); #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 72a465e37648..c51792a797cb 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2496,6 +2496,32 @@ struct page *get_grouped_page(int node, struct grouped_page_cache *gpc) return __replenish_grouped_pages(gpc, node); } +struct page *get_grouped_page_atomic(int node, struct grouped_page_cache *gpc) +{ + struct page *page; + + if (in_interrupt()) { + pr_warn_once("grouped pages: Cannot allocate grouped page in interrupt"); + return NULL; + } + + /* First see if there are any grouped pages already in the cache */ + page = __remove_first_page(gpc, node); + + /* + * If there wasn't one in the cache, allocate only a single page to not + * stress the reserves.
+ */ + if (!page) + page = __alloc_page_order(node, gpc->gfp | GFP_ATOMIC, 0); + + /* Convert the single page if configured for this cache */ + if (!page || __try_convert(gpc, page, 1)) + return NULL; + + return page; +} + void free_grouped_page(struct grouped_page_cache *gpc, struct page *page) { INIT_LIST_HEAD(&page->lru); From patchwork Mon Aug 30 23:59:17 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466191 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 41364C432BE for ; Tue, 31 Aug 2021 00:02:31 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 86AE660E98 for ; Tue, 31 Aug 2021 00:02:30 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 86AE660E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7630 invoked by uid 550); 31 Aug 2021 00:00:36 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7332 invoked from network); 31 Aug 2021 00:00:31 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933722" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933722" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712902" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 09/19] x86/mm: Support GFP_ATOMIC in alloc_table_node() Date: Mon, 30 Aug 2021 16:59:17 -0700 Message-Id: <20210830235927.6443-10-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> For GFP_ATOMIC in alloc_table/_node(), use get_grouped_page_atomic(). 
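Worth noting when reading the hunk below: GFP_ATOMIC and GFP_KERNEL share the __GFP_KSWAPD_RECLAIM bit, so a plain gfp & GFP_ATOMIC test also matches sleepable allocations. As an editorial sketch only (not the patch's hunk, and alloc_table_page() is a made-up name), the same dispatch could be expressed with gfpflags_allow_blocking(), which tests __GFP_DIRECT_RECLAIM, i.e. whether the caller may sleep:

static struct page *alloc_table_page(gfp_t gfp, int node)
{
	if (!gfpflags_allow_blocking(gfp))
		/* Atomic callers: cache hit or one order-0 page, never a huge block. */
		return get_grouped_page_atomic(node, &gpc_pks);

	/* Sleeping callers may replenish the cache from a huge page sized block. */
	return get_grouped_page(node, &gpc_pks);
}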
Signed-off-by: Rick Edgecombe --- arch/x86/mm/pgtable.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ef0b4ce95522..e65d69ad6e0c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -52,7 +52,10 @@ struct page *alloc_table(gfp_t gfp) return table; } - table = get_grouped_page(numa_node_id(), &gpc_pks); + if (gfp & GFP_ATOMIC) + table = get_grouped_page_atomic(numa_node_id(), &gpc_pks); + else + table = get_grouped_page(numa_node_id(), &gpc_pks); if (!table) return NULL; __SetPageTable(table); From patchwork Mon Aug 30 23:59:18 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466173 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id BB4A1C4320A for ; Tue, 31 Aug 2021 00:01:04 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 1100060F4B for ; Tue, 31 Aug 2021 00:01:03 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 1100060F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 5966 invoked by uid 550); 31 Aug 2021 00:00:22 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 5856 invoked from network); 31 Aug 2021 00:00:18 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933723" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933723" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712916" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 10/19] x86/mm: Use alloc_table() for fill_pte(), etc Date: Mon, 30 Aug 2021 16:59:18 -0700 Message-Id: <20210830235927.6443-11-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> fill_pte(), set_pte_vaddr(), etc allocate page tables with spp_getpage(). Use alloc_table() for these allocations in order to get tables from the cache of protected pages when needed. Opportunistically, fix a stale comment. 
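A short sketch of what the converted spp_getpage() path resolves to once the cache is live (illustrative, assuming CONFIG_PKS_PG_TABLES; spp_getpage_sketch() is a made-up name): GFP_ATOMIC routes through get_grouped_page_atomic() from the previous patch, and __GFP_ZERO is honored by an explicit memset() inside alloc_table(), since pages handed out of the cache are not pre-zeroed by the page allocator.

static void *spp_getpage_sketch(void)
{
	/* Atomic, zeroed table page from the grouped cache (or NULL). */
	struct page *page = alloc_table(GFP_ATOMIC | __GFP_ZERO);

	return page ? page_address(page) : NULL;
}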
Signed-off-by: Rick Edgecombe --- arch/x86/mm/init_64.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3c0323ad99da..de5a785ee89f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -220,16 +220,19 @@ static void sync_global_pgds(unsigned long start, unsigned long end) /* * NOTE: This function is marked __ref because it calls __init function - * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + * (memblock_alloc). It's safe to do it ONLY when after_bootmem == 0. */ static __ref void *spp_getpage(void) { void *ptr; - if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); - else + if (after_bootmem) { + struct page *page = alloc_table(GFP_ATOMIC | __GFP_ZERO); + + ptr = page ? page_address(page) : NULL; + } else { ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + } if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { panic("set_pte_phys: cannot allocate page data %s\n", From patchwork Mon Aug 30 23:59:19 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466181 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1703FC432BE for ; Tue, 31 Aug 2021 00:01:38 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 67CBB60F4B for ; Tue, 31 Aug 2021 00:01:37 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 67CBB60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7254 invoked by uid 550); 31 Aug 2021 00:00:28 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 6093 invoked from network); 31 Aug 2021 00:00:25 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933725" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933725" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712928" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 11/19] mm/sparsemem: Use alloc_table() for table allocations Date: Mon, 30 Aug 2021 16:59:19 -0700 Message-Id: <20210830235927.6443-12-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> In order to support allocating PKS protected page tables for vmememap, create a new variant of alloc_table(), alloc_table_node() that 
allows for allocation of tables from a specific node. Use it when possible for allocating vmemmap tables. vmemmap_alloc_block_zero() is currently only used to allocate page tables, so fold it into a new function, vmemem_alloc_table() that can be free to call alloc_table_node(). Since it is today only called with PAGE_SIZE size, drop the size argument. Signed-off-by: Rick Edgecombe --- arch/x86/mm/pgtable.c | 10 +++++++--- include/asm-generic/pgalloc.h | 2 ++ include/linux/mm.h | 5 +++++ mm/sparse-vmemmap.c | 22 +++++++++++++++------- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e65d69ad6e0c..006dc4f81f6d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -41,7 +41,7 @@ static struct grouped_page_cache gpc_pks; static bool __ro_after_init pks_tables_inited_val; -struct page *alloc_table(gfp_t gfp) +struct page *alloc_table_node(gfp_t gfp, int node) { struct page *table; @@ -53,9 +53,9 @@ struct page *alloc_table(gfp_t gfp) } if (gfp & GFP_ATOMIC) - table = get_grouped_page_atomic(numa_node_id(), &gpc_pks); + table = get_grouped_page_atomic(node, &gpc_pks); else - table = get_grouped_page(numa_node_id(), &gpc_pks); + table = get_grouped_page(node, &gpc_pks); if (!table) return NULL; __SetPageTable(table); @@ -72,6 +72,10 @@ struct page *alloc_table(gfp_t gfp) return table; } +struct page *alloc_table(gfp_t gfp) +{ + return alloc_table_node(gfp, numa_node_id()); +} void free_table(struct page *table_page) { diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index e576c19abc8c..eb08371db211 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -14,8 +14,10 @@ static inline struct page *alloc_table(gfp_t gfp) { return alloc_page(gfp); } +/* alloc_table_node() stub defined in mm.h */ #else /* __HAVE_ARCH_ALLOC_TABLE */ extern struct page *alloc_table(gfp_t gfp); +extern struct page *alloc_table_node(gfp_t gfp, int node); #endif /* __HAVE_ARCH_ALLOC_TABLE */ #ifdef __HAVE_ARCH_FREE_TABLE diff --git a/include/linux/mm.h b/include/linux/mm.h index ab63d5a201cb..fdb33bc6bba8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2332,6 +2332,11 @@ static inline void free_table(struct page *table_page) { __free_pages(table_page, 0); } + +static inline struct page *alloc_table_node(gfp_t gfp, int node) +{ + return alloc_pages_node(node, gfp, 0); +} #endif /* CONFIG_PKS_PG_TABLES */ static inline void pgtable_init(void) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bdce883f9286..4f479c75cc8d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -511,13 +511,21 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, return pte; } -static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) +static void * __meminit vmemmap_alloc_table(int node) { - void *p = vmemmap_alloc_block(size, node); + void *p; + if (slab_is_available()) { + struct page *page = alloc_table_node(GFP_KERNEL | __GFP_ZERO, node); + + if (!page) + return NULL; + return page_address(page); + } + p = __earlyonly_bootmem_alloc(node, PAGE_SIZE, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!p) return NULL; - memset(p, 0, size); + memset(p, 0, PAGE_SIZE); return p; } @@ -526,7 +534,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + void *p = vmemmap_alloc_table(node); if (!p) return NULL; 
pmd_populate_kernel(&init_mm, pmd, p); @@ -538,7 +546,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + void *p = vmemmap_alloc_table(node); if (!p) return NULL; pud_populate(&init_mm, pud, p); @@ -550,7 +558,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + void *p = vmemmap_alloc_table(node); if (!p) return NULL; p4d_populate(&init_mm, p4d, p); @@ -562,7 +570,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + void *p = vmemmap_alloc_table(node); if (!p) return NULL; pgd_populate(&init_mm, pgd, p); From patchwork Mon Aug 30 23:59:20 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466195 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-13.9 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,UNWANTED_LANGUAGE_BODY, URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id AF51DC432BE for ; Tue, 31 Aug 2021 00:02:52 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 0536F60F4B for ; Tue, 31 Aug 2021 00:02:51 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 0536F60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7645 invoked by uid 550); 31 Aug 2021 00:00:37 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7340 invoked from network); 31 Aug 2021 00:00:31 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933727" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933727" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712949" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 12/19] x86/mm: Use free_table in unmap path Date: Mon, 30 Aug 2021 16:59:20 -0700 Message-Id: <20210830235927.6443-13-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Memory hot unplug and memremap unmap paths will free direct map page tables. So use free_table() for this. 
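Sketched outside the diff (the helper name is hypothetical, usual kernel headers assumed): the point of the change below is that page-table pages must be handed back through free_table(), so PKS-protected tables can be returned to the grouped page cache or unprotected, while ordinary data pages keep going through free_pages().

/*
 * Sketch mirroring the free_pagetable() change in the hunk below:
 * table pages (always order 0 here) are freed with free_table(),
 * everything else with free_pages().
 */
static void example_free_page_or_table(struct page *page, int order, bool is_table)
{
        if (is_table)
                free_table(page);
        else
                free_pages((unsigned long)page_address(page), order);
}
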
Signed-off-by: Rick Edgecombe --- arch/x86/mm/init_64.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index de5a785ee89f..c2680a77ca88 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -975,7 +975,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return add_pages(nid, start_pfn, nr_pages, params); } -static void __meminit free_pagetable(struct page *page, int order) +static void __meminit free_pagetable(struct page *page, int order, bool table) { unsigned long magic; unsigned int nr_pages = 1 << order; @@ -991,8 +991,14 @@ static void __meminit free_pagetable(struct page *page, int order) } else while (nr_pages--) free_reserved_page(page++); - } else - free_pages((unsigned long)page_address(page), order); + } else { + if (table) { + /* The page tables will always be order 0. */ + free_table(page); + } else { + free_pages((unsigned long)page_address(page), order); + } + } } static void __meminit gather_table(struct page *page, struct list_head *tables) @@ -1008,7 +1014,7 @@ static void __meminit gather_table_finish(struct list_head *tables) list_for_each_entry_safe(page, next, tables, lru) { list_del(&page->lru); - free_pagetable(page, 0); + free_pagetable(page, 0, true); } } @@ -1018,7 +1024,7 @@ static void __meminit free_hugepage_table(struct page *page, if (altmap) vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); else - free_pagetable(page, get_order(PMD_SIZE)); + free_pagetable(page, get_order(PMD_SIZE), false); } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, struct list_head *tables) @@ -1102,7 +1108,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, return; if (!direct) - free_pagetable(pte_page(*pte), 0); + free_pagetable(pte_page(*pte), 0, false); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); From patchwork Mon Aug 30 23:59:21 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466193 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1CABDC432BE for ; Tue, 31 Aug 2021 00:02:42 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 6ECD860E98 for ; Tue, 31 Aug 2021 00:02:41 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 6ECD860E98 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7552 invoked by uid 550); 31 Aug 2021 00:00:35 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7345 invoked from network); 31 Aug 2021 00:00:31 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933729" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933729" X-IronPort-AV: 
E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712959" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 13/19] mm/debug_vm_page_table: Use setters instead of WRITE_ONCE Date: Mon, 30 Aug 2021 16:59:21 -0700 Message-Id: <20210830235927.6443-14-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Currently debug_vm_page_table uses some raw WRITE_ONCE invocations to write to page tables, which PKS tables is designed to prevent. So use the set_p**() helpers instead of WRITE_ONCE for x86, so debug_vm_page_table will work with PKS tables enabled. Signed-off-by: Rick Edgecombe --- mm/debug_vm_pgtable.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1c922691aa61..a0b8859ecf04 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -411,11 +411,17 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD huge\n"); + +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ + set_pmd(pmdp, __pmd(0)); +#else + WRITE_ONCE(*pmdp, __pmd(0)); +#endif /* * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*pmdp, __pmd(0)); WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); WARN_ON(!pmd_clear_huge(pmdp)); pmd = READ_ONCE(*pmdp); @@ -430,11 +436,16 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD huge\n"); +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ + set_pud(pudp, __pud(0)); +#else + WRITE_ONCE(*pudp, __pud(0)); +#endif /* * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. 
*/ - WRITE_ONCE(*pudp, __pud(0)); WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); WARN_ON(!pud_clear_huge(pudp)); pud = READ_ONCE(*pudp); @@ -473,7 +484,12 @@ static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) pr_debug("Validating PUD clear\n"); pud = __pud(pud_val(pud) | RANDOM_ORVALUE); +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ + set_pud(pudp, pud); +#else WRITE_ONCE(*pudp, pud); +#endif pud_clear(pudp); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); @@ -514,7 +530,12 @@ static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) pr_debug("Validating P4D clear\n"); p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ + set_p4d(p4dp, p4d); +#else WRITE_ONCE(*p4dp, p4d); +#endif p4d_clear(p4dp); p4d = READ_ONCE(*p4dp); WARN_ON(!p4d_none(p4d)); @@ -549,7 +570,12 @@ static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) pr_debug("Validating PGD clear\n"); pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ + set_pgd(pgdp, pgd); +#else WRITE_ONCE(*pgdp, pgd); +#endif pgd_clear(pgdp); pgd = READ_ONCE(*pgdp); WARN_ON(!pgd_none(pgd)); @@ -610,8 +636,12 @@ static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) pr_debug("Validating PMD clear\n"); pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*pmdp, pmd); +#ifdef CONFIG_X86 + /* Use setter so that protections can be toggled if needed */ pmd_clear(pmdp); +#else + WRITE_ONCE(*pmdp, pmd); +#endif pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); } From patchwork Mon Aug 30 23:59:22 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466201 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id EAA1AC432BE for ; Tue, 31 Aug 2021 00:03:25 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 3972B60F4B for ; Tue, 31 Aug 2021 00:03:25 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 3972B60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7968 invoked by uid 550); 31 Aug 2021 00:00:40 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7403 invoked from network); 31 Aug 2021 00:00:32 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933731" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933731" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712977" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, 
rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 14/19] x86/efi: Toggle table protections when copying Date: Mon, 30 Aug 2021 16:59:22 -0700 Message-Id: <20210830235927.6443-15-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Toggle page table writability when copying page tables in efi_sync_low_kernel_mappings(). These page tables will not be protected until after init, but later on they will not be writable. Signed-off-by: Rick Edgecombe --- arch/x86/platform/efi/efi_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 7515e78ef898..7a5c81450fa4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -116,7 +116,9 @@ void efi_sync_low_kernel_mappings(void) pgd_k = pgd_offset_k(PAGE_OFFSET); num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); + enable_pgtable_write(); memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); + disable_pgtable_write(); pgd_efi = efi_pgd + pgd_index(EFI_VA_END); pgd_k = pgd_offset_k(EFI_VA_END); @@ -124,7 +126,9 @@ void efi_sync_low_kernel_mappings(void) p4d_k = p4d_offset(pgd_k, 0); num_entries = p4d_index(EFI_VA_END); + enable_pgtable_write(); memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries); + disable_pgtable_write(); /* * We share all the PUD entries apart from those that map the @@ -139,13 +143,17 @@ void efi_sync_low_kernel_mappings(void) pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); + enable_pgtable_write(); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); + disable_pgtable_write(); pud_efi = pud_offset(p4d_efi, EFI_VA_START); pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); + enable_pgtable_write(); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); + disable_pgtable_write(); } /* From patchwork Mon Aug 30 23:59:23 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466197 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 72D1BC432BE for ; Tue, 31 Aug 2021 00:03:03 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id C1A3C60F4B for ; Tue, 31 Aug 2021 00:03:02 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org C1A3C60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7750 invoked by uid 550); 31 Aug 2021 00:00:38 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: 
(qmail 7404 invoked from network); 31 Aug 2021 00:00:32 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933735" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933735" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530712992" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 15/19] x86/mm/cpa: Add set_memory_pks() Date: Mon, 30 Aug 2021 16:59:23 -0700 Message-Id: <20210830235927.6443-16-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Add function for setting PKS key on kernel memory. Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/pat/set_memory.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index e0516651698a..1ba2fb45ed05 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -52,6 +52,7 @@ int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_nonglobal(unsigned long addr, int numpages); int set_memory_global(unsigned long addr, int numpages); +int set_memory_pks(unsigned long addr, int numpages, int key); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index c51792a797cb..dc704e8da032 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1922,6 +1922,13 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); +int set_memory_pks(unsigned long addr, int numpages, int key) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(_PAGE_PKEY(key)), + __pgprot(_PAGE_PKEY(0xF & ~(unsigned int)key)), + 0, 0, NULL); +} + int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) From patchwork Mon Aug 30 23:59:24 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466199 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id B83DDC432BE for ; Tue, 31 Aug 2021 00:03:15 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 97F6F60F4B for ; Tue, 31 Aug 2021 00:03:14 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 97F6F60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; 
spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 7944 invoked by uid 550); 31 Aug 2021 00:00:39 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7406 invoked from network); 31 Aug 2021 00:00:33 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933738" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933738" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530713007" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 16/19] x86/mm: Protect page tables with PKS Date: Mon, 30 Aug 2021 16:59:24 -0700 Message-Id: <20210830235927.6443-17-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Write protect page tables with PKS. Toggle writeability inside the pgtable.h defined page table modifiction functions. Do not protect the direct map page tables as it is more complicated and will come in a later patch. Signed-off-by: Rick Edgecombe --- arch/x86/boot/compressed/ident_map_64.c | 5 ++ arch/x86/include/asm/pgtable.h | 18 ++++++- arch/x86/include/asm/pgtable_64.h | 33 ++++++++++-- arch/x86/include/asm/pkeys_common.h | 1 - arch/x86/mm/pgtable.c | 71 ++++++++++++++++++++++--- arch/x86/mm/pkeys.c | 1 + include/linux/pkeys.h | 1 + mm/Kconfig | 6 +++ 8 files changed, 123 insertions(+), 13 deletions(-) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index f7213d0943b8..2999be8f9347 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -349,3 +349,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) */ add_identity_map(address, end); } + +#ifdef CONFIG_PKS_PG_TABLES +void enable_pgtable_write(void) {} +void disable_pgtable_write(void) {} +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3505e3b1f40b..871308c40dac 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1085,7 +1085,9 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + enable_pgtable_write(); clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); + disable_pgtable_write(); } #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) @@ -1135,7 +1137,9 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { + enable_pgtable_write(); clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + disable_pgtable_write(); } #define pud_write pud_write @@ -1150,10 +1154,18 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { if (IS_ENABLED(CONFIG_SMP)) { - return xchg(pmdp, pmd); + pmd_t ret; + + enable_pgtable_write(); + ret = xchg(pmdp, pmd); + disable_pgtable_write(); + + return ret; } else { pmd_t old = *pmdp; + 
enable_pgtable_write(); WRITE_ONCE(*pmdp, pmd); + disable_pgtable_write(); return old; } } @@ -1236,13 +1248,17 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { + enable_pgtable_write(); memcpy(dst, src, count * sizeof(pgd_t)); + disable_pgtable_write(); #ifdef CONFIG_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ + enable_pgtable_write(); memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), count * sizeof(pgd_t)); + disable_pgtable_write(); #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 56d0399a0cd1..a287f3c8a0a3 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -64,7 +64,9 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); static inline void native_set_pte(pte_t *ptep, pte_t pte) { + enable_pgtable_write(); WRITE_ONCE(*ptep, pte); + disable_pgtable_write(); } static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, @@ -80,7 +82,9 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { + enable_pgtable_write(); WRITE_ONCE(*pmdp, pmd); + disable_pgtable_write(); } static inline void native_pmd_clear(pmd_t *pmd) @@ -91,7 +95,12 @@ static inline void native_pmd_clear(pmd_t *pmd) static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP - return native_make_pte(xchg(&xp->pte, 0)); + pteval_t pte_val; + + enable_pgtable_write(); + pte_val = xchg(&xp->pte, 0); + disable_pgtable_write(); + return native_make_pte(pte_val); #else /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */ @@ -104,7 +113,12 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) { #ifdef CONFIG_SMP - return native_make_pmd(xchg(&xp->pmd, 0)); + pteval_t pte_val; + + enable_pgtable_write(); + pte_val = xchg(&xp->pmd, 0); + disable_pgtable_write(); + return native_make_pmd(pte_val); #else /* native_local_pmdp_get_and_clear, but duplicated because of cyclic dependency */ @@ -116,7 +130,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) static inline void native_set_pud(pud_t *pudp, pud_t pud) { + enable_pgtable_write(); WRITE_ONCE(*pudp, pud); + disable_pgtable_write(); } static inline void native_pud_clear(pud_t *pud) @@ -127,7 +143,12 @@ static inline void native_pud_clear(pud_t *pud) static inline pud_t native_pudp_get_and_clear(pud_t *xp) { #ifdef CONFIG_SMP - return native_make_pud(xchg(&xp->pud, 0)); + pteval_t pte_val; + + enable_pgtable_write(); + pte_val = xchg(&xp->pud, 0); + disable_pgtable_write(); + return native_make_pud(pte_val); #else /* native_local_pudp_get_and_clear, * but duplicated because of cyclic dependency @@ -144,13 +165,17 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) pgd_t pgd; if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + enable_pgtable_write(); WRITE_ONCE(*p4dp, p4d); + disable_pgtable_write(); return; } pgd = native_make_pgd(native_p4d_val(p4d)); pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd); + enable_pgtable_write(); WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd))); + disable_pgtable_write(); } static inline void native_p4d_clear(p4d_t *p4d) @@ -160,7 +185,9 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { + 
enable_pgtable_write(); WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd)); + disable_pgtable_write(); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pkeys_common.h b/arch/x86/include/asm/pkeys_common.h index 079a8be9686b..13f4341c4c0b 100644 --- a/arch/x86/include/asm/pkeys_common.h +++ b/arch/x86/include/asm/pkeys_common.h @@ -15,5 +15,4 @@ #define PKR_AD_KEY(pkey) (PKR_AD_BIT << PKR_PKEY_SHIFT(pkey)) #define PKR_WD_KEY(pkey) (PKR_WD_BIT << PKR_PKEY_SHIFT(pkey)) #define PKR_VALUE(pkey, val) (val << PKR_PKEY_SHIFT(pkey)) - #endif /*_ASM_X86_PKEYS_COMMON_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 006dc4f81f6d..69b43097c9da 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK @@ -60,8 +61,11 @@ struct page *alloc_table_node(gfp_t gfp, int node) return NULL; __SetPageTable(table); - if (gfp & __GFP_ZERO) + if (gfp & __GFP_ZERO) { + enable_pgtable_write(); memset(page_address(table), 0, PAGE_SIZE); + disable_pgtable_write(); + } if (memcg_kmem_enabled() && gfp & __GFP_ACCOUNT && @@ -614,9 +618,12 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, { int ret = 0; - if (pte_young(*ptep)) + if (pte_young(*ptep)) { + enable_pgtable_write(); ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); + disable_pgtable_write(); + } return ret; } @@ -627,9 +634,12 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, { int ret = 0; - if (pmd_young(*pmdp)) + if (pmd_young(*pmdp)) { + enable_pgtable_write(); ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); + disable_pgtable_write(); + } return ret; } @@ -638,9 +648,12 @@ int pudp_test_and_clear_young(struct vm_area_struct *vma, { int ret = 0; - if (pud_young(*pudp)) + if (pud_young(*pudp)) { + enable_pgtable_write(); ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); + disable_pgtable_write(); + } return ret; } @@ -649,6 +662,7 @@ int pudp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + int ret; /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect @@ -662,7 +676,10 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, * shouldn't really matter because there's no real memory * pressure for swapout to react to. 
] */ - return ptep_test_and_clear_young(vma, address, ptep); + enable_pgtable_write(); + ret = ptep_test_and_clear_young(vma, address, ptep); + disable_pgtable_write(); + return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -673,7 +690,9 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); + enable_pgtable_write(); young = pmdp_test_and_clear_young(vma, address, pmdp); + disable_pgtable_write(); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); @@ -923,6 +942,30 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) } #ifdef CONFIG_PKS_PG_TABLES +static int _pks_protect(struct page *page, unsigned int cnt) +{ + set_memory_pks((unsigned long)page_address(page), cnt, PKS_KEY_PG_TABLES); + return 0; +} + +static int _pks_unprotect(struct page *page, unsigned int cnt) +{ + set_memory_pks((unsigned long)page_address(page), cnt, 0); + return 0; +} + +void enable_pgtable_write(void) +{ + if (pks_tables_inited()) + pks_mk_readwrite(PKS_KEY_PG_TABLES); +} + +void disable_pgtable_write(void) +{ + if (pks_tables_inited()) + pks_mk_readonly(PKS_KEY_PG_TABLES); +} + bool pks_tables_inited(void) { return pks_tables_inited_val; @@ -930,11 +973,23 @@ bool pks_tables_inited(void) static int __init pks_page_init(void) { + /* + * If PKS is not enabled, don't try to enable anything and don't + * report anything. + */ + if (!cpu_feature_enabled(X86_FEATURE_PKS) || !cpu_feature_enabled(X86_FEATURE_PKS_TABLES)) + return 0; + pks_tables_inited_val = !init_grouped_page_cache(&gpc_pks, GFP_KERNEL | PGTABLE_HIGHMEM, - NULL, NULL); + _pks_protect, _pks_unprotect); -out: - return !pks_tables_inited_val; + if (pks_tables_inited_val) { + pr_info("PKS tables initialized\n"); + return 0; + } + + pr_warn("PKS tables failed to initialize\n"); + return 1; } device_initcall(pks_page_init); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 201004586c2b..48a390722c06 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -302,6 +302,7 @@ static int __init create_initial_pkrs_value(void) consumer_defaults[PKS_KEY_DEFAULT] = PKR_RW_BIT; consumer_defaults[PKS_KEY_PGMAP_PROTECTION] = PKR_AD_BIT; + consumer_defaults[PKS_KEY_PG_TABLES] = PKR_WD_BIT; /* Ensure the number of consumers is less than the number of keys */ BUILD_BUG_ON(PKS_KEY_NR_CONSUMERS > PKS_NUM_PKEYS); diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index c06b47264c5d..42187a070df4 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -50,6 +50,7 @@ static inline bool arch_pkeys_enabled(void) enum pks_pkey_consumers { PKS_KEY_DEFAULT = 0, /* Must be 0 for default PTE values */ PKS_KEY_PGMAP_PROTECTION, + PKS_KEY_PG_TABLES, PKS_KEY_NR_CONSUMERS }; extern u32 pkrs_init_value; diff --git a/mm/Kconfig b/mm/Kconfig index 4184d0a7531d..0f8e8595a396 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -845,6 +845,12 @@ config ARCH_ENABLE_SUPERVISOR_PKEYS def_bool y depends on PKS_TEST || GENERAL_PKS_USER +config PKS_PG_TABLES + bool "PKS write protected page tables" + select GENERAL_PKS_USER + depends on !HIGHMEM && !X86_PAE && SPARSEMEM_VMEMMAP + depends on ARCH_HAS_SUPERVISOR_PKEYS + config PERCPU_STATS bool "Collect percpu memory statistics" help From patchwork Mon Aug 30 23:59:25 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466205 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: 
X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 16B21C43214 for ; Tue, 31 Aug 2021 00:04:00 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id 1035260FC0 for ; Tue, 31 Aug 2021 00:03:58 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 1035260FC0 Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 8119 invoked by uid 550); 31 Aug 2021 00:00:42 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7423 invoked from network); 31 Aug 2021 00:00:33 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933744" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933744" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530713019" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 17/19] x86/mm/cpa: PKS protect direct map page tables Date: Mon, 30 Aug 2021 16:59:25 -0700 Message-Id: <20210830235927.6443-18-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Protecting direct map page tables is a bit more difficult because a page table may be needed for a page split as part of setting the PKS permission the new page table. So in the case of an empty cache of page tables the page table allocator could get into a situation where it cannot create any more page tables. Several solutions were looked at: 1. Break the direct map with pages allocated from the large page being converted to PKS. This would result in a window where the table could be written to right before it was linked into the page tables. It also depends on high order pages being available, and so would regress from the un-protected behavior in that respect. 2. Hold some page tables in reserve to be able to break the large page for a new 2MB page, but if there are no 2MB page's available we may need to add a single page to the cache, in which case we would use up the reserve of page tables needed to break a new page, but not get enough page tables back to replenish the resereve. 3. Always map the direct map at 4k when protecting page tables so that pages don't need to be broken to map them with a PKS key. This would have undesirable performance. 4. Lastly, the strategy employed in this patch, have a separate cache of page tables just used for the direct map. Early in boot, squirrel away enough page tables to map the direct map at 4k. 
This comes with the same memory overhead of mapping the direct map at 4k, but gets the other benefits of mapping the direct map as large pages. There is also the problem of protecting page tables that are allocated during boot. Instead of recording the tables to protect later, create a page table traversing infrastructure to walk every page table in init_mm and apply protection. This also covers non-direct map odds-and-ends page tables that are allocated during boot. The existing page table traversing in pagewalk.c cannot be used for this purpose because there are not actual vmas for all of the kernel address space. The algorithm for protecting the direct map page table cache, while also allocating from it for direct map splits is described in the comments of init_pks_dmap_tables(). Signed-off-by: Rick Edgecombe --- arch/x86/include/asm/set_memory.h | 2 + arch/x86/mm/init.c | 89 ++++++++++ arch/x86/mm/pat/set_memory.c | 263 +++++++++++++++++++++++++++++- 3 files changed, 350 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 1ba2fb45ed05..9f8d0d0ae063 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -90,6 +90,8 @@ bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; +void add_dmap_table(unsigned long addr); + #ifdef CONFIG_X86_64 /* * Prevent speculative access to the page by either unmapping diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c8933c6d5efd..a91696e3da96 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -119,6 +121,17 @@ __ref void *alloc_low_pages(unsigned int num) if (after_bootmem) { unsigned int order; + if (cpu_feature_enabled(X86_FEATURE_PKS_TABLES)) { + struct page *page; + + /* 64 bit only allocates order 0 pages */ + WARN_ON(num != 1); + + page = alloc_table(GFP_ATOMIC | __GFP_ZERO); + if (!page) + return NULL; + return (void *)page_address(page); + } order = get_order((unsigned long)num << PAGE_SHIFT); return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } @@ -504,6 +517,79 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) return false; } +#ifdef CONFIG_PKS_PG_TABLES +/* Page tables needed in bytes */ +static u64 calc_tables_needed(unsigned int size) +{ + unsigned int puds = size >> PUD_SHIFT; + unsigned int pmds = size >> PMD_SHIFT; + + /* + * Catch if direct map ever might need more page tables to split + * down to 4k. + */ + BUILD_BUG_ON(p4d_huge(foo)); + BUILD_BUG_ON(pgd_huge(foo)); + + return (puds + pmds) << PAGE_SHIFT; +} + +/* + * If pre boot, reserve large pages from memory that will be mapped. It's ok that this is not + * mapped as PKS, other init code in CPA will handle the conversion. + */ +static unsigned int __init reserve_pre_boot(u64 start, u64 end) +{ + u64 cur = memblock_find_in_range(start, end, HPAGE_SIZE, HPAGE_SIZE); + int i; + + if (!cur) + return 0; + memblock_reserve(cur, HPAGE_SIZE); + for (i = 0; i < HPAGE_SIZE; i += PAGE_SIZE) + add_dmap_table((unsigned long)__va(cur + i)); + return HPAGE_SIZE; +} + +/* If post boot, memblock is not available. 
Just reserve from other memory regions */ +static unsigned int __init reserve_post_boot(void) +{ + struct page *page = alloc_table(GFP_KERNEL); + + if (!page) + return 0; + + add_dmap_table((unsigned long)page_address(page)); + + return PAGE_SIZE; +} + +static void __init reserve_page_tables(u64 start, u64 end) +{ + u64 reserve_size = calc_tables_needed(end - start); + u64 reserved = 0; + u64 cur_reserved; + + while (reserved < reserve_size) { + if (after_bootmem) + cur_reserved = reserve_post_boot(); + else + cur_reserved = reserve_pre_boot(start, end); + + if (!cur_reserved) { + WARN(1, "Could not reserve direct map page tables %llu/%llu\n", + reserved, + reserve_size); + return; + } + + reserved += cur_reserved; + } +} +#else +static inline void reserve_page_tables(u64 start, u64 end) { } +#endif + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -529,6 +615,9 @@ unsigned long __ref init_memory_mapping(unsigned long start, add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + if (cpu_feature_enabled(X86_FEATURE_PKS_TABLES)) + reserve_page_tables(start, end); + return ret >> PAGE_SHIFT; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index dc704e8da032..6acf25999b0f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -71,6 +72,68 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +static struct page *alloc_regular_dmap_table(void) +{ + return alloc_pages(GFP_KERNEL, 0); +} + +#ifdef CONFIG_PKS_PG_TABLES +static LLIST_HEAD(tables_cache); +static bool dmap_tables_inited; + +void add_dmap_table(unsigned long addr) +{ + struct llist_node *node = (struct llist_node *)addr; + + enable_pgtable_write(); + llist_add(node, &tables_cache); + disable_pgtable_write(); +} + +static struct page *get_pks_table(void) +{ + void *ptr = llist_del_first(&tables_cache); + + if (!ptr) + return NULL; + + return virt_to_page(ptr); +} + +static struct page *alloc_dmap_table(void) +{ + struct page *table; + + if (!pks_tables_inited()) + return alloc_regular_dmap_table(); + + table = get_pks_table(); + /* Fall back to un-protected table is couldn't get one from cache */ + if (!table) { + if (dmap_tables_inited) + WARN(1, "Allocating unprotected direct map table\n"); + table = alloc_regular_dmap_table(); + } + + return table; +} + +static void free_dmap_table(struct page *table) +{ + add_dmap_table((unsigned long)virt_to_page(table)); +} +#else /* CONFIG_PKS_PG_TABLES */ +static struct page *alloc_dmap_table(void) +{ + return alloc_regular_dmap_table(); +} + +static void free_dmap_table(struct page *table) +{ + __free_page(table); +} +#endif + static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); @@ -1076,14 +1139,15 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); + base = alloc_dmap_table(); + if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; if (__split_large_page(cpa, kpte, address, base)) - __free_page(base); + free_dmap_table(base); return 0; } @@ -1096,7 +1160,7 @@ static bool try_to_free_pte_page(pte_t *pte) if (!pte_none(pte[i])) return false; - free_page((unsigned long)pte); + free_dmap_table(virt_to_page(pte)); 
return true; } @@ -1108,7 +1172,7 @@ static bool try_to_free_pmd_page(pmd_t *pmd) if (!pmd_none(pmd[i])) return false; - free_page((unsigned long)pmd); + free_dmap_table(virt_to_page(pmd)); return true; } @@ -2535,6 +2599,197 @@ void free_grouped_page(struct grouped_page_cache *gpc, struct page *page) list_lru_add_node(&gpc->lru, &page->lru, page_to_nid(page)); } #endif /* !HIGHMEM */ + +#ifdef CONFIG_PKS_PG_TABLES +#define IS_TABLE_KEY(val) (((val & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0) == PKS_KEY_PG_TABLES) + +static bool is_dmap_protected(unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = init_mm.pgd + pgd_index(addr); + if (!pgd_present(*pgd)) + return true; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d) || (p4d_large(*p4d) && IS_TABLE_KEY(p4d_val(*p4d)))) + return true; + else if (p4d_large(*p4d)) + return false; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud) || (pud_large(*pud) && IS_TABLE_KEY(pud_val(*pud)))) + return true; + else if (pud_large(*pud)) + return false; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd) || (pmd_large(*pmd) && IS_TABLE_KEY(pmd_val(*pmd)))) + return true; + else if (pmd_large(*pmd)) + return false; + + pte = pte_offset_kernel(pmd, addr); + if (!pte_present(*pte) || IS_TABLE_KEY(pte_val(*pte))) + return true; + + return false; +} + +static void ensure_table_protected(unsigned long pfn, void *vaddr, void *vend) +{ + unsigned long addr_table = (unsigned long)__va(pfn << PAGE_SHIFT); + + if (is_dmap_protected(addr_table)) + return; + + if (set_memory_pks(addr_table, 1, PKS_KEY_PG_TABLES)) + pr_warn("Failed to protect page table mapping 0x%pK-0x%pK\n", vaddr, vend); +} + +typedef void (*traverse_cb)(unsigned long pfn, void *vaddr, void *vend); + +/* + * The pXX_page_vaddr() functions are half way done being renamed to pXX_pgtable(), + * leaving no pattern in the names, provide local copies of the missing pXX_pgtable() + * implementations for the time being so they can be used in the template below. + */ + +static inline p4d_t *pgd_pgtable(pgd_t pgd) +{ + return (p4d_t *)pgd_page_vaddr(pgd); +} + +#define TRAVERSE(upper, lower, ptrs_cnt, upper_size, skip) \ +static void traverse_##upper(upper##_t *upper, traverse_cb cb, unsigned long base) \ +{ \ + unsigned long cur_addr = base; \ + upper##_t *cur; \ +\ + if (skip) { \ + traverse_##lower((lower##_t *)upper, cb, cur_addr); \ + return; \ + } \ +\ + for (cur = upper; cur < upper + ptrs_cnt; cur++) { \ + /* \ + * Use native_foo_val() instead of foo_none() because pgd_none() always \ + * return 0 when in 4 level paging. 
\ + */ \ + if (native_##upper##_val(*cur) && !upper##_large(*cur)) { \ + void *vstart = (void *)sign_extend64(cur_addr, __VIRTUAL_MASK_SHIFT); \ + void *vend = vstart + upper_size - 1; \ +\ + cb(upper##_pfn(*cur), vstart, vend); \ + traverse_##lower((lower##_t *)upper##_pgtable(*cur), cb, cur_addr); \ + } \ + cur_addr += upper_size; \ + } \ +} + +static void traverse_pte(pte_t *pte, traverse_cb cb, unsigned long base) { } +TRAVERSE(pmd, pte, PTRS_PER_PMD, PMD_SIZE, false) +TRAVERSE(pud, pmd, PTRS_PER_PUD, PUD_SIZE, false) +TRAVERSE(p4d, pud, PTRS_PER_P4D, P4D_SIZE, !pgtable_l5_enabled()) +TRAVERSE(pgd, p4d, PTRS_PER_PGD, PGDIR_SIZE, false) + +static void traverse_mm(struct mm_struct *mm, traverse_cb cb) +{ + cb(__pa(mm->pgd) >> PAGE_SHIFT, 0, (void *)-1); + traverse_pgd(mm->pgd, cb, 0); +} + +static void free_maybe_reserved(struct page *page) +{ + if (PageReserved(page)) + free_reserved_page(page); + else + __free_page(page); +} + +struct pks_table_llnode { + struct llist_node node; + void *table; +}; + +/* PKS protect reserved dmap tables */ +static int __init init_pks_dmap_tables(void) +{ + static LLIST_HEAD(tables_to_covert); + struct pks_table_llnode *cur_entry; + struct llist_node *cur, *next; + struct pks_table_llnode *tmp; + bool fail_to_build_list = false; + + /* + * If pks tables failed to initialize, return the pages to the page + * allocator, and don't enable dmap tables. + */ + if (!pks_tables_inited()) { + llist_for_each_safe(cur, next, llist_del_all(&tables_cache)) + free_maybe_reserved(virt_to_page(cur)); + return 0; + } + + /* Build separate list of tables */ + llist_for_each_safe(cur, next, llist_del_all(&tables_cache)) { + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + fail_to_build_list = true; + free_maybe_reserved(virt_to_page(cur)); + continue; + } + tmp->table = cur; + llist_add(&tmp->node, &tables_to_covert); + llist_add(cur, &tables_cache); + } + + if (fail_to_build_list) + goto out_err; + + /* + * Tables in tables_cache can now be used, because they are being kept track + * of tables_to_covert. + */ + dmap_tables_inited = true; + + /* + * PKS protect all tables in tables_to_covert. Some of them are also in tables_cache + * and may get used in this process. + */ + while ((cur = llist_del_first(&tables_to_covert))) { + cur_entry = llist_entry(cur, struct pks_table_llnode, node); + set_memory_pks((unsigned long)cur_entry->table, 1, PKS_KEY_PG_TABLES); + kfree(cur_entry); + } + + /* + * It is safe to traverse while the callback ensure_table_protected() may + * change the page tables, because CPA will only split pages and not merge + * them. Any page used for the splits, will have already been protected in + * a previous step, so they will not be missed if tables are mapped by a + * structure that has already been traversed. + */ + traverse_mm(&init_mm, &ensure_table_protected); + + return 0; +out_err: + while ((cur = llist_del_first(&tables_to_covert))) { + cur_entry = llist_entry(cur, struct pks_table_llnode, node); + free_maybe_reserved(virt_to_page(cur)); + kfree(cur_entry); + } + pr_warn("Unable to protect direct map page cache, direct map unprotected.\n"); + return 0; +} + +late_initcall(init_pks_dmap_tables); +#endif /* CONFIG_PKS_PG_TABLES */ + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. 
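A closing note on the patch above (an illustrative calculation, not part of the diff): calc_tables_needed() reserves one PMD-level table per 1 GiB and one PTE-level table per 2 MiB of memory being mapped, which is where the "same memory overhead as mapping the direct map at 4k" statement in the commit message comes from.

/*
 * Worked example for calc_tables_needed(), assuming 4 KiB pages and
 * 4-level paging (no huge p4d/pgd entries, as the BUILD_BUG_ONs check):
 *
 *   size  = 1 GiB
 *   puds  = size >> PUD_SHIFT = 1     PMD-level tables
 *   pmds  = size >> PMD_SHIFT = 512   PTE-level tables
 *   bytes = (1 + 512) << PAGE_SHIFT   roughly 2 MiB of reserved tables
 *                                     per GiB, about 0.2% of the
 *                                     memory being mapped
 */
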
From patchwork Mon Aug 30 23:59:26 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466203 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5F023C432BE for ; Tue, 31 Aug 2021 00:03:37 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id A321F60F4B for ; Tue, 31 Aug 2021 00:03:36 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org A321F60F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 8047 invoked by uid 550); 31 Aug 2021 00:00:41 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7427 invoked from network); 31 Aug 2021 00:00:34 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933749" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933749" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530713028" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 18/19] x86/mm: Add PKS table soft mode Date: Mon, 30 Aug 2021 16:59:26 -0700 Message-Id: <20210830235927.6443-19-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> Some users may not want to treat errant page table writes as fatal, and would prefer to just log the invalid access and continue. Add a "soft" mode for this. Add a config to make always make this the default behavior, and a config to enable it at boot in the absence of the new config. After a single warning, the page tables will be writable, so no warnings will be reported until the next reboot. 
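Condensed from the handler added below (a sketch carrying a _sketch suffix, not the applied hunk): in soft mode the first errant write is logged and the PKS_KEY_PG_TABLES protection is then abandoned so execution continues. Soft mode is selected either by CONFIG_PKS_PG_TABLES_SOFT_ALWAYS or by the pkstablessoft boot parameter, per the hunks in this patch.

/*
 * Sketch of the PKS-tables fault callback: always warn; in soft mode,
 * drop the page table pkey protection and report the fault as handled
 * so the faulting write can be retried and succeed.
 */
static bool pks_tables_fault_sketch(unsigned long addr, bool write)
{
        WARN(1, "Write to protected page table, exploit attempt?");
        if (!pks_tables_soft)
                return false;

        pks_abandon_protections(PKS_KEY_PG_TABLES);
        return true;
}
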
Signed-off-by: Rick Edgecombe --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ arch/x86/include/asm/pgtable.h | 1 + arch/x86/mm/pgtable.c | 16 +++++++++++++++- arch/x86/mm/pkeys.c | 3 +++ mm/Kconfig | 12 ++++++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7902fce7f1da..8bb290fee77f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4254,6 +4254,10 @@ nopti [X86-64] Equivalent to pti=off + nopkstables [X86-64] Disable PKS page table protection + + pkstablessoft [X86-64] Warn instead of oops on pks tables violations + pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 871308c40dac..2e4b4308bd59 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -122,6 +122,7 @@ void pks_tables_check_boottime_disable(void); void enable_pgtable_write(void); void disable_pgtable_write(void); bool pks_tables_inited(void); +bool pks_tables_fault(unsigned long addr, bool write); #else /* CONFIG_PKS_PG_TABLES */ static inline void pks_tables_check_boottime_disable(void) { } static void enable_pgtable_write(void) { } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 69b43097c9da..0dcbd976a91b 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -40,7 +40,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) #ifdef CONFIG_PKS_PG_TABLES static struct grouped_page_cache gpc_pks; static bool __ro_after_init pks_tables_inited_val; - +static bool __ro_after_init pks_tables_soft; struct page *alloc_table_node(gfp_t gfp, int node) { @@ -971,6 +971,16 @@ bool pks_tables_inited(void) return pks_tables_inited_val; } +bool pks_tables_fault(unsigned long addr, bool write) +{ + WARN(1, "Write to protected page table, exploit attempt?"); + if (!pks_tables_soft) + return 0; + + pks_abandon_protections(PKS_KEY_PG_TABLES); + return 1; +} + static int __init pks_page_init(void) { /* @@ -999,6 +1009,10 @@ __init void pks_tables_check_boottime_disable(void) if (cmdline_find_option_bool(boot_command_line, "nopkstables")) return; + if (IS_ENABLED(CONFIG_PKS_PG_TABLES_SOFT_ALWAYS) || + cmdline_find_option_bool(boot_command_line, "pkstablessoft")) + pks_tables_soft = true; + /* * PTI will want to allocate higher order page table pages, which the * PKS table allocator doesn't support. 
So don't attempt to enable PKS diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 48a390722c06..d8df2bb4bbd0 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -247,6 +247,9 @@ static const pks_key_callback pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = { #ifdef CONFIG_DEVMAP_ACCESS_PROTECTION [PKS_KEY_PGMAP_PROTECTION] = pgmap_pks_fault_callback, #endif +#ifdef CONFIG_PKS_PG_TABLES + [PKS_KEY_PG_TABLES] = pks_tables_fault, +#endif }; bool handle_pks_key_callback(unsigned long address, bool write, u16 key) diff --git a/mm/Kconfig b/mm/Kconfig index 0f8e8595a396..1f4fc85cbd2c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -851,6 +851,18 @@ config PKS_PG_TABLES depends on !HIGHMEM && !X86_PAE && SPARSEMEM_VMEMMAP depends on ARCH_HAS_SUPERVISOR_PKEYS +config PKS_PG_TABLES_SOFT_ALWAYS + bool + default y + depends on PKS_PG_TABLES + help + This feature enables PKS tables "soft" mode by default, such that + the first PKS table violation is logged and after that protections + are disabled. This is useful for cases where users would not like + to treat bugs that incorrectly modify page tables as fatal, but would + still like to get notifications of illegitimate attempts to modify + them. + config PERCPU_STATS bool "Collect percpu memory statistics" help From patchwork Mon Aug 30 23:59:27 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Edgecombe, Rick P" X-Patchwork-Id: 12466207 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-16.8 required=3.0 tests=BAYES_00, HEADER_FROM_DIFFERENT_DOMAINS,INCLUDES_CR_TRAILER,INCLUDES_PATCH, MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,URIBL_BLOCKED,USER_AGENT_GIT autolearn=unavailable autolearn_force=no version=3.4.0 Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) by smtp.lore.kernel.org (Postfix) with ESMTP id 795F0C4320E for ; Tue, 31 Aug 2021 00:04:09 +0000 (UTC) Received: from mother.openwall.net (mother.openwall.net [195.42.179.200]) by mail.kernel.org (Postfix) with SMTP id CB6B660F4B for ; Tue, 31 Aug 2021 00:04:08 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org CB6B660F4B Authentication-Results: mail.kernel.org; dmarc=fail (p=none dis=none) header.from=intel.com Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=lists.openwall.com Received: (qmail 8191 invoked by uid 550); 31 Aug 2021 00:00:43 -0000 Mailing-List: contact kernel-hardening-help@lists.openwall.com; run by ezmlm Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-ID: Received: (qmail 7472 invoked from network); 31 Aug 2021 00:00:34 -0000 X-IronPort-AV: E=McAfee;i="6200,9189,10092"; a="197933750" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="197933750" X-IronPort-AV: E=Sophos;i="5.84,364,1620716400"; d="scan'208";a="530713039" From: Rick Edgecombe To: dave.hansen@intel.com, luto@kernel.org, peterz@infradead.org, x86@kernel.org, akpm@linux-foundation.org, keescook@chromium.org, shakeelb@google.com, vbabka@suse.cz, rppt@kernel.org Cc: Rick Edgecombe , linux-mm@kvack.org, linux-hardening@vger.kernel.org, kernel-hardening@lists.openwall.com, ira.weiny@intel.com, dan.j.williams@intel.com, linux-kernel@vger.kernel.org Subject: [RFC PATCH v2 19/19] x86/mm: Add PKS table debug checking Date: Mon, 30 Aug 2021 16:59:27 -0700 Message-Id: <20210830235927.6443-20-rick.p.edgecombe@intel.com> X-Mailer: git-send-email 2.17.1
In-Reply-To: <20210830235927.6443-1-rick.p.edgecombe@intel.com> References: <20210830235927.6443-1-rick.p.edgecombe@intel.com> For PKS tables, add a runtime checker that scans the currently used page tables and verifies that they are all protected on the direct map. Use the recently added page table traverser. There are many possible ways to modify and allocate page tables. In order to catch any missed cases, just traverse the active tables every second and check the direct map protection for each. This feature is intended for debugging only. Another way to do this without the awkward timers would be to check each page while constructing the PTE. That might be useful for enhancing the protection as well, but it could miss strange or incorrect page table modifications hidden away somewhere in the kernel. So for debugging, the scanner is a little more thorough. Signed-off-by: Rick Edgecombe --- arch/x86/mm/pat/set_memory.c | 43 ++++++++++++++++++++++++++++++++++++ mm/Kconfig | 5 +++++ 2 files changed, 48 insertions(+) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 6acf25999b0f..945b3d3e1231 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -2703,6 +2705,45 @@ static void traverse_mm(struct mm_struct *mm, traverse_cb cb) traverse_pgd(mm->pgd, cb, 0); } +#ifdef CONFIG_PKS_PG_TABLES_DEBUG +static void check_table_protected(unsigned long pfn, void *vaddr, void *vend) +{ + if (is_dmap_protected((unsigned long)__va(pfn << PAGE_SHIFT))) + return; + + pr_warn("Found unprotected page, pfn: %lx maps address:0x%p\n", pfn, vaddr); +} + +static int table_scan_fn(void *data) +{ + while (1) { + msleep(MSEC_PER_SEC); + mmap_read_lock(current->active_mm); + traverse_mm(current->active_mm, &check_table_protected); + mmap_read_unlock(current->active_mm); + } + return 0; +} + +static void __init init_pks_table_scan(void) +{ + struct task_struct *thread; + int cpu; + + pr_info("Starting pks_table_debug thread on %d cpus\n", num_online_cpus()); + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + thread = kthread_create_on_cpu(table_scan_fn, NULL, cpu, "pks_table_debug"); + if (IS_ERR(thread)) { + pr_err("Failed to create pks_table_debug threads\n"); + break; + } + wake_up_process(thread); + } +} +#else +static void __init init_pks_table_scan(void) { } +#endif + static void free_maybe_reserved(struct page *page) { if (PageReserved(page)) @@ -2776,6 +2817,8 @@ static int __init init_pks_dmap_tables(void) */ traverse_mm(&init_mm, &ensure_table_protected); + init_pks_table_scan(); + return 0; out_err: while ((cur = llist_del_first(&tables_to_covert))) { diff --git a/mm/Kconfig b/mm/Kconfig index 1f4fc85cbd2c..87a4963c63c6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -863,6 +863,11 @@ config PKS_PG_TABLES_SOFT_ALWAYS still like to get notifications of illegitimate attempts to modify them. +config PKS_PG_TABLES_DEBUG + def_bool y + depends on PKS_PG_TABLES + + config PERCPU_STATS bool "Collect percpu memory statistics" help
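As a companion to the commit message above, here is a rough sketch of the "check each page while constructing the PTE" alternative it mentions, using only is_dmap_protected() from this series. The hook name check_new_table_protected() and its call site are hypothetical; nothing in the series wires this up, and is_dmap_protected() is assumed to be visible from wherever the hook would live.

/*
 * Hypothetical construction-time check: verify that a page about to be
 * installed as a page table is already PKS-protected on the direct map.
 * A real hook would need a call site in the page table allocation or
 * set_p*d() paths, which this series does not add.
 */
static inline void check_new_table_protected(unsigned long pfn)
{
	unsigned long dmap_addr = (unsigned long)__va(pfn << PAGE_SHIFT);

	if (!is_dmap_protected(dmap_addr))
		pr_warn("page table pfn %lx installed without PKS protection\n", pfn);
}

Compared to the once-a-second scanner added above, a hook like this would catch problems at the moment they are introduced, but only at call sites that remember to invoke it, which is exactly the gap the periodic traversal is meant to cover during debugging.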