From patchwork Tue Nov 28 12:50:20 2023
From: Weixi Zhu
Date: Tue, 28 Nov 2023 20:50:20 +0800
Message-ID: <20231128125025.4449-2-weixi.zhu@huawei.com>
In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com>
Subject: [Intel-gfx] [RFC PATCH 1/6] mm/gmem: add heterogeneous NUMA node

This patch adds a new NUMA node state, N_HETEROGENEOUS, which is used to identify heterogeneous NUMA (hNUMA) nodes. Note that an hNUMA node may not be directly accessible by the CPU. Each hNUMA node is identified by a NUMA id. This can be extended to describe NUMA topology that includes device-local DRAM, where a cache-coherent bus need not exist between the CPU and the device-local DRAM. Furthermore, it allows an application to issue memory hints that bind to specific hNUMA nodes.
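For illustration, a minimal kernel-side sketch (not part of this patch) of how the new state could be queried, assuming the N_HETEROGENEOUS state and the num_hnodes()/for_each_hnode() helpers introduced below; dump_hnuma_topology() is a hypothetical caller:

#include <linux/nodemask.h>
#include <linux/printk.h>
#include <linux/sched.h>

/*
 * Walk every node currently marked N_HETEROGENEOUS and report whether the
 * calling task is allowed to place memory on it.
 */
static void dump_hnuma_topology(void)
{
	int nid;

	pr_info("gmem: %d heterogeneous NUMA node(s)\n", num_hnodes());

	for_each_hnode(nid)
		pr_info("gmem: hNUMA node %d is%s in current->mems_allowed\n",
			nid, node_isset(nid, current->mems_allowed) ? "" : " not");
}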
Signed-off-by: Weixi Zhu --- drivers/base/node.c | 6 ++++ include/linux/gmem.h | 19 ++++++++++ include/linux/nodemask.h | 10 ++++++ init/main.c | 2 ++ mm/Kconfig | 14 ++++++++ mm/Makefile | 1 + mm/gmem.c | 78 ++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 3 ++ 8 files changed, 133 insertions(+) create mode 100644 include/linux/gmem.h create mode 100644 mm/gmem.c diff --git a/drivers/base/node.c b/drivers/base/node.c index 493d533f8375..aa4d2ca266aa 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -928,6 +928,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -940,6 +943,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 000000000000..fff877873557 --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#ifdef CONFIG_GMEM +/* h-NUMA topology */ +void __init hnuma_init(void); +#else +static inline void hnuma_init(void) {} +#endif + +#endif /* _GMEM_H */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1..66e4640a52ba 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,9 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif NR_NODE_STATES }; @@ -536,6 +539,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM +/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. 
* NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/init/main.c b/init/main.c index e24b0780fdff..12dfb5b63d51 100644 --- a/init/main.c +++ b/init/main.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include @@ -901,6 +902,7 @@ void start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); + hnuma_init(); pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b60..1a7d8194513c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1270,6 +1270,20 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP +config GMEM + bool "generalized memory management for external memory devices" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + default y + help + Supporting GMEM (generalized memory management) for external memory + devices + + GMEM extends Linux MM to share its machine-independent MM code. Only + high-level interface is provided for device drivers. This prevents + accelerator drivers from reinventing the wheel, but relies on drivers to + implement their hardware-dependent functions declared by GMEM. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 33873c8aedb3..f48ea2eb4a44 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_GMEM) += gmem.o diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 000000000000..767eb070b22e --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Weixi Zhu + * + */ +#include +#include + +DEFINE_SPINLOCK(hnode_lock); + +struct hnode { + unsigned int id; + struct gm_dev *dev; + struct xarray pages; +}; + +struct hnode *hnodes[MAX_NUMNODES]; + +static bool is_hnode(int node) +{ + return !node_isset(node, node_possible_map) && + node_isset(node, hnode_map); +} + +static bool is_hnode_allowed(int node) +{ + return is_hnode(node) && node_isset(node, current->mems_allowed); +} + +static struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +static unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +static void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +static void hnode_init(struct hnode *hnode, unsigned int hnid, + struct gm_dev *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + xa_init(&hnodes[hnid]->pages); +} + +static void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 733732e7e0ba..a785b62a1542 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -192,6 +192,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM From patchwork Tue Nov 28 12:50:21 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: zhuweixi X-Patchwork-Id: 13471171 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 1F3F8C07CA9 for ; Tue, 28 Nov 2023 13:53:33 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 2ADF210E537; Tue, 28 Nov 2023 13:53:29 +0000 (UTC) X-Greylist: delayed 991 seconds by postgrey-1.36 at gabe; Tue, 28 Nov 2023 13:07:15 UTC Received: from szxga01-in.huawei.com (szxga01-in.huawei.com [45.249.212.187]) by gabe.freedesktop.org (Postfix) with ESMTPS id F1AEF10E066; Tue, 28 Nov 2023 13:07:15 +0000 (UTC) Received: from kwepemm000018.china.huawei.com (unknown [172.30.72.55]) by szxga01-in.huawei.com (SkyGuard) with ESMTP id 4Sfj3S74D2zvRDD; Tue, 28 Nov 2023 20:50:12 +0800 (CST) Received: from DESKTOP-RAUQ1L5.china.huawei.com (10.174.179.172) by kwepemm000018.china.huawei.com (7.193.23.4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Tue, 28 Nov 2023 20:50:41 +0800 From: Weixi Zhu To: , , Date: Tue, 28 Nov 2023 20:50:21 +0800 Message-ID: <20231128125025.4449-3-weixi.zhu@huawei.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com> References: <20231128125025.4449-1-weixi.zhu@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.174.179.172] X-ClientProxiedBy: 
Subject: [Intel-gfx] [RFC PATCH 2/6] mm/gmem: add arch-independent abstraction to track address mapping status

This patch adds an abstraction layer, struct vm_object, that maintains per-process virtual-to-physical mapping status stored in struct gm_mapping. For example, a virtual page may be mapped to a CPU physical page or to a device physical page. Struct vm_object effectively maintains an arch-independent page table, termed a "logical page table", while the arch-dependent page table used by a real MMU is termed a "physical page table". The logical page table is useful if Linux core MM is extended to handle a unified virtual address space with external accelerators using customized MMUs.

In this patch, struct vm_object uses a radix tree (xarray) to track where a virtual page is mapped. This adds extra memory consumption for the xarray, but provides a clean abstraction that isolates mapping status from the machine-dependent layer (PTEs). Besides supporting accelerators with external MMUs, struct vm_object is planned to be further unified with i_pages in struct address_space for file-backed memory.

The idea of struct vm_object originates from the FreeBSD VM design, which provides a unified abstraction for anonymous memory, file-backed memory, the page cache, etc. [1]

Currently, Linux uses a set of hierarchical page-walk functions to abstract page table manipulation across different CPU architectures. The problem arises when a device wants to reuse Linux MM code to manage its page table -- the device page table may not be accessible to the CPU. Existing solutions such as Linux HMM use the MMU notifier mechanism to invoke device-specific MMU functions, but rely on encoding the mapping status in the CPU page table entries. This entangles machine-independent code with machine-dependent code, and also brings unnecessary restrictions. The PTE size and format vary from arch to arch, which harms extensibility.

[1] https://docs.freebsd.org/en/articles/vm-design/

Signed-off-by: Weixi Zhu
---
 include/linux/gmem.h | 120 +++++++++++++++++++++
 include/linux/mm_types.h | 4 +
 mm/Makefile | 2 +-
 mm/vm_object.c | 184 +++++++++++++++++++++++++++++++++++
 4 files changed, 309 insertions(+), 1 deletion(-)
 create mode 100644 mm/vm_object.c

diff --git a/include/linux/gmem.h b/include/linux/gmem.h
index fff877873557..529ff6755a99 100644
--- a/include/linux/gmem.h
+++ b/include/linux/gmem.h
@@ -9,11 +9,131 @@
 #ifndef _GMEM_H
 #define _GMEM_H
+#include
+
 #ifdef CONFIG_GMEM
+
+#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number.
*/ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_WILLNEED 0x80 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +struct gm_mapping { + unsigned int flag; + + union { + struct page *page; /* CPU node */ + struct gm_dev *dev; /* hetero-node. TODO: support multiple devices */ + unsigned long pfn; + }; + + struct mutex lock; +}; + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_PAGE_TYPE_MASK) + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + /* h-NUMA topology */ void __init hnuma_init(void); + +/* vm object */ +/* + * Each per-process vm_object tracks the mapping status of virtual pages from + * all VMAs mmap()-ed with MAP_PRIVATE | MAP_PEER_SHARED. + */ +struct vm_object { + spinlock_t lock; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; +}; + +int __init vm_object_init(void); +struct vm_object *vm_object_create(struct mm_struct *mm); +void vm_object_drop_locked(struct mm_struct *mm); + +struct gm_mapping *alloc_gm_mapping(void); +void free_gm_mappings(struct vm_area_struct *vma); +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); +void vm_object_mapping_create(struct vm_object *obj, unsigned long start); +void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, + unsigned long end); #else static inline void hnuma_init(void) {} +static inline void __init vm_object_init(void) +{ +} +static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void vm_object_drop_locked(struct vm_area_struct *vma) +{ +} +static inline struct gm_mapping *alloc_gm_mapping(void) +{ + return NULL; +} +static inline void free_gm_mappings(struct vm_area_struct *vma) +{ +} +static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, + unsigned long va) +{ + return NULL; +} +static inline void vm_object_mapping_create(struct vm_object *obj, + unsigned long start) +{ +} +static inline void unmap_gm_mappings_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ +} +static inline void munmap_in_peer_devices(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ +} #endif #endif /* _GMEM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..4e50dc019d75 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,6 +31,7 @@ struct address_space; struct mem_cgroup; +struct vm_object; /* * Each physical page in the system has a struct page associated with @@ -974,6 +975,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN */ +#ifdef 
CONFIG_GMEM
+	struct vm_object *vm_obj;
+#endif
 } __randomize_layout;

 /*
diff --git a/mm/Makefile b/mm/Makefile
index f48ea2eb4a44..d2dfab012c96 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,4 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
-obj-$(CONFIG_GMEM) += gmem.o
+obj-$(CONFIG_GMEM) += gmem.o vm_object.o
diff --git a/mm/vm_object.c b/mm/vm_object.c
new file mode 100644
index 000000000000..4e76737e0ca1
--- /dev/null
+++ b/mm/vm_object.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generalized Memory Management.
+ *
+ * Copyright (C) 2023- Huawei, Inc.
+ * Author: Weixi Zhu
+ *
+ */
+#include
+#include
+
+/*
+ * Since VM_OBJECT maintains the logical page table under each VMA, and each VMA
+ * points to a VM_OBJECT, VM_OBJECTs must be kept consistent whenever a VMA
+ * gets changed: merge, split, adjust.
+ */
+static struct kmem_cache *vm_object_cachep;
+static struct kmem_cache *gm_mapping_cachep;
+
+static inline void release_gm_mapping(struct gm_mapping *mapping)
+{
+	kmem_cache_free(gm_mapping_cachep, mapping);
+}
+
+static inline struct gm_mapping *lookup_gm_mapping(struct vm_object *obj,
+						   unsigned long pindex)
+{
+	return xa_load(obj->logical_page_table, pindex);
+}
+
+int __init vm_object_init(void)
+{
+	vm_object_cachep = KMEM_CACHE(vm_object, 0);
+	if (!vm_object_cachep)
+		goto out;
+
+	gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0);
+	if (!gm_mapping_cachep)
+		goto free_vm_object;
+
+	return 0;
+free_vm_object:
+	kmem_cache_destroy(vm_object_cachep);
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Create a VM_OBJECT and attach it to a mm_struct.
+ * This should be called when a task_struct is created.
+ */
+struct vm_object *vm_object_create(struct mm_struct *mm)
+{
+	struct vm_object *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL);
+
+	if (!obj)
+		return NULL;
+
+	spin_lock_init(&obj->lock);
+
+	/*
+	 * The logical page table maps va >> PAGE_SHIFT
+	 * to pointers of struct gm_mapping.
+	 */
+	obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL);
+	if (!obj->logical_page_table) {
+		kmem_cache_free(vm_object_cachep, obj);
+		return NULL;
+	}
+
+	xa_init(obj->logical_page_table);
+	atomic_set(&obj->nr_pages, 0);
+
+	return obj;
+}
+
+/* This should be called when a mm no longer refers to a VM_OBJECT */
+void vm_object_drop_locked(struct mm_struct *mm)
+{
+	struct vm_object *obj = mm->vm_obj;
+
+	if (!obj)
+		return;
+
+	/*
+	 * We must enter this with VMA write-locked, which is unfortunately a
+	 * giant lock.
+ */ + mmap_assert_write_locked(mm); + mm->vm_obj = NULL; + + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); +} + +/* + * Given a VA, the page_index is computed by + * page_index = address >> PAGE_SHIFT + */ +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) +{ + return lookup_gm_mapping(obj, va >> PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(struct vm_object *obj, unsigned long start) +{ + + unsigned long index = start >> PAGE_SHIFT; + struct gm_mapping *gm_mapping; + + if (!obj) + return; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +/* gm_mapping will not be release dynamically */ +struct gm_mapping *alloc_gm_mapping(void) +{ + struct gm_mapping *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} + +/* This should be called when a PEER_SHAERD vma is freed */ +void free_gm_mappings(struct vm_area_struct *vma) +{ + struct gm_mapping *gm_mapping; + struct vm_object *obj; + + obj = vma->vm_mm->vm_obj; + if (!obj) + return; + + XA_STATE(xas, obj->logical_page_table, vma->vm_start >> PAGE_SHIFT); + + xa_lock(obj->logical_page_table); + xas_for_each(&xas, gm_mapping, vma->vm_end >> PAGE_SHIFT) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(obj->logical_page_table); +} + +void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + struct xarray *logical_page_table; + struct gm_mapping *gm_mapping; + struct page *page = NULL; + + if (!vma->vm_mm->vm_obj) + return; + + logical_page_table = vma->vm_mm->vm_obj->logical_page_table; + if (!logical_page_table) + return; + + XA_STATE(xas, logical_page_table, start >> PAGE_SHIFT); + + xa_lock(logical_page_table); + xas_for_each(&xas, gm_mapping, end >> PAGE_SHIFT) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(logical_page_table); +} From patchwork Tue Nov 28 12:50:22 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: zhuweixi X-Patchwork-Id: 13471176 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 3DA0FC4167B for ; Tue, 28 Nov 2023 13:53:41 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 8B99610E53E; Tue, 28 Nov 2023 13:53:31 +0000 (UTC) Received: from szxga02-in.huawei.com (szxga02-in.huawei.com [45.249.212.188]) by gabe.freedesktop.org (Postfix) with ESMTPS id C4B8810E518; Tue, 28 Nov 2023 13:07:56 +0000 (UTC) Received: from kwepemm000018.china.huawei.com (unknown [172.30.72.56]) by szxga02-in.huawei.com (SkyGuard) with ESMTP id 4Sfj3G6KwCzWhrR; Tue, 28 Nov 2023 20:50:02 +0800 (CST) Received: from DESKTOP-RAUQ1L5.china.huawei.com (10.174.179.172) by kwepemm000018.china.huawei.com (7.193.23.4) with Microsoft SMTP Server (version=TLS1_2, 
From: Weixi Zhu
Date: Tue, 28 Nov 2023 20:50:22 +0800
Message-ID: <20231128125025.4449-4-weixi.zhu@huawei.com>
In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com>
Subject: [Intel-gfx] [RFC PATCH 3/6] mm/gmem: add GMEM (Generalized Memory Management) interface for external accelerators

Accelerator driver developers are forced to reinvent external MM subsystems case by case, introducing redundant code (14K~70K lines for each case). This is because Linux core MM only considers host memory resources. At the same time, application developers suffer from poor programmability -- they must manage parallel address spaces and be careful about the limited device DRAM capacity.

This patch adds the GMEM interface to help accelerator drivers directly reuse Linux core MM, preventing them from reinventing the wheel. Drivers that use the GMEM interface can directly support a unified virtual address space for application users -- memory allocated with malloc()/mmap() can be used by both the CPU and accelerators, providing a coherent view of memory.

The GMEM device interface, prefixed with "gm_dev", is used to decouple accelerator-specific operations. A device driver should invoke gm_dev_create() to register a device instance at device boot time. A device-specific implementation of "struct gm_mmu" must be provided, so that Linux can invoke hardware-related functions at the right time. If the driver wants Linux to take charge of the accelerator's local DRAM, it should register the range of physical addresses to be managed via gm_dev_register_physmem().

The GMEM address space interface, prefixed with "gm_as", is used to connect a device context with a CPU context, i.e. an mm_struct. Struct gm_as is created as a unified address space that not only includes a CPU context, but may also include one or more device contexts. A device driver should use gm_as_attach() to add a device context to a created struct gm_as. gm_dev_fault() can then serve as a generic device page fault handler. It is important that a device driver invokes gm_as_attach() at the beginning of a CPU program; this invocation can happen inside an ioctl() call when a device context is initialized.
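To make the intended call flow concrete, here is a minimal driver-side sketch (illustrative only, not part of this patch). The gm_* calls and GM_* constants are the ones introduced by this series; my_gdev, the my_* callbacks and the VRAM range are hypothetical driver-specific stand-ins:

#include <linux/gmem.h>
#include <linux/mm.h>
#include <linux/sched.h>

static struct gm_dev *my_gdev;

/* Hypothetical hardware hooks; a real driver programs its own MMU here. */
static int my_peer_map(struct gm_fault_t *gmf)   { return GM_RET_SUCCESS; }
static int my_peer_unmap(struct gm_fault_t *gmf) { return GM_RET_SUCCESS; }
static int my_pmap_create(struct gm_dev *dev, void **pmap) { *pmap = NULL; return 0; }
static int my_pmap_destroy(void *pmap) { return 0; }

static struct gm_mmu my_mmu = {
	.peer_map     = my_peer_map,
	.peer_unmap   = my_peer_unmap,
	.pmap_create  = my_pmap_create,
	.pmap_destroy = my_pmap_destroy,
};

/* Device boot time: register the device and its local DRAM with GMEM. */
static int my_driver_probe(unsigned long vram_start, unsigned long vram_end)
{
	int ret;

	ret = gm_dev_create(&my_mmu, NULL, GM_DEV_CAP_REPLAYABLE, &my_gdev);
	if (ret != GM_RET_SUCCESS)
		return -ENODEV;

	return gm_dev_register_physmem(my_gdev, vram_start, vram_end);
}

/* Called from an ioctl() when a process initializes a device context. */
static int my_driver_open_ctx(void)
{
	struct gm_context *ctx;
	struct gm_as *as;
	int ret;

	ret = gm_as_create(0, TASK_SIZE, &as);
	if (ret != GM_RET_SUCCESS)
		return -ENOMEM;

	/* Attach this device; its page faults are then routed to GMEM. */
	return gm_as_attach(as, my_gdev, GM_MMU_MODE_COHERENT_EXCLUSIVE,
			    true, &ctx);
}

On a device page fault, the driver's fault handler would then forward the faulting mm and address to gm_dev_fault(), which consults the logical page table and calls back into the registered gm_mmu operations.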
Signed-off-by: Weixi Zhu --- include/linux/gmem.h | 196 +++++++++++++++++++ include/linux/mm_types.h | 1 + mm/gmem.c | 408 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 605 insertions(+) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 529ff6755a99..f424225daa03 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -13,6 +13,35 @@ #ifdef CONFIG_GMEM +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +struct gm_dev { + int id; + + /* + * TODO: define more device capabilities and consider different device + * base page sizes + */ + unsigned long capability; + struct gm_mmu *mmu; + void *dev_data; + /* A device may support time-sliced context switch. */ + struct gm_context *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; +}; + #define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ #define GM_PAGE_DEVICE 0x20 #define GM_PAGE_NOMAP 0x40 @@ -96,7 +125,161 @@ void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end); + +/* core gmem */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Share a physical page table with another attached + * device's MMU, requiring one of the attached MMUs to be compatible. For + * example, the IOMMU is compatible with the CPU MMU on most modern machines. + * This mode requires the device physical memory to be cache-coherent. + * TODO: add MMU cookie to detect compatible MMUs. + * + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger + * fault-driven migration for automatic data locality optimizations. + * This mode does not require a cache-coherent link between the CPU and device. + * + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates + * physical mapping entries whenever a physical mapping is installed inside the + * address space, so that it may minimize the page faults to be triggered by + * this device. + * This mode requires the device physical memory to be cache-coherent. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +enum gm_fault_hint { + GM_FAULT_HINT_MARK_HOT, + /* + * TODO: introduce other fault hints, e.g. read-only duplication, map + * remotely instead of migrating. + */ +}; + +/* Parameter list for peer_map/peer_unmap mmu functions. */ +struct gm_fault_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long va; + unsigned long size; + unsigned long prot; + bool copy; /* Set dma_addr with a valid address if true */ + dma_addr_t dma_addr; + enum gm_fault_hint hint; +}; + +/** + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. 
+ */ +struct gm_mmu { + /* + * TODO: currently the device is assumed to support the same base page + * size and huge page size as the host, which is not necessarily the + * fact. Consider customized page sizes and add MMU cookie to identify + * compatible MMUs which can share page tables. + */ + + /* Synchronize VMA in a peer OS to interact with the host OS */ + int (*peer_va_alloc_fixed)(struct mm_struct *mm, unsigned long va, + unsigned long size, unsigned long prot); + int (*peer_va_free)(struct mm_struct *mm, unsigned long va, + unsigned long size); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + int (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + int (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + int (*pmap_create)(struct gm_dev *dev, void **pmap); + int (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + int (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, + unsigned long pa, unsigned long prot); + int (*pmap_release)(void *pmap, unsigned long va, unsigned long size); + + /* Change the protection of a virtual page */ + int (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, + unsigned long new_prot); + + /* Invalidation functions of the MMU TLB */ + int (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); + int (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * gm dev cap defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + struct gm_as *as; + struct gm_dev *dev; + void *pmap; + /* List of device contexts with the same struct gm_dev */ + struct list_head gm_dev_link; + + /* List of device contexts within the same address space */ + struct list_head gm_as_link; +}; + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); + +/* GMEM Device KPI */ +int gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev); +int gm_dev_destroy(struct gm_dev *dev); +int gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +int gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + enum gm_fault_hint hint); + +/* Defines an address space. 
*/ +struct gm_as { + spinlock_t lock; /* spinlock of struct gm_as */ + unsigned long start_va; + unsigned long end_va; + + struct list_head gm_ctx_list; /* tracks device contexts attached to this va space, using gm_as_link */ +}; + +/* GMEM address space KPI */ +int gm_as_create(unsigned long begin, unsigned long end, struct gm_as **new_as); +int gm_as_destroy(struct gm_as *as); +int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx); #else +static inline bool gmem_is_enabled(void) { return false; } static inline void hnuma_init(void) {} static inline void __init vm_object_init(void) { @@ -134,6 +317,19 @@ static inline void munmap_in_peer_devices(struct mm_struct *mm, unsigned long end) { } +int gm_as_create(unsigned long begin, unsigned long end, struct gm_as **new_as) +{ + return 0; +} +int gm_as_destroy(struct gm_as *as) +{ + return 0; +} +int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + return 0; +} #endif #endif /* _GMEM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4e50dc019d75..ade2b6aee0f3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -977,6 +977,7 @@ struct mm_struct { #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_GMEM struct vm_object *vm_obj; + struct gm_as *gm_as; #endif } __randomize_layout; diff --git a/mm/gmem.c b/mm/gmem.c index 767eb070b22e..b95b6b42ed6d 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -8,6 +8,17 @@ */ #include #include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; DEFINE_SPINLOCK(hnode_lock); @@ -66,6 +77,7 @@ static void hnode_init(struct hnode *hnode, unsigned int hnid, hnodes[hnid] = hnode; hnodes[hnid]->id = hnid; hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); xa_init(&hnodes[hnid]->pages); } @@ -73,6 +85,402 @@ static void hnode_deinit(unsigned int hnid, struct gm_dev *dev) { hnodes[hnid]->id = 0; hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); xa_destroy(&hnodes[hnid]->pages); hnodes[hnid] = NULL; } + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + err = vm_object_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | + WQ_CPU_INTENSIVE, + GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + (void)kstrtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. 
+ * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. + */ +int gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev) +{ + struct gm_dev *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +int gm_dev_destroy(struct gm_dev *dev) +{ + /* TODO: implement it */ + xa_erase(&gm_dev_id_pool, dev->id); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_destroy); + +int gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + enum gm_fault_hint hint) +{ + int ret = GM_RET_SUCCESS; + struct gm_mmu *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + struct vm_object *obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .hint = hint + }; + struct page *page = NULL; + + mmap_read_lock(mm); + obj = mm->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = mm->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (hint == GM_FAULT_HINT_MARK_HOT) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping->dev = dev; +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & ((1 << order) - 1); + struct vm_object *obj = vma->vm_mm->vm_obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_dev *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +int gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end) +{ + struct gm_mapping *mapping; + unsigned long addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + /* + * TODO: replace the xarray bookkeeping code with an isolated buddy + * allocator here. Implement customized device page struct, which is + * trimmed for application-level usage. 
+ */ + mapping = kvmalloc(sizeof(struct gm_mapping) * page_num, GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, GFP_KERNEL))) { + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + struct gm_mapping *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +/* GMEM Virtual Address Space API */ +int gm_as_create(unsigned long begin, unsigned long end, struct gm_as **new_as) +{ + struct gm_as *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->lock); + as->start_va = begin; + as->end_va = end; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +int gm_as_destroy(struct gm_as *as) +{ + struct gm_context *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + struct gm_context *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, &ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively flushing the CR3 value. + * Currently we do not care time-sliced context switch, unless someone wants to support it. 
+		 */
+		dev->current_ctx = ctx;
+	}
+	*out_ctx = ctx;
+
+	for_each_node_mask(nid, dev->registered_hnodes)
+		node_set(nid, current->mems_allowed);
+	return GM_RET_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(gm_as_attach);

From patchwork Tue Nov 28 12:50:23 2023
From: Weixi Zhu
Date: Tue, 28 Nov 2023 20:50:23 +0800
Message-ID: <20231128125025.4449-5-weixi.zhu@huawei.com>
In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com>
Subject: [Intel-gfx] [RFC PATCH 4/6] mm/gmem: add new syscall hmadvise() to issue memory hints for heterogeneous NUMA nodes

This patch adds a new syscall, hmadvise(), to issue memory hints for heterogeneous NUMA nodes. The new syscall effectively extends madvise() with one additional argument that indicates the NUMA id of a heterogeneous device, which is not necessarily accessible by the CPU. The implemented memory hint is MADV_PREFETCH, which guarantees that the physical data backing the given range [VA, VA+size) is migrated to the designated NUMA node, so that subsequent accesses from the corresponding device run at local memory access speed.
This prefetch hint is internally parallized with multiple workqueue threads, allowing the page table management to be overlapped. In a test with Huawei's Ascend NPU card, the MADV_PREFETCH is able to saturate the host-device bandwidth if the given VMA size is larger than 16MB. Signed-off-by: Weixi Zhu --- arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + include/linux/gmem.h | 9 + include/uapi/asm-generic/mman-common.h | 3 + include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 2 + mm/gmem.c | 222 ++++++++++++++++++++++++ tools/include/uapi/asm-generic/unistd.h | 5 +- 8 files changed, 247 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 531effca5f1f..298313d2e0af 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 457 +#define __NR_compat_syscalls 458 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 9f7c1bf99526..0d44383b98be 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -919,6 +919,8 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_hmadvise 457 +__SYSCALL(__NR_hmadvise, sys_hmadvise) /* * Please add new compat syscalls above this comment and update diff --git a/include/linux/gmem.h b/include/linux/gmem.h index f424225daa03..97186f29638d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -22,6 +22,11 @@ static inline bool gmem_is_enabled(void) return static_branch_likely(&gmem_status); } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} + struct gm_dev { int id; @@ -280,6 +285,10 @@ int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, bool activate, struct gm_context **out_ctx); #else static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} static inline void hnuma_init(void) {} static inline void __init vm_object_init(void) { diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..49b22a497c5d 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_PREFETCH 26 /* prefetch pages for hNUMA node */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 756b013fb832..a0773d4f7fa5 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -829,8 +829,11 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_hmadvise 453 +__SYSCALL(__NR_hmadvise, sys_hmadvise) + #undef __NR_syscalls -#define __NR_syscalls 457 +#define __NR_syscalls 458 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e1a6e3c675c0..73bc1b35b8c6 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -374,3 +374,5 @@ COND_SYSCALL(setuid16); /* restartable sequence */ 
COND_SYSCALL(rseq); + +COND_SYSCALL(hmadvise); diff --git a/mm/gmem.c b/mm/gmem.c index b95b6b42ed6d..4eb522026a0d 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include DEFINE_STATIC_KEY_FALSE(gmem_status); EXPORT_SYMBOL_GPL(gmem_status); @@ -484,3 +486,223 @@ int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, return GM_RET_SUCCESS; } EXPORT_SYMBOL_GPL(gm_as_attach); + +struct prefetch_data { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* + * Pass a hint to tell gm_dev_fault() to invoke peer_map anyways + * and implicitly mark the mapped physical page as recently-used. + */ + ret = gm_dev_fault(d->mm, addr, d->dev, GM_FAULT_HINT_MARK_HOT); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + + end = round_up(addr + size, page_size); + start = round_down(addr, page_size); + size = end - start; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + data->size = (end - start < 2 * per_size) ? (end - start) : per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int gm_unmap_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int page_size) +{ + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + struct gm_mapping *gm_mapping; + struct vm_object *obj; + int ret; + + obj = current->mm->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + return -EINVAL; + } + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. 
ret %d\n", + ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + unsigned long start, end, i_start, i_end; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long old_start; + + if (check_add_overflow(addr, size, &end)) + return -EINVAL; + + old_start = addr; + + end = round_down(addr + size, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + mmap_read_lock(current->mm); + do { + vma = find_vma_intersection(current->mm, start, end); + if (!vma) { + pr_info("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + i_start = start > vma->vm_start ? start : vma->vm_start; + i_end = end < vma->vm_end ? end : vma->vm_end; + ret = gm_unmap_page_range(vma, i_start, i_end, page_size); + if (ret) + break; + + start = vma->vm_end; + } while (start < end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +SYSCALL_DEFINE4(hmadvise, int, hnid, unsigned long, start, size_t, len_in, int, behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 76d946445391..6d28d7a4096c 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -823,8 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat) #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_hmadvise 453 +__SYSCALL(__NR_hmadvise, sys_hmadvise) + #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 454 /* * 32 bit systems traditionally used different From patchwork Tue Nov 28 12:50:24 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: zhuweixi X-Patchwork-Id: 13471172 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id EF8A1C4167B for ; Tue, 28 Nov 2023 13:53:34 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 67BD910E538; Tue, 28 Nov 2023 13:53:29 +0000 (UTC) Received: from szxga03-in.huawei.com (szxga03-in.huawei.com [45.249.212.189]) by gabe.freedesktop.org (Postfix) with ESMTPS 
id E9E4110E066; Tue, 28 Nov 2023 13:09:20 +0000 (UTC) Received: from kwepemm000018.china.huawei.com (unknown [172.30.72.57]) by szxga03-in.huawei.com (SkyGuard) with ESMTP id 4Sfhyb0vfNzMnS3; Tue, 28 Nov 2023 20:45:59 +0800 (CST) Received: from DESKTOP-RAUQ1L5.china.huawei.com (10.174.179.172) by kwepemm000018.china.huawei.com (7.193.23.4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Tue, 28 Nov 2023 20:50:47 +0800 From: Weixi Zhu To: , , Date: Tue, 28 Nov 2023 20:50:24 +0800 Message-ID: <20231128125025.4449-6-weixi.zhu@huawei.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com> References: <20231128125025.4449-1-weixi.zhu@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.174.179.172] X-ClientProxiedBy: dggems705-chm.china.huawei.com (10.3.19.182) To kwepemm000018.china.huawei.com (7.193.23.4) X-CFilter-Loop: Reflected X-Mailman-Approved-At: Tue, 28 Nov 2023 13:53:26 +0000 Subject: [Intel-gfx] [RFC PATCH 5/6] mm/gmem: resolve VMA conflicts for attached peer devices X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: dri-devel@lists.freedesktop.org, leonro@nvidia.com, apopple@nvidia.com, amd-gfx@lists.freedesktop.org, mgorman@suse.de, ziy@nvidia.com, rcampbell@nvidia.com, jgg@nvidia.com, weixi.zhu@openeuler.sh, jhubbard@nvidia.com, intel-gfx@lists.freedesktop.org, mhairgrove@nvidia.com, jglisse@redhat.com, rodrigo.vivi@intel.com, intel-gvt-dev@lists.freedesktop.org, Felix.Kuehling@amd.com, Xinhui.Pan@amd.com, christian.koenig@amd.com, alexander.deucher@amd.com, ogabbay@kernel.org, Weixi Zhu Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" This patch resolves potential VMA conflicts when mmap(MAP_PRIVATE | MAP_PEER_SHARED) is invoked. Note that the semantic of mmap(MAP_PRIVATE | MAP_PEER_SHARED) is to provide a coherent view of memory through the allocated virtual addresses between the CPU and all attached devices. However, an attached device may create its own computing context that does not necessarily share the same address space layout with the CPU process. Therefore, the mmap() syscall must return virtual addresses that are guaranteed to be valid across all attached peer devices. In current implementation, if a candidate VMA is detected to be conflicting, it will be temporarily blacklisted. The mmap_region() function will retry other VMA candidates for a predefined number of iterations. 
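For illustration, a minimal userspace sketch of the intended semantics follows. The MAP_PEER_SHARED flag value and the hmadvise() syscall number below are the ones proposed in this series; the MADV_PREFETCH value and the hNUMA node id are placeholders chosen for the example only and are not part of any released ABI.

/*
 * Illustrative sketch only: exercises mmap(MAP_PRIVATE | MAP_PEER_SHARED)
 * and the proposed hmadvise() syscall. MAP_PEER_SHARED and __NR_hmadvise
 * match this series; MADV_PREFETCH is a placeholder value and hnid 1 is
 * an assumed heterogeneous NUMA node id.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MAP_PEER_SHARED	0x8000000	/* from this series */
#define __NR_hmadvise	453		/* from this series */
#define MADV_PREFETCH	26		/* placeholder, see the earlier hmadvise patch */

int main(void)
{
	size_t len = 2UL << 20;	/* one 2MB huge page */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED,
			 -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0x5a, len);	/* CPU first touch populates host memory */

	/* Migrate the range to the device represented by hNUMA node 1. */
	syscall(__NR_hmadvise, 1, (unsigned long)buf, len, MADV_PREFETCH);

	/* Eagerly free both CPU and device copies; hnid is ignored (-1). */
	syscall(__NR_hmadvise, -1, (unsigned long)buf, len, MADV_DONTNEED);

	munmap(buf, len);
	return 0;
}

If no virtual range acceptable to every attached device is found after GMEM_MMAP_RETRY_TIMES retries, mmap() fails with -ENOMEM, so callers should not assume MAP_PEER_SHARED requests always succeed.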
Signed-off-by: Weixi Zhu --- fs/proc/task_mmu.c | 3 ++ include/linux/gmem.h | 26 +++++++++++++++- include/linux/mm.h | 8 +++++ include/uapi/asm-generic/mman-common.h | 1 + kernel/fork.c | 4 +++ mm/gmem.c | 38 ++++++++++++++++++++++++ mm/mempolicy.c | 4 +++ mm/mmap.c | 38 ++++++++++++++++++++++-- mm/vm_object.c | 41 ++++++++++++++++++++++++++ 9 files changed, 159 insertions(+), 4 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef2eb12906da..5af03d8f0319 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -701,6 +701,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", +#endif +#ifdef CONFIG_GMEM + [ilog2(VM_PEER_SHARED)] = "ps", #endif }; size_t i; diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 97186f29638d..82d88df5ce44 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -24,7 +24,10 @@ static inline bool gmem_is_enabled(void) static inline bool vma_is_peer_shared(struct vm_area_struct *vma) { - return false; + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); } struct gm_dev { @@ -130,6 +133,8 @@ void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end); +void gm_reserve_vma(struct vm_area_struct *value, struct list_head *head); +void gm_release_vma(struct mm_struct *mm, struct list_head *head); /* core gmem */ enum gm_ret { @@ -283,6 +288,10 @@ int gm_as_create(unsigned long begin, unsigned long end, struct gm_as **new_as); int gm_as_destroy(struct gm_as *as); int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, bool activate, struct gm_context **out_ctx); + +int gm_alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, + unsigned long len, vm_flags_t vm_flags); #else static inline bool gmem_is_enabled(void) { return false; } static inline bool vma_is_peer_shared(struct vm_area_struct *vma) @@ -339,6 +348,21 @@ int gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, { return 0; } +static inline void gm_reserve_vma(struct vm_area_struct *value, + struct list_head *head) +{ +} +static inline void gm_release_vma(struct mm_struct *mm, struct list_head *head) +{ +} +static inline int gm_alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + unsigned long len, + vm_flags_t vm_flags) +{ + return 0; +} #endif #endif /* _GMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..8837624e4c66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -320,14 +320,22 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* 
CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED VM_HIGH_ARCH_6 +#else +#define VM_PEER_SHARED VM_NONE +#endif + #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 49b22a497c5d..eebdbb2375f8 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -32,6 +32,7 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 /* Coherent memory available for both CPU and attached devices. */ /* * Flags for mlock diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..eab96cdb25a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1692,6 +1693,9 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (err) goto free_pt; +#ifdef CONFIG_GMEM + mm->vm_obj = NULL; +#endif mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; diff --git a/mm/gmem.c b/mm/gmem.c index 4eb522026a0d..5f4f26030163 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -617,6 +617,44 @@ static int gm_unmap_page_range(struct vm_area_struct *vma, unsigned long start, return 0; } +int gm_alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, + unsigned long len, vm_flags_t vm_flags) +{ + struct gm_context *ctx, *tmp; + int ret; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + if (!mm->vm_obj) + mm->vm_obj = vm_object_create(mm); + if (!mm->vm_obj) + return -ENOMEM; + /* + * TODO: solve the race condition if a device is concurrently attached + * to mm->gm_as. 
+ */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + ret = ctx->dev->mmu->peer_va_alloc_fixed(mm, addr, len, vm_flags); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} + static int hmadvise_do_eagerfree(unsigned long addr, size_t size) { unsigned long start, end, i_start, i_end; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c89..9fc298480498 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1719,7 +1719,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index 1971bfffcc03..55d43763ea49 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1376,6 +1377,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1832,6 +1836,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) return addr; @@ -2756,7 +2761,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -2768,21 +2776,27 @@ unsigned long mmap_region(struct file *file, unsigned long addr, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { + gm_release_vma(mm, &reserve_list); return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { + gm_release_vma(mm, &reserve_list); return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { + gm_release_vma(mm, &reserve_list); return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2945,6 +2959,21 @@ unsigned long mmap_region(struct file *file, unsigned long addr, file = vma->vm_file; ksm_add_vma(vma); expanded: + if (vma_is_peer_shared(vma)) { + int ret = gm_alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gm_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gm_release_vma(mm, &reserve_list); + } perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2995,6 +3024,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); + gm_release_vma(mm, &reserve_list); validate_mm(mm); return error; } @@ -3336,6 +3366,8 @@ void exit_mmap(struct mm_struct *mm) BUG_ON(count != mm->map_count); + vm_object_drop_locked(mm); + trace_exit_mmap(mm); __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); diff --git a/mm/vm_object.c b/mm/vm_object.c index 4e76737e0ca1..5432930d1226 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -163,6 +163,9 @@ void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, struct gm_mapping *gm_mapping; struct page *page = NULL; + if (!vma_is_peer_shared(vma)) + return; + if (!vma->vm_mm->vm_obj) return; @@ -182,3 +185,41 @@ void unmap_gm_mappings_range(struct vm_area_struct *vma, unsigned long start, } xa_unlock(logical_page_table); } + +struct gm_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gm_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gm_vma_list *node; + + if (!gmem_is_enabled()) + return; + + node = kmalloc(sizeof(struct gm_vma_list), GFP_KERNEL); + if (!node) + return; + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gm_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gm_vma_list *node, *next; + + if (!gmem_is_enabled()) + return; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} From patchwork Tue Nov 28 12:50:25 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: zhuweixi X-Patchwork-Id: 13471177 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from gabe.freedesktop.org (gabe.freedesktop.org 
[131.252.210.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 22607C10DAA for ; Tue, 28 Nov 2023 13:53:42 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 6690B10E542; Tue, 28 Nov 2023 13:53:32 +0000 (UTC) Received: from szxga08-in.huawei.com (szxga08-in.huawei.com [45.249.212.255]) by gabe.freedesktop.org (Postfix) with ESMTPS id 61BC710E515; Tue, 28 Nov 2023 13:10:14 +0000 (UTC) Received: from kwepemm000018.china.huawei.com (unknown [172.30.72.53]) by szxga08-in.huawei.com (SkyGuard) with ESMTP id 4Sfj021mG3z1P8qX; Tue, 28 Nov 2023 20:47:14 +0800 (CST) Received: from DESKTOP-RAUQ1L5.china.huawei.com (10.174.179.172) by kwepemm000018.china.huawei.com (7.193.23.4) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2507.35; Tue, 28 Nov 2023 20:50:49 +0800 From: Weixi Zhu To: , , Date: Tue, 28 Nov 2023 20:50:25 +0800 Message-ID: <20231128125025.4449-7-weixi.zhu@huawei.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20231128125025.4449-1-weixi.zhu@huawei.com> References: <20231128125025.4449-1-weixi.zhu@huawei.com> MIME-Version: 1.0 X-Originating-IP: [10.174.179.172] X-ClientProxiedBy: dggems705-chm.china.huawei.com (10.3.19.182) To kwepemm000018.china.huawei.com (7.193.23.4) X-CFilter-Loop: Reflected X-Mailman-Approved-At: Tue, 28 Nov 2023 13:53:26 +0000 Subject: [Intel-gfx] [RFC PATCH 6/6] mm/gmem: extending Linux core MM to support unified virtual address space X-BeenThere: intel-gfx@lists.freedesktop.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Intel graphics driver community testing & development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: dri-devel@lists.freedesktop.org, leonro@nvidia.com, apopple@nvidia.com, amd-gfx@lists.freedesktop.org, mgorman@suse.de, ziy@nvidia.com, rcampbell@nvidia.com, jgg@nvidia.com, weixi.zhu@openeuler.sh, jhubbard@nvidia.com, intel-gfx@lists.freedesktop.org, mhairgrove@nvidia.com, jglisse@redhat.com, rodrigo.vivi@intel.com, intel-gvt-dev@lists.freedesktop.org, Felix.Kuehling@amd.com, Xinhui.Pan@amd.com, christian.koenig@amd.com, alexander.deucher@amd.com, ogabbay@kernel.org, Weixi Zhu Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" This patch extends Linux core MM to support unified virtual address space. A unified virtual address space provides a coherent view of memory for the CPU and devices. This is achieved by maintaining coherent page tables for the CPU and any attached devices for each process, without assuming that the underlying interconnect between the CPU and peripheral device is cache-coherent. Specifically, for each mm_struct that is attached with one or more device computing contexts, a per-process logical page table is utilized to track the mapping status of anonymous memory allocated via mmap(MAP_PRIVATE | MAP_PEER_SHARED). The CPU page fault handling path is modified to examine whether a faulted virtual page has already been faulted elsewhere, e.g. on a device, by looking up the logical page table in vm_object. If so, a page migration operation should be orchestrated by the core MM to prepare the CPU physical page, instead of zero-filling. This is achieved by invoking gm_host_fault_locked(). The logical page table must also be updated once the CPU page table gets modified. 
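Condensed to pseudo-C, the CPU fault path for a MAP_PEER_SHARED VMA then behaves roughly as follows. This is a simplified sketch of the do_anonymous_page() and __do_huge_pmd_anonymous_page() changes in this patch, not the literal implementation; PTE installation and error handling are omitted.

/*
 * Simplified sketch: consult the per-process logical page table
 * before allocating and zero-filling a CPU page.
 */
static struct folio *peer_shared_fault_sketch(struct vm_fault *vmf)
{
	struct vm_object *obj = vmf->vma->vm_mm->vm_obj;
	struct gm_mapping *gm_mapping;
	struct folio *folio = NULL;

	xa_lock(obj->logical_page_table);
	gm_mapping = vm_object_lookup(obj, vmf->address);
	if (!gm_mapping) {
		vm_object_mapping_create(obj, vmf->address);
		gm_mapping = vm_object_lookup(obj, vmf->address);
	}
	xa_unlock(obj->logical_page_table);

	mutex_lock(&gm_mapping->lock);
	if (gm_mapping_cpu(gm_mapping)) {
		/* Valid data already in host DRAM: reuse it, skip zero-fill. */
		folio = page_folio(gm_mapping->page);
	}
	if (!folio)
		folio = vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
	if (gm_mapping_device(gm_mapping)) {
		/* Data currently lives on a device: migrate it back. */
		vmf->page = &folio->page;
		gm_host_fault_locked(vmf, 0);
	}
	gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU);
	gm_mapping->page = &folio->page;
	mutex_unlock(&gm_mapping->lock);

	return folio;	/* caller installs the PTE as usual */
}

The huge-page path in __do_huge_pmd_anonymous_page() follows the same pattern at PMD granularity, passing PMD_ORDER to gm_host_fault_locked().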
Ideally, the logical page table should always be looked up or modified first if the CPU page table is changed, but the current implementation does the reverse. Also, the current implementation only considers anonymous memory, while a device may want to operate on a disk file directly via mmap(fd). In the future, the logical page table is planned to play a more generic role for anonymous memory, folios/huge pages and file-backed memory, as well as to provide a clean abstraction for CPU page table functions (including these stage-2 functions). Moreover, the page fault handling path will be enhanced to deal with cache-coherent buses as well, since it might be desirable for devices to operate on sparse data remotely instead of migrating data at page granularity. Signed-off-by: Weixi Zhu --- kernel/fork.c | 1 + mm/huge_memory.c | 85 +++++++++++++++++++++++++++++++++++++++++++----- mm/memory.c | 42 +++++++++++++++++++++--- mm/mmap.c | 2 ++ mm/oom_kill.c | 2 ++ mm/vm_object.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 203 insertions(+), 13 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index eab96cdb25a6..06130c73bf2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,6 +543,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { + free_gm_mappings(vma); #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4f542444a91f..590000f63f04 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -684,6 +685,10 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; + struct gm_mapping *gm_mapping = NULL; + + if (vma_is_peer_shared(vma)) + gm_mapping = vm_object_lookup(vma->vm_mm->vm_obj, haddr); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -691,7 +696,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } folio_throttle_swaprate(folio, gfp); @@ -701,7 +707,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } - clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + /* + * Skip zero-filling page if the logical mapping indicates + * that page contains valid data of the virtual address. This + * could happen if the page was a victim of device memory + * oversubscription.
+ */ + if (!(vma_is_peer_shared(vma) && gm_mapping_cpu(gm_mapping))) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -726,7 +739,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -734,6 +747,13 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_add_new_anon_rmap(folio, vma, haddr); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + if (vma_is_peer_shared(vma) && gm_mapping_device(gm_mapping)) { + vmf->page = page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto unlock_release; + } + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -741,6 +761,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + if (vma_is_peer_shared(vma)) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = page; + mutex_unlock(&gm_mapping->lock); + } } return 0; @@ -750,6 +775,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); +gm_mapping_release: + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); return ret; } @@ -808,17 +836,41 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; - struct folio *folio; + struct folio *folio = NULL; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; + struct gm_mapping *gm_mapping; + + if (vma_is_peer_shared(vma)) { + struct vm_object *vm_obj = vma->vm_mm->vm_obj; - if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + xa_lock(vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vm_obj, haddr); + gm_mapping = vm_object_lookup(vm_obj, haddr); + } + xa_unlock(vm_obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (unlikely(!pmd_none(*vmf->pmd))) { + mutex_unlock(&gm_mapping->lock); + goto gm_mapping_release; + } + } + + if (!transhuge_vma_suitable(vma, haddr)) { + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; + } + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto gm_mapping_release; + } khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && + !vma_is_peer_shared(vma) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; @@ -857,12 +909,27 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); + + if (vma_is_peer_shared(vma) && gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + } folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); - return 
VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; + goto gm_mapping_release; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + +gm_mapping_release: + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); + return ret; } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 1f18ed4a5497..d6cc278dc39b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -78,6 +78,7 @@ #include #include #include +#include #include @@ -1695,8 +1696,10 @@ static void unmap_single_vma(struct mmu_gather *tlb, __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } - } else + } else { unmap_page_range(tlb, vma, start, end, details); + unmap_gm_mappings_range(vma, start, end); + } } } @@ -4126,7 +4129,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; - struct folio *folio; + struct gm_mapping *gm_mapping; + bool skip_put_page = false; + struct folio *folio = NULL; vm_fault_t ret = 0; pte_t entry; @@ -4141,8 +4146,25 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; + if (vma_is_peer_shared(vma)) { + xa_lock(vma->vm_mm->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_mm->vm_obj, vmf->address); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_mm->vm_obj, vmf->address); + gm_mapping = vm_object_lookup(vma->vm_mm->vm_obj, vmf->address); + } + xa_unlock(vma->vm_mm->vm_obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + + if (gm_mapping_cpu(gm_mapping)) { + folio = page_folio(gm_mapping->page); + skip_put_page = true; + } + } + /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && + !vma_is_peer_shared(vma) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); @@ -4168,7 +4190,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!folio) + folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!folio) goto oom; @@ -4211,6 +4234,14 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_add_new_anon_rmap(folio, vma, vmf->address); folio_add_lru_vma(folio, vma); + if (vma_is_peer_shared(vma)) { + if (gm_mapping_device(gm_mapping)) { + vmf->page = &folio->page; + gm_host_fault_locked(vmf, 0); + } + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = &folio->page; + } setpte: if (uffd_wp) entry = pte_mkuffd_wp(entry); @@ -4221,9 +4252,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vma_is_peer_shared(vma)) + mutex_unlock(&gm_mapping->lock); return ret; release: - folio_put(folio); + if (!skip_put_page) + folio_put(folio); goto unlock; oom_free_page: folio_put(folio); diff --git a/mm/mmap.c b/mm/mmap.c index 55d43763ea49..8b8faa007dbc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2616,6 +2616,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } for_each_vma_range(*vmi, next, end); + munmap_in_peer_devices(mm, start, end); + #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. 
*/ { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e6071fde34a..31ec027e98c7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -547,6 +548,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) continue; } unmap_page_range(&tlb, vma, range.start, range.end, NULL); + unmap_gm_mappings_range(vma, range.start, range.end); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } diff --git a/mm/vm_object.c b/mm/vm_object.c index 5432930d1226..e0d1b558df31 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -142,6 +142,9 @@ void free_gm_mappings(struct vm_area_struct *vma) struct gm_mapping *gm_mapping; struct vm_object *obj; + if (vma_is_peer_shared(vma)) + return; + obj = vma->vm_mm->vm_obj; if (!obj) return; @@ -223,3 +226,84 @@ void gm_release_vma(struct mm_struct *mm, struct list_head *head) kfree(node); } } + +static int munmap_in_peer_devices_inner(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int page_size) +{ + struct vm_object *obj = mm->vm_obj; + struct gm_mapping *gm_mapping; + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + int ret; + + start = start > vma->vm_start ? start : vma->vm_start; + end = end < vma->vm_end ? end : vma->vm_end; + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = start; + gmf.size = page_size; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, + ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct vm_object *obj = mm->vm_obj; + struct vm_area_struct *vma; + + if (!gmem_is_enabled()) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + mmap_read_lock(mm); + do { + vma = find_vma_intersection(mm, start, end); + if (!vma) { + pr_debug("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + munmap_in_peer_devices_inner(mm, vma, start, end, HPAGE_SIZE); + } while (start < end); + mmap_read_unlock(mm); +}
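The peer_unmap() and peer_va_alloc_fixed() callbacks invoked above are expected to be supplied by the device driver when it attaches to a gm_as. A hypothetical driver-side sketch follows; the my_dev_* helpers are placeholders invented for illustration, and the callback signatures are inferred from the call sites in mm/gmem.c and mm/vm_object.c rather than taken from a final header.

/* Hypothetical peer-device driver glue, for illustration only. */
static enum gm_ret my_dev_peer_va_alloc_fixed(struct mm_struct *mm,
					      unsigned long va,
					      unsigned long size,
					      vm_flags_t vm_flags)
{
	/* Reserve [va, va + size) in the device's own VA space. */
	if (!my_dev_va_range_is_free(va, size))
		return GM_RET_NOMEM;	/* lets mmap_region() retry another range */

	my_dev_va_reserve(va, size, vm_flags);
	return GM_RET_SUCCESS;
}

static enum gm_ret my_dev_peer_unmap(struct gm_fault_t *gmf)
{
	/* Invalidate the device page table and release device memory. */
	my_dev_unmap_range(gmf->dev, gmf->va, gmf->size);
	return GM_RET_SUCCESS;
}

static struct gm_mmu my_dev_mmu = {
	.peer_va_alloc_fixed	= my_dev_peer_va_alloc_fixed,
	.peer_unmap		= my_dev_peer_unmap,
	/* remaining callbacks omitted */
};

With such a callback table registered and the device attached via gm_as_attach(), a GM_RET_NOMEM return from peer_va_alloc_fixed() causes mmap_region() to park the candidate VMA on the reserve list via gm_reserve_vma() and retry get_unmapped_area(), while munmap_in_peer_devices() above relies on peer_unmap() to keep the device side consistent when the range is unmapped.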