From patchwork Fri Sep 16 03:35:15 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978075
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 188F5C6FA90
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:35:40 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229906AbiIPDfg (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:35:36 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55152 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229939AbiIPDfS (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:18 -0400
Received: from mga05.intel.com (mga05.intel.com [192.55.52.43])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A910721274;
        Thu, 15 Sep 2022 20:35:16 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299316; x=1694835316;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=TFsjnfnCv3qQkchX3Q8w6OPDWs44NJ34NXLtENxpehE=;
  b=lCoFo6LBdLCbxOktV/voBXwOw90yV85nxXczPXkSpPWscsABf9AN+kmv
   M8pGlBuWod5FhhZVdHcPNj0eWVkmM6HvgYKnAq2FHqRDJVXjVXfaz33b4
   zXt5Oq5lf2U9gNZPx33GB9NZxMnTSgUEXx6gNJOtDG6LIR0Uuf5MN0mP2
   +QxnDekbCFJycwJqmhF9h7Ru+rJmc5j1cmxZvXeCqcLfenZO7LSpLdRzr
   wKkXZa3giuw/eWBlR5EgkgO5B0+RIuS5LiXBndU9b3vmL0gcwBpn1zlXf
   yThvOuS6jYfk8JzJyI0fuQcZZueMtkHn5mfa7mwlJDDY5JkdXqSyITG3U
   g==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="385192359"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="385192359"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:16 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809163"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:15 -0700
Subject: [PATCH v2 01/18] fsdax: Wait on @page not @page->_refcount
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:15 -0700
Message-ID: 
 <166329931529.2786261.12375427940949385300.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

The __wait_var_event facility calculates a wait queue from a hash of the
address of the variable being passed. Use the @page argument directly as
it is less to type and is the object that is being waited upon.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
---
 fs/ext4/inode.c   |    8 ++++----
 fs/fuse/dax.c     |    6 +++---
 fs/xfs/xfs_file.c |    6 +++---
 mm/memremap.c     |    2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 601214453c3a..b028a4413bea 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,10 +3961,10 @@ int ext4_break_layouts(struct inode *inode)
 		if (!page)
 			return 0;
 
-		error = ___wait_var_event(&page->_refcount,
-				atomic_read(&page->_refcount) == 1,
-				TASK_INTERRUPTIBLE, 0, 0,
-				ext4_wait_dax_page(inode));
+		error = ___wait_var_event(page,
+					  atomic_read(&page->_refcount) == 1,
+					  TASK_INTERRUPTIBLE, 0, 0,
+					  ext4_wait_dax_page(inode));
 	} while (error == 0);
 
 	return error;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index e23e802a8013..4e12108c68af 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -676,9 +676,9 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(&page->_refcount,
-			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-			0, 0, fuse_wait_dax_page(inode));
+	return ___wait_var_event(page, atomic_read(&page->_refcount) == 1,
+				 TASK_INTERRUPTIBLE, 0, 0,
+				 fuse_wait_dax_page(inode));
 }
 
 /* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c6c80265c0b2..73e7b7ec0a4c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -827,9 +827,9 @@ xfs_break_dax_layouts(
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(&page->_refcount,
-			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-			0, 0, xfs_wait_dax_page(inode));
+	return ___wait_var_event(page, atomic_read(&page->_refcount) == 1,
+				 TASK_INTERRUPTIBLE, 0, 0,
+				 xfs_wait_dax_page(inode));
 }
 
 int
diff --git a/mm/memremap.c b/mm/memremap.c
index 58b20c3c300b..95f6ffe9cb0f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -520,7 +520,7 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs)
 	 * stable because nobody holds a reference on the page.
 	 */
 	if (page_ref_sub_return(page, refs) == 1)
-		wake_up_var(&page->_refcount);
+		wake_up_var(page);
 	return true;
 }
 EXPORT_SYMBOL(__put_devmap_managed_page_refs);

From patchwork Fri Sep 16 03:35:21 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978076
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 68CA9C6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:35:43 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229940AbiIPDfi (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:35:38 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55594 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229483AbiIPDfZ (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:25 -0400
Received: from mga12.intel.com (mga12.intel.com [192.55.52.136])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id C7846766F;
        Thu, 15 Sep 2022 20:35:22 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299322; x=1694835322;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=09smizwI4GSJLiX2SQxDnF1gC8wYEWwvB8zT8M3pE8M=;
  b=ALRJa740N8oIKKi6rgQ3AAxzXSpOdduIWifehDoHQBhZAEEjT1qfuRDt
   mrBXp20ssWqwVeIE0vj285fonMzHX0EHrnnnjtC5pj1HVOtmuqy4czNlJ
   UDal7VO4ptHynOBI7lEglaQzoSz+FDaU03KVRMywf4Fy0jjv4Jn4PSt4l
   YNMZVt1rYdX6fdyI8NjjCfsgyb5t1gG1dx80OhgCXRexrDpoj6hSijMEW
   iPsvNrPyI20y7sTe5k7/V9tfWwVdZvmSfSWURkOq2C2VbN4Ja97BNX7KD
   2JoBH6GR6Mj6ISoK3cXEpUoX3iMYf9k3lLeV+hwsJ1BuHRPfLJ5ClRRYW
   g==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="278630752"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="278630752"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by fmsmga106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:22 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961772"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:21 -0700
Subject: [PATCH v2 02/18] fsdax: Use dax_page_idle() to document DAX busy
 page checking
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:21 -0700
Message-ID: 
 <166329932151.2786261.15762187070104795379.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In advance of converting DAX pages to be 0-based, use a new
dax_page_idle() helper to both simplify that future conversion, but also
document all the kernel locations that are watching for DAX page idle
events.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
---
 fs/dax.c            |    4 ++--
 fs/ext4/inode.c     |    3 +--
 fs/fuse/dax.c       |    5 ++---
 fs/xfs/xfs_file.c   |    5 ++---
 include/linux/dax.h |    9 +++++++++
 5 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index c440dcef4b1b..e762b9c04fb4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -395,7 +395,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(trunc && !dax_page_idle(page));
 		if (dax_mapping_is_cow(page->mapping)) {
 			/* keep the CoW flag if this page is still shared */
 			if (page->index-- > 0)
@@ -414,7 +414,7 @@ static struct page *dax_busy_page(void *entry)
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (page_ref_count(page) > 1)
+		if (!dax_page_idle(page))
 			return page;
 	}
 	return NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b028a4413bea..478ec6bc0935 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,8 +3961,7 @@ int ext4_break_layouts(struct inode *inode)
 		if (!page)
 			return 0;
 
-		error = ___wait_var_event(page,
-					  atomic_read(&page->_refcount) == 1,
+		error = ___wait_var_event(page, dax_page_idle(page),
 					  TASK_INTERRUPTIBLE, 0, 0,
 					  ext4_wait_dax_page(inode));
 	} while (error == 0);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 4e12108c68af..ae52ef7dbabe 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -676,9 +676,8 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(page, atomic_read(&page->_refcount) == 1,
-				 TASK_INTERRUPTIBLE, 0, 0,
-				 fuse_wait_dax_page(inode));
+	return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
+				 0, 0, fuse_wait_dax_page(inode));
 }
 
 /* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 73e7b7ec0a4c..556e28d06788 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -827,9 +827,8 @@ xfs_break_dax_layouts(
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(page, atomic_read(&page->_refcount) == 1,
-				 TASK_INTERRUPTIBLE, 0, 0,
-				 xfs_wait_dax_page(inode));
+	return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
+				 0, 0, xfs_wait_dax_page(inode));
 }
 
 int
diff --git a/include/linux/dax.h b/include/linux/dax.h
index ba985333e26b..04987d14d7e0 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -210,6 +210,15 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops);
 
+/*
+ * Document all the code locations that want know when a dax page is
+ * unreferenced.
+ */
+static inline bool dax_page_idle(struct page *page)
+{
+	return page_ref_count(page) == 1;
+}
+
 #if IS_ENABLED(CONFIG_DAX)
 int dax_read_lock(void);
 void dax_read_unlock(int id);

From patchwork Fri Sep 16 03:35:27 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978077
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id B71BEC6FA92
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:35:44 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229501AbiIPDfk (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:35:40 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56250 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229847AbiIPDfb (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:31 -0400
Received: from mga18.intel.com (mga18.intel.com [134.134.136.126])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 646701A818;
        Thu, 15 Sep 2022 20:35:28 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299328; x=1694835328;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=3wvSjSkW/zXAYAgI4UeAUQfAuIeQwEc+TBBp6c7MzGs=;
  b=Mb9F74okTHTmaV/cHLI1gJbR1l4CoGfOhappm4V4iOTjYZ05X1urdOxD
   G2jrazMPdOsV2GSYMOIkN9WmnSm+cQqfdIdd5WaZ+SaO/fnlc9HXW9JLp
   uakhy241zwgKOOFyVDT03ccpJIv8JuM28nkgIUUXFvfhlTogHY0j10Fp6
   VGBij8x4GFIbJphhHxw4IPvexqQntK3357LnqBEBMQEo1coN36sqMkU1r
   HoVxWPgIzcaUzFeIIMHB0I2MjHTos3uSC8jFQVmD0V5wwWDfjTn6pf3og
   f9/ImlqTDD2i7N/F5iFIr8d2IDbUg8zIuPIDWPcHRYvXyzc+79kIAZUTp
   A==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="281930433"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="281930433"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by orsmga106.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:27 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961807"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:27 -0700
Subject: [PATCH v2 03/18] fsdax: Include unmapped inodes for page-idle
 detection
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:27 -0700
Message-ID: 
 <166329932730.2786261.8645669907699123863.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

A page can remain pinned even after it has been unmapped from userspace
/ removed from the rmap. In advance of requiring that all
dax_insert_entry() events are followed up 'break layouts' before a
truncate event, make sure that 'break layouts' can find unmapped
entries.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index e762b9c04fb4..76bad1c095c0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -698,7 +698,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 		return NULL;
 
-	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+	if (!dax_mapping(mapping))
 		return NULL;
 
 	/* If end == LLONG_MAX, all pages from start to till end of file */

From patchwork Fri Sep 16 03:35:33 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978078
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id C0845ECAAD3
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:35:47 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229948AbiIPDfo (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:35:44 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56392 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229898AbiIPDff (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:35 -0400
Received: from mga17.intel.com (mga17.intel.com [192.55.52.151])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 74D2E2C674;
        Thu, 15 Sep 2022 20:35:34 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299334; x=1694835334;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=JMW9u8fUfl6xVouZZpm8FHKzsQjwv8MqqGLA900GgrQ=;
  b=Xuf6d6X+UD9QkxwoTrdhGwNEY5nkE0zeG8RgtOVR+iRm2S2fK2B6sW+h
   vTVMGbUyPoVN9/cmVTb/+z2BuQkZee8JczkIqG7Ff7Y7Q5IeBWkBZUZz+
   278XDmxGX/OO4n7EK0rhct6OYNkQAsYNviEWEPrBKHLpggeoL3AltTG9Q
   BrvM4KtIBzGams+B8hdakgtb0nc/aS6Z/bBF7lB3gspnaXdcTVuN3SGhc
   bvzC8nrHA2FlE4XUwgtpndDndn5/bLNmtdUx79NJ3bBFJth07zgwHQ9s1
   nECNpPi5WqRbOC1ZgnZs6aR671UVYsyM1318+0HJQyGgC6c/2lT6GoJpo
   Q==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="279283812"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="279283812"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:33 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961823"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:33 -0700
Subject: [PATCH v2 04/18] ext4: Add ext4_break_layouts() to the inode
 eviction path
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:33 -0700
Message-ID: 
 <166329933305.2786261.13953404062673878108.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for moving DAX pages to be 0-based rather than 1-based
for the idle refcount, the fsdax core wants to have all mappings in a
"zapped" state before truncate. For typical pages this happens naturally
via unmap_mapping_range(), for DAX pages some help is needed to record
this state in the 'struct address_space' of the inode(s) where the page
is mapped.

That "zapped" state is recorded in DAX entries as a side effect of
ext4_break_layouts(). Arrange for it to be called before all truncation
events which already happens for truncate() and PUNCH_HOLE, but not
truncate_inode_pages_final(). Arrange for ext4_break_layouts() before
truncate_inode_pages_final().

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/ext4/inode.c |    8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 478ec6bc0935..326269ad3961 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -207,7 +207,11 @@ void ext4_evict_inode(struct inode *inode)
 			jbd2_complete_transaction(journal, commit_tid);
 			filemap_write_and_wait(&inode->i_data);
 		}
+
+		filemap_invalidate_lock(inode->i_mapping);
+		ext4_break_layouts(inode);
 		truncate_inode_pages_final(&inode->i_data);
+		filemap_invalidate_unlock(inode->i_mapping);
 
 		goto no_delete;
 	}
@@ -218,7 +222,11 @@ void ext4_evict_inode(struct inode *inode)
 
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
+
+	filemap_invalidate_lock(inode->i_mapping);
+	ext4_break_layouts(inode);
 	truncate_inode_pages_final(&inode->i_data);
+	filemap_invalidate_unlock(inode->i_mapping);
 
 	/*
 	 * For inodes with journalled data, transaction commit could have

From patchwork Fri Sep 16 03:35:38 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978079
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id B34E6C6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:00 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229953AbiIPDf7 (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:35:59 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56930 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229951AbiIPDft (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:49 -0400
Received: from mga07.intel.com (mga07.intel.com [134.134.136.100])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CCE742EF1A;
        Thu, 15 Sep 2022 20:35:39 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299340; x=1694835340;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=MOEX+9Os5MyF4ZlF0WkG4j7W52nt+aIbvJOvJCH+Qx0=;
  b=Rx65N7N+wsVwtZUqR+EoOSXCcj0Jj4IhAkt9tDLdAA71oWOPdSSB5DYp
   URPSBrjpfu5xDPJdJOUXn2uZTGgbCZojARkluhlovTdNfIGB4TDAprOCM
   4tz/JW0IsoRvL3QTu8vpPvhG5SkgGPg+LVzX0j6NGC3T7fr9V5MapVXsr
   Ymb6W76FlBRdrv6GQg1lnHG/a+7owLyuAThWLpmhppTaD0q8UVRqISfKm
   EHbpcixvRpo4i2hto2Ep2MzDOnLrFPcCnsAaP0WU1RBjN7zgG6clbZPzP
   GprbrGXqdE1MXouwb02ExSrS+ha6nyzxyM+rbjNgbYqSn2oRnPAiTguPc
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="362866862"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="362866862"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:39 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961859"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:39 -0700
Subject: [PATCH v2 05/18] xfs: Add xfs_break_layouts() to the inode eviction
 path
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:38 -0700
Message-ID: 
 <166329933874.2786261.18236541386474985669.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for moving DAX pages to be 0-based rather than 1-based
for the idle refcount, the fsdax core wants to have all mappings in a
"zapped" state before truncate. For typical pages this happens naturally
via unmap_mapping_range(), for DAX pages some help is needed to record
this state in the 'struct address_space' of the inode(s) where the page
is mapped.

That "zapped" state is recorded in DAX entries as a side effect of
xfs_break_layouts(). Arrange for it to be called before all truncation
events which already happens for truncate() and PUNCH_HOLE, but not
truncate_inode_pages_final(). Arrange for xfs_break_layouts() before
truncate_inode_pages_final().

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_file.c  |   13 +++++++++----
 fs/xfs/xfs_inode.c |    3 ++-
 fs/xfs/xfs_inode.h |    6 ++++--
 fs/xfs/xfs_super.c |   22 ++++++++++++++++++++++
 4 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 556e28d06788..d3ff692d5546 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -816,7 +816,8 @@ xfs_wait_dax_page(
 int
 xfs_break_dax_layouts(
 	struct inode		*inode,
-	bool			*retry)
+	bool			*retry,
+	int			state)
 {
 	struct page		*page;
 
@@ -827,8 +828,8 @@ xfs_break_dax_layouts(
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
-				 0, 0, xfs_wait_dax_page(inode));
+	return ___wait_var_event(page, dax_page_idle(page), state, 0, 0,
+				 xfs_wait_dax_page(inode));
 }
 
 int
@@ -839,14 +840,18 @@ xfs_break_layouts(
 {
 	bool			retry;
 	int			error;
+	int			state = TASK_INTERRUPTIBLE;
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
 
 	do {
 		retry = false;
 		switch (reason) {
+		case BREAK_UNMAP_FINAL:
+			state = TASK_UNINTERRUPTIBLE;
+			fallthrough;
 		case BREAK_UNMAP:
-			error = xfs_break_dax_layouts(inode, &retry);
+			error = xfs_break_dax_layouts(inode, &retry, state);
 			if (error || retry)
 				break;
 			fallthrough;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 28493c8e9bb2..72ce1cb72736 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3452,6 +3452,7 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	struct xfs_inode	*ip1,
 	struct xfs_inode	*ip2)
 {
+	int			state = TASK_INTERRUPTIBLE;
 	int			error;
 	bool			retry;
 	struct page		*page;
@@ -3463,7 +3464,7 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	retry = false;
 	/* Lock the first inode */
 	xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
-	error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+	error = xfs_break_dax_layouts(VFS_I(ip1), &retry, state);
 	if (error || retry) {
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		if (error == 0 && retry)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fa780f08dc89..e4994eb6e521 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -454,11 +454,13 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
  * layout-holder has a consistent view of the file's extent map. While
  * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
  * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
- * go idle.
+ * go idle. BREAK_UNMAP_FINAL is an uninterruptible version of
+ * BREAK_UNMAP.
  */
 enum layout_break_reason {
         BREAK_WRITE,
         BREAK_UNMAP,
+        BREAK_UNMAP_FINAL,
 };
 
 /*
@@ -531,7 +533,7 @@ xfs_itruncate_extents(
 }
 
 /* from xfs_file.c */
-int	xfs_break_dax_layouts(struct inode *inode, bool *retry);
+int	xfs_break_dax_layouts(struct inode *inode, bool *retry, int state);
 int	xfs_break_layouts(struct inode *inode, uint *iolock,
 		enum layout_break_reason reason);
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9ac59814bbb6..ebb4a6eba3fc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -725,6 +725,27 @@ xfs_fs_drop_inode(
 	return generic_drop_inode(inode);
 }
 
+STATIC void
+xfs_fs_evict_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	long			error;
+
+	xfs_ilock(ip, iolock);
+
+	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP_FINAL);
+
+	/* The final layout break is uninterruptible */
+	ASSERT_ALWAYS(!error);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	xfs_iunlock(ip, iolock);
+}
+
 static void
 xfs_mount_free(
 	struct xfs_mount	*mp)
@@ -1144,6 +1165,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.drop_inode		= xfs_fs_drop_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,

From patchwork Fri Sep 16 03:35:44 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978080
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 457D4C32771
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:23 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229959AbiIPDgV (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:21 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56962 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229960AbiIPDfu (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:50 -0400
Received: from mga09.intel.com (mga09.intel.com [134.134.136.24])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 7F8D033347;
        Thu, 15 Sep 2022 20:35:45 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299345; x=1694835345;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=3aJv8sV5UvVLt6I9GPcQ4FOyHFWHfylczL3dCRbOmfY=;
  b=WW1oUihip05VkMLLJgE9sG2dsN+Wi8THIJJLjtsQjHGK3dtBcdTsuO7t
   7EErkKr0RuasJaXWdUq9lRk0OG4VPPdVPJqtZW87UaZhvpVouSljYmoVv
   iDBCJHGfNwGf/IhjCmVjFeEqznUmA6NkBa44jScVfrkNXR860+ZQcPNDp
   YbtHMP+zSbT5bagpsdpV9Ay+iTjOlTy3/1EMpjOpZIl7KstOSlngMX2Cy
   qxyLhqyxxB0H97l8oYp1SobMQzOyYynPkvX9K+qWsOk7T/2YY5JMf61/J
   eQ2T0DM0+D2x6X2dD9DvjwEvaWvkK9GUlc0vjWNTWfknfae5KDB0ffb1t
   Q==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="299726128"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="299726128"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by orsmga102.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:45 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961875"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:44 -0700
Subject: [PATCH v2 06/18] fsdax: Rework dax_layout_busy_page() to
 dax_zap_mappings()
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:44 -0700
Message-ID: 
 <166329934448.2786261.3862047806587561874.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for moving the truncate vs DAX-busy-page detection from
detecting _refcount == 1 to _refcount == 0, change the busy page
tracking to take refernces at dax_insert_entry(), drop references at
dax_zap_mappings() time, and finally clean out the entries at
dax_delete_mapping_entries().

This approach will rely on all paths that call truncate_inode_pages() to
first call dax_zap_mappings(). This mirrors the zapped state of pages
after unmap_mapping_range(), but since DAX pages do not maintain
_mapcount this DAX specific flow is introduced. This approach helps
address the immediate _refcount problem, but continues to kick the "DAX
without pages?" question down the road.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c            |   82 ++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/inode.c     |    2 +
 fs/fuse/dax.c       |    4 +-
 fs/xfs/xfs_file.c   |    2 +
 fs/xfs/xfs_inode.c  |    4 +-
 include/linux/dax.h |   11 ++++---
 6 files changed, 71 insertions(+), 34 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 76bad1c095c0..616bac4b7df3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -74,11 +74,12 @@ fs_initcall(init_dax_wait_table);
  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  * block allocation.
  */
-#define DAX_SHIFT	(4)
+#define DAX_SHIFT	(5)
 #define DAX_LOCKED	(1UL << 0)
 #define DAX_PMD		(1UL << 1)
 #define DAX_ZERO_PAGE	(1UL << 2)
 #define DAX_EMPTY	(1UL << 3)
+#define DAX_ZAP		(1UL << 4)
 
 static unsigned long dax_to_pfn(void *entry)
 {
@@ -95,6 +96,11 @@ static bool dax_is_locked(void *entry)
 	return xa_to_value(entry) & DAX_LOCKED;
 }
 
+static bool dax_is_zapped(void *entry)
+{
+	return xa_to_value(entry) & DAX_ZAP;
+}
+
 static unsigned int dax_entry_order(void *entry)
 {
 	if (xa_to_value(entry) & DAX_PMD)
@@ -380,6 +386,7 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 			WARN_ON_ONCE(page->mapping);
 			page->mapping = mapping;
 			page->index = index + i++;
+			page_ref_inc(page);
 		}
 	}
 }
@@ -395,31 +402,20 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(trunc && !dax_page_idle(page));
 		if (dax_mapping_is_cow(page->mapping)) {
 			/* keep the CoW flag if this page is still shared */
 			if (page->index-- > 0)
 				continue;
-		} else
+		} else {
+			WARN_ON_ONCE(trunc && !dax_is_zapped(entry));
+			WARN_ON_ONCE(trunc && !dax_page_idle(page));
 			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		}
 		page->mapping = NULL;
 		page->index = 0;
 	}
 }
 
-static struct page *dax_busy_page(void *entry)
-{
-	unsigned long pfn;
-
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		if (!dax_page_idle(page))
-			return page;
-	}
-	return NULL;
-}
-
 /*
  * dax_lock_page - Lock the DAX entry corresponding to a page
  * @page: The page whose entry we want to lock
@@ -664,8 +660,46 @@ static void *grab_mapping_entry(struct xa_state *xas,
 	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
+static void *dax_zap_entry(struct xa_state *xas, void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return xas_store(xas, xa_mk_value(v | DAX_ZAP));
+}
+
+/**
+ * Return NULL if the entry is zapped and all pages in the entry are
+ * idle, otherwise return the non-idle page in the entry
+ */
+static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
+{
+	struct page *ret = NULL;
+	unsigned long pfn;
+	bool zap;
+
+	if (!dax_entry_size(entry))
+		return NULL;
+
+	zap = !dax_is_zapped(entry);
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (zap)
+			page_ref_dec(page);
+
+		if (!ret && !dax_page_idle(page))
+			ret = page;
+	}
+
+	if (zap)
+		dax_zap_entry(xas, entry);
+
+	return ret;
+}
+
 /**
- * dax_layout_busy_page_range - find first pinned page in @mapping
+ * dax_zap_mappings_range - find first pinned page in @mapping
  * @mapping: address space to scan for a page with ref count > 1
  * @start: Starting offset. Page containing 'start' is included.
  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
@@ -682,8 +716,8 @@ static void *grab_mapping_entry(struct xa_state *xas,
  * to be able to run unmap_mapping_range() and subsequently not race
  * mapping_mapped() becoming true.
  */
-struct page *dax_layout_busy_page_range(struct address_space *mapping,
-					loff_t start, loff_t end)
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end)
 {
 	void *entry;
 	unsigned int scanned = 0;
@@ -727,7 +761,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 		if (unlikely(dax_is_locked(entry)))
 			entry = get_unlocked_entry(&xas, 0);
 		if (entry)
-			page = dax_busy_page(entry);
+			page = dax_zap_pages(&xas, entry);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
 		if (page)
 			break;
@@ -742,13 +776,13 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 	xas_unlock_irq(&xas);
 	return page;
 }
-EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
+EXPORT_SYMBOL_GPL(dax_zap_mappings_range);
 
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_zap_mappings(struct address_space *mapping)
 {
-	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
+	return dax_zap_mappings_range(mapping, 0, LLONG_MAX);
 }
-EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+EXPORT_SYMBOL_GPL(dax_zap_mappings);
 
 static int __dax_invalidate_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 326269ad3961..0ce73af69c49 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3965,7 +3965,7 @@ int ext4_break_layouts(struct inode *inode)
 		return -EINVAL;
 
 	do {
-		page = dax_layout_busy_page(inode->i_mapping);
+		page = dax_zap_mappings(inode->i_mapping);
 		if (!page)
 			return 0;
 
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index ae52ef7dbabe..8cdc9402e8f7 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -443,7 +443,7 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
 
 	/*
 	 * Can't do inline reclaim in fault path. We call
-	 * dax_layout_busy_page() before we free a range. And
+	 * dax_zap_mappings() before we free a range. And
 	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
 	 * In fault path we enter with mapping->invalidate_lock held and can't
 	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
@@ -671,7 +671,7 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 {
 	struct page *page;
 
-	page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+	page = dax_zap_mappings_range(inode->i_mapping, start, end);
 	if (!page)
 		return 0;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d3ff692d5546..918ab9130c96 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -823,7 +823,7 @@ xfs_break_dax_layouts(
 
 	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
 
-	page = dax_layout_busy_page(inode->i_mapping);
+	page = dax_zap_mappings(inode->i_mapping);
 	if (!page)
 		return 0;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 72ce1cb72736..9bbc68500cec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3482,8 +3482,8 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
 	 * for this nested lock case.
 	 */
-	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
-	if (page && page_ref_count(page) != 1) {
+	page = dax_zap_mappings(VFS_I(ip2)->i_mapping);
+	if (page) {
 		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		goto again;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 04987d14d7e0..f6acb4ed73cb 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -157,8 +157,9 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
-struct page *dax_layout_busy_page(struct address_space *mapping);
-struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+struct page *dax_zap_mappings(struct address_space *mapping);
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
@@ -166,12 +167,14 @@ dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 void dax_unlock_mapping_entry(struct address_space *mapping,
 		unsigned long index, dax_entry_t cookie);
 #else
-static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+static inline struct page *dax_zap_mappings(struct address_space *mapping)
 {
 	return NULL;
 }
 
-static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
+static inline struct page *dax_zap_mappings_range(struct address_space *mapping,
+						  pgoff_t start,
+						  pgoff_t nr_pages)
 {
 	return NULL;
 }

From patchwork Fri Sep 16 03:35:50 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978081
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id A8589C6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:24 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229803AbiIPDgW (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:22 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57180 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229905AbiIPDf5 (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:35:57 -0400
Received: from mga17.intel.com (mga17.intel.com [192.55.52.151])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id F391D371A2;
        Thu, 15 Sep 2022 20:35:51 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299352; x=1694835352;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=gvNMBXmzRliL/61+0l3Vp0DMaKfxeHPNaGSpEyGRros=;
  b=iDypxwJUKMQkCx0bXijvtpfGHOlepKeIsgyxigK9wYgjZbQ/60jSz9AK
   NlAaJvP7kGGJbAZ9iwvw6p5/7zYEvjq1qHi/YvpiX3NW8QTYgIhS63Vym
   JG7YxiqCL8dv2jYDN1QCciw7Y4IDM/mIK4reF+nTc6yhUHg4un+s7wgoG
   xC2WxpxuDBCQJr9UV/InKYj9Ssbcmyiwtq6Vu6ql2uGzs9WC4aaGxWoAj
   nxakp5OMI+AG7FEgYscpApusjWAs9AAkdUtsaRqIT4mZZ70Qwr9yu1G/t
   VhZle4xhZ8VNHO71JUbiznkadbEeANkAeSloFIBC8cGy64wqPXJC3Y3q2
   Q==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="279283844"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="279283844"
Received: from orsmga005.jf.intel.com ([10.7.209.41])
  by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:51 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="792961964"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga005-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:50 -0700
Subject: [PATCH v2 07/18] fsdax: Update dax_insert_entry() calling
 convention to return an error
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:50 -0700
Message-ID: 
 <166329935018.2786261.15861171979773593749.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for teaching dax_insert_entry() to take live @pgmap
references, enable it to return errors. Given the observation that all
callers overwrite the passed in entry with the return value, just update
@entry in place and convert the return code to a vm_fault_t status.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |   27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 616bac4b7df3..8382aab0d2f7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -887,14 +887,15 @@ static bool dax_fault_is_cow(const struct iomap_iter *iter)
  * already in the tree, we will skip the insertion and just dirty the PMD as
  * appropriate.
  */
-static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
-		const struct iomap_iter *iter, void *entry, pfn_t pfn,
-		unsigned long flags)
+static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+				   const struct iomap_iter *iter, void **pentry,
+				   pfn_t pfn, unsigned long flags)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	void *new_entry = dax_make_entry(pfn, flags);
 	bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
 	bool cow = dax_fault_is_cow(iter);
+	void *entry = *pentry;
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -940,7 +941,8 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
 
 	xas_unlock_irq(xas);
-	return entry;
+	*pentry = entry;
+	return 0;
 }
 
 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
@@ -1188,9 +1190,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
+	ret = dax_insert_entry(xas, vmf, iter, entry, pfn, DAX_ZERO_PAGE);
+	if (ret)
+		goto out;
 
 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
 }
@@ -1207,6 +1212,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	struct page *zero_page;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
+	vm_fault_t ret;
 	pfn_t pfn;
 
 	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
@@ -1215,8 +1221,10 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
-				  DAX_PMD | DAX_ZERO_PAGE);
+	ret = dax_insert_entry(xas, vmf, iter, entry, pfn,
+			       DAX_PMD | DAX_ZERO_PAGE);
+	if (ret)
+		return ret;
 
 	if (arch_needs_pgtable_deposit()) {
 		pgtable = pte_alloc_one(vma->vm_mm);
@@ -1568,6 +1576,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
 	bool write = iter->flags & IOMAP_WRITE;
 	unsigned long entry_flags = pmd ? DAX_PMD : 0;
+	vm_fault_t ret;
 	int err = 0;
 	pfn_t pfn;
 	void *kaddr;
@@ -1592,7 +1601,9 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	if (err)
 		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
 
-	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
+	ret = dax_insert_entry(xas, vmf, iter, entry, pfn, entry_flags);
+	if (ret)
+		return ret;
 
 	if (write &&
 	    srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {

From patchwork Fri Sep 16 03:35:56 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978082
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 3088FC32771
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:37 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229954AbiIPDge (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:34 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57348 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229951AbiIPDgD (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:03 -0400
Received: from mga11.intel.com (mga11.intel.com [192.55.52.93])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2A1383D587;
        Thu, 15 Sep 2022 20:36:01 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299361; x=1694835361;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=j5tRXbceFLZcGlHWWzLaIriWLFbXvQfVW7h15WIRvnM=;
  b=GOLuqJzpGOcIUNjCwZtSSt9Y+VZ3V2Z6DD+eqb+qTfB5CtVhi+/gmfnN
   JwQpvuu+CnmHtd5gqTddgH+AKsiAyyw2AHleh/L7+lUKWgMGebm0sEkZR
   mWnnMfqUijTK7wr2ibJSDDrChzvr7cT4QITDnTcfGx4clVn5KSP9j/uYu
   YqRMbrCawwtkcPAffxVige2E49kUT+3mUOX9FoUUTo++CxoFxEoKwWiBZ
   QC59BKARs++cnvEe9ankH1KOQyPfr094uAVjpv1DTxGhEgqdIiIskvQ13
   nGoKsmAyBbfY6ThklyBVRxaJT1qgEnoJnOC3Yc29tJiMRu85x9sU5aGKV
   Q==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="296491024"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="296491024"
Received: from orsmga008.jf.intel.com ([10.7.209.65])
  by fmsmga102.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:59 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="648099914"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga008-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:35:56 -0700
Subject: [PATCH v2 08/18] fsdax: Cleanup dax_associate_entry()
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:35:56 -0700
Message-ID: 
 <166329935598.2786261.15591591637555586864.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Pass @vmf to drop the separate @vma and @address arguments to
dax_associate_entry(), use the existing DAX flags to convey the @cow
argument, and replace the open-coded ALIGN().

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |    9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 8382aab0d2f7..bd5c6b6e371e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -368,7 +368,7 @@ static inline void dax_mapping_set_cow(struct page *page)
  * FS_DAX_MAPPING_COW, and use page->index as refcount.
  */
 static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address, bool cow)
+				struct vm_fault *vmf, unsigned long flags)
 {
 	unsigned long size = dax_entry_size(entry), pfn, index;
 	int i = 0;
@@ -376,11 +376,11 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 		return;
 
-	index = linear_page_index(vma, address & ~(size - 1));
+	index = linear_page_index(vmf->vma, ALIGN(vmf->address, size));
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (cow) {
+		if (flags & DAX_COW) {
 			dax_mapping_set_cow(page);
 		} else {
 			WARN_ON_ONCE(page->mapping);
@@ -916,8 +916,7 @@ static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 		void *old;
 
 		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
-				cow);
+		dax_associate_entry(new_entry, mapping, vmf, flags);
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or

From patchwork Fri Sep 16 03:36:01 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978083
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id EA9FCC6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:37 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229997AbiIPDgf (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:35 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57404 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229898AbiIPDgE (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:04 -0400
Received: from mga11.intel.com (mga11.intel.com [192.55.52.93])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A78A34D815;
        Thu, 15 Sep 2022 20:36:02 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299363; x=1694835363;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=bcuK2KG5EjEAqreYAdzRR3rq0EZdnkZSoL2J30lxqoE=;
  b=lp1U3OCfCV+qGfUV+6dbeB09e/4MKNE7DTbDCDGVQ+qaf4ccCpOsffQZ
   zc/5ljU1+v+D7Yiec5KBa654X3A2uJCN3rd/jjucgzyZ45qLQeuZPI+Jb
   XRb8cK0iOtp1J6Us1FCJ0C88Zi7XfOiLJoqR7CqvFB1SsBdDFXsGQSH/G
   dg69iMQPSH9hF297E4DajzntYAQnnjg+FEUBMzGCctf0st2obkihBUvBT
   bxBxXlXrIgmb0F/kiBZnlkADHGqAOiWGk+oZfI26nlVAxTZcTRnZLnzEs
   qNZ2ZUVkFZmCgEUuiXDqS+3QR2hlEEVnWb8BLXtOAHAmhT08Rz4SHaI4W
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="296491037"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="296491037"
Received: from orsmga008.jf.intel.com ([10.7.209.65])
  by fmsmga102.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:02 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="648099926"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga008-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:01 -0700
Subject: [PATCH v2 09/18] fsdax: Rework dax_insert_entry() calling convention
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:01 -0700
Message-ID: 
 <166329936170.2786261.6094157723547541341.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Move the determination of @dirty and @cow in dax_insert_entry() to flags
(DAX_DIRTY and DAX_COW) that are passed in. This allows the iomap
related code to remain fs/dax.c in preparation for the Xarray
infrastructure to move to drivers/dax/mapping.c.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |   44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index bd5c6b6e371e..5d9f30105db4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -75,12 +75,20 @@ fs_initcall(init_dax_wait_table);
  * block allocation.
  */
 #define DAX_SHIFT	(5)
+#define DAX_MASK	((1UL << DAX_SHIFT) - 1)
 #define DAX_LOCKED	(1UL << 0)
 #define DAX_PMD		(1UL << 1)
 #define DAX_ZERO_PAGE	(1UL << 2)
 #define DAX_EMPTY	(1UL << 3)
 #define DAX_ZAP		(1UL << 4)
 
+/*
+ * These flags are not conveyed in Xarray value entries, they are just
+ * modifiers to dax_insert_entry().
+ */
+#define DAX_DIRTY (1UL << (DAX_SHIFT + 0))
+#define DAX_COW   (1UL << (DAX_SHIFT + 1))
+
 static unsigned long dax_to_pfn(void *entry)
 {
 	return xa_to_value(entry) >> DAX_SHIFT;
@@ -88,7 +96,8 @@ static unsigned long dax_to_pfn(void *entry)
 
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
-	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+	return xa_mk_value((flags & DAX_MASK) |
+			   (pfn_t_to_pfn(pfn) << DAX_SHIFT));
 }
 
 static bool dax_is_locked(void *entry)
@@ -880,6 +889,20 @@ static bool dax_fault_is_cow(const struct iomap_iter *iter)
 		(iter->iomap.flags & IOMAP_F_SHARED);
 }
 
+static unsigned long dax_iter_flags(const struct iomap_iter *iter,
+				    struct vm_fault *vmf)
+{
+	unsigned long flags = 0;
+
+	if (!dax_fault_is_synchronous(iter, vmf->vma))
+		flags |= DAX_DIRTY;
+
+	if (dax_fault_is_cow(iter))
+		flags |= DAX_COW;
+
+	return flags;
+}
+
 /*
  * By this point grab_mapping_entry() has ensured that we have a locked entry
  * of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -888,13 +911,13 @@ static bool dax_fault_is_cow(const struct iomap_iter *iter)
  * appropriate.
  */
 static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
-				   const struct iomap_iter *iter, void **pentry,
-				   pfn_t pfn, unsigned long flags)
+				   void **pentry, pfn_t pfn,
+				   unsigned long flags)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	void *new_entry = dax_make_entry(pfn, flags);
-	bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
-	bool cow = dax_fault_is_cow(iter);
+	bool dirty = flags & DAX_DIRTY;
+	bool cow = flags & DAX_COW;
 	void *entry = *pentry;
 
 	if (dirty)
@@ -1189,7 +1212,8 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	ret = dax_insert_entry(xas, vmf, iter, entry, pfn, DAX_ZERO_PAGE);
+	ret = dax_insert_entry(xas, vmf, entry, pfn,
+			       DAX_ZERO_PAGE | dax_iter_flags(iter, vmf));
 	if (ret)
 		goto out;
 
@@ -1220,8 +1244,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	ret = dax_insert_entry(xas, vmf, iter, entry, pfn,
-			       DAX_PMD | DAX_ZERO_PAGE);
+	ret = dax_insert_entry(xas, vmf, entry, pfn,
+			       DAX_PMD | DAX_ZERO_PAGE |
+				       dax_iter_flags(iter, vmf));
 	if (ret)
 		return ret;
 
@@ -1600,7 +1625,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	if (err)
 		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
 
-	ret = dax_insert_entry(xas, vmf, iter, entry, pfn, entry_flags);
+	ret = dax_insert_entry(xas, vmf, entry, pfn,
+			       entry_flags | dax_iter_flags(iter, vmf));
 	if (ret)
 		return ret;
 

From patchwork Fri Sep 16 03:36:07 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978084
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id C579DECAAD3
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:43 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229708AbiIPDgk (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:40 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57542 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229600AbiIPDgJ (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:09 -0400
Received: from mga01.intel.com (mga01.intel.com [192.55.52.88])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A6631895CE;
        Thu, 15 Sep 2022 20:36:08 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299368; x=1694835368;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=iTs/4dIxqdixjfZltzlbEEuflTOzT+TWXRHIgDMaAds=;
  b=jVKhW6ulwMdPMvE0jGhxOS3FB3E0aZiykNK0fmAnLILX0w4tfkAEWp22
   8787s4Z7WfXt8GgBL74ZLf6pHwbAlhMSdXrFvOMYFvLd+Df8XAop1ilSK
   O7ZSLJYJcZAqcFBgs2FmPod4J+GuFfObCUf1Qo9Co+fzmxQ2phxHbHUT5
   FmPMw02IIzKnasFGS+2Kj8XVITAg+y92SrCLuzhUs7/NvNanFz8q2VC2u
   1KskvjTbVGFohu5IrGh5u11ip/d8w3axVafFgZSempIlD++7vzKZDOPUh
   jBQbaCMn114wacL/pIKSZBYT2SNNww1IpNaDgAt2KCQMvd4OV5Gys30aA
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="325170396"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="325170396"
Received: from orsmga008.jf.intel.com ([10.7.209.65])
  by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:08 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="648099943"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga008-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:07 -0700
Subject: [PATCH v2 10/18] fsdax: Manage pgmap references at entry insertion
 and deletion
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:07 -0700
Message-ID: 
 <166329936739.2786261.14035402420254589047.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

The percpu_ref in 'struct dev_pagemap' is used to coordinate active
mappings of device-memory with the device-removal / unbind path. It
enables the semantic that initiating device-removal (or
device-driver-unbind) blocks new mapping and DMA attempts, and waits for
mapping revocation or inflight DMA to complete.

Expand the scope of the reference count to pin the DAX device active at
mapping time and not later at the first gup event. With a device
reference being held while any page on that device is mapped the need to
manage pgmap reference counts in the gup code is eliminated. That
cleanup is saved for a follow-on change.

For now, teach dax_insert_entry() and dax_delete_mapping_entry() to take
and drop pgmap references respectively. Where dax_insert_entry() is
called to take the initial reference on the page, and
dax_delete_mapping_entry() is called once there are no outstanding
references to the given page(s).

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c                 |   34 ++++++++++++++++++++++++++++------
 include/linux/memremap.h |   18 ++++++++++++++----
 mm/memremap.c            |   13 ++++++++-----
 3 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5d9f30105db4..ee2568c8b135 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -376,14 +376,26 @@ static inline void dax_mapping_set_cow(struct page *page)
  * whether this entry is shared by multiple files.  If so, set the page->mapping
  * FS_DAX_MAPPING_COW, and use page->index as refcount.
  */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
-				struct vm_fault *vmf, unsigned long flags)
+static vm_fault_t dax_associate_entry(void *entry,
+				      struct address_space *mapping,
+				      struct vm_fault *vmf, unsigned long flags)
 {
 	unsigned long size = dax_entry_size(entry), pfn, index;
+	struct dev_pagemap *pgmap;
 	int i = 0;
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
+		return 0;
+
+	if (!size)
+		return 0;
+
+	if (!(flags & DAX_COW)) {
+		pfn = dax_to_pfn(entry);
+		pgmap = get_dev_pagemap_many(pfn, NULL, PHYS_PFN(size));
+		if (!pgmap)
+			return VM_FAULT_SIGBUS;
+	}
 
 	index = linear_page_index(vmf->vma, ALIGN(vmf->address, size));
 	for_each_mapped_pfn(entry, pfn) {
@@ -398,19 +410,24 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 			page_ref_inc(page);
 		}
 	}
+
+	return 0;
 }
 
 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		bool trunc)
 {
-	unsigned long pfn;
+	unsigned long size = dax_entry_size(entry), pfn;
+	struct page *page;
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 		return;
 
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
+	if (!size)
+		return;
 
+	for_each_mapped_pfn(entry, pfn) {
+		page = pfn_to_page(pfn);
 		if (dax_mapping_is_cow(page->mapping)) {
 			/* keep the CoW flag if this page is still shared */
 			if (page->index-- > 0)
@@ -423,6 +440,11 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		page->mapping = NULL;
 		page->index = 0;
 	}
+
+	if (trunc && !dax_mapping_is_cow(page->mapping)) {
+		page = pfn_to_page(dax_to_pfn(entry));
+		put_dev_pagemap_many(page->pgmap, PHYS_PFN(size));
+	}
 }
 
 /*
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..fd57407e7f3d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -191,8 +191,13 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-		struct dev_pagemap *pgmap);
+struct dev_pagemap *get_dev_pagemap_many(unsigned long pfn,
+					 struct dev_pagemap *pgmap, int refs);
+static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+						  struct dev_pagemap *pgmap)
+{
+	return get_dev_pagemap_many(pfn, pgmap, 1);
+}
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
@@ -244,10 +249,15 @@ static inline unsigned long memremap_compat_align(void)
 }
 #endif /* CONFIG_ZONE_DEVICE */
 
-static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
+static inline void put_dev_pagemap_many(struct dev_pagemap *pgmap, int refs)
 {
 	if (pgmap)
-		percpu_ref_put(&pgmap->ref);
+		percpu_ref_put_many(&pgmap->ref, refs);
+}
+
+static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
+{
+	put_dev_pagemap_many(pgmap, 1);
 }
 
 #endif /* _LINUX_MEMREMAP_H_ */
diff --git a/mm/memremap.c b/mm/memremap.c
index 95f6ffe9cb0f..83c5e6fafd84 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -430,15 +430,16 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
 }
 
 /**
- * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * get_dev_pagemap_many() - take new live references(s) on the dev_pagemap for @pfn
  * @pfn: page frame number to lookup page_map
  * @pgmap: optional known pgmap that already has a reference
+ * @refs: number of references to take
  *
  * If @pgmap is non-NULL and covers @pfn it will be returned as-is.  If @pgmap
  * is non-NULL but does not cover @pfn the reference to it will be released.
  */
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-		struct dev_pagemap *pgmap)
+struct dev_pagemap *get_dev_pagemap_many(unsigned long pfn,
+					 struct dev_pagemap *pgmap, int refs)
 {
 	resource_size_t phys = PFN_PHYS(pfn);
 
@@ -454,13 +455,15 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 	/* fall back to slow path lookup */
 	rcu_read_lock();
 	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
-	if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
+	if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
 		pgmap = NULL;
+	if (pgmap && refs > 1)
+		percpu_ref_get_many(&pgmap->ref, refs - 1);
 	rcu_read_unlock();
 
 	return pgmap;
 }
-EXPORT_SYMBOL_GPL(get_dev_pagemap);
+EXPORT_SYMBOL_GPL(get_dev_pagemap_many);
 
 void free_zone_device_page(struct page *page)
 {

From patchwork Fri Sep 16 03:36:13 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978085
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 1F4FCC6FA91
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:45 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230017AbiIPDgl (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:41 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56970 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229773AbiIPDgS (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:18 -0400
Received: from mga01.intel.com (mga01.intel.com [192.55.52.88])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id F1F8B89809;
        Thu, 15 Sep 2022 20:36:13 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299373; x=1694835373;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=qfzx5y9B9XHcY/WNGZvpd9adYJw1q+hyjxJ0sZQS8JA=;
  b=eWigBSljAb7z3Tke61JlKS+I5LMhsoVPUuVMsyfbmv1wmpceppEtGHnl
   Ef8T/rnnqSilGFUJC5+qCkeFdvJF5fxFxNq9XIbxk+D8YiRKY9i/AxWn8
   MIBXf6yFYEJfARh1927+PB1T43ejxqmHfeiCXYMNOFO8sG5wMhMpHPqyq
   F3Cw0AmIrRLKzsHoowMreQ/0CVTVCVWaTHAzG0VWvpXVf873ywPrkOPSK
   vX2FMlyhFunJuU9iyZWa8cZxocZ+hkmRdO5+yVrKs6+ce13H0/5qzynxU
   la0rMnC0xCCCBilPJ2ZDZMiLWwxFVOPpGp2tYt2cOv698GODiAZPxUQc3
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="325170420"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="325170420"
Received: from orsmga008.jf.intel.com ([10.7.209.65])
  by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:13 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="648099963"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga008-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:13 -0700
Subject: [PATCH v2 11/18] devdax: Minor warning fixups
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: hch@lst.de, linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:13 -0700
Message-ID: 
 <166329937313.2786261.6805174536617254263.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Fix a missing prototype warning for dev_dax_probe(), and fix
dax_holder() comment block format.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax-private.h |    1 +
 drivers/dax/super.c       |    2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 1c974b7caae6..202cafd836e8 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -87,6 +87,7 @@ static inline struct dax_mapping *to_dax_mapping(struct device *dev)
 }
 
 phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size);
+int dev_dax_probe(struct dev_dax *dev_dax);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline bool dax_align_valid(unsigned long align)
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 9b5e2a5eb0ae..4909ad945a49 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(put_dax);
 /**
  * dax_holder() - obtain the holder of a dax device
  * @dax_dev: a dax_device instance
-
+ *
  * Return: the holder's data which represents the holder if registered,
  * otherwize NULL.
  */

From patchwork Fri Sep 16 03:36:18 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978087
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 64C7CECAAD3
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:47 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230019AbiIPDgo (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:44 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57800 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229972AbiIPDgY (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:24 -0400
Received: from mga07.intel.com (mga07.intel.com [134.134.136.100])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id ADE1D33347;
        Thu, 15 Sep 2022 20:36:20 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299380; x=1694835380;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=yokvy2vJtxWd/DCQDxwhex72QghSUCJTijVP30FYlfM=;
  b=PUvkVeNMbxIQE0YzoSIAQ6ohr8iu015CpT0LZgw950aU1/KQGHtCDJzF
   89hsUYfl9Wr11Dup77YOuz9utMO//Y3Ty983AvyuI+qqNOlI9zEtMfZ8J
   0wbcjO06cCWdJuVCPLbwBUiudRmBCH6sA72/YBS14TctxMlhorFdJEDnM
   wihSpO4uBC6dqMalSXzhAvUr2H3L2ayB4juP5KM6yAW7Rtk9lHQWt5QRI
   6FimrS1+ZjR3A5g2GxUOz4yM/LICMLCdaFGMfP5LXZbOfjcv/XHSNsel0
   bbUs/lkAFYff8fU1bH+tL+4WESbglynO9IWFvTIFlRB/6jrB+b3OXA4AI
   A==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="362867021"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="362867021"
Received: from fmsmga007.fm.intel.com ([10.253.24.52])
  by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:20 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="619942605"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga007-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:19 -0700
Subject: [PATCH v2 12/18] devdax: Move address_space helpers to the DAX core
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:18 -0700
Message-ID: 
 <166329937873.2786261.10966526479509910698.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for decamping get_dev_pagemap() and
put_devmap_managed_page() from code paths outside of DAX, device-dax
needs to track mapping references similar to the tracking done for
fsdax. Reuse the same infrastructure as fsdax (dax_insert_entry() and
dax_delete_mapping_entry()). For now, just move that infrastructure into
a common location with no other code changes.

The move involves splitting iomap and supporting helpers into fs/dax.c
and all 'struct address_space' and DAX-entry manipulation into
drivers/dax/mapping.c. grab_mapping_entry() is renamed
dax_grab_mapping_entry(), and some common definitions and declarations
are moved to include/linux/dax.h.

No functional change is intended, just code movement.

The interactions between drivers/dax/mapping.o and mm/memory-failure.o
result in drivers/dax/mapping.o and the rest of the dax core losing its
option to be compiled as a module. That can be addressed later given the
fact the CONFIG_FS_DAX has always been forcing the dax core to be
compiled in. I.e. this is only a vmlinux size regression for
CONFIG_FS_DAX=n and CONFIG_DEV_DAX=m builds.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format             |    1 
 drivers/Makefile          |    2 
 drivers/dax/Kconfig       |    4 
 drivers/dax/Makefile      |    1 
 drivers/dax/dax-private.h |    1 
 drivers/dax/mapping.c     | 1010 +++++++++++++++++++++++++++++++++++++++++
 drivers/dax/super.c       |    4 
 drivers/nvdimm/Kconfig    |    1 
 fs/dax.c                  | 1109 +--------------------------------------------
 include/linux/dax.h       |  110 +++-
 include/linux/memremap.h  |    6 
 11 files changed, 1143 insertions(+), 1106 deletions(-)
 create mode 100644 drivers/dax/mapping.c

diff --git a/.clang-format b/.clang-format
index 1247d54f9e49..336fa266386e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -269,6 +269,7 @@ ForEachMacros:
   - 'for_each_link_cpus'
   - 'for_each_link_platforms'
   - 'for_each_lru'
+  - 'for_each_mapped_pfn'
   - 'for_each_matching_node'
   - 'for_each_matching_node_and_match'
   - 'for_each_mem_pfn_range'
diff --git a/drivers/Makefile b/drivers/Makefile
index 057857258bfd..ec6c4146b966 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
-obj-$(CONFIG_DAX)		+= dax/
+obj-y				+= dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= cxl/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 5fdf269a822e..205e9dda8928 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menuconfig DAX
-	tristate "DAX: direct access to differentiated memory"
+	bool "DAX: direct access to differentiated memory"
+	depends on MMU
 	select SRCU
-	default m if NVDIMM_DAX
 
 if DAX
 
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 90a56ca3b345..3546bca7adbf 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 dax-y := super.o
 dax-y += bus.o
+dax-y += mapping.o
 device_dax-y := device.o
 dax_pmem-y := pmem.o
 
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 202cafd836e8..19076f9d5c51 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -15,6 +15,7 @@ struct dax_device *inode_dax(struct inode *inode);
 struct inode *dax_inode(struct dax_device *dax_dev);
 int dax_bus_init(void);
 void dax_bus_exit(void);
+void dax_mapping_init(void);
 
 /**
  * struct dax_region - mapping infrastructure for dax devices
diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
new file mode 100644
index 000000000000..70576aa02148
--- /dev/null
+++ b/drivers/dax/mapping.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Direct Access mapping infrastructure split from fs/dax.c
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/rmap.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
+#include <linux/pagemap.h>
+
+#include "dax-private.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+void __init dax_mapping_init(void)
+{
+	int i;
+
+	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+		init_waitqueue_head(wait_table + i);
+}
+
+static unsigned long dax_to_pfn(void *entry)
+{
+	return xa_to_value(entry) >> DAX_SHIFT;
+}
+
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+{
+	return xa_mk_value((flags & DAX_MASK) |
+			   (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+}
+
+static bool dax_is_locked(void *entry)
+{
+	return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static bool dax_is_zapped(void *entry)
+{
+	return xa_to_value(entry) & DAX_ZAP;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+	if (xa_to_value(entry) & DAX_PMD)
+		return PMD_ORDER;
+	return 0;
+}
+
+static unsigned long dax_is_pmd_entry(void *entry)
+{
+	return xa_to_value(entry) & DAX_PMD;
+}
+
+static bool dax_is_pte_entry(void *entry)
+{
+	return !(xa_to_value(entry) & DAX_PMD);
+}
+
+static int dax_is_zero_entry(void *entry)
+{
+	return xa_to_value(entry) & DAX_ZERO_PAGE;
+}
+
+static int dax_is_empty_entry(void *entry)
+{
+	return xa_to_value(entry) & DAX_EMPTY;
+}
+
+/*
+ * true if the entry that was found is of a smaller order than the entry
+ * we were looking for
+ */
+static bool dax_is_conflict(void *entry)
+{
+	return entry == XA_RETRY_ENTRY;
+}
+
+/*
+ * DAX page cache entry locking
+ */
+struct exceptional_entry_key {
+	struct xarray *xa;
+	pgoff_t entry_start;
+};
+
+struct wait_exceptional_entry_queue {
+	wait_queue_entry_t wait;
+	struct exceptional_entry_key key;
+};
+
+/**
+ * enum dax_wake_mode: waitqueue wakeup behaviour
+ * @WAKE_ALL: wake all waiters in the waitqueue
+ * @WAKE_NEXT: wake only the first waiter in the waitqueue
+ */
+enum dax_wake_mode {
+	WAKE_ALL,
+	WAKE_NEXT,
+};
+
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry,
+					      struct exceptional_entry_key *key)
+{
+	unsigned long hash;
+	unsigned long index = xas->xa_index;
+
+	/*
+	 * If 'entry' is a PMD, align the 'index' that we use for the wait
+	 * queue to the start of that PMD.  This ensures that all offsets in
+	 * the range covered by the PMD map to the same bit lock.
+	 */
+	if (dax_is_pmd_entry(entry))
+		index &= ~PG_PMD_COLOUR;
+	key->xa = xas->xa;
+	key->entry_start = index;
+
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}
+
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+				       unsigned int mode, int sync, void *keyp)
+{
+	struct exceptional_entry_key *key = keyp;
+	struct wait_exceptional_entry_queue *ewait =
+		container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+	if (key->xa != ewait->key.xa ||
+	    key->entry_start != ewait->key.entry_start)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * @entry may no longer be the entry at the index in the mapping.
+ * The important information it's conveying is whether the entry at
+ * this index used to be a PMD entry.
+ */
+static void dax_wake_entry(struct xa_state *xas, void *entry,
+			   enum dax_wake_mode mode)
+{
+	struct exceptional_entry_key key;
+	wait_queue_head_t *wq;
+
+	wq = dax_entry_waitqueue(xas, entry, &key);
+
+	/*
+	 * Checking for locked entry and prepare_to_wait_exclusive() happens
+	 * under the i_pages lock, ditto for entry handling in our callers.
+	 * So at this point all tasks that could have seen our entry locked
+	 * must be in the waitqueue and the following check will see them.
+	 */
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
+}
+
+/*
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it.  The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.  The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a dax_is_conflict entry.
+ *
+ * Must be called with the i_pages lock held.
+ */
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+{
+	void *entry;
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	for (;;) {
+		entry = xas_find_conflict(xas);
+		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+			return entry;
+		if (dax_entry_order(entry) < order)
+			return XA_RETRY_ENTRY;
+		if (!dax_is_locked(entry))
+			return entry;
+
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+		prepare_to_wait_exclusive(wq, &ewait.wait,
+					  TASK_UNINTERRUPTIBLE);
+		xas_unlock_irq(xas);
+		xas_reset(xas);
+		schedule();
+		finish_wait(wq, &ewait.wait);
+		xas_lock_irq(xas);
+	}
+}
+
+/*
+ * The only thing keeping the address space around is the i_pages lock
+ * (it's cycled in clear_inode() after removing the entries from i_pages)
+ * After we call xas_unlock_irq(), we cannot touch xas->xa.
+ */
+static void wait_entry_unlocked(struct xa_state *xas, void *entry)
+{
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+	/*
+	 * Unlike get_unlocked_entry() there is no guarantee that this
+	 * path ever successfully retrieves an unlocked entry before an
+	 * inode dies. Perform a non-exclusive wait in case this path
+	 * never successfully performs its own wake up.
+	 */
+	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+	xas_unlock_irq(xas);
+	schedule();
+	finish_wait(wq, &ewait.wait);
+}
+
+static void put_unlocked_entry(struct xa_state *xas, void *entry,
+			       enum dax_wake_mode mode)
+{
+	if (entry && !dax_is_conflict(entry))
+		dax_wake_entry(xas, entry, mode);
+}
+
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+void dax_unlock_entry(struct xa_state *xas, void *entry)
+{
+	void *old;
+
+	WARN_ON(dax_is_locked(entry));
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	old = xas_store(xas, entry);
+	xas_unlock_irq(xas);
+	WARN_ON(!dax_is_locked(old));
+	dax_wake_entry(xas, entry, WAKE_NEXT);
+}
+
+/*
+ * Return: The entry stored at this location before it was locked.
+ */
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
+}
+
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry))
+		return 0;
+	else if (dax_is_empty_entry(entry))
+		return 0;
+	else if (dax_is_pmd_entry(entry))
+		return PMD_SIZE;
+	else
+		return PAGE_SIZE;
+}
+
+static unsigned long dax_end_pfn(void *entry)
+{
+	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+}
+
+/*
+ * Iterate through all mapped pfns represented by an entry, i.e. skip
+ * 'empty' and 'zero' entries.
+ */
+#define for_each_mapped_pfn(entry, pfn) \
+	for (pfn = dax_to_pfn(entry); pfn < dax_end_pfn(entry); pfn++)
+
+static bool dax_mapping_is_cow(struct address_space *mapping)
+{
+	return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
+/*
+ * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ */
+static void dax_mapping_set_cow(struct page *page)
+{
+	if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+		/*
+		 * Reset the index if the page was already mapped
+		 * regularly before.
+		 */
+		if (page->mapping)
+			page->index = 1;
+		page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+	}
+	page->index++;
+}
+
+/*
+ * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * whether this entry is shared by multiple files.  If so, set the page->mapping
+ * FS_DAX_MAPPING_COW, and use page->index as refcount.
+ */
+static vm_fault_t dax_associate_entry(void *entry,
+				      struct address_space *mapping,
+				      struct vm_fault *vmf, unsigned long flags)
+{
+	unsigned long size = dax_entry_size(entry), pfn, index;
+	struct dev_pagemap *pgmap;
+	int i = 0;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return 0;
+
+	if (!size)
+		return 0;
+
+	if (!(flags & DAX_COW)) {
+		pfn = dax_to_pfn(entry);
+		pgmap = get_dev_pagemap_many(pfn, NULL, PHYS_PFN(size));
+		if (!pgmap)
+			return VM_FAULT_SIGBUS;
+	}
+
+	index = linear_page_index(vmf->vma, ALIGN(vmf->address, size));
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (flags & DAX_COW) {
+			dax_mapping_set_cow(page);
+		} else {
+			WARN_ON_ONCE(page->mapping);
+			page->mapping = mapping;
+			page->index = index + i++;
+			page_ref_inc(page);
+		}
+	}
+
+	return 0;
+}
+
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+		bool trunc)
+{
+	unsigned long size = dax_entry_size(entry), pfn;
+	struct page *page;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	if (!size)
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		page = pfn_to_page(pfn);
+		if (dax_mapping_is_cow(page->mapping)) {
+			/* keep the CoW flag if this page is still shared */
+			if (page->index-- > 0)
+				continue;
+		} else {
+			WARN_ON_ONCE(trunc && !dax_is_zapped(entry));
+			WARN_ON_ONCE(trunc && !dax_page_idle(page));
+			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		}
+		page->mapping = NULL;
+		page->index = 0;
+	}
+
+	if (trunc && !dax_mapping_is_cow(page->mapping)) {
+		page = pfn_to_page(dax_to_pfn(entry));
+		put_dev_pagemap_many(page->pgmap, PHYS_PFN(size));
+	}
+}
+
+/*
+ * dax_lock_page - Lock the DAX entry corresponding to a page
+ * @page: The page whose entry we want to lock
+ *
+ * Context: Process context.
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * not be locked.
+ */
+dax_entry_t dax_lock_page(struct page *page)
+{
+	XA_STATE(xas, NULL, 0);
+	void *entry;
+
+	/* Ensure page->mapping isn't freed while we look at it */
+	rcu_read_lock();
+	for (;;) {
+		struct address_space *mapping = READ_ONCE(page->mapping);
+
+		entry = NULL;
+		if (!mapping || !dax_mapping(mapping))
+			break;
+
+		/*
+		 * In the device-dax case there's no need to lock, a
+		 * struct dev_pagemap pin is sufficient to keep the
+		 * inode alive, and we assume we have dev_pagemap pin
+		 * otherwise we would not have a valid pfn_to_page()
+		 * translation.
+		 */
+		entry = (void *)~0UL;
+		if (S_ISCHR(mapping->host->i_mode))
+			break;
+
+		xas.xa = &mapping->i_pages;
+		xas_lock_irq(&xas);
+		if (mapping != page->mapping) {
+			xas_unlock_irq(&xas);
+			continue;
+		}
+		xas_set(&xas, page->index);
+		entry = xas_load(&xas);
+		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(&xas, entry);
+			rcu_read_lock();
+			continue;
+		}
+		dax_lock_entry(&xas, entry);
+		xas_unlock_irq(&xas);
+		break;
+	}
+	rcu_read_unlock();
+	return (dax_entry_t)entry;
+}
+
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
+{
+	struct address_space *mapping = page->mapping;
+	XA_STATE(xas, &mapping->i_pages, page->index);
+
+	if (S_ISCHR(mapping->host->i_mode))
+		return;
+
+	dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+				   struct page **page)
+{
+	XA_STATE(xas, NULL, 0);
+	void *entry;
+
+	rcu_read_lock();
+	for (;;) {
+		entry = NULL;
+		if (!dax_mapping(mapping))
+			break;
+
+		xas.xa = &mapping->i_pages;
+		xas_lock_irq(&xas);
+		xas_set(&xas, index);
+		entry = xas_load(&xas);
+		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(&xas, entry);
+			rcu_read_lock();
+			continue;
+		}
+		if (!entry || dax_is_zero_entry(entry) ||
+		    dax_is_empty_entry(entry)) {
+			/*
+			 * Because we are looking for entry from file's mapping
+			 * and index, so the entry may not be inserted for now,
+			 * or even a zero/empty entry.  We don't think this is
+			 * an error case.  So, return a special value and do
+			 * not output @page.
+			 */
+			entry = (void *)~0UL;
+		} else {
+			*page = pfn_to_page(dax_to_pfn(entry));
+			dax_lock_entry(&xas, entry);
+		}
+		xas_unlock_irq(&xas);
+		break;
+	}
+	rcu_read_unlock();
+	return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+			      dax_entry_t cookie)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+
+	if (cookie == ~0UL)
+		return;
+
+	dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
+ *
+ * When requesting an entry with size DAX_PMD, dax_grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
+ *
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries.  This happens for both PMD zero pages as
+ * well as PMD empty entries.
+ *
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them.  We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR.  Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
+ * overlap with xarray value entries.
+ */
+void *dax_grab_mapping_entry(struct xa_state *xas,
+			     struct address_space *mapping, unsigned int order)
+{
+	unsigned long index = xas->xa_index;
+	bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
+	void *entry;
+
+retry:
+	pmd_downgrade = false;
+	xas_lock_irq(xas);
+	entry = get_unlocked_entry(xas, order);
+
+	if (entry) {
+		if (dax_is_conflict(entry))
+			goto fallback;
+		if (!xa_is_value(entry)) {
+			xas_set_err(xas, -EIO);
+			goto out_unlock;
+		}
+
+		if (order == 0) {
+			if (dax_is_pmd_entry(entry) &&
+			    (dax_is_zero_entry(entry) ||
+			     dax_is_empty_entry(entry))) {
+				pmd_downgrade = true;
+			}
+		}
+	}
+
+	if (pmd_downgrade) {
+		/*
+		 * Make sure 'entry' remains valid while we drop
+		 * the i_pages lock.
+		 */
+		dax_lock_entry(xas, entry);
+
+		/*
+		 * Besides huge zero pages the only other thing that gets
+		 * downgraded are empty entries which don't need to be
+		 * unmapped.
+		 */
+		if (dax_is_zero_entry(entry)) {
+			xas_unlock_irq(xas);
+			unmap_mapping_pages(mapping,
+					    xas->xa_index & ~PG_PMD_COLOUR,
+					    PG_PMD_NR, false);
+			xas_reset(xas);
+			xas_lock_irq(xas);
+		}
+
+		dax_disassociate_entry(entry, mapping, false);
+		xas_store(xas, NULL); /* undo the PMD join */
+		dax_wake_entry(xas, entry, WAKE_ALL);
+		mapping->nrpages -= PG_PMD_NR;
+		entry = NULL;
+		xas_set(xas, index);
+	}
+
+	if (entry) {
+		dax_lock_entry(xas, entry);
+	} else {
+		unsigned long flags = DAX_EMPTY;
+
+		if (order > 0)
+			flags |= DAX_PMD;
+		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+		dax_lock_entry(xas, entry);
+		if (xas_error(xas))
+			goto out_unlock;
+		mapping->nrpages += 1UL << order;
+	}
+
+out_unlock:
+	xas_unlock_irq(xas);
+	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+		goto retry;
+	if (xas->xa_node == XA_ERROR(-ENOMEM))
+		return xa_mk_internal(VM_FAULT_OOM);
+	if (xas_error(xas))
+		return xa_mk_internal(VM_FAULT_SIGBUS);
+	return entry;
+fallback:
+	xas_unlock_irq(xas);
+	return xa_mk_internal(VM_FAULT_FALLBACK);
+}
+
+static void *dax_zap_entry(struct xa_state *xas, void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return xas_store(xas, xa_mk_value(v | DAX_ZAP));
+}
+
+/*
+ * Return NULL if the entry is zapped and all pages in the entry are
+ * idle, otherwise return the non-idle page in the entry
+ */
+static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
+{
+	struct page *ret = NULL;
+	unsigned long pfn;
+	bool zap;
+
+	if (!dax_entry_size(entry))
+		return NULL;
+
+	zap = !dax_is_zapped(entry);
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (zap)
+			page_ref_dec(page);
+
+		if (!ret && !dax_page_idle(page))
+			ret = page;
+	}
+
+	if (zap)
+		dax_zap_entry(xas, entry);
+
+	return ret;
+}
+
+/**
+ * dax_zap_mappings_range - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ *       pages from 'start' till the end of file are included.
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ */
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end)
+{
+	void *entry;
+	unsigned int scanned = 0;
+	struct page *page = NULL;
+	pgoff_t start_idx = start >> PAGE_SHIFT;
+	pgoff_t end_idx;
+	XA_STATE(xas, &mapping->i_pages, start_idx);
+
+	/*
+	 * In the 'limited' case get_user_pages() for dax is disabled.
+	 */
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return NULL;
+
+	if (!dax_mapping(mapping))
+		return NULL;
+
+	/* If end == LLONG_MAX, all pages from start to till end of file */
+	if (end == LLONG_MAX)
+		end_idx = ULONG_MAX;
+	else
+		end_idx = end >> PAGE_SHIFT;
+	/*
+	 * If we race get_user_pages_fast() here either we'll see the
+	 * elevated page count in the iteration and wait, or
+	 * get_user_pages_fast() will see that the page it took a reference
+	 * against is no longer mapped in the page tables and bail to the
+	 * get_user_pages() slow path.  The slow path is protected by
+	 * pte_lock() and pmd_lock(). New references are not taken without
+	 * holding those locks, and unmap_mapping_pages() will not zero the
+	 * pte or pmd without holding the respective lock, so we are
+	 * guaranteed to either see new references or prevent new
+	 * references from being established.
+	 */
+	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
+
+	xas_lock_irq(&xas);
+	xas_for_each(&xas, entry, end_idx) {
+		if (WARN_ON_ONCE(!xa_is_value(entry)))
+			continue;
+		if (unlikely(dax_is_locked(entry)))
+			entry = get_unlocked_entry(&xas, 0);
+		if (entry)
+			page = dax_zap_pages(&xas, entry);
+		put_unlocked_entry(&xas, entry, WAKE_NEXT);
+		if (page)
+			break;
+		if (++scanned % XA_CHECK_SCHED)
+			continue;
+
+		xas_pause(&xas);
+		xas_unlock_irq(&xas);
+		cond_resched();
+		xas_lock_irq(&xas);
+	}
+	xas_unlock_irq(&xas);
+	return page;
+}
+EXPORT_SYMBOL_GPL(dax_zap_mappings_range);
+
+struct page *dax_zap_mappings(struct address_space *mapping)
+{
+	return dax_zap_mappings_range(mapping, 0, LLONG_MAX);
+}
+EXPORT_SYMBOL_GPL(dax_zap_mappings);
+
+static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index,
+				  bool trunc)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+	int ret = 0;
+	void *entry;
+
+	xas_lock_irq(&xas);
+	entry = get_unlocked_entry(&xas, 0);
+	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+		goto out;
+	if (!trunc && (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+		       xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	dax_disassociate_entry(entry, mapping, trunc);
+	xas_store(&xas, NULL);
+	mapping->nrpages -= 1UL << dax_entry_order(entry);
+	ret = 1;
+out:
+	put_unlocked_entry(&xas, entry, WAKE_ALL);
+	xas_unlock_irq(&xas);
+	return ret;
+}
+
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_entry(mapping, index, false);
+}
+
+/*
+ * Delete DAX entry at @index from @mapping.  Wait for it
+ * to be unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = __dax_invalidate_entry(mapping, index, true);
+
+	/*
+	 * This gets called from truncate / punch_hole path. As such, the caller
+	 * must hold locks protecting against concurrent modifications of the
+	 * page cache (usually fs-private i_mmap_sem for writing). Since the
+	 * caller has seen a DAX entry for this index, we better find it
+	 * at that index as well...
+	 */
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * By this point dax_grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
+vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+			    void **pentry, pfn_t pfn, unsigned long flags)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *new_entry = dax_make_entry(pfn, flags);
+	bool dirty = flags & DAX_DIRTY;
+	bool cow = flags & DAX_COW;
+	void *entry = *pentry;
+
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
+		unsigned long index = xas->xa_index;
+		/* we are replacing a zero page with block mapping */
+		if (dax_is_pmd_entry(entry))
+			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+					    PG_PMD_NR, false);
+		else /* pte entry */
+			unmap_mapping_pages(mapping, index, 1, false);
+	}
+
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+		void *old;
+
+		dax_disassociate_entry(entry, mapping, false);
+		dax_associate_entry(new_entry, mapping, vmf, flags);
+		/*
+		 * Only swap our new entry into the page cache if the current
+		 * entry is a zero page or an empty entry.  If a normal PTE or
+		 * PMD entry is already in the cache, we leave it alone.  This
+		 * means that if we are trying to insert a PTE and the
+		 * existing entry is a PMD, we will just leave the PMD in the
+		 * tree and dirty it if necessary.
+		 */
+		old = dax_lock_entry(xas, new_entry);
+		WARN_ON_ONCE(old !=
+			     xa_mk_value(xa_to_value(entry) | DAX_LOCKED));
+		entry = new_entry;
+	} else {
+		xas_load(xas); /* Walk the xa_state */
+	}
+
+	if (dirty)
+		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+
+	if (cow)
+		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
+	xas_unlock_irq(xas);
+	*pentry = entry;
+	return 0;
+}
+
+int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+		      struct address_space *mapping, void *entry)
+{
+	unsigned long pfn, index, count, end;
+	long ret = 0;
+	struct vm_area_struct *vma;
+
+	/*
+	 * A page got tagged dirty in DAX mapping? Something is seriously
+	 * wrong.
+	 */
+	if (WARN_ON(!xa_is_value(entry)))
+		return -EIO;
+
+	if (unlikely(dax_is_locked(entry))) {
+		void *old_entry = entry;
+
+		entry = get_unlocked_entry(xas, 0);
+
+		/* Entry got punched out / reallocated? */
+		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+			goto put_unlocked;
+		/*
+		 * Entry got reallocated elsewhere? No need to writeback.
+		 * We have to compare pfns as we must not bail out due to
+		 * difference in lockbit or entry type.
+		 */
+		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+			goto put_unlocked;
+		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+					dax_is_zero_entry(entry))) {
+			ret = -EIO;
+			goto put_unlocked;
+		}
+
+		/* Another fsync thread may have already done this entry */
+		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+			goto put_unlocked;
+	}
+
+	/* Lock the entry to serialize with page faults */
+	dax_lock_entry(xas, entry);
+
+	/*
+	 * We can clear the tag now but we have to be careful so that concurrent
+	 * dax_writeback_one() calls for the same index cannot finish before we
+	 * actually flush the caches. This is achieved as the calls will look
+	 * at the entry only under the i_pages lock and once they do that
+	 * they will see the entry locked and wait for it to unlock.
+	 */
+	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+	xas_unlock_irq(xas);
+
+	/*
+	 * If dax_writeback_mapping_range() was given a wbc->range_start
+	 * in the middle of a PMD, the 'index' we use needs to be
+	 * aligned to the start of the PMD.
+	 * This allows us to flush for PMD_SIZE and not have to worry about
+	 * partial PMD writebacks.
+	 */
+	pfn = dax_to_pfn(entry);
+	count = 1UL << dax_entry_order(entry);
+	index = xas->xa_index & ~(count - 1);
+	end = index + count - 1;
+
+	/* Walk all mappings of a given index of a file and writeprotect them */
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+		pfn_mkclean_range(pfn, count, index, vma);
+		cond_resched();
+	}
+	i_mmap_unlock_read(mapping);
+
+	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
+	/*
+	 * After we have flushed the cache, we can clear the dirty tag. There
+	 * cannot be new dirty data in the pfn after the flush has completed as
+	 * the pfn mappings are writeprotected and fault waits for mapping
+	 * entry lock.
+	 */
+	xas_reset(xas);
+	xas_lock_irq(xas);
+	xas_store(xas, entry);
+	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+	dax_wake_entry(xas, entry, WAKE_NEXT);
+
+	trace_dax_writeback_one(mapping->host, index, count);
+	return ret;
+
+ put_unlocked:
+	put_unlocked_entry(xas, entry, WAKE_NEXT);
+	return ret;
+}
+
+/*
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pfn: PFN to insert
+ * @order: Order of entry to insert.
+ *
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file.  It also marks the page cache entry as dirty.
+ */
+vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
+				  unsigned int order)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+	void *entry;
+	vm_fault_t ret;
+
+	xas_lock_irq(&xas);
+	entry = get_unlocked_entry(&xas, order);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry || dax_is_conflict(entry) ||
+	    (order == 0 && !dax_is_pte_entry(entry))) {
+		put_unlocked_entry(&xas, entry, WAKE_NEXT);
+		xas_unlock_irq(&xas);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+	dax_lock_entry(&xas, entry);
+	xas_unlock_irq(&xas);
+	if (order == 0)
+		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+#ifdef CONFIG_FS_DAX_PMD
+	else if (order == PMD_ORDER)
+		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+#endif
+	else
+		ret = VM_FAULT_FALLBACK;
+	dax_unlock_entry(&xas, entry);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+	return ret;
+}
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 4909ad945a49..0976857ec7f2 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -564,6 +564,8 @@ static int __init dax_core_init(void)
 	if (rc)
 		return rc;
 
+	dax_mapping_init();
+
 	rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
 	if (rc)
 		goto err_chrdev;
@@ -590,5 +592,5 @@ static void __exit dax_core_exit(void)
 
 MODULE_AUTHOR("Intel Corporation");
 MODULE_LICENSE("GPL v2");
-subsys_initcall(dax_core_init);
+fs_initcall(dax_core_init);
 module_exit(dax_core_exit);
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 5a29046e3319..3bb17448d1c8 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -78,6 +78,7 @@ config NVDIMM_DAX
 	bool "NVDIMM DAX: Raw access to persistent memory"
 	default LIBNVDIMM
 	depends on NVDIMM_PFN
+	depends on DAX
 	help
 	  Support raw device dax access to a persistent memory
 	  namespace.  For environments that want to hard partition
diff --git a/fs/dax.c b/fs/dax.c
index ee2568c8b135..79e49e718d33 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -27,847 +27,8 @@
 #include <linux/rmap.h>
 #include <asm/pgalloc.h>
 
-#define CREATE_TRACE_POINTS
 #include <trace/events/fs_dax.h>
 
-static inline unsigned int pe_order(enum page_entry_size pe_size)
-{
-	if (pe_size == PE_SIZE_PTE)
-		return PAGE_SHIFT - PAGE_SHIFT;
-	if (pe_size == PE_SIZE_PMD)
-		return PMD_SHIFT - PAGE_SHIFT;
-	if (pe_size == PE_SIZE_PUD)
-		return PUD_SHIFT - PAGE_SHIFT;
-	return ~0;
-}
-
-/* We choose 4096 entries - same as per-zone page wait tables */
-#define DAX_WAIT_TABLE_BITS 12
-#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
-
-/* The 'colour' (ie low bits) within a PMD of a page offset.  */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
-#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
-
-/* The order of a PMD entry */
-#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
-
-static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
-
-static int __init init_dax_wait_table(void)
-{
-	int i;
-
-	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
-		init_waitqueue_head(wait_table + i);
-	return 0;
-}
-fs_initcall(init_dax_wait_table);
-
-/*
- * DAX pagecache entries use XArray value entries so they can't be mistaken
- * for pages.  We use one bit for locking, one bit for the entry size (PMD)
- * and two more to tell us if the entry is a zero page or an empty entry that
- * is just used for locking.  In total four special bits.
- *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
- * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
- * block allocation.
- */
-#define DAX_SHIFT	(5)
-#define DAX_MASK	((1UL << DAX_SHIFT) - 1)
-#define DAX_LOCKED	(1UL << 0)
-#define DAX_PMD		(1UL << 1)
-#define DAX_ZERO_PAGE	(1UL << 2)
-#define DAX_EMPTY	(1UL << 3)
-#define DAX_ZAP		(1UL << 4)
-
-/*
- * These flags are not conveyed in Xarray value entries, they are just
- * modifiers to dax_insert_entry().
- */
-#define DAX_DIRTY (1UL << (DAX_SHIFT + 0))
-#define DAX_COW   (1UL << (DAX_SHIFT + 1))
-
-static unsigned long dax_to_pfn(void *entry)
-{
-	return xa_to_value(entry) >> DAX_SHIFT;
-}
-
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
-{
-	return xa_mk_value((flags & DAX_MASK) |
-			   (pfn_t_to_pfn(pfn) << DAX_SHIFT));
-}
-
-static bool dax_is_locked(void *entry)
-{
-	return xa_to_value(entry) & DAX_LOCKED;
-}
-
-static bool dax_is_zapped(void *entry)
-{
-	return xa_to_value(entry) & DAX_ZAP;
-}
-
-static unsigned int dax_entry_order(void *entry)
-{
-	if (xa_to_value(entry) & DAX_PMD)
-		return PMD_ORDER;
-	return 0;
-}
-
-static unsigned long dax_is_pmd_entry(void *entry)
-{
-	return xa_to_value(entry) & DAX_PMD;
-}
-
-static bool dax_is_pte_entry(void *entry)
-{
-	return !(xa_to_value(entry) & DAX_PMD);
-}
-
-static int dax_is_zero_entry(void *entry)
-{
-	return xa_to_value(entry) & DAX_ZERO_PAGE;
-}
-
-static int dax_is_empty_entry(void *entry)
-{
-	return xa_to_value(entry) & DAX_EMPTY;
-}
-
-/*
- * true if the entry that was found is of a smaller order than the entry
- * we were looking for
- */
-static bool dax_is_conflict(void *entry)
-{
-	return entry == XA_RETRY_ENTRY;
-}
-
-/*
- * DAX page cache entry locking
- */
-struct exceptional_entry_key {
-	struct xarray *xa;
-	pgoff_t entry_start;
-};
-
-struct wait_exceptional_entry_queue {
-	wait_queue_entry_t wait;
-	struct exceptional_entry_key key;
-};
-
-/**
- * enum dax_wake_mode: waitqueue wakeup behaviour
- * @WAKE_ALL: wake all waiters in the waitqueue
- * @WAKE_NEXT: wake only the first waiter in the waitqueue
- */
-enum dax_wake_mode {
-	WAKE_ALL,
-	WAKE_NEXT,
-};
-
-static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
-		void *entry, struct exceptional_entry_key *key)
-{
-	unsigned long hash;
-	unsigned long index = xas->xa_index;
-
-	/*
-	 * If 'entry' is a PMD, align the 'index' that we use for the wait
-	 * queue to the start of that PMD.  This ensures that all offsets in
-	 * the range covered by the PMD map to the same bit lock.
-	 */
-	if (dax_is_pmd_entry(entry))
-		index &= ~PG_PMD_COLOUR;
-	key->xa = xas->xa;
-	key->entry_start = index;
-
-	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
-	return wait_table + hash;
-}
-
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
-		unsigned int mode, int sync, void *keyp)
-{
-	struct exceptional_entry_key *key = keyp;
-	struct wait_exceptional_entry_queue *ewait =
-		container_of(wait, struct wait_exceptional_entry_queue, wait);
-
-	if (key->xa != ewait->key.xa ||
-	    key->entry_start != ewait->key.entry_start)
-		return 0;
-	return autoremove_wake_function(wait, mode, sync, NULL);
-}
-
-/*
- * @entry may no longer be the entry at the index in the mapping.
- * The important information it's conveying is whether the entry at
- * this index used to be a PMD entry.
- */
-static void dax_wake_entry(struct xa_state *xas, void *entry,
-			   enum dax_wake_mode mode)
-{
-	struct exceptional_entry_key key;
-	wait_queue_head_t *wq;
-
-	wq = dax_entry_waitqueue(xas, entry, &key);
-
-	/*
-	 * Checking for locked entry and prepare_to_wait_exclusive() happens
-	 * under the i_pages lock, ditto for entry handling in our callers.
-	 * So at this point all tasks that could have seen our entry locked
-	 * must be in the waitqueue and the following check will see them.
-	 */
-	if (waitqueue_active(wq))
-		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
-}
-
-/*
- * Look up entry in page cache, wait for it to become unlocked if it
- * is a DAX entry and return it.  The caller must subsequently call
- * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
- * if it did.  The entry returned may have a larger order than @order.
- * If @order is larger than the order of the entry found in i_pages, this
- * function returns a dax_is_conflict entry.
- *
- * Must be called with the i_pages lock held.
- */
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
-{
-	void *entry;
-	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq;
-
-	init_wait(&ewait.wait);
-	ewait.wait.func = wake_exceptional_entry_func;
-
-	for (;;) {
-		entry = xas_find_conflict(xas);
-		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
-			return entry;
-		if (dax_entry_order(entry) < order)
-			return XA_RETRY_ENTRY;
-		if (!dax_is_locked(entry))
-			return entry;
-
-		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
-		prepare_to_wait_exclusive(wq, &ewait.wait,
-					  TASK_UNINTERRUPTIBLE);
-		xas_unlock_irq(xas);
-		xas_reset(xas);
-		schedule();
-		finish_wait(wq, &ewait.wait);
-		xas_lock_irq(xas);
-	}
-}
-
-/*
- * The only thing keeping the address space around is the i_pages lock
- * (it's cycled in clear_inode() after removing the entries from i_pages)
- * After we call xas_unlock_irq(), we cannot touch xas->xa.
- */
-static void wait_entry_unlocked(struct xa_state *xas, void *entry)
-{
-	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq;
-
-	init_wait(&ewait.wait);
-	ewait.wait.func = wake_exceptional_entry_func;
-
-	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
-	/*
-	 * Unlike get_unlocked_entry() there is no guarantee that this
-	 * path ever successfully retrieves an unlocked entry before an
-	 * inode dies. Perform a non-exclusive wait in case this path
-	 * never successfully performs its own wake up.
-	 */
-	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
-	xas_unlock_irq(xas);
-	schedule();
-	finish_wait(wq, &ewait.wait);
-}
-
-static void put_unlocked_entry(struct xa_state *xas, void *entry,
-			       enum dax_wake_mode mode)
-{
-	if (entry && !dax_is_conflict(entry))
-		dax_wake_entry(xas, entry, mode);
-}
-
-/*
- * We used the xa_state to get the entry, but then we locked the entry and
- * dropped the xa_lock, so we know the xa_state is stale and must be reset
- * before use.
- */
-static void dax_unlock_entry(struct xa_state *xas, void *entry)
-{
-	void *old;
-
-	BUG_ON(dax_is_locked(entry));
-	xas_reset(xas);
-	xas_lock_irq(xas);
-	old = xas_store(xas, entry);
-	xas_unlock_irq(xas);
-	BUG_ON(!dax_is_locked(old));
-	dax_wake_entry(xas, entry, WAKE_NEXT);
-}
-
-/*
- * Return: The entry stored at this location before it was locked.
- */
-static void *dax_lock_entry(struct xa_state *xas, void *entry)
-{
-	unsigned long v = xa_to_value(entry);
-	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
-}
-
-static unsigned long dax_entry_size(void *entry)
-{
-	if (dax_is_zero_entry(entry))
-		return 0;
-	else if (dax_is_empty_entry(entry))
-		return 0;
-	else if (dax_is_pmd_entry(entry))
-		return PMD_SIZE;
-	else
-		return PAGE_SIZE;
-}
-
-static unsigned long dax_end_pfn(void *entry)
-{
-	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
-}
-
-/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
- */
-#define for_each_mapped_pfn(entry, pfn) \
-	for (pfn = dax_to_pfn(entry); \
-			pfn < dax_end_pfn(entry); pfn++)
-
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
-{
-	return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
-}
-
-/*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
- */
-static inline void dax_mapping_set_cow(struct page *page)
-{
-	if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
-		/*
-		 * Reset the index if the page was already mapped
-		 * regularly before.
-		 */
-		if (page->mapping)
-			page->index = 1;
-		page->mapping = (void *)PAGE_MAPPING_DAX_COW;
-	}
-	page->index++;
-}
-
-/*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
- * whether this entry is shared by multiple files.  If so, set the page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
- */
-static vm_fault_t dax_associate_entry(void *entry,
-				      struct address_space *mapping,
-				      struct vm_fault *vmf, unsigned long flags)
-{
-	unsigned long size = dax_entry_size(entry), pfn, index;
-	struct dev_pagemap *pgmap;
-	int i = 0;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return 0;
-
-	if (!size)
-		return 0;
-
-	if (!(flags & DAX_COW)) {
-		pfn = dax_to_pfn(entry);
-		pgmap = get_dev_pagemap_many(pfn, NULL, PHYS_PFN(size));
-		if (!pgmap)
-			return VM_FAULT_SIGBUS;
-	}
-
-	index = linear_page_index(vmf->vma, ALIGN(vmf->address, size));
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		if (flags & DAX_COW) {
-			dax_mapping_set_cow(page);
-		} else {
-			WARN_ON_ONCE(page->mapping);
-			page->mapping = mapping;
-			page->index = index + i++;
-			page_ref_inc(page);
-		}
-	}
-
-	return 0;
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-		bool trunc)
-{
-	unsigned long size = dax_entry_size(entry), pfn;
-	struct page *page;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	if (!size)
-		return;
-
-	for_each_mapped_pfn(entry, pfn) {
-		page = pfn_to_page(pfn);
-		if (dax_mapping_is_cow(page->mapping)) {
-			/* keep the CoW flag if this page is still shared */
-			if (page->index-- > 0)
-				continue;
-		} else {
-			WARN_ON_ONCE(trunc && !dax_is_zapped(entry));
-			WARN_ON_ONCE(trunc && !dax_page_idle(page));
-			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		}
-		page->mapping = NULL;
-		page->index = 0;
-	}
-
-	if (trunc && !dax_mapping_is_cow(page->mapping)) {
-		page = pfn_to_page(dax_to_pfn(entry));
-		put_dev_pagemap_many(page->pgmap, PHYS_PFN(size));
-	}
-}
-
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
- *
- * Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
- * not be locked.
- */
-dax_entry_t dax_lock_page(struct page *page)
-{
-	XA_STATE(xas, NULL, 0);
-	void *entry;
-
-	/* Ensure page->mapping isn't freed while we look at it */
-	rcu_read_lock();
-	for (;;) {
-		struct address_space *mapping = READ_ONCE(page->mapping);
-
-		entry = NULL;
-		if (!mapping || !dax_mapping(mapping))
-			break;
-
-		/*
-		 * In the device-dax case there's no need to lock, a
-		 * struct dev_pagemap pin is sufficient to keep the
-		 * inode alive, and we assume we have dev_pagemap pin
-		 * otherwise we would not have a valid pfn_to_page()
-		 * translation.
-		 */
-		entry = (void *)~0UL;
-		if (S_ISCHR(mapping->host->i_mode))
-			break;
-
-		xas.xa = &mapping->i_pages;
-		xas_lock_irq(&xas);
-		if (mapping != page->mapping) {
-			xas_unlock_irq(&xas);
-			continue;
-		}
-		xas_set(&xas, page->index);
-		entry = xas_load(&xas);
-		if (dax_is_locked(entry)) {
-			rcu_read_unlock();
-			wait_entry_unlocked(&xas, entry);
-			rcu_read_lock();
-			continue;
-		}
-		dax_lock_entry(&xas, entry);
-		xas_unlock_irq(&xas);
-		break;
-	}
-	rcu_read_unlock();
-	return (dax_entry_t)entry;
-}
-
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
-{
-	struct address_space *mapping = page->mapping;
-	XA_STATE(xas, &mapping->i_pages, page->index);
-
-	if (S_ISCHR(mapping->host->i_mode))
-		return;
-
-	dax_unlock_entry(&xas, (void *)cookie);
-}
-
-/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
- * @mapping: the file's mapping whose entry we want to lock
- * @index: the offset within this file
- * @page: output the dax page corresponding to this dax entry
- *
- * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
- * could not be locked.
- */
-dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
-		struct page **page)
-{
-	XA_STATE(xas, NULL, 0);
-	void *entry;
-
-	rcu_read_lock();
-	for (;;) {
-		entry = NULL;
-		if (!dax_mapping(mapping))
-			break;
-
-		xas.xa = &mapping->i_pages;
-		xas_lock_irq(&xas);
-		xas_set(&xas, index);
-		entry = xas_load(&xas);
-		if (dax_is_locked(entry)) {
-			rcu_read_unlock();
-			wait_entry_unlocked(&xas, entry);
-			rcu_read_lock();
-			continue;
-		}
-		if (!entry ||
-		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
-			/*
-			 * Because we are looking for entry from file's mapping
-			 * and index, so the entry may not be inserted for now,
-			 * or even a zero/empty entry.  We don't think this is
-			 * an error case.  So, return a special value and do
-			 * not output @page.
-			 */
-			entry = (void *)~0UL;
-		} else {
-			*page = pfn_to_page(dax_to_pfn(entry));
-			dax_lock_entry(&xas, entry);
-		}
-		xas_unlock_irq(&xas);
-		break;
-	}
-	rcu_read_unlock();
-	return (dax_entry_t)entry;
-}
-
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
-		dax_entry_t cookie)
-{
-	XA_STATE(xas, &mapping->i_pages, index);
-
-	if (cookie == ~0UL)
-		return;
-
-	dax_unlock_entry(&xas, (void *)cookie);
-}
-
-/*
- * Find page cache entry at given index. If it is a DAX entry, return it
- * with the entry locked. If the page cache doesn't contain an entry at
- * that index, add a locked empty entry.
- *
- * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return VM_FAULT_FALLBACK.
- * This will happen if there are any PTE entries within the PMD range
- * that we are requesting.
- *
- * We always favor PTE entries over PMD entries. There isn't a flow where we
- * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
- * insertion will fail if it finds any PTE entries already in the tree, and a
- * PTE insertion will cause an existing PMD entry to be unmapped and
- * downgraded to PTE entries.  This happens for both PMD zero pages as
- * well as PMD empty entries.
- *
- * The exception to this downgrade path is for PMD entries that have
- * real storage backing them.  We will leave these real PMD entries in
- * the tree, and PTE writes will simply dirty the entire PMD entry.
- *
- * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
- * persistent memory the benefit is doubtful. We can add that later if we can
- * show it helps.
- *
- * On error, this function does not return an ERR_PTR.  Instead it returns
- * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
- * overlap with xarray value entries.
- */
-static void *grab_mapping_entry(struct xa_state *xas,
-		struct address_space *mapping, unsigned int order)
-{
-	unsigned long index = xas->xa_index;
-	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
-	void *entry;
-
-retry:
-	pmd_downgrade = false;
-	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas, order);
-
-	if (entry) {
-		if (dax_is_conflict(entry))
-			goto fallback;
-		if (!xa_is_value(entry)) {
-			xas_set_err(xas, -EIO);
-			goto out_unlock;
-		}
-
-		if (order == 0) {
-			if (dax_is_pmd_entry(entry) &&
-			    (dax_is_zero_entry(entry) ||
-			     dax_is_empty_entry(entry))) {
-				pmd_downgrade = true;
-			}
-		}
-	}
-
-	if (pmd_downgrade) {
-		/*
-		 * Make sure 'entry' remains valid while we drop
-		 * the i_pages lock.
-		 */
-		dax_lock_entry(xas, entry);
-
-		/*
-		 * Besides huge zero pages the only other thing that gets
-		 * downgraded are empty entries which don't need to be
-		 * unmapped.
-		 */
-		if (dax_is_zero_entry(entry)) {
-			xas_unlock_irq(xas);
-			unmap_mapping_pages(mapping,
-					xas->xa_index & ~PG_PMD_COLOUR,
-					PG_PMD_NR, false);
-			xas_reset(xas);
-			xas_lock_irq(xas);
-		}
-
-		dax_disassociate_entry(entry, mapping, false);
-		xas_store(xas, NULL);	/* undo the PMD join */
-		dax_wake_entry(xas, entry, WAKE_ALL);
-		mapping->nrpages -= PG_PMD_NR;
-		entry = NULL;
-		xas_set(xas, index);
-	}
-
-	if (entry) {
-		dax_lock_entry(xas, entry);
-	} else {
-		unsigned long flags = DAX_EMPTY;
-
-		if (order > 0)
-			flags |= DAX_PMD;
-		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
-		dax_lock_entry(xas, entry);
-		if (xas_error(xas))
-			goto out_unlock;
-		mapping->nrpages += 1UL << order;
-	}
-
-out_unlock:
-	xas_unlock_irq(xas);
-	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
-		goto retry;
-	if (xas->xa_node == XA_ERROR(-ENOMEM))
-		return xa_mk_internal(VM_FAULT_OOM);
-	if (xas_error(xas))
-		return xa_mk_internal(VM_FAULT_SIGBUS);
-	return entry;
-fallback:
-	xas_unlock_irq(xas);
-	return xa_mk_internal(VM_FAULT_FALLBACK);
-}
-
-static void *dax_zap_entry(struct xa_state *xas, void *entry)
-{
-	unsigned long v = xa_to_value(entry);
-
-	return xas_store(xas, xa_mk_value(v | DAX_ZAP));
-}
-
-/**
- * Return NULL if the entry is zapped and all pages in the entry are
- * idle, otherwise return the non-idle page in the entry
- */
-static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
-{
-	struct page *ret = NULL;
-	unsigned long pfn;
-	bool zap;
-
-	if (!dax_entry_size(entry))
-		return NULL;
-
-	zap = !dax_is_zapped(entry);
-
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		if (zap)
-			page_ref_dec(page);
-
-		if (!ret && !dax_page_idle(page))
-			ret = page;
-	}
-
-	if (zap)
-		dax_zap_entry(xas, entry);
-
-	return ret;
-}
-
-/**
- * dax_zap_mappings_range - find first pinned page in @mapping
- * @mapping: address space to scan for a page with ref count > 1
- * @start: Starting offset. Page containing 'start' is included.
- * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
- *       pages from 'start' till the end of file are included.
- *
- * DAX requires ZONE_DEVICE mapped pages. These pages are never
- * 'onlined' to the page allocator so they are considered idle when
- * page->count == 1. A filesystem uses this interface to determine if
- * any page in the mapping is busy, i.e. for DMA, or other
- * get_user_pages() usages.
- *
- * It is expected that the filesystem is holding locks to block the
- * establishment of new mappings in this address_space. I.e. it expects
- * to be able to run unmap_mapping_range() and subsequently not race
- * mapping_mapped() becoming true.
- */
-struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
-				    loff_t end)
-{
-	void *entry;
-	unsigned int scanned = 0;
-	struct page *page = NULL;
-	pgoff_t start_idx = start >> PAGE_SHIFT;
-	pgoff_t end_idx;
-	XA_STATE(xas, &mapping->i_pages, start_idx);
-
-	/*
-	 * In the 'limited' case get_user_pages() for dax is disabled.
-	 */
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return NULL;
-
-	if (!dax_mapping(mapping))
-		return NULL;
-
-	/* If end == LLONG_MAX, all pages from start to till end of file */
-	if (end == LLONG_MAX)
-		end_idx = ULONG_MAX;
-	else
-		end_idx = end >> PAGE_SHIFT;
-	/*
-	 * If we race get_user_pages_fast() here either we'll see the
-	 * elevated page count in the iteration and wait, or
-	 * get_user_pages_fast() will see that the page it took a reference
-	 * against is no longer mapped in the page tables and bail to the
-	 * get_user_pages() slow path.  The slow path is protected by
-	 * pte_lock() and pmd_lock(). New references are not taken without
-	 * holding those locks, and unmap_mapping_pages() will not zero the
-	 * pte or pmd without holding the respective lock, so we are
-	 * guaranteed to either see new references or prevent new
-	 * references from being established.
-	 */
-	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
-
-	xas_lock_irq(&xas);
-	xas_for_each(&xas, entry, end_idx) {
-		if (WARN_ON_ONCE(!xa_is_value(entry)))
-			continue;
-		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas, 0);
-		if (entry)
-			page = dax_zap_pages(&xas, entry);
-		put_unlocked_entry(&xas, entry, WAKE_NEXT);
-		if (page)
-			break;
-		if (++scanned % XA_CHECK_SCHED)
-			continue;
-
-		xas_pause(&xas);
-		xas_unlock_irq(&xas);
-		cond_resched();
-		xas_lock_irq(&xas);
-	}
-	xas_unlock_irq(&xas);
-	return page;
-}
-EXPORT_SYMBOL_GPL(dax_zap_mappings_range);
-
-struct page *dax_zap_mappings(struct address_space *mapping)
-{
-	return dax_zap_mappings_range(mapping, 0, LLONG_MAX);
-}
-EXPORT_SYMBOL_GPL(dax_zap_mappings);
-
-static int __dax_invalidate_entry(struct address_space *mapping,
-					  pgoff_t index, bool trunc)
-{
-	XA_STATE(xas, &mapping->i_pages, index);
-	int ret = 0;
-	void *entry;
-
-	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, 0);
-	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
-		goto out;
-	if (!trunc &&
-	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
-	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
-		goto out;
-	dax_disassociate_entry(entry, mapping, trunc);
-	xas_store(&xas, NULL);
-	mapping->nrpages -= 1UL << dax_entry_order(entry);
-	ret = 1;
-out:
-	put_unlocked_entry(&xas, entry, WAKE_ALL);
-	xas_unlock_irq(&xas);
-	return ret;
-}
-
-/*
- * Delete DAX entry at @index from @mapping.  Wait for it
- * to be unlocked before deleting it.
- */
-int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	int ret = __dax_invalidate_entry(mapping, index, true);
-
-	/*
-	 * This gets called from truncate / punch_hole path. As such, the caller
-	 * must hold locks protecting against concurrent modifications of the
-	 * page cache (usually fs-private i_mmap_sem for writing). Since the
-	 * caller has seen a DAX entry for this index, we better find it
-	 * at that index as well...
-	 */
-	WARN_ON_ONCE(!ret);
-	return ret;
-}
-
-/*
- * Invalidate DAX entry if it is clean.
- */
-int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
-				      pgoff_t index)
-{
-	return __dax_invalidate_entry(mapping, index, false);
-}
-
 static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
 {
 	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
@@ -894,195 +55,6 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
 	return 0;
 }
 
-/*
- * MAP_SYNC on a dax mapping guarantees dirty metadata is
- * flushed on write-faults (non-cow), but not read-faults.
- */
-static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
-		struct vm_area_struct *vma)
-{
-	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
-		(iter->iomap.flags & IOMAP_F_DIRTY);
-}
-
-static bool dax_fault_is_cow(const struct iomap_iter *iter)
-{
-	return (iter->flags & IOMAP_WRITE) &&
-		(iter->iomap.flags & IOMAP_F_SHARED);
-}
-
-static unsigned long dax_iter_flags(const struct iomap_iter *iter,
-				    struct vm_fault *vmf)
-{
-	unsigned long flags = 0;
-
-	if (!dax_fault_is_synchronous(iter, vmf->vma))
-		flags |= DAX_DIRTY;
-
-	if (dax_fault_is_cow(iter))
-		flags |= DAX_COW;
-
-	return flags;
-}
-
-/*
- * By this point grab_mapping_entry() has ensured that we have a locked entry
- * of the appropriate size so we don't have to worry about downgrading PMDs to
- * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
- * already in the tree, we will skip the insertion and just dirty the PMD as
- * appropriate.
- */
-static vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
-				   void **pentry, pfn_t pfn,
-				   unsigned long flags)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	void *new_entry = dax_make_entry(pfn, flags);
-	bool dirty = flags & DAX_DIRTY;
-	bool cow = flags & DAX_COW;
-	void *entry = *pentry;
-
-	if (dirty)
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
-		unsigned long index = xas->xa_index;
-		/* we are replacing a zero page with block mapping */
-		if (dax_is_pmd_entry(entry))
-			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-					PG_PMD_NR, false);
-		else /* pte entry */
-			unmap_mapping_pages(mapping, index, 1, false);
-	}
-
-	xas_reset(xas);
-	xas_lock_irq(xas);
-	if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
-		void *old;
-
-		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf, flags);
-		/*
-		 * Only swap our new entry into the page cache if the current
-		 * entry is a zero page or an empty entry.  If a normal PTE or
-		 * PMD entry is already in the cache, we leave it alone.  This
-		 * means that if we are trying to insert a PTE and the
-		 * existing entry is a PMD, we will just leave the PMD in the
-		 * tree and dirty it if necessary.
-		 */
-		old = dax_lock_entry(xas, new_entry);
-		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
-					DAX_LOCKED));
-		entry = new_entry;
-	} else {
-		xas_load(xas);	/* Walk the xa_state */
-	}
-
-	if (dirty)
-		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
-
-	if (cow)
-		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
-
-	xas_unlock_irq(xas);
-	*pentry = entry;
-	return 0;
-}
-
-static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
-		struct address_space *mapping, void *entry)
-{
-	unsigned long pfn, index, count, end;
-	long ret = 0;
-	struct vm_area_struct *vma;
-
-	/*
-	 * A page got tagged dirty in DAX mapping? Something is seriously
-	 * wrong.
-	 */
-	if (WARN_ON(!xa_is_value(entry)))
-		return -EIO;
-
-	if (unlikely(dax_is_locked(entry))) {
-		void *old_entry = entry;
-
-		entry = get_unlocked_entry(xas, 0);
-
-		/* Entry got punched out / reallocated? */
-		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
-			goto put_unlocked;
-		/*
-		 * Entry got reallocated elsewhere? No need to writeback.
-		 * We have to compare pfns as we must not bail out due to
-		 * difference in lockbit or entry type.
-		 */
-		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
-			goto put_unlocked;
-		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
-					dax_is_zero_entry(entry))) {
-			ret = -EIO;
-			goto put_unlocked;
-		}
-
-		/* Another fsync thread may have already done this entry */
-		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
-			goto put_unlocked;
-	}
-
-	/* Lock the entry to serialize with page faults */
-	dax_lock_entry(xas, entry);
-
-	/*
-	 * We can clear the tag now but we have to be careful so that concurrent
-	 * dax_writeback_one() calls for the same index cannot finish before we
-	 * actually flush the caches. This is achieved as the calls will look
-	 * at the entry only under the i_pages lock and once they do that
-	 * they will see the entry locked and wait for it to unlock.
-	 */
-	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
-	xas_unlock_irq(xas);
-
-	/*
-	 * If dax_writeback_mapping_range() was given a wbc->range_start
-	 * in the middle of a PMD, the 'index' we use needs to be
-	 * aligned to the start of the PMD.
-	 * This allows us to flush for PMD_SIZE and not have to worry about
-	 * partial PMD writebacks.
-	 */
-	pfn = dax_to_pfn(entry);
-	count = 1UL << dax_entry_order(entry);
-	index = xas->xa_index & ~(count - 1);
-	end = index + count - 1;
-
-	/* Walk all mappings of a given index of a file and writeprotect them */
-	i_mmap_lock_read(mapping);
-	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
-		pfn_mkclean_range(pfn, count, index, vma);
-		cond_resched();
-	}
-	i_mmap_unlock_read(mapping);
-
-	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
-	/*
-	 * After we have flushed the cache, we can clear the dirty tag. There
-	 * cannot be new dirty data in the pfn after the flush has completed as
-	 * the pfn mappings are writeprotected and fault waits for mapping
-	 * entry lock.
-	 */
-	xas_reset(xas);
-	xas_lock_irq(xas);
-	xas_store(xas, entry);
-	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
-	dax_wake_entry(xas, entry, WAKE_NEXT);
-
-	trace_dax_writeback_one(mapping->host, index, count);
-	return ret;
-
- put_unlocked:
-	put_unlocked_entry(xas, entry, WAKE_NEXT);
-	return ret;
-}
-
 /*
  * Flush the mapping to the persistent domain within the byte range of [start,
  * end]. This is required by data integrity operations to ensure file data is
@@ -1219,6 +191,37 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
 	return 0;
 }
 
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
+				     struct vm_area_struct *vma)
+{
+	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
+	       (iter->iomap.flags & IOMAP_F_DIRTY);
+}
+
+static bool dax_fault_is_cow(const struct iomap_iter *iter)
+{
+	return (iter->flags & IOMAP_WRITE) &&
+	       (iter->iomap.flags & IOMAP_F_SHARED);
+}
+
+static unsigned long dax_iter_flags(const struct iomap_iter *iter,
+				    struct vm_fault *vmf)
+{
+	unsigned long flags = 0;
+
+	if (!dax_fault_is_synchronous(iter, vmf->vma))
+		flags |= DAX_DIRTY;
+
+	if (dax_fault_is_cow(iter))
+		flags |= DAX_COW;
+
+	return flags;
+}
+
 /*
  * The user has performed a load from a hole in the file.  Allocating a new
  * page in the file would cause excessive storage usage for workloads with
@@ -1701,7 +704,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		iter.flags |= IOMAP_WRITE;
 
-	entry = grab_mapping_entry(&xas, mapping, 0);
+	entry = dax_grab_mapping_entry(&xas, mapping, 0);
 	if (xa_is_internal(entry)) {
 		ret = xa_to_internal(entry);
 		goto out;
@@ -1818,12 +821,12 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get an empty PMD entry,
+	 * dax_grab_mapping_entry() will make sure we get an empty PMD entry,
 	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
 	 * entry is already in the array, for instance), it will return
 	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
+	entry = dax_grab_mapping_entry(&xas, mapping, PMD_ORDER);
 	if (xa_is_internal(entry)) {
 		ret = xa_to_internal(entry);
 		goto fallback;
@@ -1897,50 +900,6 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
-/*
- * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
- * @vmf: The description of the fault
- * @pfn: PFN to insert
- * @order: Order of entry to insert.
- *
- * This function inserts a writeable PTE or PMD entry into the page tables
- * for an mmaped DAX file.  It also marks the page cache entry as dirty.
- */
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
-	void *entry;
-	vm_fault_t ret;
-
-	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, order);
-	/* Did we race with someone splitting entry or so? */
-	if (!entry || dax_is_conflict(entry) ||
-	    (order == 0 && !dax_is_pte_entry(entry))) {
-		put_unlocked_entry(&xas, entry, WAKE_NEXT);
-		xas_unlock_irq(&xas);
-		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
-						      VM_FAULT_NOPAGE);
-		return VM_FAULT_NOPAGE;
-	}
-	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
-	dax_lock_entry(&xas, entry);
-	xas_unlock_irq(&xas);
-	if (order == 0)
-		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-#ifdef CONFIG_FS_DAX_PMD
-	else if (order == PMD_ORDER)
-		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
-#endif
-	else
-		ret = VM_FAULT_FALLBACK;
-	dax_unlock_entry(&xas, entry);
-	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
-	return ret;
-}
-
 /**
  * dax_finish_sync_fault - finish synchronous page fault
  * @vmf: The description of the fault
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f6acb4ed73cb..de60a34088bb 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -157,15 +157,33 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
-struct page *dax_zap_mappings(struct address_space *mapping);
-struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
-				    loff_t end);
+#else
+static inline int dax_writeback_mapping_range(struct address_space *mapping,
+		struct dax_device *dax_dev, struct writeback_control *wbc)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+		const struct iomap_ops *ops);
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		const struct iomap_ops *ops);
+
+#if IS_ENABLED(CONFIG_DAX)
+int dax_read_lock(void);
+void dax_read_unlock(int id);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
+void run_dax(struct dax_device *dax_dev);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 		unsigned long index, struct page **page);
 void dax_unlock_mapping_entry(struct address_space *mapping,
 		unsigned long index, dax_entry_t cookie);
+struct page *dax_zap_mappings(struct address_space *mapping);
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+				    loff_t end);
 #else
 static inline struct page *dax_zap_mappings(struct address_space *mapping)
 {
@@ -179,12 +197,6 @@ static inline struct page *dax_zap_mappings_range(struct address_space *mapping,
 	return NULL;
 }
 
-static inline int dax_writeback_mapping_range(struct address_space *mapping,
-		struct dax_device *dax_dev, struct writeback_control *wbc)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline dax_entry_t dax_lock_page(struct page *page)
 {
 	if (IS_DAX(page->mapping->host))
@@ -196,6 +208,15 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
 {
 }
 
+static inline int dax_read_lock(void)
+{
+	return 0;
+}
+
+static inline void dax_read_unlock(int id)
+{
+}
+
 static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 		unsigned long index, struct page **page)
 {
@@ -208,11 +229,6 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping,
 }
 #endif
 
-int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		const struct iomap_ops *ops);
-int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops);
-
 /*
  * Document all the code locations that want know when a dax page is
  * unreferenced.
@@ -222,19 +238,6 @@ static inline bool dax_page_idle(struct page *page)
 	return page_ref_count(page) == 1;
 }
 
-#if IS_ENABLED(CONFIG_DAX)
-int dax_read_lock(void);
-void dax_read_unlock(int id);
-#else
-static inline int dax_read_lock(void)
-{
-	return 0;
-}
-
-static inline void dax_read_unlock(int id)
-{
-}
-#endif /* CONFIG_DAX */
 bool dax_alive(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
@@ -255,6 +258,9 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 		    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size, pfn_t pfn);
+void *dax_grab_mapping_entry(struct xa_state *xas,
+			     struct address_space *mapping, unsigned int order);
+void dax_unlock_entry(struct xa_state *xas, void *entry);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
@@ -271,6 +277,56 @@ static inline bool dax_mapping(struct address_space *mapping)
 	return mapping->host && IS_DAX(mapping->host);
 }
 
+/*
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages.  We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking.  In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define DAX_SHIFT	(5)
+#define DAX_MASK	((1UL << DAX_SHIFT) - 1)
+#define DAX_LOCKED	(1UL << 0)
+#define DAX_PMD		(1UL << 1)
+#define DAX_ZERO_PAGE	(1UL << 2)
+#define DAX_EMPTY	(1UL << 3)
+#define DAX_ZAP		(1UL << 4)
+
+/*
+ * These flags are not conveyed in Xarray value entries, they are just
+ * modifiers to dax_insert_entry().
+ */
+#define DAX_DIRTY (1UL << (DAX_SHIFT + 0))
+#define DAX_COW   (1UL << (DAX_SHIFT + 1))
+
+vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+			    void **pentry, pfn_t pfn, unsigned long flags);
+vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
+				  unsigned int order);
+int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+		      struct address_space *mapping, void *entry);
+
+/* The 'colour' (ie low bits) within a PMD of a page offset.  */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+	if (pe_size == PE_SIZE_PTE)
+		return PAGE_SHIFT - PAGE_SHIFT;
+	if (pe_size == PE_SIZE_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	if (pe_size == PE_SIZE_PUD)
+		return PUD_SHIFT - PAGE_SHIFT;
+	return ~0;
+}
+
 #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
 void hmem_register_device(int target_nid, struct resource *r);
 #else
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index fd57407e7f3d..e5d30eec3bf1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -221,6 +221,12 @@ static inline void devm_memunmap_pages(struct device *dev,
 {
 }
 
+static inline struct dev_pagemap *
+get_dev_pagemap_many(unsigned long pfn, struct dev_pagemap *pgmap, int refs)
+{
+	return NULL;
+}
+
 static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap)
 {

From patchwork Fri Sep 16 03:36:25 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978086
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 0B32EC6FA93
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:36:49 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229773AbiIPDgp (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:36:45 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57880 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229679AbiIPDg2 (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:28 -0400
Received: from mga01.intel.com (mga01.intel.com [192.55.52.88])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CF1139D126;
        Thu, 15 Sep 2022 20:36:26 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299386; x=1694835386;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=ajaFzvrRPG16T180cU73eT4NP/l2RlRiUDdGWZj7wgU=;
  b=M7BW1+EzIhNgpVsBTrNRSxKsaN2T/mazZoEBrJxq+h5Oxk59wtKgHC4V
   uV7mKLkszUHF9dPikfeweM2+81CAq/yWd2GwEzHvLADFuKqvasA325CVf
   nFt6s1kv1MmREQpFwhQ4WK0wGXZ/Yckb9k9C7AtyOz22p3nMcEZTb6FWr
   AM2JZ89R/kREtAqLd8lis0oSzI7tBZkN/0KPlXaq9CzN5MWuZKrrDkCsu
   4k4hqj4FnMgKpNVx7BrIXDwV6dWmT/VsS+PygiUsdUYOmlHdLx5RFeHfM
   Gyeym6sGIpV86x1AtnngNrmb+V10+q7pqGoFXlwNLq4Bwb1KYV0JhqLZw
   g==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="325170450"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="325170450"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:26 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809498"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:25 -0700
Subject: [PATCH v2 13/18] dax: Prep mapping helpers for compound pages
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:25 -0700
Message-ID: 
 <166329938508.2786261.5544204703263725154.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for device-dax to use the same mapping machinery as
fsdax, add support for device-dax compound pages.

Presently this is handled by dax_set_mapping() which is careful to only
update page->mapping for head pages. However, it does that by looking at
properties in the 'struct dev_dax' instance associated with the page.
Switch to just checking PageHead() directly in the functions that
iterate over pages in a large mapping.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/Kconfig   |    1 +
 drivers/dax/mapping.c |   16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 205e9dda8928..2eddd32c51f4 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -9,6 +9,7 @@ if DAX
 config DEV_DAX
 	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
+	depends on !FS_DAX_LIMITED
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
index 70576aa02148..5d4b9601f183 100644
--- a/drivers/dax/mapping.c
+++ b/drivers/dax/mapping.c
@@ -345,6 +345,8 @@ static vm_fault_t dax_associate_entry(void *entry,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
+		page = compound_head(page);
+
 		if (flags & DAX_COW) {
 			dax_mapping_set_cow(page);
 		} else {
@@ -353,6 +355,9 @@ static vm_fault_t dax_associate_entry(void *entry,
 			page->index = index + i++;
 			page_ref_inc(page);
 		}
+
+		if (PageHead(page))
+			break;
 	}
 
 	return 0;
@@ -372,6 +377,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 
 	for_each_mapped_pfn(entry, pfn) {
 		page = pfn_to_page(pfn);
+
+		page = compound_head(page);
+
 		if (dax_mapping_is_cow(page->mapping)) {
 			/* keep the CoW flag if this page is still shared */
 			if (page->index-- > 0)
@@ -383,6 +391,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		}
 		page->mapping = NULL;
 		page->index = 0;
+
+		if (PageHead(page))
+			break;
 	}
 
 	if (trunc && !dax_mapping_is_cow(page->mapping)) {
@@ -660,11 +671,16 @@ static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
+		page = compound_head(page);
+
 		if (zap)
 			page_ref_dec(page);
 
 		if (!ret && !dax_page_idle(page))
 			ret = page;
+
+		if (PageHead(page))
+			break;
 	}
 
 	if (zap)

From patchwork Fri Sep 16 03:36:31 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978088
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id A3138C32771
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:37:21 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229978AbiIPDhO (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:37:14 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57926 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229852AbiIPDgd (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:33 -0400
Received: from mga17.intel.com (mga17.intel.com [192.55.52.151])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 746A19C8FA;
        Thu, 15 Sep 2022 20:36:32 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299392; x=1694835392;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=r/wpsgIfHbDzQJNWJ9HD7Sb+85QkD9aLuJZ8FLqHRVk=;
  b=g53wPSeVkvdSF6Cs1+SbG6Lb3K55J4CV3euWiOe+LGlxvPMHRioSiiXl
   6FDei0xret8EXyc/VCyzgJahC2eCftQsTs911COEQOyZ5PGiq+Eu3/ha3
   FLUS2IwBX52WxKsFLF+dKUgzwL9REOZ5t5zwVmjrpPqekTvE/yRmN1oKT
   yDngm8tKMcJ6cVdlCZdLHUWGKov4N9anQTuE2n+nDhfXBBZh9w3qaSNYs
   7MozGc2cTkMsSa6I/wqEq4LbIWu3NcYN3VWaH/vva74h234EMhLfm9l+P
   e6q/JXaDupx/YhJonFcbQbBsEGdE4RvXvmHbPXm4YASBFZHObKid1GT9J
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="279284002"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="279284002"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:32 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809542"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:31 -0700
Subject: [PATCH v2 14/18] devdax: add PUD support to the DAX mapping
 infrastructure
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:31 -0700
Message-ID: 
 <166329939123.2786261.4488002998591622104.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

In preparation for using the DAX mapping infrastructure for device-dax,
update the helpers to handle PUD entries.

In practice the code related to @size_downgrade will go unused for PUD
entries since only devdax creates DAX PUD entries and devdax enforces
aligned mappings. The conversion is included for completeness.

The addition of PUD support to dax_insert_pfn_mkwrite() requires a new
stub for vmf_insert_pfn_pud() in the
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=n case.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/mapping.c   |   50 ++++++++++++++++++++++++++++++++++++-----------
 include/linux/dax.h     |   32 ++++++++++++++++++++----------
 include/linux/huge_mm.h |   11 ++++++++--
 3 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
index 5d4b9601f183..b5a5196f8831 100644
--- a/drivers/dax/mapping.c
+++ b/drivers/dax/mapping.c
@@ -13,6 +13,7 @@
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 #include <linux/pagemap.h>
+#include <linux/huge_mm.h>
 
 #include "dax-private.h"
 
@@ -56,6 +57,8 @@ static bool dax_is_zapped(void *entry)
 
 static unsigned int dax_entry_order(void *entry)
 {
+	if (xa_to_value(entry) & DAX_PUD)
+		return PUD_ORDER;
 	if (xa_to_value(entry) & DAX_PMD)
 		return PMD_ORDER;
 	return 0;
@@ -66,9 +69,14 @@ static unsigned long dax_is_pmd_entry(void *entry)
 	return xa_to_value(entry) & DAX_PMD;
 }
 
+static unsigned long dax_is_pud_entry(void *entry)
+{
+	return xa_to_value(entry) & DAX_PUD;
+}
+
 static bool dax_is_pte_entry(void *entry)
 {
-	return !(xa_to_value(entry) & DAX_PMD);
+	return !(xa_to_value(entry) & (DAX_PMD|DAX_PUD));
 }
 
 static int dax_is_zero_entry(void *entry)
@@ -277,6 +285,8 @@ static unsigned long dax_entry_size(void *entry)
 		return 0;
 	else if (dax_is_pmd_entry(entry))
 		return PMD_SIZE;
+	else if (dax_is_pud_entry(entry))
+		return PUD_SIZE;
 	else
 		return PAGE_SIZE;
 }
@@ -564,11 +574,11 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 			     struct address_space *mapping, unsigned int order)
 {
 	unsigned long index = xas->xa_index;
-	bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
+	bool size_downgrade; /* splitting entry into PTE entries? */
 	void *entry;
 
 retry:
-	pmd_downgrade = false;
+	size_downgrade = false;
 	xas_lock_irq(xas);
 	entry = get_unlocked_entry(xas, order);
 
@@ -581,15 +591,25 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 		}
 
 		if (order == 0) {
-			if (dax_is_pmd_entry(entry) &&
+			if (!dax_is_pte_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
-				pmd_downgrade = true;
+				size_downgrade = true;
 			}
 		}
 	}
 
-	if (pmd_downgrade) {
+	if (size_downgrade) {
+		unsigned long colour, nr;
+
+		if (dax_is_pmd_entry(entry)) {
+			colour = PG_PMD_COLOUR;
+			nr = PG_PMD_NR;
+		} else {
+			colour = PG_PUD_COLOUR;
+			nr = PG_PUD_NR;
+		}
+
 		/*
 		 * Make sure 'entry' remains valid while we drop
 		 * the i_pages lock.
@@ -603,9 +623,8 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 		 */
 		if (dax_is_zero_entry(entry)) {
 			xas_unlock_irq(xas);
-			unmap_mapping_pages(mapping,
-					    xas->xa_index & ~PG_PMD_COLOUR,
-					    PG_PMD_NR, false);
+			unmap_mapping_pages(mapping, xas->xa_index & ~colour,
+					    nr, false);
 			xas_reset(xas);
 			xas_lock_irq(xas);
 		}
@@ -613,7 +632,7 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 		dax_disassociate_entry(entry, mapping, false);
 		xas_store(xas, NULL); /* undo the PMD join */
 		dax_wake_entry(xas, entry, WAKE_ALL);
-		mapping->nrpages -= PG_PMD_NR;
+		mapping->nrpages -= nr;
 		entry = NULL;
 		xas_set(xas, index);
 	}
@@ -623,7 +642,9 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 	} else {
 		unsigned long flags = DAX_EMPTY;
 
-		if (order > 0)
+		if (order == PUD_SHIFT - PAGE_SHIFT)
+			flags |= DAX_PUD;
+		else if (order == PMD_SHIFT - PAGE_SHIFT)
 			flags |= DAX_PMD;
 		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
 		dax_lock_entry(xas, entry);
@@ -846,7 +867,10 @@ vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
 		unsigned long index = xas->xa_index;
 		/* we are replacing a zero page with block mapping */
-		if (dax_is_pmd_entry(entry))
+		if (dax_is_pud_entry(entry))
+			unmap_mapping_pages(mapping, index & ~PG_PUD_COLOUR,
+					    PG_PUD_NR, false);
+		else if (dax_is_pmd_entry(entry))
 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
 					    PG_PMD_NR, false);
 		else /* pte entry */
@@ -1018,6 +1042,8 @@ vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
 	else if (order == PMD_ORDER)
 		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
 #endif
+	else if (order == PUD_ORDER)
+		ret = vmf_insert_pfn_pud(vmf, pfn, FAULT_FLAG_WRITE);
 	else
 		ret = VM_FAULT_FALLBACK;
 	dax_unlock_entry(&xas, entry);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index de60a34088bb..3a27fecf072a 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -278,22 +278,25 @@ static inline bool dax_mapping(struct address_space *mapping)
 }
 
 /*
- * DAX pagecache entries use XArray value entries so they can't be mistaken
- * for pages.  We use one bit for locking, one bit for the entry size (PMD)
- * and two more to tell us if the entry is a zero page or an empty entry that
- * is just used for locking.  In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be
+ * mistaken for pages.  We use one bit for locking, two bits for the
+ * entry size (PMD, PUD) and two more to tell us if the entry is a zero
+ * page or an empty entry that is just used for locking.  In total 5
+ * special bits which limits the max pfn that can be stored as:
+ * (1UL << 57 - PAGE_SHIFT). 63 - DAX_SHIFT - 1 (for xa_mk_value()).
  *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
- * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
- * block allocation.
+ * If the P{M,U}D bits are not set the entry has size PAGE_SIZE, and if
+ * the ZERO_PAGE and EMPTY bits aren't set the entry is a normal DAX
+ * entry with a filesystem block allocation.
  */
-#define DAX_SHIFT	(5)
+#define DAX_SHIFT	(6)
 #define DAX_MASK	((1UL << DAX_SHIFT) - 1)
 #define DAX_LOCKED	(1UL << 0)
 #define DAX_PMD		(1UL << 1)
-#define DAX_ZERO_PAGE	(1UL << 2)
-#define DAX_EMPTY	(1UL << 3)
-#define DAX_ZAP		(1UL << 4)
+#define DAX_PUD		(1UL << 2)
+#define DAX_ZERO_PAGE	(1UL << 3)
+#define DAX_EMPTY	(1UL << 4)
+#define DAX_ZAP		(1UL << 5)
 
 /*
  * These flags are not conveyed in Xarray value entries, they are just
@@ -316,6 +319,13 @@ int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 /* The order of a PMD entry */
 #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
 
+/* The 'colour' (ie low bits) within a PUD of a page offset.  */
+#define PG_PUD_COLOUR ((PUD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PUD_NR (PUD_SIZE >> PAGE_SHIFT)
+
+/* The order of a PUD entry */
+#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
+
 static inline unsigned int pe_order(enum page_entry_size pe_size)
 {
 	if (pe_size == PE_SIZE_PTE)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 768e5261fdae..de73f5a16252 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -18,10 +18,19 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+				   pgprot_t pgprot, bool write);
 #else
 static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 {
 }
+
+static inline vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf,
+						 pfn_t pfn, pgprot_t pgprot,
+						 bool write)
+{
+	return VM_FAULT_SIGBUS;
+}
 #endif
 
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
@@ -58,8 +67,6 @@ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
 {
 	return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
 }
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
-				   pgprot_t pgprot, bool write);
 
 /**
  * vmf_insert_pfn_pud - insert a pud size pfn

From patchwork Fri Sep 16 03:36:37 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978089
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id D851BC6FA90
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:37:22 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S230060AbiIPDhS (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:37:18 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56970 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229964AbiIPDgk (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:40 -0400
Received: from mga06.intel.com (mga06b.intel.com [134.134.136.31])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1275A9D8D7;
        Thu, 15 Sep 2022 20:36:39 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299399; x=1694835399;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=H9Kht40qne3iP+SMB12jPN9XbzdsW4QgUBLEqx1cMBY=;
  b=WBx4k0XKhCXKcBorVbs4q/8RCFDmw83+sL+Uwi4/SZSg6V/CiW1mZ75u
   PR8UMzFVQnkto5lVccXqfi2/04nwxSlARKkZW4hG0hgSHVn+BZjVGABvN
   ajHLTZ3Cg3w429OAE+FzplYLV8mtjoIbnwGjR4l6Za5r18SZnCrj1Z6l6
   vyeLUfRPu70ARL5VAb6wkeqvcwFPHQDnF9iuuamWk1PAmKrHtVW9pzR7B
   ImgdlHPyHLwc1flt7imETMNgHX43Dnp65kxEKyA2RSmmePpcLEi+41gMG
   Gm265+AHpp4t0hLVx6PiIwVmARQ9qRQ+ucob64JcHfzwKUJcjhxNhc6yq
   A==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="360643292"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="360643292"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by orsmga104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:38 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809564"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:37 -0700
Subject: [PATCH v2 15/18] devdax: Use dax_insert_entry() +
 dax_delete_mapping_entry()
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:37 -0700
Message-ID: 
 <166329939733.2786261.13946962468817639563.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Track entries and take pgmap references at mapping insertion time.
Revoke mappings (dax_zap_mappings()) and drop the associated pgmap
references at device destruction or inode eviction time. With this in
place, and the fsdax equivalent already in place, the gup code no longer
needs to consider PTE_DEVMAP as an indicator to get a pgmap reference
before taking a page reference.

In other words, GUP takes additional references on mapped pages. Until
now, DAX in all its forms was failing to take references at mapping
time. With that fixed there is no longer a requirement for gup to manage
@pgmap references. However, that cleanup is saved for a follow-on patch.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/bus.c     |   15 +++++++++-
 drivers/dax/device.c  |   73 +++++++++++++++++++++++++++++--------------------
 drivers/dax/mapping.c |    3 ++
 3 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1dad813ee4a6..35a319a76c82 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -382,9 +382,22 @@ void kill_dev_dax(struct dev_dax *dev_dax)
 {
 	struct dax_device *dax_dev = dev_dax->dax_dev;
 	struct inode *inode = dax_inode(dax_dev);
+	struct page *page;
 
 	kill_dax(dax_dev);
-	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+	/*
+	 * New mappings are blocked. Wait for all GUP users to release
+	 * their pins.
+	 */
+	do {
+		page = dax_zap_mappings(inode->i_mapping);
+		if (!page)
+			break;
+		__wait_var_event(page, dax_page_idle(page));
+	} while (true);
+
+	truncate_inode_pages(inode->i_mapping, 0);
 
 	/*
 	 * Dynamic dax region have the pgmap allocated via dev_kzalloc()
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 5494d745ced5..7f306939807e 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -73,38 +73,15 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 	return -1;
 }
 
-static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
-			      unsigned long fault_size)
-{
-	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
-	struct file *filp = vmf->vma->vm_file;
-	struct dev_dax *dev_dax = filp->private_data;
-	pgoff_t pgoff;
-
-	/* mapping is only set on the head */
-	if (dev_dax->pgmap->vmemmap_shift)
-		nr_pages = 1;
-
-	pgoff = linear_page_index(vmf->vma,
-			ALIGN(vmf->address, fault_size));
-
-	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-
-		page = compound_head(page);
-		if (page->mapping)
-			continue;
-
-		page->mapping = filp->f_mapping;
-		page->index = pgoff + i;
-	}
-}
-
 static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 				struct vm_fault *vmf)
 {
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
+	vm_fault_t ret;
+	void *entry;
 	pfn_t pfn;
 	unsigned int fault_size = PAGE_SIZE;
 
@@ -128,7 +105,16 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 
 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	entry = dax_grab_mapping_entry(&xas, mapping, 0);
+	if (xa_is_internal(entry))
+		return xa_to_internal(entry);
+
+	ret = dax_insert_entry(&xas, vmf, &entry, pfn, 0);
+
+	dax_unlock_entry(&xas, entry);
+
+	if (ret)
+		return ret;
 
 	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
 }
@@ -136,10 +122,14 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 				struct vm_fault *vmf)
 {
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
+	vm_fault_t ret;
 	pgoff_t pgoff;
+	void *entry;
 	pfn_t pfn;
 	unsigned int fault_size = PMD_SIZE;
 
@@ -171,7 +161,16 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 
 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	entry = dax_grab_mapping_entry(&xas, mapping, PMD_ORDER);
+	if (xa_is_internal(entry))
+		return xa_to_internal(entry);
+
+	ret = dax_insert_entry(&xas, vmf, &entry, pfn, DAX_PMD);
+
+	dax_unlock_entry(&xas, entry);
+
+	if (ret)
+		return ret;
 
 	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
@@ -180,10 +179,14 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 				struct vm_fault *vmf)
 {
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pud_addr = vmf->address & PUD_MASK;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
+	vm_fault_t ret;
 	pgoff_t pgoff;
+	void *entry;
 	pfn_t pfn;
 	unsigned int fault_size = PUD_SIZE;
 
@@ -216,7 +219,16 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 
 	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	entry = dax_grab_mapping_entry(&xas, mapping, PUD_ORDER);
+	if (xa_is_internal(entry))
+		return xa_to_internal(entry);
+
+	ret = dax_insert_entry(&xas, vmf, &entry, pfn, DAX_PUD);
+
+	dax_unlock_entry(&xas, entry);
+
+	if (ret)
+		return ret;
 
 	return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
@@ -494,3 +506,4 @@ MODULE_LICENSE("GPL v2");
 module_init(dax_init);
 module_exit(dax_exit);
 MODULE_ALIAS_DAX_DEVICE(0);
+MODULE_IMPORT_NS(DAX);
diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
index b5a5196f8831..9981eebb2dc5 100644
--- a/drivers/dax/mapping.c
+++ b/drivers/dax/mapping.c
@@ -266,6 +266,7 @@ void dax_unlock_entry(struct xa_state *xas, void *entry)
 	WARN_ON(!dax_is_locked(old));
 	dax_wake_entry(xas, entry, WAKE_NEXT);
 }
+EXPORT_SYMBOL_NS_GPL(dax_unlock_entry, DAX);
 
 /*
  * Return: The entry stored at this location before it was locked.
@@ -666,6 +667,7 @@ void *dax_grab_mapping_entry(struct xa_state *xas,
 	xas_unlock_irq(xas);
 	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
+EXPORT_SYMBOL_NS_GPL(dax_grab_mapping_entry, DAX);
 
 static void *dax_zap_entry(struct xa_state *xas, void *entry)
 {
@@ -910,6 +912,7 @@ vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	*pentry = entry;
 	return 0;
 }
+EXPORT_SYMBOL_NS_GPL(dax_insert_entry, DAX);
 
 int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 		      struct address_space *mapping, void *entry)

From patchwork Fri Sep 16 03:36:43 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978090
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 66196C6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:37:38 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229994AbiIPDhg (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:37:36 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:58080 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S230036AbiIPDgq (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:46 -0400
Received: from mga14.intel.com (mga14.intel.com [192.55.52.115])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EEC0D9DB62;
        Thu, 15 Sep 2022 20:36:44 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299404; x=1694835404;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=R5iLP9zAHTpb9OkjwMC0Bw2xXqlB6pFYCCoagiNt1Hw=;
  b=hK8BkB3sxn2JUZhAgf/GNrgpRzyFSwppBTX4Ggryi3x++DtQ2/wsR6ix
   kqOiWdCQSelMJeXDpua5As2IEdR8KqIlTkoMb7jb5ji4GNTKZKbnfvBU+
   FvjLZcXQAKywJzxrNErQe5QhfYkbUkrzD4wTx36R3qUgkwOROWdvPmdac
   g+e2dmVInEea/tb1hGVDra0vw3I7rX916/CfsDxuGQ66RZTq4m7ZUDFZC
   rgfciB4SlSWBcMWLA9Api4lkjC6YG9r+zGWCV9mFE5kxa52dN/kaudDNt
   DbqZHSh1SXUQqFFEsp82MXQzWBRpM9rkHC0OAmdv1Apx635QMQIh8PXoV
   A==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="298895594"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="298895594"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by fmsmga103.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:44 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809589"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:43 -0700
Subject: [PATCH v2 16/18] mm/memremap_pages: Support initializing pages to a
 zero reference count
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:43 -0700
Message-ID: 
 <166329940343.2786261.6047770378829215962.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

The initial memremap_pages() implementation inherited the
__init_single_page() default of pages starting life with an elevated
reference count. This originally allowed for the page->pgmap pointer to
alias with the storage for page->lru since a page was only allowed to be
on an lru list when its reference count was zero.

Since then, 'struct page' definition cleanups have arranged for
dedicated space for the ZONE_DEVICE page metadata, and the
MEMORY_DEVICE_{PRIVATE,COHERENT} work has arranged for the 1 -> 0
page->_refcount transition to route the page to free_zone_device_page()
and not the core-mm page-free. With those cleanups in place and with
filesystem-dax and device-dax now converted to take and drop references
at map and truncate time, it is possible to start MEMORY_DEVICE_FS_DAX
and MEMORY_DEVICE_GENERIC reference counts at 0.

MEMORY_DEVICE_{PRIVATE,COHERENT} still expect that their ZONE_DEVICE
pages start life at _refcount 1, so make that the default if
pgmap->init_mode is left at zero.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/device.c     |    1 +
 drivers/nvdimm/pmem.c    |    2 ++
 include/linux/dax.h      |    2 +-
 include/linux/memremap.h |    5 +++++
 mm/memremap.c            |   15 ++++++++++-----
 mm/page_alloc.c          |    2 ++
 6 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 7f306939807e..8a7281d16c99 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -460,6 +460,7 @@ int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
+	pgmap->init_mode = INIT_PAGEMAP_IDLE;
 	if (dev_dax->align > PAGE_SIZE)
 		pgmap->vmemmap_shift =
 			order_base_2(dev_dax->align >> PAGE_SHIFT);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 7e88cd242380..9c98dcb9f33d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -529,6 +529,7 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.init_mode = INIT_PAGEMAP_IDLE;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
@@ -543,6 +544,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.range.end = res->end;
 		pmem->pgmap.nr_range = 1;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.init_mode = INIT_PAGEMAP_IDLE;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 3a27fecf072a..b9fdd8951e06 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -235,7 +235,7 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping,
  */
 static inline bool dax_page_idle(struct page *page)
 {
-	return page_ref_count(page) == 1;
+	return page_ref_count(page) == 0;
 }
 
 bool dax_alive(struct dax_device *dax_dev);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index e5d30eec3bf1..9f1a57efd371 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -116,6 +116,7 @@ struct dev_pagemap_ops {
  *	representation. A bigger value will set up compound struct pages
  *	of the requested order value.
  * @ops: method table
+ * @init_mode: initial reference count mode
  * @owner: an opaque pointer identifying the entity that manages this
  *	instance.  Used by various helpers to make sure that no
  *	foreign ZONE_DEVICE memory is accessed.
@@ -131,6 +132,10 @@ struct dev_pagemap {
 	unsigned int flags;
 	unsigned long vmemmap_shift;
 	const struct dev_pagemap_ops *ops;
+	enum {
+		INIT_PAGEMAP_BUSY = 0, /* default / historical */
+		INIT_PAGEMAP_IDLE,
+	} init_mode;
 	void *owner;
 	int nr_range;
 	union {
diff --git a/mm/memremap.c b/mm/memremap.c
index 83c5e6fafd84..b6a7a95339b3 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -467,8 +467,10 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap_many);
 
 void free_zone_device_page(struct page *page)
 {
-	if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
-		return;
+	struct dev_pagemap *pgmap = page->pgmap;
+
+	/* wake filesystem 'break dax layouts' waiters */
+	wake_up_var(page);
 
 	mem_cgroup_uncharge(page_folio(page));
 
@@ -503,12 +505,15 @@ void free_zone_device_page(struct page *page)
 	 * to clear page->mapping.
 	 */
 	page->mapping = NULL;
-	page->pgmap->ops->page_free(page);
+	if (pgmap->ops && pgmap->ops->page_free)
+		pgmap->ops->page_free(page);
 
 	/*
-	 * Reset the page count to 1 to prepare for handing out the page again.
+	 * Reset the page count to the @init_mode value to prepare for
+	 * handing out the page again.
 	 */
-	set_page_count(page, 1);
+	if (pgmap->init_mode == INIT_PAGEMAP_BUSY)
+		set_page_count(page, 1);
 }
 
 #ifdef CONFIG_FS_DAX
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5486d47406e..8ee52992055b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6719,6 +6719,8 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 {
 
 	__init_single_page(page, pfn, zone_idx, nid);
+	if (pgmap->init_mode == INIT_PAGEMAP_IDLE)
+		set_page_count(page, 0);
 
 	/*
 	 * Mark page reserved as it will need to wait for onlining

From patchwork Fri Sep 16 03:36:49 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978091
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id D9157ECAAD3
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:37:56 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229904AbiIPDhy (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:37:54 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:58230 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229679AbiIPDgx (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:36:53 -0400
Received: from mga03.intel.com (mga03.intel.com [134.134.136.65])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CD3079E682;
        Thu, 15 Sep 2022 20:36:51 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299411; x=1694835411;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=ZO7eiqr8wx9WyqWIgXnO5hpkhzDCAJO+DedF+qkd8PM=;
  b=XuI08Z35nz2ZKEMAOCciIDOhCk6KW89eYv1P18yXt7ObtP4gQoxPsptg
   JbdwZTjkQt/E5BHQVU5GQa8uDTEBNAFxqmKsRqhQtioUp+PzmsqvnMy3Y
   hcY3GIs/TUhrOwz4Z2p0Z9FmQNtkBga2m6kyc+6tuya9+NgmFqb6/iVxE
   YvronUUUQjpyUMbloDNRBO037xoZwWej62j3mGhfAV9RsXijIbt9fofpz
   EB3K9C7dxCJqx0pfj40v/Ies6IJhiG+6O+N/bOhJZjujrmi8Mqprkq7MF
   G0kx2d4IEsNphq2Lh80AipJVNNDJr36QnSId9NIWMhuKPMIl5rJefRZB6
   w==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="300264140"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="300264140"
Received: from fmsmga008.fm.intel.com ([10.253.24.58])
  by orsmga103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:51 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="679809618"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by fmsmga008-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:50 -0700
Subject: [PATCH v2 17/18] fsdax: Delete put_devmap_managed_page_refs()
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Jason Gunthorpe <jgg@nvidia.com>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:49 -0700
Message-ID: 
 <166329940976.2786261.12969674850633492781.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Now that fsdax DMA-idle detection no longer depends on catching
transitions of page->_refcount to 1, remove
put_devmap_managed_page_refs() and associated infrastructure.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/mm.h |   30 ------------------------------
 mm/gup.c           |    6 ++----
 mm/memremap.c      |   18 ------------------
 mm/swap.c          |    2 --
 4 files changed, 2 insertions(+), 54 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..182fe336a268 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1048,30 +1048,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
-	if (!static_branch_unlikely(&devmap_managed_key))
-		return false;
-	if (!is_zone_device_page(page))
-		return false;
-	return __put_devmap_managed_page_refs(page, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
-	return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
-static inline bool put_devmap_managed_page(struct page *page)
-{
-	return put_devmap_managed_page_refs(page, 1);
-}
-
 /* 127: arbitrary random number, small enough to assemble well */
 #define folio_ref_zero_or_close_to_overflow(folio) \
 	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1168,12 +1144,6 @@ static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
 
-	/*
-	 * For some devmap managed pages we need to catch refcount transition
-	 * from 2 to 1:
-	 */
-	if (put_devmap_managed_page(&folio->page))
-		return;
 	folio_put(folio);
 }
 
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..c6d060dee9e0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,8 +87,7 @@ static inline struct folio *try_get_folio(struct page *page, int refs)
 	 * belongs to this folio.
 	 */
 	if (unlikely(page_folio(page) != folio)) {
-		if (!put_devmap_managed_page_refs(&folio->page, refs))
-			folio_put_refs(folio, refs);
+		folio_put_refs(folio, refs);
 		goto retry;
 	}
 
@@ -177,8 +176,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 			refs *= GUP_PIN_COUNTING_BIAS;
 	}
 
-	if (!put_devmap_managed_page_refs(&folio->page, refs))
-		folio_put_refs(folio, refs);
+	folio_put_refs(folio, refs);
 }
 
 /**
diff --git a/mm/memremap.c b/mm/memremap.c
index b6a7a95339b3..0f4a2e20c159 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -515,21 +515,3 @@ void free_zone_device_page(struct page *page)
 	if (pgmap->init_mode == INIT_PAGEMAP_BUSY)
 		set_page_count(page, 1);
 }
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page_refs(struct page *page, int refs)
-{
-	if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
-		return false;
-
-	/*
-	 * fsdax page refcounts are 1-based, rather than 0-based: if
-	 * refcount is 1, then the page is free and the refcount is
-	 * stable because nobody holds a reference on the page.
-	 */
-	if (page_ref_sub_return(page, refs) == 1)
-		wake_up_var(page);
-	return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_page_refs);
-#endif /* CONFIG_FS_DAX */
diff --git a/mm/swap.c b/mm/swap.c
index 9cee7f6a3809..b346dd24cde8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -960,8 +960,6 @@ void release_pages(struct page **pages, int nr)
 				unlock_page_lruvec_irqrestore(lruvec, flags);
 				lruvec = NULL;
 			}
-			if (put_devmap_managed_page(&folio->page))
-				continue;
 			if (folio_put_testzero(folio))
 				free_zone_device_page(&folio->page);
 			continue;

From patchwork Fri Sep 16 03:36:56 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Dan Williams <dan.j.williams@intel.com>
X-Patchwork-Id: 12978092
Return-Path: <linux-xfs-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id F34A2C6FA86
	for <linux-xfs@archiver.kernel.org>; Fri, 16 Sep 2022 03:38:14 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229884AbiIPDiH (ORCPT <rfc822;linux-xfs@archiver.kernel.org>);
        Thu, 15 Sep 2022 23:38:07 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:58488 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229690AbiIPDhB (ORCPT
        <rfc822;linux-xfs@vger.kernel.org>); Thu, 15 Sep 2022 23:37:01 -0400
Received: from mga06.intel.com (mga06b.intel.com [134.134.136.31])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9819D9E0DB;
        Thu, 15 Sep 2022 20:36:58 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1663299419; x=1694835419;
  h=subject:from:to:cc:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=shTt/uNd4pi9Ct2gNVWBBoQz77p4cEL4DPuvM6dkcww=;
  b=fVQpPTu6xRLB42XdMaWSav7NeKLffvWUAZ8CUH2aLg/bE+b4nHB9vb6C
   Z/9CmzWRiAwjeb0Fztk4c8jdcSTwnsY6yJFhThGBFRUwDyLljH/BvHjKP
   U0BlysFLx91tAoYJoxZ4wvpy7OWlIZQzMnVUkBf/4Wx1dqlIgGfNY7j4f
   UBdOyYzZeqZRhl8gRNYSOpDwpBxHYF/8WCvYgWT3tNS5FJ8SgIh9WdQM1
   Iv6HSxfQ0oDWL/ZAql2pyzqp4+clnzrCMtDyTmIIGC1L7PGLevyEdCKfB
   HEUJPyiT3uAbVLDzusBoTWqYS9Ck3FKU4dKCub1lDiB/vMnzl5BVP0kaQ
   Q==;
X-IronPort-AV: E=McAfee;i="6500,9779,10471"; a="360643403"
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="360643403"
Received: from orsmga004.jf.intel.com ([10.7.209.38])
  by orsmga104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:56 -0700
X-IronPort-AV: E=Sophos;i="5.93,319,1654585200";
   d="scan'208";a="743194587"
Received: from colinlix-mobl.amr.corp.intel.com (HELO
 dwillia2-xfh.jf.intel.com) ([10.209.29.52])
  by orsmga004-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 15 Sep 2022 20:36:56 -0700
Subject: [PATCH v2 18/18] mm/gup: Drop DAX pgmap accounting
From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Matthew Wilcox <willy@infradead.org>, Jan Kara <jack@suse.cz>,
        "Darrick J. Wong" <djwong@kernel.org>,
        Christoph Hellwig <hch@lst.de>,
        John Hubbard <jhubbard@nvidia.com>,
        Jason Gunthorpe <jgg@nvidia.com>,
        linux-fsdevel@vger.kernel.org, nvdimm@lists.linux.dev,
        linux-xfs@vger.kernel.org, linux-mm@kvack.org,
        linux-ext4@vger.kernel.org
Date: Thu, 15 Sep 2022 20:36:56 -0700
Message-ID: 
 <166329941594.2786261.16402766003789003164.stgit@dwillia2-xfh.jf.intel.com>
In-Reply-To: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
References: 
 <166329930818.2786261.6086109734008025807.stgit@dwillia2-xfh.jf.intel.com>
User-Agent: StGit/0.18-3-g996c
MIME-Version: 1.0
Precedence: bulk
List-ID: <linux-xfs.vger.kernel.org>
X-Mailing-List: linux-xfs@vger.kernel.org

Now that pgmap accounting is handled at map time, it can be dropped from
gup time.

A hurdle still remains that filesystem-DAX huge pages are not compound
pages which still requires infrastructure like
__gup_device_huge_p{m,u}d() to stick around.

Additionally, ZONE_DEVICE pages with this change are still not suitable
to be returned from vm_normal_page(), so this cleanup is limited to
deleting pgmap reference manipulation. This is an incremental step on
the path to removing pte_devmap() altogether.

Note that follow_pmd_devmap() can be deleted entirely since a few
additions of pmd_devmap() allows the transparent huge page path to be
reused.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: John Hubbard <jhubbard@nvidia.com>
Reported-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/huge_mm.h |   12 +------
 mm/gup.c                |   83 +++++++++++------------------------------------
 mm/huge_memory.c        |   54 +------------------------------
 3 files changed, 22 insertions(+), 127 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de73f5a16252..b8ed373c6090 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -263,10 +263,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return folio_order(folio) >= HPAGE_PMD_ORDER;
 }
 
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap);
+		pud_t *pud, int flags);
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
@@ -418,14 +416,8 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 	return;
 }
 
-static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
-	unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
-	return NULL;
-}
-
 static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-	unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
+	unsigned long addr, pud_t *pud, int flags)
 {
 	return NULL;
 }
diff --git a/mm/gup.c b/mm/gup.c
index c6d060dee9e0..8e6dd4308e19 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -25,7 +25,6 @@
 #include "internal.h"
 
 struct follow_page_context {
-	struct dev_pagemap *pgmap;
 	unsigned int page_mask;
 };
 
@@ -487,8 +486,7 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd, unsigned int flags,
-		struct dev_pagemap **pgmap)
+		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *page;
@@ -532,17 +530,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 
 	page = vm_normal_page(vma, address, pte);
-	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+	if (!page && pte_devmap(pte)) {
 		/*
-		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
-		 * case since they are only valid while holding the pgmap
-		 * reference.
+		 * ZONE_DEVICE pages are not yet treated as vm_normal_page()
+		 * instances, with respect to mapcount and compound-page
+		 * metadata
 		 */
-		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
-		if (*pgmap)
-			page = pte_page(pte);
-		else
-			goto no_page;
+		page = pte_page(pte);
 	} else if (unlikely(!page)) {
 		if (flags & FOLL_DUMP) {
 			/* Avoid special (like zero) pages in core dumps */
@@ -660,15 +654,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			return no_page_table(vma, flags);
 		goto retry;
 	}
-	if (pmd_devmap(pmdval)) {
-		ptl = pmd_lock(mm, pmd);
-		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
-		spin_unlock(ptl);
-		if (page)
-			return page;
-	}
-	if (likely(!pmd_trans_huge(pmdval)))
-		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+	if (likely(!(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))))
+		return follow_page_pte(vma, address, pmd, flags);
 
 	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
 		return no_page_table(vma, flags);
@@ -686,9 +673,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		pmd_migration_entry_wait(mm, pmd);
 		goto retry_locked;
 	}
-	if (unlikely(!pmd_trans_huge(*pmd))) {
+	if (unlikely(!(pmd_trans_huge(*pmd) || pmd_devmap(pmdval)))) {
 		spin_unlock(ptl);
-		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+		return follow_page_pte(vma, address, pmd, flags);
 	}
 	if (flags & FOLL_SPLIT_PMD) {
 		int ret;
@@ -706,7 +693,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		}
 
 		return ret ? ERR_PTR(ret) :
-			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+			follow_page_pte(vma, address, pmd, flags);
 	}
 	page = follow_trans_huge_pmd(vma, address, pmd, flags);
 	spin_unlock(ptl);
@@ -743,7 +730,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	}
 	if (pud_devmap(*pud)) {
 		ptl = pud_lock(mm, pud);
-		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+		page = follow_devmap_pud(vma, address, pud, flags);
 		spin_unlock(ptl);
 		if (page)
 			return page;
@@ -790,9 +777,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
  *
  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
  *
- * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
- * the device's dev_pagemap metadata to avoid repeating expensive lookups.
- *
  * When getting an anonymous page and the caller has to trigger unsharing
  * of a shared anonymous page first, -EMLINK is returned. The caller should
  * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
@@ -847,7 +831,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			 unsigned int foll_flags)
 {
-	struct follow_page_context ctx = { NULL };
+	struct follow_page_context ctx = { 0 };
 	struct page *page;
 
 	if (vma_is_secretmem(vma))
@@ -857,8 +841,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		return NULL;
 
 	page = follow_page_mask(vma, address, foll_flags, &ctx);
-	if (ctx.pgmap)
-		put_dev_pagemap(ctx.pgmap);
 	return page;
 }
 
@@ -1118,7 +1100,7 @@ static long __get_user_pages(struct mm_struct *mm,
 {
 	long ret = 0, i = 0;
 	struct vm_area_struct *vma = NULL;
-	struct follow_page_context ctx = { NULL };
+	struct follow_page_context ctx = { 0 };
 
 	if (!nr_pages)
 		return 0;
@@ -1241,8 +1223,6 @@ static long __get_user_pages(struct mm_struct *mm,
 		nr_pages -= page_increm;
 	} while (nr_pages);
 out:
-	if (ctx.pgmap)
-		put_dev_pagemap(ctx.pgmap);
 	return i ? i : ret;
 }
 
@@ -2322,9 +2302,8 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
-	struct dev_pagemap *pgmap = NULL;
-	int nr_start = *nr, ret = 0;
 	pte_t *ptep, *ptem;
+	int ret = 0;
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
 	do {
@@ -2345,12 +2324,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		if (pte_devmap(pte)) {
 			if (unlikely(flags & FOLL_LONGTERM))
 				goto pte_unmap;
-
-			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
-			if (unlikely(!pgmap)) {
-				undo_dev_pagemap(nr, nr_start, flags, pages);
-				goto pte_unmap;
-			}
 		} else if (pte_special(pte))
 			goto pte_unmap;
 
@@ -2397,8 +2370,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 	ret = 1;
 
 pte_unmap:
-	if (pgmap)
-		put_dev_pagemap(pgmap);
 	pte_unmap(ptem);
 	return ret;
 }
@@ -2425,28 +2396,17 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			     unsigned long end, unsigned int flags,
 			     struct page **pages, int *nr)
 {
-	int nr_start = *nr;
-	struct dev_pagemap *pgmap = NULL;
-
 	do {
 		struct page *page = pfn_to_page(pfn);
 
-		pgmap = get_dev_pagemap(pfn, pgmap);
-		if (unlikely(!pgmap)) {
-			undo_dev_pagemap(nr, nr_start, flags, pages);
-			break;
-		}
 		SetPageReferenced(page);
 		pages[*nr] = page;
-		if (unlikely(!try_grab_page(page, flags))) {
-			undo_dev_pagemap(nr, nr_start, flags, pages);
+		if (unlikely(!try_grab_page(page, flags)))
 			break;
-		}
 		(*nr)++;
 		pfn++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	put_dev_pagemap(pgmap);
 	return addr == end;
 }
 
@@ -2455,16 +2415,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 				 struct page **pages, int *nr)
 {
 	unsigned long fault_pfn;
-	int nr_start = *nr;
 
 	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
 		return 0;
 
-	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-		undo_dev_pagemap(nr, nr_start, flags, pages);
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp)))
 		return 0;
-	}
+
 	return 1;
 }
 
@@ -2473,16 +2431,13 @@ static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 				 struct page **pages, int *nr)
 {
 	unsigned long fault_pfn;
-	int nr_start = *nr;
 
 	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
 		return 0;
 
-	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-		undo_dev_pagemap(nr, nr_start, flags, pages);
+	if (unlikely(pud_val(orig) != pud_val(*pudp)))
 		return 0;
-	}
 	return 1;
 }
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8a7c1b344abe..ef68296f2158 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,55 +1031,6 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pmd(vma, addr, pmd);
 }
 
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
-	unsigned long pfn = pmd_pfn(*pmd);
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-
-	assert_spin_locked(pmd_lockptr(mm, pmd));
-
-	/*
-	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
-	 * not be in this function with `flags & FOLL_COW` set.
-	 */
-	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
-
-	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
-	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
-			 (FOLL_PIN | FOLL_GET)))
-		return NULL;
-
-	if (flags & FOLL_WRITE && !pmd_write(*pmd))
-		return NULL;
-
-	if (pmd_present(*pmd) && pmd_devmap(*pmd))
-		/* pass */;
-	else
-		return NULL;
-
-	if (flags & FOLL_TOUCH)
-		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
-	/*
-	 * device mapped pages can only be returned if the
-	 * caller will manage the page reference count.
-	 */
-	if (!(flags & (FOLL_GET | FOLL_PIN)))
-		return ERR_PTR(-EEXIST);
-
-	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-	if (!try_grab_page(page, flags))
-		page = ERR_PTR(-ENOMEM);
-
-	return page;
-}
-
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1196,7 +1147,7 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 }
 
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap)
+			       pud_t *pud, int flags)
 {
 	unsigned long pfn = pud_pfn(*pud);
 	struct mm_struct *mm = vma->vm_mm;
@@ -1230,9 +1181,6 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 		return ERR_PTR(-EEXIST);
 
 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
 	page = pfn_to_page(pfn);
 	if (!try_grab_page(page, flags))
 		page = ERR_PTR(-ENOMEM);