diff mbox

[v4,14/14] block, dax: opt-in control for raw block dax support

Message ID 20151108192838.9104.11328.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Accepted
Commit abcc9eb
Headers show

Commit Message

Dan Williams Nov. 8, 2015, 7:28 p.m. UTC
Now that we have the ability to dynamically enable/disable DAX for a raw
block inode, make the default behavior a compile-time decision.  DAX
does not yet have feature parity with page-cache-backed mappings, and it
may disable statistics that an application depends on, so environments
should knowingly enable DAX semantics.

Note that this does not affect the mmap path for filesystems on top of
a DAX-capable block device.  They currently open-code a check for the
->direct_access() op in the gendisk.  That said, DAX support is already
opt-in for filesystems via a mount flag.

Cc: Dave Chinner <david@fromorbit.com>
[dgc: leave the dax_do_io() path alone, let it honor S_DAX]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 block/Kconfig      |   15 +++++++++++++++
 block/ioctl.c      |    2 +-
 fs/block_dev.c     |   16 ++++++++--------
 include/linux/fs.h |    8 ++++++++
 4 files changed, 32 insertions(+), 9 deletions(-)
diff mbox

Patch

diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..6fb05c570332 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -111,6 +111,21 @@  config BLK_CMDLINE_PARSER
 
 	See Documentation/block/cmdline-partition.txt for more information.
 
+config BLK_DEV_DAX
+	bool "Block layer DAX support default"
+	depends on FS_DAX
+	help
+	  When DAX support is available (CONFIG_FS_DAX) raw block devices
+	  can also support direct userspace access to the storage capacity
+	  via MMAP(2) similar to a file on a DAX-enabled filesystem.
+	  However, the DAX I/O-path disables some standard I/O-statistics,
+	  and the MMAP(2) path has some functional differences due to
+	  bypassing the page cache.  The choice here can be overridden at
+	  run time via the BLKDAXSET ioctl.  If you are unsure if the DAX
+	  behavior is compatible with your environment, say N.  Otherwise
+	  DAX is a significantly faster way to access persistent memory
+	  from NVDIMM devices.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/ioctl.c b/block/ioctl.c
index 604438f36ddd..a353bcd29987 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -296,7 +296,7 @@  static inline int is_unrecognized_ioctl(int ret)
 }
 
 #ifdef CONFIG_FS_DAX
-static bool blkdev_dax_capable(struct block_device *bdev)
+bool blkdev_dax_capable(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 09d10667cc19..43af861463f4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1185,7 +1185,8 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+			bdev->bd_inode->i_flags = S_DAX;
 		if (!partno) {
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
@@ -1212,8 +1213,11 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				}
 			}
 
-			if (!ret)
+			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+				if (!blkdev_dax_capable(bdev))
+					bdev->bd_inode->i_flags &= ~S_DAX;
+			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1227,6 +1231,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				else if (ret == -ENOMEDIUM)
 					invalidate_partitions(disk, bdev);
 			}
+
 			if (ret)
 				goto out_clear;
 		} else {
@@ -1247,12 +1252,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-			/*
-			 * If the partition is not aligned on a page
-			 * boundary, we can't do dax I/O to it.
-			 */
-			if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
-			    (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+			if (!blkdev_dax_capable(bdev))
 				bdev->bd_inode->i_flags &= ~S_DAX;
 		}
 	} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8fb2d4b848bf..5a9e14538f69 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2282,6 +2282,14 @@  extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
+#ifdef CONFIG_FS_DAX
+extern bool blkdev_dax_capable(struct block_device *bdev);
+#else
+static inline bool blkdev_dax_capable(struct block_device *bdev)
+{
+	return false;
+}
+#endif
 
 extern struct super_block *blockdev_superblock;