diff mbox

[v3,13/15] block, dax: make dax mappings opt-in by default

Message ID 20151102043053.6610.43948.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Superseded
Headers show

Commit Message

Dan Williams Nov. 2, 2015, 4:30 a.m. UTC
Now that we have the ability to dynamically enable DAX for a raw block
inode, make the behavior opt-in by default.  DAX does not have feature
parity with pagecache backed mappings, so applications should knowingly
enable DAX semantics.

Note, this is only for mappings returned to userspace.  For the
synchronous usages of DAX, dax_do_io(), there is no semantic difference
with the bio submission path, so that path remains default enabled.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 block/ioctl.c      |    3 +--
 fs/block_dev.c     |   33 +++++++++++++++++++++++----------
 include/linux/fs.h |    8 ++++++++
 3 files changed, 32 insertions(+), 12 deletions(-)

Comments

Dave Chinner Nov. 3, 2015, 12:32 a.m. UTC | #1
On Sun, Nov 01, 2015 at 11:30:53PM -0500, Dan Williams wrote:
> Now that we have the ability to dynamically enable DAX for a raw block
> inode, make the behavior opt-in by default.  DAX does not have feature
> parity with pagecache backed mappings, so applications should knowingly
> enable DAX semantics.
> 
> Note, this is only for mappings returned to userspace.  For the
> synchronous usages of DAX, dax_do_io(), there is no semantic difference
> with the bio submission path, so that path remains default enabled.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  block/ioctl.c      |    3 +--
>  fs/block_dev.c     |   33 +++++++++++++++++++++++----------
>  include/linux/fs.h |    8 ++++++++
>  3 files changed, 32 insertions(+), 12 deletions(-)
> 
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 205d57612fbd..c4c3a09d9ca9 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -298,13 +298,12 @@ static inline int is_unrecognized_ioctl(int ret)
>  #ifdef CONFIG_FS_DAX
>  static int blkdev_set_dax(struct block_device *bdev, int n)
>  {
> -	struct gendisk *disk = bdev->bd_disk;
>  	int rc = 0;
>  
>  	if (n)
>  		n = S_DAX;
>  
> -	if (n && !disk->fops->direct_access)
> +	if (n && !blkdev_dax_capable(bdev))
>  		return -ENOTTY;
>  
>  	mutex_lock(&bdev->bd_inode->i_mutex);
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 13ce6d0ff7f6..ee34a31e6fa4 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -152,16 +152,37 @@ static struct inode *bdev_file_inode(struct file *file)
>  	return file->f_mapping->host;
>  }
>  
> +#ifdef CONFIG_FS_DAX
> +bool blkdev_dax_capable(struct block_device *bdev)
> +{
> +	struct gendisk *disk = bdev->bd_disk;
> +
> +	if (!disk->fops->direct_access)
> +		return false;
> +
> +	/*
> +	 * If the partition is not aligned on a page boundary, we can't
> +	 * do dax I/O to it.
> +	 */
> +	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
> +			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
> +		return false;
> +
> +	return true;

Where do you check that S_DAX has been enabled on the block device
now?

Cheers,

Dave.
Dan Williams Nov. 3, 2015, 7:35 a.m. UTC | #2
On Mon, Nov 2, 2015 at 4:32 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Sun, Nov 01, 2015 at 11:30:53PM -0500, Dan Williams wrote:
>> Now that we have the ability to dynamically enable DAX for a raw block
>> inode, make the behavior opt-in by default.  DAX does not have feature
>> parity with pagecache backed mappings, so applications should knowingly
>> enable DAX semantics.
>>
>> Note, this is only for mappings returned to userspace.  For the
>> synchronous usages of DAX, dax_do_io(), there is no semantic difference
>> with the bio submission path, so that path remains default enabled.
>>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  block/ioctl.c      |    3 +--
>>  fs/block_dev.c     |   33 +++++++++++++++++++++++----------
>>  include/linux/fs.h |    8 ++++++++
>>  3 files changed, 32 insertions(+), 12 deletions(-)
>>
>> diff --git a/block/ioctl.c b/block/ioctl.c
>> index 205d57612fbd..c4c3a09d9ca9 100644
>> --- a/block/ioctl.c
>> +++ b/block/ioctl.c
>> @@ -298,13 +298,12 @@ static inline int is_unrecognized_ioctl(int ret)
>>  #ifdef CONFIG_FS_DAX
>>  static int blkdev_set_dax(struct block_device *bdev, int n)
>>  {
>> -     struct gendisk *disk = bdev->bd_disk;
>>       int rc = 0;
>>
>>       if (n)
>>               n = S_DAX;
>>
>> -     if (n && !disk->fops->direct_access)
>> +     if (n && !blkdev_dax_capable(bdev))
>>               return -ENOTTY;
>>
>>       mutex_lock(&bdev->bd_inode->i_mutex);
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 13ce6d0ff7f6..ee34a31e6fa4 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -152,16 +152,37 @@ static struct inode *bdev_file_inode(struct file *file)
>>       return file->f_mapping->host;
>>  }
>>
>> +#ifdef CONFIG_FS_DAX
>> +bool blkdev_dax_capable(struct block_device *bdev)
>> +{
>> +     struct gendisk *disk = bdev->bd_disk;
>> +
>> +     if (!disk->fops->direct_access)
>> +             return false;
>> +
>> +     /*
>> +      * If the partition is not aligned on a page boundary, we can't
>> +      * do dax I/O to it.
>> +      */
>> +     if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
>> +                     || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
>> +             return false;
>> +
>> +     return true;
>
> Where do you check that S_DAX has been enabled on the block device
> now?
>

Only in the mmap path:

static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *bd_inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(bd_inode);

        file_accessed(file);
        mutex_lock(&bd_inode->i_mutex);
        bdev->bd_map_count++;
        if (IS_DAX(bd_inode)) {
                vma->vm_ops = &blkdev_dax_vm_ops;
                vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        } else {
                vma->vm_ops = &blkdev_default_vm_ops;
        }
        mutex_unlock(&bd_inode->i_mutex);

        return 0;
}
Dave Chinner Nov. 3, 2015, 8:20 p.m. UTC | #3
On Mon, Nov 02, 2015 at 11:35:04PM -0800, Dan Williams wrote:
> On Mon, Nov 2, 2015 at 4:32 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Sun, Nov 01, 2015 at 11:30:53PM -0500, Dan Williams wrote:
> >> Now that we have the ability to dynamically enable DAX for a raw block
> >> inode, make the behavior opt-in by default.  DAX does not have feature
> >> parity with pagecache backed mappings, so applications should knowingly
> >> enable DAX semantics.
> >>
> >> Note, this is only for mappings returned to userspace.  For the
> >> synchronous usages of DAX, dax_do_io(), there is no semantic difference
> >> with the bio submission path, so that path remains default enabled.
> >>
> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >> ---
> >>  block/ioctl.c      |    3 +--
> >>  fs/block_dev.c     |   33 +++++++++++++++++++++++----------
> >>  include/linux/fs.h |    8 ++++++++
> >>  3 files changed, 32 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/block/ioctl.c b/block/ioctl.c
> >> index 205d57612fbd..c4c3a09d9ca9 100644
> >> --- a/block/ioctl.c
> >> +++ b/block/ioctl.c
> >> @@ -298,13 +298,12 @@ static inline int is_unrecognized_ioctl(int ret)
> >>  #ifdef CONFIG_FS_DAX
> >>  static int blkdev_set_dax(struct block_device *bdev, int n)
> >>  {
> >> -     struct gendisk *disk = bdev->bd_disk;
> >>       int rc = 0;
> >>
> >>       if (n)
> >>               n = S_DAX;
> >>
> >> -     if (n && !disk->fops->direct_access)
> >> +     if (n && !blkdev_dax_capable(bdev))
> >>               return -ENOTTY;
> >>
> >>       mutex_lock(&bdev->bd_inode->i_mutex);
> >> diff --git a/fs/block_dev.c b/fs/block_dev.c
> >> index 13ce6d0ff7f6..ee34a31e6fa4 100644
> >> --- a/fs/block_dev.c
> >> +++ b/fs/block_dev.c
> >> @@ -152,16 +152,37 @@ static struct inode *bdev_file_inode(struct file *file)
> >>       return file->f_mapping->host;
> >>  }
> >>
> >> +#ifdef CONFIG_FS_DAX
> >> +bool blkdev_dax_capable(struct block_device *bdev)
> >> +{
> >> +     struct gendisk *disk = bdev->bd_disk;
> >> +
> >> +     if (!disk->fops->direct_access)
> >> +             return false;
> >> +
> >> +     /*
> >> +      * If the partition is not aligned on a page boundary, we can't
> >> +      * do dax I/O to it.
> >> +      */
> >> +     if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
> >> +                     || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
> >> +             return false;
> >> +
> >> +     return true;
> >
> > Where do you check that S_DAX has been enabled on the block device
> > now?
> >
> 
> Only in the mmap path:

which means blkdev_direct_IO() is now always going to go down the
dax_do_io() path for any driver with a ->direct_access method rather
than the direct IO path, regardless of whether DAX is enabled on the
device or not.

That really seems wrong to me - you've replace explicit "is DAX
enabled" checks with "is DAX possible" checks, and so DAX paths are
used regardless of whether DAX is enabled or not. And it's not
obvious why this is done, nor is it now obvious how DAX interacts
with the block device.

This really seems like a step backwards to me.

Cheers,

Dave.
Dan Williams Nov. 3, 2015, 11:04 p.m. UTC | #4
On Tue, Nov 3, 2015 at 12:20 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Nov 02, 2015 at 11:35:04PM -0800, Dan Williams wrote:
>> On Mon, Nov 2, 2015 at 4:32 PM, Dave Chinner <david@fromorbit.com> wrote:
[..]
>> Only in the mmap path:
>
> which means blkdev_direct_IO() is now always going to go down the
> dax_do_io() path for any driver with a ->direct_access method rather
> than the direct IO path, regardless of whether DAX is enabled on the
> device or not.
>
> That really seems wrong to me - you've replace explicit "is DAX
> enabled" checks with "is DAX possible" checks, and so DAX paths are
> used regardless of whether DAX is enabled or not. And it's not
> obvious why this is done, nor is it now obvious how DAX interacts
> with the block device.
>
> This really seems like a step backwards to me.

I think the reason it is not obvious is the original justification for
the bypass as stated in commit bbab37ddc20b "block: Add support for
DAX reads/writes to block devices" was:

    "instead of allocating a DIO and a BIO"

It turns out it's faster and as far as I can tell semantically
equivalent to the __blockdev_direct_IO() path.  The DAX mmap path in
comparison has plenty of sharp edges and semantic differences that
would be avoided by turning off DAX.

I'm not opposed to also turning off dax_do_io() when S_DAX is clear,
but I don't currently see the point.  At the very least I need to add
the above comments to the code, but do you still think opt-in DAX is a
backwards step?
Dan Williams Nov. 4, 2015, 7:23 p.m. UTC | #5
On Tue, Nov 3, 2015 at 3:04 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Tue, Nov 3, 2015 at 12:20 PM, Dave Chinner <david@fromorbit.com> wrote:
>> On Mon, Nov 02, 2015 at 11:35:04PM -0800, Dan Williams wrote:
>>> On Mon, Nov 2, 2015 at 4:32 PM, Dave Chinner <david@fromorbit.com> wrote:
> [..]
>>> Only in the mmap path:
>>
>> which means blkdev_direct_IO() is now always going to go down the
>> dax_do_io() path for any driver with a ->direct_access method rather
>> than the direct IO path, regardless of whether DAX is enabled on the
>> device or not.
>>
>> That really seems wrong to me - you've replace explicit "is DAX
>> enabled" checks with "is DAX possible" checks, and so DAX paths are
>> used regardless of whether DAX is enabled or not. And it's not
>> obvious why this is done, nor is it now obvious how DAX interacts
>> with the block device.
>>
>> This really seems like a step backwards to me.
>
> I think the reason it is not obvious is the original justification for
> the bypass as stated in commit bbab37ddc20b "block: Add support for
> DAX reads/writes to block devices" was:
>
>     "instead of allocating a DIO and a BIO"
>
> It turns out it's faster and as far as I can tell semantically
> equivalent to the __blockdev_direct_IO() path.  The DAX mmap path in
> comparison has plenty of sharp edges and semantic differences that
> would be avoided by turning off DAX.
>
> I'm not opposed to also turning off dax_do_io() when S_DAX is clear,
> but I don't currently see the point.  At the very least I need to add
> the above comments to the code, but do you still think opt-in DAX is a
> backwards step?

I thought of one way dax_do_io() breaks current semantics, it defeats
blktrace and i/o stat tracking.  I'll restore the existing behavior
that gates dax_do_io() on S_DAX.
diff mbox

Patch

diff --git a/block/ioctl.c b/block/ioctl.c
index 205d57612fbd..c4c3a09d9ca9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -298,13 +298,12 @@  static inline int is_unrecognized_ioctl(int ret)
 #ifdef CONFIG_FS_DAX
 static int blkdev_set_dax(struct block_device *bdev, int n)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	int rc = 0;
 
 	if (n)
 		n = S_DAX;
 
-	if (n && !disk->fops->direct_access)
+	if (n && !blkdev_dax_capable(bdev))
 		return -ENOTTY;
 
 	mutex_lock(&bdev->bd_inode->i_mutex);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 13ce6d0ff7f6..ee34a31e6fa4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -152,16 +152,37 @@  static struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
+#ifdef CONFIG_FS_DAX
+bool blkdev_dax_capable(struct block_device *bdev)
+{
+	struct gendisk *disk = bdev->bd_disk;
+
+	if (!disk->fops->direct_access)
+		return false;
+
+	/*
+	 * If the partition is not aligned on a page boundary, we can't
+	 * do dax I/O to it.
+	 */
+	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+		return false;
+
+	return true;
+}
+#endif
+
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
+	struct block_device *bdev = I_BDEV(inode);
 
-	if (IS_DAX(inode))
+	if (blkdev_dax_capable(bdev))
 		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
 				NULL, DIO_SKIP_DIO_COUNT);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
 				    blkdev_get_block, NULL, NULL,
 				    DIO_SKIP_DIO_COUNT);
 }
@@ -1185,7 +1206,6 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
 		if (!partno) {
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
@@ -1247,13 +1267,6 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-			/*
-			 * If the partition is not aligned on a page
-			 * boundary, we can't do dax I/O to it.
-			 */
-			if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
-			    (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-				bdev->bd_inode->i_flags &= ~S_DAX;
 		}
 	} else {
 		if (bdev->bd_contains == bdev) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8fb2d4b848bf..5a9e14538f69 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2282,6 +2282,14 @@  extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
+#ifdef CONFIG_FS_DAX
+extern bool blkdev_dax_capable(struct block_device *bdev);
+#else
+static inline bool blkdev_dax_capable(struct block_device *bdev)
+{
+	return false;
+}
+#endif
 
 extern struct super_block *blockdev_superblock;