diff mbox

[RFC,2/2] block: enable dax for raw block devices

Message ID 20151017004702.2742.82530.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Superseded
Headers show

Commit Message

Dan Williams Oct. 17, 2015, 12:49 a.m. UTC
If an application wants exclusive access to all of the persistent memory
provided by an NVDIMM namespace it can use this raw-block-dax facility
to forgo establishing a filesystem.  This capability is targeted
primarily to hypervisors wanting to provision persistent memory for
guests.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

Only lighted tested so far, but seems to work, is the shortest path to a
DAX mapping, and makes it easier to trigger the pmd_fault path (no
fs-block-allocator interactions).

 fs/block_dev.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

Comments

Ross Zwisler Oct. 19, 2015, 3:01 a.m. UTC | #1
On Fri, Oct 16, 2015 at 08:49:41PM -0400, Dan Williams wrote:
> If an application wants exclusive access to all of the persistent memory
> provided by an NVDIMM namespace it can use this raw-block-dax facility
> to forgo establishing a filesystem.  This capability is targeted
> primarily to hypervisors wanting to provision persistent memory for
> guests.
> 
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
> 
> Only lighted tested so far, but seems to work, is the shortest path to a
> DAX mapping, and makes it easier to trigger the pmd_fault path (no
> fs-block-allocator interactions).
> 
>  fs/block_dev.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 83 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 5277dd83d254..498b71455570 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1687,13 +1687,95 @@ static const struct address_space_operations def_blk_aops = {
>  	.is_dirty_writeback = buffer_check_dirty_writeback,
>  };
>  
> +#ifdef CONFIG_FS_DAX
> +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	struct inode *bd_inode = file_bd_inode(vma->vm_file);
> +	struct block_device *bdev = I_BDEV(bd_inode);
> +	int ret;
> +
> +	mutex_lock(&bdev->bd_mutex);
> +	ret = __dax_fault(vma, vmf, blkdev_get_block, NULL);
> +	mutex_unlock(&bdev->bd_mutex);
> +
> +	return ret;
> +}

This all looks very straightforward.  The one comment I have is that this
code is missing the calls to sb_[start|end]_pagefault(), and to
file_update_time() that are found in ext[24]/xfs and the generic fault code.

The previous version of this code used the generic fault implementation, and
was calling these functions via filemap_page_mkwrite().

It is possible that they were omitted for a reason - does protection from
filesystem freezing still make sense when talking with a raw block device?
For example, if that block device *has* a mounted filesystem on it that is
frozen, does sb_start_pagefault() prevent against page faults on the raw
device that try and make something writable?  

In any case, the presence of them in filemap_page_mkwrite() tells me that they
at least aren't harmful, and I wanted to make sure they weren't needed before
leaving them out.  If the omission was intentional, should we add a comment to
explain why they are missing?
Dan Williams Oct. 19, 2015, 8:49 p.m. UTC | #2
On Sun, Oct 18, 2015 at 8:01 PM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> On Fri, Oct 16, 2015 at 08:49:41PM -0400, Dan Williams wrote:
>> If an application wants exclusive access to all of the persistent memory
>> provided by an NVDIMM namespace it can use this raw-block-dax facility
>> to forgo establishing a filesystem.  This capability is targeted
>> primarily to hypervisors wanting to provision persistent memory for
>> guests.
>>
>> Cc: Jeff Moyer <jmoyer@redhat.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Al Viro <viro@zeniv.linux.org.uk>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>
>> Only lighted tested so far, but seems to work, is the shortest path to a
>> DAX mapping, and makes it easier to trigger the pmd_fault path (no
>> fs-block-allocator interactions).
>>
>>  fs/block_dev.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 83 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 5277dd83d254..498b71455570 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1687,13 +1687,95 @@ static const struct address_space_operations def_blk_aops = {
>>       .is_dirty_writeback = buffer_check_dirty_writeback,
>>  };
>>
>> +#ifdef CONFIG_FS_DAX
>> +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>> +{
>> +     struct inode *bd_inode = file_bd_inode(vma->vm_file);
>> +     struct block_device *bdev = I_BDEV(bd_inode);
>> +     int ret;
>> +
>> +     mutex_lock(&bdev->bd_mutex);
>> +     ret = __dax_fault(vma, vmf, blkdev_get_block, NULL);
>> +     mutex_unlock(&bdev->bd_mutex);
>> +
>> +     return ret;
>> +}
>
> This all looks very straightforward.  The one comment I have is that this
> code is missing the calls to sb_[start|end]_pagefault(), and to
> file_update_time() that are found in ext[24]/xfs and the generic fault code.
>
> The previous version of this code used the generic fault implementation, and
> was calling these functions via filemap_page_mkwrite().
>
> It is possible that they were omitted for a reason - does protection from
> filesystem freezing still make sense when talking with a raw block device?
> For example, if that block device *has* a mounted filesystem on it that is
> frozen, does sb_start_pagefault() prevent against page faults on the raw
> device that try and make something writable?
>
> In any case, the presence of them in filemap_page_mkwrite() tells me that they
> at least aren't harmful, and I wanted to make sure they weren't needed before
> leaving them out.  If the omission was intentional, should we add a comment to
> explain why they are missing?

So, I left them out on purpose and labeled this "RFC" mainly because I
wasn't ready to assert that the new locking using bd_mutex did not
have bad interactions with those paths or the mmap path in general.

The access time and interactions with freezing are different for raw
block device files given that the inode for the per-instance
device-node file is separate from the inode for the block device
itself.  The device node file is typically on a tmpfs filesystem
mounted at /dev.  The access time of the device node doesn't have a
reliable correlation with the access time of the block-device due to
the fact that you can mknod() a new device node anywhere, on any
filesystem, and do i/o.

I'll consider adding them back if they do no harm, but "device special
files" are already sufficiently different than regular files that I'm
not sure it matters.  Either way I agree this could use more
commentary in the code.
diff mbox

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5277dd83d254..498b71455570 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1687,13 +1687,95 @@  static const struct address_space_operations def_blk_aops = {
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
+#ifdef CONFIG_FS_DAX
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *bd_inode = file_bd_inode(vma->vm_file);
+	struct block_device *bdev = I_BDEV(bd_inode);
+	int ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = __dax_fault(vma, vmf, blkdev_get_block, NULL);
+	mutex_unlock(&bdev->bd_mutex);
+
+	return ret;
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	struct inode *bd_inode = file_bd_inode(vma->vm_file);
+	struct block_device *bdev = I_BDEV(bd_inode);
+	int ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+	mutex_unlock(&bdev->bd_mutex);
+
+	return ret;
+}
+
+static int blkdev_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *bd_inode = file_bd_inode(vma->vm_file);
+	struct block_device *bdev = I_BDEV(bd_inode);
+	int ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = __dax_mkwrite(vma, vmf, blkdev_get_block, NULL);
+	mutex_unlock(&bdev->bd_mutex);
+
+	return ret;
+}
+
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	struct inode *bd_inode = file_bd_inode(vma->vm_file);
+	struct block_device *bdev = I_BDEV(bd_inode);
+	int ret = VM_FAULT_NOPAGE;
+	loff_t size;
+
+	/* check that the faulting page hasn't raced with bdev resize */
+	mutex_lock(&bdev->bd_mutex);
+	size = (i_size_read(bd_inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	mutex_unlock(&bdev->bd_mutex);
+
+	return ret;
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+	.fault		= blkdev_dax_fault,
+	.pmd_fault	= blkdev_dax_pmd_fault,
+	.page_mkwrite	= blkdev_dax_mkwrite,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *bd_inode = file_bd_inode(file);
+
+	if (!IS_DAX(bd_inode))
+		return generic_file_mmap(file, vma);
+
+	file_accessed(file);
+	vma->vm_ops = &blkdev_dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
-	.mmap		= generic_file_mmap,
+	.mmap		= blkdev_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT