diff mbox

NFS: allow name_to_handle_at() to work for Amazon EFS.

Message ID 87r2s7ql5m.fsf@notabene.neil.brown.name (mailing list archive)
State New, archived
Headers show

Commit Message

NeilBrown Dec. 7, 2017, 3:20 a.m. UTC
On Wed, Dec 06 2017, Linus Torvalds wrote:

> On Thu, Nov 30, 2017 at 12:56 PM, NeilBrown <neilb@suse.com> wrote:
>>
>> -/* limit the handle size to NFSv4 handle size now */
>> -#define MAX_HANDLE_SZ 128
>> +/* Must be larger than NFSv4 file handle, but small
>> + * enough for an on-stack allocation. overlayfs doesn't
>> + * want this too close to 255.
>> + */
>> +#define MAX_HANDLE_SZ 200
>
> This really smells for so many reasons.
>
> Also, that really is starting to be a fairly big stack allocation, and
> it seems to be used in exactly one place (show_mark_fhandle), which
> makes me go "why is that on the stack anyway?".
>
> Could we just allocate a buffer at open time or something?
>
>                Linus

"open time" would be when /proc/X/fdinfo/Y was opened in
seq_fdinfo_open(), and allocating a file_handle there seems a bit odd.

We can allocate in fs/notify/fdinfo.c:show_fdinfo() which is
the earliest 'notify' specific code to run.  There is no
opportunity to return an error but GFP_KERNEL allocations under 1 page
never fail..

This patch allocates a single buffer for all inodes reported for a given
inotify fdinfo, and if the allocation files, the filehandle is silently
left blank.  More surgery would be needed to be able to return an error.

Is that at all suitable?

Thanks,
NeilBrown

From: NeilBrown <neilb@suse.com>
Subject: fs/notify: don't put file handle buffer on stack.

A file handle buffer is not tiny, and could need to be larger in future,
so it isn't safe to allocate one on the stack.  Instead, we need to
kmalloc().

There is no way to return an error status from a ->show_fdinfo()
function, so if the kmalloc fails, we silently exclude the filehandle
from the output.  As it is at the end of line, this shouldn't
upset parsing too much.

Signed-off-by: NeilBrown <neilb@suse.com>

Comments

Amir Goldstein Dec. 7, 2017, 4:04 a.m. UTC | #1
On Thu, Dec 7, 2017 at 5:20 AM, NeilBrown <neilb@suse.com> wrote:
> On Wed, Dec 06 2017, Linus Torvalds wrote:
>
>> On Thu, Nov 30, 2017 at 12:56 PM, NeilBrown <neilb@suse.com> wrote:
>>>
>>> -/* limit the handle size to NFSv4 handle size now */
>>> -#define MAX_HANDLE_SZ 128
>>> +/* Must be larger than NFSv4 file handle, but small
>>> + * enough for an on-stack allocation. overlayfs doesn't
>>> + * want this too close to 255.
>>> + */
>>> +#define MAX_HANDLE_SZ 200
>>
>> This really smells for so many reasons.
>>
>> Also, that really is starting to be a fairly big stack allocation, and
>> it seems to be used in exactly one place (show_mark_fhandle), which
>> makes me go "why is that on the stack anyway?".
>>
>> Could we just allocate a buffer at open time or something?
>>
>>                Linus
>
> "open time" would be when /proc/X/fdinfo/Y was opened in
> seq_fdinfo_open(), and allocating a file_handle there seems a bit odd.
>
> We can allocate in fs/notify/fdinfo.c:show_fdinfo() which is
> the earliest 'notify' specific code to run.  There is no
> opportunity to return an error but GFP_KERNEL allocations under 1 page
> never fail..
>
> This patch allocates a single buffer for all inodes reported for a given
> inotify fdinfo, and if the allocation files, the filehandle is silently
> left blank.  More surgery would be needed to be able to return an error.
>
> Is that at all suitable?
>
> Thanks,
> NeilBrown
>
> From: NeilBrown <neilb@suse.com>
> Subject: fs/notify: don't put file handle buffer on stack.
>
> A file handle buffer is not tiny, and could need to be larger in future,
> so it isn't safe to allocate one on the stack.  Instead, we need to
> kmalloc().
>
> There is no way to return an error status from a ->show_fdinfo()
> function, so if the kmalloc fails, we silently exclude the filehandle
> from the output.  As it is at the end of line, this shouldn't
> upset parsing too much.

It shouldn't upset parsing because that would be the same out
output as without CONFIG_EXPORTFS. AFAIK this information
is used by CRUI.

>
> Signed-off-by: NeilBrown <neilb@suse.com>
>
> diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
> index d478629c728b..20d863b9ae16 100644
> --- a/fs/notify/fdinfo.c
> +++ b/fs/notify/fdinfo.c
> @@ -23,56 +23,58 @@
>
>  static void show_fdinfo(struct seq_file *m, struct file *f,
>                         void (*show)(struct seq_file *m,
> -                                    struct fsnotify_mark *mark))
> +                                    struct fsnotify_mark *mark,
> +                                    struct fid *fh))
>  {
>         struct fsnotify_group *group = f->private_data;
>         struct fsnotify_mark *mark;
> +       struct fid *fh = kmalloc(MAX_HANDLE_SZ, GFP_KERNEL);
>
>         mutex_lock(&group->mark_mutex);
>         list_for_each_entry(mark, &group->marks_list, g_list) {
> -               show(m, mark);
> +               show(m, mark, fh);
>                 if (seq_has_overflowed(m))
>                         break;
>         }
>         mutex_unlock(&group->mark_mutex);
> +       kfree(fh);
>  }
>
>  #if defined(CONFIG_EXPORTFS)
> -static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
> +static void show_mark_fhandle(struct seq_file *m, struct inode *inode,
> +                             struct fid *fhbuf)
>  {
> -       struct {
> -               struct file_handle handle;
> -               u8 pad[MAX_HANDLE_SZ];
> -       } f;
>         int size, ret, i;
> +       unsigned char *bytes;
>
> -       f.handle.handle_bytes = sizeof(f.pad);
> -       size = f.handle.handle_bytes >> 2;
> +       if (!fhbuf)
> +               return;
> +       size = MAX_HANDLE_SZ >> 2;
>
> -       ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
> +       ret = exportfs_encode_inode_fh(inode, fhbuf, &size, 0);
>         if ((ret == FILEID_INVALID) || (ret < 0)) {
>                 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);

This WARN_ONCE is out of order. It is perfectly valid for inotify/fanotify
to watch over fs that doesn't support exportfs. Care to clean it up?
Perhaps a pr_warn_ratelimited() for either !fhbuf or can't encode?

Cheers,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matthew Wilcox Dec. 7, 2017, 5:34 a.m. UTC | #2
On Thu, Dec 07, 2017 at 02:20:05PM +1100, NeilBrown wrote:
> We can allocate in fs/notify/fdinfo.c:show_fdinfo() which is
> the earliest 'notify' specific code to run.  There is no
> opportunity to return an error but GFP_KERNEL allocations under 1 page
> never fail..

"never"

 * The default allocator behavior depends on the request size. We have a concept
 * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER).
 * !costly allocations are too essential to fail so they are implicitly
 * non-failing by default (with some exceptions like OOM victims might fail so
 * the caller still has to check for failures) while costly requests try to be
 * not disruptive and back off even without invoking the OOM killer.
 * The following three modifiers might be used to override some of these
 * implicit rules

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
NeilBrown Dec. 8, 2017, 2:17 a.m. UTC | #3
On Thu, Dec 07 2017, Amir Goldstein wrote:

> On Thu, Dec 7, 2017 at 5:20 AM, NeilBrown <neilb@suse.com> wrote:
>> On Wed, Dec 06 2017, Linus Torvalds wrote:
>>
>>> On Thu, Nov 30, 2017 at 12:56 PM, NeilBrown <neilb@suse.com> wrote:
>>>>
>>>> -/* limit the handle size to NFSv4 handle size now */
>>>> -#define MAX_HANDLE_SZ 128
>>>> +/* Must be larger than NFSv4 file handle, but small
>>>> + * enough for an on-stack allocation. overlayfs doesn't
>>>> + * want this too close to 255.
>>>> + */
>>>> +#define MAX_HANDLE_SZ 200
>>>
>>> This really smells for so many reasons.
>>>
>>> Also, that really is starting to be a fairly big stack allocation, and
>>> it seems to be used in exactly one place (show_mark_fhandle), which
>>> makes me go "why is that on the stack anyway?".
>>>
>>> Could we just allocate a buffer at open time or something?
>>>
>>>                Linus
>>
>> "open time" would be when /proc/X/fdinfo/Y was opened in
>> seq_fdinfo_open(), and allocating a file_handle there seems a bit odd.
>>
>> We can allocate in fs/notify/fdinfo.c:show_fdinfo() which is
>> the earliest 'notify' specific code to run.  There is no
>> opportunity to return an error but GFP_KERNEL allocations under 1 page
>> never fail..
>>
>> This patch allocates a single buffer for all inodes reported for a given
>> inotify fdinfo, and if the allocation files, the filehandle is silently
>> left blank.  More surgery would be needed to be able to return an error.
>>
>> Is that at all suitable?
>>
>> Thanks,
>> NeilBrown
>>
>> From: NeilBrown <neilb@suse.com>
>> Subject: fs/notify: don't put file handle buffer on stack.
>>
>> A file handle buffer is not tiny, and could need to be larger in future,
>> so it isn't safe to allocate one on the stack.  Instead, we need to
>> kmalloc().
>>
>> There is no way to return an error status from a ->show_fdinfo()
>> function, so if the kmalloc fails, we silently exclude the filehandle
>> from the output.  As it is at the end of line, this shouldn't
>> upset parsing too much.
>
> It shouldn't upset parsing because that would be the same out
> output as without CONFIG_EXPORTFS. AFAIK this information
> is used by CRUI.
>
>>
>> Signed-off-by: NeilBrown <neilb@suse.com>
>>
>> diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
>> index d478629c728b..20d863b9ae16 100644
>> --- a/fs/notify/fdinfo.c
>> +++ b/fs/notify/fdinfo.c
>> @@ -23,56 +23,58 @@
>>
>>  static void show_fdinfo(struct seq_file *m, struct file *f,
>>                         void (*show)(struct seq_file *m,
>> -                                    struct fsnotify_mark *mark))
>> +                                    struct fsnotify_mark *mark,
>> +                                    struct fid *fh))
>>  {
>>         struct fsnotify_group *group = f->private_data;
>>         struct fsnotify_mark *mark;
>> +       struct fid *fh = kmalloc(MAX_HANDLE_SZ, GFP_KERNEL);
>>
>>         mutex_lock(&group->mark_mutex);
>>         list_for_each_entry(mark, &group->marks_list, g_list) {
>> -               show(m, mark);
>> +               show(m, mark, fh);
>>                 if (seq_has_overflowed(m))
>>                         break;
>>         }
>>         mutex_unlock(&group->mark_mutex);
>> +       kfree(fh);
>>  }
>>
>>  #if defined(CONFIG_EXPORTFS)
>> -static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
>> +static void show_mark_fhandle(struct seq_file *m, struct inode *inode,
>> +                             struct fid *fhbuf)
>>  {
>> -       struct {
>> -               struct file_handle handle;
>> -               u8 pad[MAX_HANDLE_SZ];
>> -       } f;
>>         int size, ret, i;
>> +       unsigned char *bytes;
>>
>> -       f.handle.handle_bytes = sizeof(f.pad);
>> -       size = f.handle.handle_bytes >> 2;
>> +       if (!fhbuf)
>> +               return;
>> +       size = MAX_HANDLE_SZ >> 2;
>>
>> -       ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
>> +       ret = exportfs_encode_inode_fh(inode, fhbuf, &size, 0);
>>         if ((ret == FILEID_INVALID) || (ret < 0)) {
>>                 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
>
> This WARN_ONCE is out of order. It is perfectly valid for inotify/fanotify
> to watch over fs that doesn't support exportfs. Care to clean it up?
> Perhaps a pr_warn_ratelimited() for either !fhbuf or can't encode?

If I were going to clean it up, I would need to do more than remove the
WARN_ONCE(), which almost certainly never fires.

exportfs_encode_inode_fh() should only be called if sb->s_export_op is
not NULL.
When it is NULL, it means that the filesystem doesn't support file
handles.
do_sys_name_to_handle() tests this, as does nfsd.  But this inotify code
doesn't.
So it can report a "file handle" for a file for which file handles
aren't supported.  It will use the default export_encode_fh which
reports i_ino and i_generation, which may or may not be stable or
meaningful.

So yes, this code does need a bit of cleaning up....

Thanks,
NeilBrown
diff mbox

Patch

diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index d478629c728b..20d863b9ae16 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -23,56 +23,58 @@ 
 
 static void show_fdinfo(struct seq_file *m, struct file *f,
 			void (*show)(struct seq_file *m,
-				     struct fsnotify_mark *mark))
+				     struct fsnotify_mark *mark,
+				     struct fid *fh))
 {
 	struct fsnotify_group *group = f->private_data;
 	struct fsnotify_mark *mark;
+	struct fid *fh = kmalloc(MAX_HANDLE_SZ, GFP_KERNEL);
 
 	mutex_lock(&group->mark_mutex);
 	list_for_each_entry(mark, &group->marks_list, g_list) {
-		show(m, mark);
+		show(m, mark, fh);
 		if (seq_has_overflowed(m))
 			break;
 	}
 	mutex_unlock(&group->mark_mutex);
+	kfree(fh);
 }
 
 #if defined(CONFIG_EXPORTFS)
-static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode,
+			      struct fid *fhbuf)
 {
-	struct {
-		struct file_handle handle;
-		u8 pad[MAX_HANDLE_SZ];
-	} f;
 	int size, ret, i;
+	unsigned char *bytes;
 
-	f.handle.handle_bytes = sizeof(f.pad);
-	size = f.handle.handle_bytes >> 2;
+	if (!fhbuf)
+		return;
+	size = MAX_HANDLE_SZ >> 2;
 
-	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
+	ret = exportfs_encode_inode_fh(inode, fhbuf, &size, 0);
 	if ((ret == FILEID_INVALID) || (ret < 0)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return;
 	}
 
-	f.handle.handle_type = ret;
-	f.handle.handle_bytes = size * sizeof(u32);
-
-	seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
-		   f.handle.handle_bytes, f.handle.handle_type);
+	seq_printf(m, "fhandle-bytes:%zx fhandle-type:%x f_handle:",
+		   size * sizeof(u32), ret);
 
-	for (i = 0; i < f.handle.handle_bytes; i++)
-		seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+	bytes = (unsigned char *)(fhbuf->raw);
+	for (i = 0; i < size * sizeof(u32); i++)
+		seq_printf(m, "%02x", bytes[i]);
 }
 #else
-static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
+static void show_mark_fhandle(struct seq_file *m, struct inode *inode,
+			      struct fid *fhbuf)
 {
 }
 #endif
 
 #ifdef CONFIG_INOTIFY_USER
 
-static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark,
+			   struct fid *fhbuf)
 {
 	struct inotify_inode_mark *inode_mark;
 	struct inode *inode;
@@ -93,7 +95,7 @@  static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
 			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
 			   mask, mark->ignored_mask);
-		show_mark_fhandle(m, inode);
+		show_mark_fhandle(m, inode, fhbuf);
 		seq_putc(m, '\n');
 		iput(inode);
 	}
@@ -108,7 +110,8 @@  void inotify_show_fdinfo(struct seq_file *m, struct file *f)
 
 #ifdef CONFIG_FANOTIFY
 
-static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark,
+			    struct fid *fhbuf)
 {
 	unsigned int mflags = 0;
 	struct inode *inode;
@@ -123,7 +126,7 @@  static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
 			   inode->i_ino, inode->i_sb->s_dev,
 			   mflags, mark->mask, mark->ignored_mask);
-		show_mark_fhandle(m, inode);
+		show_mark_fhandle(m, inode, fhbuf);
 		seq_putc(m, '\n');
 		iput(inode);
 	} else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {