diff mbox

WARNING in kill_block_super

Message ID 6c95e826-4b9f-fb21-b311-830411e58480@I-love.SAKURA.ne.jp (mailing list archive)
State New, archived
Headers show

Commit Message

Tetsuo Handa April 4, 2018, 10:53 a.m. UTC
Al and Michal, are you OK with this patch?


From bbc0d00935ebcb7e287403bae545fae9340830d9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 4 Apr 2018 12:19:42 +0900
Subject: [PATCH] mm,vmscan: Allow preallocating memory for register_shrinker().

syzbot is catching so many bugs triggered by commit 9ee332d99e4d5a97
("sget(): handle failures of register_shrinker()"). That commit expected
that calling kill_sb() from deactivate_locked_super() without successful
fill_super() is safe, but the reality was different; some callers assign
attributes which are needed for kill_sb() after sget() succeeds.

For example, [1] is a report where sb->s_mode (which seems to be either
FMODE_READ | FMODE_EXCL | FMODE_WRITE or FMODE_READ | FMODE_EXCL) is not
assigned unless sget() succeeds. But it is not worth complicating sget()
so that the register_shrinker() failure path can safely call
kill_block_super() via kill_sb(). Making alloc_super() fail if memory
allocation for register_shrinker() failed is much simpler. Let's avoid
calling deactivate_locked_super() from sget_userns() by preallocating
memory for the shrinker and making register_shrinker() in sget_userns()
never fail.

[1] https://syzkaller.appspot.com/bug?id=588996a25a2587be2e3a54e8646728fb9cae44e7

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+5a170e19c963a2e0df79@syzkaller.appspotmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Michal Hocko <mhocko@suse.com>
---
 fs/super.c               |  9 ++++-----
 include/linux/shrinker.h |  6 ++++--
 mm/vmscan.c              | 15 ++++++++++++++-
 3 files changed, 22 insertions(+), 8 deletions(-)

Comments

Michal Hocko April 6, 2018, 8:09 a.m. UTC | #1
On Wed 04-04-18 19:53:07, Tetsuo Handa wrote:
> Al and Michal, are you OK with this patch?

Maybe I've misunderstood, but hasn't Al explained [1] that the
appropriate fix is in the fs code?

[1] http://lkml.kernel.org/r/20180402143415.GC30522@ZenIV.linux.org.uk
Tetsuo Handa April 7, 2018, 5:55 a.m. UTC | #2
Michal Hocko wrote:
> On Wed 04-04-18 19:53:07, Tetsuo Handa wrote:
> > Al and Michal, are you OK with this patch?
> 
> Maybe I've misunderstood, but hasn't Al explained [1] that the
> appropriate fix is in the fs code?
> 
> [1] http://lkml.kernel.org/r/20180402143415.GC30522@ZenIV.linux.org.uk

Yes. But I wonder whether it is worth complicating sget() only for handling
kmalloc() failure.

----------------------------------------
static struct file_system_type fuseblk_fs_type = {
  .owner          = THIS_MODULE,
  .name           = "fuseblk",
  .mount          = fuse_mount_blk,
  .kill_sb        = fuse_kill_sb_blk,
  .fs_flags       = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
};

static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) {
  return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super) {
    fmode_t mode = FMODE_READ | FMODE_EXCL;
    if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE;
    s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, bdev) {
      return sget_userns(type, test, set, flags, user_ns, data) {
        s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
        err = register_shrinker(&s->s_shrink);
        if (err) {
          deactivate_locked_super(s) {
            fs->kill_sb(s) = fuse_kill_sb_blk(s) {
              kill_block_super(sb) {
                struct block_device *bdev = sb->s_bdev;
                fmode_t mode = sb->s_mode;
                WARN_ON_ONCE(!(mode & FMODE_EXCL)); // <= Unsafe because FMODE_EXCL is not yet set which will be set at
                blkdev_put(bdev, mode | FMODE_EXCL);
              }
            }
          }
          s = ERR_PTR(err);
        }
      }
    }
    /* If sget() succeeds then ... */
    s->s_mode = mode;                               // <= this location.
    error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
    if (error) {
      deactivate_locked_super(s) {
        fs->kill_sb(s) = fuse_kill_sb_blk(s) {
          kill_block_super(sb) {
            struct block_device *bdev = sb->s_bdev;
            fmode_t mode = sb->s_mode;
            WARN_ON_ONCE(!(mode & FMODE_EXCL));     // <= Safe because FMODE_EXCL already set.
            blkdev_put(bdev, mode | FMODE_EXCL);
          }
        }
      }
      goto error;
    }
    /* If sget() fails then ... */
    error = PTR_ERR(s);
    blkdev_put(bdev, mode);                         // <= Calls blkdev_put() after deactivate_locked_super() already called blkdev_put().
  }
}
----------------------------------------

mount_bdev() is not ready to call blkdev_put() from sget().
Do we want to pass "s->s_mode" to sget() which allocates "s" ?

I feel it is preposterous that a function which allocates memory for an object
requires some of its fields to be already initialized in order to call a destroy
function.

By splitting register_shrinker() into prepare_shrinker() which might fail and
register_shrinker_prepared() which will not fail, we can allow shrinker users
to allocate memory at object creation time. I wrote a patch which adds
__must_check to register_shrinker() and we keep that patch in linux-next.git,
but what we got is a fake change which does not implement proper error handling
(e.g.

  Commit 6c4ca1e36cdc1a0a ("bcache: check return value of register_shrinker")

        if (register_shrinker(&c->shrink))
                pr_warn("bcache: %s: could not register shrinker",
                                __func__);

). It is not trivial to undo an error at register_shrinker().
Allocating memory for the shrinker at the time memory for an object which
contains the shrinker is allocated is much easier to undo.
Al Viro April 11, 2018, 12:59 a.m. UTC | #3
On Wed, Apr 04, 2018 at 07:53:07PM +0900, Tetsuo Handa wrote:
> Al and Michal, are you OK with this patch?

First of all, it does *NOT* fix the problems with careless ->kill_sb().
The fuse-blk case is the only real rationale so far.  Said that,

> @@ -166,6 +166,7 @@ static void destroy_unused_super(struct super_block *s)
>  	security_sb_free(s);
>  	put_user_ns(s->s_user_ns);
>  	kfree(s->s_subtype);
> +	kfree(s->s_shrink.nr_deferred);

is probably better done with an inlined helper (fs/super.c has no business knowing
about ->nr_deferred name, and there probably will be other users of that
preallocation of yours).  And the same helper would be better off zeroing the
pointer, same as unregister_shrinker() does.


> -int register_shrinker(struct shrinker *shrinker)
> +int prepare_shrinker(struct shrinker *shrinker)

preallocate_shrinker(), perhaps?

> +int register_shrinker(struct shrinker *shrinker)
> +{
> +	int err = prepare_shrinker(shrinker);
> +
> +	if (err)
> +		return err;
> +	register_shrinker_prepared(shrinker);

	if (!err)
		register_....;
	return err;

would be better, IMO.
Al Viro April 11, 2018, 1:38 a.m. UTC | #4
On Wed, Apr 11, 2018 at 10:28:06AM +0900, Tetsuo Handa wrote:
> Al Viro wrote:
> > On Wed, Apr 04, 2018 at 07:53:07PM +0900, Tetsuo Handa wrote:
> > > Al and Michal, are you OK with this patch?
> > 
> > First of all, it does *NOT* fix the problems with careless ->kill_sb().
> > The fuse-blk case is the only real rationale so far.  Said that,
> > 
> 
> Please notice the one below as well. Fixing all careless ->kill_sb() will be too
> difficult to backport. For now, avoiding calling deactivate_locked_super() is
> safer.

How will that fix e.g. jffs2?

> [upstream] WARNING: refcount bug in put_pid_ns
> https://syzkaller.appspot.com/bug?id=17e202b4794da213570ba33ac2f70277ef1ce015

Should be fixed by 8e666cb33597 in that series, AFAICS.
diff mbox

Patch

diff --git a/fs/super.c b/fs/super.c
index 672538c..db00f67 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,7 @@  static void destroy_unused_super(struct super_block *s)
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
+	kfree(s->s_shrink.nr_deferred);
 	/* no delays needed */
 	destroy_super_work(&s->destroy_work);
 }
@@ -251,6 +252,8 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+	if (prepare_shrinker(&s->s_shrink))
+		goto fail;
 	return s;
 
 fail:
@@ -517,11 +520,7 @@  struct super_block *sget_userns(struct file_system_type *type,
 	hlist_add_head(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
-	err = register_shrinker(&s->s_shrink);
-	if (err) {
-		deactivate_locked_super(s);
-		s = ERR_PTR(err);
-	}
+	register_shrinker_prepared(&s->s_shrink);
 	return s;
 }
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 388ff29..2728918 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -75,6 +75,8 @@  struct shrinker {
 #define SHRINKER_NUMA_AWARE	(1 << 0)
 #define SHRINKER_MEMCG_AWARE	(1 << 1)
 
-extern int register_shrinker(struct shrinker *);
-extern void unregister_shrinker(struct shrinker *);
+extern int prepare_shrinker(struct shrinker *shrinker);
+extern void register_shrinker_prepared(struct shrinker *shrinker);
+extern int register_shrinker(struct shrinker *shrinker);
+extern void unregister_shrinker(struct shrinker *shrinker);
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd5dc3f..a10fe8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -258,7 +258,7 @@  unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
 /*
  * Add a shrinker callback to be called from the vm.
  */
-int register_shrinker(struct shrinker *shrinker)
+int prepare_shrinker(struct shrinker *shrinker)
 {
 	size_t size = sizeof(*shrinker->nr_deferred);
 
@@ -268,10 +268,23 @@  int register_shrinker(struct shrinker *shrinker)
 	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
 	if (!shrinker->nr_deferred)
 		return -ENOMEM;
+	return 0;
+}
 
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+}
+
+int register_shrinker(struct shrinker *shrinker)
+{
+	int err = prepare_shrinker(shrinker);
+
+	if (err)
+		return err;
+	register_shrinker_prepared(shrinker);
 	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);