diff mbox

[RFC] locks: fix file locking on overlayfs

Message ID 1468931264-25837-1-git-send-email-mszeredi@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Miklos Szeredi July 19, 2016, 12:27 p.m. UTC
This patch allows flock, posix locks, ofd locks and leases to work
correctly on overlayfs.

Instead of using the underlying inode for storing lock context use the
overlay inode.  This allows locks to be persistent across copy-up.

This is done by introducing locks_inode() helper and using it instead of
file_inode() to get the inode in locking code.  For non-overlayfs the two
are equivalent, except for an extra pointer dereference in locks_inode().

Since lock operations are in "struct file_operations" we must also make
sure not to call underlying filesystem's lock operations.  Introcude a
super block flag MS_NOREMOTELOCK to this effect.

Finally for correct operation of leases i_writecount too needs to be
modified on the overlay inode.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/file_table.c         |  2 +-
 fs/locks.c              | 50 +++++++++++++++++++++++++++----------------------
 fs/namespace.c          |  2 +-
 fs/open.c               |  9 +++++----
 fs/overlayfs/super.c    |  2 +-
 include/linux/fs.h      | 20 ++++++++++++++++----
 include/uapi/linux/fs.h |  1 +
 kernel/fork.c           |  2 +-
 mm/mmap.c               |  4 ++--
 9 files changed, 56 insertions(+), 36 deletions(-)

Comments

J. Bruce Fields July 19, 2016, 2:01 p.m. UTC | #1
On Tue, Jul 19, 2016 at 02:27:44PM +0200, Miklos Szeredi wrote:
> This patch allows flock, posix locks, ofd locks and leases to work
> correctly on overlayfs.
> 
> Instead of using the underlying inode for storing lock context use the
> overlay inode.  This allows locks to be persistent across copy-up.

Remind me when the overlay inode is created--is it immediately on any
(even read-only) open?

--b.

> 
> This is done by introducing locks_inode() helper and using it instead of
> file_inode() to get the inode in locking code.  For non-overlayfs the two
> are equivalent, except for an extra pointer dereference in locks_inode().
> 
> Since lock operations are in "struct file_operations" we must also make
> sure not to call underlying filesystem's lock operations.  Introcude a
> super block flag MS_NOREMOTELOCK to this effect.
> 
> Finally for correct operation of leases i_writecount too needs to be
> modified on the overlay inode.
> 
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/file_table.c         |  2 +-
>  fs/locks.c              | 50 +++++++++++++++++++++++++++----------------------
>  fs/namespace.c          |  2 +-
>  fs/open.c               |  9 +++++----
>  fs/overlayfs/super.c    |  2 +-
>  include/linux/fs.h      | 20 ++++++++++++++++----
>  include/uapi/linux/fs.h |  1 +
>  kernel/fork.c           |  2 +-
>  mm/mmap.c               |  4 ++--
>  9 files changed, 56 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/file_table.c b/fs/file_table.c
> index ad17e05ebf95..bb59284c220c 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -216,7 +216,7 @@ static void __fput(struct file *file)
>  	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
>  		i_readcount_dec(inode);
>  	if (file->f_mode & FMODE_WRITER) {
> -		put_write_access(inode);
> +		put_write_access(locks_inode(file));
>  		__mnt_drop_write(mnt);
>  	}
>  	file->f_path.dentry = NULL;
> diff --git a/fs/locks.c b/fs/locks.c
> index ee1b15f6fc13..c1656cff53ee 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -139,6 +139,11 @@
>  #define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
>  #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
>  
> +static inline bool is_remote_lock(struct file *filp)
> +{
> +	return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
> +}
> +
>  static bool lease_breaking(struct file_lock *fl)
>  {
>  	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
> @@ -791,7 +796,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>  {
>  	struct file_lock *cfl;
>  	struct file_lock_context *ctx;
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  
>  	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> @@ -1192,7 +1197,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
>  int posix_lock_file(struct file *filp, struct file_lock *fl,
>  			struct file_lock *conflock)
>  {
> -	return posix_lock_inode(file_inode(filp), fl, conflock);
> +	return posix_lock_inode(locks_inode(filp), fl, conflock);
>  }
>  EXPORT_SYMBOL(posix_lock_file);
>  
> @@ -1232,7 +1237,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
>  int locks_mandatory_locked(struct file *file)
>  {
>  	int ret;
> -	struct inode *inode = file_inode(file);
> +	struct inode *inode = locks_inode(file);
>  	struct file_lock_context *ctx;
>  	struct file_lock *fl;
>  
> @@ -1572,7 +1577,7 @@ EXPORT_SYMBOL(lease_get_mtime);
>  int fcntl_getlease(struct file *filp)
>  {
>  	struct file_lock *fl;
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  	struct file_lock_context *ctx;
>  	int type = F_UNLCK;
>  	LIST_HEAD(dispose);
> @@ -1580,7 +1585,7 @@ int fcntl_getlease(struct file *filp)
>  	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>  		spin_lock(&ctx->flc_lock);
> -		time_out_leases(file_inode(filp), &dispose);
> +		time_out_leases(inode, &dispose);
>  		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
>  			if (fl->fl_file != filp)
>  				continue;
> @@ -1628,7 +1633,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
>  {
>  	struct file_lock *fl, *my_fl = NULL, *lease;
>  	struct dentry *dentry = filp->f_path.dentry;
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = dentry->d_inode;
>  	struct file_lock_context *ctx;
>  	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
>  	int error;
> @@ -1742,7 +1747,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  {
>  	int error = -EAGAIN;
>  	struct file_lock *fl, *victim = NULL;
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  	struct file_lock_context *ctx;
>  	LIST_HEAD(dispose);
>  
> @@ -1782,7 +1787,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
>  			void **priv)
>  {
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  	int error;
>  
>  	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
> @@ -1830,7 +1835,7 @@ EXPORT_SYMBOL(generic_setlease);
>  int
>  vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
>  {
> -	if (filp->f_op->setlease)
> +	if (filp->f_op->setlease && is_remote_lock(filp))
>  		return filp->f_op->setlease(filp, arg, lease, priv);
>  	else
>  		return generic_setlease(filp, arg, lease, priv);
> @@ -1979,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
>  	if (error)
>  		goto out_free;
>  
> -	if (f.file->f_op->flock)
> +	if (f.file->f_op->flock && is_remote_lock(f.file))
>  		error = f.file->f_op->flock(f.file,
>  					  (can_sleep) ? F_SETLKW : F_SETLK,
>  					  lock);
> @@ -2005,7 +2010,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
>   */
>  int vfs_test_lock(struct file *filp, struct file_lock *fl)
>  {
> -	if (filp->f_op->lock)
> +	if (filp->f_op->lock && is_remote_lock(filp))
>  		return filp->f_op->lock(filp, F_GETLK, fl);
>  	posix_test_lock(filp, fl);
>  	return 0;
> @@ -2129,7 +2134,7 @@ out:
>   */
>  int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
>  {
> -	if (filp->f_op->lock)
> +	if (filp->f_op->lock && is_remote_lock(filp))
>  		return filp->f_op->lock(filp, cmd, fl);
>  	else
>  		return posix_lock_file(filp, fl, conf);
> @@ -2191,7 +2196,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
>  	if (file_lock == NULL)
>  		return -ENOLCK;
>  
> -	inode = file_inode(filp);
> +	inode = locks_inode(filp);
>  
>  	/*
>  	 * This might block, so we do it before checking the inode.
> @@ -2343,7 +2348,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
>  	if (copy_from_user(&flock, l, sizeof(flock)))
>  		goto out;
>  
> -	inode = file_inode(filp);
> +	inode = locks_inode(filp);
>  
>  	/* Don't allow mandatory locks on files that may be memory mapped
>  	 * and shared.
> @@ -2426,6 +2431,7 @@ out:
>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  {
>  	int error;
> +	struct inode *inode = locks_inode(filp);
>  	struct file_lock lock;
>  	struct file_lock_context *ctx;
>  
> @@ -2434,7 +2440,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  	 * posix_lock_file().  Another process could be setting a lock on this
>  	 * file at the same time, but we wouldn't remove that lock anyway.
>  	 */
> -	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> +	ctx =  smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty(&ctx->flc_posix))
>  		return;
>  
> @@ -2452,7 +2458,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  
>  	if (lock.fl_ops && lock.fl_ops->fl_release_private)
>  		lock.fl_ops->fl_release_private(&lock);
> -	trace_locks_remove_posix(file_inode(filp), &lock, error);
> +	trace_locks_remove_posix(inode, &lock, error);
>  }
>  
>  EXPORT_SYMBOL(locks_remove_posix);
> @@ -2469,12 +2475,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>  		.fl_type = F_UNLCK,
>  		.fl_end = OFFSET_MAX,
>  	};
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  
>  	if (list_empty(&flctx->flc_flock))
>  		return;
>  
> -	if (filp->f_op->flock)
> +	if (filp->f_op->flock && is_remote_lock(filp))
>  		filp->f_op->flock(filp, F_SETLKW, &fl);
>  	else
>  		flock_lock_inode(inode, &fl);
> @@ -2508,7 +2514,7 @@ void locks_remove_file(struct file *filp)
>  {
>  	struct file_lock_context *ctx;
>  
> -	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> +	ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
>  	if (!ctx)
>  		return;
>  
> @@ -2552,7 +2558,7 @@ EXPORT_SYMBOL(posix_unblock_lock);
>   */
>  int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
>  {
> -	if (filp->f_op->lock)
> +	if (filp->f_op->lock && is_remote_lock(filp))
>  		return filp->f_op->lock(filp, F_CANCELLK, fl);
>  	return 0;
>  }
> @@ -2580,7 +2586,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
>  		fl_pid = fl->fl_pid;
>  
>  	if (fl->fl_file != NULL)
> -		inode = file_inode(fl->fl_file);
> +		inode = locks_inode(fl->fl_file);
>  
>  	seq_printf(f, "%lld:%s ", id, pfx);
>  	if (IS_POSIX(fl)) {
> @@ -2682,7 +2688,7 @@ static void __show_fd_locks(struct seq_file *f,
>  void show_fd_locks(struct seq_file *f,
>  		  struct file *filp, struct files_struct *files)
>  {
> -	struct inode *inode = file_inode(filp);
> +	struct inode *inode = locks_inode(filp);
>  	struct file_lock_context *ctx;
>  	int id = 0;
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 419f746d851d..05daf3f98d86 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2722,7 +2722,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>  
>  	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
>  		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
> -		   MS_STRICTATIME);
> +		   MS_STRICTATIME | MS_NOREMOTELOCK);
>  
>  	if (flags & MS_REMOUNT)
>  		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
> diff --git a/fs/open.c b/fs/open.c
> index bf66cf1a9f5c..48f38e33e95b 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -685,6 +685,7 @@ static int do_dentry_open(struct file *f,
>  			  const struct cred *cred)
>  {
>  	static const struct file_operations empty_fops = {};
> +	struct inode *lkinode = locks_inode(f);
>  	int error;
>  
>  	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
> @@ -701,12 +702,12 @@ static int do_dentry_open(struct file *f,
>  	}
>  
>  	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
> -		error = get_write_access(inode);
> +		error = get_write_access(lkinode);
>  		if (unlikely(error))
>  			goto cleanup_file;
>  		error = __mnt_want_write(f->f_path.mnt);
>  		if (unlikely(error)) {
> -			put_write_access(inode);
> +			put_write_access(lkinode);
>  			goto cleanup_file;
>  		}
>  		f->f_mode |= FMODE_WRITER;
> @@ -726,7 +727,7 @@ static int do_dentry_open(struct file *f,
>  	if (error)
>  		goto cleanup_all;
>  
> -	error = break_lease(inode, f->f_flags);
> +	error = break_lease(lkinode, f->f_flags);
>  	if (error)
>  		goto cleanup_all;
>  
> @@ -755,7 +756,7 @@ static int do_dentry_open(struct file *f,
>  cleanup_all:
>  	fops_put(f->f_op);
>  	if (f->f_mode & FMODE_WRITER) {
> -		put_write_access(inode);
> +		put_write_access(lkinode);
>  		__mnt_drop_write(f->f_path.mnt);
>  	}
>  cleanup_file:
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index ed4aa34211a6..4c91d9ed4689 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -1280,7 +1280,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
>  		sb->s_xattr = ovl_xattr_noacl_handlers;
>  	sb->s_root = root_dentry;
>  	sb->s_fs_info = ufs;
> -	sb->s_flags |= MS_POSIXACL;
> +	sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
>  
>  	return 0;
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index bacc0733663c..be919d18fa6a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1090,6 +1090,18 @@ struct file_lock_context {
>  
>  extern void send_sigio(struct fown_struct *fown, int fd, int band);
>  
> +/*
> + * Return the inode to use for locking
> + *
> + * For overlayfs this should be the overlay inode, not the real inode returned
> + * by file_inode().  For any other fs file_inode(filp) and locks_inode(filp) are
> + * equal.
> + */
> +static inline struct inode *locks_inode(const struct file *f)
> +{
> +	return f->f_path.dentry->d_inode;
> +}
> +
>  #ifdef CONFIG_FILE_LOCKING
>  extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *);
>  extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
> @@ -1277,7 +1289,7 @@ static inline struct dentry *file_dentry(const struct file *file)
>  
>  static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
>  {
> -	return locks_lock_inode_wait(file_inode(filp), fl);
> +	return locks_lock_inode_wait(locks_inode(filp), fl);
>  }
>  
>  struct fasync_struct {
> @@ -2132,7 +2144,7 @@ static inline int mandatory_lock(struct inode *ino)
>  
>  static inline int locks_verify_locked(struct file *file)
>  {
> -	if (mandatory_lock(file_inode(file)))
> +	if (mandatory_lock(locks_inode(file)))
>  		return locks_mandatory_locked(file);
>  	return 0;
>  }
> @@ -2584,7 +2596,7 @@ static inline int get_write_access(struct inode *inode)
>  }
>  static inline int deny_write_access(struct file *file)
>  {
> -	struct inode *inode = file_inode(file);
> +	struct inode *inode = locks_inode(file);
>  	return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
>  }
>  static inline void put_write_access(struct inode * inode)
> @@ -2594,7 +2606,7 @@ static inline void put_write_access(struct inode * inode)
>  static inline void allow_write_access(struct file *file)
>  {
>  	if (file)
> -		atomic_inc(&file_inode(file)->i_writecount);
> +		atomic_inc(&locks_inode(file)->i_writecount);
>  }
>  static inline bool inode_is_open_for_write(const struct inode *inode)
>  {
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 3b00f7c8943f..2473272169f2 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -132,6 +132,7 @@ struct inodes_stat_t {
>  #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
>  
>  /* These sb flags are internal to the kernel */
> +#define MS_NOREMOTELOCK	(1<<27)
>  #define MS_NOSEC	(1<<28)
>  #define MS_BORN		(1<<29)
>  #define MS_ACTIVE	(1<<30)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4a7ec0c6c88c..104d687f63f1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -476,7 +476,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
>  		file = tmp->vm_file;
>  		if (file) {
> -			struct inode *inode = file_inode(file);
> +			struct inode *inode = locks_inode(file);
>  			struct address_space *mapping = file->f_mapping;
>  
>  			get_file(file);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index de2c1769cc68..a023caff19d5 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -126,7 +126,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
>  		struct file *file, struct address_space *mapping)
>  {
>  	if (vma->vm_flags & VM_DENYWRITE)
> -		atomic_inc(&file_inode(file)->i_writecount);
> +		atomic_inc(&locks_inode(file)->i_writecount);
>  	if (vma->vm_flags & VM_SHARED)
>  		mapping_unmap_writable(mapping);
>  
> @@ -537,7 +537,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
>  		struct address_space *mapping = file->f_mapping;
>  
>  		if (vma->vm_flags & VM_DENYWRITE)
> -			atomic_dec(&file_inode(file)->i_writecount);
> +			atomic_dec(&locks_inode(file)->i_writecount);
>  		if (vma->vm_flags & VM_SHARED)
>  			atomic_inc(&mapping->i_mmap_writable);
>  
> -- 
> 2.5.5
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi July 19, 2016, 2:46 p.m. UTC | #2
On Tue, Jul 19, 2016 at 4:01 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Tue, Jul 19, 2016 at 02:27:44PM +0200, Miklos Szeredi wrote:
>> This patch allows flock, posix locks, ofd locks and leases to work
>> correctly on overlayfs.
>>
>> Instead of using the underlying inode for storing lock context use the
>> overlay inode.  This allows locks to be persistent across copy-up.
>
> Remind me when the overlay inode is created--is it immediately on any
> (even read-only) open?

So basically overlay has three types of inodes:  lower, upper (these
are called underlying or real inodes) and overlay inode.

Overlay inode is created on lookup, just like any other filesystem.
Overlayfs's own lookup then it proceeds to look up underlying dentry
and stores ref in the overlay dentry.

Copy-up happens on read-write open (or other modifying operation),
which creates inode on upper and copies data/metadata from lower inode
to upper inode.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields July 19, 2016, 3:36 p.m. UTC | #3
On Tue, Jul 19, 2016 at 04:46:14PM +0200, Miklos Szeredi wrote:
> On Tue, Jul 19, 2016 at 4:01 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > On Tue, Jul 19, 2016 at 02:27:44PM +0200, Miklos Szeredi wrote:
> >> This patch allows flock, posix locks, ofd locks and leases to work
> >> correctly on overlayfs.
> >>
> >> Instead of using the underlying inode for storing lock context use the
> >> overlay inode.  This allows locks to be persistent across copy-up.
> >
> > Remind me when the overlay inode is created--is it immediately on any
> > (even read-only) open?
> 
> So basically overlay has three types of inodes:  lower, upper (these
> are called underlying or real inodes) and overlay inode.
> 
> Overlay inode is created on lookup, just like any other filesystem.
> Overlayfs's own lookup then it proceeds to look up underlying dentry
> and stores ref in the overlay dentry.
> 
> Copy-up happens on read-write open (or other modifying operation),
> which creates inode on upper and copies data/metadata from lower inode
> to upper inode.

Thanks for the explanation. So, if I understand correctly:

Locking should look completely normal as long as all locking is done
through the overlay.

Locks may fail to conflict when expected if one of the overlayed layers
is accessed elsewhere somehow--but that's a reasonably easy rule to
document, and consistent with other overlayfs behavior.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Layton July 19, 2016, 6:01 p.m. UTC | #4
On Tue, 2016-07-19 at 14:27 +0200, Miklos Szeredi wrote:
> This patch allows flock, posix locks, ofd locks and leases to work
> correctly on overlayfs.
> 
> Instead of using the underlying inode for storing lock context use the
> overlay inode.  This allows locks to be persistent across copy-up.
> 
> This is done by introducing locks_inode() helper and using it instead of
> file_inode() to get the inode in locking code.  For non-overlayfs the two
> are equivalent, except for an extra pointer dereference in locks_inode().
> 
> Since lock operations are in "struct file_operations" we must also make
> sure not to call underlying filesystem's lock operations.  Introcude a
> super block flag MS_NOREMOTELOCK to this effect.
> 
> Finally for correct operation of leases i_writecount too needs to be
> modified on the overlay inode.
> 
> > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/file_table.c         |  2 +-
>  fs/locks.c              | 50 +++++++++++++++++++++++++++----------------------
>  fs/namespace.c          |  2 +-
>  fs/open.c               |  9 +++++----
>  fs/overlayfs/super.c    |  2 +-
>  include/linux/fs.h      | 20 ++++++++++++++++----
>  include/uapi/linux/fs.h |  1 +
>  kernel/fork.c           |  2 +-
>  mm/mmap.c               |  4 ++--
>  9 files changed, 56 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/file_table.c b/fs/file_table.c
> index ad17e05ebf95..bb59284c220c 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -216,7 +216,7 @@ static void __fput(struct file *file)
> >  	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
> >  		i_readcount_dec(inode);
> >  	if (file->f_mode & FMODE_WRITER) {
> > -		put_write_access(inode);
> > +		put_write_access(locks_inode(file));
> >  		__mnt_drop_write(mnt);
> >  	}
> >  	file->f_path.dentry = NULL;
> diff --git a/fs/locks.c b/fs/locks.c
> index ee1b15f6fc13..c1656cff53ee 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -139,6 +139,11 @@
> >  #define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
> >  #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
>  
> +static inline bool is_remote_lock(struct file *filp)
> +{
> > +	return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
> +}
> +
>  static bool lease_breaking(struct file_lock *fl)
>  {
> >  	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
> @@ -791,7 +796,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>  {
> >  	struct file_lock *cfl;
> >  	struct file_lock_context *ctx;
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
>  
> >  	ctx = smp_load_acquire(&inode->i_flctx);
> >  	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> @@ -1192,7 +1197,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
>  int posix_lock_file(struct file *filp, struct file_lock *fl,
> >  			struct file_lock *conflock)
>  {
> > -	return posix_lock_inode(file_inode(filp), fl, conflock);
> > +	return posix_lock_inode(locks_inode(filp), fl, conflock);
>  }
>  EXPORT_SYMBOL(posix_lock_file);
>  
> @@ -1232,7 +1237,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
>  int locks_mandatory_locked(struct file *file)
>  {
> >  	int ret;
> > -	struct inode *inode = file_inode(file);
> > +	struct inode *inode = locks_inode(file);
> >  	struct file_lock_context *ctx;
> >  	struct file_lock *fl;
>  
> @@ -1572,7 +1577,7 @@ EXPORT_SYMBOL(lease_get_mtime);
>  int fcntl_getlease(struct file *filp)
>  {
> >  	struct file_lock *fl;
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
> >  	struct file_lock_context *ctx;
> >  	int type = F_UNLCK;
> >  	LIST_HEAD(dispose);
> @@ -1580,7 +1585,7 @@ int fcntl_getlease(struct file *filp)
> >  	ctx = smp_load_acquire(&inode->i_flctx);
> >  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >  		spin_lock(&ctx->flc_lock);
> > -		time_out_leases(file_inode(filp), &dispose);
> > +		time_out_leases(inode, &dispose);
> >  		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
> >  			if (fl->fl_file != filp)
> >  				continue;
> @@ -1628,7 +1633,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
>  {
> >  	struct file_lock *fl, *my_fl = NULL, *lease;
> >  	struct dentry *dentry = filp->f_path.dentry;
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = dentry->d_inode;
> >  	struct file_lock_context *ctx;
> >  	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
> >  	int error;
> @@ -1742,7 +1747,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  {
> >  	int error = -EAGAIN;
> >  	struct file_lock *fl, *victim = NULL;
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
> >  	struct file_lock_context *ctx;
> >  	LIST_HEAD(dispose);
>  
> @@ -1782,7 +1787,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
> >  			void **priv)
>  {
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
> >  	int error;
>  
> >  	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
> @@ -1830,7 +1835,7 @@ EXPORT_SYMBOL(generic_setlease);
>  int
>  vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
>  {
> > -	if (filp->f_op->setlease)
> > +	if (filp->f_op->setlease && is_remote_lock(filp))
> >  		return filp->f_op->setlease(filp, arg, lease, priv);
> >  	else
> >  		return generic_setlease(filp, arg, lease, priv);
> @@ -1979,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
> >  	if (error)
> >  		goto out_free;
>  
> > -	if (f.file->f_op->flock)
> > +	if (f.file->f_op->flock && is_remote_lock(f.file))
> >  		error = f.file->f_op->flock(f.file,
> >  					  (can_sleep) ? F_SETLKW : F_SETLK,
> >  					  lock);
> @@ -2005,7 +2010,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
>   */
>  int vfs_test_lock(struct file *filp, struct file_lock *fl)
>  {
> > -	if (filp->f_op->lock)
> > +	if (filp->f_op->lock && is_remote_lock(filp))
> >  		return filp->f_op->lock(filp, F_GETLK, fl);
> >  	posix_test_lock(filp, fl);
> >  	return 0;
> @@ -2129,7 +2134,7 @@ out:
>   */
>  int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
>  {
> > -	if (filp->f_op->lock)
> > +	if (filp->f_op->lock && is_remote_lock(filp))
> >  		return filp->f_op->lock(filp, cmd, fl);
> >  	else
> >  		return posix_lock_file(filp, fl, conf);
> @@ -2191,7 +2196,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
> >  	if (file_lock == NULL)
> >  		return -ENOLCK;
>  
> > -	inode = file_inode(filp);
> > +	inode = locks_inode(filp);
>  
> >  	/*
> >  	 * This might block, so we do it before checking the inode.
> @@ -2343,7 +2348,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
> >  	if (copy_from_user(&flock, l, sizeof(flock)))
> >  		goto out;
>  
> > -	inode = file_inode(filp);
> > +	inode = locks_inode(filp);
>  
> >  	/* Don't allow mandatory locks on files that may be memory mapped
> >  	 * and shared.
> @@ -2426,6 +2431,7 @@ out:
>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  {
> >  	int error;
> > +	struct inode *inode = locks_inode(filp);
> >  	struct file_lock lock;
> >  	struct file_lock_context *ctx;
>  
> @@ -2434,7 +2440,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >  	 * posix_lock_file().  Another process could be setting a lock on this
> >  	 * file at the same time, but we wouldn't remove that lock anyway.
> >  	 */
> > -	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> > +	ctx =  smp_load_acquire(&inode->i_flctx);
> >  	if (!ctx || list_empty(&ctx->flc_posix))
> >  		return;
>  
> @@ -2452,7 +2458,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  
> >  	if (lock.fl_ops && lock.fl_ops->fl_release_private)
> >  		lock.fl_ops->fl_release_private(&lock);
> > -	trace_locks_remove_posix(file_inode(filp), &lock, error);
> > +	trace_locks_remove_posix(inode, &lock, error);
>  }
>  
>  EXPORT_SYMBOL(locks_remove_posix);
> @@ -2469,12 +2475,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >  		.fl_type = F_UNLCK,
> >  		.fl_end = OFFSET_MAX,
> >  	};
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
>  
> >  	if (list_empty(&flctx->flc_flock))
> >  		return;
>  
> > -	if (filp->f_op->flock)
> > +	if (filp->f_op->flock && is_remote_lock(filp))
> >  		filp->f_op->flock(filp, F_SETLKW, &fl);
> >  	else
> >  		flock_lock_inode(inode, &fl);
> @@ -2508,7 +2514,7 @@ void locks_remove_file(struct file *filp)
>  {
> >  	struct file_lock_context *ctx;
>  
> > -	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> > +	ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
> >  	if (!ctx)
> >  		return;
>  
> @@ -2552,7 +2558,7 @@ EXPORT_SYMBOL(posix_unblock_lock);
>   */
>  int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
>  {
> > -	if (filp->f_op->lock)
> > +	if (filp->f_op->lock && is_remote_lock(filp))
> >  		return filp->f_op->lock(filp, F_CANCELLK, fl);
> >  	return 0;
>  }
> @@ -2580,7 +2586,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
> >  		fl_pid = fl->fl_pid;
>  
> >  	if (fl->fl_file != NULL)
> > -		inode = file_inode(fl->fl_file);
> > +		inode = locks_inode(fl->fl_file);
>  
> >  	seq_printf(f, "%lld:%s ", id, pfx);
> >  	if (IS_POSIX(fl)) {
> @@ -2682,7 +2688,7 @@ static void __show_fd_locks(struct seq_file *f,
>  void show_fd_locks(struct seq_file *f,
> >  		  struct file *filp, struct files_struct *files)
>  {
> > -	struct inode *inode = file_inode(filp);
> > +	struct inode *inode = locks_inode(filp);
> >  	struct file_lock_context *ctx;
> >  	int id = 0;
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 419f746d851d..05daf3f98d86 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2722,7 +2722,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>  
> >  	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
> >  		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
> > -		   MS_STRICTATIME);
> > +		   MS_STRICTATIME | MS_NOREMOTELOCK);
>  
> >  	if (flags & MS_REMOUNT)
> >  		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
> diff --git a/fs/open.c b/fs/open.c
> index bf66cf1a9f5c..48f38e33e95b 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -685,6 +685,7 @@ static int do_dentry_open(struct file *f,
> >  			  const struct cred *cred)
>  {
> >  	static const struct file_operations empty_fops = {};
> > +	struct inode *lkinode = locks_inode(f);
> >  	int error;
>  
> >  	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
> @@ -701,12 +702,12 @@ static int do_dentry_open(struct file *f,
> >  	}
>  
> >  	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
> > -		error = get_write_access(inode);
> > +		error = get_write_access(lkinode);
> >  		if (unlikely(error))
> >  			goto cleanup_file;
> >  		error = __mnt_want_write(f->f_path.mnt);
> >  		if (unlikely(error)) {
> > -			put_write_access(inode);
> > +			put_write_access(lkinode);
> >  			goto cleanup_file;
> >  		}
> >  		f->f_mode |= FMODE_WRITER;
> @@ -726,7 +727,7 @@ static int do_dentry_open(struct file *f,
> >  	if (error)
> >  		goto cleanup_all;
>  
> > -	error = break_lease(inode, f->f_flags);
> > +	error = break_lease(lkinode, f->f_flags);
> >  	if (error)
> >  		goto cleanup_all;
>  
> @@ -755,7 +756,7 @@ static int do_dentry_open(struct file *f,
>  cleanup_all:
> >  	fops_put(f->f_op);
> >  	if (f->f_mode & FMODE_WRITER) {
> > -		put_write_access(inode);
> > +		put_write_access(lkinode);
> >  		__mnt_drop_write(f->f_path.mnt);
> >  	}
>  cleanup_file:
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index ed4aa34211a6..4c91d9ed4689 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -1280,7 +1280,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
> >  		sb->s_xattr = ovl_xattr_noacl_handlers;
> >  	sb->s_root = root_dentry;
> >  	sb->s_fs_info = ufs;
> > -	sb->s_flags |= MS_POSIXACL;
> > +	sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
>  
> >  	return 0;
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index bacc0733663c..be919d18fa6a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1090,6 +1090,18 @@ struct file_lock_context {
>  
>  extern void send_sigio(struct fown_struct *fown, int fd, int band);
>  
> +/*
> + * Return the inode to use for locking
> + *
> + * For overlayfs this should be the overlay inode, not the real inode returned
> + * by file_inode().  For any other fs file_inode(filp) and locks_inode(filp) are
> + * equal.
> + */
> +static inline struct inode *locks_inode(const struct file *f)
> +{
> > +	return f->f_path.dentry->d_inode;
> +}
> +
>  #ifdef CONFIG_FILE_LOCKING
>  extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *);
>  extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
> @@ -1277,7 +1289,7 @@ static inline struct dentry *file_dentry(const struct file *file)
>  
>  static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
>  {
> > -	return locks_lock_inode_wait(file_inode(filp), fl);
> > +	return locks_lock_inode_wait(locks_inode(filp), fl);
>  }
>  
>  struct fasync_struct {
> @@ -2132,7 +2144,7 @@ static inline int mandatory_lock(struct inode *ino)
>  
>  static inline int locks_verify_locked(struct file *file)
>  {
> > -	if (mandatory_lock(file_inode(file)))
> > +	if (mandatory_lock(locks_inode(file)))
> >  		return locks_mandatory_locked(file);
> >  	return 0;
>  }
> @@ -2584,7 +2596,7 @@ static inline int get_write_access(struct inode *inode)
>  }
>  static inline int deny_write_access(struct file *file)
>  {
> > -	struct inode *inode = file_inode(file);
> > +	struct inode *inode = locks_inode(file);
> >  	return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
>  }
>  static inline void put_write_access(struct inode * inode)
> @@ -2594,7 +2606,7 @@ static inline void put_write_access(struct inode * inode)
>  static inline void allow_write_access(struct file *file)
>  {
> >  	if (file)
> > -		atomic_inc(&file_inode(file)->i_writecount);
> > +		atomic_inc(&locks_inode(file)->i_writecount);
>  }
>  static inline bool inode_is_open_for_write(const struct inode *inode)
>  {
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 3b00f7c8943f..2473272169f2 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -132,6 +132,7 @@ struct inodes_stat_t {
> >  #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
>  
>  /* These sb flags are internal to the kernel */
> > +#define MS_NOREMOTELOCK	(1<<27)
> >  #define MS_NOSEC	(1<<28)
> >  #define MS_BORN		(1<<29)
> >  #define MS_ACTIVE	(1<<30)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4a7ec0c6c88c..104d687f63f1 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -476,7 +476,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> >  		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> >  		file = tmp->vm_file;
> >  		if (file) {
> > -			struct inode *inode = file_inode(file);
> > +			struct inode *inode = locks_inode(file);
> >  			struct address_space *mapping = file->f_mapping;
>  
> > >  			get_file(file);
> > diff --git a/mm/mmap.c b/mm/mmap.c
> index de2c1769cc68..a023caff19d5 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -126,7 +126,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> >  		struct file *file, struct address_space *mapping)
>  {
> >  	if (vma->vm_flags & VM_DENYWRITE)
> > -		atomic_inc(&file_inode(file)->i_writecount);
> > +		atomic_inc(&locks_inode(file)->i_writecount);
> >  	if (vma->vm_flags & VM_SHARED)
> >  		mapping_unmap_writable(mapping);
> >  

Not sure about this bit with the i_writecount, as it's used for other
things besides file locking. Could this cause problems when accessing
the writable layer while the overlay is active? ISTR that the openwrt
backup instructions have you do exactly that when overlayfs is used.

Other than that, this looks pretty reasonable to me.

> > @@ -537,7 +537,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
> >  		struct address_space *mapping = file->f_mapping;
>  
> >  		if (vma->vm_flags & VM_DENYWRITE)
> > -			atomic_dec(&file_inode(file)->i_writecount);
> > +			atomic_dec(&locks_inode(file)->i_writecount);
> >  		if (vma->vm_flags & VM_SHARED)
> >  			atomic_inc(&mapping->i_mmap_writable);
>
Miklos Szeredi July 20, 2016, 4:08 a.m. UTC | #5
On Tue, Jul 19, 2016 at 8:01 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Tue, 2016-07-19 at 14:27 +0200, Miklos Szeredi wrote:

>> > diff --git a/mm/mmap.c b/mm/mmap.c
>> index de2c1769cc68..a023caff19d5 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -126,7 +126,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
>> >             struct file *file, struct address_space *mapping)
>>  {
>> >     if (vma->vm_flags & VM_DENYWRITE)
>> > -           atomic_inc(&file_inode(file)->i_writecount);
>> > +           atomic_inc(&locks_inode(file)->i_writecount);
>> >     if (vma->vm_flags & VM_SHARED)
>> >             mapping_unmap_writable(mapping);
>> >
>
> Not sure about this bit with the i_writecount, as it's used for other
> things besides file locking. Could this cause problems when accessing
> the writable layer while the overlay is active? ISTR that the openwrt
> backup instructions have you do exactly that when overlayfs is used.

Hmm,  We could get write access on upper layer only.  That's trivial
for open (it was done that way previously) but needs some thought for
truncate(2).

What we want for truncate is copy up to happen before
get_write_access().  It's simple enough with

  get_write_access(d_inode(d_real(dentry, NULL, O_WRONLY)));

plus error handling.  Problem with this is if something fails after
that, then copy-up was done needlessly.  E.g. if break_lease() was
interrupted.  Probably not a big deal in practice.

The other thing is what happens if there's a denywrite on a lower file
that is then opened for write or truncated.  With the current patch
get_write_access() wil fail.  With the above modification it will
succeed.  Either behavior is acceptable, considering that the copy-up
does actually create a different file, so the old, denywrite mapping
won't be touched.

So I'm inclined to go with this approach to prevent issues with access
to underlying layers while overlay is active.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/file_table.c b/fs/file_table.c
index ad17e05ebf95..bb59284c220c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -216,7 +216,7 @@  static void __fput(struct file *file)
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_dec(inode);
 	if (file->f_mode & FMODE_WRITER) {
-		put_write_access(inode);
+		put_write_access(locks_inode(file));
 		__mnt_drop_write(mnt);
 	}
 	file->f_path.dentry = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index ee1b15f6fc13..c1656cff53ee 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -139,6 +139,11 @@ 
 #define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
 #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
 
+static inline bool is_remote_lock(struct file *filp)
+{
+	return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK));
+}
+
 static bool lease_breaking(struct file_lock *fl)
 {
 	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
@@ -791,7 +796,7 @@  posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 	struct file_lock_context *ctx;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -1192,7 +1197,7 @@  static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 int posix_lock_file(struct file *filp, struct file_lock *fl,
 			struct file_lock *conflock)
 {
-	return posix_lock_inode(file_inode(filp), fl, conflock);
+	return posix_lock_inode(locks_inode(filp), fl, conflock);
 }
 EXPORT_SYMBOL(posix_lock_file);
 
@@ -1232,7 +1237,7 @@  static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 int locks_mandatory_locked(struct file *file)
 {
 	int ret;
-	struct inode *inode = file_inode(file);
+	struct inode *inode = locks_inode(file);
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
@@ -1572,7 +1577,7 @@  EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	int type = F_UNLCK;
 	LIST_HEAD(dispose);
@@ -1580,7 +1585,7 @@  int fcntl_getlease(struct file *filp)
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
-		time_out_leases(file_inode(filp), &dispose);
+		time_out_leases(inode, &dispose);
 		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
 			if (fl->fl_file != filp)
 				continue;
@@ -1628,7 +1633,7 @@  generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 {
 	struct file_lock *fl, *my_fl = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = dentry->d_inode;
 	struct file_lock_context *ctx;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
@@ -1742,7 +1747,7 @@  static int generic_delete_lease(struct file *filp, void *owner)
 {
 	int error = -EAGAIN;
 	struct file_lock *fl, *victim = NULL;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
 
@@ -1782,7 +1787,7 @@  static int generic_delete_lease(struct file *filp, void *owner)
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 			void **priv)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 	int error;
 
 	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -1830,7 +1835,7 @@  EXPORT_SYMBOL(generic_setlease);
 int
 vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
-	if (filp->f_op->setlease)
+	if (filp->f_op->setlease && is_remote_lock(filp))
 		return filp->f_op->setlease(filp, arg, lease, priv);
 	else
 		return generic_setlease(filp, arg, lease, priv);
@@ -1979,7 +1984,7 @@  SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	if (error)
 		goto out_free;
 
-	if (f.file->f_op->flock)
+	if (f.file->f_op->flock && is_remote_lock(f.file))
 		error = f.file->f_op->flock(f.file,
 					  (can_sleep) ? F_SETLKW : F_SETLK,
 					  lock);
@@ -2005,7 +2010,7 @@  SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
  */
 int vfs_test_lock(struct file *filp, struct file_lock *fl)
 {
-	if (filp->f_op->lock)
+	if (filp->f_op->lock && is_remote_lock(filp))
 		return filp->f_op->lock(filp, F_GETLK, fl);
 	posix_test_lock(filp, fl);
 	return 0;
@@ -2129,7 +2134,7 @@  out:
  */
 int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
 {
-	if (filp->f_op->lock)
+	if (filp->f_op->lock && is_remote_lock(filp))
 		return filp->f_op->lock(filp, cmd, fl);
 	else
 		return posix_lock_file(filp, fl, conf);
@@ -2191,7 +2196,7 @@  int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (file_lock == NULL)
 		return -ENOLCK;
 
-	inode = file_inode(filp);
+	inode = locks_inode(filp);
 
 	/*
 	 * This might block, so we do it before checking the inode.
@@ -2343,7 +2348,7 @@  int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (copy_from_user(&flock, l, sizeof(flock)))
 		goto out;
 
-	inode = file_inode(filp);
+	inode = locks_inode(filp);
 
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
@@ -2426,6 +2431,7 @@  out:
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
 	int error;
+	struct inode *inode = locks_inode(filp);
 	struct file_lock lock;
 	struct file_lock_context *ctx;
 
@@ -2434,7 +2440,7 @@  void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
-	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
+	ctx =  smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty(&ctx->flc_posix))
 		return;
 
@@ -2452,7 +2458,7 @@  void locks_remove_posix(struct file *filp, fl_owner_t owner)
 
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
-	trace_locks_remove_posix(file_inode(filp), &lock, error);
+	trace_locks_remove_posix(inode, &lock, error);
 }
 
 EXPORT_SYMBOL(locks_remove_posix);
@@ -2469,12 +2475,12 @@  locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 		.fl_type = F_UNLCK,
 		.fl_end = OFFSET_MAX,
 	};
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 
 	if (list_empty(&flctx->flc_flock))
 		return;
 
-	if (filp->f_op->flock)
+	if (filp->f_op->flock && is_remote_lock(filp))
 		filp->f_op->flock(filp, F_SETLKW, &fl);
 	else
 		flock_lock_inode(inode, &fl);
@@ -2508,7 +2514,7 @@  void locks_remove_file(struct file *filp)
 {
 	struct file_lock_context *ctx;
 
-	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+	ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
 	if (!ctx)
 		return;
 
@@ -2552,7 +2558,7 @@  EXPORT_SYMBOL(posix_unblock_lock);
  */
 int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 {
-	if (filp->f_op->lock)
+	if (filp->f_op->lock && is_remote_lock(filp))
 		return filp->f_op->lock(filp, F_CANCELLK, fl);
 	return 0;
 }
@@ -2580,7 +2586,7 @@  static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 		fl_pid = fl->fl_pid;
 
 	if (fl->fl_file != NULL)
-		inode = file_inode(fl->fl_file);
+		inode = locks_inode(fl->fl_file);
 
 	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
@@ -2682,7 +2688,7 @@  static void __show_fd_locks(struct seq_file *f,
 void show_fd_locks(struct seq_file *f,
 		  struct file *filp, struct files_struct *files)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = locks_inode(filp);
 	struct file_lock_context *ctx;
 	int id = 0;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 419f746d851d..05daf3f98d86 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2722,7 +2722,7 @@  long do_mount(const char *dev_name, const char __user *dir_name,
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME);
+		   MS_STRICTATIME | MS_NOREMOTELOCK);
 
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/fs/open.c b/fs/open.c
index bf66cf1a9f5c..48f38e33e95b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,6 +685,7 @@  static int do_dentry_open(struct file *f,
 			  const struct cred *cred)
 {
 	static const struct file_operations empty_fops = {};
+	struct inode *lkinode = locks_inode(f);
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
@@ -701,12 +702,12 @@  static int do_dentry_open(struct file *f,
 	}
 
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
-		error = get_write_access(inode);
+		error = get_write_access(lkinode);
 		if (unlikely(error))
 			goto cleanup_file;
 		error = __mnt_want_write(f->f_path.mnt);
 		if (unlikely(error)) {
-			put_write_access(inode);
+			put_write_access(lkinode);
 			goto cleanup_file;
 		}
 		f->f_mode |= FMODE_WRITER;
@@ -726,7 +727,7 @@  static int do_dentry_open(struct file *f,
 	if (error)
 		goto cleanup_all;
 
-	error = break_lease(inode, f->f_flags);
+	error = break_lease(lkinode, f->f_flags);
 	if (error)
 		goto cleanup_all;
 
@@ -755,7 +756,7 @@  static int do_dentry_open(struct file *f,
 cleanup_all:
 	fops_put(f->f_op);
 	if (f->f_mode & FMODE_WRITER) {
-		put_write_access(inode);
+		put_write_access(lkinode);
 		__mnt_drop_write(f->f_path.mnt);
 	}
 cleanup_file:
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ed4aa34211a6..4c91d9ed4689 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1280,7 +1280,7 @@  static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 		sb->s_xattr = ovl_xattr_noacl_handlers;
 	sb->s_root = root_dentry;
 	sb->s_fs_info = ufs;
-	sb->s_flags |= MS_POSIXACL;
+	sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
 
 	return 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bacc0733663c..be919d18fa6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1090,6 +1090,18 @@  struct file_lock_context {
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
+/*
+ * Return the inode to use for locking
+ *
+ * For overlayfs this should be the overlay inode, not the real inode returned
+ * by file_inode().  For any other fs file_inode(filp) and locks_inode(filp) are
+ * equal.
+ */
+static inline struct inode *locks_inode(const struct file *f)
+{
+	return f->f_path.dentry->d_inode;
+}
+
 #ifdef CONFIG_FILE_LOCKING
 extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *);
 extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
@@ -1277,7 +1289,7 @@  static inline struct dentry *file_dentry(const struct file *file)
 
 static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 {
-	return locks_lock_inode_wait(file_inode(filp), fl);
+	return locks_lock_inode_wait(locks_inode(filp), fl);
 }
 
 struct fasync_struct {
@@ -2132,7 +2144,7 @@  static inline int mandatory_lock(struct inode *ino)
 
 static inline int locks_verify_locked(struct file *file)
 {
-	if (mandatory_lock(file_inode(file)))
+	if (mandatory_lock(locks_inode(file)))
 		return locks_mandatory_locked(file);
 	return 0;
 }
@@ -2584,7 +2596,7 @@  static inline int get_write_access(struct inode *inode)
 }
 static inline int deny_write_access(struct file *file)
 {
-	struct inode *inode = file_inode(file);
+	struct inode *inode = locks_inode(file);
 	return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
 }
 static inline void put_write_access(struct inode * inode)
@@ -2594,7 +2606,7 @@  static inline void put_write_access(struct inode * inode)
 static inline void allow_write_access(struct file *file)
 {
 	if (file)
-		atomic_inc(&file_inode(file)->i_writecount);
+		atomic_inc(&locks_inode(file)->i_writecount);
 }
 static inline bool inode_is_open_for_write(const struct inode *inode)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3b00f7c8943f..2473272169f2 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -132,6 +132,7 @@  struct inodes_stat_t {
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define MS_NOREMOTELOCK	(1<<27)
 #define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)
 #define MS_ACTIVE	(1<<30)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..104d687f63f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -476,7 +476,7 @@  static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
 		if (file) {
-			struct inode *inode = file_inode(file);
+			struct inode *inode = locks_inode(file);
 			struct address_space *mapping = file->f_mapping;
 
 			get_file(file);
diff --git a/mm/mmap.c b/mm/mmap.c
index de2c1769cc68..a023caff19d5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -126,7 +126,7 @@  static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file_inode(file)->i_writecount);
+		atomic_inc(&locks_inode(file)->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping_unmap_writable(mapping);
 
@@ -537,7 +537,7 @@  static void __vma_link_file(struct vm_area_struct *vma)
 		struct address_space *mapping = file->f_mapping;
 
 		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_dec(&file_inode(file)->i_writecount);
+			atomic_dec(&locks_inode(file)->i_writecount);
 		if (vma->vm_flags & VM_SHARED)
 			atomic_inc(&mapping->i_mmap_writable);