
[v7,04/14] mm/shmem: Support memfile_notifier

Message ID 20220706082016.2603916-5-chao.p.peng@linux.intel.com (mailing list archive)
State New
Series KVM: mm: fd-based approach for supporting KVM guest private memory

Commit Message

Chao Peng July 6, 2022, 8:20 a.m. UTC
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Implement shmem as a memfile_notifier backing store. Essentially it
interacts with the memfile_notifier feature flags for userspace
access/page migration/page reclaiming and implements the necessary
memfile_backing_store callbacks.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/shmem_fs.h |   2 +
 mm/shmem.c               | 109 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 1 deletion(-)

Comments

Gupta, Pankaj July 12, 2022, 6:02 p.m. UTC | #1
On 7/6/2022 10:20 AM, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> Implement shmem as a memfile_notifier backing store. Essentially it
> interacts with the memfile_notifier feature flags for userspace
> access/page migration/page reclaiming and implements the necessary
> memfile_backing_store callbacks.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>   include/linux/shmem_fs.h |   2 +
>   mm/shmem.c               | 109 ++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 110 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index a68f982f22d1..6031c0b08d26 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
>   #include <linux/percpu_counter.h>
>   #include <linux/xattr.h>
>   #include <linux/fs_parser.h>
> +#include <linux/memfile_notifier.h>
>   
>   /* inode in-kernel data */
>   
> @@ -25,6 +26,7 @@ struct shmem_inode_info {
>   	struct simple_xattrs	xattrs;		/* list of xattrs */
>   	atomic_t		stop_eviction;	/* hold when working on inode */
>   	struct timespec64	i_crtime;	/* file creation time */
> +	struct memfile_node	memfile_node;	/* memfile node */
>   	struct inode		vfs_inode;
>   };
>   
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 6c8aef15a17d..627e315c3b4d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -905,6 +905,17 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
>   	return page ? page_folio(page) : NULL;
>   }
>   
> +static void notify_invalidate(struct inode *inode, struct folio *folio,
> +				   pgoff_t start, pgoff_t end)
> +{
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	start = max(start, folio->index);
> +	end = min(end, folio->index + folio_nr_pages(folio));
> +
> +	memfile_notifier_invalidate(&info->memfile_node, start, end);
> +}
> +
>   /*
>    * Remove range of pages and swap entries from page cache, and free them.
>    * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
> @@ -948,6 +959,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>   			}
>   			index += folio_nr_pages(folio) - 1;
>   
> +			notify_invalidate(inode, folio, start, end);
> +
>   			if (!unfalloc || !folio_test_uptodate(folio))
>   				truncate_inode_folio(mapping, folio);
>   			folio_unlock(folio);
> @@ -1021,6 +1034,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
>   					index--;
>   					break;
>   				}
> +
> +				notify_invalidate(inode, folio, start, end);
> +
>   				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
>   						folio);
>   				truncate_inode_folio(mapping, folio);
> @@ -1092,6 +1108,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
>   		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
>   			return -EPERM;
>   
> +		if (info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE) {
> +			if (oldsize)
> +				return -EPERM;
> +			if (!PAGE_ALIGNED(newsize))
> +				return -EINVAL;
> +		}
> +
>   		if (newsize != oldsize) {
>   			error = shmem_reacct_size(SHMEM_I(inode)->flags,
>   					oldsize, newsize);
> @@ -1336,6 +1359,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
>   		goto redirty;
>   	if (!total_swap_pages)
>   		goto redirty;
> +	if (info->memfile_node.flags & MEMFILE_F_UNRECLAIMABLE)
> +		goto redirty;
>   
>   	/*
>   	 * Our capabilities prevent regular writeback or sync from ever calling
> @@ -2271,6 +2296,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
>   	if (ret)
>   		return ret;
>   
> +	if (info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE)
> +		return -EPERM;
> +
>   	/* arm64 - allow memory tagging on RAM-based files */
>   	vma->vm_flags |= VM_MTE_ALLOWED;
>   
> @@ -2306,6 +2334,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>   		info->i_crtime = inode->i_mtime;
>   		INIT_LIST_HEAD(&info->shrinklist);
>   		INIT_LIST_HEAD(&info->swaplist);
> +		memfile_node_init(&info->memfile_node);
>   		simple_xattrs_init(&info->xattrs);
>   		cache_no_acl(inode);
>   		mapping_set_large_folios(inode->i_mapping);
> @@ -2477,6 +2506,8 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
>   		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
>   			return -EPERM;
>   	}
> +	if (unlikely(info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE))
> +		return -EPERM;
>   
>   	if (unlikely(info->seals & F_SEAL_AUTO_ALLOCATE))
>   		sgp = SGP_NOALLOC;
> @@ -2556,6 +2587,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   		end_index = i_size >> PAGE_SHIFT;
>   		if (index > end_index)
>   			break;
> +
> +		if (SHMEM_I(inode)->memfile_node.flags &
> +				MEMFILE_F_USER_INACCESSIBLE) {
> +			error = -EPERM;
> +			break;
> +		}
> +
>   		if (index == end_index) {
>   			nr = i_size & ~PAGE_MASK;
>   			if (nr <= offset)
> @@ -2697,6 +2735,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
>   			goto out;
>   		}
>   
> +		if ((info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE) &&
> +		    (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))) {
> +			error = -EINVAL;
> +			goto out;
> +		}
> +
>   		shmem_falloc.waitq = &shmem_falloc_waitq;
>   		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
>   		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
> @@ -3806,6 +3850,20 @@ static int shmem_error_remove_page(struct address_space *mapping,
>   	return 0;
>   }
>   
> +#ifdef CONFIG_MIGRATION
> +static int shmem_migrate_page(struct address_space *mapping,
> +			      struct page *newpage, struct page *page,
> +			      enum migrate_mode mode)
> +{
> +	struct inode *inode = mapping->host;
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +
> +	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
> +		return -EOPNOTSUPP;
> +	return migrate_page(mapping, newpage, page, mode);

Wondering how well page migration would work for private pages
on a shmem memfd-based backend?

> +}
> +#endif
> +
>   const struct address_space_operations shmem_aops = {
>   	.writepage	= shmem_writepage,
>   	.dirty_folio	= noop_dirty_folio,
> @@ -3814,7 +3872,7 @@ const struct address_space_operations shmem_aops = {
>   	.write_end	= shmem_write_end,
>   #endif
>   #ifdef CONFIG_MIGRATION
> -	.migratepage	= migrate_page,
> +	.migratepage	= shmem_migrate_page,
>   #endif
>   	.error_remove_page = shmem_error_remove_page,
>   };
> @@ -3931,6 +3989,51 @@ static struct file_system_type shmem_fs_type = {
>   	.fs_flags	= FS_USERNS_MOUNT,
>   };
>   
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static struct memfile_node *shmem_lookup_memfile_node(struct file *file)
> +{
> +	struct inode *inode = file_inode(file);
> +
> +	if (!shmem_mapping(inode->i_mapping))
> +		return NULL;
> +
> +	return  &SHMEM_I(inode)->memfile_node;
> +}
> +
> +
> +static int shmem_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
> +			 int *order)
> +{
> +	struct page *page;
> +	int ret;
> +
> +	ret = shmem_getpage(file_inode(file), offset, &page, SGP_WRITE);
> +	if (ret)
> +		return ret;
> +
> +	unlock_page(page);
> +	*pfn = page_to_pfn_t(page);
> +	*order = thp_order(compound_head(page));
> +	return 0;
> +}
> +
> +static void shmem_put_pfn(pfn_t pfn)
> +{
> +	struct page *page = pfn_t_to_page(pfn);
> +
> +	if (!page)
> +		return;
> +
> +	put_page(page);
> +}
> +
> +static struct memfile_backing_store shmem_backing_store = {
> +	.lookup_memfile_node = shmem_lookup_memfile_node,
> +	.get_pfn = shmem_get_pfn,
> +	.put_pfn = shmem_put_pfn,
> +};
> +#endif /* CONFIG_MEMFILE_NOTIFIER */
> +
>   void __init shmem_init(void)
>   {
>   	int error;
> @@ -3956,6 +4059,10 @@ void __init shmem_init(void)
>   	else
>   		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
>   #endif
> +
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	memfile_register_backing_store(&shmem_backing_store);
> +#endif
>   	return;
>   
>   out1:
Chao Peng July 13, 2022, 7:44 a.m. UTC | #2
On Tue, Jul 12, 2022 at 08:02:34PM +0200, Gupta, Pankaj wrote:
> On 7/6/2022 10:20 AM, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > Implement shmem as a memfile_notifier backing store. Essentially it
> > interacts with the memfile_notifier feature flags for userspace
> > access/page migration/page reclaiming and implements the necessary
> > memfile_backing_store callbacks.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >   include/linux/shmem_fs.h |   2 +
> >   mm/shmem.c               | 109 ++++++++++++++++++++++++++++++++++++++-
> >   2 files changed, 110 insertions(+), 1 deletion(-)
...

> > +#ifdef CONFIG_MIGRATION
> > +static int shmem_migrate_page(struct address_space *mapping,
> > +			      struct page *newpage, struct page *page,
> > +			      enum migrate_mode mode)
> > +{
> > +	struct inode *inode = mapping->host;
> > +	struct shmem_inode_info *info = SHMEM_I(inode);
> > +
> > +	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
> > +		return -EOPNOTSUPP;
> > +	return migrate_page(mapping, newpage, page, mode);
> 
> Wondering how well page migration would work for private pages
> on a shmem memfd-based backend?

From a high level:
  - KVM unsets the MEMFILE_F_UNMOVABLE bit to indicate it is capable of
    migrating a page.
  - Introduce new 'migrate' callback(s) to memfile_notifier_ops for KVM
    to register.
  - The callback is hooked into migrate_page() here.
  - Once page migration is requested, shmem calls into the 'migrate'
    callback(s) to perform additional steps for encrypted memory (for
    TDX we will call TDH.MEM.PAGE.RELOCATE).
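
A rough sketch of how that could look (illustrative only; the 'migrate'
hook and memfile_notifier_migrate() below are hypothetical and not part
of this series):

#ifdef CONFIG_MIGRATION
static int shmem_migrate_page(struct address_space *mapping,
			      struct page *newpage, struct page *page,
			      enum migrate_mode mode)
{
	struct shmem_inode_info *info = SHMEM_I(mapping->host);
	int ret;

	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
		return -EOPNOTSUPP;

	/*
	 * Hypothetical hook: let the registered consumer (KVM) relocate
	 * the encrypted contents first, e.g. TDH.MEM.PAGE.RELOCATE on TDX.
	 */
	ret = memfile_notifier_migrate(&info->memfile_node, page, newpage);
	if (ret)
		return ret;

	return migrate_page(mapping, newpage, page, mode);
}
#endif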

Chao
> 
> > +}
> > +#endif
> > +
> >   const struct address_space_operations shmem_aops = {
> >   	.writepage	= shmem_writepage,
> >   	.dirty_folio	= noop_dirty_folio,
> > @@ -3814,7 +3872,7 @@ const struct address_space_operations shmem_aops = {
> >   	.write_end	= shmem_write_end,
> >   #endif
> >   #ifdef CONFIG_MIGRATION
> > -	.migratepage	= migrate_page,
> > +	.migratepage	= shmem_migrate_page,
> >   #endif
> >   	.error_remove_page = shmem_error_remove_page,
> >   };
> > @@ -3931,6 +3989,51 @@ static struct file_system_type shmem_fs_type = {
> >   	.fs_flags	= FS_USERNS_MOUNT,
> >   };
Gupta, Pankaj July 13, 2022, 10:01 a.m. UTC | #3
>>> +#ifdef CONFIG_MIGRATION
>>> +static int shmem_migrate_page(struct address_space *mapping,
>>> +			      struct page *newpage, struct page *page,
>>> +			      enum migrate_mode mode)
>>> +{
>>> +	struct inode *inode = mapping->host;
>>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>>> +
>>> +	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
>>> +		return -EOPNOTSUPP;
>>> +	return migrate_page(mapping, newpage, page, mode);
>>
>> Wondering how well page migration would work for private pages
>> on a shmem memfd-based backend?
> 
>  From a high level:
>    - KVM unsets the MEMFILE_F_UNMOVABLE bit to indicate it is capable of
>      migrating a page.
>    - Introduce new 'migrate' callback(s) to memfile_notifier_ops for KVM
>      to register.
>    - The callback is hooked into migrate_page() here.
>    - Once page migration is requested, shmem calls into the 'migrate'
>      callback(s) to perform additional steps for encrypted memory (for
>      TDX we will call TDH.MEM.PAGE.RELOCATE).

Yes, that would require additional (protocol-specific) handling for
private pages. I was trying to find where the "MEMFILE_F_UNMOVABLE"
flag is currently set?

Thanks,
Pankaj
Chao Peng July 13, 2022, 11:49 p.m. UTC | #4
On Wed, Jul 13, 2022 at 12:01:13PM +0200, Gupta, Pankaj wrote:
> 
> > > > +#ifdef CONFIG_MIGRATION
> > > > +static int shmem_migrate_page(struct address_space *mapping,
> > > > +			      struct page *newpage, struct page *page,
> > > > +			      enum migrate_mode mode)
> > > > +{
> > > > +	struct inode *inode = mapping->host;
> > > > +	struct shmem_inode_info *info = SHMEM_I(inode);
> > > > +
> > > > +	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
> > > > +		return -EOPNOTSUPP;
> > > > +	return migrate_page(mapping, newpage, page, mode);
> > > 
> > > Wondering how well page migration would work for private pages
> > > on a shmem memfd-based backend?
> > 
> >  From a high level:
> >    - KVM unsets the MEMFILE_F_UNMOVABLE bit to indicate it is capable of
> >      migrating a page.
> >    - Introduce new 'migrate' callback(s) to memfile_notifier_ops for KVM
> >      to register.
> >    - The callback is hooked into migrate_page() here.
> >    - Once page migration is requested, shmem calls into the 'migrate'
> >      callback(s) to perform additional steps for encrypted memory (for
> >      TDX we will call TDH.MEM.PAGE.RELOCATE).
> 
> Yes, that would require additional (protocol-specific) handling for private
> pages. I was trying to find where the "MEMFILE_F_UNMOVABLE" flag is currently set?

It's set with memfile_register_notifier() in patch 13.
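
For illustration only (the registration helper and its signature are
defined in patch 13 and are assumed here), a consumer that cannot yet
migrate or reclaim private pages would register with those flags set:

/*
 * Hypothetical sketch -- the signature of memfile_register_notifier()
 * is assumed; the real code lives in patch 13 of this series.
 */
static int consumer_register(struct file *file,
			     struct memfile_notifier *notifier)
{
	unsigned long flags = MEMFILE_F_USER_INACCESSIBLE |
			      MEMFILE_F_UNMOVABLE |
			      MEMFILE_F_UNRECLAIMABLE;

	return memfile_register_notifier(file, flags, notifier);
}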

> 
> Thanks,
> Pankaj
Gupta, Pankaj July 14, 2022, 4:15 a.m. UTC | #5
>>>>> +#ifdef CONFIG_MIGRATION
>>>>> +static int shmem_migrate_page(struct address_space *mapping,
>>>>> +			      struct page *newpage, struct page *page,
>>>>> +			      enum migrate_mode mode)
>>>>> +{
>>>>> +	struct inode *inode = mapping->host;
>>>>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>>>>> +
>>>>> +	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
>>>>> +		return -EOPNOTSUPP;
>>>>> +	return migrate_page(mapping, newpage, page, mode);
>>>>
>>>> Wondering how well page migration would work for private pages
>>>> on a shmem memfd-based backend?
>>>
>>>   From a high level:
>>>     - KVM unsets the MEMFILE_F_UNMOVABLE bit to indicate it is capable of
>>>       migrating a page.
>>>     - Introduce new 'migrate' callback(s) to memfile_notifier_ops for KVM
>>>       to register.
>>>     - The callback is hooked into migrate_page() here.
>>>     - Once page migration is requested, shmem calls into the 'migrate'
>>>       callback(s) to perform additional steps for encrypted memory (for
>>>       TDX we will call TDH.MEM.PAGE.RELOCATE).
>>
>> Yes, that would require additional (protocol-specific) handling for private
>> pages. I was trying to find where the "MEMFILE_F_UNMOVABLE" flag is currently set?
> 
> It's set with memfile_register_notifier() in patch 13.

o.k.

Thanks,

Pankaj
David Hildenbrand Aug. 5, 2022, 1:26 p.m. UTC | #6
On 06.07.22 10:20, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> Implement shmem as a memfile_notifier backing store. Essentially it
> interacts with the memfile_notifier feature flags for userspace
> access/page migration/page reclaiming and implements the necessary
> memfile_backing_store callbacks.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---

[...]

> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static struct memfile_node *shmem_lookup_memfile_node(struct file *file)
> +{
> +	struct inode *inode = file_inode(file);
> +
> +	if (!shmem_mapping(inode->i_mapping))
> +		return NULL;
> +
> +	return  &SHMEM_I(inode)->memfile_node;
> +}
> +
> +
> +static int shmem_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
> +			 int *order)
> +{
> +	struct page *page;
> +	int ret;
> +
> +	ret = shmem_getpage(file_inode(file), offset, &page, SGP_WRITE);
> +	if (ret)
> +		return ret;
> +
> +	unlock_page(page);
> +	*pfn = page_to_pfn_t(page);
> +	*order = thp_order(compound_head(page));
> +	return 0;
> +}
> +
> +static void shmem_put_pfn(pfn_t pfn)
> +{
> +	struct page *page = pfn_t_to_page(pfn);
> +
> +	if (!page)
> +		return;
> +
> +	put_page(page);


Why do we export shmem_get_pfn/shmem_put_pfn and not simply

get_folio()

and let the caller deal with putting the folio? What's the reason to

a) Operate on PFNs and not folios
b) Have these get/put semantics?

> +}
> +
> +static struct memfile_backing_store shmem_backing_store = {
> +	.lookup_memfile_node = shmem_lookup_memfile_node,
> +	.get_pfn = shmem_get_pfn,
> +	.put_pfn = shmem_put_pfn,
> +};
> +#endif /* CONFIG_MEMFILE_NOTIFIER */
> +
>  void __init shmem_init(void)
>  {
>  	int error;
> @@ -3956,6 +4059,10 @@ void __init shmem_init(void)
>  	else
>  		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
>  #endif
> +
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +	memfile_register_backing_store(&shmem_backing_store);

Can we instead provide a dummy function that does nothing without
CONFIG_MEMFILE_NOTIFIER?

> +#endif
>  	return;
>  
>  out1:
Chao Peng Aug. 10, 2022, 9:25 a.m. UTC | #7
On Fri, Aug 05, 2022 at 03:26:02PM +0200, David Hildenbrand wrote:
> On 06.07.22 10:20, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > Implement shmem as a memfile_notifier backing store. Essentially it
> > interacts with the memfile_notifier feature flags for userspace
> > access/page migration/page reclaiming and implements the necessary
> > memfile_backing_store callbacks.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> 
> [...]
> 
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +static struct memfile_node *shmem_lookup_memfile_node(struct file *file)
> > +{
> > +	struct inode *inode = file_inode(file);
> > +
> > +	if (!shmem_mapping(inode->i_mapping))
> > +		return NULL;
> > +
> > +	return  &SHMEM_I(inode)->memfile_node;
> > +}
> > +
> > +
> > +static int shmem_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
> > +			 int *order)
> > +{
> > +	struct page *page;
> > +	int ret;
> > +
> > +	ret = shmem_getpage(file_inode(file), offset, &page, SGP_WRITE);
> > +	if (ret)
> > +		return ret;
> > +
> > +	unlock_page(page);
> > +	*pfn = page_to_pfn_t(page);
> > +	*order = thp_order(compound_head(page));
> > +	return 0;
> > +}
> > +
> > +static void shmem_put_pfn(pfn_t pfn)
> > +{
> > +	struct page *page = pfn_t_to_page(pfn);
> > +
> > +	if (!page)
> > +		return;
> > +
> > +	put_page(page);
> 
> 
> Why do we export shmem_get_pfn/shmem_put_pfn and not simply
> 
> get_folio()
> 
> and let the caller deal with putting the folio? What's the reason to
> 
> a) Operate on PFNs and not folios
> b) Have these get/put semantics?

We have a design assumption that someday this can even support non-page-based
backing stores. There is some discussion at:
  https://lkml.org/lkml/2022/3/28/1440
I should add documentation for these two callbacks.
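
For illustration, a minimal sketch of the intended consumer-side usage
(the caller below is hypothetical), showing why the callbacks trade in
pfn_t with explicit get/put semantics rather than folios:

/*
 * Hypothetical consumer: pin a pfn, install it into a secondary mapping,
 * then release it.  The consumer never touches struct page/folio, so a
 * non-struct-page backing store could implement the same callbacks.
 */
static int consumer_map_one(struct memfile_backing_store *bs,
			    struct file *file, pgoff_t index)
{
	pfn_t pfn;
	int order, ret;

	ret = bs->get_pfn(file, index, &pfn, &order);
	if (ret)
		return ret;

	/* ... install pfn into the secondary MMU / guest mapping ... */

	bs->put_pfn(pfn);
	return 0;
}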

> 
> > +}
> > +
> > +static struct memfile_backing_store shmem_backing_store = {
> > +	.lookup_memfile_node = shmem_lookup_memfile_node,
> > +	.get_pfn = shmem_get_pfn,
> > +	.put_pfn = shmem_put_pfn,
> > +};
> > +#endif /* CONFIG_MEMFILE_NOTIFIER */
> > +
> >  void __init shmem_init(void)
> >  {
> >  	int error;
> > @@ -3956,6 +4059,10 @@ void __init shmem_init(void)
> >  	else
> >  		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
> >  #endif
> > +
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +	memfile_register_backing_store(&shmem_backing_store);
> 
> Can we instead provide a dummy function that does nothing without
> CONFIG_MEMFILE_NOTIFIER?

Sounds good.
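
A minimal sketch of that stub (assuming the declaration lives in
include/linux/memfile_notifier.h and that the signature matches the call
made from shmem_init()):

#ifdef CONFIG_MEMFILE_NOTIFIER
void memfile_register_backing_store(struct memfile_backing_store *bs);
#else
/* No-op when the notifier infrastructure is not built in. */
static inline void memfile_register_backing_store(struct memfile_backing_store *bs)
{
}
#endif

With that in place, the #ifdef around the call in shmem_init() can be dropped.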

Chao
> 
> > +#endif
> >  	return;
> >  
> >  out1:
> 
> 
> -- 
> Thanks,
> 
> David / dhildenb
>

Patch

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a68f982f22d1..6031c0b08d26 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@ 
 #include <linux/percpu_counter.h>
 #include <linux/xattr.h>
 #include <linux/fs_parser.h>
+#include <linux/memfile_notifier.h>
 
 /* inode in-kernel data */
 
@@ -25,6 +26,7 @@  struct shmem_inode_info {
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct timespec64	i_crtime;	/* file creation time */
+	struct memfile_node	memfile_node;	/* memfile node */
 	struct inode		vfs_inode;
 };
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 6c8aef15a17d..627e315c3b4d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -905,6 +905,17 @@  static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	return page ? page_folio(page) : NULL;
 }
 
+static void notify_invalidate(struct inode *inode, struct folio *folio,
+				   pgoff_t start, pgoff_t end)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	start = max(start, folio->index);
+	end = min(end, folio->index + folio_nr_pages(folio));
+
+	memfile_notifier_invalidate(&info->memfile_node, start, end);
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -948,6 +959,8 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			}
 			index += folio_nr_pages(folio) - 1;
 
+			notify_invalidate(inode, folio, start, end);
+
 			if (!unfalloc || !folio_test_uptodate(folio))
 				truncate_inode_folio(mapping, folio);
 			folio_unlock(folio);
@@ -1021,6 +1034,9 @@  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 					index--;
 					break;
 				}
+
+				notify_invalidate(inode, folio, start, end);
+
 				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
 						folio);
 				truncate_inode_folio(mapping, folio);
@@ -1092,6 +1108,13 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 			return -EPERM;
 
+		if (info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE) {
+			if (oldsize)
+				return -EPERM;
+			if (!PAGE_ALIGNED(newsize))
+				return -EINVAL;
+		}
+
 		if (newsize != oldsize) {
 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 					oldsize, newsize);
@@ -1336,6 +1359,8 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		goto redirty;
 	if (!total_swap_pages)
 		goto redirty;
+	if (info->memfile_node.flags & MEMFILE_F_UNRECLAIMABLE)
+		goto redirty;
 
 	/*
 	 * Our capabilities prevent regular writeback or sync from ever calling
@@ -2271,6 +2296,9 @@  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
+	if (info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE)
+		return -EPERM;
+
 	/* arm64 - allow memory tagging on RAM-based files */
 	vma->vm_flags |= VM_MTE_ALLOWED;
 
@@ -2306,6 +2334,7 @@  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info->i_crtime = inode->i_mtime;
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
+		memfile_node_init(&info->memfile_node);
 		simple_xattrs_init(&info->xattrs);
 		cache_no_acl(inode);
 		mapping_set_large_folios(inode->i_mapping);
@@ -2477,6 +2506,8 @@  shmem_write_begin(struct file *file, struct address_space *mapping,
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
 	}
+	if (unlikely(info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE))
+		return -EPERM;
 
 	if (unlikely(info->seals & F_SEAL_AUTO_ALLOCATE))
 		sgp = SGP_NOALLOC;
@@ -2556,6 +2587,13 @@  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		end_index = i_size >> PAGE_SHIFT;
 		if (index > end_index)
 			break;
+
+		if (SHMEM_I(inode)->memfile_node.flags &
+				MEMFILE_F_USER_INACCESSIBLE) {
+			error = -EPERM;
+			break;
+		}
+
 		if (index == end_index) {
 			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset)
@@ -2697,6 +2735,12 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			goto out;
 		}
 
+		if ((info->memfile_node.flags & MEMFILE_F_USER_INACCESSIBLE) &&
+		    (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
@@ -3806,6 +3850,20 @@  static int shmem_error_remove_page(struct address_space *mapping,
 	return 0;
 }
 
+#ifdef CONFIG_MIGRATION
+static int shmem_migrate_page(struct address_space *mapping,
+			      struct page *newpage, struct page *page,
+			      enum migrate_mode mode)
+{
+	struct inode *inode = mapping->host;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (info->memfile_node.flags & MEMFILE_F_UNMOVABLE)
+		return -EOPNOTSUPP;
+	return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
 const struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
 	.dirty_folio	= noop_dirty_folio,
@@ -3814,7 +3872,7 @@  const struct address_space_operations shmem_aops = {
 	.write_end	= shmem_write_end,
 #endif
 #ifdef CONFIG_MIGRATION
-	.migratepage	= migrate_page,
+	.migratepage	= shmem_migrate_page,
 #endif
 	.error_remove_page = shmem_error_remove_page,
 };
@@ -3931,6 +3989,51 @@  static struct file_system_type shmem_fs_type = {
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
+#ifdef CONFIG_MEMFILE_NOTIFIER
+static struct memfile_node *shmem_lookup_memfile_node(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	if (!shmem_mapping(inode->i_mapping))
+		return NULL;
+
+	return  &SHMEM_I(inode)->memfile_node;
+}
+
+
+static int shmem_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
+			 int *order)
+{
+	struct page *page;
+	int ret;
+
+	ret = shmem_getpage(file_inode(file), offset, &page, SGP_WRITE);
+	if (ret)
+		return ret;
+
+	unlock_page(page);
+	*pfn = page_to_pfn_t(page);
+	*order = thp_order(compound_head(page));
+	return 0;
+}
+
+static void shmem_put_pfn(pfn_t pfn)
+{
+	struct page *page = pfn_t_to_page(pfn);
+
+	if (!page)
+		return;
+
+	put_page(page);
+}
+
+static struct memfile_backing_store shmem_backing_store = {
+	.lookup_memfile_node = shmem_lookup_memfile_node,
+	.get_pfn = shmem_get_pfn,
+	.put_pfn = shmem_put_pfn,
+};
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
 void __init shmem_init(void)
 {
 	int error;
@@ -3956,6 +4059,10 @@  void __init shmem_init(void)
 	else
 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
 #endif
+
+#ifdef CONFIG_MEMFILE_NOTIFIER
+	memfile_register_backing_store(&shmem_backing_store);
+#endif
 	return;
 
 out1: