diff mbox series

[v4,01/12] mm/shmem: Introduce F_SEAL_INACCESSIBLE

Message ID 20220118132121.31388-2-chao.p.peng@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng Jan. 18, 2022, 1:21 p.m. UTC
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
the file is inaccessible from userspace through ordinary MMU access
(e.g., read/write/mmap). However, the file content can be accessed
via a different mechanism (e.g. KVM MMU) indirectly.

It provides semantics required for KVM guest private memory support
that a file descriptor with this seal set is going to be used as the
source of guest memory in confidential computing environments such
as Intel TDX/AMD SEV but may not be accessible from host userspace.

At this time only shmem implements this seal.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/uapi/linux/fcntl.h |  1 +
 mm/shmem.c                 | 40 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

Comments

Vlastimil Babka Feb. 7, 2022, 12:24 p.m. UTC | #1
On 1/18/22 14:21, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> the file is inaccessible from userspace through ordinary MMU access
> (e.g., read/write/mmap). However, the file content can be accessed
> via a different mechanism (e.g. KVM MMU) indirectly.
> 
> It provides semantics required for KVM guest private memory support
> that a file descriptor with this seal set is going to be used as the
> source of guest memory in confidential computing environments such
> as Intel TDX/AMD SEV but may not be accessible from host userspace.
> 
> At this time only shmem implements this seal.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/uapi/linux/fcntl.h |  1 +
>  mm/shmem.c                 | 40 ++++++++++++++++++++++++++++++++++++--
>  2 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..09ef34754dfa 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -43,6 +43,7 @@
>  #define F_SEAL_GROW	0x0004	/* prevent file from growing */
>  #define F_SEAL_WRITE	0x0008	/* prevent writes */
>  #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
> +#define F_SEAL_INACCESSIBLE	0x0020  /* prevent ordinary MMU access (e.g. read/write/mmap) to file content */
>  /* (1U << 31) is reserved for signed error codes */
>  
>  /*
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 18f93c2d68f1..72185630e7c4 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1098,6 +1098,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
>  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
>  			return -EPERM;
>  
> +		if (info->seals & F_SEAL_INACCESSIBLE) {
> +			if(i_size_read(inode))

Is this needed? The rest of the function seems to trust oldsize obtained by
plain reading inode->i_size well enough, so why be suddenly paranoid here?

> +				return -EPERM;
> +			if (newsize & ~PAGE_MASK)
> +				return -EINVAL;
> +		}
> +
>  		if (newsize != oldsize) {
>  			error = shmem_reacct_size(SHMEM_I(inode)->flags,
>  					oldsize, newsize);
> @@ -1364,6 +1371,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
>  		goto redirty;
>  	if (!total_swap_pages)
>  		goto redirty;
> +	if (info->seals & F_SEAL_INACCESSIBLE)
> +		goto redirty;
>  
>  	/*
>  	 * Our capabilities prevent regular writeback or sync from ever calling
> @@ -2262,6 +2271,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
>  	if (ret)
>  		return ret;
>  
> +	if (info->seals & F_SEAL_INACCESSIBLE)
> +		return -EPERM;
> +
>  	/* arm64 - allow memory tagging on RAM-based files */
>  	vma->vm_flags |= VM_MTE_ALLOWED;
>  
> @@ -2459,12 +2471,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
>  	pgoff_t index = pos >> PAGE_SHIFT;
>  
>  	/* i_rwsem is held by caller */
> -	if (unlikely(info->seals & (F_SEAL_GROW |
> -				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
> +	if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
> +				    F_SEAL_FUTURE_WRITE |
> +				    F_SEAL_INACCESSIBLE))) {
>  		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
>  			return -EPERM;
>  		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
>  			return -EPERM;
> +		if (info->seals & F_SEAL_INACCESSIBLE)
> +			return -EPERM;
>  	}
>  
>  	return shmem_getpage(inode, index, pagep, SGP_WRITE);
> @@ -2538,6 +2553,21 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		end_index = i_size >> PAGE_SHIFT;
>  		if (index > end_index)
>  			break;
> +
> +		/*
> +		 * inode_lock protects setting up seals as well as write to
> +		 * i_size. Setting F_SEAL_INACCESSIBLE only allowed with
> +		 * i_size == 0.
> +		 *
> +		 * Check F_SEAL_INACCESSIBLE after i_size. It effectively
> +		 * serialize read vs. setting F_SEAL_INACCESSIBLE without
> +		 * taking inode_lock in read path.
> +		 */
> +		if (SHMEM_I(inode)->seals & F_SEAL_INACCESSIBLE) {
> +			error = -EPERM;
> +			break;
> +		}
> +
>  		if (index == end_index) {
>  			nr = i_size & ~PAGE_MASK;
>  			if (nr <= offset)
> @@ -2663,6 +2693,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
>  			goto out;
>  		}
>  
> +		if ((info->seals & F_SEAL_INACCESSIBLE) &&
> +		    (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {

Could we use PAGE_ALIGNED()?

> +			error = -EINVAL;
> +			goto out;
> +		}
> +
>  		shmem_falloc.waitq = &shmem_falloc_waitq;
>  		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
>  		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
Andy Lutomirski Feb. 11, 2022, 11:33 p.m. UTC | #2
On 1/18/22 05:21, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> the file is inaccessible from userspace through ordinary MMU access
> (e.g., read/write/mmap). However, the file content can be accessed
> via a different mechanism (e.g. KVM MMU) indirectly.
> 
> It provides semantics required for KVM guest private memory support
> that a file descriptor with this seal set is going to be used as the
> source of guest memory in confidential computing environments such
> as Intel TDX/AMD SEV but may not be accessible from host userspace.
> 
> At this time only shmem implements this seal.
> 

I don't dislike this *that* much, but I do dislike this. 
F_SEAL_INACCESSIBLE essentially transmutes a memfd into a different type 
of object.  While this can apparently be done successfully and without 
races (as in this code), it's at least awkward.  I think that either 
creating a special inaccessible memfd should be a single operation that 
create the correct type of object or there should be a clear 
justification for why it's a two-step process.

(Imagine if the way to create an eventfd would be to call 
timerfd_create() and then do a special fcntl to turn it into an eventfd 
but only if it's not currently armed.  This would be weird.)
Chao Peng Feb. 17, 2022, 12:56 p.m. UTC | #3
On Mon, Feb 07, 2022 at 01:24:42PM +0100, Vlastimil Babka wrote:
> On 1/18/22 14:21, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> >  /*
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 18f93c2d68f1..72185630e7c4 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1098,6 +1098,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
> >  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
> >  			return -EPERM;
> >  
> > +		if (info->seals & F_SEAL_INACCESSIBLE) {
> > +			if(i_size_read(inode))
> 
> Is this needed? The rest of the function seems to trust oldsize obtained by
> plain reading inode->i_size well enough, so why be suddenly paranoid here?

oldsize sounds enough here, unless kirill has different mind.

> 
> > +				return -EPERM;
> > +			if (newsize & ~PAGE_MASK)
> > +				return -EINVAL;
> > +		}
> > +
> >  		if (newsize != oldsize) {
> >  			error = shmem_reacct_size(SHMEM_I(inode)->flags,
> > +		if ((info->seals & F_SEAL_INACCESSIBLE) &&
> > +		    (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
> 
> Could we use PAGE_ALIGNED()?

Yes, definitely, thanks.

Chao
> 
> > +			error = -EINVAL;
> > +			goto out;
> > +		}
> > +
> >  		shmem_falloc.waitq = &shmem_falloc_waitq;
> >  		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
> >  		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
Chao Peng Feb. 17, 2022, 1:06 p.m. UTC | #4
On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
> On 1/18/22 05:21, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> > the file is inaccessible from userspace through ordinary MMU access
> > (e.g., read/write/mmap). However, the file content can be accessed
> > via a different mechanism (e.g. KVM MMU) indirectly.
> > 
> > It provides semantics required for KVM guest private memory support
> > that a file descriptor with this seal set is going to be used as the
> > source of guest memory in confidential computing environments such
> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> > 
> > At this time only shmem implements this seal.
> > 
> 
> I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
> essentially transmutes a memfd into a different type of object.  While this
> can apparently be done successfully and without races (as in this code),
> it's at least awkward.  I think that either creating a special inaccessible
> memfd should be a single operation that create the correct type of object or
> there should be a clear justification for why it's a two-step process.

Now one justification maybe from Stever's comment to patch-00: for ARM
usage it can be used with creating a normal memfd, (partially)populate
it with initial guest memory content (e.g. firmware), and then
F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
KVM (definitely the current code needs to be changed to support that).

Thanks,
Chao
> 
> (Imagine if the way to create an eventfd would be to call timerfd_create()
> and then do a special fcntl to turn it into an eventfd but only if it's not
> currently armed.  This would be weird.)
Andy Lutomirski Feb. 17, 2022, 7:09 p.m. UTC | #5
On Thu, Feb 17, 2022, at 5:06 AM, Chao Peng wrote:
> On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
>> On 1/18/22 05:21, Chao Peng wrote:
>> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>> > 
>> > Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
>> > the file is inaccessible from userspace through ordinary MMU access
>> > (e.g., read/write/mmap). However, the file content can be accessed
>> > via a different mechanism (e.g. KVM MMU) indirectly.
>> > 
>> > It provides semantics required for KVM guest private memory support
>> > that a file descriptor with this seal set is going to be used as the
>> > source of guest memory in confidential computing environments such
>> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
>> > 
>> > At this time only shmem implements this seal.
>> > 
>> 
>> I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
>> essentially transmutes a memfd into a different type of object.  While this
>> can apparently be done successfully and without races (as in this code),
>> it's at least awkward.  I think that either creating a special inaccessible
>> memfd should be a single operation that create the correct type of object or
>> there should be a clear justification for why it's a two-step process.
>
> Now one justification maybe from Stever's comment to patch-00: for ARM
> usage it can be used with creating a normal memfd, (partially)populate
> it with initial guest memory content (e.g. firmware), and then
> F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
> KVM (definitely the current code needs to be changed to support that).

Except we don't allow F_SEAL_INACCESSIBLE on a non-empty file, right?  So this won't work.

In any case, the whole confidential VM initialization story is a bit buddy.  From the earlier emails, it sounds like ARM expects the host to fill in guest memory and measure it.  From my recollection of Intel's scheme (which may well be wrong, and I could easily be confusing it with SGX), TDX instead measures what is essentially a transcript of the series of operations that initializes the VM.  These are fundamentally not the same thing even if they accomplish the same end goal.  For TDX, we unavoidably need an operation (ioctl or similar) that initializes things according to the VM's instructions, and ARM ought to be able to use roughly the same mechanism.

Also, if we ever get fancy and teach the page allocator about memory with reduced directmap permissions, it may well be more efficient for userspace to shove data into a memfd via ioctl than it is to mmap it and write the data.
Chao Peng Feb. 23, 2022, 11:49 a.m. UTC | #6
On Thu, Feb 17, 2022 at 11:09:35AM -0800, Andy Lutomirski wrote:
> On Thu, Feb 17, 2022, at 5:06 AM, Chao Peng wrote:
> > On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
> >> On 1/18/22 05:21, Chao Peng wrote:
> >> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> >> > 
> >> > Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> >> > the file is inaccessible from userspace through ordinary MMU access
> >> > (e.g., read/write/mmap). However, the file content can be accessed
> >> > via a different mechanism (e.g. KVM MMU) indirectly.
> >> > 
> >> > It provides semantics required for KVM guest private memory support
> >> > that a file descriptor with this seal set is going to be used as the
> >> > source of guest memory in confidential computing environments such
> >> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> >> > 
> >> > At this time only shmem implements this seal.
> >> > 
> >> 
> >> I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
> >> essentially transmutes a memfd into a different type of object.  While this
> >> can apparently be done successfully and without races (as in this code),
> >> it's at least awkward.  I think that either creating a special inaccessible
> >> memfd should be a single operation that create the correct type of object or
> >> there should be a clear justification for why it's a two-step process.
> >
> > Now one justification maybe from Stever's comment to patch-00: for ARM
> > usage it can be used with creating a normal memfd, (partially)populate
> > it with initial guest memory content (e.g. firmware), and then
> > F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
> > KVM (definitely the current code needs to be changed to support that).
> 
> Except we don't allow F_SEAL_INACCESSIBLE on a non-empty file, right?  So this won't work.

Hmm, right, if we set F_SEAL_INACCESSIBLE on a non-empty file, we will 
need to make sure access to existing mmap-ed area should be prevented,
but that is hard.

> 
> In any case, the whole confidential VM initialization story is a bit buddy.  From the earlier emails, it sounds like ARM expects the host to fill in guest memory and measure it.  From my recollection of Intel's scheme (which may well be wrong, and I could easily be confusing it with SGX), TDX instead measures what is essentially a transcript of the series of operations that initializes the VM.  These are fundamentally not the same thing even if they accomplish the same end goal.  For TDX, we unavoidably need an operation (ioctl or similar) that initializes things according to the VM's instructions, and ARM ought to be able to use roughly the same mechanism.

Yes, TDX requires a ioctl. Steven may comment on the ARM part.

Chao
> 
> Also, if we ever get fancy and teach the page allocator about memory with reduced directmap permissions, it may well be more efficient for userspace to shove data into a memfd via ioctl than it is to mmap it and write the data.
Steven Price Feb. 23, 2022, 12:05 p.m. UTC | #7
On 23/02/2022 11:49, Chao Peng wrote:
> On Thu, Feb 17, 2022 at 11:09:35AM -0800, Andy Lutomirski wrote:
>> On Thu, Feb 17, 2022, at 5:06 AM, Chao Peng wrote:
>>> On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
>>>> On 1/18/22 05:21, Chao Peng wrote:
>>>>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>>>>
>>>>> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
>>>>> the file is inaccessible from userspace through ordinary MMU access
>>>>> (e.g., read/write/mmap). However, the file content can be accessed
>>>>> via a different mechanism (e.g. KVM MMU) indirectly.
>>>>>
>>>>> It provides semantics required for KVM guest private memory support
>>>>> that a file descriptor with this seal set is going to be used as the
>>>>> source of guest memory in confidential computing environments such
>>>>> as Intel TDX/AMD SEV but may not be accessible from host userspace.
>>>>>
>>>>> At this time only shmem implements this seal.
>>>>>
>>>>
>>>> I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
>>>> essentially transmutes a memfd into a different type of object.  While this
>>>> can apparently be done successfully and without races (as in this code),
>>>> it's at least awkward.  I think that either creating a special inaccessible
>>>> memfd should be a single operation that create the correct type of object or
>>>> there should be a clear justification for why it's a two-step process.
>>>
>>> Now one justification maybe from Stever's comment to patch-00: for ARM
>>> usage it can be used with creating a normal memfd, (partially)populate
>>> it with initial guest memory content (e.g. firmware), and then
>>> F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
>>> KVM (definitely the current code needs to be changed to support that).
>>
>> Except we don't allow F_SEAL_INACCESSIBLE on a non-empty file, right?  So this won't work.
> 
> Hmm, right, if we set F_SEAL_INACCESSIBLE on a non-empty file, we will 
> need to make sure access to existing mmap-ed area should be prevented,
> but that is hard.
> 
>>
>> In any case, the whole confidential VM initialization story is a bit buddy.  From the earlier emails, it sounds like ARM expects the host to fill in guest memory and measure it.  From my recollection of Intel's scheme (which may well be wrong, and I could easily be confusing it with SGX), TDX instead measures what is essentially a transcript of the series of operations that initializes the VM.  These are fundamentally not the same thing even if they accomplish the same end goal.  For TDX, we unavoidably need an operation (ioctl or similar) that initializes things according to the VM's instructions, and ARM ought to be able to use roughly the same mechanism.
> 
> Yes, TDX requires a ioctl. Steven may comment on the ARM part.

The Arm story is evolving so I can't give a definite answer yet. Our
current prototyping works by creating the initial VM content in a
memslot as with a normal VM and then calling an ioctl which throws the
big switch and converts all the (populated) pages to be protected. At
this point the RMM performs a measurement of the data that the VM is
being populated with.

The above (in our prototype) suffers from all the expected problems with
a malicious VMM being able to trick the host kernel into accessing those
pages after they have been protected (causing a fault detected by the
hardware).

The ideal (from our perspective) approach would be to follow the same
flow but where the VMM populates a memfd rather than normal anonymous
pages. The memfd could then be sealed and the pages converted to
protected ones (with the RMM measuring them in the process).

The question becomes how is that memfd populated? It would be nice if
that could be done using normal operations on a memfd (i.e. using
mmap()) and therefore this code could be (relatively) portable. This
would mean that any pages mapped from the memfd would either need to
block the sealing or be revoked at the time of sealing.

The other approach is we could of course implement a special ioctl which
effectively does a memcpy into the (created empty and sealed) memfd and
does the necessary dance with the RMM to measure the contents. This
would match the "transcript of the series of operations" described above
- but seems much less ideal from the viewpoint of the VMM.

Steve

> Chao
>>
>> Also, if we ever get fancy and teach the page allocator about memory with reduced directmap permissions, it may well be more efficient for userspace to shove data into a memfd via ioctl than it is to mmap it and write the data.
> 
> 
>
Andy Lutomirski March 4, 2022, 7:24 p.m. UTC | #8
On 2/23/22 04:05, Steven Price wrote:
> On 23/02/2022 11:49, Chao Peng wrote:
>> On Thu, Feb 17, 2022 at 11:09:35AM -0800, Andy Lutomirski wrote:
>>> On Thu, Feb 17, 2022, at 5:06 AM, Chao Peng wrote:
>>>> On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
>>>>> On 1/18/22 05:21, Chao Peng wrote:
>>>>>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>>>>>
>>>>>> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
>>>>>> the file is inaccessible from userspace through ordinary MMU access
>>>>>> (e.g., read/write/mmap). However, the file content can be accessed
>>>>>> via a different mechanism (e.g. KVM MMU) indirectly.
>>>>>>
>>>>>> It provides semantics required for KVM guest private memory support
>>>>>> that a file descriptor with this seal set is going to be used as the
>>>>>> source of guest memory in confidential computing environments such
>>>>>> as Intel TDX/AMD SEV but may not be accessible from host userspace.
>>>>>>
>>>>>> At this time only shmem implements this seal.
>>>>>>
>>>>>
>>>>> I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
>>>>> essentially transmutes a memfd into a different type of object.  While this
>>>>> can apparently be done successfully and without races (as in this code),
>>>>> it's at least awkward.  I think that either creating a special inaccessible
>>>>> memfd should be a single operation that create the correct type of object or
>>>>> there should be a clear justification for why it's a two-step process.
>>>>
>>>> Now one justification maybe from Stever's comment to patch-00: for ARM
>>>> usage it can be used with creating a normal memfd, (partially)populate
>>>> it with initial guest memory content (e.g. firmware), and then
>>>> F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
>>>> KVM (definitely the current code needs to be changed to support that).
>>>
>>> Except we don't allow F_SEAL_INACCESSIBLE on a non-empty file, right?  So this won't work.
>>
>> Hmm, right, if we set F_SEAL_INACCESSIBLE on a non-empty file, we will
>> need to make sure access to existing mmap-ed area should be prevented,
>> but that is hard.
>>
>>>
>>> In any case, the whole confidential VM initialization story is a bit buddy.  From the earlier emails, it sounds like ARM expects the host to fill in guest memory and measure it.  From my recollection of Intel's scheme (which may well be wrong, and I could easily be confusing it with SGX), TDX instead measures what is essentially a transcript of the series of operations that initializes the VM.  These are fundamentally not the same thing even if they accomplish the same end goal.  For TDX, we unavoidably need an operation (ioctl or similar) that initializes things according to the VM's instructions, and ARM ought to be able to use roughly the same mechanism.
>>
>> Yes, TDX requires a ioctl. Steven may comment on the ARM part.
> 
> The Arm story is evolving so I can't give a definite answer yet. Our
> current prototyping works by creating the initial VM content in a
> memslot as with a normal VM and then calling an ioctl which throws the
> big switch and converts all the (populated) pages to be protected. At
> this point the RMM performs a measurement of the data that the VM is
> being populated with.
> 
> The above (in our prototype) suffers from all the expected problems with
> a malicious VMM being able to trick the host kernel into accessing those
> pages after they have been protected (causing a fault detected by the
> hardware).
> 
> The ideal (from our perspective) approach would be to follow the same
> flow but where the VMM populates a memfd rather than normal anonymous
> pages. The memfd could then be sealed and the pages converted to
> protected ones (with the RMM measuring them in the process).
> 
> The question becomes how is that memfd populated? It would be nice if
> that could be done using normal operations on a memfd (i.e. using
> mmap()) and therefore this code could be (relatively) portable. This
> would mean that any pages mapped from the memfd would either need to
> block the sealing or be revoked at the time of sealing.
> 
> The other approach is we could of course implement a special ioctl which
> effectively does a memcpy into the (created empty and sealed) memfd and
> does the necessary dance with the RMM to measure the contents. This
> would match the "transcript of the series of operations" described above
> - but seems much less ideal from the viewpoint of the VMM.

A VMM that supports Other Vendors will need to understand this sort of 
model regardless.

I don't particularly mind the idea of having the kernel consume a normal 
memfd and spit out a new object, but I find the concept of changing the 
type of the object in place, even if it has other references, and trying 
to control all the resulting races to be somewhat alarming.

In pseudo-Rust, this is the difference between:

fn convert_to_private(in: &mut Memfd)

and

fn convert_to_private(in: Memfd) -> PrivateMemoryFd

This doesn't map particularly nicely to the kernel, though.

--Andy\
Chao Peng March 7, 2022, 1:26 p.m. UTC | #9
On Fri, Mar 04, 2022 at 11:24:30AM -0800, Andy Lutomirski wrote:
> On 2/23/22 04:05, Steven Price wrote:
> > On 23/02/2022 11:49, Chao Peng wrote:
> > > On Thu, Feb 17, 2022 at 11:09:35AM -0800, Andy Lutomirski wrote:
> > > > On Thu, Feb 17, 2022, at 5:06 AM, Chao Peng wrote:
> > > > > On Fri, Feb 11, 2022 at 03:33:35PM -0800, Andy Lutomirski wrote:
> > > > > > On 1/18/22 05:21, Chao Peng wrote:
> > > > > > > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > > > > > > 
> > > > > > > Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> > > > > > > the file is inaccessible from userspace through ordinary MMU access
> > > > > > > (e.g., read/write/mmap). However, the file content can be accessed
> > > > > > > via a different mechanism (e.g. KVM MMU) indirectly.
> > > > > > > 
> > > > > > > It provides semantics required for KVM guest private memory support
> > > > > > > that a file descriptor with this seal set is going to be used as the
> > > > > > > source of guest memory in confidential computing environments such
> > > > > > > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> > > > > > > 
> > > > > > > At this time only shmem implements this seal.
> > > > > > > 
> > > > > > 
> > > > > > I don't dislike this *that* much, but I do dislike this. F_SEAL_INACCESSIBLE
> > > > > > essentially transmutes a memfd into a different type of object.  While this
> > > > > > can apparently be done successfully and without races (as in this code),
> > > > > > it's at least awkward.  I think that either creating a special inaccessible
> > > > > > memfd should be a single operation that create the correct type of object or
> > > > > > there should be a clear justification for why it's a two-step process.
> > > > > 
> > > > > Now one justification maybe from Stever's comment to patch-00: for ARM
> > > > > usage it can be used with creating a normal memfd, (partially)populate
> > > > > it with initial guest memory content (e.g. firmware), and then
> > > > > F_SEAL_INACCESSIBLE it just before the first time lunch of the guest in
> > > > > KVM (definitely the current code needs to be changed to support that).
> > > > 
> > > > Except we don't allow F_SEAL_INACCESSIBLE on a non-empty file, right?  So this won't work.
> > > 
> > > Hmm, right, if we set F_SEAL_INACCESSIBLE on a non-empty file, we will
> > > need to make sure access to existing mmap-ed area should be prevented,
> > > but that is hard.
> > > 
> > > > 
> > > > In any case, the whole confidential VM initialization story is a bit buddy.  From the earlier emails, it sounds like ARM expects the host to fill in guest memory and measure it.  From my recollection of Intel's scheme (which may well be wrong, and I could easily be confusing it with SGX), TDX instead measures what is essentially a transcript of the series of operations that initializes the VM.  These are fundamentally not the same thing even if they accomplish the same end goal.  For TDX, we unavoidably need an operation (ioctl or similar) that initializes things according to the VM's instructions, and ARM ought to be able to use roughly the same mechanism.
> > > 
> > > Yes, TDX requires a ioctl. Steven may comment on the ARM part.
> > 
> > The Arm story is evolving so I can't give a definite answer yet. Our
> > current prototyping works by creating the initial VM content in a
> > memslot as with a normal VM and then calling an ioctl which throws the
> > big switch and converts all the (populated) pages to be protected. At
> > this point the RMM performs a measurement of the data that the VM is
> > being populated with.
> > 
> > The above (in our prototype) suffers from all the expected problems with
> > a malicious VMM being able to trick the host kernel into accessing those
> > pages after they have been protected (causing a fault detected by the
> > hardware).
> > 
> > The ideal (from our perspective) approach would be to follow the same
> > flow but where the VMM populates a memfd rather than normal anonymous
> > pages. The memfd could then be sealed and the pages converted to
> > protected ones (with the RMM measuring them in the process).
> > 
> > The question becomes how is that memfd populated? It would be nice if
> > that could be done using normal operations on a memfd (i.e. using
> > mmap()) and therefore this code could be (relatively) portable. This
> > would mean that any pages mapped from the memfd would either need to
> > block the sealing or be revoked at the time of sealing.
> > 
> > The other approach is we could of course implement a special ioctl which
> > effectively does a memcpy into the (created empty and sealed) memfd and
> > does the necessary dance with the RMM to measure the contents. This
> > would match the "transcript of the series of operations" described above
> > - but seems much less ideal from the viewpoint of the VMM.
> 
> A VMM that supports Other Vendors will need to understand this sort of model
> regardless.
> 
> I don't particularly mind the idea of having the kernel consume a normal
> memfd and spit out a new object, but I find the concept of changing the type
> of the object in place, even if it has other references, and trying to
> control all the resulting races to be somewhat alarming.
> 
> In pseudo-Rust, this is the difference between:
> 
> fn convert_to_private(in: &mut Memfd)
> 
> and
> 
> fn convert_to_private(in: Memfd) -> PrivateMemoryFd
> 
> This doesn't map particularly nicely to the kernel, though.

I understand this Rust semantics and the difficulty to handle races.
Probably we should not expose F_SEAL_INACCESSIBLE to userspace, instead
we can use a new in-kernel flag to indicate the same thing. That flag
should be set only when the memfd is created with MFD_INACCESSIBLE.

Chao
> 
> --Andy\
Paolo Bonzini March 8, 2022, 12:17 p.m. UTC | #10
On 3/7/22 14:26, Chao Peng wrote:
>> In pseudo-Rust, this is the difference between:
>>
>> fn convert_to_private(in: &mut Memfd)
>>
>> and
>>
>> fn convert_to_private(in: Memfd) -> PrivateMemoryFd
>>
>> This doesn't map particularly nicely to the kernel, though.
> I understand this Rust semantics and the difficulty to handle races.
> Probably we should not expose F_SEAL_INACCESSIBLE to userspace, instead
> we can use a new in-kernel flag to indicate the same thing. That flag
> should be set only when the memfd is created with MFD_INACCESSIBLE.

Yes, I like this.

Paolo
diff mbox series

Patch

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..09ef34754dfa 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@ 
 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
 #define F_SEAL_WRITE	0x0008	/* prevent writes */
 #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
+#define F_SEAL_INACCESSIBLE	0x0020  /* prevent ordinary MMU access (e.g. read/write/mmap) to file content */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..72185630e7c4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1098,6 +1098,13 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 			return -EPERM;
 
+		if (info->seals & F_SEAL_INACCESSIBLE) {
+			if(i_size_read(inode))
+				return -EPERM;
+			if (newsize & ~PAGE_MASK)
+				return -EINVAL;
+		}
+
 		if (newsize != oldsize) {
 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 					oldsize, newsize);
@@ -1364,6 +1371,8 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		goto redirty;
 	if (!total_swap_pages)
 		goto redirty;
+	if (info->seals & F_SEAL_INACCESSIBLE)
+		goto redirty;
 
 	/*
 	 * Our capabilities prevent regular writeback or sync from ever calling
@@ -2262,6 +2271,9 @@  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
+	if (info->seals & F_SEAL_INACCESSIBLE)
+		return -EPERM;
+
 	/* arm64 - allow memory tagging on RAM-based files */
 	vma->vm_flags |= VM_MTE_ALLOWED;
 
@@ -2459,12 +2471,15 @@  shmem_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 
 	/* i_rwsem is held by caller */
-	if (unlikely(info->seals & (F_SEAL_GROW |
-				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+	if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
+				    F_SEAL_FUTURE_WRITE |
+				    F_SEAL_INACCESSIBLE))) {
 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
 			return -EPERM;
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
+		if (info->seals & F_SEAL_INACCESSIBLE)
+			return -EPERM;
 	}
 
 	return shmem_getpage(inode, index, pagep, SGP_WRITE);
@@ -2538,6 +2553,21 @@  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		end_index = i_size >> PAGE_SHIFT;
 		if (index > end_index)
 			break;
+
+		/*
+		 * inode_lock protects setting up seals as well as write to
+		 * i_size. Setting F_SEAL_INACCESSIBLE only allowed with
+		 * i_size == 0.
+		 *
+		 * Check F_SEAL_INACCESSIBLE after i_size. It effectively
+		 * serialize read vs. setting F_SEAL_INACCESSIBLE without
+		 * taking inode_lock in read path.
+		 */
+		if (SHMEM_I(inode)->seals & F_SEAL_INACCESSIBLE) {
+			error = -EPERM;
+			break;
+		}
+
 		if (index == end_index) {
 			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset)
@@ -2663,6 +2693,12 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			goto out;
 		}
 
+		if ((info->seals & F_SEAL_INACCESSIBLE) &&
+		    (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;