
mm: Add new vma flag VM_LOCAL_CPU

Message ID 0efb5547-9250-6b6c-fe8e-cf4f44aaa5eb@netapp.com (mailing list archive)
State New, archived

Commit Message

Boaz Harrosh May 14, 2018, 5:28 p.m. UTC
On a call to mmap, an mmap provider (such as an FS) can set
this flag in vma->vm_flags.

The VM_LOCAL_CPU flag tells the kernel that the vma will be used
from a single core only, and therefore invalidation (flush_tlb) of
its PTE(s) need not be scheduled across all CPUs.

The motivation for this flag is the ZUFS project, where we want
to optimally map user-application buffers into a user-mode server,
execute the operation, and efficiently unmap.

In this project we utilize a per-core server thread, so everything
is kept local. If we use the regular zap_ptes() API, all CPUs
are scheduled for the unmap, even though in our case we know that we
have only used a single core. The regular zap_ptes adds a very large
latency to every operation and mostly kills the concurrency of the
overall system, because it imposes a serialization between all cores.

Some preliminary measurements on a 40-core machine:

	unpatched		patched
Threads	Op/s	Lat [us]	Op/s	Lat [us]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1	185391	4.9		200799	4.6
2	197993	9.6		314321	5.9
4	310597	12.1		565574	6.6
8	546702	13.8		1113138	6.6
12	641728	17.2		1598451	6.8
18	744750	22.2		1648689	7.8
24	790805	28.3		1702285	8
36	849763	38.9		1783346	13.4
48	792000	44.6		1741873	17.4

We can clearly see that on an unpatched kernel we do not scale,
and the threads interfere with each other. This is because the
TLB flush is scheduled on all (other) CPUs.

NOTE: A VM_LOCAL_CPU vma is never used during a page fault. It is
always used synchronously from a thread pinned to a single core.

Signed-off-by: Boaz Harrosh <boazh@netapp.com>
---
 arch/x86/mm/tlb.c  |  3 ++-
 fs/proc/task_mmu.c |  3 +++
 include/linux/mm.h |  3 +++
 mm/memory.c        | 13 +++++++++++--
 4 files changed, 19 insertions(+), 3 deletions(-)
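
For reference, a minimal sketch of how an mmap provider could set this
flag at ->mmap() time (the handler and vm_ops names here are illustrative,
not from the ZUFS code; only VM_LOCAL_CPU itself is introduced by this
patch):

	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* The provider guarantees this vma is only ever touched
		 * from one core-pinned thread, so flush_tlb_mm_range()
		 * may skip the flush_tlb_others() cross-CPU shootdown.
		 */
		vma->vm_flags |= VM_LOCAL_CPU;
		vma->vm_ops = &example_vm_ops;
		return 0;
	}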

Comments

Boaz Harrosh May 14, 2018, 6:26 p.m. UTC | #1
On 14/05/18 20:28, Boaz Harrosh wrote:
> 
> On a call to mmap, an mmap provider (such as an FS) can set
> this flag in vma->vm_flags.
> 
> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> from a single core only, and therefore invalidation (flush_tlb) of
> its PTE(s) need not be scheduled across all CPUs.
> 
> The motivation for this flag is the ZUFS project, where we want
> to optimally map user-application buffers into a user-mode server,
> execute the operation, and efficiently unmap.
> 

I am pushing for this patch ahead of the ZUFS push, because it is
the only patch we need on top of an otherwise standard kernel.

We are partnering with distro(s) to push ZUFS out-of-tree to beta clients,
to try to stabilize such a big project before final submission and
an ABI / on-disk freeze.

By itself this patch has zero risk and cannot break anything.

Thanks
Boaz

> In this project we utilize a per-core server thread, so everything
> is kept local. If we use the regular zap_ptes() API, all CPUs
> are scheduled for the unmap, even though in our case we know that we
> have only used a single core. The regular zap_ptes adds a very large
> latency to every operation and mostly kills the concurrency of the
> overall system, because it imposes a serialization between all cores.
> 
> Some preliminary measurements on a 40-core machine:
> 
> 	unpatched		patched
> Threads	Op/s	Lat [us]	Op/s	Lat [us]
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 1	185391	4.9		200799	4.6
> 2	197993	9.6		314321	5.9
> 4	310597	12.1		565574	6.6
> 8	546702	13.8		1113138	6.6
> 12	641728	17.2		1598451	6.8
> 18	744750	22.2		1648689	7.8
> 24	790805	28.3		1702285	8
> 36	849763	38.9		1783346	13.4
> 48	792000	44.6		1741873	17.4
> 
> We can clearly see that on an unpatched kernel we do not scale,
> and the threads interfere with each other. This is because the
> TLB flush is scheduled on all (other) CPUs.
> 
> NOTE: A VM_LOCAL_CPU vma is never used during a page fault. It is
> always used synchronously from a thread pinned to a single core.
> 
> Signed-off-by: Boaz Harrosh <boazh@netapp.com>
> ---
>  arch/x86/mm/tlb.c  |  3 ++-
>  fs/proc/task_mmu.c |  3 +++
>  include/linux/mm.h |  3 +++
>  mm/memory.c        | 13 +++++++++++--
>  4 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index e055d1a..1d398a0 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -640,7 +640,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>  		local_irq_enable();
>  	}
>  
> -	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
> +	if (!(vmflag & VM_LOCAL_CPU) &&
> +	    cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
>  		flush_tlb_others(mm_cpumask(mm), &info);
>  
>  	put_cpu();
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index c486ad4..305d6e4 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -680,6 +680,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
>  		[ilog2(VM_PKEY_BIT2)]	= "",
>  		[ilog2(VM_PKEY_BIT3)]	= "",
>  #endif
> +#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
> +		[ilog2(VM_LOCAL_CPU)]	= "lc",
> +#endif
>  	};
>  	size_t i;
>  
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1ac1f06..3d14107 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -226,6 +226,9 @@ extern unsigned int kobjsize(const void *objp);
>  #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
>  #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
>  #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
> +#define VM_LOCAL_CPU	BIT(37)		/* FIXME: Needs to move from here */
> +#else /* ! CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
> +#define VM_LOCAL_CPU	0		/* FIXME: Needs to move from here */
>  #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>  
>  #if defined(CONFIG_X86)
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f5464..6236f5e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1788,6 +1788,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  	int retval;
>  	pte_t *pte, entry;
>  	spinlock_t *ptl;
> +	bool need_flush = false;
>  
>  	retval = -ENOMEM;
>  	pte = get_locked_pte(mm, addr, &ptl);
> @@ -1795,7 +1796,12 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  		goto out;
>  	retval = -EBUSY;
>  	if (!pte_none(*pte)) {
> -		if (mkwrite) {
> +		if ((vma->vm_flags & VM_LOCAL_CPU)) {
> +			/* VM_LOCAL_CPU is set: a single CPU is allowed to not
> +			 * go through zap_vma_ptes before changing a pte
> +			 */
> +			need_flush = true;
> +		} else if (mkwrite) {
>  			/*
>  			 * For read faults on private mappings the PFN passed
>  			 * in may not match the PFN we have mapped if the
> @@ -1807,8 +1813,9 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  				goto out_unlock;
>  			entry = *pte;
>  			goto out_mkwrite;
> -		} else
> +		} else {
>  			goto out_unlock;
> +		}
>  	}
>  
>  	/* Ok, finally just insert the thing.. */
> @@ -1824,6 +1831,8 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  	}
>  
>  	set_pte_at(mm, addr, pte, entry);
> +	if (need_flush)
> +		flush_tlb_range(vma, addr, addr + PAGE_SIZE);
>  	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
>  
>  	retval = 0;
>
Matthew Wilcox May 14, 2018, 7:15 p.m. UTC | #2
On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
> On a call to mmap, an mmap provider (such as an FS) can set
> this flag in vma->vm_flags.
> 
> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> from a single core only, and therefore invalidation (flush_tlb) of
> its PTE(s) need not be scheduled across all CPUs.

I still don't get this.  You're opening the kernel up to being exploited
by any application which can persuade it to set this flag on a VMA.

> NOTE: A VM_LOCAL_CPU vma is never used during a page fault. It is
> always used synchronously from a thread pinned to a single core.

It's not a question of how your app is going to use this flag.  It's a
question about how another app can abuse this flag (or how your app is
going to be exploited to abuse this flag) to break into the kernel.
Boaz Harrosh May 14, 2018, 7:37 p.m. UTC | #3
On 14/05/18 22:15, Matthew Wilcox wrote:
> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
>> On a call to mmap, an mmap provider (such as an FS) can set
>> this flag in vma->vm_flags.
>>
>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
>> from a single core only, and therefore invalidation (flush_tlb) of
>> its PTE(s) need not be scheduled across all CPUs.
> 
> I still don't get this.  You're opening the kernel up to being exploited
> by any application which can persuade it to set this flag on a VMA.
> 

No, no: this is not an application-accessible flag. It can only be set
by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).

Please see the zuf patches for usage. (Again, apologies for pushing
before a user.)

The mmap provider has all the facilities to know that this cannot be
abused, not even by a trusted Server.

>> NOTE: A VM_LOCAL_CPU vma is never used during a page fault. It is
>> always used synchronously from a thread pinned to a single core.
> 
> It's not a question of how your app is going to use this flag.  It's a
> question about how another app can abuse this flag (or how your app is
> going to be exploited to abuse this flag) to break into the kernel.
> 

If you look at the zuf user you will see that all faults on this vma
return SIGBUS; it is never legitimately faulted. The server has access
to this mapping from a single thread pinned to a core.

Again, it is not an app-visible flag in any way.

Thanks for looking
Boaz
Andrew Morton May 14, 2018, 9:49 p.m. UTC | #4
On Mon, 14 May 2018 20:28:01 +0300 Boaz Harrosh <boazh@netapp.com> wrote:

> On a call to mmap, an mmap provider (such as an FS) can set
> this flag in vma->vm_flags.
> 
> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> from a single core only, and therefore invalidation (flush_tlb) of
> its PTE(s) need not be scheduled across all CPUs.
> 
> The motivation for this flag is the ZUFS project, where we want
> to optimally map user-application buffers into a user-mode server,
> execute the operation, and efficiently unmap.
> 
> In this project we utilize a per-core server thread, so everything
> is kept local. If we use the regular zap_ptes() API, all CPUs
> are scheduled for the unmap, even though in our case we know that we
> have only used a single core. The regular zap_ptes adds a very large
> latency to every operation and mostly kills the concurrency of the
> overall system, because it imposes a serialization between all cores.

I'd have thought that in this situation, only the local CPU's bit is
set in the vma's mm_cpumask() and the remote invalidations are not
performed.  Is that a misunderstanding, or is all that stuff not working
correctly?
Matthew Wilcox May 15, 2018, 12:41 a.m. UTC | #5
On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
> On 14/05/18 22:15, Matthew Wilcox wrote:
> > On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
>> On a call to mmap, an mmap provider (such as an FS) can set
>> this flag in vma->vm_flags.
>>
>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
>> from a single core only, and therefore invalidation (flush_tlb) of
>> its PTE(s) need not be scheduled across all CPUs.
> > 
> > I still don't get this.  You're opening the kernel up to being exploited
> > by any application which can persuade it to set this flag on a VMA.
> > 
> 
> No, no: this is not an application-accessible flag. It can only be set
> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
> 
> Please see the zuf patches for usage. (Again, apologies for pushing
> before a user.)
> 
> The mmap provider has all the facilities to know that this cannot be
> abused, not even by a trusted Server.

I don't think page tables work the way you think they work.

+               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);

That doesn't just insert it into the local CPU's page table.  Any CPU
which directly accesses or even prefetches that address will also get
the translation into its cache.
Matthew Wilcox May 15, 2018, 12:44 a.m. UTC | #6
On Mon, May 14, 2018 at 02:49:01PM -0700, Andrew Morton wrote:
> On Mon, 14 May 2018 20:28:01 +0300 Boaz Harrosh <boazh@netapp.com> wrote:
> > In this project we utilize a per-core server thread, so everything
> > is kept local. If we use the regular zap_ptes() API, all CPUs
> > are scheduled for the unmap, even though in our case we know that we
> > have only used a single core. The regular zap_ptes adds a very large
> > latency to every operation and mostly kills the concurrency of the
> > overall system, because it imposes a serialization between all cores.
> 
> I'd have thought that in this situation, only the local CPU's bit is
> set in the vma's mm_cpumask() and the remote invalidations are not
> performed.  Is that a misunderstanding, or is all that stuff not working
> correctly?

I think you misunderstand Boaz's architecture.  He has one thread per CPU,
so every bit will be set in the mm's (not vma's) mm_cpumask.
Christoph Hellwig May 15, 2018, 7:08 a.m. UTC | #7
On Mon, May 14, 2018 at 09:26:13PM +0300, Boaz Harrosh wrote:
> I am pushing for this patch ahead of the ZUFS push, because it is
> the only patch we need on top of an otherwise standard kernel.
> 
> We are partnering with distro(s) to push ZUFS out-of-tree to beta clients,
> to try to stabilize such a big project before final submission and
> an ABI / on-disk freeze.
> 

Please stop this crap.  If you want any new kernel functionality send
it together with a user.  Even more so for something as questionable
and hairy as this.

With a stance like this you disqualify yourself.
Boaz Harrosh May 15, 2018, 10:43 a.m. UTC | #8
On 15/05/18 03:41, Matthew Wilcox wrote:
> On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
>> On 14/05/18 22:15, Matthew Wilcox wrote:
>>> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
>>>> On a call to mmap, an mmap provider (such as an FS) can set
>>>> this flag in vma->vm_flags.
>>>>
>>>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
>>>> from a single core only, and therefore invalidation (flush_tlb) of
>>>> its PTE(s) need not be scheduled across all CPUs.
>>>
>>> I still don't get this.  You're opening the kernel up to being exploited
>>> by any application which can persuade it to set this flag on a VMA.
>>>
>>
>> No, no: this is not an application-accessible flag. It can only be set
>> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
>>
>> Please see the zuf patches for usage. (Again, apologies for pushing
>> before a user.)
>>
>> The mmap provider has all the facilities to know that this cannot be
>> abused, not even by a trusted Server.
> 
> I don't think page tables work the way you think they work.
> 
> +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
> 
> That doesn't just insert it into the local CPU's page table.  Any CPU
> which directly accesses or even prefetches that address will also get
> the translation into its cache.
> 

Yes I know, but that is exactly the point of this flag. I know that this
address is only ever accessed from a single core, because it is an mmap (vma)
of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
only that thread any kind of access to this vma. Both the file handle and the
mmapped pointer are kept on the thread stack and are not accessible from outside.

So the whole point of this flag is the kernel driver telling mm that this
address is enforced to only be accessed from one core-pinned thread.

Thanks
Boaz
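
A rough userspace sketch of the access pattern described above (the mount
path and mapping size are illustrative placeholders, not taken from the
zus code):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <pthread.h>
	#include <sched.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* One such thread is started per core; fd and p never leave this
	 * stack frame, so no other thread can reach the mapping.
	 */
	static void *zt_thread(void *arg)
	{
		long cpu = (long)arg;
		size_t len = 1UL << 21;
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

		/* Unlinked, exclusive file on the FS that sets VM_LOCAL_CPU */
		int fd = open("/mnt/zufs", O_TMPFILE | O_RDWR | O_EXCL, 0600);
		ftruncate(fd, len);
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);

		/* ... map app buffers in, serve IO, unmap: all on this core ... */

		munmap(p, len);
		close(fd);
		return NULL;
	}

(Error handling is omitted for brevity.)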
Boaz Harrosh May 15, 2018, 10:45 a.m. UTC | #9
On 15/05/18 10:08, Christoph Hellwig wrote:
> On Mon, May 14, 2018 at 09:26:13PM +0300, Boaz Harrosh wrote:
>> I am pushing for this patch ahead of the ZUFS push, because it is
>> the only patch we need on top of an otherwise standard kernel.
>>
>> We are partnering with distro(s) to push ZUFS out-of-tree to beta clients,
>> to try to stabilize such a big project before final submission and
>> an ABI / on-disk freeze.
>>
> 
> Please stop this crap.  If you want any new kernel functionality send
> it together with a user.  Even more so for something as questionable
> and hairy as this.
> 
> With a stance like this you disqualify yourself.
> 

OK, thank you, I see your point. I will try to push a user ASAP.

Thanks
Boaz
Matthew Wilcox May 15, 2018, 11:11 a.m. UTC | #10
On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
> On 15/05/18 03:41, Matthew Wilcox wrote:
> > On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
> >> On 14/05/18 22:15, Matthew Wilcox wrote:
> >>> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
> >>>> On a call to mmap, an mmap provider (such as an FS) can set
> >>>> this flag in vma->vm_flags.
> >>>>
> >>>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> >>>> from a single core only, and therefore invalidation (flush_tlb) of
> >>>> its PTE(s) need not be scheduled across all CPUs.
> >>>
> >>> I still don't get this.  You're opening the kernel up to being exploited
> >>> by any application which can persuade it to set this flag on a VMA.
> >>>
> >>
> >> No, no: this is not an application-accessible flag. It can only be set
> >> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
> >>
> >> Please see the zuf patches for usage. (Again, apologies for pushing
> >> before a user.)
> >>
> >> The mmap provider has all the facilities to know that this cannot be
> >> abused, not even by a trusted Server.
> > 
> > I don't think page tables work the way you think they work.
> > 
> > +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
> > 
> > That doesn't just insert it into the local CPU's page table.  Any CPU
> > which directly accesses or even prefetches that address will also get
> > the translation into its cache.
> 
> Yes I know, but that is exactly the point of this flag. I know that this
> address is only ever accessed from a single core, because it is an mmap (vma)
> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
> only that thread any kind of access to this vma. Both the file handle and the
> mmapped pointer are kept on the thread stack and are not accessible from outside.
> 
> So the whole point of this flag is the kernel driver telling mm that this
> address is enforced to only be accessed from one core-pinned thread.

You're still thinking about this from the wrong perspective.  If you
were writing a program to attack this facility, how would you do it?
It's not exactly hard to leak one pointer's worth of information.
Boaz Harrosh May 15, 2018, 11:41 a.m. UTC | #11
On 15/05/18 14:11, Matthew Wilcox wrote:
> On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
>> On 15/05/18 03:41, Matthew Wilcox wrote:
>>> On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
>>>> On 14/05/18 22:15, Matthew Wilcox wrote:
>>>>> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
>>>>>> On a call to mmap, an mmap provider (such as an FS) can set
>>>>>> this flag in vma->vm_flags.
>>>>>>
>>>>>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
>>>>>> from a single core only, and therefore invalidation (flush_tlb) of
>>>>>> its PTE(s) need not be scheduled across all CPUs.
>>>>>
>>>>> I still don't get this.  You're opening the kernel up to being exploited
>>>>> by any application which can persuade it to set this flag on a VMA.
>>>>>
>>>>
>>>> No, no: this is not an application-accessible flag. It can only be set
>>>> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
>>>>
>>>> Please see the zuf patches for usage. (Again, apologies for pushing
>>>> before a user.)
>>>>
>>>> The mmap provider has all the facilities to know that this cannot be
>>>> abused, not even by a trusted Server.
>>>
>>> I don't think page tables work the way you think they work.
>>>
>>> +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
>>>
>>> That doesn't just insert it into the local CPU's page table.  Any CPU
>>> which directly accesses or even prefetches that address will also get
>>> the translation into its cache.
>>
>> Yes I know, but that is exactly the point of this flag. I know that this
>> address is only ever accessed from a single core, because it is an mmap (vma)
>> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
>> only that thread any kind of access to this vma. Both the file handle and the
>> mmapped pointer are kept on the thread stack and are not accessible from outside.
>>
>> So the whole point of this flag is the kernel driver telling mm that this
>> address is enforced to only be accessed from one core-pinned thread.
> 
> You're still thinking about this from the wrong perspective.  If you
> were writing a program to attack this facility, how would you do it?
> It's not exactly hard to leak one pointer's worth of information.
> 

That would be very hard. Because that program would:
- need to be root
- need to start and pretend it is the zus Server with the whole
  mount-thread thing, register a new filesystem, grab some pmem devices.
- Mount the said filesystem on said pmem. Create core-pinned ZT threads
  for all CPUs, start accepting IO.
- And only then can it start leaking the pointer and doing bad things.
  The bad things it can do are to the application, not to the kernel.
  And as a full filesystem it can do those bad things to the application
  directly through the front door, not needing the mismatched TLB at all.

That said, this brings up a very important point that I wanted to talk about.
In this design the zuf (kernel) and the zus (um server) are part of the
distribution. I would like to have the zus module signed by the distro
kernel's key and checked at load time. I know there is an effort by Red Hat
guys to try to sign all /sbin/* servers and have the kernel check these,
so this is not the first time people have thought about that.

Thanks
Boaz
Peter Zijlstra May 15, 2018, 11:47 a.m. UTC | #12
On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
> Yes I know, but that is exactly the point of this flag. I know that this
> address is only ever accessed from a single core, because it is an mmap (vma)
> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
> only that thread any kind of access to this vma. Both the file handle and the
> mmapped pointer are kept on the thread stack and are not accessible from outside.
> 
> So the whole point of this flag is the kernel driver telling mm that this
> address is enforced to only be accessed from one core-pinned thread.

What happens when the userspace part -- there is one, right, how else do
you get an mm to stick a vma in -- simply does a full address range
probe scan?

Something like this really needs a far more detailed Changelog that
explains how it's to be used and how it is impossible to abuse. Esp. that
latter is _very_ important.
Boaz Harrosh May 15, 2018, 11:54 a.m. UTC | #13
On 15/05/18 03:44, Matthew Wilcox wrote:
> On Mon, May 14, 2018 at 02:49:01PM -0700, Andrew Morton wrote:
>> On Mon, 14 May 2018 20:28:01 +0300 Boaz Harrosh <boazh@netapp.com> wrote:
>>> In this project we utilize a per-core server thread, so everything
>>> is kept local. If we use the regular zap_ptes() API, all CPUs
>>> are scheduled for the unmap, even though in our case we know that we
>>> have only used a single core. The regular zap_ptes adds a very large
>>> latency to every operation and mostly kills the concurrency of the
>>> overall system, because it imposes a serialization between all cores.
>>
>> I'd have thought that in this situation, only the local CPU's bit is
>> set in the vma's mm_cpumask() and the remote invalidations are not
>> performed.  Is that a misunderstanding, or is all that stuff not working
>> correctly?
> 
> I think you misunderstand Boaz's architecture.  He has one thread per CPU,
> so every bit will be set in the mm's (not vma's) mm_cpumask.
> 

Hi Andrew, Matthew

Yes I have been trying to investigate and trace this for days.
Please see the code below:

> #define flush_tlb_range(vma, start, end)	\
> 		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)

The mm_struct @mm below comes from here vma->vm_mm

> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index e055d1a..1d398a0 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -611,39 +611,40 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>  				unsigned long end, unsigned long vmflag)
>  {
>  	int cpu;
>  
>  	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
>  		.mm = mm,
>  	};
>  
>  	cpu = get_cpu();
>  
>  	/* This is also a barrier that synchronizes with switch_mm(). */
>  	info.new_tlb_gen = inc_mm_tlb_gen(mm);
>  
>  	/* Should we flush just the requested range? */
>  	if ((end != TLB_FLUSH_ALL) &&
>  	    !(vmflag & VM_HUGETLB) &&
>  	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
>  		info.start = start;
>  		info.end = end;
>  	} else {
>  		info.start = 0UL;
>  		info.end = TLB_FLUSH_ALL;
>  	}
>  
>  	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
>  		VM_WARN_ON(irqs_disabled());
>  		local_irq_disable();
>  		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
>  		local_irq_enable();
>  	}
>  
> -	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
> +	if (!(vmflag & VM_LOCAL_CPU) &&
> +	    cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
>  		flush_tlb_others(mm_cpumask(mm), &info);
>  

I have been tracing mm_cpumask(vma->vm_mm) in my driver at
different points: at vma creation (file_operations->mmap()),
and before the call to insert_pfn (which calls here).

At the beginning I was wishfully thinking that mm_cpumask(vma->vm_mm)
should have a single bit set, matching the affinity of the thread at
its creation. But then I saw that 80% of the time some
other random bits are also set.

Yes, random. The thread-affinity (single) bit was always set, but
then zero, one, or two more bits were set as well. I have never seen
more than two though, which baffles me a lot.

If it were as Matthew said, i.e. the cpumask of the whole process,
then I would expect all the bits to be set, because I have a thread
on each core. And I would also expect all vma->vm_mm,
or at least mm_cpumask(vma->vm_mm), to point to the same global object.
But it was not so: it was pointing to some thread-unique object, yet
those phantom bits were still set all over. (And I am almost sure
the same vma had those bits change over time.)

So I would love for some mm guy to explain where those bits are collected.
But I do not think this is a kernel bug, because as Matthew showed,
the vma above can and is allowed to leak memory addresses to other
threads / cores in the same process. So it appears that the kernel
has some correct logic behind its madness.

Which brings me to another question: how can I find from
within a thread, say at the file_operations->mmap() call, that the thread
is indeed core-pinned? What mm_cpumask should I inspect?

>  	put_cpu();
>  }

Thanks
Boaz
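
(The tracing mentioned above was of this kind; a sketch, assuming a debug
print in the driver's ->mmap() path:)

	/* Dump which CPU bits are set for this vma's mm; "%*pbl" prints
	 * a cpumask as a CPU list, e.g. "0,3-4".
	 */
	pr_debug("zuf: cpu=%d mm_cpumask=%*pbl\n",
		 raw_smp_processor_id(),
		 cpumask_pr_args(mm_cpumask(vma->vm_mm)));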
Boaz Harrosh May 15, 2018, 12:01 p.m. UTC | #14
On 15/05/18 14:47, Peter Zijlstra wrote:
> On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
>> Yes I know, but that is exactly the point of this flag. I know that this
>> address is only ever accessed from a single core, because it is an mmap (vma)
>> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
>> only that thread any kind of access to this vma. Both the file handle and the
>> mmapped pointer are kept on the thread stack and are not accessible from outside.
>>
>> So the whole point of this flag is the kernel driver telling mm that this
>> address is enforced to only be accessed from one core-pinned thread.
> 
> What happens when the userspace part -- there is one, right, how else do
> you get an mm to stick a vma in -- simply does a full address range
> probe scan?
> 
> Something like this really needs a far more detailed Changelog that
> explains how it's to be used and how it is impossible to abuse. Esp. that
> latter is _very_ important.
> 

Thank you, yes. I will try to capture all of this thread in the commit
message and, as Christoph demanded, supply user code to demonstrate usage.

Thank you for looking
Boaz
Matthew Wilcox May 15, 2018, 12:03 p.m. UTC | #15
On Tue, May 15, 2018 at 02:41:41PM +0300, Boaz Harrosh wrote:
> That would be very hard. Because that program would:
> - need to be root
> - need to start and pretend it is the zus Server with the whole
>   mount-thread thing, register a new filesystem, grab some pmem devices.
> - Mount the said filesystem on said pmem. Create core-pinned ZT threads
>   for all CPUs, start accepting IO.
> - And only then can it start leaking the pointer and doing bad things.

All of these things you've done for me by writing zus Server.  All I
have to do now is compromise zus Server.

>   The bad things it can do are to the application, not to the kernel.
>   And as a full filesystem it can do those bad things to the application
>   directly through the front door, not needing the mismatched TLB at all.

That's not true.  When I have a TLB entry that points to a page of kernel
ram, I can do almost anything, depending on what the kernel decides to
do with that ram next.  Maybe it's page cache again, in which case I can
affect whatever application happens to get it allocated.  Maybe it's a
kmalloc page next, in which case I can affect any part of the kernel.
Maybe it's a page table, then I can affect any process.

> That said, this brings up a very important point that I wanted to talk about.
> In this design the zuf (kernel) and the zus (um server) are part of the
> distribution. I would like to have the zus module signed by the distro
> kernel's key and checked at load time. I know there is an effort by Red Hat
> guys to try to sign all /sbin/* servers and have the kernel check these,
> so this is not the first time people have thought about that.

You're getting dangerously close to admitting that the entire point
of this exercise is so that you can link non-GPL NetApp code into the
kernel in clear violation of the GPL.
Mark Rutland May 15, 2018, 12:07 p.m. UTC | #16
On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
> On 15/05/18 03:41, Matthew Wilcox wrote:
> > On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
> >> On 14/05/18 22:15, Matthew Wilcox wrote:
> >>> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
> >>>> On a call to mmap, an mmap provider (such as an FS) can set
> >>>> this flag in vma->vm_flags.
> >>>>
> >>>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> >>>> from a single core only, and therefore invalidation (flush_tlb) of
> >>>> its PTE(s) need not be scheduled across all CPUs.
> >>>
> >>> I still don't get this.  You're opening the kernel up to being exploited
> >>> by any application which can persuade it to set this flag on a VMA.
> >>>
> >>
> >> No, no: this is not an application-accessible flag. It can only be set
> >> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
> >>
> >> Please see the zuf patches for usage. (Again, apologies for pushing
> >> before a user.)
> >>
> >> The mmap provider has all the facilities to know that this cannot be
> >> abused, not even by a trusted Server.
> > 
> > I don't think page tables work the way you think they work.
> > 
> > +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
> > 
> > That doesn't just insert it into the local CPU's page table.  Any CPU
> > which directly accesses or even prefetches that address will also get
> > the translation into its cache.
> > 
> 
> Yes I know, but that is exactly the point of this flag. I know that this
> address is only ever accessed from a single core, because it is an mmap (vma)
> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
> only that thread any kind of access to this vma. Both the file handle and the
> mmapped pointer are kept on the thread stack and are not accessible from outside.

Even if (in the specific context of your application) software on other
cores might not explicitly access this area, that does not prevent
allocations into TLBs, and TLB maintenance *cannot* be elided.

Even assuming that software *never* explicitly accesses an address which
it has not mapped is insufficient.

For example, imagine you have two threads, each pinned to a CPU, and
some local_cpu_{mmap,munmap} which uses your new flag:

	CPU0				CPU1
	x = local_cpu_mmap(...);
	do_things_with(x);
					// speculatively allocates TLB
					// entries for X.

	// only invalidates local TLBs
	local_cpu_munmap(x);

					// TLB entries for X still live
	
					y = local_cpu_mmap(...);

					// if y == x, we can hit the
					// stale TLB entry, and access
					// the wrong page
					do_things_with(y);

Consider that after we free x, the kernel could reuse the page for any
purpose (e.g. kernel page tables), so this is a major risk.

This flag simply is not safe, unless the *entire* mm is only ever
accessed from a single CPU. In that case, we don't need the flag anyway,
as the mm already has a cpumask.

Thanks,
Mark.
Peter Zijlstra May 15, 2018, 12:09 p.m. UTC | #17
On Tue, May 15, 2018 at 02:41:41PM +0300, Boaz Harrosh wrote:
> On 15/05/18 14:11, Matthew Wilcox wrote:

> > You're still thinking about this from the wrong perspective.  If you
> > were writing a program to attack this facility, how would you do it?
> > It's not exactly hard to leak one pointer's worth of information.
> > 
> 
> That would be very hard. Because that program would:
> - need to be root
> - need to start and pretend it is the zus Server with the whole
>   mount-thread thing, register a new filesystem, grab some pmem devices.
> - Mount the said filesystem on said pmem. Create core-pinned ZT threads
>   for all CPUs, start accepting IO.
> - And only then can it start leaking the pointer and doing bad things.
>   The bad things it can do are to the application, not to the kernel.

No, I think you can do bad things to the kernel at that point. Consider
it populating the TLBs on the 'wrong' CPU by 'inadvertently' touching
'random' memory.

Then cause an invalidation and get the page re-used for kernel bits.

Then access that page through the 'stale' TLB entry we still have on the
'wrong' CPU and corrupt kernel data.
Boaz Harrosh May 15, 2018, 12:31 p.m. UTC | #18
On 15/05/18 15:09, Peter Zijlstra wrote:
> On Tue, May 15, 2018 at 02:41:41PM +0300, Boaz Harrosh wrote:
>> On 15/05/18 14:11, Matthew Wilcox wrote:
> 
>>> You're still thinking about this from the wrong perspective.  If you
>>> were writing a program to attack this facility, how would you do it?
>>> It's not exactly hard to leak one pointer's worth of information.
>>>
>>
>> That would be very hard. Because that program would:
>> - need to be root
>> - need to start and pretend it is the zus Server with the whole
>>   mount-thread thing, register a new filesystem, grab some pmem devices.
>> - Mount the said filesystem on said pmem. Create core-pinned ZT threads
>>   for all CPUs, start accepting IO.
>> - And only then can it start leaking the pointer and doing bad things.
>>   The bad things it can do are to the application, not to the kernel.
> 
> No, I think you can do bad things to the kernel at that point. Consider
> it populating the TLBs on the 'wrong' CPU by 'inadvertently' touching
> 'random' memory.
> 
> Then cause an invalidation and get the page re-used for kernel bits.
> 
> Then access that page through the 'stale' TLB entry we still have on the
> 'wrong' CPU and corrupt kernel data.
> 

Yes, a BAD filesystem Server can do bad things, I agree. But a filesystem
can do very bad things in any case, through the front door, no? And we
trust it with our data. So there is some trust we already put in a
filesystem, I think.

I will try to look at this more deeply, to see if I can actually enforce
this policy. Do you have any ideas? Can I force page faults on the other cores?

Thank you for looking
Boaz
Peter Zijlstra May 15, 2018, 12:35 p.m. UTC | #19
On Tue, May 15, 2018 at 01:07:51PM +0100, Mark Rutland wrote:
> 					// speculatively allocates TLB

Ohh, right, I completely forgot about that, but that actually does
happen. We had trouble with AMD doing just that only about a year ago or
so IIRC.

CPUs are completely free to speculatively load TLB entries for pages
they never actually end up touching (typically prefetcher-based), as
long as there are valid page tables for them at the time.

So yes, you're right.
Boaz Harrosh May 15, 2018, 1:19 p.m. UTC | #20
On 15/05/18 15:07, Mark Rutland wrote:
> On Tue, May 15, 2018 at 01:43:23PM +0300, Boaz Harrosh wrote:
>> On 15/05/18 03:41, Matthew Wilcox wrote:
>>> On Mon, May 14, 2018 at 10:37:38PM +0300, Boaz Harrosh wrote:
>>>> On 14/05/18 22:15, Matthew Wilcox wrote:
>>>>> On Mon, May 14, 2018 at 08:28:01PM +0300, Boaz Harrosh wrote:
>>>>>> On a call to mmap, an mmap provider (such as an FS) can set
>>>>>> this flag in vma->vm_flags.
>>>>>>
>>>>>> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
>>>>>> from a single core only, and therefore invalidation (flush_tlb) of
>>>>>> its PTE(s) need not be scheduled across all CPUs.
>>>>>
>>>>> I still don't get this.  You're opening the kernel up to being exploited
>>>>> by any application which can persuade it to set this flag on a VMA.
>>>>>
>>>>
>>>> No, no: this is not an application-accessible flag. It can only be set
>>>> by the mmap implementor at ->mmap() time (say, same as VM_MIXEDMAP).
>>>>
>>>> Please see the zuf patches for usage. (Again, apologies for pushing
>>>> before a user.)
>>>>
>>>> The mmap provider has all the facilities to know that this cannot be
>>>> abused, not even by a trusted Server.
>>>
>>> I don't think page tables work the way you think they work.
>>>
>>> +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
>>>
>>> That doesn't just insert it into the local CPU's page table.  Any CPU
>>> which directly accesses or even prefetches that address will also get
>>> the translation into its cache.
>>>
>>
>> Yes I know, but that is exactly the point of this flag. I know that this
>> address is only ever accessed from a single core, because it is an mmap (vma)
>> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
>> only that thread any kind of access to this vma. Both the file handle and the
>> mmapped pointer are kept on the thread stack and are not accessible from outside.
> 
> Even if (in the specific context of your application) software on other
> cores might not explicitly access this area, that does not prevent
> allocations into TLBs, and TLB maintenance *cannot* be elided.
> 
> Even assuming that software *never* explicitly accesses an address which
> it has not mapped is insufficient.
> 
> For example, imagine you have two threads, each pinned to a CPU, and
> some local_cpu_{mmap,munmap} which uses your new flag:
> 
> 	CPU0				CPU1
> 	x = local_cpu_mmap(...);
> 	do_things_with(x);
> 					// speculatively allocates TLB
> 					// entries for X.
> 
> 	// only invalidates local TLBs
> 	local_cpu_munmap(x);
> 
> 					// TLB entries for X still live
> 	
> 					y = local_cpu_mmap(...);
> 
> 					// if y == x, we can hit the

But this y == x is not possible. The x here is held throughout the
lifetime of the CPU0-pinned thread, and cannot be allocated later to
another thread.

In fact, if the file holding the VMA closes, we know the server
crashed, and we cleanly close everything
(including properly zapping all maps).

> 					// stale TLB entry, and access
> 					// the wrong page
> 					do_things_with(y);
> 

So even if the CPU prefetched that TLB entry, no one in the system will
use this address until a proper close, where everything is properly flushed.

> Consider that after we free x, the kernel could reuse the page for any
> purpose (e.g. kernel page tables), so this is a major risk.
> 

So yes, we never free x. We hold it for the entire duration of the ZT-thread
(a ZThread is that per-core core-pinned thread we are using),
and each time we map some application buffers into that vma and do a local
TLB invalidate when done.

When x is deallocated, due to a close or a crash, it is all properly zapped.

> This flag simply is not safe, unless the *entire* mm is only ever
> accessed from a single CPU. In that case, we don't need the flag anyway,
> as the mm already has a cpumask.
> 

Did you see the other part of the thread, and my answer to Andrew, about
why the vma->vm_mm cpumask is so weird? It is neither all bits set nor
a single bit set. It is very common (20% of the time) for mm_cpumask(vma->vm_mm)
to have a single bit set, exactly in an application where I have a thread
per core. Please look at that? (I'll ping you from that email.)

> Thanks,
> Mark.
> 

Thanks
Boaz
Boaz Harrosh May 15, 2018, 1:24 p.m. UTC | #21
On 15/05/18 14:54, Boaz Harrosh wrote:
> On 15/05/18 03:44, Matthew Wilcox wrote:
>> On Mon, May 14, 2018 at 02:49:01PM -0700, Andrew Morton wrote:
>>> On Mon, 14 May 2018 20:28:01 +0300 Boaz Harrosh <boazh@netapp.com> wrote:
>>>> In this project we utilize a per-core server thread, so everything
>>>> is kept local. If we use the regular zap_ptes() API, all CPUs
>>>> are scheduled for the unmap, even though in our case we know that we
>>>> have only used a single core. The regular zap_ptes adds a very large
>>>> latency to every operation and mostly kills the concurrency of the
>>>> overall system, because it imposes a serialization between all cores.
>>>
>>> I'd have thought that in this situation, only the local CPU's bit is
>>> set in the vma's mm_cpumask() and the remote invalidations are not
>>> performed.  Is that a misunderstanding, or is all that stuff not working
>>> correctly?
>>
>> I think you misunderstand Boaz's architecture.  He has one thread per CPU,
>> so every bit will be set in the mm's (not vma's) mm_cpumask.
>>
> 
> Hi Andrew, Matthew
> 
> Yes I have been trying to investigate and trace this for days.
> Please see the code below:
> 
>> #define flush_tlb_range(vma, start, end)	\
>> 		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
> 
> The mm_struct @mm below comes from here vma->vm_mm
> 
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index e055d1a..1d398a0 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -611,39 +611,40 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
>>  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
>>  				unsigned long end, unsigned long vmflag)
>>  {
>>  	int cpu;
>>  
>>  	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
>>  		.mm = mm,
>>  	};
>>  
>>  	cpu = get_cpu();
>>  
>>  	/* This is also a barrier that synchronizes with switch_mm(). */
>>  	info.new_tlb_gen = inc_mm_tlb_gen(mm);
>>  
>>  	/* Should we flush just the requested range? */
>>  	if ((end != TLB_FLUSH_ALL) &&
>>  	    !(vmflag & VM_HUGETLB) &&
>>  	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
>>  		info.start = start;
>>  		info.end = end;
>>  	} else {
>>  		info.start = 0UL;
>>  		info.end = TLB_FLUSH_ALL;
>>  	}
>>  
>>  	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
>>  		VM_WARN_ON(irqs_disabled());
>>  		local_irq_disable();
>>  		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
>>  		local_irq_enable();
>>  	}
>>  
>> -	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
>> +	if (!(vmflag & VM_LOCAL_CPU) &&
>> +	    cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
>>  		flush_tlb_others(mm_cpumask(mm), &info);
>>  
> 
> I have been tracing mm_cpumask(vma->vm_mm) in my driver at
> different points: at vma creation (file_operations->mmap()),
> and before the call to insert_pfn (which calls here).
> 
> At the beginning I was wishfully thinking that mm_cpumask(vma->vm_mm)
> should have a single bit set, matching the affinity of the thread at
> its creation. But then I saw that 80% of the time some
> other random bits are also set.
> 
> Yes, random. The thread-affinity (single) bit was always set, but
> then zero, one, or two more bits were set as well. I have never seen
> more than two though, which baffles me a lot.
> 
> If it were as Matthew said, i.e. the cpumask of the whole process,
> then I would expect all the bits to be set, because I have a thread
> on each core. And I would also expect all vma->vm_mm,
> or at least mm_cpumask(vma->vm_mm), to point to the same global object.
> But it was not so: it was pointing to some thread-unique object, yet
> those phantom bits were still set all over. (And I am almost sure
> the same vma had those bits change over time.)
> 
> So I would love for some mm guy to explain where those bits are collected.
> But I do not think this is a kernel bug, because as Matthew showed,
> the vma above can and is allowed to leak memory addresses to other
> threads / cores in the same process. So it appears that the kernel
> has some correct logic behind its madness.
> 
Hi Mark

So you see, 20% of the time mm_cpumask(vma->vm_mm) has a single
bit set, and a natural call to flush_tlb_range will only invalidate
the local CPU. Are you familiar with this logic?

> Which brings me to another question: how can I find from
> within a thread, say at the file_operations->mmap() call, that the thread
> is indeed core-pinned? What mm_cpumask should I inspect?
> 

Mark, Peter, do you know how I can check the above?

Thanks
Boaz

>>  	put_cpu();
>>  }
> 
> Thanks
> Boaz
>
Boaz Harrosh May 15, 2018, 1:29 p.m. UTC | #22
On 15/05/18 15:03, Matthew Wilcox wrote:
> On Tue, May 15, 2018 at 02:41:41PM +0300, Boaz Harrosh wrote:
>> That would be very hard. Because that program would:
>> - need to be root
>> - need to start and pretend it is the zus Server with the whole
>>   mount-thread thing, register a new filesystem, grab some pmem devices.
>> - Mount the said filesystem on said pmem. Create core-pinned ZT threads
>>   for all CPUs, start accepting IO.
>> - And only then can it start leaking the pointer and doing bad things.
> 
> All of these things you've done for me by writing zus Server.  All I
> have to do now is compromise zus Server.
> 
>>   The bad things it can do are to the application, not to the kernel.
>>   And as a full filesystem it can do those bad things to the application
>>   directly through the front door, not needing the mismatched TLB at all.
> 
> That's not true.  When I have a TLB entry that points to a page of kernel
> ram, I can do almost anything, depending on what the kernel decides to
> do with that ram next.  Maybe it's page cache again, in which case I can
> affect whatever application happens to get it allocated.  Maybe it's a
> kmalloc page next, in which case I can affect any part of the kernel.
> Maybe it's a page table, then I can affect any process.
> 
>> That said, this brings up a very important point that I wanted to talk about.
>> In this design the zuf (kernel) and the zus (um server) are part of the
>> distribution. I would like to have the zus module signed by the distro
>> kernel's key and checked at load time. I know there is an effort by Red Hat
>> guys to try to sign all /sbin/* servers and have the kernel check these,
>> so this is not the first time people have thought about that.
> 
> You're getting dangerously close to admitting that the entire point
> of this exercise is so that you can link non-GPL NetApp code into the
> kernel in clear violation of the GPL.
> 

It is not that at all. What I'm trying to do is enable a zero-copy,
synchronous, low-latency, low-overhead, highly parallel, modern new
interface with application servers.

You yourself had such a project, of a device that wanted to sit in
user mode, that could easily be served out of the box by zufs.

Sometimes it is very convenient, and needed, for servers to sit in
user mode, and this interface allows that. It is not always
a licensing thing, though yes, licensing is also an issue sometimes.
It is the reality we are living in.

But please indulge me: I am curious how the point about signing /sbin/
servers made you think about GPL licensing issues.

That said, is your point that as long as user-mode servers are sloooowwww
they are OK to be supported, but if they are as fast as the kernel
(as demonstrated, a zufs-based FS was faster than xfs-dax on the same pmem),
then it is a GPL violation?

Thanks
Boaz
Matthew Wilcox May 15, 2018, 1:50 p.m. UTC | #23
On Tue, May 15, 2018 at 04:29:22PM +0300, Boaz Harrosh wrote:
> On 15/05/18 15:03, Matthew Wilcox wrote:
> > You're getting dangerously close to admitting that the entire point
> > of this exercise is so that you can link non-GPL NetApp code into the
> > kernel in clear violation of the GPL.
> 
> It is not that at all. What I'm trying to do is enable a zero-copy,
> synchronous, low-latency, low-overhead, highly parallel, modern new
> interface with application servers.

... and fully buzzword compliant.

> You yourself had such a project, of a device that wanted to sit in
> user mode, that could easily be served out of the box by zufs.

For a very different reason.  I think the source code to that project
is publicly available; the problem is that it's not written in C.

> Sometimes it is very convenient, and needed, for servers to sit in
> user mode, and this interface allows that. It is not always
> a licensing thing, though yes, licensing is also an issue sometimes.
> It is the reality we are living in.
> 
> But please indulge me: I am curious how the point about signing /sbin/
> servers made you think about GPL licensing issues.
> 
> That said, is your point that as long as user-mode servers are sloooowwww
> they are OK to be supported, but if they are as fast as the kernel
> (as demonstrated, a zufs-based FS was faster than xfs-dax on the same pmem),
> then it is a GPL violation?

No.  Read what Linus wrote:

   NOTE! This copyright does *not* cover user programs that use kernel
 services by normal system calls - this is merely considered normal use
 of the kernel, and does *not* fall under the heading of "derived work".

What you're doing is far beyond that exception.  You're developing in
concert a userspace and kernel component, and claiming that the GPL does
not apply to the userspace component.  I'm not a lawyer, but you're on
very thin ice.
Boaz Harrosh May 15, 2018, 2:10 p.m. UTC | #24
On 15/05/18 16:50, Matthew Wilcox wrote:
> On Tue, May 15, 2018 at 04:29:22PM +0300, Boaz Harrosh wrote:
>> On 15/05/18 15:03, Matthew Wilcox wrote:
>>> You're getting dangerously close to admitting that the entire point
>>> of this exercise is so that you can link non-GPL NetApp code into the
>>> kernel in clear violation of the GPL.
>>
>> It is not that at all. What I'm trying to do is enable a zero-copy,
>> synchronous, low-latency, low-overhead, highly parallel, modern new
>> interface with application servers.
> 
> ... and fully buzzword compliant.
> 
>> You yourself had such a project, of a device that wanted to sit in
>> user mode, that could easily be served out of the box by zufs.
> 
> For a very different reason.  I think the source code to that project
> is publicly available; the problem is that it's not written in C.
> 

Exactly the point, sir. There are many reasons to sit in user land; for
example, for me it is libraries that cannot be loaded into the kernel.

>> Sometimes it is very convenient, and needed, for servers to sit in
>> user mode, and this interface allows that. It is not always
>> a licensing thing, though yes, licensing is also an issue sometimes.
>> It is the reality we are living in.
>>
>> But please indulge me: I am curious how the point about signing /sbin/
>> servers made you think about GPL licensing issues.
>>
>> That said, is your point that as long as user-mode servers are sloooowwww
>> they are OK to be supported, but if they are as fast as the kernel
>> (as demonstrated, a zufs-based FS was faster than xfs-dax on the same pmem),
>> then it is a GPL violation?
> 
> No.  Read what Linus wrote:
> 
>    NOTE! This copyright does *not* cover user programs that use kernel
>  services by normal system calls - this is merely considered normal use
>  of the kernel, and does *not* fall under the heading of "derived work".
> 
> What you're doing is far beyond that exception.  You're developing in
> concert a userspace and kernel component, and claiming that the GPL does
> not apply to the userspace component.  I'm not a lawyer, but you're on
> very thin ice.
> 

But I am not the first one here, am I? FUSE and other interfaces already
did exactly this long before I did. Actually, any kernel interface has
some user-mode component specifically written for it. And again, I am
only, legally, doing exactly what FUSE is doing, only much faster and,
more importantly for me, highly parallel on all cores, because from my
testing the biggest problem of FUSE for me is that it does not scale.

I'm not a lawyer either, but I think I'm doing OK, because I am doing
exactly what FUSE is doing, only some 15 years later, with modern CPUs
in mind. I do not think I am doing anything new here, am I?

Thanks
Boaz
Peter Zijlstra May 15, 2018, 2:17 p.m. UTC | #25
On Tue, May 15, 2018 at 02:54:29PM +0300, Boaz Harrosh wrote:
> At the beginning I was wishfully thinking that mm_cpumask(vma->vm_mm)
> should have a single bit set, matching the affinity of the thread at
> its creation. But then I saw that 80% of the time some
> other random bits are also set.
> 
> Yes, random. The thread-affinity (single) bit was always set, but
> then zero, one, or two more bits were set as well. I have never seen
> more than two though, which baffles me a lot.
> 
> If it were as Matthew said, i.e. the cpumask of the whole process,
> then I would expect all the bits to be set, because I have a thread
> on each core. And I would also expect all vma->vm_mm,
> or at least mm_cpumask(vma->vm_mm), to point to the same global object.
> But it was not so: it was pointing to some thread-unique object, yet
> those phantom bits were still set all over. (And I am almost sure
> the same vma had those bits change over time.)
> 
> So I would love for some mm guy to explain where those bits are collected.

Depends on the architecture, some architectures only ever set bits,
some, like x86, clear bits again. You want to look at switch_mm().

Basically x86 clears the bit again when we switch away from the mm and
have/will invalidate TLBs for it in doing so.

> Which brings me to another question: how can I find from
> within a thread, say at the file_operations->mmap() call, that the thread
> is indeed core-pinned? What mm_cpumask should I inspect?

is_percpu_thread().
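
(Applied to the question above, the check might look like this in the
->mmap() path; a sketch, not from the posted patches:)

	/* Reject the mapping unless the calling thread is pinned to a
	 * single CPU and cannot have its affinity changed, which is
	 * what is_percpu_thread() verifies.
	 */
	if (!is_percpu_thread())
		return -EINVAL;
	vma->vm_flags |= VM_LOCAL_CPU;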
Matthew Wilcox May 15, 2018, 2:18 p.m. UTC | #26
On Tue, May 15, 2018 at 05:10:57PM +0300, Boaz Harrosh wrote:
> I'm not a lawyer either, but I think I'm doing OK, because I am doing
> exactly what FUSE is doing, only some 15 years later, with modern CPUs
> in mind. I do not think I am doing anything new here, am I?

You should talk to a lawyer.  I'm not giving you legal advice.
I'm telling you that I think what you're doing is unethical.
Dave Hansen May 15, 2018, 2:19 p.m. UTC | #27
On 05/14/2018 10:28 AM, Boaz Harrosh wrote:
> The VM_LOCAL_CPU flag tells the kernel that the vma will be used
> from a single core only, and therefore invalidation (flush_tlb) of
> its PTE(s) need not be scheduled across all CPUs.

This doesn't work on x86.  We load TLB entries for lots of reasons, even
if the PTE is never "used".  Is there another architecture you had in
mind that has more predictable TLB population behavior?
Boaz Harrosh May 15, 2018, 2:30 p.m. UTC | #28
On 15/05/18 17:18, Matthew Wilcox wrote:
> On Tue, May 15, 2018 at 05:10:57PM +0300, Boaz Harrosh wrote:
>> I'm not a lawyer either, but I think I'm doing OK, because I am doing
>> exactly what FUSE is doing, only some 15 years later, with modern CPUs
>> in mind. I do not think I am doing anything new here, am I?
> 
> You should talk to a lawyer.  I'm not giving you legal advice.
> I'm telling you that I think what you're doing is unethical.
> .
> 

Not more unethical than what is already there. And I do not see how
this is unethical at all. I trust your opinion and would really like
to understand.

For example, your not-in-C zero-copy server: how is it unethical?
I have the same problem actually; some important parts are not in C.

How is it unethical to want to make this run fast?

Thanks
Boaz
Boaz Harrosh May 15, 2018, 2:36 p.m. UTC | #29
On 15/05/18 17:17, Peter Zijlstra wrote:
<>
>>
>> So I would love for some mm guy to explain where those bits are collected.
> 
> It depends on the architecture: some architectures only ever set bits;
> some, like x86, clear bits again. You want to look at switch_mm().
> 
> Basically, x86 clears the bit again when we switch away from the mm,
> since we have invalidated (or will invalidate) the TLBs for it in doing so.
> 

Ha, OK, I am starting to get the picture.

>> Which brings me to another question: how can I find, from within a
>> thread (say, at the file_operations->mmap() call), that the thread is
>> indeed core-pinned? What mm_cpumask should I inspect?
> 
> is_percpu_thread().

Right. Thank you a lot, Peter; this helps.

Boaz
Christoph Lameter (Ampere) May 18, 2018, 2:14 p.m. UTC | #30
On Tue, 15 May 2018, Boaz Harrosh wrote:

> > I don't think page tables work the way you think they work.
> >
> > +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
> >
> > That doesn't just insert it into the local CPU's page table.  Any CPU
> > which directly accesses or even prefetches that address will also get
> > the translation into its cache.
> >
>
> Yes, I know, but that is exactly the point of this flag. I know that this
> address is only ever accessed from a single core, because it is an mmap (vma)
> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
> only that thread any kind of access to this vma. Both the filehandle and the
> mmapped pointer are kept on the thread stack and are not accessible from outside.
>
> So the whole point of this flag is the kernel driver telling mm that this
> address is enforced to only ever be accessed from one core-pinned thread.

But there are no provisions for prohibiting accesses from other cores?

This means that a casual accidental write from a thread executing on
another core can lead to arbitrary memory corruption because the cache
flushing has been bypassed.
Boaz Harrosh May 22, 2018, 4:05 p.m. UTC | #31
On 18/05/18 17:14, Christopher Lameter wrote:
> On Tue, 15 May 2018, Boaz Harrosh wrote:
> 
>>> I don't think page tables work the way you think they work.
>>>
>>> +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
>>>
>>> That doesn't just insert it into the local CPU's page table.  Any CPU
>>> which directly accesses or even prefetches that address will also get
>>> the translation into its cache.
>>>
>>
>> Yes, I know, but that is exactly the point of this flag. I know that this
>> address is only ever accessed from a single core, because it is an mmap (vma)
>> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
>> only that thread any kind of access to this vma. Both the filehandle and the
>> mmapped pointer are kept on the thread stack and are not accessible from outside.
>>
>> So the whole point of this flag is the kernel driver telling mm that this
>> address is enforced to only ever be accessed from one core-pinned thread.
> 
> But there are no provisions for prohibiting accesses from other cores?
> 
> This means that a casual accidental write from a thread executing on
> another core can lead to arbitrary memory corruption because the cache
> flushing has been bypassed.
> 

No, this is not accurate. A "casual accidental write" will not do any harm.
Only a well-concerted, malicious server can exploit this. A different thread
on a different core would need to hit the exact time: read from the exact
pointer in the narrow window while the IO is going on, and fault in a TLB
entry at the time of the valid mapping. Then later, after the IO has ended
and before any of the threads were scheduled out, maliciously write. All the
while, the App has freed its buffers and the buffer was reused for something
else. Please bear in mind that this all runs only as root, in an /sbin/
executable signed by the Kernel's key. I think that anyone who has gained
such access to the system (i.e., compiled and installed an /sbin server) can
just walk in the front door. He does not need to exploit this narrow, random
hole. Hell, he can easily just modprobe a kernel module.

And I do not understand why everyone is so motivated to say "no, it
cannot be solved". So let's start from the beginning.

How can we implement "Private memory"?

You know how, back in the fork days, we had APIs for "shared memory";
i.e., all read/write memory defaults to private, except specially set-up
"shared memory". This is in contrast to threads, where all memory
regions are shared.

[Q] How can we implement a "private memory" region,
i.e., all read/write memory defaults to shared, except specially set-up
"private memory"?

Can this be done? How? Please advise.

Thanks
Boaz
Dave Hansen May 22, 2018, 4:18 p.m. UTC | #32
On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
> How can we implement "Private memory"?

Per-cpu page tables would do it.
Christoph Lameter (Ampere) May 22, 2018, 4:46 p.m. UTC | #33
On Tue, 22 May 2018, Dave Hansen wrote:

> On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
> > How can we implement "Private memory"?
>
> Per-cpu page tables would do it.

We already have that in the percpu subsystem. See alloc_percpu().
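
For illustration, the usual dynamic-percpu pattern looks roughly like
this (the zt_stats names are made up):

	#include <linux/percpu.h>

	struct zt_stats {
		u64 ops;
	};

	static struct zt_stats __percpu *stats;

	static int zt_stats_init(void)
	{
		/* one instance per possible CPU */
		stats = alloc_percpu(struct zt_stats);
		if (!stats)
			return -ENOMEM;
		return 0;
	}

	static void zt_count_op(void)
	{
		/* fast path: touch only this CPU's copy, no locking;
		 * get_cpu_ptr() disables preemption while we use it */
		get_cpu_ptr(stats)->ops++;
		put_cpu_ptr(stats);
	}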
Peter Zijlstra May 22, 2018, 4:56 p.m. UTC | #34
On Tue, May 22, 2018 at 04:46:05PM +0000, Christopher Lameter wrote:
> On Tue, 22 May 2018, Dave Hansen wrote:
> 
> > On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
> > > How can we implement "Private memory"?
> >
> > Per-cpu page tables would do it.
> 
> We already have that in the percpu subsystem. See alloc_percpu().

x86 doesn't have per-cpu page tables. And the last time I looked, percpu
didn't either; it played games with staggered ranges in the vmalloc space
and used the [FG]S segment offset to make it work.

Doing proper per-cpu pagetables on x86 is possible, but quite involved
and expensive.
Dave Hansen May 22, 2018, 5:03 p.m. UTC | #35
On 05/22/2018 09:46 AM, Christopher Lameter wrote:
> On Tue, 22 May 2018, Dave Hansen wrote:
> 
>> On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
>>> How can we implement "Private memory"?
>> Per-cpu page tables would do it.
> We already have that in the percpu subsystem. See alloc_percpu().

I actually mean a set of page tables which is only ever installed on a
single CPU.  The CPU is architecturally allowed to go load any PTE in
the page tables into the TLB any time it feels like.  The only way to
keep a PTE from getting into the TLB is to ensure that a CPU never has
any access to it, and the only way to do that is to make sure that no
set of page tables it ever loads into CR3 has that PTE.

As Peter said, it's possible, but not pretty.
Christoph Lameter (Ampere) May 22, 2018, 5:35 p.m. UTC | #36
On Tue, 22 May 2018, Dave Hansen wrote:

> On 05/22/2018 09:46 AM, Christopher Lameter wrote:
> > On Tue, 22 May 2018, Dave Hansen wrote:
> >
> >> On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
> >>> How can we implement "Private memory"?
> >> Per-cpu page tables would do it.
> > We already have that in the percpu subsystem. See alloc_percpu().
>
> I actually mean a set of page tables which is only ever installed on a
> single CPU.  The CPU is architecturally allowed to go load any PTE in
> the page tables into the TLB any time it feels like.  The only way to
> keep a PTE from getting into the TLB is to ensure that a CPU never has
> any access to it, and the only way to do that is to make sure that no
> set of page tables it ever loads into CR3 has that PTE.
>
> As Peter said, it's possible, but not pretty.

Well, yeah, it's much prettier if you use the segment register to avoid
the page table tricks on x86. Other arches may rely on page table tricks.

Regardless of that, the percpu subsystem was created to provide "private"
memory for each cpu, and that may be the right starting point for adding
"local" memory.
Matthew Wilcox May 22, 2018, 5:51 p.m. UTC | #37
On Tue, May 22, 2018 at 10:03:54AM -0700, Dave Hansen wrote:
> On 05/22/2018 09:46 AM, Christopher Lameter wrote:
> > On Tue, 22 May 2018, Dave Hansen wrote:
> > 
> >> On 05/22/2018 09:05 AM, Boaz Harrosh wrote:
> >>> How can we implement "Private memory"?
> >> Per-cpu page tables would do it.
> > We already have that in the percpu subsystem. See alloc_percpu().
> 
> I actually mean a set of page tables which is only ever installed on a
> single CPU.  The CPU is architecturally allowed to go load any PTE in
> the page tables into the TLB any time it feels like.  The only way to
> keep a PTE from getting into the TLB is to ensure that a CPU never has
> any access to it, and the only way to do that is to make sure that no
> set of page tables it ever loads into CR3 has that PTE.
> 
> As Peter said, it's possible, but not pretty.

But CR3 is a per-CPU register.  So it'd be *possible* to allocate one
PGD per CPU (per process).  Have them be identical in all but one of
the PUD entries.  Then you've reserved 1/512 of your address space for
per-CPU pages.

Complicated, ugly, memory-consuming.  But possible.
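
As a back-of-the-envelope sketch of that layout (PRIVATE_PGD_INDEX and
per_cpu_private_pud() are hypothetical; only the shape is the point):

	/* One PGD per CPU per process: identical everywhere except one
	 * top-level slot reserved for this CPU's private mappings. */
	static pgd_t *clone_pgd_for_cpu(struct mm_struct *mm, int cpu)
	{
		pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);

		if (!pgd)
			return NULL;

		/* identical in all but one of the 512 top-level entries */
		memcpy(pgd, mm->pgd, PTRS_PER_PGD * sizeof(pgd_t));

		/* the reserved slot points at a per-CPU PUD, so 1/512
		 * of the address space becomes per-CPU */
		pgd[PRIVATE_PGD_INDEX] =
			__pgd(__pa(per_cpu_private_pud(cpu)) | _PAGE_TABLE);

		return pgd;
	}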
Dave Hansen May 23, 2018, 5:30 p.m. UTC | #38
On 05/22/2018 10:51 AM, Matthew Wilcox wrote:
> But CR3 is a per-CPU register.  So it'd be *possible* to allocate one
> PGD per CPU (per process).  Have them be identical in all but one of
> the PUD entries.  Then you've reserved 1/512 of your address space for
> per-CPU pages.
> 
> Complicated, ugly, memory-consuming.  But possible.

Yep, and you'd probably want a cache of them so you don't end up having
to go rewrite half of the PGD every time you context-switch.  But, on
the plus side, the logic would be pretty similar if not identical to the
way that we manage PCIDs.  If your mm was recently active on the CPU,
you can use a PGD that's already been constructed.  If not, you're stuck
making a new one.

Andy L. was also talking about using this kind of mechanism to simplify
the entry code.  Instead of needing per-cpu areas where we index by the
CPU number, or by using %GS, we could have per-cpu data or code that has
a fixed virtual address.

It'd be a fun project, but it might not ever pan out.
Nadav Amit May 23, 2018, 5:46 p.m. UTC | #39
Dave Hansen <dave.hansen@linux.intel.com> wrote:

> On 05/22/2018 10:51 AM, Matthew Wilcox wrote:
>> But CR3 is a per-CPU register.  So it'd be *possible* to allocate one
>> PGD per CPU (per process).  Have them be identical in all but one of
>> the PUD entries.  Then you've reserved 1/512 of your address space for
>> per-CPU pages.
>> 
>> Complicated, ugly, memory-consuming.  But possible.
> 
> Yep, and you'd probably want a cache of them so you don't end up having
> to go rewrite half of the PGD every time you context-switch.  But, on
> the plus side, the logic would be pretty similar if not identical to the
> way that we manage PCIDs.  If your mm was recently active on the CPU,
> you can use a PGD that's already been constructed.  If not, you're stuck
> making a new one.
> 
> Andy L. was also talking about using this kind of mechanism to simplify
> the entry code.  Instead of needing per-cpu areas where we index by the
> CPU number, or by using %GS, we could have per-cpu data or code that has
> a fixed virtual address.
> 
> It'd be a fun project, but it might not ever pan out.

For the record: there are several academic studies about this subject. The
most notable one is Corey [1].

[1] https://www.usenix.org/legacy/event/osdi08/tech/full_papers/boyd-wickizer/boyd_wickizer.pdf
Mark Rutland May 23, 2018, 6:10 p.m. UTC | #40
On Tue, May 22, 2018 at 07:05:48PM +0300, Boaz Harrosh wrote:
> On 18/05/18 17:14, Christopher Lameter wrote:
> > On Tue, 15 May 2018, Boaz Harrosh wrote:
> > 
> >>> I don't think page tables work the way you think they work.
> >>>
> >>> +               err = vm_insert_pfn_prot(zt->vma, zt_addr, pfn, prot);
> >>>
> >>> That doesn't just insert it into the local CPU's page table.  Any CPU
> >>> which directly accesses or even prefetches that address will also get
> >>> the translation into its cache.
> >>>
> >>
> >> Yes, I know, but that is exactly the point of this flag. I know that this
> >> address is only ever accessed from a single core, because it is an mmap (vma)
> >> of an O_TMPFILE-exclusive file created in a core-pinned thread, and I allow
> >> only that thread any kind of access to this vma. Both the filehandle and the
> >> mmapped pointer are kept on the thread stack and are not accessible from outside.
> >>
> >> So the whole point of this flag is the kernel driver telling mm that this
> >> address is enforced to only ever be accessed from one core-pinned thread.
> > 
> > But there are no provisions for prohibiting accesses from other cores?
> > 
> > This means that a casual accidental write from a thread executing on
> > another core can lead to arbitrary memory corruption because the cache
> > flushing has been bypassed.
> 
> No, this is not accurate. A "casual accidental write" will not do any harm.
> Only a well-concerted, malicious server can exploit this. A different thread
> on a different core would need to hit the exact time: read from the exact
> pointer in the narrow window while the IO is going on, and fault in a TLB
> entry at the time of the valid mapping.

TLB entries can be allocated at any time, for any reason. Even if a
program doesn't explicitly read from the exact pointer at that time,
that doesn't guarantee that a TLB entry won't be allocated.

> Then later, after the IO has ended and before any
> of the threads were scheduled out, maliciously write. 

... or, regardless of the application's wishes, the core mm code decides
it needs to swap this page out (only doing local TLB invalidation), and
later pages it back in.

Several things can happen, e.g.

* a casual write can corrupt the original page, which is now in use for
  something else.

* a CPU might re-allocate a TLB entry for that page, finding it
  conflicts with an existing entry. This is *fatal* on some
  architectures.

> All the while, the App has freed its buffers and the buffer was reused
> for something else. Please bear in mind that this all runs only as root,
> in an /sbin/ executable signed by the Kernel's key.

That isn't enforced by the core API additions, and regardless, root does
not necessarily imply access to kernel-internal stuff (e.g. if the
lockdown stuff goes in).

Claiming that root access means we don't need to care about robustness
is not a good argument.

[...]

> So let's start from the beginning.
> 
> How can we implement "Private memory"?

Use separate processes rather than threads. Each will have a separate
mm, so the arch can get away with local TLB invalidation.

If you wish to share portions of memory between these processes, we have
shared memory APIs to do so.
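
For completeness, a minimal userspace sketch of that model: two
cooperating processes sharing only an explicitly created region (CPU
pinning and error handling omitted):

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>
	#include <stdio.h>

	int main(void)
	{
		/* the only memory the two processes will share */
		int fd = memfd_create("ring", 0);
		long *shared;

		ftruncate(fd, 4096);
		shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			      MAP_SHARED, fd, 0);

		if (fork() == 0) {
			/* child: everything it maps or allocates from
			 * here on lives in its own mm, so the kernel can
			 * use purely local TLB invalidation for it */
			shared[0] = 42;
			_exit(0);
		}
		wait(NULL);
		printf("child wrote %ld\n", shared[0]);
		return 0;
	}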

Thanks,
Mark.
diff mbox

Patch

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e055d1a..1d398a0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -640,7 +640,8 @@  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		local_irq_enable();
 	}
 
-	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+	if (!(vmflag & VM_LOCAL_CPU) &&
+	    cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), &info);
 
 	put_cpu();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c486ad4..305d6e4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -680,6 +680,9 @@  static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT2)]	= "",
 		[ilog2(VM_PKEY_BIT3)]	= "",
 #endif
+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+		[ilog2(VM_LOCAL_CPU)]	= "lc",
+#endif
 	};
 	size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ac1f06..3d14107 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -226,6 +226,9 @@  extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_LOCAL_CPU	BIT(37)		/* FIXME: Needs to move from here */
+#else /* ! CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
+#define VM_LOCAL_CPU	0		/* FIXME: Needs to move from here */
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #if defined(CONFIG_X86)
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464..6236f5e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1788,6 +1788,7 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	int retval;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
+	bool need_flush = false;
 
 	retval = -ENOMEM;
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -1795,7 +1796,12 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 		goto out;
 	retval = -EBUSY;
 	if (!pte_none(*pte)) {
-		if (mkwrite) {
+		if ((vma->vm_flags & VM_LOCAL_CPU)) {
+			/* VM_LOCAL_CPU is set: a single CPU is allowed to
+			 * skip zap_vma_ptes before changing a pte.
+			 */
+			need_flush = true;
+		} else if (mkwrite) {
 			/*
 			 * For read faults on private mappings the PFN passed
 			 * in may not match the PFN we have mapped if the
@@ -1807,8 +1813,9 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 				goto out_unlock;
 			entry = *pte;
 			goto out_mkwrite;
-		} else
+		} else {
 			goto out_unlock;
+		}
 	}
 
 	/* Ok, finally just insert the thing.. */
@@ -1824,6 +1831,8 @@  static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	}
 
 	set_pte_at(mm, addr, pte, entry);
+	if (need_flush)
+		flush_tlb_range(vma, addr, addr + PAGE_SIZE);
 	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
 	retval = 0;