diff mbox series

[RFC,v2,4/4] mm/ptshare: Add page fault handling for page table shared regions

Message ID 9edffd2a12a049a42d9a2c216e3f999aae7e65a4.1682453344.git.khalid.aziz@oracle.com (mailing list archive)
State New
Headers show
Series Add support for sharing page tables across processes (Previously mshare) | expand

Commit Message

Khalid Aziz April 26, 2023, 4:49 p.m. UTC
Add support for creating a new set of shared page tables in a new
mm_struct upon mmap of an region that can potentially share page
tables. Add page fault handling for this now shared region. Modify
free_pgtables path to make sure page tables in the shared regions
are kept intact when a process using page table region exits and
there are other mappers for the shared region. Clean up mm_struct
holding shared page tables when the last process sharing the region
exits.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/internal.h |   2 +
 mm/memory.c   | 105 ++++++++++++++++++++++++++++++------
 mm/ptshare.c  | 143 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 232 insertions(+), 18 deletions(-)

Comments

kernel test robot April 29, 2023, 2:07 p.m. UTC | #1
Hello,

kernel test robot noticed "WARNING:bad_unlock_balance_detected" on:

commit: a2eef9e49f572b7b2dfa23fc32567e83da9573d5 ("[PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions")
url: https://github.com/intel-lab-lkp/linux/commits/Khalid-Aziz/mm-ptshare-Add-vm-flag-for-shared-PTE/20230427-005143
base: https://git.kernel.org/cgit/linux/kernel/git/arnd/asm-generic.git master
patch link: https://lore.kernel.org/all/9edffd2a12a049a42d9a2c216e3f999aae7e65a4.1682453344.git.khalid.aziz@oracle.com/
patch subject: [PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions

in testcase: kernel-selftests
version: kernel-selftests-x86_64-60acb023-1_20230329
with following parameters:

	sc_nr_hugepages: 2
	group: mm

test-description: The kernel contains a set of "self tests" under the tools/testing/selftests/ directory. These are intended to be small unit tests to exercise individual code paths in the kernel.
test-url: https://www.kernel.org/doc/Documentation/kselftest.txt


compiler: gcc-11
test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz (Kaby Lake) with 32G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Link: https://lore.kernel.org/oe-lkp/202304292108.f44daa45-oliver.sang@intel.com


[  183.878671][ T3695] WARNING: bad unlock balance detected!
[  183.884040][ T3695] 6.3.0-rc1-00015-ga2eef9e49f57 #1 Not tainted
[  183.890014][ T3695] -------------------------------------
[  183.895382][ T3695] userfaultfd/3695 is trying to release lock (&mm->mmap_lock) at:
[ 183.903000][ T3695] handle_mm_fault (mm/memory.c:5276) 
[  183.909324][ T3695] but there are no more locks to release!
[  183.914866][ T3695]
[  183.914866][ T3695] other info that might help us debug this:
[  183.922738][ T3695] no locks held by userfaultfd/3695.
[  183.927847][ T3695]
[  183.927847][ T3695] stack backtrace:
[  183.933560][ T3695] CPU: 7 PID: 3695 Comm: userfaultfd Not tainted 6.3.0-rc1-00015-ga2eef9e49f57 #1
[  183.942558][ T3695] Hardware name: Dell Inc. OptiPlex 7050/062KRH, BIOS 1.2.0 12/22/2016
[  183.950604][ T3695] Call Trace:
[  183.953730][ T3695]  <TASK>
[ 183.956510][ T3695] dump_stack_lvl (lib/dump_stack.c:108) 
[ 183.960845][ T3695] __lock_release (kernel/locking/lockdep.c:5346) 
[ 183.965354][ T3695] ? lock_downgrade (kernel/locking/lockdep.c:5321) 
[ 183.970032][ T3695] ? dump_stack_print_info (lib/dump_stack.c:70) 
[ 183.975231][ T3695] ? handle_mm_fault (mm/memory.c:5276) 
[ 183.979998][ T3695] lock_release (kernel/locking/lockdep.c:467 kernel/locking/lockdep.c:5691) 
[ 183.984248][ T3695] up_read (kernel/locking/rwsem.c:1616) 
[ 183.987978][ T3695] handle_mm_fault (mm/memory.c:5276) 
[ 183.992571][ T3695] ? lock_is_held_type (kernel/locking/lockdep.c:5410 kernel/locking/lockdep.c:5712) 
[ 183.997424][ T3695] ? __handle_mm_fault (mm/memory.c:5201) 
[ 184.002364][ T3695] ? find_vma (mm/mmap.c:1854) 
[ 184.006459][ T3695] ? can_vma_merge_before+0x330/0x330 
[ 184.012283][ T3695] do_user_addr_fault (arch/x86/mm/fault.c:1407) 
[ 184.017146][ T3695] exc_page_fault (arch/x86/include/asm/irqflags.h:26 arch/x86/include/asm/irqflags.h:67 arch/x86/include/asm/irqflags.h:127 arch/x86/mm/fault.c:1506 arch/x86/mm/fault.c:1554) 
[ 184.021496][ T3695] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570) 
[  184.026186][ T3695] RIP: 0033:0x403b2e
[ 184.029923][ T3695] Code: 89 75 e0 48 89 55 d8 48 c7 45 f8 00 00 00 00 eb 2c 48 8b 55 e8 48 8b 45 f8 48 01 d0 0f b6 10 48 8b 4d e0 48 8b 45 f8 48 01 c8 <0f> b6 00 38 c2 74 07 b8 01 00 00 00 eb 14 48 83 45 f8 01 48 8b 45
All code
========
   0:	89 75 e0             	mov    %esi,-0x20(%rbp)
   3:	48 89 55 d8          	mov    %rdx,-0x28(%rbp)
   7:	48 c7 45 f8 00 00 00 	movq   $0x0,-0x8(%rbp)
   e:	00 
   f:	eb 2c                	jmp    0x3d
  11:	48 8b 55 e8          	mov    -0x18(%rbp),%rdx
  15:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  19:	48 01 d0             	add    %rdx,%rax
  1c:	0f b6 10             	movzbl (%rax),%edx
  1f:	48 8b 4d e0          	mov    -0x20(%rbp),%rcx
  23:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  27:	48 01 c8             	add    %rcx,%rax
  2a:*	0f b6 00             	movzbl (%rax),%eax		<-- trapping instruction
  2d:	38 c2                	cmp    %al,%dl
  2f:	74 07                	je     0x38
  31:	b8 01 00 00 00       	mov    $0x1,%eax
  36:	eb 14                	jmp    0x4c
  38:	48 83 45 f8 01       	addq   $0x1,-0x8(%rbp)
  3d:	48                   	rex.W
  3e:	8b                   	.byte 0x8b
  3f:	45                   	rex.RB

Code starting with the faulting instruction
===========================================
   0:	0f b6 00             	movzbl (%rax),%eax
   3:	38 c2                	cmp    %al,%dl
   5:	74 07                	je     0xe
   7:	b8 01 00 00 00       	mov    $0x1,%eax
   c:	eb 14                	jmp    0x22
   e:	48 83 45 f8 01       	addq   $0x1,-0x8(%rbp)
  13:	48                   	rex.W
  14:	8b                   	.byte 0x8b
  15:	45                   	rex.RB
[  184.049280][ T3695] RSP: 002b:00007ffe05336170 EFLAGS: 00010206
[  184.055169][ T3695] RAX: 00007f57bde00000 RBX: 00007ffe05336340 RCX: 00007f57bde00000
[  184.062955][ T3695] RDX: 00000000000000ff RSI: 00007f57bde00000 RDI: 00007f57ce000000
[  184.070742][ T3695] RBP: 00007ffe05336170 R08: 00000000ffffffff R09: 0000000000000000
[  184.078528][ T3695] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000000000
[  184.086315][ T3695] R13: 00007ffe05336560 R14: 000000000040be00 R15: 00007f57e6668020
[  184.094107][ T3695]  </TASK>
[  184.096984][ T3695] ------------[ cut here ]------------
[  184.102287][ T3695] DEBUG_RWSEMS_WARN_ON(tmp < 0): count = 0xffffffffffffff00, magic = 0xffff888160af9d28, owner = 0x1, curr 0xffff8881b2dc8040, list empty
[ 184.116154][ T3695] WARNING: CPU: 7 PID: 3695 at kernel/locking/rwsem.c:1348 __up_read (kernel/locking/rwsem.c:1348 (discriminator 15)) 
[  184.125073][ T3695] Modules linked in: openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 intel_rapl_msr intel_rapl_common btrfs x86_pkg_temp_thermal intel_powerclamp blake2b_generic coretemp xor raid6_pq zstd_compress kvm_intel libcrc32c kvm i915 sd_mod t10_pi irqbypass crct10dif_pclmul crc64_rocksoft_generic crc32_pclmul drm_buddy crc64_rocksoft crc32c_intel intel_gtt crc64 ghash_clmulni_intel drm_display_helper sha512_ssse3 sg rapl drm_kms_helper ipmi_devintf ipmi_msghandler mei_wdt wmi_bmof intel_cstate syscopyarea ahci libahci i2c_i801 i2c_designware_platform sysfillrect mei_me intel_uncore i2c_smbus idma64 i2c_designware_core sysimgblt libata ttm mei video wmi intel_pmc_core acpi_pad binfmt_misc fuse drm ip_tables
[  184.190695][ T3695] CPU: 7 PID: 3695 Comm: userfaultfd Not tainted 6.3.0-rc1-00015-ga2eef9e49f57 #1
[  184.199697][ T3695] Hardware name: Dell Inc. OptiPlex 7050/062KRH, BIOS 1.2.0 12/22/2016
[ 184.207763][ T3695] RIP: 0010:__up_read (kernel/locking/rwsem.c:1348 (discriminator 15)) 
[ 184.212637][ T3695] Code: 3c 02 00 0f 85 61 03 00 00 53 48 8b 55 00 4d 89 e9 4d 89 f8 4c 89 f1 48 c7 c6 40 70 c9 83 48 c7 c7 a0 6e c9 83 e8 11 15 e9 ff <0f> 0b 5a e9 0e ff ff ff 48 89 44 24 38 e9 db fd ff ff be 08 00 00
All code
========
   0:	3c 02                	cmp    $0x2,%al
   2:	00 0f                	add    %cl,(%rdi)
   4:	85 61 03             	test   %esp,0x3(%rcx)
   7:	00 00                	add    %al,(%rax)
   9:	53                   	push   %rbx
   a:	48 8b 55 00          	mov    0x0(%rbp),%rdx
   e:	4d 89 e9             	mov    %r13,%r9
  11:	4d 89 f8             	mov    %r15,%r8
  14:	4c 89 f1             	mov    %r14,%rcx
  17:	48 c7 c6 40 70 c9 83 	mov    $0xffffffff83c97040,%rsi
  1e:	48 c7 c7 a0 6e c9 83 	mov    $0xffffffff83c96ea0,%rdi
  25:	e8 11 15 e9 ff       	callq  0xffffffffffe9153b
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	5a                   	pop    %rdx
  2d:	e9 0e ff ff ff       	jmpq   0xffffffffffffff40
  32:	48 89 44 24 38       	mov    %rax,0x38(%rsp)
  37:	e9 db fd ff ff       	jmpq   0xfffffffffffffe17
  3c:	be                   	.byte 0xbe
  3d:	08 00                	or     %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	5a                   	pop    %rdx
   3:	e9 0e ff ff ff       	jmpq   0xffffffffffffff16
   8:	48 89 44 24 38       	mov    %rax,0x38(%rsp)
   d:	e9 db fd ff ff       	jmpq   0xfffffffffffffded
  12:	be                   	.byte 0xbe
  13:	08 00                	or     %al,(%rax)
	...
[  184.232027][ T3695] RSP: 0000:ffffc9000b6ffd48 EFLAGS: 00010286
[  184.237919][ T3695] RAX: 0000000000000000 RBX: ffffffff83c96de0 RCX: fffff520016dff6e
[  184.245734][ T3695] RDX: 0000000000000004 RSI: 0000000000000008 RDI: ffff88878c7ac38c
[  184.253611][ T3695] RBP: ffff888160af9d28 R08: ffffffff821fca01 R09: ffffc9000b6ffb6f
[  184.261422][ T3695] R10: fffff520016dff6d R11: 0000000000000001 R12: 1ffff920016dffad
[  184.269254][ T3695] R13: ffff8881b2dc8040 R14: ffff888160af9d28 R15: 0000000000000001
[  184.277051][ T3695] FS:  00007f57e6448740(0000) GS:ffff88878c780000(0000) knlGS:0000000000000000
[  184.285796][ T3695] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  184.292208][ T3695] CR2: 00007f57bde00000 CR3: 0000000100e30004 CR4: 00000000003706e0
[  184.300016][ T3695] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  184.307809][ T3695] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  184.315616][ T3695] Call Trace:
[  184.318762][ T3695]  <TASK>
[ 184.321563][ T3695] ? up_write (kernel/locking/rwsem.c:1339) 
[ 184.325575][ T3695] ? lock_release (kernel/locking/lockdep.c:5693) 
[ 184.330105][ T3695] handle_mm_fault (mm/memory.c:5276) 
[ 184.334721][ T3695] ? lock_is_held_type (kernel/locking/lockdep.c:5410 kernel/locking/lockdep.c:5712) 
[ 184.339595][ T3695] ? __handle_mm_fault (mm/memory.c:5201) 
[ 184.344556][ T3695] ? find_vma (mm/mmap.c:1854) 
[ 184.348651][ T3695] ? can_vma_merge_before+0x330/0x330 
[ 184.354479][ T3695] do_user_addr_fault (arch/x86/mm/fault.c:1407) 
[ 184.359351][ T3695] exc_page_fault (arch/x86/include/asm/irqflags.h:26 arch/x86/include/asm/irqflags.h:67 arch/x86/include/asm/irqflags.h:127 arch/x86/mm/fault.c:1506 arch/x86/mm/fault.c:1554) 
[ 184.363707][ T3695] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570) 
[  184.368408][ T3695] RIP: 0033:0x403b2e
[ 184.372145][ T3695] Code: 89 75 e0 48 89 55 d8 48 c7 45 f8 00 00 00 00 eb 2c 48 8b 55 e8 48 8b 45 f8 48 01 d0 0f b6 10 48 8b 4d e0 48 8b 45 f8 48 01 c8 <0f> b6 00 38 c2 74 07 b8 01 00 00 00 eb 14 48 83 45 f8 01 48 8b 45
All code
========
   0:	89 75 e0             	mov    %esi,-0x20(%rbp)
   3:	48 89 55 d8          	mov    %rdx,-0x28(%rbp)
   7:	48 c7 45 f8 00 00 00 	movq   $0x0,-0x8(%rbp)
   e:	00 
   f:	eb 2c                	jmp    0x3d
  11:	48 8b 55 e8          	mov    -0x18(%rbp),%rdx
  15:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  19:	48 01 d0             	add    %rdx,%rax
  1c:	0f b6 10             	movzbl (%rax),%edx
  1f:	48 8b 4d e0          	mov    -0x20(%rbp),%rcx
  23:	48 8b 45 f8          	mov    -0x8(%rbp),%rax
  27:	48 01 c8             	add    %rcx,%rax
  2a:*	0f b6 00             	movzbl (%rax),%eax		<-- trapping instruction
  2d:	38 c2                	cmp    %al,%dl
  2f:	74 07                	je     0x38
  31:	b8 01 00 00 00       	mov    $0x1,%eax
  36:	eb 14                	jmp    0x4c
  38:	48 83 45 f8 01       	addq   $0x1,-0x8(%rbp)
  3d:	48                   	rex.W
  3e:	8b                   	.byte 0x8b
  3f:	45                   	rex.RB

Code starting with the faulting instruction
===========================================
   0:	0f b6 00             	movzbl (%rax),%eax
   3:	38 c2                	cmp    %al,%dl
   5:	74 07                	je     0xe
   7:	b8 01 00 00 00       	mov    $0x1,%eax
   c:	eb 14                	jmp    0x22
   e:	48 83 45 f8 01       	addq   $0x1,-0x8(%rbp)
  13:	48                   	rex.W
  14:	8b                   	.byte 0x8b
  15:	45                   	rex.RB
[  184.391546][ T3695] RSP: 002b:00007ffe05336170 EFLAGS: 00010206
[  184.397460][ T3695] RAX: 00007f57bde00000 RBX: 00007ffe05336340 RCX: 00007f57bde00000
[  184.405251][ T3695] RDX: 00000000000000ff RSI: 00007f57bde00000 RDI: 00007f57ce000000
[  184.413057][ T3695] RBP: 00007ffe05336170 R08: 00000000ffffffff R09: 0000000000000000
[  184.420863][ T3695] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000000000
[  184.428670][ T3695] R13: 00007ffe05336560 R14: 000000000040be00 R15: 00007f57e6668020
[  184.436482][ T3695]  </TASK>
[  184.439365][ T3695] irq event stamp: 368859
[ 184.443530][ T3695] hardirqs last enabled at (368859): finish_task_switch+0x21c/0x910 
[ 184.453841][ T3695] hardirqs last disabled at (368858): __schedule (kernel/sched/core.c:6521 (discriminator 1)) 
[ 184.462946][ T3695] softirqs last enabled at (368830): __do_softirq (arch/x86/include/asm/preempt.h:27 kernel/softirq.c:415 kernel/softirq.c:600) 
[ 184.472118][ T3695] softirqs last disabled at (368825): __irq_exit_rcu (kernel/softirq.c:445 kernel/softirq.c:650) 
[  184.481495][ T3695] ---[ end trace 0000000000000000 ]---
[  184.486819][ T3695] VM_SHARED_PT vma with NULL ptshare_data
[  184.486823][ T3695] CPU: 3 PID: 3695 Comm: userfaultfd Tainted: G        W          6.3.0-rc1-00015-ga2eef9e49f57 #1
[  184.502865][ T3695] Hardware name: Dell Inc. OptiPlex 7050/062KRH, BIOS 1.2.0 12/22/2016


To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if come across any failure that blocks the test,
        # please remove ~/.lkp and /lkp dir to run from a clean state.
diff mbox series

Patch

diff --git a/mm/internal.h b/mm/internal.h
index 3efb8738e26f..924065f721fe 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1061,4 +1061,6 @@  struct ptshare_data {
 int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
 void ptshare_del_mm(struct vm_area_struct *vm);
 int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vmap,
+				unsigned long *addrp, unsigned int flags);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..c67318ffd001 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -172,17 +172,28 @@  void mm_trace_rss_stat(struct mm_struct *mm, int member)
  * has been handled earlier when unmapping all the memory regions.
  */
 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-			   unsigned long addr)
+			   unsigned long addr, bool shared_pte)
 {
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
+	/*
+	 * if this address range shares page tables with other processes,
+	 * do not release pte pages. Those pages will be released when
+	 * host mm that hosts these pte pages is released
+	 */
+	if (shared_pte) {
+		tlb_flush_pmd_range(tlb, addr, PAGE_SIZE);
+		tlb->freed_tables = 1;
+		return;
+	}
 	pte_free_tlb(tlb, token, addr);
 	mm_dec_nr_ptes(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -194,7 +205,7 @@  static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		free_pte_range(tlb, pmd, addr);
+		free_pte_range(tlb, pmd, addr, shared_pte);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -210,13 +221,19 @@  static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd, start);
-	mm_dec_nr_pmds(tlb->mm);
+	if (shared_pte) {
+		tlb_flush_pud_range(tlb, start, PAGE_SIZE);
+		tlb->freed_tables = 1;
+	} else {
+		pmd_free_tlb(tlb, pmd, start);
+		mm_dec_nr_pmds(tlb->mm);
+	}
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,7 +245,8 @@  static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling,
+				shared_pte);
 	} while (pud++, addr = next, addr != end);
 
 	start &= P4D_MASK;
@@ -250,7 +268,8 @@  static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -262,7 +281,8 @@  static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 		next = p4d_addr_end(addr, end);
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
-		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
+		free_pud_range(tlb, p4d, addr, next, floor, ceiling,
+				shared_pte);
 	} while (p4d++, addr = next, addr != end);
 
 	start &= PGDIR_MASK;
@@ -284,9 +304,10 @@  static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 /*
  * This function frees user-level page tables of a process.
  */
-void free_pgd_range(struct mmu_gather *tlb,
+static void _free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
+			unsigned long floor, unsigned long ceiling,
+			bool shared_pte)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -342,10 +363,18 @@  void free_pgd_range(struct mmu_gather *tlb,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
+		free_p4d_range(tlb, pgd, addr, next, floor, ceiling,
+				shared_pte);
 	} while (pgd++, addr = next, addr != end);
 }
 
+void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+	_free_pgd_range(tlb, addr, end, floor, ceiling, false);
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *vma, unsigned long floor,
 		   unsigned long ceiling)
@@ -375,16 +404,20 @@  void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		} else {
 			/*
 			 * Optimization: gather nearby vmas into one call down
+			 * but make sure vmas not sharing page tables do
+			 * not get combined with vmas sharing page tables
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			       && !is_vm_hugetlb_page(next)) {
+			       && !is_vm_hugetlb_page(next)
+			       && (vma_is_shared(next) == vma_is_shared(vma))) {
 				vma = next;
 				next = mas_find(&mas, ceiling - 1);
 				unlink_anon_vmas(vma);
 				unlink_file_vma(vma);
 			}
-			free_pgd_range(tlb, addr, vma->vm_end,
-				floor, next ? next->vm_start : ceiling);
+			_free_pgd_range(tlb, addr, vma->vm_end,
+				floor, next ? next->vm_start : ceiling,
+				vma_is_shared(vma));
 		}
 		vma = next;
 	} while (vma);
@@ -5181,6 +5214,8 @@  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			   unsigned int flags, struct pt_regs *regs)
 {
 	vm_fault_t ret;
+	bool shared = false;
+	struct mm_struct *orig_mm;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -5191,6 +5226,16 @@  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (ret)
 		return ret;
 
+	orig_mm = vma->vm_mm;
+	if (unlikely(vma_is_shared(vma))) {
+		ret = find_shared_vma(&vma, &address, flags);
+		if (ret)
+			return ret;
+		if (!vma)
+			return VM_FAULT_SIGSEGV;
+		shared = true;
+	}
+
 	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
 					    flags & FAULT_FLAG_INSTRUCTION,
 					    flags & FAULT_FLAG_REMOTE))
@@ -5212,6 +5257,36 @@  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 	lru_gen_exit_fault();
 
+	/*
+	 * Release the read lock on shared VMA's parent mm unless
+	 * __handle_mm_fault released the lock already.
+	 * __handle_mm_fault sets VM_FAULT_RETRY in return value if
+	 * it released mmap lock. If lock was released, that implies
+	 * the lock would have been released on task's original mm if
+	 * this were not a shared PTE vma. To keep lock state consistent,
+	 * make sure to release the lock on task's original mm
+	 */
+	if (shared) {
+		int release_mmlock = 1;
+
+		if (!(ret & VM_FAULT_RETRY)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		} else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+			(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		}
+
+		/*
+		 * Reset guest vma pointers that were set up in
+		 * find_shared_vma() to process this fault.
+		 */
+		vma->vm_mm = orig_mm;
+		if (release_mmlock)
+			mmap_read_unlock(orig_mm);
+	}
+
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
diff --git a/mm/ptshare.c b/mm/ptshare.c
index f6784268958c..e0991a877355 100644
--- a/mm/ptshare.c
+++ b/mm/ptshare.c
@@ -13,6 +13,136 @@ 
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+/*
+ */
+static pmd_t
+*get_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d)) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return NULL;
+	}
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud)) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return NULL;
+	}
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return NULL;
+	}
+
+	return pmd;
+}
+
+/*
+ * Find the shared pmd entries in host mm struct and install them into
+ * guest page tables.
+ */
+static int
+ptshare_copy_pmd(struct mm_struct *host_mm, struct mm_struct *guest_mm,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	pgd_t *guest_pgd;
+	p4d_t *guest_p4d;
+	pud_t *guest_pud;
+	pmd_t *host_pmd;
+	spinlock_t *host_ptl, *guest_ptl;
+
+	guest_pgd = pgd_offset(guest_mm, addr);
+	guest_p4d = p4d_offset(guest_pgd, addr);
+	if (p4d_none(*guest_p4d)) {
+		guest_p4d = p4d_alloc(guest_mm, guest_pgd, addr);
+		if (!guest_p4d)
+			return 1;
+	}
+
+	guest_pud = pud_offset(guest_p4d, addr);
+	if (pud_none(*guest_pud)) {
+		host_pmd = get_pmd(host_mm, addr);
+		if (!host_pmd)
+			return 1;
+
+		get_page(virt_to_page(host_pmd));
+		host_ptl = pmd_lockptr(host_mm, host_pmd);
+		guest_ptl = pud_lockptr(guest_mm, guest_pud);
+		spin_lock(host_ptl);
+		spin_lock(guest_ptl);
+		pud_populate(guest_mm, guest_pud,
+			(pmd_t *)((unsigned long)host_pmd & PAGE_MASK));
+		put_page(virt_to_page(host_pmd));
+		spin_unlock(guest_ptl);
+		spin_unlock(host_ptl);
+	}
+
+	return 0;
+}
+
+/*
+ * Find the shared page tables in hosting mm struct and install those in
+ * the guest mm struct
+ */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp,
+			unsigned int flags)
+{
+	struct ptshare_data *info;
+	struct mm_struct *host_mm;
+	struct vm_area_struct *host_vma, *guest_vma = *vmap;
+	unsigned long host_addr;
+	pmd_t *guest_pmd, *host_pmd;
+
+	if ((!guest_vma->vm_file) || (!guest_vma->vm_file->f_mapping))
+		return 0;
+	info = guest_vma->vm_file->f_mapping->ptshare_data;
+	if (!info) {
+		pr_warn("VM_SHARED_PT vma with NULL ptshare_data");
+		dump_stack_print_info(KERN_WARNING);
+		return 0;
+	}
+	host_mm = info->mm;
+
+	mmap_read_lock(host_mm);
+	host_addr = *addrp - guest_vma->vm_start + host_mm->mmap_base;
+	host_pmd = get_pmd(host_mm, host_addr);
+	guest_pmd = get_pmd(guest_vma->vm_mm, *addrp);
+	if (!pmd_same(*guest_pmd, *host_pmd)) {
+		set_pmd(guest_pmd, *host_pmd);
+		mmap_read_unlock(host_mm);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	host_vma = find_vma(host_mm, host_addr);
+	if (!host_vma)
+		return VM_FAULT_SIGSEGV;
+
+	/*
+	 * Point vm_mm for the faulting vma to the mm struct holding shared
+	 * page tables so the fault handling will happen in the right
+	 * shared context
+	 */
+	guest_vma->vm_mm = host_mm;
+
+	return 0;
+}
+
 /*
  * Create a new mm struct that will hold the shared PTEs. Pointer to
  * this new mm is stored in the data structure ptshare_data which also
@@ -38,6 +168,7 @@  ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
 	new_mm->task_size = len;
 	if (!new_mm->task_size)
 		new_mm->task_size--;
+	new_mm->owner = NULL;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
@@ -63,7 +194,7 @@  ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
  * insert vma into mm holding shared page tables
  */
 int
-ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+ptshare_insert_vma(struct mm_struct *host_mm, struct vm_area_struct *vma)
 {
 	struct vm_area_struct *new_vma;
 	int err = 0;
@@ -80,12 +211,18 @@  ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	 */
 	vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
 	vm_flags_set(new_vma, VM_NOHUGEPAGE);
-	new_vma->vm_mm = mm;
+	new_vma->vm_mm = host_mm;
 
-	err = insert_vm_struct(mm, new_vma);
+	err = insert_vm_struct(host_mm, new_vma);
 	if (err)
 		return -ENOMEM;
 
+	/*
+	 * Copy the PMD entries from host mm to guest so they use the
+	 * same PTEs
+	 */
+	err = ptshare_copy_pmd(host_mm, vma->vm_mm, vma, vma->vm_start);
+
 	return err;
 }