
[RFC,18/62] x86/mm: Implement syncing per-KeyID direct mappings

Message ID 20190508144422.13171-19-kirill.shutemov@linux.intel.com (mailing list archive)
State New, archived
Series Intel MKTME enabling

Commit Message

Kirill A. Shutemov May 8, 2019, 2:43 p.m. UTC
For MKTME we use per-KeyID direct mappings. This allows the kernel to
access encrypted memory.

sync_direct_mapping() syncs the per-KeyID direct mappings with the
canonical one -- KeyID-0.

The function tracks changes in the canonical mapping:
 - creating or removing chunks of the translation tree;
 - changes in mapping flags (i.e. protection bits);
 - splitting huge page mapping into a page table;
 - replacing page table with a huge page mapping;

The function needs to be called on every change to the direct mapping:
hotplug, hotremove, changes in permission bits, etc.

The function is a nop until MKTME is enabled.
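
To make the layout concrete, here is a minimal sketch of the address
relation the sync code relies on (illustration only, not part of the
patch; the helper name is made up). Each KeyID gets its own alias of the
direct mapping, offset by a multiple of direct_mapping_size:

	/*
	 * Illustration: per-KeyID alias of a canonical (KeyID-0)
	 * direct-mapping address. Matches the
	 * "addr + keyid * direct_mapping_size" arithmetic used by
	 * sync_direct_mapping_*() below.
	 */
	static unsigned long mktme_keyid_alias(unsigned long vaddr,
					       unsigned long keyid)
	{
		return vaddr + keyid * direct_mapping_size;
	}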

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/include/asm/mktme.h |   6 +
 arch/x86/mm/init_64.c        |  10 +
 arch/x86/mm/mktme.c          | 441 +++++++++++++++++++++++++++++++++++
 3 files changed, 457 insertions(+)

Comments

Peter Zijlstra June 14, 2019, 9:51 a.m. UTC | #1
On Wed, May 08, 2019 at 05:43:38PM +0300, Kirill A. Shutemov wrote:
> For MKTME we use per-KeyID direct mappings. This allows the kernel to
> access encrypted memory.
> 
> sync_direct_mapping() syncs the per-KeyID direct mappings with the
> canonical one -- KeyID-0.
> 
> The function tracks changes in the canonical mapping:
>  - creating or removing chunks of the translation tree;
>  - changes in mapping flags (i.e. protection bits);
>  - splitting huge page mapping into a page table;
>  - replacing page table with a huge page mapping;
> 
> The function needs to be called on every change to the direct mapping:
> hotplug, hotremove, changes in permission bits, etc.

And yet I don't see anything in pageattr.c.

Also, this seems like an expensive scheme; if you know where the changes
were, a more fine-grained update would be faster.

> The function is a nop until MKTME is enabled.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  arch/x86/include/asm/mktme.h |   6 +
>  arch/x86/mm/init_64.c        |  10 +
>  arch/x86/mm/mktme.c          | 441 +++++++++++++++++++++++++++++++++++
>  3 files changed, 457 insertions(+)


> @@ -1247,6 +1254,7 @@ void mark_rodata_ro(void)
>  	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
>  	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
>  	unsigned long all_end;
> +	int ret;
>  
>  	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
>  	       (end - start) >> 10);
> @@ -1280,6 +1288,8 @@ void mark_rodata_ro(void)
>  	free_kernel_image_pages((void *)text_end, (void *)rodata_start);
>  	free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
>  
> +	ret = sync_direct_mapping();
> +	WARN_ON(ret);
>  	debug_checkwx();
>  }
>  

If you'd done pageattr, the above would not be needed.
Kirill A. Shutemov June 14, 2019, 10:43 p.m. UTC | #2
On Fri, Jun 14, 2019 at 11:51:32AM +0200, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 05:43:38PM +0300, Kirill A. Shutemov wrote:
> > For MKTME we use per-KeyID direct mappings. This allows the kernel to
> > access encrypted memory.
> > 
> > sync_direct_mapping() syncs the per-KeyID direct mappings with the
> > canonical one -- KeyID-0.
> > 
> > The function tracks changes in the canonical mapping:
> >  - creating or removing chunks of the translation tree;
> >  - changes in mapping flags (i.e. protection bits);
> >  - splitting huge page mapping into a page table;
> >  - replacing page table with a huge page mapping;
> > 
> > The function needs to be called on every change to the direct mapping:
> > hotplug, hotremove, changes in permission bits, etc.
> 
> And yet I don't see anything in pageattr.c.

You're right. I've hooked up the sync in the wrong place.
> 
> Also, this seems like an expensive scheme; if you know where the changes
> were, a more fine-grained update would be faster.

Do we have any pageattr users hot enough to make it crucial?

I'll look into this anyway.
Peter Zijlstra June 17, 2019, 9:27 a.m. UTC | #3
On Sat, Jun 15, 2019 at 01:43:09AM +0300, Kirill A. Shutemov wrote:
> On Fri, Jun 14, 2019 at 11:51:32AM +0200, Peter Zijlstra wrote:
> > On Wed, May 08, 2019 at 05:43:38PM +0300, Kirill A. Shutemov wrote:
> > > For MKTME we use per-KeyID direct mappings. This allows the kernel to
> > > access encrypted memory.
> > > 
> > > sync_direct_mapping() syncs the per-KeyID direct mappings with the
> > > canonical one -- KeyID-0.
> > > 
> > > The function tracks changes in the canonical mapping:
> > >  - creating or removing chunks of the translation tree;
> > >  - changes in mapping flags (i.e. protection bits);
> > >  - splitting huge page mapping into a page table;
> > >  - replacing page table with a huge page mapping;
> > > 
> > > The function needs to be called on every change to the direct mapping:
> > > hotplug, hotremove, changes in permission bits, etc.
> > 
> > And yet I don't see anything in pageattr.c.
> 
> You're right. I've hooked up the sync in the wrong place.
> > 
> > Also, this seems like an expensive scheme; if you know where the changes
> > were, a more fine-grained update would be faster.
> 
> Do we have any pageattr users hot enough to make it crucial?
> 
> I'll look into this anyway.

The graphics people would be the most aggressive users of this, I'd think.
They're the ones that yelled when I broke it last ;-)
Kirill A. Shutemov June 17, 2019, 2:43 p.m. UTC | #4
On Mon, Jun 17, 2019 at 11:27:55AM +0200, Peter Zijlstra wrote:
> On Sat, Jun 15, 2019 at 01:43:09AM +0300, Kirill A. Shutemov wrote:
> > On Fri, Jun 14, 2019 at 11:51:32AM +0200, Peter Zijlstra wrote:
> > > On Wed, May 08, 2019 at 05:43:38PM +0300, Kirill A. Shutemov wrote:
> > > > For MKTME we use per-KeyID direct mappings. This allows the kernel to
> > > > access encrypted memory.
> > > > 
> > > > sync_direct_mapping() syncs the per-KeyID direct mappings with the
> > > > canonical one -- KeyID-0.
> > > > 
> > > > The function tracks changes in the canonical mapping:
> > > >  - creating or removing chunks of the translation tree;
> > > >  - changes in mapping flags (i.e. protection bits);
> > > >  - splitting huge page mapping into a page table;
> > > >  - replacing page table with a huge page mapping;
> > > > 
> > > > The function needs to be called on every change to the direct mapping:
> > > > hotplug, hotremove, changes in permission bits, etc.
> > > 
> > > And yet I don't see anything in pageattr.c.
> > 
> > You're right. I've hooked up the sync in the wrong place.
> > > 
> > > Also, this seems like an expensive scheme; if you know where the changes
> > > were, a more fine-grained update would be faster.
> > 
> > Do we have any pageattr users hot enough to make it crucial?
> > 
> > I'll look into this anyway.
> 
> The graphics people would be the most aggressive users of this, I'd think.
> They're the ones that yelled when I broke it last ;-)

I think something like this should do (I'll fold it in after testing):

diff --git a/arch/x86/include/asm/mktme.h b/arch/x86/include/asm/mktme.h
index 6c973cb1e64c..b30386d84281 100644
--- a/arch/x86/include/asm/mktme.h
+++ b/arch/x86/include/asm/mktme.h
@@ -68,7 +68,7 @@ static inline void arch_free_page(struct page *page, int order)
 		free_encrypted_page(page, order);
 }
 
-int sync_direct_mapping(void);
+int sync_direct_mapping(unsigned long start, unsigned long end);
 
 int mktme_get_alg(int keyid);
 
@@ -86,7 +86,7 @@ static inline bool mktme_enabled(void)
 
 static inline void mktme_disable(void) {}
 
-static inline int sync_direct_mapping(void)
+static inline int sync_direct_mapping(unsigned long start, unsigned long end)
 {
 	return 0;
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f50a38d86cc4..f8123aeb24a6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -761,7 +761,7 @@ __kernel_physical_mapping_init(unsigned long paddr_start,
 		pgd_changed = true;
 	}
 
-	ret = sync_direct_mapping();
+	ret = sync_direct_mapping(vaddr_start, vaddr_end);
 	WARN_ON(ret);
 
 	if (pgd_changed)
@@ -1209,7 +1209,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 	end = (unsigned long)__va(end);
 
 	remove_pagetable(start, end, true, NULL);
-	ret = sync_direct_mapping();
+	ret = sync_direct_mapping(start, end);
 	WARN_ON(ret);
 }
 
@@ -1315,7 +1315,6 @@ void mark_rodata_ro(void)
 	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
 	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
 	unsigned long all_end;
-	int ret;
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -1349,8 +1348,6 @@ void mark_rodata_ro(void)
 	free_kernel_image_pages((void *)text_end, (void *)rodata_start);
 	free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
 
-	ret = sync_direct_mapping();
-	WARN_ON(ret);
 	debug_checkwx();
 }
 
diff --git a/arch/x86/mm/mktme.c b/arch/x86/mm/mktme.c
index 9d2bb534f2ba..c099e1da055b 100644
--- a/arch/x86/mm/mktme.c
+++ b/arch/x86/mm/mktme.c
@@ -76,7 +76,7 @@ static void init_page_mktme(void)
 {
 	static_branch_enable(&mktme_enabled_key);
 
-	sync_direct_mapping();
+	sync_direct_mapping(PAGE_OFFSET, PAGE_OFFSET + direct_mapping_size);
 }
 
 struct page_ext_operations page_mktme_ops = {
@@ -596,15 +596,13 @@ static int sync_direct_mapping_p4d(unsigned long keyid,
 	return ret;
 }
 
-static int sync_direct_mapping_keyid(unsigned long keyid)
+static int sync_direct_mapping_keyid(unsigned long keyid,
+		unsigned long addr, unsigned long end)
 {
 	pgd_t *src_pgd, *dst_pgd;
-	unsigned long addr, end, next;
+	unsigned long next;
 	int ret = 0;
 
-	addr = PAGE_OFFSET;
-	end = PAGE_OFFSET + direct_mapping_size;
-
 	dst_pgd = pgd_offset_k(addr + keyid * direct_mapping_size);
 	src_pgd = pgd_offset_k(addr);
 
@@ -643,7 +641,7 @@ static int sync_direct_mapping_keyid(unsigned long keyid)
  *
  * The function is a nop until MKTME is enabled.
  */
-int sync_direct_mapping(void)
+int sync_direct_mapping(unsigned long start, unsigned long end)
 {
 	int i, ret = 0;
 
@@ -651,7 +649,7 @@ int sync_direct_mapping(void)
 		return 0;
 
 	for (i = 1; !ret && i <= mktme_nr_keyids; i++)
-		ret = sync_direct_mapping_keyid(i);
+		ret = sync_direct_mapping_keyid(i, start, end);
 
 	flush_tlb_all();
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6a9a77a403c9..eafbe0d8c44f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -347,6 +347,28 @@ static void cpa_flush(struct cpa_data *data, int cache)
 
 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
+	if (mktme_enabled()) {
+		unsigned long start, end;
+
+		start = *cpa->vaddr;
+		end = *cpa->vaddr + cpa->numpages * PAGE_SIZE;
+
+		/* Sync all direct mappings for an array */
+		if (cpa->flags & CPA_ARRAY) {
+			start = PAGE_OFFSET;
+			end = PAGE_OFFSET + direct_mapping_size;
+		}
+
+		/*
+		 * Sync per-KeyID direct mappings with the canonical one
+		 * (KeyID-0).
+		 *
+		 * sync_direct_mapping() does a full TLB flush.
+		 */
+		sync_direct_mapping(start, end);
+		return;
+	}
+
 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		cpa_flush_all(cache);
 		return;
Peter Zijlstra June 17, 2019, 2:51 p.m. UTC | #5
On Mon, Jun 17, 2019 at 05:43:28PM +0300, Kirill A. Shutemov wrote:
> On Mon, Jun 17, 2019 at 11:27:55AM +0200, Peter Zijlstra wrote:

> > > > And yet I don't see anything in pageattr.c.
> > > 
> > > You're right. I've hooked up the sync in the wrong place.

> I think something like this should do (I'll fold it in after testing):

> @@ -643,7 +641,7 @@ static int sync_direct_mapping_keyid(unsigned long keyid)
>   *
>   * The function is a nop until MKTME is enabled.
>   */
> -int sync_direct_mapping(void)
> +int sync_direct_mapping(unsigned long start, unsigned long end)
>  {
>  	int i, ret = 0;
>  
> @@ -651,7 +649,7 @@ int sync_direct_mapping(void)
>  		return 0;
>  
>  	for (i = 1; !ret && i <= mktme_nr_keyids; i++)
> -		ret = sync_direct_mapping_keyid(i);
> +		ret = sync_direct_mapping_keyid(i, start, end);
>  
>  	flush_tlb_all();
>  
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index 6a9a77a403c9..eafbe0d8c44f 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -347,6 +347,28 @@ static void cpa_flush(struct cpa_data *data, int cache)
>  
>  	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
>  
> +	if (mktme_enabled()) {
> +		unsigned long start, end;
> +
> +		start = *cpa->vaddr;
> +		end = *cpa->vaddr + cpa->numpages * PAGE_SIZE;
> +
> +		/* Sync all direct mappings for an array */
> +		if (cpa->flags & CPA_ARRAY) {
> +			start = PAGE_OFFSET;
> +			end = PAGE_OFFSET + direct_mapping_size;
> +		}

Understandable but sad, IIRC that's the most used interface (at least,
it's the one the graphics people use).

> +
> +		/*
> +		 * Sync per-KeyID direct mappings with the canonical one
> +		 * (KeyID-0).
> +		 *
> +		 * sync_direct_mapping() does a full TLB flush.
> +		 */
> +		sync_direct_mapping(start, end);
> +		return;

But it doesn't flush the cache. So you can't return here.

> +	}
> +
>  	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
>  		cpa_flush_all(cache);
>  		return;
> -- 
>  Kirill A. Shutemov
Kirill A. Shutemov June 17, 2019, 3:17 p.m. UTC | #6
On Mon, Jun 17, 2019 at 04:51:58PM +0200, Peter Zijlstra wrote:
> On Mon, Jun 17, 2019 at 05:43:28PM +0300, Kirill A. Shutemov wrote:
> > On Mon, Jun 17, 2019 at 11:27:55AM +0200, Peter Zijlstra wrote:
> 
> > > > > And yet I don't see anything in pageattr.c.
> > > > 
> > > > You're right. I've hooked up the sync in the wrong place.
> 
> > I think something like this should do (I'll fold it in after testing):
> 
> > @@ -643,7 +641,7 @@ static int sync_direct_mapping_keyid(unsigned long keyid)
> >   *
> >   * The function is a nop until MKTME is enabled.
> >   */
> > -int sync_direct_mapping(void)
> > +int sync_direct_mapping(unsigned long start, unsigned long end)
> >  {
> >  	int i, ret = 0;
> >  
> > @@ -651,7 +649,7 @@ int sync_direct_mapping(void)
> >  		return 0;
> >  
> >  	for (i = 1; !ret && i <= mktme_nr_keyids; i++)
> > -		ret = sync_direct_mapping_keyid(i);
> > +		ret = sync_direct_mapping_keyid(i, start, end);
> >  
> >  	flush_tlb_all();
> >  
> > diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> > index 6a9a77a403c9..eafbe0d8c44f 100644
> > --- a/arch/x86/mm/pageattr.c
> > +++ b/arch/x86/mm/pageattr.c
> > @@ -347,6 +347,28 @@ static void cpa_flush(struct cpa_data *data, int cache)
> >  
> >  	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
> >  
> > +	if (mktme_enabled()) {
> > +		unsigned long start, end;
> > +
> > +		start = *cpa->vaddr;
> > +		end = *cpa->vaddr + cpa->numpages * PAGE_SIZE;
> > +
> > +		/* Sync all direct mappings for an array */
> > +		if (cpa->flags & CPA_ARRAY) {
> > +			start = PAGE_OFFSET;
> > +			end = PAGE_OFFSET + direct_mapping_size;
> > +		}
> 
> Understandable but sad, IIRC that's the most used interface (at least,
> it's the one the graphics people use).
> 
> > +
> > +		/*
> > +		 * Sync per-KeyID direct mappings with the canonical one
> > +		 * (KeyID-0).
> > +		 *
> > +		 * sync_direct_mapping() does a full TLB flush.
> > +		 */
> > +		sync_direct_mapping(start, end);
> > +		return;
> 
> But it doesn't flush the cache. So you can't return here.

Thanks for catching this.

	if (!cache)
		return;

should be fine.
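
Folding both fixes together, the MKTME branch of cpa_flush() would
presumably end up looking like this (untested sketch assembled from the
snippets in this thread):

	if (mktme_enabled()) {
		unsigned long start, end;

		start = *cpa->vaddr;
		end = *cpa->vaddr + cpa->numpages * PAGE_SIZE;

		/* Sync all direct mappings for an array */
		if (cpa->flags & CPA_ARRAY) {
			start = PAGE_OFFSET;
			end = PAGE_OFFSET + direct_mapping_size;
		}

		/*
		 * Sync per-KeyID direct mappings with the canonical one
		 * (KeyID-0). sync_direct_mapping() does a full TLB flush,
		 * but no cache flush, so only return early when no cache
		 * flush is needed; otherwise fall through to the cache
		 * flushing below.
		 */
		sync_direct_mapping(start, end);
		if (!cache)
			return;
	}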

Patch

diff --git a/arch/x86/include/asm/mktme.h b/arch/x86/include/asm/mktme.h
index 454d6d7c791d..bd6707e73219 100644
--- a/arch/x86/include/asm/mktme.h
+++ b/arch/x86/include/asm/mktme.h
@@ -59,6 +59,8 @@  static inline void arch_free_page(struct page *page, int order)
 		free_encrypted_page(page, order);
 }
 
+int sync_direct_mapping(void);
+
 #else
 #define mktme_keyid_mask	((phys_addr_t)0)
 #define mktme_nr_keyids		0
@@ -73,6 +75,10 @@  static inline bool mktme_enabled(void)
 
 static inline void mktme_disable(void) {}
 
+static inline int sync_direct_mapping(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3a08d707eec8..ad4ea3703faf 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -693,6 +693,7 @@  kernel_physical_mapping_init(unsigned long paddr_start,
 {
 	bool pgd_changed = false;
 	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
+	int ret;
 
 	paddr_last = paddr_end;
 	vaddr = (unsigned long)__va(paddr_start);
@@ -726,6 +727,9 @@  kernel_physical_mapping_init(unsigned long paddr_start,
 		pgd_changed = true;
 	}
 
+	ret = sync_direct_mapping();
+	WARN_ON(ret);
+
 	if (pgd_changed)
 		sync_global_pgds(vaddr_start, vaddr_end - 1);
 
@@ -1135,10 +1139,13 @@  void __ref vmemmap_free(unsigned long start, unsigned long end,
 static void __meminit
 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 {
+	int ret;
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
 
 	remove_pagetable(start, end, true, NULL);
+	ret = sync_direct_mapping();
+	WARN_ON(ret);
 }
 
 int __ref arch_remove_memory(int nid, u64 start, u64 size,
@@ -1247,6 +1254,7 @@  void mark_rodata_ro(void)
 	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
 	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
 	unsigned long all_end;
+	int ret;
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -1280,6 +1288,8 @@  void mark_rodata_ro(void)
 	free_kernel_image_pages((void *)text_end, (void *)rodata_start);
 	free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
 
+	ret = sync_direct_mapping();
+	WARN_ON(ret);
 	debug_checkwx();
 }
 
diff --git a/arch/x86/mm/mktme.c b/arch/x86/mm/mktme.c
index 9221c894e8e9..024165c9c7f3 100644
--- a/arch/x86/mm/mktme.c
+++ b/arch/x86/mm/mktme.c
@@ -1,6 +1,8 @@ 
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <asm/mktme.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 
 /* Mask to extract KeyID from physical address. */
 phys_addr_t mktme_keyid_mask;
@@ -36,6 +38,8 @@  static bool need_page_mktme(void)
 static void init_page_mktme(void)
 {
 	static_branch_enable(&mktme_enabled_key);
+
+	sync_direct_mapping();
 }
 
 struct page_ext_operations page_mktme_ops = {
@@ -96,3 +100,440 @@  void free_encrypted_page(struct page *page, int order)
 		page++;
 	}
 }
+
+static int sync_direct_mapping_pte(unsigned long keyid,
+		pmd_t *dst_pmd, pmd_t *src_pmd,
+		unsigned long addr, unsigned long end)
+{
+	pte_t *src_pte, *dst_pte;
+	pte_t *new_pte = NULL;
+	bool remove_pte;
+
+	/*
+	 * We want to unmap and free the page table if the source is empty and
+	 * the range covers the whole page table.
+	 */
+	remove_pte = !src_pmd && IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(end, PMD_SIZE);
+
+	/*
+	 * PMD page got split into page table.
+	 * Clear PMD mapping. Page table will be established instead.
+	 */
+	if (pmd_large(*dst_pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		pmd_clear(dst_pmd);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	/* Allocate a new page table if needed. */
+	if (pmd_none(*dst_pmd)) {
+		new_pte = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (!new_pte)
+			return -ENOMEM;
+		dst_pte = new_pte + pte_index(addr + keyid * direct_mapping_size);
+	} else {
+		dst_pte = pte_offset_map(dst_pmd, addr + keyid * direct_mapping_size);
+	}
+	src_pte = src_pmd ? pte_offset_map(src_pmd, addr) : NULL;
+
+	spin_lock(&init_mm.page_table_lock);
+
+	do {
+		pteval_t val;
+
+		if (!src_pte || pte_none(*src_pte)) {
+			set_pte(dst_pte, __pte(0));
+			goto next;
+		}
+
+		if (!pte_none(*dst_pte)) {
+			/*
+			 * Sanity check: PFNs must match between source
+			 * and destination even if the rest doesn't.
+			 */
+			BUG_ON(pte_pfn(*dst_pte) != pte_pfn(*src_pte));
+		}
+
+		/* Copy entry, but set KeyID. */
+		val = pte_val(*src_pte) | keyid << mktme_keyid_shift;
+		val &= __supported_pte_mask;
+		set_pte(dst_pte, __pte(val));
+next:
+		addr += PAGE_SIZE;
+		dst_pte++;
+		if (src_pte)
+			src_pte++;
+	} while (addr != end);
+
+	if (new_pte)
+		pmd_populate_kernel(&init_mm, dst_pmd, new_pte);
+
+	if (remove_pte) {
+		__free_page(pmd_page(*dst_pmd));
+		pmd_clear(dst_pmd);
+	}
+
+	spin_unlock(&init_mm.page_table_lock);
+
+	return 0;
+}
+
+static int sync_direct_mapping_pmd(unsigned long keyid,
+		pud_t *dst_pud, pud_t *src_pud,
+		unsigned long addr, unsigned long end)
+{
+	pmd_t *src_pmd, *dst_pmd;
+	pmd_t *new_pmd = NULL;
+	bool remove_pmd = false;
+	unsigned long next;
+	int ret = 0;
+
+	/*
+	 * We want to unmap and free the page table if the source is empty and
+	 * the range covers the whole page table.
+	 */
+	remove_pmd = !src_pud && IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(end, PUD_SIZE);
+
+	/*
+	 * PUD page got split into page table.
+	 * Clear PUD mapping. Page table will be established instead.
+	 */
+	if (pud_large(*dst_pud)) {
+		spin_lock(&init_mm.page_table_lock);
+		pud_clear(dst_pud);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	/* Allocate a new page table if needed. */
+	if (pud_none(*dst_pud)) {
+		new_pmd = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (!new_pmd)
+			return -ENOMEM;
+		dst_pmd = new_pmd + pmd_index(addr + keyid * direct_mapping_size);
+	} else {
+		dst_pmd = pmd_offset(dst_pud, addr + keyid * direct_mapping_size);
+	}
+	src_pmd = src_pud ? pmd_offset(src_pud, addr) : NULL;
+
+	do {
+		pmd_t *__src_pmd = src_pmd;
+
+		next = pmd_addr_end(addr, end);
+		if (!__src_pmd || pmd_none(*__src_pmd)) {
+			if (pmd_none(*dst_pmd))
+				goto next;
+			if (pmd_large(*dst_pmd)) {
+				spin_lock(&init_mm.page_table_lock);
+				set_pmd(dst_pmd, __pmd(0));
+				spin_unlock(&init_mm.page_table_lock);
+				goto next;
+			}
+			__src_pmd = NULL;
+		}
+
+		if (__src_pmd && pmd_large(*__src_pmd)) {
+			pmdval_t val;
+
+			if (pmd_large(*dst_pmd)) {
+				/*
+				 * Sanity check: PFNs must match between source
+				 * and destination even if the rest doesn't.
+				 */
+				BUG_ON(pmd_pfn(*dst_pmd) != pmd_pfn(*__src_pmd));
+			} else if (!pmd_none(*dst_pmd)) {
+				/*
+				 * Page table is replaced with a PMD page.
+				 * Free and unmap the page table.
+				 */
+				__free_page(pmd_page(*dst_pmd));
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(dst_pmd);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+
+			/* Copy entry, but set KeyID. */
+			val = pmd_val(*__src_pmd) | keyid << mktme_keyid_shift;
+			val &= __supported_pte_mask;
+			spin_lock(&init_mm.page_table_lock);
+			set_pmd(dst_pmd, __pmd(val));
+			spin_unlock(&init_mm.page_table_lock);
+			goto next;
+		}
+
+		ret = sync_direct_mapping_pte(keyid, dst_pmd, __src_pmd,
+				addr, next);
+next:
+		addr = next;
+		dst_pmd++;
+		if (src_pmd)
+			src_pmd++;
+	} while (addr != end && !ret);
+
+	if (new_pmd) {
+		spin_lock(&init_mm.page_table_lock);
+		pud_populate(&init_mm, dst_pud, new_pmd);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	if (remove_pmd) {
+		spin_lock(&init_mm.page_table_lock);
+		__free_page(pud_page(*dst_pud));
+		pud_clear(dst_pud);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	return ret;
+}
+
+static int sync_direct_mapping_pud(unsigned long keyid,
+		p4d_t *dst_p4d, p4d_t *src_p4d,
+		unsigned long addr, unsigned long end)
+{
+	pud_t *src_pud, *dst_pud;
+	pud_t *new_pud = NULL;
+	bool remove_pud = false;
+	unsigned long next;
+	int ret = 0;
+
+	/*
+	 * We want to unmap and free the page table if the source is empty and
+	 * the range covers the whole page table.
+	 */
+	remove_pud = !src_p4d && IS_ALIGNED(addr, P4D_SIZE) && IS_ALIGNED(end, P4D_SIZE);
+
+	/*
+	 * P4D page got split into page table.
+	 * Clear P4D mapping. Page table will be established instead.
+	 */
+	if (p4d_large(*dst_p4d)) {
+		spin_lock(&init_mm.page_table_lock);
+		p4d_clear(dst_p4d);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	/* Allocate a new page table if needed. */
+	if (p4d_none(*dst_p4d)) {
+		new_pud = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (!new_pud)
+			return -ENOMEM;
+		dst_pud = new_pud + pud_index(addr + keyid * direct_mapping_size);
+	} else {
+		dst_pud = pud_offset(dst_p4d, addr + keyid * direct_mapping_size);
+	}
+	src_pud = src_p4d ? pud_offset(src_p4d, addr) : NULL;
+
+	do {
+		pud_t *__src_pud = src_pud;
+
+		next = pud_addr_end(addr, end);
+		if (!__src_pud || pud_none(*__src_pud)) {
+			if (pud_none(*dst_pud))
+				goto next;
+			if (pud_large(*dst_pud)) {
+				spin_lock(&init_mm.page_table_lock);
+				set_pud(dst_pud, __pud(0));
+				spin_unlock(&init_mm.page_table_lock);
+				goto next;
+			}
+			__src_pud = NULL;
+		}
+
+		if (__src_pud && pud_large(*__src_pud)) {
+			pudval_t val;
+
+			if (pud_large(*dst_pud)) {
+				/*
+				 * Sanity check: PFNs must match between source
+				 * and destination even if the rest doesn't.
+				 */
+				BUG_ON(pud_pfn(*dst_pud) != pud_pfn(*__src_pud));
+			} else if (!pud_none(*dst_pud)) {
+				/*
+				 * Page table is replaced with a pud page.
+				 * Free and unmap the page table.
+				 */
+				__free_page(pud_page(*dst_pud));
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(dst_pud);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+
+			/* Copy entry, but set KeyID. */
+			val = pud_val(*__src_pud) | keyid << mktme_keyid_shift;
+			val &= __supported_pte_mask;
+			spin_lock(&init_mm.page_table_lock);
+			set_pud(dst_pud, __pud(val));
+			spin_unlock(&init_mm.page_table_lock);
+			goto next;
+		}
+
+		ret = sync_direct_mapping_pmd(keyid, dst_pud, __src_pud,
+				addr, next);
+next:
+		addr = next;
+		dst_pud++;
+		if (src_pud)
+			src_pud++;
+	} while (addr != end && !ret);
+
+	if (new_pud) {
+		spin_lock(&init_mm.page_table_lock);
+		p4d_populate(&init_mm, dst_p4d, new_pud);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	if (remove_pud) {
+		spin_lock(&init_mm.page_table_lock);
+		__free_page(p4d_page(*dst_p4d));
+		p4d_clear(dst_p4d);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	return ret;
+}
+
+static int sync_direct_mapping_p4d(unsigned long keyid,
+		pgd_t *dst_pgd, pgd_t *src_pgd,
+		unsigned long addr, unsigned long end)
+{
+	p4d_t *src_p4d, *dst_p4d;
+	p4d_t *new_p4d_1 = NULL, *new_p4d_2 = NULL;
+	bool remove_p4d = false;
+	unsigned long next;
+	int ret = 0;
+
+	/*
+	 * We want to unmap and free the page table if the source is empty and
+	 * the range covers the whole page table.
+	 */
+	remove_p4d = !src_pgd && IS_ALIGNED(addr, PGDIR_SIZE) && IS_ALIGNED(end, PGDIR_SIZE);
+
+	/* Allocate a new page table if needed. */
+	if (pgd_none(*dst_pgd)) {
+		new_p4d_1 = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (!new_p4d_1)
+			return -ENOMEM;
+		dst_p4d = new_p4d_1 + p4d_index(addr + keyid * direct_mapping_size);
+	} else {
+		dst_p4d = p4d_offset(dst_pgd, addr + keyid * direct_mapping_size);
+	}
+	src_p4d = src_pgd ? p4d_offset(src_pgd, addr) : NULL;
+
+	do {
+		p4d_t *__src_p4d = src_p4d;
+
+		next = p4d_addr_end(addr, end);
+		if (!__src_p4d || p4d_none(*__src_p4d)) {
+			if (p4d_none(*dst_p4d))
+				goto next;
+			__src_p4d = NULL;
+		}
+
+		ret = sync_direct_mapping_pud(keyid, dst_p4d, __src_p4d,
+				addr, next);
+next:
+		addr = next;
+		dst_p4d++;
+
+		/*
+		 * Direct mappings are 1TiB-aligned. With 5-level paging it
+		 * means that on the PGD level there can be misalignment
+		 * between source and destination.
+		 *
+		 * Allocate the new page table if dst_p4d crosses page table
+		 * boundary.
+		 */
+		if (!((unsigned long)dst_p4d & ~PAGE_MASK) && addr != end) {
+			if (pgd_none(dst_pgd[1])) {
+				new_p4d_2 = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+				if (!new_p4d_2)
+					ret = -ENOMEM;
+				dst_p4d = new_p4d_2;
+			} else {
+				dst_p4d = p4d_offset(dst_pgd + 1, 0);
+			}
+		}
+		if (src_p4d)
+			src_p4d++;
+	} while (addr != end && !ret);
+
+	if (new_p4d_1 || new_p4d_2) {
+		spin_lock(&init_mm.page_table_lock);
+		if (new_p4d_1)
+			pgd_populate(&init_mm, dst_pgd, new_p4d_1);
+		if (new_p4d_2)
+			pgd_populate(&init_mm, dst_pgd + 1, new_p4d_2);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	if (remove_p4d) {
+		spin_lock(&init_mm.page_table_lock);
+		__free_page(pgd_page(*dst_pgd));
+		pgd_clear(dst_pgd);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+
+	return ret;
+}
+
+static int sync_direct_mapping_keyid(unsigned long keyid)
+{
+	pgd_t *src_pgd, *dst_pgd;
+	unsigned long addr, end, next;
+	int ret = 0;
+
+	addr = PAGE_OFFSET;
+	end = PAGE_OFFSET + direct_mapping_size;
+
+	dst_pgd = pgd_offset_k(addr + keyid * direct_mapping_size);
+	src_pgd = pgd_offset_k(addr);
+
+	do {
+		pgd_t *__src_pgd = src_pgd;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(*__src_pgd)) {
+			if (pgd_none(*dst_pgd))
+				continue;
+			__src_pgd = NULL;
+		}
+
+		ret = sync_direct_mapping_p4d(keyid, dst_pgd, __src_pgd,
+				addr, next);
+	} while (dst_pgd++, src_pgd++, addr = next, addr != end && !ret);
+
+	return ret;
+}
+
+/*
+ * For MKTME we maintain per-KeyID direct mappings. This allows the kernel
+ * to access encrypted memory.
+ *
+ * sync_direct_mapping() syncs the per-KeyID direct mappings with the
+ * canonical one -- KeyID-0.
+ *
+ * The function tracks changes in the canonical mapping:
+ *  - creating or removing chunks of the translation tree;
+ *  - changes in mapping flags (i.e. protection bits);
+ *  - splitting huge page mapping into a page table;
+ *  - replacing page table with a huge page mapping;
+ *
+ * The function needs to be called on every change to the direct mapping:
+ * hotplug, hotremove, changes in permission bits, etc.
+ *
+ * The function is a nop until MKTME is enabled.
+ */
+int sync_direct_mapping(void)
+{
+	int i, ret = 0;
+
+	if (!mktme_enabled())
+		return 0;
+
+	for (i = 1; !ret && i <= mktme_nr_keyids; i++)
+		ret = sync_direct_mapping_keyid(i);
+
+	flush_tlb_all();
+
+	return ret;
+}