diff mbox

[PATCHv4,18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

Message ID 20170307130009.GA2154@node (mailing list archive)
State New, archived
Headers show

Commit Message

Kirill A. Shutemov March 7, 2017, 1 p.m. UTC
On Mon, Mar 06, 2017 at 03:48:24PM -0500, Boris Ostrovsky wrote:
> 
> > +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
> > +		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
> > +		bool last, unsigned long limit)
> > +{
> > +	int i, nr, flush = 0;
> > +
> > +	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
> > +	for (i = 0; i < nr; i++) {
> > +		pud_t *pud;
> > +
> > +		if (p4d_none(p4d[i]))
> > +			continue;
> > +
> > +		pud = pud_offset(&p4d[i], 0);
> > +		if (PTRS_PER_PUD > 1)
> > +			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
> > +		xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
> > +	}
> > +	return flush;
> > +}
> 
> ..
> 
> > +		p4d = p4d_offset(&pgd[i], 0);
> > +		if (PTRS_PER_P4D > 1)
> > +			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
> > +		xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
> 
> 
> We are losing flush status at all levels so we need something like
> 
> flush |= xen_XXX_walk(...)

+ Xiong.

Thanks for noticing this. The fixup is below.

Please test, I don't have a setup for this.

> 
> 
> 
> >  	}
> >  
> > -out:
> >  	/* Do the top level last, so that the callbacks can use it as
> >  	   a cue to do final things like tlb flushes. */
> >  	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
> > @@ -1150,57 +1161,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
> >  	xen_free_ro_pages(pa, PAGE_SIZE);
> >  }
> >  
> > +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pte_t *pte_tbl;
> > +	int i;
> > +
> > +	if (pmd_large(*pmd)) {
> > +		pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, PMD_SIZE);
> > +		return;
> > +	}
> > +
> > +	pte_tbl = pte_offset_kernel(pmd, 0);
> > +	for (i = 0; i < PTRS_PER_PTE; i++) {
> > +		if (pte_none(pte_tbl[i]))
> > +			continue;
> > +		pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
> > +		xen_free_ro_pages(pa, PAGE_SIZE);
> > +	}
> > +	set_pmd(pmd, __pmd(0));
> > +	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pmd_t *pmd_tbl;
> > +	int i;
> > +
> > +	if (pud_large(*pud)) {
> > +		pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, PUD_SIZE);
> > +		return;
> > +	}
> > +
> > +	pmd_tbl = pmd_offset(pud, 0);
> > +	for (i = 0; i < PTRS_PER_PMD; i++) {
> > +		if (pmd_none(pmd_tbl[i]))
> > +			continue;
> > +		xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
> > +	}
> > +	set_pud(pud, __pud(0));
> > +	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
> > +{
> > +	unsigned long pa;
> > +	pud_t *pud_tbl;
> > +	int i;
> > +
> > +	if (p4d_large(*p4d)) {
> > +		pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
> > +		xen_free_ro_pages(pa, P4D_SIZE);
> > +		return;
> > +	}
> > +
> > +	pud_tbl = pud_offset(p4d, 0);
> > +	for (i = 0; i < PTRS_PER_PUD; i++) {
> > +		if (pud_none(pud_tbl[i]))
> > +			continue;
> > +		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
> > +	}
> > +	set_p4d(p4d, __p4d(0));
> > +	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
> > +}
> > +
> >  /*
> >   * Since it is well isolated we can (and since it is perhaps large we should)
> >   * also free the page tables mapping the initial P->M table.
> >   */
> >  static void __init xen_cleanmfnmap(unsigned long vaddr)
> >  {
> > -	unsigned long va = vaddr & PMD_MASK;
> > -	unsigned long pa;
> > -	pgd_t *pgd = pgd_offset_k(va);
> > -	pud_t *pud_page = pud_offset(pgd, 0);
> > -	pud_t *pud;
> > -	pmd_t *pmd;
> > -	pte_t *pte;
> > +	pgd_t *pgd;
> > +	p4d_t *p4d;
> >  	unsigned int i;
> >  	bool unpin;
> >  
> >  	unpin = (vaddr == 2 * PGDIR_SIZE);
> > -	set_pgd(pgd, __pgd(0));
> > -	do {
> > -		pud = pud_page + pud_index(va);
> > -		if (pud_none(*pud)) {
> > -			va += PUD_SIZE;
> > -		} else if (pud_large(*pud)) {
> > -			pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > -			xen_free_ro_pages(pa, PUD_SIZE);
> > -			va += PUD_SIZE;
> > -		} else {
> > -			pmd = pmd_offset(pud, va);
> > -			if (pmd_large(*pmd)) {
> > -				pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > -				xen_free_ro_pages(pa, PMD_SIZE);
> > -			} else if (!pmd_none(*pmd)) {
> > -				pte = pte_offset_kernel(pmd, va);
> > -				set_pmd(pmd, __pmd(0));
> > -				for (i = 0; i < PTRS_PER_PTE; ++i) {
> > -					if (pte_none(pte[i]))
> > -						break;
> > -					pa = pte_pfn(pte[i]) << PAGE_SHIFT;
> > -					xen_free_ro_pages(pa, PAGE_SIZE);
> > -				}
> > -				xen_cleanmfnmap_free_pgtbl(pte, unpin);
> > -			}
> > -			va += PMD_SIZE;
> > -			if (pmd_index(va))
> > -				continue;
> > -			set_pud(pud, __pud(0));
> > -			xen_cleanmfnmap_free_pgtbl(pmd, unpin);
> > -		}
> > -
> > -	} while (pud_index(va) || pmd_index(va));
> > -	xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
> > +	vaddr &= PMD_MASK;
> > +	pgd = pgd_offset_k(vaddr);
> > +	p4d = p4d_offset(pgd, 0);
> > +	for (i = 0; i < PTRS_PER_P4D; i++) {
> > +		if (p4d_none(p4d[i]))
> > +			continue;
> > +		xen_cleanmfnmap_p4d(p4d + i, unpin);
> > +	}
> 
> Don't we need to pass vaddr down to all routines so that they select
> appropriate tables? You seem to always be choosing the first one.

IIUC, we clear whole page table subtree covered by one pgd entry.
So, no, there's no need to pass vaddr down. Just pointer to page table
entry is enough.

But I know virtually nothing about Xen. Please re-check my reasoning.

I would also appreciate help with getting x86 Xen code work with 5-level
paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.

Fixup:

Comments

Boris Ostrovsky March 7, 2017, 6:18 p.m. UTC | #1
>> Don't we need to pass vaddr down to all routines so that they select
>> appropriate tables? You seem to always be choosing the first one.
> IIUC, we clear whole page table subtree covered by one pgd entry.
> So, no, there's no need to pass vaddr down. Just pointer to page table
> entry is enough.
>
> But I know virtually nothing about Xen. Please re-check my reasoning.

Yes, we effectively remove the whole page table for vaddr so I guess
it's OK.

>
> I would also appreciate help with getting x86 Xen code work with 5-level
> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.

Hmmm... that's a problem since this requires changes in the hypervisor
and even if/when these changes are made older version of hypervisor
still will not be able to run those guests.

This affects only PV guests and there is a series under review that
provides clean code separation with CONFIG_XEN_PV but because, for
example, dom0 (Xen control domain) is PV this will significantly limit
availability of dom0-capable kernels (because I assume distros will want
to have CONFIG_X86_5LEVEL).


>
> Fixup:

Yes, that works. (But then it worked even without this change because
problems caused by missing the flush would be intermittent. And a joy to
debug).

-boris
Andrew Cooper March 7, 2017, 6:26 p.m. UTC | #2
On 07/03/17 18:18, Boris Ostrovsky wrote:
>>> Don't we need to pass vaddr down to all routines so that they select
>>> appropriate tables? You seem to always be choosing the first one.
>> IIUC, we clear whole page table subtree covered by one pgd entry.
>> So, no, there's no need to pass vaddr down. Just pointer to page table
>> entry is enough.
>>
>> But I know virtually nothing about Xen. Please re-check my reasoning.
> Yes, we effectively remove the whole page table for vaddr so I guess
> it's OK.
>
>> I would also appreciate help with getting x86 Xen code work with 5-level
>> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.
> Hmmm... that's a problem since this requires changes in the hypervisor
> and even if/when these changes are made older version of hypervisor
> still will not be able to run those guests.
>
> This affects only PV guests and there is a series under review that
> provides clean code separation with CONFIG_XEN_PV but because, for
> example, dom0 (Xen control domain) is PV this will significantly limit
> availability of dom0-capable kernels (because I assume distros will want
> to have CONFIG_X86_5LEVEL).

Wasn't the plan to be able to automatically detect 4 vs 5 level support,
and cope either way, so distros didn't have to ship two different builds
of Linux?

If so, all we need to do git things to compile sensibly, and have the PV
entry code in Linux configure the rest of the kernel appropriately.

(If not, please ignore me.)

~Andrew
Boris Ostrovsky March 7, 2017, 6:45 p.m. UTC | #3
On 03/07/2017 01:26 PM, Andrew Cooper wrote:
> On 07/03/17 18:18, Boris Ostrovsky wrote:
>>>> Don't we need to pass vaddr down to all routines so that they select
>>>> appropriate tables? You seem to always be choosing the first one.
>>> IIUC, we clear whole page table subtree covered by one pgd entry.
>>> So, no, there's no need to pass vaddr down. Just pointer to page table
>>> entry is enough.
>>>
>>> But I know virtually nothing about Xen. Please re-check my reasoning.
>> Yes, we effectively remove the whole page table for vaddr so I guess
>> it's OK.
>>
>>> I would also appreciate help with getting x86 Xen code work with 5-level
>>> paging enabled. For now I make CONFIG_XEN dependent on !CONFIG_X86_5LEVEL.
>> Hmmm... that's a problem since this requires changes in the hypervisor
>> and even if/when these changes are made older version of hypervisor
>> still will not be able to run those guests.
>>
>> This affects only PV guests and there is a series under review that
>> provides clean code separation with CONFIG_XEN_PV but because, for
>> example, dom0 (Xen control domain) is PV this will significantly limit
>> availability of dom0-capable kernels (because I assume distros will want
>> to have CONFIG_X86_5LEVEL).
> Wasn't the plan to be able to automatically detect 4 vs 5 level support,
> and cope either way, so distros didn't have to ship two different builds
> of Linux?
>
> If so, all we need to do git things to compile sensibly, and have the PV
> entry code in Linux configure the rest of the kernel appropriately.

I am not aware of any plans but this would obviously be the preferred route.

-boris
diff mbox

Patch

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a4079cfab007..d66b7e79781a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -629,7 +629,8 @@  static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
 		pmd = pmd_offset(&pud[i], 0);
 		if (PTRS_PER_PMD > 1)
 			flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
-		xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
+		flush |= xen_pmd_walk(mm, pmd, func,
+				last && i == nr - 1, limit);
 	}
 	return flush;
 }
@@ -650,7 +651,8 @@  static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
 		pud = pud_offset(&p4d[i], 0);
 		if (PTRS_PER_PUD > 1)
 			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-		xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
+		flush |= xen_pud_walk(mm, pud, func,
+				last && i == nr - 1, limit);
 	}
 	return flush;
 }
@@ -706,7 +708,7 @@  static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 		p4d = p4d_offset(&pgd[i], 0);
 		if (PTRS_PER_P4D > 1)
 			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
-		xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
+		flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
 	}
 
 	/* Do the top level last, so that the callbacks can use it as