diff mbox series

x86/vmap: handle superpages in vmap_to_mfn()

Message ID 34de4c4326673c60d3e2cbd3bbcbcca481906524.1606755042.git.hongyxia@amazon.com (mailing list archive)
State Superseded
Headers show
Series x86/vmap: handle superpages in vmap_to_mfn() | expand

Commit Message

Hongyan Xia Nov. 30, 2020, 4:50 p.m. UTC
From: Hongyan Xia <hongyxia@amazon.com>

There is simply no guarantee that vmap won't return superpages to the
caller. It can happen if the list of MFNs is contiguous, or we simply
have a large granularity. Although rare, if such things do happen, we
will simply hit BUG_ON() and crash. Properly handle such cases in a new
implementation.

Note that vmap_to_mfn() is now too large to be a macro, so implement it
as a normal function and move the declaration to mm.h (page.h cannot
handle mfn_t).

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
---
 xen/arch/x86/domain_page.c |  2 +-
 xen/arch/x86/mm.c          | 43 ++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/mm.h   |  2 ++
 xen/include/asm-x86/page.h |  2 --
 4 files changed, 46 insertions(+), 3 deletions(-)

Comments

Jan Beulich Dec. 2, 2020, 10:04 a.m. UTC | #1
On 30.11.2020 17:50, Hongyan Xia wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
> 
> There is simply no guarantee that vmap won't return superpages to the
> caller. It can happen if the list of MFNs are contiguous, or we simply
> have a large granularity. Although rare, if such things do happen, we
> will simply hit BUG_ON() and crash. Properly handle such cases in a new
> implementation.
> 
> Note that vmap is now too large to be a macro, so implement it as a
> normal function and move the declaration to mm.h (page.h cannot handle
> mfn_t).

I'm not happy about this movement, and it's also not really clear to
me what problem page.h would have in principle. Yes, it can't
include xen/mm.h, but I think it is long overdue that we disentangle
this at least some. Let me see what I can do as a prereq for this
change, but see also below.

> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -5194,6 +5194,49 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
>          }                                          \
>      } while ( false )
>  
> +mfn_t vmap_to_mfn(const void *v)
> +{
> +    bool locking = system_state > SYS_STATE_boot;
> +    unsigned int l2_offset = l2_table_offset((unsigned long)v);
> +    unsigned int l1_offset = l1_table_offset((unsigned long)v);
> +    l3_pgentry_t *pl3e = virt_to_xen_l3e((unsigned long)v);
> +    l2_pgentry_t *pl2e;
> +    l1_pgentry_t *pl1e;

Can't all three of them be pointer-to-const?

> +    struct page_info *l3page;
> +    mfn_t ret;
> +
> +    ASSERT(pl3e);

    if ( !pl3e )
    {
        ASSERT_UNREACHABLE();
        return INVALID_MFN;
    }

as per the bottom of ./CODING_STYLE? (Similarly further down
then.)

> +    l3page = virt_to_page(pl3e);
> +    L3T_LOCK(l3page);
> +
> +    ASSERT(l3e_get_flags(*pl3e) & _PAGE_PRESENT);
> +    if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
> +    {
> +        ret = mfn_add(l3e_get_mfn(*pl3e),
> +                      (l2_offset << PAGETABLE_ORDER) + l1_offset);
> +        L3T_UNLOCK(l3page);
> +        return ret;

To keep the locked region as narrow as possible

        mfn = l3e_get_mfn(*pl3e);
        L3T_UNLOCK(l3page);
        return mfn_add(mfn, (l2_offset << PAGETABLE_ORDER) + l1_offset);

? However, in particular because of the recurring unlocks on
the exit paths I wonder whether it wouldn't be better to
restructure the whole function such that there'll be one unlock
and one return. Otoh I'm afraid what I'm asking for here is
going to yield a measurable set of goto-s ...

> +    }
> +
> +    pl2e = map_l2t_from_l3e(*pl3e) + l2_offset;
> +    ASSERT(l2e_get_flags(*pl2e) & _PAGE_PRESENT);
> +    if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
> +    {
> +        ret = mfn_add(l2e_get_mfn(*pl2e), l1_offset);
> +        L3T_UNLOCK(l3page);
> +        return ret;
> +    }
> +
> +    pl1e = map_l1t_from_l2e(*pl2e) + l1_offset;
> +    UNMAP_DOMAIN_PAGE(pl2e);
> +    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
> +    ret = l1e_get_mfn(*pl1e);
> +    L3T_UNLOCK(l3page);
> +    UNMAP_DOMAIN_PAGE(pl1e);
> +
> +    return ret;
> +}

Now for the name of the function: The only aspect tying it
somewhat to vmap() is that it assumes (asserts) it'll find a
valid mapping. I think it wants renaming, and vmap_to_mfn()
then would become a #define of it (perhaps even retaining
its property of getting unsigned long passed in), at which
point it also doesn't need moving out of page.h. As to the
actual name, xen_map_to_mfn() to somewhat match up with
map_pages_to_xen()?

Jan
Hongyan Xia Dec. 2, 2020, 12:17 p.m. UTC | #2
On Wed, 2020-12-02 at 11:04 +0100, Jan Beulich wrote:
> On 30.11.2020 17:50, Hongyan Xia wrote:
> > From: Hongyan Xia <hongyxia@amazon.com>
> > 
> > There is simply no guarantee that vmap won't return superpages to
> > the
> > caller. It can happen if the list of MFNs are contiguous, or we
> > simply
> > have a large granularity. Although rare, if such things do happen,
> > we
> > will simply hit BUG_ON() and crash. Properly handle such cases in a
> > new
> > implementation.
> > 
> > Note that vmap is now too large to be a macro, so implement it as a
> > normal function and move the declaration to mm.h (page.h cannot
> > handle
> > mfn_t).
> 
> I'm not happy about this movement, and it's also not really clear to
> me what problem page.h would have in principle. Yes, it can't
> include xen/mm.h, but I think it is long overdue that we disentangle
> this at least some. Let me see what I can do as a prereq for this
> change, but see also below.
> 
> > --- a/xen/arch/x86/mm.c
> > +++ b/xen/arch/x86/mm.c
> > @@ -5194,6 +5194,49 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long
> > v)
> >          }                                          \
> >      } while ( false )
> >  
> > +mfn_t vmap_to_mfn(const void *v)
> > +{
> > +    bool locking = system_state > SYS_STATE_boot;
> > +    unsigned int l2_offset = l2_table_offset((unsigned long)v);
> > +    unsigned int l1_offset = l1_table_offset((unsigned long)v);
> > +    l3_pgentry_t *pl3e = virt_to_xen_l3e((unsigned long)v);
> > +    l2_pgentry_t *pl2e;
> > +    l1_pgentry_t *pl1e;
> 
> Can't all three of them be pointer-to-const?

They can (and yes they should).

> > +    struct page_info *l3page;
> > +    mfn_t ret;
> > +
> > +    ASSERT(pl3e);
> 
>     if ( !pl3e )
>     {
>         ASSERT_UNREACHABLE();
>         return INVALID_MFN;
>     }
> 
> as per the bottom of ./CODING_STYLE? (Similarly further down
> then.)

Will revise.

> > +    l3page = virt_to_page(pl3e);
> > +    L3T_LOCK(l3page);
> > +
> > +    ASSERT(l3e_get_flags(*pl3e) & _PAGE_PRESENT);
> > +    if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
> > +    {
> > +        ret = mfn_add(l3e_get_mfn(*pl3e),
> > +                      (l2_offset << PAGETABLE_ORDER) + l1_offset);
> > +        L3T_UNLOCK(l3page);
> > +        return ret;
> 
> To keep the locked region as narrow as possible
> 
>         mfn = l3e_get_mfn(*pl3e);
>         L3T_UNLOCK(l3page);
>         return mfn_add(mfn, (l2_offset << PAGETABLE_ORDER) +
> l1_offset);
> 
> ? However, in particular because of the recurring unlocks on
> the exit paths I wonder whether it wouldn't be better to
> restructure the whole function such that there'll be one unlock
> and one return. Otoh I'm afraid what I'm asking for here is
> going to yield a measurable set of goto-s ...

I can do that.

But what about the lock narrowing? It will be slightly trickier when
there is a goto. Naturally:

    ret = full return mfn;
    goto out;

out:
    UNLOCK();
    return ret;

but with narrowing, my first reaction is:

    ret = high bits of mfn;
    l2_offset = 0;
    l1_offset = 0;
    goto out;

out:
    UNLOCK();
    return ret + (l2_offset << TABLE_ORDER) + l1_offset;

To be honest, I doubt it is really worth it and I prefer the first one.

> 
> > +    }
> > +
> > +    pl2e = map_l2t_from_l3e(*pl3e) + l2_offset;
> > +    ASSERT(l2e_get_flags(*pl2e) & _PAGE_PRESENT);
> > +    if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
> > +    {
> > +        ret = mfn_add(l2e_get_mfn(*pl2e), l1_offset);
> > +        L3T_UNLOCK(l3page);
> > +        return ret;
> > +    }
> > +
> > +    pl1e = map_l1t_from_l2e(*pl2e) + l1_offset;
> > +    UNMAP_DOMAIN_PAGE(pl2e);
> > +    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
> > +    ret = l1e_get_mfn(*pl1e);
> > +    L3T_UNLOCK(l3page);
> > +    UNMAP_DOMAIN_PAGE(pl1e);
> > +
> > +    return ret;
> > +}
> 
> Now for the name of the function: The only aspect tying it
> somewhat to vmap() is that it assumes (asserts) it'll find a
> valid mapping. I think it wants renaming, and vmap_to_mfn()
> then would become a #define of it (perhaps even retaining
> its property of getting unsigned long passed in), at which
> point it also doesn't need moving out of page.h. As to the
> actual name, xen_map_to_mfn() to somewhat match up with
> map_pages_to_xen()?

I actually really like this idea. I will come up with something in the
next rev. But if we want to make it generic, shouldn't we drop the
ASSERTs on pl*e and the PRESENT flag and just return INVALID_MFN
instead? Then this function would work both on mapped addresses (the
assumption of vmap_to_mfn()) and in other use cases.

Hongyan
Jan Beulich Dec. 2, 2020, 1:05 p.m. UTC | #3
On 02.12.2020 13:17, Hongyan Xia wrote:
> On Wed, 2020-12-02 at 11:04 +0100, Jan Beulich wrote:
>> On 30.11.2020 17:50, Hongyan Xia wrote:
>>> +    l3page = virt_to_page(pl3e);
>>> +    L3T_LOCK(l3page);
>>> +
>>> +    ASSERT(l3e_get_flags(*pl3e) & _PAGE_PRESENT);
>>> +    if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
>>> +    {
>>> +        ret = mfn_add(l3e_get_mfn(*pl3e),
>>> +                      (l2_offset << PAGETABLE_ORDER) + l1_offset);
>>> +        L3T_UNLOCK(l3page);
>>> +        return ret;
>>
>> To keep the locked region as narrow as possible
>>
>>         mfn = l3e_get_mfn(*pl3e);
>>         L3T_UNLOCK(l3page);
>>         return mfn_add(mfn, (l2_offset << PAGETABLE_ORDER) +
>> l1_offset);
>>
>> ? However, in particular because of the recurring unlocks on
>> the exit paths I wonder whether it wouldn't be better to
>> restructure the whole function such that there'll be one unlock
>> and one return. Otoh I'm afraid what I'm asking for here is
>> going to yield a measurable set of goto-s ...
> 
> I can do that.
> 
> But what about the lock narrowing? Will be slightly more tricky when
> there is goto. Naturally:
> 
>     ret = full return mfn;
>     goto out;
> 
> out:
>     UNLOCK();
>     return ret;
> 
> but with narrowing, my first reaction is:
> 
>     ret = high bits of mfn;
>     l2_offset = 0;
>     l1_offset = 0;
>     goto out;
> 
> out:
>     UNLOCK();
>     return mfn + l2_offset << TABLE_ORDER + l1_offset;
> 
> To be honest, I doubt it is really worth it and I prefer the first one.

That's why I said "However ..." - I did realize both won't fit
together very well.

>>> +    }
>>> +
>>> +    pl2e = map_l2t_from_l3e(*pl3e) + l2_offset;
>>> +    ASSERT(l2e_get_flags(*pl2e) & _PAGE_PRESENT);
>>> +    if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
>>> +    {
>>> +        ret = mfn_add(l2e_get_mfn(*pl2e), l1_offset);
>>> +        L3T_UNLOCK(l3page);
>>> +        return ret;
>>> +    }
>>> +
>>> +    pl1e = map_l1t_from_l2e(*pl2e) + l1_offset;
>>> +    UNMAP_DOMAIN_PAGE(pl2e);
>>> +    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
>>> +    ret = l1e_get_mfn(*pl1e);
>>> +    L3T_UNLOCK(l3page);
>>> +    UNMAP_DOMAIN_PAGE(pl1e);
>>> +
>>> +    return ret;
>>> +}
>>
>> Now for the name of the function: The only aspect tying it
>> somewhat to vmap() is that it assumes (asserts) it'll find a
>> valid mapping. I think it wants renaming, and vmap_to_mfn()
>> then would become a #define of it (perhaps even retaining
>> its property of getting unsigned long passed in), at which
>> point it also doesn't need moving out of page.h. As to the
>> actual name, xen_map_to_mfn() to somewhat match up with
>> map_pages_to_xen()?
> 
> I actually really like this idea. I will come up with something in the
> next rev. But if we want to make it generic, shouldn't we not ASSERT on
> pl*e and the PRESENT flag and just return INVALID_MFN? Then this
> function works on both mapped address (assumption of vmap_to_mfn()) and
> other use cases.

Depends - we can still document that this function is going to
require a valid mapping. I did consider the generalization, too,
but this to a certain degree also collides with virt_to_xen_l3e()
allocating an L3 table, which isn't what we would want for a
fully generic lookup function. IOW - I'm undecided and will take
it from wherever you move it (albeit with no promise to not ask
for further adjustment).

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index eac5e3304fb8..4ba75d397a17 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -338,7 +338,7 @@  mfn_t domain_page_map_to_mfn(const void *ptr)
         return _mfn(virt_to_mfn(ptr));
 
     if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
-        return vmap_to_mfn(va);
+        return vmap_to_mfn(ptr);
 
     ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 5a50339284c7..c22385e90d8a 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5194,6 +5194,49 @@  l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
         }                                          \
     } while ( false )
 
+mfn_t vmap_to_mfn(const void *v)
+{
+    bool locking = system_state > SYS_STATE_boot;
+    unsigned int l2_offset = l2_table_offset((unsigned long)v);
+    unsigned int l1_offset = l1_table_offset((unsigned long)v);
+    l3_pgentry_t *pl3e = virt_to_xen_l3e((unsigned long)v);
+    l2_pgentry_t *pl2e;
+    l1_pgentry_t *pl1e;
+    struct page_info *l3page;
+    mfn_t ret;
+
+    ASSERT(pl3e);
+    l3page = virt_to_page(pl3e);
+    L3T_LOCK(l3page);
+
+    ASSERT(l3e_get_flags(*pl3e) & _PAGE_PRESENT);
+    if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
+    {
+        ret = mfn_add(l3e_get_mfn(*pl3e),
+                      (l2_offset << PAGETABLE_ORDER) + l1_offset);
+        L3T_UNLOCK(l3page);
+        return ret;
+    }
+
+    pl2e = map_l2t_from_l3e(*pl3e) + l2_offset;
+    ASSERT(l2e_get_flags(*pl2e) & _PAGE_PRESENT);
+    if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
+    {
+        ret = mfn_add(l2e_get_mfn(*pl2e), l1_offset);
+        L3T_UNLOCK(l3page);
+        return ret;
+    }
+
+    pl1e = map_l1t_from_l2e(*pl2e) + l1_offset;
+    UNMAP_DOMAIN_PAGE(pl2e);
+    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
+    ret = l1e_get_mfn(*pl1e);
+    L3T_UNLOCK(l3page);
+    UNMAP_DOMAIN_PAGE(pl1e);
+
+    return ret;
+}
+
 int map_pages_to_xen(
     unsigned long virt,
     mfn_t mfn,
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index deeba75a1cbb..6354d165f48b 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -578,6 +578,8 @@  mfn_t alloc_xen_pagetable_new(void);
 void free_xen_pagetable_new(mfn_t mfn);
 
 l1_pgentry_t *virt_to_xen_l1e(unsigned long v);
+mfn_t vmap_to_mfn(const void *v);
+#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
 
 int __sync_local_execstate(void);
 
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 7a771baf7cb3..b2bcc95fd2de 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -291,8 +291,6 @@  void copy_page_sse2(void *, const void *);
 #define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
 #define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
 #define paddr_to_pdx(pa)    pfn_to_pdx(paddr_to_pfn(pa))
-#define vmap_to_mfn(va)     l1e_get_mfn(*virt_to_xen_l1e((unsigned long)(va)))
-#define vmap_to_page(va)    mfn_to_page(vmap_to_mfn(va))
 
 #endif /* !defined(__ASSEMBLY__) */