diff mbox

[5/6,v2] kvm: powerpc: booke: Add linux pte lookup like booke3s

Message ID 6A3DF150A5B70D4F9B66A25E3F7C888D070FAD49@039-SN2MPN1-012.039d.mgd.msft.net (mailing list archive)
State New, archived
Headers show

Commit Message

Bharat Bhushan Aug. 5, 2013, 2:27 p.m. UTC
> -----Original Message-----

> From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> Sent: Saturday, August 03, 2013 9:54 AM

> To: Bhushan Bharat-R65777

> Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > One of the problem I saw was that if I put this code in

> > asm/pgtable-32.h and asm/pgtable-64.h then pte_present() and other

> > friend function (on which this code depends) are defined in pgtable.h.

> > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it

> > defines pte_present() and friends functions.

> >

> > Ok, I will move this in asm/pgtable*.h; initially I fought with myself

> > to take this code in pgtable* but finally end up doing here (got

> > biased by book3s :)).

> 

> Is there a reason why these routines can not be completely generic in pgtable.h

> ?


How about the generic function:

Comments

Scott Wood Aug. 5, 2013, 7:19 p.m. UTC | #1
On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:
> 
> > -----Original Message-----
> > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]
> > Sent: Saturday, August 03, 2013 9:54 AM
> > To: Bhushan Bharat-R65777
> > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;
> > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org
> > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like
> > booke3s
> > 
> > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:
> > > One of the problem I saw was that if I put this code in
> > > asm/pgtable-32.h and asm/pgtable-64.h then pte_present() and other
> > > friend function (on which this code depends) are defined in pgtable.h.
> > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it
> > > defines pte_present() and friends functions.
> > >
> > > Ok I move wove this in asm/pgtable*.h, initially I fought with myself
> > > to take this code in pgtable* but finally end up doing here (got
> > > biased by book3s :)).
> > 
> > Is there a reason why these routines can not be completely generic in pgtable.h
> > ?
> 
> How about the generic function:
> 
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> index d257d98..21daf28 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct *mm,
>         return old;
>  }
> 
> +static inline unsigned long pte_read(pte_t *p)
> +{
> +#ifdef PTE_ATOMIC_UPDATES
> +       pte_t pte;
> +       pte_t tmp;
> +       __asm__ __volatile__ (
> +       "1:     ldarx   %0,0,%3\n"
> +       "       andi.   %1,%0,%4\n"
> +       "       bne-    1b\n"
> +       "       ori     %1,%0,%4\n"
> +       "       stdcx.  %1,0,%3\n"
> +       "       bne-    1b"
> +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)
> +       : "r" (p), "i" (_PAGE_BUSY)
> +       : "cc");
> +
> +       return pte;
> +#else  
> +       return pte_val(*p);
> +#endif
> +#endif
> +}
>  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
>                                               unsigned long addr, pte_t *ptep)

Please leave a blank line between functions.

>  {
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 690c8c2..dad712c 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -254,6 +254,45 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
>  }
>  #endif /* !CONFIG_HUGETLB_PAGE */
> 
> +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> +                                    int writing, unsigned long *pte_sizep)

The name implies that it just reads the PTE.  Setting accessed/dirty
shouldn't be an undocumented side-effect.  Why can't the caller do that
(or a different function that the caller calls afterward if desired)?  

Though even then you have the undocumented side effect of locking the
PTE on certain targets.

> +{
> +       pte_t *ptep;
> +       pte_t pte;
> +       unsigned long ps = *pte_sizep;
> +       unsigned int shift;
> +
> +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> +       if (!ptep)
> +               return __pte(0);
> +       if (shift)
> +               *pte_sizep = 1ul << shift;
> +       else
> +               *pte_sizep = PAGE_SIZE;
> +
> +       if (ps > *pte_sizep)
> +               return __pte(0);
> +
> +       if (!pte_present(*ptep))
> +               return __pte(0);
> +
> +#ifdef CONFIG_PPC64
> +       /* Lock PTE (set _PAGE_BUSY) and read */
> +       pte = pte_read(ptep);
> +#else
> +       pte = pte_val(*ptep);
> +#endif

What about 32-bit platforms that need atomic PTEs?

-Scott



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bharat Bhushan Aug. 6, 2013, 1:12 a.m. UTC | #2
> -----Original Message-----

> From: Wood Scott-B07421

> Sent: Tuesday, August 06, 2013 12:49 AM

> To: Bhushan Bharat-R65777

> Cc: Benjamin Herrenschmidt; Wood Scott-B07421; agraf@suse.de; kvm-

> ppc@vger.kernel.org; kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:

> >

> > > -----Original Message-----

> > > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> > > Sent: Saturday, August 03, 2013 9:54 AM

> > > To: Bhushan Bharat-R65777

> > > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> > > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte

> > > lookup like booke3s

> > >

> > > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > > > One of the problem I saw was that if I put this code in

> > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other

> > > > friend function (on which this code depends) are defined in pgtable.h.

> > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h

> > > > before it defines pte_present() and friends functions.

> > > >

> > > > Ok I move wove this in asm/pgtable*.h, initially I fought with

> > > > myself to take this code in pgtable* but finally end up doing here

> > > > (got biased by book3s :)).

> > >

> > > Is there a reason why these routines can not be completely generic

> > > in pgtable.h ?

> >

> > How about the generic function:

> >

> > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h

> > b/arch/powerpc/include/asm/pgtable-ppc64.h

> > index d257d98..21daf28 100644

> > --- a/arch/powerpc/include/asm/pgtable-ppc64.h

> > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h

> > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct

> *mm,

> >         return old;

> >  }

> >

> > +static inline unsigned long pte_read(pte_t *p) { #ifdef

> > +PTE_ATOMIC_UPDATES

> > +       pte_t pte;

> > +       pte_t tmp;

> > +       __asm__ __volatile__ (

> > +       "1:     ldarx   %0,0,%3\n"

> > +       "       andi.   %1,%0,%4\n"

> > +       "       bne-    1b\n"

> > +       "       ori     %1,%0,%4\n"

> > +       "       stdcx.  %1,0,%3\n"

> > +       "       bne-    1b"

> > +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)

> > +       : "r" (p), "i" (_PAGE_BUSY)

> > +       : "cc");

> > +

> > +       return pte;

> > +#else

> > +       return pte_val(*p);

> > +#endif

> > +#endif

> > +}

> >  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,

> >                                               unsigned long addr,

> > pte_t *ptep)

> 

> Please leave a blank line between functions.

> 

> >  {

> > diff --git a/arch/powerpc/include/asm/pgtable.h

> > b/arch/powerpc/include/asm/pgtable.h

> > index 690c8c2..dad712c 100644

> > --- a/arch/powerpc/include/asm/pgtable.h

> > +++ b/arch/powerpc/include/asm/pgtable.h

> > @@ -254,6 +254,45 @@ static inline pte_t

> > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,  }  #endif

> > /* !CONFIG_HUGETLB_PAGE */

> >

> > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,

> > +                                    int writing, unsigned long

> > +*pte_sizep)

> 

> The name implies that it just reads the PTE.  Setting accessed/dirty shouldn't

> be an undocumented side-effect.


Ok, will rename and document.

> Why can't the caller do that (or a different

> function that the caller calls afterward if desired)?


The current implementation in book3s is;
 1) find a pte/hugepte
 2) return null if pte not present
 3) take _PAGE_BUSY lock
 4) set accessed/dirty
 5) clear _PAGE_BUSY.

What I tried was 
1) find a pte/hugepte
2) return null if pte not present
3) return pte (not take lock by not setting _PAGE_BUSY)

4) then user calls  __ptep_set_access_flags() to atomic update the dirty/accessed flags in pte.

- but the benchmark results were not good
- Also, can there be a race, as we do not take the lock in step 3 and update in step 4?
  
> 

> Though even then you have the undocumented side effect of locking the PTE on

> certain targets.

> 

> > +{

> > +       pte_t *ptep;

> > +       pte_t pte;

> > +       unsigned long ps = *pte_sizep;

> > +       unsigned int shift;

> > +

> > +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);

> > +       if (!ptep)

> > +               return __pte(0);

> > +       if (shift)

> > +               *pte_sizep = 1ul << shift;

> > +       else

> > +               *pte_sizep = PAGE_SIZE;

> > +

> > +       if (ps > *pte_sizep)

> > +               return __pte(0);

> > +

> > +       if (!pte_present(*ptep))

> > +               return __pte(0);

> > +

> > +#ifdef CONFIG_PPC64

> > +       /* Lock PTE (set _PAGE_BUSY) and read */

> > +       pte = pte_read(ptep);

> > +#else

> > +       pte = pte_val(*ptep);

> > +#endif

> 

> What about 32-bit platforms that need atomic PTEs?


I called __ptep_set_access_flags() for both 32/64bit (for 64bit I was not calling pte_read()), which handles atomic updates. Somehow the benchmark results were not good, will try again.

Thanks
-Bharat
> 

> -Scott

>
Bharat Bhushan Aug. 6, 2013, 7:02 a.m. UTC | #3
> -----Original Message-----

> From: Bhushan Bharat-R65777

> Sent: Tuesday, August 06, 2013 6:42 AM

> To: Wood Scott-B07421

> Cc: Benjamin Herrenschmidt; agraf@suse.de; kvm-ppc@vger.kernel.org;

> kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> Subject: RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like

> booke3s

> 

> 

> 

> > -----Original Message-----

> > From: Wood Scott-B07421

> > Sent: Tuesday, August 06, 2013 12:49 AM

> > To: Bhushan Bharat-R65777

> > Cc: Benjamin Herrenschmidt; Wood Scott-B07421; agraf@suse.de; kvm-

> > ppc@vger.kernel.org; kvm@vger.kernel.org;

> > linuxppc-dev@lists.ozlabs.org

> > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup

> > like booke3s

> >

> > On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:

> > >

> > > > -----Original Message-----

> > > > From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]

> > > > Sent: Saturday, August 03, 2013 9:54 AM

> > > > To: Bhushan Bharat-R65777

> > > > Cc: Wood Scott-B07421; agraf@suse.de; kvm-ppc@vger.kernel.org;

> > > > kvm@vger.kernel.org; linuxppc-dev@lists.ozlabs.org

> > > > Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte

> > > > lookup like booke3s

> > > >

> > > > On Sat, 2013-08-03 at 02:58 +0000, Bhushan Bharat-R65777 wrote:

> > > > > One of the problem I saw was that if I put this code in

> > > > > asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and

> > > > > other friend function (on which this code depends) are defined in

> pgtable.h.

> > > > > And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h

> > > > > before it defines pte_present() and friends functions.

> > > > >

> > > > > Ok I move wove this in asm/pgtable*.h, initially I fought with

> > > > > myself to take this code in pgtable* but finally end up doing

> > > > > here (got biased by book3s :)).

> > > >

> > > > Is there a reason why these routines can not be completely generic

> > > > in pgtable.h ?

> > >

> > > How about the generic function:

> > >

> > > diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h

> > > b/arch/powerpc/include/asm/pgtable-ppc64.h

> > > index d257d98..21daf28 100644

> > > --- a/arch/powerpc/include/asm/pgtable-ppc64.h

> > > +++ b/arch/powerpc/include/asm/pgtable-ppc64.h

> > > @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct

> > > mm_struct

> > *mm,

> > >         return old;

> > >  }

> > >

> > > +static inline unsigned long pte_read(pte_t *p) { #ifdef

> > > +PTE_ATOMIC_UPDATES

> > > +       pte_t pte;

> > > +       pte_t tmp;

> > > +       __asm__ __volatile__ (

> > > +       "1:     ldarx   %0,0,%3\n"

> > > +       "       andi.   %1,%0,%4\n"

> > > +       "       bne-    1b\n"

> > > +       "       ori     %1,%0,%4\n"

> > > +       "       stdcx.  %1,0,%3\n"

> > > +       "       bne-    1b"

> > > +       : "=&r" (pte), "=&r" (tmp), "=m" (*p)

> > > +       : "r" (p), "i" (_PAGE_BUSY)

> > > +       : "cc");

> > > +

> > > +       return pte;

> > > +#else

> > > +       return pte_val(*p);

> > > +#endif

> > > +#endif

> > > +}

> > >  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,

> > >                                               unsigned long addr,

> > > pte_t *ptep)

> >

> > Please leave a blank line between functions.

> >

> > >  {

> > > diff --git a/arch/powerpc/include/asm/pgtable.h

> > > b/arch/powerpc/include/asm/pgtable.h

> > > index 690c8c2..dad712c 100644

> > > --- a/arch/powerpc/include/asm/pgtable.h

> > > +++ b/arch/powerpc/include/asm/pgtable.h

> > > @@ -254,6 +254,45 @@ static inline pte_t

> > > *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,  }

> > > #endif

> > > /* !CONFIG_HUGETLB_PAGE */

> > >

> > > +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,

> > > +                                    int writing, unsigned long

> > > +*pte_sizep)

> >

> > The name implies that it just reads the PTE.  Setting accessed/dirty

> > shouldn't be an undocumented side-effect.

> 

> Ok, will rename and document.

> 

> > Why can't the caller do that (or a different function that the caller

> > calls afterward if desired)?

> 

> The current implementation in book3s is;

>  1) find a pte/hugepte

>  2) return null if pte not present

>  3) take _PAGE_BUSY lock

>  4) set accessed/dirty

>  5) clear _PAGE_BUSY.

> 

> What I tried was

> 1) find a pte/hugepte

> 2) return null if pte not present

> 3) return pte (not take lock by not setting _PAGE_BUSY)

> 

> 4) then user calls  __ptep_set_access_flags() to atomic update the

> dirty/accessed flags in pte.

> 

> - but the benchmark results were not good

> - Also can there be race as we do not take lock in step 3 and update in step 4 ?

> 

> >

> > Though even then you have the undocumented side effect of locking the

> > PTE on certain targets.

> >

> > > +{

> > > +       pte_t *ptep;

> > > +       pte_t pte;

> > > +       unsigned long ps = *pte_sizep;

> > > +       unsigned int shift;

> > > +

> > > +       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);

> > > +       if (!ptep)

> > > +               return __pte(0);

> > > +       if (shift)

> > > +               *pte_sizep = 1ul << shift;

> > > +       else

> > > +               *pte_sizep = PAGE_SIZE;

> > > +

> > > +       if (ps > *pte_sizep)

> > > +               return __pte(0);

> > > +

> > > +       if (!pte_present(*ptep))

> > > +               return __pte(0);

> > > +

> > > +#ifdef CONFIG_PPC64

> > > +       /* Lock PTE (set _PAGE_BUSY) and read */

> > > +       pte = pte_read(ptep);

> > > +#else

> > > +       pte = pte_val(*ptep);

> > > +#endif

> >

> > What about 32-bit platforms that need atomic PTEs?

> 

> I called __ptep_set_access_flags() for both 32/64bit (for 64bit I was not

> calling pte_read()), which handles atomic updates. Somehow the benchmark result

> were not good, will try again.


Hi Paul,

I am trying to make the Linux pte search and update generic so that this can be used for powerpc as well.

I am not sure which of the below two should be ok, please help

--------------------------- (1) This --------------------------
/*
 * Lock and read a linux PTE.  If it's present and writable, atomically
 * set dirty and referenced bits and return the PTE, otherwise return 0.
 */
static inline atomic_read_update_pte_flags(pte_t *ptep, int writing)
{
        pte_t pte;
#ifdef CONFIG_PPC64
        /* Lock PTE (set _PAGE_BUSY) and read
         * XXX: NOTE: Some Architectures (book3e) does not have pte Locking,
         * so not generic in that sense for all architecture
         */
        pte = pte_read(ptep);
#else
        /* XXX: do not see any read locking on 32 bit architecture */
        pte = pte_val(*ptep);
#endif
        if (pte_present(pte)) {
                pte = pte_mkyoung(pte);
                if (writing && pte_write(pte))
                        pte = pte_mkdirty(pte);
        }

        *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */

        return pte;
}


--------------------- (2) OR THIS ----------------------------------

/*
 * Read a linux PTE.  If it's present and writable, atomically
 * set dirty and referenced bits and return the PTE, otherwise return 0.
 */
static inline atomic_read_update_pte_flags(pte_t *ptep, int writing)
{
        pte_t pte;
        pte = pte_val(*ptep);
        if (pte_present(pte)) {
                pte = pte_mkyoung(pte);
                if (writing && pte_write(pte))
                        pte = pte_mkdirty(pte);
        }

        __ptep_set_access_flags(ptep, pte);

        return pte_val(*ptep);
}

Thanks
-Bharat

> 

> Thanks

> -Bharat

> >

> > -Scott

> >
Bharat Bhushan Aug. 6, 2013, 2:46 p.m. UTC | #4
DQoNCj4gLS0tLS1PcmlnaW5hbCBNZXNzYWdlLS0tLS0NCj4gRnJvbTogV29vZCBTY290dC1CMDc0
MjENCj4gU2VudDogVHVlc2RheSwgQXVndXN0IDA2LCAyMDEzIDEyOjQ5IEFNDQo+IFRvOiBCaHVz
aGFuIEJoYXJhdC1SNjU3NzcNCj4gQ2M6IEJlbmphbWluIEhlcnJlbnNjaG1pZHQ7IFdvb2QgU2Nv
dHQtQjA3NDIxOyBhZ3JhZkBzdXNlLmRlOyBrdm0tDQo+IHBwY0B2Z2VyLmtlcm5lbC5vcmc7IGt2
bUB2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4cHBjLWRldkBsaXN0cy5vemxhYnMub3JnDQo+IFN1Ympl
Y3Q6IFJlOiBbUEFUQ0ggNS82IHYyXSBrdm06IHBvd2VycGM6IGJvb2tlOiBBZGQgbGludXggcHRl
IGxvb2t1cCBsaWtlDQo+IGJvb2tlM3MNCj4gDQo+IE9uIE1vbiwgMjAxMy0wOC0wNSBhdCAwOToy
NyAtMDUwMCwgQmh1c2hhbiBCaGFyYXQtUjY1Nzc3IHdyb3RlOg0KPiA+DQo+ID4gPiAtLS0tLU9y
aWdpbmFsIE1lc3NhZ2UtLS0tLQ0KPiA+ID4gRnJvbTogQmVuamFtaW4gSGVycmVuc2NobWlkdCBb
bWFpbHRvOmJlbmhAa2VybmVsLmNyYXNoaW5nLm9yZ10NCj4gPiA+IFNlbnQ6IFNhdHVyZGF5LCBB
dWd1c3QgMDMsIDIwMTMgOTo1NCBBTQ0KPiA+ID4gVG86IEJodXNoYW4gQmhhcmF0LVI2NTc3Nw0K
PiA+ID4gQ2M6IFdvb2QgU2NvdHQtQjA3NDIxOyBhZ3JhZkBzdXNlLmRlOyBrdm0tcHBjQHZnZXIu
a2VybmVsLm9yZzsNCj4gPiA+IGt2bUB2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4cHBjLWRldkBsaXN0
cy5vemxhYnMub3JnDQo+ID4gPiBTdWJqZWN0OiBSZTogW1BBVENIIDUvNiB2Ml0ga3ZtOiBwb3dl
cnBjOiBib29rZTogQWRkIGxpbnV4IHB0ZQ0KPiA+ID4gbG9va3VwIGxpa2UgYm9va2Uzcw0KPiA+
ID4NCj4gPiA+IE9uIFNhdCwgMjAxMy0wOC0wMyBhdCAwMjo1OCArMDAwMCwgQmh1c2hhbiBCaGFy
YXQtUjY1Nzc3IHdyb3RlOg0KPiA+ID4gPiBPbmUgb2YgdGhlIHByb2JsZW0gSSBzYXcgd2FzIHRo
YXQgaWYgSSBwdXQgdGhpcyBjb2RlIGluDQo+ID4gPiA+IGFzbS9wZ3RhYmxlLTMyLmggYW5kIGFz
bS9wZ3RhYmxlLTY0LmggdGhlbiBwdGVfcGVyc2VudCgpIGFuZCBvdGhlcg0KPiA+ID4gPiBmcmll
bmQgZnVuY3Rpb24gKG9uIHdoaWNoIHRoaXMgY29kZSBkZXBlbmRzKSBhcmUgZGVmaW5lZCBpbiBw
Z3RhYmxlLmguDQo+ID4gPiA+IEFuZCBwZ3RhYmxlLmggaW5jbHVkZXMgYXNtL3BndGFibGUtMzIu
aCBhbmQgYXNtL3BndGFibGUtNjQuaA0KPiA+ID4gPiBiZWZvcmUgaXQgZGVmaW5lcyBwdGVfcHJl
c2VudCgpIGFuZCBmcmllbmRzIGZ1bmN0aW9ucy4NCj4gPiA+ID4NCj4gPiA+ID4gT2sgSSBtb3Zl
IHdvdmUgdGhpcyBpbiBhc20vcGd0YWJsZSouaCwgaW5pdGlhbGx5IEkgZm91Z2h0IHdpdGgNCj4g
PiA+ID4gbXlzZWxmIHRvIHRha2UgdGhpcyBjb2RlIGluIHBndGFibGUqIGJ1dCBmaW5hbGx5IGVu
ZCB1cCBkb2luZyBoZXJlDQo+ID4gPiA+IChnb3QgYmlhc2VkIGJ5IGJvb2szcyA6KSkuDQo+ID4g
Pg0KPiA+ID4gSXMgdGhlcmUgYSByZWFzb24gd2h5IHRoZXNlIHJvdXRpbmVzIGNhbiBub3QgYmUg
Y29tcGxldGVseSBnZW5lcmljDQo+ID4gPiBpbiBwZ3RhYmxlLmggPw0KPiA+DQo+ID4gSG93IGFi
b3V0IHRoZSBnZW5lcmljIGZ1bmN0aW9uOg0KPiA+DQo+ID4gZGlmZiAtLWdpdCBhL2FyY2gvcG93
ZXJwYy9pbmNsdWRlL2FzbS9wZ3RhYmxlLXBwYzY0LmgNCj4gPiBiL2FyY2gvcG93ZXJwYy9pbmNs
dWRlL2FzbS9wZ3RhYmxlLXBwYzY0LmgNCj4gPiBpbmRleCBkMjU3ZDk4Li4yMWRhZjI4IDEwMDY0
NA0KPiA+IC0tLSBhL2FyY2gvcG93ZXJwYy9pbmNsdWRlL2FzbS9wZ3RhYmxlLXBwYzY0LmgNCj4g
PiArKysgYi9hcmNoL3Bvd2VycGMvaW5jbHVkZS9hc20vcGd0YWJsZS1wcGM2NC5oDQo+ID4gQEAg
LTIyMSw2ICsyMjEsMjcgQEAgc3RhdGljIGlubGluZSB1bnNpZ25lZCBsb25nIHB0ZV91cGRhdGUo
c3RydWN0IG1tX3N0cnVjdA0KPiAqbW0sDQo+ID4gICAgICAgICByZXR1cm4gb2xkOw0KPiA+ICB9
DQo+ID4NCj4gPiArc3RhdGljIGlubGluZSB1bnNpZ25lZCBsb25nIHB0ZV9yZWFkKHB0ZV90ICpw
KSB7ICNpZmRlZg0KPiA+ICtQVEVfQVRPTUlDX1VQREFURVMNCj4gPiArICAgICAgIHB0ZV90IHB0
ZTsNCj4gPiArICAgICAgIHB0ZV90IHRtcDsNCj4gPiArICAgICAgIF9fYXNtX18gX192b2xhdGls
ZV9fICgNCj4gPiArICAgICAgICIxOiAgICAgbGRhcnggICAlMCwwLCUzXG4iDQo+ID4gKyAgICAg
ICAiICAgICAgIGFuZGkuICAgJTEsJTAsJTRcbiINCj4gPiArICAgICAgICIgICAgICAgYm5lLSAg
ICAxYlxuIg0KPiA+ICsgICAgICAgIiAgICAgICBvcmkgICAgICUxLCUwLCU0XG4iDQo+ID4gKyAg
ICAgICAiICAgICAgIHN0ZGN4LiAgJTEsMCwlM1xuIg0KPiA+ICsgICAgICAgIiAgICAgICBibmUt
ICAgIDFiIg0KPiA+ICsgICAgICAgOiAiPSZyIiAocHRlKSwgIj0mciIgKHRtcCksICI9bSIgKCpw
KQ0KPiA+ICsgICAgICAgOiAiciIgKHApLCAiaSIgKF9QQUdFX0JVU1kpDQo+ID4gKyAgICAgICA6
ICJjYyIpOw0KPiA+ICsNCj4gPiArICAgICAgIHJldHVybiBwdGU7DQo+ID4gKyNlbHNlDQo+ID4g
KyAgICAgICByZXR1cm4gcHRlX3ZhbCgqcCk7DQo+ID4gKyNlbmRpZg0KPiA+ICsjZW5kaWYNCj4g
PiArfQ0KPiA+ICBzdGF0aWMgaW5saW5lIGludCBfX3B0ZXBfdGVzdF9hbmRfY2xlYXJfeW91bmco
c3RydWN0IG1tX3N0cnVjdCAqbW0sDQo+ID4gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAg
ICAgICAgICAgICAgICAgIHVuc2lnbmVkIGxvbmcgYWRkciwNCj4gPiBwdGVfdCAqcHRlcCkNCj4g
DQo+IFBsZWFzZSBsZWF2ZSBhIGJsYW5rIGxpbmUgYmV0d2VlbiBmdW5jdGlvbnMuDQo+IA0KPiA+
ICB7DQo+ID4gZGlmZiAtLWdpdCBhL2FyY2gvcG93ZXJwYy9pbmNsdWRlL2FzbS9wZ3RhYmxlLmgN
Cj4gPiBiL2FyY2gvcG93ZXJwYy9pbmNsdWRlL2FzbS9wZ3RhYmxlLmgNCj4gPiBpbmRleCA2OTBj
OGMyLi5kYWQ3MTJjIDEwMDY0NA0KPiA+IC0tLSBhL2FyY2gvcG93ZXJwYy9pbmNsdWRlL2FzbS9w
Z3RhYmxlLmgNCj4gPiArKysgYi9hcmNoL3Bvd2VycGMvaW5jbHVkZS9hc20vcGd0YWJsZS5oDQo+
ID4gQEAgLTI1NCw2ICsyNTQsNDUgQEAgc3RhdGljIGlubGluZSBwdGVfdA0KPiA+ICpmaW5kX2xp
bnV4X3B0ZV9vcl9odWdlcHRlKHBnZF90ICpwZ2RpciwgdW5zaWduZWQgbG9uZyBlYSwgIH0gICNl
bmRpZg0KPiA+IC8qICFDT05GSUdfSFVHRVRMQl9QQUdFICovDQo+ID4NCj4gPiArc3RhdGljIGlu
bGluZSBwdGVfdCBsb29rdXBfbGludXhfcHRlKHBnZF90ICpwZ2RpciwgdW5zaWduZWQgbG9uZyBo
dmEsDQo+ID4gKyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGludCB3cml0aW5n
LCB1bnNpZ25lZCBsb25nDQo+ID4gKypwdGVfc2l6ZXApDQo+IA0KPiBUaGUgbmFtZSBpbXBsaWVz
IHRoYXQgaXQganVzdCByZWFkcyB0aGUgUFRFLiAgU2V0dGluZyBhY2Nlc3NlZC9kaXJ0eSBzaG91
bGRuJ3QNCj4gYmUgYW4gdW5kb2N1bWVudGVkIHNpZGUtZWZmZWN0LiAgV2h5IGNhbid0IHRoZSBj
YWxsZXIgZG8gdGhhdCAob3IgYSBkaWZmZXJlbnQNCj4gZnVuY3Rpb24gdGhhdCB0aGUgY2FsbGVy
IGNhbGxzIGFmdGVyd2FyZCBpZiBkZXNpcmVkKT8NCg0KU2NvdHQsIEkgc2VudCB0aGUgbmV4dCB2
ZXJzaW9uIG9mIHBhdGNoIGJhc2VkIG9uIGFib3ZlIGlkZWEuIE5vdyBJIHRoaW5rIHdlIGRvIG5v
dCBuZWVkIHRvIHVwZGF0ZSB0aGUgcHRlIGZsYWdzIG9uIGJvb2tlIA0KU28gd2UgZG8gbm90IG5l
ZWQgdG8gc29sdmUgdGhlIGt2bXBwY19yZWFkX3VwZGF0ZV9saW51eF9wdGUoKSBzdHVmZiBvZiBi
b29rM3MuDQoNCi1CaGFyYXQNCg0KPiANCj4gVGhvdWdoIGV2ZW4gdGhlbiB5b3UgaGF2ZSB0aGUg
dW5kb2N1bWVudGVkIHNpZGUgZWZmZWN0IG9mIGxvY2tpbmcgdGhlIFBURSBvbg0KPiBjZXJ0YWlu
IHRhcmdldHMuDQo+IA0KPiA+ICt7DQo+ID4gKyAgICAgICBwdGVfdCAqcHRlcDsNCj4gPiArICAg
ICAgIHB0ZV90IHB0ZTsNCj4gPiArICAgICAgIHVuc2lnbmVkIGxvbmcgcHMgPSAqcHRlX3NpemVw
Ow0KPiA+ICsgICAgICAgdW5zaWduZWQgaW50IHNoaWZ0Ow0KPiA+ICsNCj4gPiArICAgICAgIHB0
ZXAgPSBmaW5kX2xpbnV4X3B0ZV9vcl9odWdlcHRlKHBnZGlyLCBodmEsICZzaGlmdCk7DQo+ID4g
KyAgICAgICBpZiAoIXB0ZXApDQo+ID4gKyAgICAgICAgICAgICAgIHJldHVybiBfX3B0ZSgwKTsN
Cj4gPiArICAgICAgIGlmIChzaGlmdCkNCj4gPiArICAgICAgICAgICAgICAgKnB0ZV9zaXplcCA9
IDF1bCA8PCBzaGlmdDsNCj4gPiArICAgICAgIGVsc2UNCj4gPiArICAgICAgICAgICAgICAgKnB0
ZV9zaXplcCA9IFBBR0VfU0laRTsNCj4gPiArDQo+ID4gKyAgICAgICBpZiAocHMgPiAqcHRlX3Np
emVwKQ0KPiA+ICsgICAgICAgICAgICAgICByZXR1cm4gX19wdGUoMCk7DQo+ID4gKw0KPiA+ICsg
ICAgICAgaWYgKCFwdGVfcHJlc2VudCgqcHRlcCkpDQo+ID4gKyAgICAgICAgICAgICAgIHJldHVy
biBfX3B0ZSgwKTsNCj4gPiArDQo+ID4gKyNpZmRlZiBDT05GSUdfUFBDNjQNCj4gPiArICAgICAg
IC8qIExvY2sgUFRFIChzZXQgX1BBR0VfQlVTWSkgYW5kIHJlYWQgKi8NCj4gPiArICAgICAgIHB0
ZSA9IHB0ZV9yZWFkKHB0ZXApOw0KPiA+ICsjZWxzZQ0KPiA+ICsgICAgICAgcHRlID0gcHRlX3Zh
bCgqcHRlcCk7DQo+ID4gKyNlbmRpZg0KPiANCj4gV2hhdCBhYm91dCAzMi1iaXQgcGxhdGZvcm1z
IHRoYXQgbmVlZCBhdG9taWMgUFRFcz8NCj4gDQo+IC1TY290dA0KPiANCg0K

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Aug. 7, 2013, 12:24 a.m. UTC | #5
On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> 
> I am trying to make the Linux pte search and update generic so that this can be used for powerpc as well.
> 
> I am not sure which of the below two should be ok, please help

Given that the BookE code uses gfn_to_pfn_memslot() to get the host
pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
the guest write to, I don't think you need to set the dirty and/or
accessed bits in the Linux PTE explicitly.  If you care about the
WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
PTE, but you really should be using mmu_notifier_retry() to guard
against concurrent changes to the Linux PTE.  See the HV KVM code or
patch 21 of my recent series to see how it's used.  You probably
should be calling kvm_set_pfn_accessed() as well.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Wood Aug. 7, 2013, 1:11 a.m. UTC | #6
On Wed, 2013-08-07 at 10:24 +1000, Paul Mackerras wrote:
> On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> > 
> > I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.
> > 
> > I am not sure which of the below two should be ok, please help
> 
> Given that the BookE code uses gfn_to_pfn_memslot() to get the host
> pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
> the guest write to, I don't think you need to set the dirty and/or
> accessed bits in the Linux PTE explicitly.  If you care about the
> WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
> PTE, but you really should be using mmu_notifier_retry() to guard
> against concurrent changes to the Linux PTE.  See the HV KVM code or
> patch 21 of my recent series to see how it's used. 

Hmm... we only get a callback on invalidate_range_start(), not
invalidate_range_end() (and even if we did get a callback for the
latter, it'd probably be racy).  So we may have a problem here
regardless of getting WIMG from the PTE, unless it's guaranteed that
hva_to_pfn() will fail after invalidate_range_start().

>  You probably should be calling kvm_set_pfn_accessed() as well.

Yeah...  I think it'll only affect the quality of page-out decisions (as
opposed to corruption and such), but still it should be fixed.

-Scott



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Aug. 7, 2013, 1:47 a.m. UTC | #7
On Tue, Aug 06, 2013 at 08:11:34PM -0500, Scott Wood wrote:
> On Wed, 2013-08-07 at 10:24 +1000, Paul Mackerras wrote:
> > On Tue, Aug 06, 2013 at 07:02:48AM +0000, Bhushan Bharat-R65777 wrote:
> > > 
> > > I am trying to me the Linux pte search and update generic so that this can be used for powerpc as well.
> > > 
> > > I am not sure which of the below two should be ok, please help
> > 
> > Given that the BookE code uses gfn_to_pfn_memslot() to get the host
> > pfn, and then kvm_set_pfn_dirty(pfn) on pages that you're going to let
> > the guest write to, I don't think you need to set the dirty and/or
> > accessed bits in the Linux PTE explicitly.  If you care about the
> > WIMGE bits you can do find_linux_pte_or_hugepte() and just look at the
> > PTE, but you really should be using mmu_notifier_retry() to guard
> > against concurrent changes to the Linux PTE.  See the HV KVM code or
> > patch 21 of my recent series to see how it's used. 
> 
> Hmm... we only get a callback on invalidate_range_start(), not
> invalidate_range_end() (and even if we did get a callback for the
> latter, it'd probably be racy).  So we may have a problem here
> regardless of getting WIMG from the PTE, unless it's guaranteed that
> hva_to_pfn() will fail after invalidate_range_start().

No, it's not guaranteed.  You have to use mmu_notifier_retry().  It
tells you if either (a) some sort of invalidation has happened since
you snapshotted kvm->mmu_notifier_seq, or (b) an
invalidate_range_start...end sequence is currently in progress.  In
either case you should discard any PTE or pfn information you
collected and retry.

> >  You probably should be calling kvm_set_pfn_accessed() as well.
> 
> Yeah...  I think it'll only affect the quality of page-out decisions (as
> opposed to corruption and such), but still it should be fixed.

Right.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index d257d98..21daf28 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -221,6 +221,27 @@  static inline unsigned long pte_update(struct mm_struct *mm,
        return old;
 }

+static inline unsigned long pte_read(pte_t *p)
+{
+#ifdef PTE_ATOMIC_UPDATES
+       pte_t pte;
+       pte_t tmp;
+       __asm__ __volatile__ (
+       "1:     ldarx   %0,0,%3\n"
+       "       andi.   %1,%0,%4\n"
+       "       bne-    1b\n"
+       "       ori     %1,%0,%4\n"
+       "       stdcx.  %1,0,%3\n"
+       "       bne-    1b"
+       : "=&r" (pte), "=&r" (tmp), "=m" (*p)
+       : "r" (p), "i" (_PAGE_BUSY)
+       : "cc");
+
+       return pte;
+#else  
+       return pte_val(*p);
+#endif
+#endif
+}
 static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
                                              unsigned long addr, pte_t *ptep)
 {
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 690c8c2..dad712c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -254,6 +254,45 @@  static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */

+static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+                                    int writing, unsigned long *pte_sizep)
+{
+       pte_t *ptep;
+       pte_t pte;
+       unsigned long ps = *pte_sizep;
+       unsigned int shift;
+
+       ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+       if (!ptep)
+               return __pte(0);
+       if (shift)
+               *pte_sizep = 1ul << shift;
+       else
+               *pte_sizep = PAGE_SIZE;
+
+       if (ps > *pte_sizep)
+               return __pte(0);
+
+       if (!pte_present(*ptep))
+               return __pte(0);
+
+#ifdef CONFIG_PPC64
+       /* Lock PTE (set _PAGE_BUSY) and read */
+       pte = pte_read(ptep);
+#else
+       pte = pte_val(*ptep);
+#endif
+       if (pte_present(pte)) {
+               pte = pte_mkyoung(pte);
+               if (writing && pte_write(pte))
+                       pte = pte_mkdirty(pte);
+       }
+
+       *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */
+
+       return pte;
+}
+
 #endif /* __ASSEMBLY__ */

 #endif /* __KERNEL__ */