diff mbox

[V4,4/6] x86/hvm: pkeys, add pkeys support for guest_walk_tables

Message ID 1450682504-32286-5-git-send-email-huaitong.han@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Huaitong Han Dec. 21, 2015, 7:21 a.m. UTC
Protection keys define a new 4-bit protection key field(PKEY) in bits 62:59 of
leaf entries of the page tables.

PKRU register defines 32 bits, there are 16 domains and 2 attribute bits per
domain in pkru, for each i (0 ≤ i ≤ 15), PKRU[2i] is the access-disable bit for
protection key i (ADi); PKRU[2i+1] is the write-disable bit for protection key
i (WDi). PKEY is index to a defined domain.

A fault is considered as a PKU violation if all of the following conditions are
true:
1.CR4_PKE=1.
2.EFER_LMA=1.
3.Page is present with no reserved bit violations.
4.The access is not an instruction fetch.
5.The access is to a user page.
6.PKRU.AD=1
    or The access is a data write and PKRU.WD=1
        and either CR0.WP=1 or it is a user access.

Signed-off-by: Huaitong Han <huaitong.han@intel.com>
---
 xen/arch/x86/mm/guest_walk.c      | 65 +++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/hap/guest_walk.c  |  3 ++
 xen/include/asm-x86/guest_pt.h    |  7 +++++
 xen/include/asm-x86/hvm/hvm.h     |  2 ++
 xen/include/asm-x86/page.h        |  5 +++
 xen/include/asm-x86/processor.h   | 39 +++++++++++++++++++++++
 xen/include/asm-x86/x86_64/page.h | 12 ++++++++
 7 files changed, 133 insertions(+)

Comments

Jan Beulich Dec. 21, 2015, 3:32 p.m. UTC | #1
>>> On 21.12.15 at 08:21, <huaitong.han@intel.com> wrote:
> --- a/xen/arch/x86/mm/guest_walk.c
> +++ b/xen/arch/x86/mm/guest_walk.c
> @@ -90,6 +90,55 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
>      return 0;
>  }
>  
> +#if GUEST_PAGING_LEVELS >= CONFIG_PAGING_LEVELS

GUEST_PAGING_LEVELS >= 4 (just like further down)

> +bool_t leaf_pte_pkeys_check(struct vcpu *vcpu,
> +        uint32_t pfec, uint32_t pte_pkey)
> +{
> +    unsigned int pkru = 0;
> +    bool_t pkru_ad, pkru_wd;
> +

Stray blank line.

> +    bool_t pf = !!(pfec & PFEC_page_present);
> +    bool_t uf = !!(pfec & PFEC_user_mode);
> +    bool_t wf = !!(pfec & PFEC_write_access);
> +    bool_t ff = !!(pfec & PFEC_insn_fetch);
> +    bool_t rsvdf = !!(pfec & PFEC_reserved_bit);
> +
> +    /* When page is present,  PFEC_prot_key is always checked */
> +    if ( !pf || is_pv_vcpu(vcpu) )
> +        return 0;

I think for a function called "check" together with how its callers use
it the return value meaning is inverted here. Also the comment seems
inverted wrt the actual check (and is missing a full stop). And doesn't
key 0 have static meaning, in which case you could bail early (and
namely avoid the expensive RDPKRU further down)?

> +    /*
> +     * PKU:  additional mechanism by which the paging controls
> +     * access to user-mode addresses based on the value in the
> +     * PKRU register. A fault is considered as a PKU violation if all
> +     * of the following conditions are ture:
> +     * 1.CR4_PKE=1.
> +     * 2.EFER_LMA=1.
> +     * 3.page is present with no reserved bit violations.
> +     * 4.the access is not an instruction fetch.
> +     * 5.the access is to a user page.
> +     * 6.PKRU.AD=1
> +     *       or The access is a data write and PKRU.WD=1
> +     *            and either CR0.WP=1 or it is a user access.
> +     */
> +    if ( !hvm_pku_enabled(vcpu) ||
> +            !hvm_long_mode_enabled(vcpu) || rsvdf || ff )

Where's the "user page" check? Also - indentation.

> +        return 0;
> +
> +    pkru = read_pkru();
> +    if ( unlikely(pkru) )
> +    {
> +        pkru_ad = read_pkru_ad(pkru, pte_pkey);
> +        pkru_wd = read_pkru_wd(pkru, pte_pkey);
> +        /* Condition 6 */
> +        if ( pkru_ad || (pkru_wd && wf && (hvm_wp_enabled(vcpu) || uf)))

Ah, uf is being checked here. But according to the comment it could
(and should, again to avoid the RDPKRU) move up.

> @@ -270,6 +324,12 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
>  
>      pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); 
>  
> +#if GUEST_PAGING_LEVELS >= 4
> +    pkey = guest_l2e_get_pkey(gw->l2e);
> +    if ( pse2M && leaf_pte_pkeys_check(v, pfec, pkey) )
> +        rc |= _PAGE_PKEY_BITS;
> +#endif

I think the #ifdef isn't really needed here, if you moved the one
around leaf_pte_pkeys_check() into that function, and if you
perhaps also dropped the "pkey" local variable.

Jan
Huaitong Han Dec. 22, 2015, 8:12 a.m. UTC | #2
On Mon, 2015-12-21 at 08:32 -0700, Jan Beulich wrote:
> > > > On 21.12.15 at 08:21, <huaitong.han@intel.com> wrote:
> > --- a/xen/arch/x86/mm/guest_walk.c
> > +++ b/xen/arch/x86/mm/guest_walk.c
> > @@ -90,6 +90,55 @@ static uint32_t set_ad_bits(void *guest_p, void
> > *walk_p, int set_dirty)
> >      return 0;
> >  }
> >  
> > +#if GUEST_PAGING_LEVELS >= CONFIG_PAGING_LEVELS
> 
> GUEST_PAGING_LEVELS >= 4 (just like further down)
The code is modified according Andrew's comments:
"
  This is a latent linking bug for the future 
  when 5 levels comes along.

  It will probably be best to use the same trick
  as gw_page_flags to compile it once but use it
  multiple times.
"

> 
> > +bool_t leaf_pte_pkeys_check(struct vcpu *vcpu,
> > +        uint32_t pfec, uint32_t pte_pkey)
> > +{
> > +    unsigned int pkru = 0;
> > +    bool_t pkru_ad, pkru_wd;
> > +
> 
> Stray blank line.
> 
> > +    bool_t pf = !!(pfec & PFEC_page_present);
> > +    bool_t uf = !!(pfec & PFEC_user_mode);
> > +    bool_t wf = !!(pfec & PFEC_write_access);
> > +    bool_t ff = !!(pfec & PFEC_insn_fetch);
> > +    bool_t rsvdf = !!(pfec & PFEC_reserved_bit);
> > +
> > +    /* When page is present,  PFEC_prot_key is always checked */
> > +    if ( !pf || is_pv_vcpu(vcpu) )
> > +        return 0;
> 
> I think for a function called "check" together with how its callers
> use
> it the return value meaning is inverted here. 
I will change the function name to "pkey_page_fault".

> Also the comment seems
> inverted wrt the actual check (and is missing a full stop). And
> doesn't
> key 0 have static meaning, in which case you could bail early (and
> namely avoid the expensive RDPKRU further down)?
Key 0 has no static meaning; the default key may be different in
different OSes.
> 
> > +    /*
> > +     * PKU:  additional mechanism by which the paging controls
> > +     * access to user-mode addresses based on the value in the
> > +     * PKRU register. A fault is considered as a PKU violation if
> > all
> > +     * of the following conditions are ture:
> > +     * 1.CR4_PKE=1.
> > +     * 2.EFER_LMA=1.
> > +     * 3.page is present with no reserved bit violations.
> > +     * 4.the access is not an instruction fetch.
> > +     * 5.the access is to a user page.
> > +     * 6.PKRU.AD=1
> > +     *       or The access is a data write and PKRU.WD=1
> > +     *            and either CR0.WP=1 or it is a user access.
> > +     */
> > +    if ( !hvm_pku_enabled(vcpu) ||
> > +            !hvm_long_mode_enabled(vcpu) || rsvdf || ff )
> 
> Where's the "user page" check? Also - indentation.
it should be:
    if ( !hvm_pku_enabled(vcpu) || !hvm_long_mode_enabled(vcpu) ||
            rsvdf || ff || !(pte_flags & _PAGE_USER) )
> 
> > +        return 0;
> > +
> > +    pkru = read_pkru();
> > +    if ( unlikely(pkru) )
> > +    {
> > +        pkru_ad = read_pkru_ad(pkru, pte_pkey);
> > +        pkru_wd = read_pkru_wd(pkru, pte_pkey);
> > +        /* Condition 6 */
> > +        if ( pkru_ad || (pkru_wd && wf && (hvm_wp_enabled(vcpu) ||
> > uf)))
> 
> Ah, uf is being checked here. But according to the comment it could
> (and should, again to avoid the RDPKRU) move up.
> 
> > @@ -270,6 +324,12 @@ guest_walk_tables(struct vcpu *v, struct
> > p2m_domain *p2m,
> >  
> >      pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); 
> >  
> > +#if GUEST_PAGING_LEVELS >= 4
> > +    pkey = guest_l2e_get_pkey(gw->l2e);
> > +    if ( pse2M && leaf_pte_pkeys_check(v, pfec, pkey) )
> > +        rc |= _PAGE_PKEY_BITS;
> > +#endif
> 
> I think the #ifdef isn't really needed here, if you moved the one
> around leaf_pte_pkeys_check() into that function, and if you
> perhaps also dropped the "pkey" local variable.
guest_l2e_get_pkey has different macro depend on GUEST_PAGING_LEVELS
too, and I think it's better to keep it.
> 
> Jan
>
Jan Beulich Dec. 22, 2015, 8:43 a.m. UTC | #3
>>> On 22.12.15 at 09:12, <huaitong.han@intel.com> wrote:
> On Mon, 2015-12-21 at 08:32 -0700, Jan Beulich wrote:
>> > > > On 21.12.15 at 08:21, <huaitong.han@intel.com> wrote:
>> > --- a/xen/arch/x86/mm/guest_walk.c
>> > +++ b/xen/arch/x86/mm/guest_walk.c
>> > @@ -90,6 +90,55 @@ static uint32_t set_ad_bits(void *guest_p, void
>> > *walk_p, int set_dirty)
>> >      return 0;
>> >  }
>> >  
>> > +#if GUEST_PAGING_LEVELS >= CONFIG_PAGING_LEVELS
>> 
>> GUEST_PAGING_LEVELS >= 4 (just like further down)
> The code is modified according Andrew's comments:
> "
>   This is a latent linking bug for the future 
>   when 5 levels comes along.
> 
>   It will probably be best to use the same trick
>   as gw_page_flags to compile it once but use it
>   multiple times.
> "

Okay, understood. But then you should follow _that_ model (using
== instead of >=) instead of inventing a 3rd variant. Similar things
should be done in similar ways so that they can be easily recognized
being similar.

Otoh the function isn't being called from code other than
GUEST_PAGING_LEVELS == 4, so at present it could be static,
which would take care of the latent linking issue Andrew referred to.

>> > +    bool_t pf = !!(pfec & PFEC_page_present);
>> > +    bool_t uf = !!(pfec & PFEC_user_mode);
>> > +    bool_t wf = !!(pfec & PFEC_write_access);
>> > +    bool_t ff = !!(pfec & PFEC_insn_fetch);
>> > +    bool_t rsvdf = !!(pfec & PFEC_reserved_bit);
>> > +
>> > +    /* When page is present,  PFEC_prot_key is always checked */
>> > +    if ( !pf || is_pv_vcpu(vcpu) )
>> > +        return 0;
>> 
>> I think for a function called "check" together with how its callers
>> use
>> it the return value meaning is inverted here. 
> I will change the function name to "pkey_page_fault".

Perhaps just pkey_fault() then, since it's clear there's no other fault
possible here besides a page fault?

>> Also the comment seems
>> inverted wrt the actual check (and is missing a full stop). And
>> doesn't
>> key 0 have static meaning, in which case you could bail early (and
>> namely avoid the expensive RDPKRU further down)?
> Key 0 have no static meaning, the default key maybe different in
> different OS.

Okay - I thought I had seen something like this mentioned in
another sub-thread.

>> > @@ -270,6 +324,12 @@ guest_walk_tables(struct vcpu *v, struct
>> > p2m_domain *p2m,
>> >  
>> >      pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); 
>> >  
>> > +#if GUEST_PAGING_LEVELS >= 4
>> > +    pkey = guest_l2e_get_pkey(gw->l2e);
>> > +    if ( pse2M && leaf_pte_pkeys_check(v, pfec, pkey) )
>> > +        rc |= _PAGE_PKEY_BITS;
>> > +#endif
>> 
>> I think the #ifdef isn't really needed here, if you moved the one
>> around leaf_pte_pkeys_check() into that function, and if you
>> perhaps also dropped the "pkey" local variable.
> guest_l2e_get_pkey has different macro depend on GUEST_PAGING_LEVELS
> too, and I think it's better to keep it.

I didn't suggest to drop guest_l2e_get_pkey(). I'd like to see the
#ifdef-s go away if at all possible, to help code readability.

Jan
diff mbox

Patch

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index 18d1acf..f65ba27 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -90,6 +90,55 @@  static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
     return 0;
 }
 
+#if GUEST_PAGING_LEVELS >= CONFIG_PAGING_LEVELS
+bool_t leaf_pte_pkeys_check(struct vcpu *vcpu,
+        uint32_t pfec, uint32_t pte_pkey)
+{
+    unsigned int pkru = 0;
+    bool_t pkru_ad, pkru_wd;
+
+    bool_t pf = !!(pfec & PFEC_page_present);
+    bool_t uf = !!(pfec & PFEC_user_mode);
+    bool_t wf = !!(pfec & PFEC_write_access);
+    bool_t ff = !!(pfec & PFEC_insn_fetch);
+    bool_t rsvdf = !!(pfec & PFEC_reserved_bit);
+
+    /* When the page is present, PFEC_prot_key is always checked. */
+    if ( !pf || is_pv_vcpu(vcpu) )
+        return 0;
+
+    /*
+     * PKU:  additional mechanism by which the paging controls
+     * access to user-mode addresses based on the value in the
+     * PKRU register. A fault is considered as a PKU violation if all
+     * of the following conditions are true:
+     * 1.CR4_PKE=1.
+     * 2.EFER_LMA=1.
+     * 3.page is present with no reserved bit violations.
+     * 4.the access is not an instruction fetch.
+     * 5.the access is to a user page.
+     * 6.PKRU.AD=1
+     *       or The access is a data write and PKRU.WD=1
+     *            and either CR0.WP=1 or it is a user access.
+     */
+    if ( !hvm_pku_enabled(vcpu) ||
+            !hvm_long_mode_enabled(vcpu) || rsvdf || ff )
+        return 0;
+
+    pkru = read_pkru();
+    if ( unlikely(pkru) )
+    {
+        pkru_ad = read_pkru_ad(pkru, pte_pkey);
+        pkru_wd = read_pkru_wd(pkru, pte_pkey);
+        /* Condition 6 */
+        if ( pkru_ad || (pkru_wd && wf && (hvm_wp_enabled(vcpu) || uf)))
+            return 1;
+    }
+
+    return 0;
+}
+#endif
+
 /* Walk the guest pagetables, after the manner of a hardware walker. */
 /* Because the walk is essentially random, it can cause a deadlock 
  * warning in the p2m locking code. Highly unlikely this is an actual
@@ -106,6 +155,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
+    unsigned int pkey;
 #endif
     uint32_t gflags, mflags, iflags, rc = 0;
     bool_t smep = 0, smap = 0;
@@ -190,6 +240,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
         goto out;
     /* Get the l3e and check its flags*/
     gw->l3e = l3p[guest_l3_table_offset(va)];
+    pkey = guest_l3e_get_pkey(gw->l3e);
     gflags = guest_l3e_get_flags(gw->l3e) ^ iflags;
     if ( !(gflags & _PAGE_PRESENT) ) {
         rc |= _PAGE_PRESENT;
@@ -199,6 +250,9 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     
     pse1G = (gflags & _PAGE_PSE) && guest_supports_1G_superpages(v); 
 
+    if ( pse1G && leaf_pte_pkeys_check(v, pfec, pkey) )
+        rc |= _PAGE_PKEY_BITS;
+
     if ( pse1G )
     {
         /* Generate a fake l1 table entry so callers don't all 
@@ -270,6 +324,12 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 
     pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); 
 
+#if GUEST_PAGING_LEVELS >= 4
+    pkey = guest_l2e_get_pkey(gw->l2e);
+    if ( pse2M && leaf_pte_pkeys_check(v, pfec, pkey) )
+        rc |= _PAGE_PKEY_BITS;
+#endif
+
     if ( pse2M )
     {
         /* Special case: this guest VA is in a PSE superpage, so there's
@@ -330,6 +390,11 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
             goto out;
         }
         rc |= ((gflags & mflags) ^ mflags);
+#if GUEST_PAGING_LEVELS >= 4
+        pkey = guest_l1e_get_pkey(gw->l1e);
+        if ( leaf_pte_pkeys_check(v, pfec, pkey) )
+            rc |= _PAGE_PKEY_BITS;
+#endif
     }
 
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index 11c1b35..49d0328 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -130,6 +130,9 @@  unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
     if ( missing & _PAGE_INVALID_BITS ) 
         pfec[0] |= PFEC_reserved_bit;
 
+    if ( missing & _PAGE_PKEY_BITS )
+        pfec[0] |= PFEC_prot_key;
+
     if ( missing & _PAGE_PAGED )
         pfec[0] = PFEC_page_paged;
 
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 3447973..d02f296 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -154,6 +154,13 @@  static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
 { return l4e_get_flags(gl4e); }
 #endif
 
+static inline u32 guest_l1e_get_pkey(guest_l1e_t gl1e)
+{ return l1e_get_pkey(gl1e); }
+static inline u32 guest_l2e_get_pkey(guest_l2e_t gl2e)
+{ return l2e_get_pkey(gl2e); }
+static inline u32 guest_l3e_get_pkey(guest_l3e_t gl3e)
+{ return l3e_get_pkey(gl3e); }
+
 static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
 { return l1e_from_pfn(gfn_x(gfn), flags); }
 static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index f80e143..79b3421 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -275,6 +275,8 @@  int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMAP))
 #define hvm_nx_enabled(v) \
     (!!((v)->arch.hvm_vcpu.guest_efer & EFER_NX))
+#define hvm_pku_enabled(v) \
+    (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE))
 
 /* Can we use superpages in the HAP p2m table? */
 #define hap_has_1gb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_1GB))
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index a095a93..d69d91d 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -93,6 +93,11 @@ 
 #define l3e_get_flags(x)           (get_pte_flags((x).l3))
 #define l4e_get_flags(x)           (get_pte_flags((x).l4))
 
+/* Get pte pkeys (unsigned int). */
+#define l1e_get_pkey(x)           (get_pte_pkey((x).l1))
+#define l2e_get_pkey(x)           (get_pte_pkey((x).l2))
+#define l3e_get_pkey(x)           (get_pte_pkey((x).l3))
+
 /* Construct an empty pte. */
 #define l1e_empty()                ((l1_pgentry_t) { 0 })
 #define l2e_empty()                ((l2_pgentry_t) { 0 })
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 3f8411f..3f251aa 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -373,6 +373,45 @@  static always_inline void clear_in_cr4 (unsigned long mask)
     write_cr4(read_cr4() & ~mask);
 }
 
+static inline unsigned int read_pkru(void)
+{
+    unsigned int pkru;
+
+    /*
+     * _PAGE_PKEY_BITS conflict with _PAGE_GNTTAB used by PV guests,
+     * so X86_CR4_PKE is disabled in the hypervisor; the RDPKRU instruction
+     * can be used by temporarily setting CR4.PKE.
+     */
+    set_in_cr4(X86_CR4_PKE);
+    asm volatile (".byte 0x0f,0x01,0xee"
+        : "=a" (pkru) : "c" (0) : "dx");
+    clear_in_cr4(X86_CR4_PKE);
+
+    return pkru;
+}
+
+/* Macros for PKRU domain */
+#define PKRU_READ  (0)
+#define PKRU_WRITE (1)
+#define PKRU_ATTRS (2)
+
+/*
+ * PKRU defines 32 bits; there are 16 domains and 2 attribute bits per
+ * domain in PKRU. A pkey is an index to a defined domain, so the value of
+ * pte_pkey * PKRU_ATTRS + R/W is the offset of a defined domain attribute.
+ */
+static inline bool_t read_pkru_ad(unsigned int pkru, unsigned int pkey)
+{
+    ASSERT(pkey < 16);
+    return (pkru >> (pkey * PKRU_ATTRS + PKRU_READ)) & 1;
+}
+
+static inline bool_t read_pkru_wd(unsigned int pkru, unsigned int pkey)
+{
+    ASSERT(pkey < 16);
+    return (pkru >> (pkey * PKRU_ATTRS + PKRU_WRITE)) & 1;
+}
+
 /*
  *      NSC/Cyrix CPU configuration register indexes
  */
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 19ab4d0..86abb94 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -134,6 +134,18 @@  typedef l4_pgentry_t root_pgentry_t;
 #define get_pte_flags(x) (((int)((x) >> 40) & ~0xFFF) | ((int)(x) & 0xFFF))
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
 
+/*
+ * Protection keys define a new 4-bit protection key field
+ * (PKEY) in bits 62:59 of leaf entries of the page tables.
+ * This corresponds to bit 22:19 of a 24-bit flags.
+ *
+ * Notice: Bit 22 is used by _PAGE_GNTTAB which is visible to PV guests,
+ * so Protection keys must be disabled on PV guests.
+ */
+#define _PAGE_PKEY_BITS  (0x780000)	 /* Protection Keys, 22:19 */
+
+#define get_pte_pkey(x) (MASK_EXTR(get_pte_flags(x), _PAGE_PKEY_BITS))
+
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
 #define _PAGE_NX_BIT (1U<<23)