[v7,6/8] KVM: MMU: Add support for PKS emulation

Message ID 20220424101557.134102-7-lei4.wang@intel.com (mailing list archive)
State New, archived
Series KVM: PKS Virtualization support

Commit Message

Lei Wang April 24, 2022, 10:15 a.m. UTC
From: Chenyi Qiang <chenyi.qiang@intel.com>

Up until now, pkr_mask had 0 bits for supervisor pages (the U/S bit in
the page tables replaces PFEC.RSVD in the page fault error code).  For
PKS support, fill in those bits using the same algorithm used for
user-mode pages, but with CR4.PKE replaced by CR4.PKS.  Because of this
change, CR4.PKS must also be included in the MMU role.

Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
Co-developed-by: Lei Wang <lei4.wang@intel.com>
Signed-off-by: Lei Wang <lei4.wang@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  10 +--
 arch/x86/kvm/mmu.h              |   3 +-
 arch/x86/kvm/mmu/mmu.c          | 109 +++++++++++++++++++++-----------
 3 files changed, 80 insertions(+), 42 deletions(-)
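
For readers who want the architectural rule that pkr_mask encodes spelled
out, below is a minimal standalone sketch (not KVM code; the function name
and every parameter are invented for illustration).  It assumes the caller
has already checked EFER.LMA=1 and that CR4.PKE (for user-mode addresses)
or CR4.PKS (for supervisor-mode addresses) is enabled:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustration only.  PKRU/PKRS hold 16 two-bit entries: for key N,
 * bit 2N is access-disable (AD) and bit 2N+1 is write-disable (WD).
 * PKRU guards user-mode addresses, PKRS guards supervisor-mode
 * addresses.  Code fetches are never PKR-checked, and WD is ignored
 * for supervisor-mode accesses when CR0.WP=0.
 */
static bool pkr_blocks_access(uint32_t pkru, uint32_t pkrs,
			      unsigned int pkey, bool user_addr,
			      bool user_access, bool write,
			      bool fetch, bool cr0_wp)
{
	uint32_t pkr = user_addr ? pkru : pkrs;
	bool ad = (pkr >> (pkey * 2)) & 1;
	bool wd = (pkr >> (pkey * 2 + 1)) & 1;

	if (fetch)
		return false;
	if (ad)
		return true;
	return write && wd && (user_access || cr0_wp);
}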

Comments

Sean Christopherson May 24, 2022, 11:28 p.m. UTC | #1
On Sun, Apr 24, 2022, Lei Wang wrote:
> @@ -454,10 +455,11 @@ struct kvm_mmu {
>  	u8 permissions[16];
>  
>  	/*
> -	* The pkru_mask indicates if protection key checks are needed.  It
> -	* consists of 16 domains indexed by page fault error code bits [4:1],
> -	* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
> -	* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
> +	* The pkr_mask indicates if protection key checks are needed.
> +	* It consists of 16 domains indexed by page fault error code
> +	* bits[4:1] with PFEC.RSVD replaced by ACC_USER_MASK from the
> +	* page tables. Each domain has 2 bits which are ANDed with AD
> +	* and WD from PKRU/PKRS.

Same comments, align and wrap closer to 80 please.

>  	*/
>  	u32 pkr_mask;
>  
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index cea03053a153..6963c641e6ce 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -45,7 +45,8 @@
>  #define PT32E_ROOT_LEVEL 3
>  
>  #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
> -			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
> +			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
> +			       X86_CR4_PKS)
>  
>  #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
>  #define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d3276986102..a6cbc22d3312 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -209,6 +209,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
>  BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
>  BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
>  BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
> +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pks, X86_CR4_PKS);
>  BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
>  BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
>  
> @@ -231,6 +232,7 @@ BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
>  BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
>  BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
>  BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
> +BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pks);
>  BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
>  
>  static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
> @@ -4608,37 +4610,58 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
>  }
>  
>  /*

...

> + * Protection Key Rights (PKR) is an additional mechanism by which data accesses
> + * with 4-level or 5-level paging (EFER.LMA=1) may be disabled based on the
> + * Protection Key Rights Userspace (PKRU) or Protection Key Rights Supervisor
> + * (PKRS) registers.  The Protection Key (PK) used for an access is a 4-bit
> + * value specified in bits 62:59 of the leaf PTE used to translate the address.
> + *
> + * PKRU and PKRS are 32-bit registers, with 16 2-bit entries consisting of an
> + * access-disable (AD) and write-disable (WD) bit.  The PK from the leaf PTE is
> + * used to index the approriate PKR (see below), e.g. PK=1 would consume bits

s/approriate/appropriate

> + * 3:2 (bit 3 == write-disable, bit 2 == access-disable).
> + *
> + * The PK register (PKRU vs. PKRS) indexed by the PK depends on the type of
> + * _address_ (not access type!).  For a user-mode address, PKRU is used; for a
> + * supervisor-mode address, PKRS is used.  An address is supervisor-mode if the
> + * U/S flag (bit 2) is 0 in at least one of the paging-structure entries, i.e.
> + * an address is user-mode if the U/S flag is 1 in _all_ entries.  Again, this
> + * is the address type, not the the access type, e.g. a supervisor-mode _access_

Double "the the" can be a single "the".

> + * will consume PKRU if the _address_ is a user-mode address.
> + *
> + * As alluded to above, PKR checks are only performed for data accesses; code
> + * fetches are not subject to PKR checks.  Terminal page faults (!PRESENT or
> + * PFEC.RSVD=1) are also not subject to PKR checks.
> + *
> + * PKR write-disable checks for supervisor-mode _accesses_ are performed if and
> + * only if CR0.WP=1 (though access-disable checks still apply).
> + *
> + * In summary, PKR checks are based on (a) EFER.LMA, (b) CR4.PKE or CR4.PKS,
> + * (c) CR0.WP, (d) the PK in the leaf PTE, (e) two bits from the corresponding
> + * PKR{S,U} entry, (f) the access type (derived from the other PFEC bits), and
> + * (g) the address type (retrieved from the paging-structure entries).
> + *
> + * To avoid conditional branches in permission_fault(), the PKR bitmask caches
> + * the above inputs, except for (e) the PKR{S,U} entry.  The FETCH, USER, and
> + * WRITE bits of the PFEC and the effective value of the paging-structures' U/S
> + * bit (slotted into the PFEC.RSVD position, bit 3) are used to index into the
> + * PKR bitmask (similar to the 4-bit Protection Key itself).  The two bits of
> + * the PKR bitmask "entry" are then extracted and ANDed with the two bits of
> + * the PKR{S,U} register corresponding to the address type and protection key.
> + *
> + * E.g. for all values where PFEC.FETCH=1, the corresponding pkr_bitmask bits
> + * will be 00b, thus masking away the AD and WD bits from the PKR{S,U} register
> + * to suppress PKR checks on code fetches.
> + */
>  static void update_pkr_bitmask(struct kvm_mmu *mmu)
>  {
>  	unsigned bit;
>  	bool wp;
> -

Please keep this newline, i.e. after the declaration of the cr4 booleans.  That
helps isolate the clearing of mmu->pkr_mask, which makes the functional effect of
the earlier return more obvious.

Ah, and use reverse fir tree for the variable declarations, i.e.

	bool cr4_pke = is_cr4_pke(mmu);
	bool cr4_pks = is_cr4_pks(mmu);
	unsigned bit;
	bool wp;

	mmu->pkr_mask = 0;

	if (!cr4_pke && !cr4_pks)
		return;

> +	bool cr4_pke = is_cr4_pke(mmu);
> +	bool cr4_pks = is_cr4_pks(mmu);
>  	mmu->pkr_mask = 0;
>  
> -	if (!is_cr4_pke(mmu))
> +	if (!cr4_pke && !cr4_pks)
>  		return;
>  
>  	wp = is_cr0_wp(mmu);
  

  ...

> @@ -6482,14 +6509,22 @@ u32 kvm_mmu_pkr_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  		     unsigned pte_access, unsigned pte_pkey, unsigned int pfec)
>  {
>  	u32 pkr_bits, offset;
> +	u32 pkr;
>  
>  	/*
> -	* PKRU defines 32 bits, there are 16 domains and 2
> -	* attribute bits per domain in pkru.  pte_pkey is the
> -	* index of the protection domain, so pte_pkey * 2 is
> -	* is the index of the first bit for the domain.
> +	* PKRU and PKRS both define 32 bits. There are 16 domains
> +	* and 2 attribute bits per domain in them. pte_pkey is the
> +	* index of the protection domain, so pte_pkey * 2 is the
> +	* index of the first bit for the domain. The use of PKRU
> +	* versus PKRS is selected by the address type, as determined
> +	* by the U/S bit in the paging-structure entries.


Align and wrap closer to 80 please.

>  	*/
> -	pkr_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
> +	if (pte_access & PT_USER_MASK)
> +		pkr = is_cr4_pke(mmu) ? vcpu->arch.pkru : 0;
> +	else
> +		pkr = is_cr4_pks(mmu) ? kvm_read_pkrs(vcpu) : 0;
> +
> +	pkr_bits = (pkr >> pte_pkey * 2) & 3;
>  
>  	/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
>  	offset = (pfec & ~1) + ((pte_access & PT_USER_MASK)
> -- 
> 2.25.1
>
Lei Wang May 27, 2022, 9:40 a.m. UTC | #2
On 5/25/2022 7:28 AM, Sean Christopherson wrote:
> On Sun, Apr 24, 2022, Lei Wang wrote:
>> @@ -454,10 +455,11 @@ struct kvm_mmu {
>>   	u8 permissions[16];
>>   
>>   	/*
>> -	* The pkru_mask indicates if protection key checks are needed.  It
>> -	* consists of 16 domains indexed by page fault error code bits [4:1],
>> -	* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
>> -	* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
>> +	* The pkr_mask indicates if protection key checks are needed.
>> +	* It consists of 16 domains indexed by page fault error code
>> +	* bits[4:1] with PFEC.RSVD replaced by ACC_USER_MASK from the
>> +	* page tables. Each domain has 2 bits which are ANDed with AD
>> +	* and WD from PKRU/PKRS.
> Same comments, align and wrap closer to 80 please.
Will do it.
>>   	*/
>>   	u32 pkr_mask;
>>   
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index cea03053a153..6963c641e6ce 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -45,7 +45,8 @@
>>   #define PT32E_ROOT_LEVEL 3
>>   
>>   #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
>> -			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
>> +			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
>> +			       X86_CR4_PKS)
>>   
>>   #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
>>   #define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d3276986102..a6cbc22d3312 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -209,6 +209,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
>>   BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
>>   BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
>>   BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
>> +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pks, X86_CR4_PKS);
>>   BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
>>   BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
>>   
>> @@ -231,6 +232,7 @@ BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
>>   BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
>>   BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
>>   BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
>> +BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pks);
>>   BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
>>   
>>   static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
>> @@ -4608,37 +4610,58 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
>>   }
>>   
>>   /*
> ...
>
>> + * Protection Key Rights (PKR) is an additional mechanism by which data accesses
>> + * with 4-level or 5-level paging (EFER.LMA=1) may be disabled based on the
>> + * Protection Key Rights Userspace (PKRU) or Protection Key Rights Supervisor
>> + * (PKRS) registers.  The Protection Key (PK) used for an access is a 4-bit
>> + * value specified in bits 62:59 of the leaf PTE used to translate the address.
>> + *
>> + * PKRU and PKRS are 32-bit registers, with 16 2-bit entries consisting of an
>> + * access-disable (AD) and write-disable (WD) bit.  The PK from the leaf PTE is
>> + * used to index the approriate PKR (see below), e.g. PK=1 would consume bits
> s/approriate/appropriate
Will correct it.
>> + * 3:2 (bit 3 == write-disable, bit 2 == access-disable).
>> + *
>> + * The PK register (PKRU vs. PKRS) indexed by the PK depends on the type of
>> + * _address_ (not access type!).  For a user-mode address, PKRU is used; for a
>> + * supervisor-mode address, PKRS is used.  An address is supervisor-mode if the
>> + * U/S flag (bit 2) is 0 in at least one of the paging-structure entries, i.e.
>> + * an address is user-mode if the U/S flag is 1 in _all_ entries.  Again, this
>> + * is the address type, not the the access type, e.g. a supervisor-mode _access_
> Double "the the" can be a single "the".
Will remove it.
>> + * will consume PKRU if the _address_ is a user-mode address.
>> + *
>> + * As alluded to above, PKR checks are only performed for data accesses; code
>> + * fetches are not subject to PKR checks.  Terminal page faults (!PRESENT or
>> + * PFEC.RSVD=1) are also not subject to PKR checks.
>> + *
>> + * PKR write-disable checks for supervisor-mode _accesses_ are performed if and
>> + * only if CR0.WP=1 (though access-disable checks still apply).
>> + *
>> + * In summary, PKR checks are based on (a) EFER.LMA, (b) CR4.PKE or CR4.PKS,
>> + * (c) CR0.WP, (d) the PK in the leaf PTE, (e) two bits from the corresponding
>> + * PKR{S,U} entry, (f) the access type (derived from the other PFEC bits), and
>> + * (g) the address type (retrieved from the paging-structure entries).
>> + *
>> + * To avoid conditional branches in permission_fault(), the PKR bitmask caches
>> + * the above inputs, except for (e) the PKR{S,U} entry.  The FETCH, USER, and
>> + * WRITE bits of the PFEC and the effective value of the paging-structures' U/S
>> + * bit (slotted into the PFEC.RSVD position, bit 3) are used to index into the
>> + * PKR bitmask (similar to the 4-bit Protection Key itself).  The two bits of
>> + * the PKR bitmask "entry" are then extracted and ANDed with the two bits of
>> + * the PKR{S,U} register corresponding to the address type and protection key.
>> + *
>> + * E.g. for all values where PFEC.FETCH=1, the corresponding pkr_bitmask bits
>> + * will be 00b, thus masking away the AD and WD bits from the PKR{S,U} register
>> + * to suppress PKR checks on code fetches.
>> + */
>>   static void update_pkr_bitmask(struct kvm_mmu *mmu)
>>   {
>>   	unsigned bit;
>>   	bool wp;
>> -
> Please keep this newline, i.e. after the declaration of the cr4 booleans.  That
> helps isolate the clearing of mmu->pkr_mask, which makes the functional effect of
> the earlier return more obvious.
>
> Ah, and use reverse fir tree for the variable declarations, i.e.
>
> 	bool cr4_pke = is_cr4_pke(mmu);
> 	bool cr4_pks = is_cr4_pks(mmu);
> 	unsigned bit;
> 	bool wp;
>
> 	mmu->pkr_mask = 0;
>
> 	if (!cr4_pke && !cr4_pks)
> 		return;

Thanks for the suggestion, will use reverse fir tree for the declarations.

>> +	bool cr4_pke = is_cr4_pke(mmu);
>> +	bool cr4_pks = is_cr4_pks(mmu);
>>   	mmu->pkr_mask = 0;
>>   
>> -	if (!is_cr4_pke(mmu))
>> +	if (!cr4_pke && !cr4_pks)
>>   		return;
>>   
>>   	wp = is_cr0_wp(mmu);
>    
>
>    ...
>
>> @@ -6482,14 +6509,22 @@ u32 kvm_mmu_pkr_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>>   		     unsigned pte_access, unsigned pte_pkey, unsigned int pfec)
>>   {
>>   	u32 pkr_bits, offset;
>> +	u32 pkr;
>>   
>>   	/*
>> -	* PKRU defines 32 bits, there are 16 domains and 2
>> -	* attribute bits per domain in pkru.  pte_pkey is the
>> -	* index of the protection domain, so pte_pkey * 2 is
>> -	* is the index of the first bit for the domain.
>> +	* PKRU and PKRS both define 32 bits. There are 16 domains
>> +	* and 2 attribute bits per domain in them. pte_pkey is the
>> +	* index of the protection domain, so pte_pkey * 2 is the
>> +	* index of the first bit for the domain. The use of PKRU
>> +	* versus PKRS is selected by the address type, as determined
>> +	* by the U/S bit in the paging-structure entries.
>
> Align and wrap closer to 80 please.
Will do it.
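
As an aside for anyone following the indexing discussion above, here is a
standalone sketch of how permission_fault()-style code consumes the cached
mask without branches (illustration only; the demo_ names are invented and
do not exist in KVM):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PFERR_WRITE	(1u << 1)
#define DEMO_PFERR_USER		(1u << 2)
#define DEMO_PFERR_RSVD		(1u << 3)	/* reused for the U/S bit */
#define DEMO_PFERR_FETCH	(1u << 4)

static uint32_t demo_pkr_bits(uint32_t pkr_mask, uint32_t pkr_reg,
			      unsigned int pte_pkey, unsigned int pfec,
			      bool user_addr)
{
	/* The WD:AD pair for this protection key, from PKRU or PKRS. */
	uint32_t pkr_bits = (pkr_reg >> (pte_pkey * 2)) & 3;
	/*
	 * Drop PFEC.P (bit 0) and slot the address type into the
	 * otherwise-unused RSVD position to form the mask index.
	 */
	uint32_t offset = (pfec & ~1u) +
			  (user_addr ? DEMO_PFERR_RSVD : 0);

	/*
	 * The cached entry is 00b whenever no check applies (e.g. for
	 * fetches), masking AD/WD away; a nonzero result is a PK fault.
	 */
	return pkr_bits & (pkr_mask >> offset);
}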

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1014d6a2b069..a245d9817f72 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -375,6 +375,7 @@  union kvm_mmu_extended_role {
 		unsigned int cr4_smap:1;
 		unsigned int cr4_smep:1;
 		unsigned int cr4_la57:1;
+		unsigned int cr4_pks:1;
 		unsigned int efer_lma:1;
 	};
 };
@@ -454,10 +455,11 @@  struct kvm_mmu {
 	u8 permissions[16];
 
 	/*
-	* The pkru_mask indicates if protection key checks are needed.  It
-	* consists of 16 domains indexed by page fault error code bits [4:1],
-	* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
-	* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+	* The pkr_mask indicates if protection key checks are needed.
+	* It consists of 16 domains indexed by page fault error code
+	* bits[4:1] with PFEC.RSVD replaced by ACC_USER_MASK from the
+	* page tables. Each domain has 2 bits which are ANDed with AD
+	* and WD from PKRU/PKRS.
 	*/
 	u32 pkr_mask;
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index cea03053a153..6963c641e6ce 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -45,7 +45,8 @@ 
 #define PT32E_ROOT_LEVEL 3
 
 #define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
-			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
+			       X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
+			       X86_CR4_PKS)
 
 #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
 #define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d3276986102..a6cbc22d3312 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -209,6 +209,7 @@  BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pks, X86_CR4_PKS);
 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
 
@@ -231,6 +232,7 @@  BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pks);
 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
 
 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
@@ -4608,37 +4610,58 @@  static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
 }
 
 /*
-* PKU is an additional mechanism by which the paging controls access to
-* user-mode addresses based on the value in the PKRU register.  Protection
-* key violations are reported through a bit in the page fault error code.
-* Unlike other bits of the error code, the PK bit is not known at the
-* call site of e.g. gva_to_gpa; it must be computed directly in
-* permission_fault based on two bits of PKRU, on some machine state (CR4,
-* CR0, EFER, CPL), and on other bits of the error code and the page tables.
-*
-* In particular the following conditions come from the error code, the
-* page tables and the machine state:
-* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
-* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
-* - PK is always zero if U=0 in the page tables
-* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
-*
-* The PKRU bitmask caches the result of these four conditions.  The error
-* code (minus the P bit) and the page table's U bit form an index into the
-* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
-* with the two bits of the PKRU register corresponding to the protection key.
-* For the first three conditions above the bits will be 00, thus masking
-* away both AD and WD.  For all reads or if the last condition holds, WD
-* only will be masked away.
-*/
+ * Protection Key Rights (PKR) is an additional mechanism by which data accesses
+ * with 4-level or 5-level paging (EFER.LMA=1) may be disabled based on the
+ * Protection Key Rights Userspace (PKRU) or Protection Key Rights Supervisor
+ * (PKRS) registers.  The Protection Key (PK) used for an access is a 4-bit
+ * value specified in bits 62:59 of the leaf PTE used to translate the address.
+ *
+ * PKRU and PKRS are 32-bit registers, with 16 2-bit entries consisting of an
+ * access-disable (AD) and write-disable (WD) bit.  The PK from the leaf PTE is
+ * used to index the approriate PKR (see below), e.g. PK=1 would consume bits
+ * 3:2 (bit 3 == write-disable, bit 2 == access-disable).
+ *
+ * The PK register (PKRU vs. PKRS) indexed by the PK depends on the type of
+ * _address_ (not access type!).  For a user-mode address, PKRU is used; for a
+ * supervisor-mode address, PKRS is used.  An address is supervisor-mode if the
+ * U/S flag (bit 2) is 0 in at least one of the paging-structure entries, i.e.
+ * an address is user-mode if the U/S flag is 1 in _all_ entries.  Again, this
+ * is the address type, not the the access type, e.g. a supervisor-mode _access_
+ * will consume PKRU if the _address_ is a user-mode address.
+ *
+ * As alluded to above, PKR checks are only performed for data accesses; code
+ * fetches are not subject to PKR checks.  Terminal page faults (!PRESENT or
+ * PFEC.RSVD=1) are also not subject to PKR checks.
+ *
+ * PKR write-disable checks for supervisor-mode _accesses_ are performed if and
+ * only if CR0.WP=1 (though access-disable checks still apply).
+ *
+ * In summary, PKR checks are based on (a) EFER.LMA, (b) CR4.PKE or CR4.PKS,
+ * (c) CR0.WP, (d) the PK in the leaf PTE, (e) two bits from the corresponding
+ * PKR{S,U} entry, (f) the access type (derived from the other PFEC bits), and
+ * (g) the address type (retrieved from the paging-structure entries).
+ *
+ * To avoid conditional branches in permission_fault(), the PKR bitmask caches
+ * the above inputs, except for (e) the PKR{S,U} entry.  The FETCH, USER, and
+ * WRITE bits of the PFEC and the effective value of the paging-structures' U/S
+ * bit (slotted into the PFEC.RSVD position, bit 3) are used to index into the
+ * PKR bitmask (similar to the 4-bit Protection Key itself).  The two bits of
+ * the PKR bitmask "entry" are then extracted and ANDed with the two bits of
+ * the PKR{S,U} register corresponding to the address type and protection key.
+ *
+ * E.g. for all values where PFEC.FETCH=1, the corresponding pkr_bitmask bits
+ * will be 00b, thus masking away the AD and WD bits from the PKR{S,U} register
+ * to suppress PKR checks on code fetches.
+ */
 static void update_pkr_bitmask(struct kvm_mmu *mmu)
 {
 	unsigned bit;
 	bool wp;
-
+	bool cr4_pke = is_cr4_pke(mmu);
+	bool cr4_pks = is_cr4_pks(mmu);
 	mmu->pkr_mask = 0;
 
-	if (!is_cr4_pke(mmu))
+	if (!cr4_pke && !cr4_pks)
 		return;
 
 	wp = is_cr0_wp(mmu);
@@ -4656,19 +4679,22 @@  static void update_pkr_bitmask(struct kvm_mmu *mmu)
 		pte_user = pfec & PFERR_RSVD_MASK;
 
 		/*
-		 * Only need to check the access which is not an
-		 * instruction fetch and is to a user page.
+		 * Only need to check accesses that are not instruction
+		 * fetches and for which the matching mechanism is
+		 * enabled: CR4.PKE when accessing a user page, CR4.PKS
+		 * when accessing a supervisor page.
 		 */
-		check_pkey = (!ff && pte_user);
+		check_pkey = !ff && (pte_user ? cr4_pke : cr4_pks);
+
 		/*
-		 * write access is controlled by PKRU if it is a
-		 * user access or CR0.WP = 1.
+		 * write access is controlled by PKRU/PKRS if
+		 * it is a user access or CR0.WP = 1.
 		 */
 		check_write = check_pkey && wf && (uf || wp);
 
-		/* PKRU.AD stops both read and write access. */
+		/* PKRU/PKRS.AD stops both read and write access. */
 		pkey_bits = !!check_pkey;
-		/* PKRU.WD stops write access. */
+		/* PKRU/PKRS.WD stops write access. */
 		pkey_bits |= (!!check_write) << 1;
 
 		mmu->pkr_mask |= (pkey_bits & 3) << pfec;
@@ -4719,6 +4745,7 @@  static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
 		/* PKEY and LA57 are active iff long mode is active. */
 		ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
 		ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+		ext.cr4_pks = ____is_efer_lma(regs) && ____is_cr4_pks(regs);
 		ext.efer_lma = ____is_efer_lma(regs);
 	}
 
@@ -6482,14 +6509,22 @@  u32 kvm_mmu_pkr_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 		     unsigned pte_access, unsigned pte_pkey, unsigned int pfec)
 {
 	u32 pkr_bits, offset;
+	u32 pkr;
 
 	/*
-	* PKRU defines 32 bits, there are 16 domains and 2
-	* attribute bits per domain in pkru.  pte_pkey is the
-	* index of the protection domain, so pte_pkey * 2 is
-	* is the index of the first bit for the domain.
+	* PKRU and PKRS both define 32 bits. There are 16 domains
+	* and 2 attribute bits per domain in them. pte_pkey is the
+	* index of the protection domain, so pte_pkey * 2 is the
+	* index of the first bit for the domain. The use of PKRU
+	* versus PKRS is selected by the address type, as determined
+	* by the U/S bit in the paging-structure entries.
 	*/
-	pkr_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+	if (pte_access & PT_USER_MASK)
+		pkr = is_cr4_pke(mmu) ? vcpu->arch.pkru : 0;
+	else
+		pkr = is_cr4_pks(mmu) ? kvm_read_pkrs(vcpu) : 0;
+
+	pkr_bits = (pkr >> pte_pkey * 2) & 3;
 
 	/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
 	offset = (pfec & ~1) + ((pte_access & PT_USER_MASK)
 				<< (PFERR_RSVD_BIT - PT_USER_SHIFT));
 
 	pkr_bits &= mmu->pkr_mask >> offset;
 
 	return pkr_bits;
 }
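
To make the cached table concrete, the following standalone sketch mirrors
the update_pkr_bitmask() loop above and can be compiled to print the mask
for any CR4.PKE/CR4.PKS/CR0.WP combination (a demo under the semantics
described in the patch, not kernel code; the demo_ names are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PFERR_WRITE	(1u << 1)
#define DEMO_PFERR_USER		(1u << 2)
#define DEMO_PFERR_RSVD		(1u << 3)	/* holds the U/S bit here */
#define DEMO_PFERR_FETCH	(1u << 4)

static uint32_t demo_build_pkr_mask(bool cr4_pke, bool cr4_pks, bool cr0_wp)
{
	uint32_t mask = 0;
	unsigned int bit;

	if (!cr4_pke && !cr4_pks)
		return 0;

	for (bit = 0; bit < 16; bit++) {
		unsigned int pfec = bit << 1;	/* PFEC.P always clear */
		bool ff = pfec & DEMO_PFERR_FETCH;
		bool uf = pfec & DEMO_PFERR_USER;
		bool wf = pfec & DEMO_PFERR_WRITE;
		bool pte_user = pfec & DEMO_PFERR_RSVD;
		/* AD matters unless a fetch; pick PKE vs. PKS by address type. */
		bool check_pkey = !ff && (pte_user ? cr4_pke : cr4_pks);
		/* WD matters for user accesses, or whenever CR0.WP=1. */
		bool check_write = check_pkey && wf && (uf || cr0_wp);
		uint32_t bits = (uint32_t)check_pkey |
				(uint32_t)check_write << 1;

		mask |= bits << pfec;
	}
	return mask;
}

int main(void)
{
	/* e.g. PKS only, with CR0.WP=1 */
	printf("0x%08x\n", demo_build_pkr_mask(false, true, true));
	return 0;
}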