diff mbox series

[V7,12/18] x86/pks: Add PKS fault callbacks

Message ID 20210804043231.2655537-13-ira.weiny@intel.com (mailing list archive)
State New, archived
Headers show
Series PKS/PMEM: Add Stray Write Protection | expand

Commit Message

Ira Weiny Aug. 4, 2021, 4:32 a.m. UTC
From: Rick Edgecombe <rick.p.edgecombe@intel.com>

Some PKS keys will want special handling on accesses that violate their
permissions. One of these is PMEM which will want to have a mode that
logs the access violation, disables protection, and continues rather
than oops the machine.

Since PKS faults do not provide the actual key that faulted, this
information needs to be recovered by walking the page tables and
extracting it from the leaf entry.

This infrastructure could be used to implement abandoned pkeys, but adds
support in a separate call such that abandoned pkeys are handled more
quickly by skipping the page table walk.

In pkeys.c, define a new api for setting callbacks for individual pkeys.

Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

---
Changes for V7:
	New patch
---
 Documentation/core-api/protection-keys.rst | 27 +++++++++++-
 arch/x86/include/asm/pks.h                 |  7 +++
 arch/x86/mm/fault.c                        | 51 ++++++++++++++++++++++
 arch/x86/mm/pkeys.c                        | 13 ++++++
 include/linux/pkeys.h                      |  2 +
 5 files changed, 99 insertions(+), 1 deletion(-)

Comments

Edgecombe, Rick P Aug. 11, 2021, 9:18 p.m. UTC | #1
On Tue, 2021-08-03 at 21:32 -0700, ira.weiny@intel.com wrote:
> +static const pks_key_callback
> pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = { 0 };
> +
> +bool handle_pks_key_callback(unsigned long address, bool write, u16
> key)
> +{
> +       if (key > PKS_KEY_NR_CONSUMERS)
> +               return false;
Good idea, should be >= though?

> +
> +       if (pks_key_callbacks[key])
> +               return pks_key_callbacks[key](address, write);
> +
> +       return false;
> +}
> +

Otherwise, I've rebased on this series and didn't hit any problems.
Thanks.
Ira Weiny Aug. 17, 2021, 3:21 a.m. UTC | #2
On Wed, Aug 11, 2021 at 02:18:26PM -0700, Edgecombe, Rick P wrote:
> On Tue, 2021-08-03 at 21:32 -0700, ira.weiny@intel.com wrote:
> > +static const pks_key_callback
> > pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = { 0 };
> > +
> > +bool handle_pks_key_callback(unsigned long address, bool write, u16
> > key)
> > +{
> > +       if (key > PKS_KEY_NR_CONSUMERS)
> > +               return false;
> Good idea, should be >= though?

Yep.  Fixed thanks.

> 
> > +
> > +       if (pks_key_callbacks[key])
> > +               return pks_key_callbacks[key](address, write);
> > +
> > +       return false;
> > +}
> > +
> 
> Otherwise, I've rebased on this series and didn't hit any problems.
> Thanks.

Awesome!  I still want Dave and Dan to weigh in prior to me respining with the
changes so far.

Thanks,
Ira
diff mbox series

Patch

diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index 8cf7eaaed3e5..bbf81b12e67d 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -113,7 +113,8 @@  Kernel API for PKS support
 
 Similar to user space pkeys, supervisor pkeys allow additional protections to
 be defined for a supervisor mappings.  Unlike user space pkeys, violations of
-these protections result in a a kernel oops.
+these protections result in a a kernel oops unless a PKS fault handler is
+provided which handles the fault.
 
 Supervisor Memory Protection Keys (PKS) is a feature which is found on Intel's
 Sapphire Rapids (and later) "Scalable Processor" Server CPUs.  It will also be
@@ -145,6 +146,30 @@  Disabled.
         consumer_defaults[PKS_KEY_MY_FEATURE]  = PKR_DISABLE_WRITE;
         ...
 
+
+Users may also provide a fault handler which can handle a fault differently
+than an oops.  Continuing our example from above if 'MY_FEATURE' wanted to
+define a handler they can do so by adding the coresponding entry to the
+pks_key_callbacks array.
+
+::
+
+        #ifdef CONFIG_MY_FEATURE
+        bool my_feature_pks_fault_callback(unsigned long address, bool write)
+        {
+                if (my_feature_fault_is_ok)
+                        return true;
+                return false;
+        }
+        #endif
+
+        static const pks_key_callback pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = {
+                [PKS_KEY_DEFAULT]            = NULL,
+        #ifdef CONFIG_MY_FEATURE
+                [PKS_KEY_PGMAP_PROTECTION]   = my_feature_pks_fault_callback,
+        #endif
+        };
+
 The following interface is used to manipulate the 'protection domain' defined
 by a pkey within the kernel.  Setting a pkey value in a supervisor PTE adds
 this additional protection to the page.
diff --git a/arch/x86/include/asm/pks.h b/arch/x86/include/asm/pks.h
index e28413cc410d..3de5089d379d 100644
--- a/arch/x86/include/asm/pks.h
+++ b/arch/x86/include/asm/pks.h
@@ -23,6 +23,7 @@  static inline struct extended_pt_regs *extended_pt_regs(struct pt_regs *regs)
 
 void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code);
 int handle_abandoned_pks_value(struct pt_regs *regs);
+bool handle_pks_key_callback(unsigned long address, bool write, u16 key);
 
 #else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
 
@@ -36,6 +37,12 @@  static inline int handle_abandoned_pks_value(struct pt_regs *regs)
 {
 	return 0;
 }
+static inline bool handle_pks_key_fault(struct pt_regs *regs,
+					unsigned long hw_error_code,
+					unsigned long address)
+{
+	return false;
+}
 
 #endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3780ed0f9597..7a8c807006c7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1134,6 +1134,54 @@  bool fault_in_kernel_space(unsigned long address)
 	return address >= TASK_SIZE_MAX;
 }
 
+#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
+bool handle_pks_key_fault(struct pt_regs *regs, unsigned long hw_error_code,
+			  unsigned long address)
+{
+	bool write = (hw_error_code & X86_PF_WRITE);
+	pgd_t pgd;
+	p4d_t p4d;
+	pud_t pud;
+	pmd_t pmd;
+	pte_t pte;
+
+	pgd = READ_ONCE(*(init_mm.pgd + pgd_index(address)));
+	if (!pgd_present(pgd))
+		return false;
+
+	p4d = READ_ONCE(*p4d_offset(&pgd, address));
+	if (!p4d_present(p4d))
+		return false;
+
+	if (p4d_large(p4d))
+		return handle_pks_key_callback(address, write,
+					       pte_flags_pkey(p4d_val(p4d)));
+
+	pud = READ_ONCE(*pud_offset(&p4d, address));
+	if (!pud_present(pud))
+		return false;
+
+	if (pud_large(pud))
+		return handle_pks_key_callback(address, write,
+					      pte_flags_pkey(pud_val(pud)));
+
+	pmd = READ_ONCE(*pmd_offset(&pud, address));
+	if (!pmd_present(pmd))
+		return false;
+
+	if (pmd_large(pmd))
+		return handle_pks_key_callback(address, write,
+					      pte_flags_pkey(pmd_val(pmd)));
+
+	pte = READ_ONCE(*pte_offset_kernel(&pmd, address));
+	if (!pte_present(pte))
+		return false;
+
+	return handle_pks_key_callback(address, write,
+				      pte_flags_pkey(pte_val(pte)));
+}
+#endif
+
 /*
  * Called for all faults where 'address' is part of the kernel address
  * space.  Might get called for faults that originate from *code* that
@@ -1164,6 +1212,9 @@  do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 
 		if (handle_abandoned_pks_value(regs))
 			return;
+
+		if (handle_pks_key_fault(regs, hw_error_code, address))
+			return;
 	}
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index c7358662ec07..f0166725a128 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -241,6 +241,19 @@  int handle_abandoned_pks_value(struct pt_regs *regs)
 	return (ept_regs->thread_pkrs != old);
 }
 
+static const pks_key_callback pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = { 0 };
+
+bool handle_pks_key_callback(unsigned long address, bool write, u16 key)
+{
+	if (key > PKS_KEY_NR_CONSUMERS)
+		return false;
+
+	if (pks_key_callbacks[key])
+		return pks_key_callbacks[key](address, write);
+
+	return false;
+}
+
 /*
  * write_pkrs() optimizes MSR writes by maintaining a per cpu cache which can
  * be checked quickly.
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 4d22ccd971fc..549fa01d7da3 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -62,6 +62,8 @@  void pks_mk_readonly(int pkey);
 void pks_mk_readwrite(int pkey);
 void pks_abandon_protections(int pkey);
 
+typedef bool (*pks_key_callback)(unsigned long address, bool write);
+
 #else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
 
 static inline void pkrs_save_irq(struct pt_regs *regs) { }