@@ -113,7 +113,8 @@ Kernel API for PKS support
Similar to user space pkeys, supervisor pkeys allow additional protections to
be defined for a supervisor mappings. Unlike user space pkeys, violations of
-these protections result in a a kernel oops.
+these protections result in a kernel oops unless a PKS fault handler is
+provided which handles the fault.
Supervisor Memory Protection Keys (PKS) is a feature which is found on Intel's
Sapphire Rapids (and later) "Scalable Processor" Server CPUs. It will also be
@@ -145,6 +146,30 @@ Disabled.
consumer_defaults[PKS_KEY_MY_FEATURE] = PKR_DISABLE_WRITE;
...
+
+Users may also provide a fault handler which can handle a fault differently
+than an oops. Continuing the example from above, if 'MY_FEATURE' wanted to
+define a handler it can do so by adding the corresponding entry to the
+pks_key_callbacks array.
+
+::
+
+ #ifdef CONFIG_MY_FEATURE
+ bool my_feature_pks_fault_callback(unsigned long address, bool write)
+ {
+ if (my_feature_fault_is_ok)
+ return true;
+ return false;
+ }
+ #endif
+
+ static const pks_key_callback pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = {
+ [PKS_KEY_DEFAULT] = NULL,
+ #ifdef CONFIG_MY_FEATURE
+ [PKS_KEY_MY_FEATURE] = my_feature_pks_fault_callback,
+ #endif
+ };
+
The following interface is used to manipulate the 'protection domain' defined
by a pkey within the kernel. Setting a pkey value in a supervisor PTE adds
this additional protection to the page.
@@ -23,6 +23,7 @@ static inline struct extended_pt_regs *extended_pt_regs(struct pt_regs *regs)
void show_extended_regs_oops(struct pt_regs *regs, unsigned long error_code);
int handle_abandoned_pks_value(struct pt_regs *regs);
+bool handle_pks_key_callback(unsigned long address, bool write, u16 key);
#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
@@ -36,6 +37,12 @@ static inline int handle_abandoned_pks_value(struct pt_regs *regs)
{
return 0;
}
+static inline bool handle_pks_key_fault(struct pt_regs *regs,
+ unsigned long hw_error_code,
+ unsigned long address)
+{
+ return false;
+}
#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
@@ -1134,6 +1134,54 @@ bool fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}
+#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
+bool handle_pks_key_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
+{
+ bool write = (hw_error_code & X86_PF_WRITE);
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+
+ pgd = READ_ONCE(*(init_mm.pgd + pgd_index(address)));
+ if (!pgd_present(pgd))
+ return false;
+
+ p4d = READ_ONCE(*p4d_offset(&pgd, address));
+ if (!p4d_present(p4d))
+ return false;
+
+ if (p4d_large(p4d))
+ return handle_pks_key_callback(address, write,
+ pte_flags_pkey(p4d_val(p4d)));
+
+ pud = READ_ONCE(*pud_offset(&p4d, address));
+ if (!pud_present(pud))
+ return false;
+
+ if (pud_large(pud))
+ return handle_pks_key_callback(address, write,
+ pte_flags_pkey(pud_val(pud)));
+
+ pmd = READ_ONCE(*pmd_offset(&pud, address));
+ if (!pmd_present(pmd))
+ return false;
+
+ if (pmd_large(pmd))
+ return handle_pks_key_callback(address, write,
+ pte_flags_pkey(pmd_val(pmd)));
+
+ pte = READ_ONCE(*pte_offset_kernel(&pmd, address));
+ if (!pte_present(pte))
+ return false;
+
+ return handle_pks_key_callback(address, write,
+ pte_flags_pkey(pte_val(pte)));
+}
+#endif
+
/*
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
@@ -1164,6 +1212,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
if (handle_abandoned_pks_value(regs))
return;
+
+ if (handle_pks_key_fault(regs, hw_error_code, address))
+ return;
}
#ifdef CONFIG_X86_32
@@ -241,6 +241,19 @@ int handle_abandoned_pks_value(struct pt_regs *regs)
return (ept_regs->thread_pkrs != old);
}
+static const pks_key_callback pks_key_callbacks[PKS_KEY_NR_CONSUMERS] = { 0 };
+
+bool handle_pks_key_callback(unsigned long address, bool write, u16 key)
+{
+ if (key >= PKS_KEY_NR_CONSUMERS)
+ return false;
+
+ if (pks_key_callbacks[key])
+ return pks_key_callbacks[key](address, write);
+
+ return false;
+}
+
/*
* write_pkrs() optimizes MSR writes by maintaining a per cpu cache which can
* be checked quickly.
@@ -62,6 +62,8 @@ void pks_mk_readonly(int pkey);
void pks_mk_readwrite(int pkey);
void pks_abandon_protections(int pkey);
+typedef bool (*pks_key_callback)(unsigned long address, bool write);
+
#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
static inline void pkrs_save_irq(struct pt_regs *regs) { }