@@ -3756,6 +3756,25 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
return 0;
}
+/*
+ * Draft function: flush block multi-queue data associated with the
+ * specified CPU so that the CPU can be isolated from managed
+ * interrupts.
+ */
+void blk_mq_flush_on_cpu(int cpu)
+{
+	// TODO: thoroughly test this code with high test coverage.
+	/*
+	 * Invoke the teardown (bringup == false) callbacks of the states
+	 * below, i.e. blk_mq_hctx_notify_offline(), blk_mq_hctx_notify_dead()
+	 * and bio_cpu_dead(), for all registered instances.
+	 */
+ cpuhp_invoke_callback(cpu, CPUHP_AP_BLK_MQ_ONLINE, false, NULL, NULL);
+ cpuhp_invoke_callback(cpu, CPUHP_BLK_MQ_DEAD, false, NULL, NULL);
+ cpuhp_invoke_callback(cpu, CPUHP_BIO_DEAD, false, NULL, NULL);
+ blk_softirq_cpu_dead(cpu);
+}
+
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_STACKING))
@@ -929,6 +929,8 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
unsigned int blk_mq_rq_cpu(struct request *rq);
+void blk_mq_flush_on_cpu(int cpu);
+
bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
@@ -203,4 +203,8 @@ static inline bool cpu_mitigations_auto_nosmt(void)
}
#endif
+int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp);
+
#endif /* _LINUX_CPU_H_ */
@@ -619,6 +619,8 @@ extern int irq_affinity_online_cpu(unsigned int cpu);
# define irq_affinity_online_cpu NULL
#endif
+void managed_irq_adjust_activity(struct cpumask *enable_mask);
+
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
void __irq_move_irq(struct irq_data *data);
static inline void irq_move_irq(struct irq_data *data)
@@ -95,6 +95,7 @@ static struct list_head remote_children;
#define HOUSEKEEPING_FLAGS (BIT(HK_TYPE_TIMER) | BIT(HK_TYPE_RCU) |\
BIT(HK_TYPE_SCHED) | BIT(HK_TYPE_MISC) |\
BIT(HK_TYPE_DOMAIN) | BIT(HK_TYPE_WQ) |\
+ BIT(HK_TYPE_MANAGED_IRQ) |\
BIT(HK_TYPE_KTHREAD))
/*
@@ -171,7 +171,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
*
* Return: %0 on success or a negative errno code
*/
-static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
struct hlist_node **lastp)
{
@@ -279,3 +279,102 @@ int irq_affinity_online_cpu(unsigned int cpu)
return 0;
}
+
+/*
+ * managed_irq_isolate() - Deactivate managed interrupts if necessary
+ */
+// Derived from migrate_one_irq(), irq_needs_fixup() and
+// irq_fixup_move_pending(). Eventually this function could be
+// merged back into migrate_one_irq().
+static int managed_irq_isolate(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
+ const struct cpumask *a;
+ bool maskchip;
+ int err;
+
+ /*
+ * Deactivate if:
+ * - Interrupt is managed
+ * - Interrupt is not per cpu
+ * - Interrupt is started
+ * - Effective affinity mask includes isolated CPUs
+ */
+ if (!irqd_affinity_is_managed(d) || irqd_is_per_cpu(d) || !irqd_is_started(d)
+ || cpumask_subset(irq_data_get_effective_affinity_mask(d),
+ housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
+ return 0;
+	// TBD: is this required?
+ /*
+ * Complete an eventually pending irq move cleanup. If this
+ * interrupt was moved in hard irq context, then the vectors need
+ * to be cleaned up. It can't wait until this interrupt actually
+ * happens and this CPU was involved.
+ */
+ irq_force_complete_move(desc);
+
+	if (irqd_is_setaffinity_pending(d)) {
+		irqd_clr_move_pending(d);
+		if (cpumask_intersects(desc->pending_mask,
+				       housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
+			a = irq_desc_get_pending_mask(desc);
+		else
+			a = irq_data_get_affinity_mask(d);
+	} else {
+		a = irq_data_get_affinity_mask(d);
+	}
+
+ maskchip = chip->irq_mask && !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
+ if (maskchip)
+ chip->irq_mask(d);
+
+ if (!cpumask_intersects(a, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) {
+ /*
+ * Shut managed interrupt down and leave the affinity untouched.
+ * The effective affinity is reset to the first online CPU.
+ */
+ irqd_set_managed_shutdown(d);
+ irq_shutdown_and_deactivate(desc);
+ return 0;
+ }
+
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the isolated
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, a, false);
+ if (err)
+ pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
+ d->irq, err);
+
+ if (maskchip)
+ chip->irq_unmask(d);
+
+ return err;
+}
+
+/**
+ * managed_irq_adjust_activity() - Deactivate or restore managed interrupts
+ *                                 according to changes of the housekeeping cpumask
+ * @enable_mask: CPUs for which interrupts should be restored
+ */
+void managed_irq_adjust_activity(struct cpumask *enable_mask)
+{
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ unsigned int cpu;
+
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for_each_cpu(cpu, enable_mask)
+ irq_restore_affinity_of_irq(desc, cpu);
+ managed_irq_isolate(desc);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
@@ -8,6 +8,8 @@
*
*/
+#include <linux/blk-mq.h>
+
#undef pr_fmt
#define pr_fmt(fmt) "%s:%d: %s " fmt, __FILE__, __LINE__, __func__
@@ -152,10 +154,16 @@ static int housekeeping_update(enum hk_type type, const struct cpumask *update)
if (!static_branch_unlikely(&housekeeping_overridden))
static_key_enable_cpuslocked(&housekeeping_overridden.key);
- /* Add here code to update dependent subsystems with
- * changes of the housekeeping masks.
- */
+	switch (type) {
+	case HK_TYPE_MANAGED_IRQ: {
+		int cpu;
+
+		for_each_cpu(cpu, &masks->disable)
+			blk_mq_flush_on_cpu(cpu);
+		managed_irq_adjust_activity(&masks->enable);
+		break;
+	}
+	default:
+		break;
+	}
kfree(masks);
return 0;
Interrupts disturb real-time tasks on the CPUs they are affine to. To ensure
CPU isolation for real-time tasks, interrupt handling must be adjusted
accordingly. Non-managed interrupts can be configured from userspace (e.g.
via /proc/irq/*/smp_affinity), while managed interrupts require adjustments
in kernelspace.

Adjust the status of managed interrupts according to changes of the
housekeeping CPUs to support dynamic CPU isolation.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
---
The following code is a proof of concept to validate and review the
correctness of the approach to solving the problem. C++ comments denote
temporary comments.

v3:
- rename `int managed_irq_affinity_adjust()` to
  `void managed_irq_adjust_activity()`
- address Thomas Gleixner's comments:
  - add locking to managed_irq_adjust_activity()
  - add blk_mq_flush_on_cpu() to flush queues associated with isolated
    interrupts

v2:
- refactor irq_affinity_adjust():
  - add more comments
  - add managed_irq_isolate() derived from migrate_one_irq(),
    irq_needs_fixup() and irq_fixup_move_pending()
  - use irq_set_affinity() instead of irq_set_affinity_locked()
- address Gleixner's comments:
  - use `struct cpumask *` instead of `cpumask_var_t` in the function
    signature
  - remove locking in irq_affinity_adjust()

v1:
- https://lore.kernel.org/lkml/20240516190437.3545310-5-costa.shul@redhat.com/

---
 block/blk-mq.c           | 19 ++++
 include/linux/blk-mq.h   |  2 +
 include/linux/cpu.h      |  4 ++
 include/linux/irq.h      |  2 +
 kernel/cgroup/cpuset.c   |  1 +
 kernel/cpu.c             |  2 +-
 kernel/irq/cpuhotplug.c  | 99 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/isolation.c | 14 ++++--
 8 files changed, 139 insertions(+), 4 deletions(-)
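
Reviewer note, not part of the patch: blk_mq_flush_on_cpu() reaches the
block-layer teardown callbacks listed in its comment because
cpuhp_invoke_callback() with bringup == false runs the teardown handler of
the given hotplug state. The sketch below paraphrases the corresponding
multi-instance registrations in mainline (blk_mq_init() in block/blk-mq.c
and the bio init code in block/bio.c); the state name strings are quoted
from memory and may differ slightly, and the callbacks are file-local there,
so this is for illustration only and is not meant to compile standalone.

#include <linux/cpuhotplug.h>

/* Paraphrased sketch of the existing mainline registrations. */
static int __init blk_hotplug_registration_sketch(void)
{
	/* teardown of CPUHP_BLK_MQ_DEAD -> blk_mq_hctx_notify_dead() */
	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead",
				NULL, blk_mq_hctx_notify_dead);

	/* teardown of CPUHP_AP_BLK_MQ_ONLINE -> blk_mq_hctx_notify_offline() */
	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
				blk_mq_hctx_notify_online,
				blk_mq_hctx_notify_offline);

	/* teardown of CPUHP_BIO_DEAD -> bio_cpu_dead() */
	cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead",
				NULL, bio_cpu_dead);

	return 0;
}

Because node == NULL is passed, cpuhp_invoke_callback() iterates over all
registered instances of each multi-instance state, so blk_mq_flush_on_cpu()
does not need per-hctx hlist nodes.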