From b199b868736b2c4eb03fac25b436dd229b75ed7e Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf
Date: Mon, 8 May 2023 18:06:33 -0700
Subject: [PATCH] qos: Only wake idle CPUs which are affected by a request change

The pm_qos idle wake-up mechanism currently wakes up *all* idle CPUs when
there's a pm_qos request change, instead of just the CPUs which are
affected by the change. This is horribly suboptimal and increases power
consumption by needlessly waking idled CPUs. Additionally, pm_qos may kick
CPUs which aren't even idle, since wake_up_all_idle_cpus() only checks if
a CPU is running the idle task, which says nothing about whether or not
the CPU is really in an idle state.

Optimize the pm_qos wake-ups by only sending IPIs to CPUs that are idle,
and by using arch_send_wakeup_ipi_mask() instead of wake_up_if_idle(),
which is used under the hood by wake_up_all_idle_cpus(). Using IPI_WAKEUP
instead of IPI_RESCHEDULE, which is what wake_up_if_idle() uses behind the
scenes, has the benefit of doing zero work upon receipt of the IPI;
IPI_WAKEUP is designed purely for sending an IPI without a payload.

Determining which CPUs are idle is done efficiently with an atomic bitmask
instead of using the wake_up_if_idle() API, which checks the CPU's runqueue
in an RCU read-side critical section and under a spin lock; that is not
very efficient in comparison to a simple, atomic bitwise operation. A
cpumask isn't needed for this because NR_CPUS is guaranteed to fit within
a word.

CPUs are marked as idle as soon as IRQs are disabled in the idle loop,
since any IPI sent after that point will cause the CPU's idle attempt to
immediately exit (like when executing the wfi instruction). CPUs are
marked as not-idle as soon as they wake up in order to avoid sending
redundant IPIs to CPUs that are already awake.

Change-Id: I04c9e2bd9317357e16d8184a104fe603d0d2dab2
Signed-off-by: Sultan Alsawaf
Signed-off-by: Richard Raya
---
 kernel/power/qos.c  | 6 ++++--
 kernel/sched/idle.c | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index ec25fb682f1d..eed4e7957dcc 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -318,7 +318,8 @@ static inline int pm_qos_set_value_for_cpus(struct pm_qos_request *new_req,
 	for_each_cpu(cpu, to_cpumask(&affected_cpus)) {
 		if (c->target_per_cpu[cpu] != req->node.prio) {
 			c->target_per_cpu[cpu] = req->node.prio;
-			*cpus |= BIT(cpu);
+			if (cpus)
+				*cpus |= BIT(cpu);
 		}
 	}
 
@@ -329,7 +330,8 @@ static inline int pm_qos_set_value_for_cpus(struct pm_qos_request *new_req,
 	for_each_cpu(cpu, to_cpumask(&new_req_cpus)) {
 		if (c->target_per_cpu[cpu] != PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE) {
 			c->target_per_cpu[cpu] = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE;
-			*cpus |= BIT(cpu);
+			if (cpus)
+				*cpus |= BIT(cpu);
 		}
 	}
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8682e90df0fd..44d040db4e0a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -252,6 +252,8 @@ static void do_idle(void)
 			tick_nohz_idle_stop_tick();
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
+		} else {
+			cpuidle_set_idle_cpu(cpu);
 		}
 
 		arch_cpu_idle_enter();
@@ -269,6 +271,7 @@ static void do_idle(void)
 		} else {
 			cpuidle_idle_call();
 		}
+		cpuidle_clear_idle_cpu(cpu);
 		arch_cpu_idle_exit();
 	}
 
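Note: cpuidle_set_idle_cpu()/cpuidle_clear_idle_cpu() and the pm_qos IPI
dispatch are referenced by the hunks above but are not defined in this diff;
they come from elsewhere in the series. Purely as an illustration of the
mechanism described in the commit message (a word-sized atomic idle bitmask
plus arch_send_wakeup_ipi_mask()), a minimal sketch could look like the
following. The names, placement, and the wake helper here are assumptions,
not the actual implementation:

/* Hypothetical sketch only -- not taken from this patch. */
#include <linux/bitops.h>
#include <linux/cpumask.h>

/* One bit per CPU; relies on NR_CPUS fitting within a single word. */
static unsigned long cpuidle_idle_mask;

static inline void cpuidle_set_idle_cpu(unsigned int cpu)
{
	set_bit(cpu, &cpuidle_idle_mask);	/* atomic set, no locking */
}

static inline void cpuidle_clear_idle_cpu(unsigned int cpu)
{
	clear_bit(cpu, &cpuidle_idle_mask);	/* atomic clear on wake-up */
}

/*
 * Assumed caller in the pm_qos update path: given the mask of CPUs whose
 * target value changed, send IPI_WAKEUP only to CPUs that are both
 * affected and currently idle. arch_send_wakeup_ipi_mask() is the arch
 * helper named in the commit message (ARM/arm64).
 */
static inline void cpuidle_wake_affected_cpus(unsigned long cpus)
{
	unsigned long idle = READ_ONCE(cpuidle_idle_mask) & cpus;

	if (idle)
		arch_send_wakeup_ipi_mask(to_cpumask(&idle));
}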