Mirror of https://github.com/rd-stuffs/msm-4.14.git
Synced 2025-02-20 11:45:48 +08:00
memlat: Read perf counters in parallel and reduce system jitter
Sending synchronous IPIs to other CPUs involves spinning with preemption
disabled in order to wait for each IPI to finish. Keeping preemption off
for long periods of time like this is bad for system jitter, not to
mention that the perf event IPIs are sent and flushed one at a time for
each event on each CPU, rather than all at once for all the CPUs.

Since the current way of reading perf events is quite naive, rewrite it
to exploit parallelism and go much faster. IPIs for reading each perf
event are now sent to all CPUs asynchronously so that each CPU can work
on reading its events in parallel, and the dispatching CPU now sleeps
rather than spins while waiting for the IPIs to finish. Before it starts
waiting, though, the dispatching CPU reads its own events, and then the
events which can be read from any CPU, to derive further parallelism; it
only waits for the IPIs afterwards, if they haven't already finished.
Furthermore, there is now only one IPI sent to read all of a CPU's
events, rather than one IPI per event, which significantly speeds up the
event reads and reduces the number of IPIs sent.

Active SCM calls are also now checked on a per-CPU basis rather than a
global basis, so that unrelated CPUs don't get their counter reads
skipped and some CPUs can still receive fresh counter readings.

Overall, this makes the memlat driver much faster and more efficient,
and eliminates significant system jitter previously caused by IPI abuse.

Change-Id: I238c4e57f672a0337e2377c8fd38d0f6a1dbc2d0
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: Richard Raya <rdxzv.dev@gmail.com>
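[Editor's note] To make the dispatch-and-sleep pattern concrete before the diff, here is a minimal, self-contained sketch of the scheme. It is written against the stock smp_call_function_single_async() API rather than the generic_exec_single() call used in the patch, it assumes the cpumask fits in a single unsigned long (an assumption the patch itself makes), and every demo_* name is hypothetical:

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(call_single_data_t, demo_csd);
static atomic_t demo_cpus_left;
static struct task_struct *demo_waiter;

static void demo_ipi_fn(void *info)
{
	struct task_struct *waiter = demo_waiter;
	int cpu = raw_smp_processor_id();

	/* ...read this CPU's counters here... */

	/*
	 * Pin the waiter before clearing our bit: once the final bit is
	 * cleared, the dispatcher may return and even exit. The CPU that
	 * clears the last bit wakes the sleeping dispatcher.
	 */
	get_task_struct(waiter);
	if (atomic_fetch_andnot(BIT(cpu), &demo_cpus_left) == BIT(cpu))
		wake_up_process(waiter);
	put_task_struct(waiter);
}

static void demo_read_all_cpus(void)
{
	unsigned long mask;
	int cpu, this_cpu;

	demo_waiter = current;
	cpus_read_lock();			/* keep CPUs from going away */
	this_cpu = get_cpu();			/* pin ourselves briefly */
	mask = *cpumask_bits(cpu_online_mask) & ~BIT(this_cpu);
	atomic_set(&demo_cpus_left, mask);

	/* Fire one asynchronous IPI per remote CPU; don't wait here. */
	for_each_cpu(cpu, to_cpumask(&mask)) {
		call_single_data_t *csd = per_cpu_ptr(&demo_csd, cpu);

		csd->func = demo_ipi_fn;
		csd->info = NULL;
		smp_call_function_single_async(cpu, csd);
	}

	/* ...read this CPU's own counters in parallel with the IPIs... */
	put_cpu();
	cpus_read_unlock();

	/* Sleep, rather than spin, until every remote CPU reports in. */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&demo_cpus_left))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

The subtle part is the waiter's lifetime: the IPI handler must pin the task with get_task_struct() before clearing its bit, because the dispatcher is free to return, and its stack-allocated state free to vanish, the instant the mask hits zero. The patch below additionally checks waiter->state so an already-running waiter isn't redundantly woken.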
This commit is contained in:
parent aa6fa8433d
commit 2d26472b39
@@ -58,6 +58,7 @@ struct cpu_pmu_stats {

 struct cpu_grp_info {
 	cpumask_t cpus;
+	unsigned long any_cpu_ev_mask;
 	unsigned int event_ids[NUM_EVENTS];
 	struct cpu_pmu_stats *cpustats;
 	struct memlat_hwmon hw;
@@ -67,6 +68,13 @@ struct memlat_mon_spec {
 	bool is_compute;
 };

+struct ipi_data {
+	unsigned long cnts[NR_CPUS][NUM_EVENTS];
+	struct task_struct *waiter_task;
+	struct cpu_grp_info *cpu_grp;
+	atomic_t cpus_left;
+};
+
 #define to_cpustats(cpu_grp, cpu) \
 	(&cpu_grp->cpustats[cpu - cpumask_first(&cpu_grp->cpus)])
 #define to_devstats(cpu_grp, cpu) \
@@ -93,32 +101,84 @@ static unsigned long compute_freq(struct cpu_pmu_stats *cpustats,
 }

 #define MAX_COUNT_LIM 0xFFFFFFFFFFFFFFFF
-static inline unsigned long read_event(struct event_data *event)
+static unsigned long read_event(struct cpu_pmu_stats *cpustats, int event_id)
 {
+	struct event_data *event = &cpustats->events[event_id];
 	unsigned long ev_count;
-	u64 total, enabled, running;
+	u64 total;

-	if (!event->pevent)
+	if (!event->pevent || perf_event_read_local(event->pevent, &total))
 		return 0;

-	total = perf_event_read_value(event->pevent, &enabled, &running);
 	ev_count = total - event->prev_count;
 	event->prev_count = total;
 	return ev_count;
 }

-static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)
+static void read_perf_counters(struct ipi_data *ipd, int cpu)
 {
+	struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
 	struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+	int ev;
+
+	for (ev = 0; ev < NUM_EVENTS; ev++) {
+		if (!(cpu_grp->any_cpu_ev_mask & BIT(ev)))
+			ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+	}
+}
+
+static void read_evs_ipi(void *info)
+{
+	int cpu = raw_smp_processor_id();
+	struct ipi_data *ipd = info;
+	struct task_struct *waiter;
+
+	read_perf_counters(ipd, cpu);
+
+	/*
+	 * Wake up the waiter task if we're the final CPU. The ipi_data pointer
+	 * isn't safe to dereference once cpus_left reaches zero, so the waiter
+	 * task_struct pointer must be cached before that. Also defend against
+	 * the extremely unlikely possibility that the waiter task will have
+	 * exited by the time wake_up_process() is reached.
+	 */
+	waiter = ipd->waiter_task;
+	get_task_struct(waiter);
+	if (atomic_fetch_andnot(BIT(cpu), &ipd->cpus_left) == BIT(cpu) &&
+	    waiter->state != TASK_RUNNING)
+		wake_up_process(waiter);
+	put_task_struct(waiter);
+}
+
+static void read_any_cpu_events(struct ipi_data *ipd, unsigned long cpus)
+{
+	struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+	int cpu, ev;
+
+	if (!cpu_grp->any_cpu_ev_mask)
+		return;
+
+	for_each_cpu(cpu, to_cpumask(&cpus)) {
+		struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+
+		for_each_set_bit(ev, &cpu_grp->any_cpu_ev_mask, NUM_EVENTS)
+			ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+	}
+}
+
+static void compute_perf_counters(struct ipi_data *ipd, int cpu)
+{
+	struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+	struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
 	struct dev_stats *devstats = to_devstats(cpu_grp, cpu);
 	unsigned long cyc_cnt, stall_cnt;

-	devstats->inst_count = read_event(&cpustats->events[INST_IDX]);
-	devstats->mem_count = read_event(&cpustats->events[CM_IDX]);
-	cyc_cnt = read_event(&cpustats->events[CYC_IDX]);
+	devstats->inst_count = ipd->cnts[cpu][INST_IDX];
+	devstats->mem_count = ipd->cnts[cpu][CM_IDX];
+	cyc_cnt = ipd->cnts[cpu][CYC_IDX];
 	devstats->freq = compute_freq(cpustats, cyc_cnt);
 	if (cpustats->events[STALL_CYC_IDX].pevent) {
-		stall_cnt = read_event(&cpustats->events[STALL_CYC_IDX]);
+		stall_cnt = ipd->cnts[cpu][STALL_CYC_IDX];
 		stall_cnt = min(stall_cnt, cyc_cnt);
 		devstats->stall_pct = mult_frac(100, stall_cnt, cyc_cnt);
 	} else {
@@ -128,19 +188,69 @@ static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)

 static unsigned long get_cnt(struct memlat_hwmon *hw)
 {
-	int cpu;
 	struct cpu_grp_info *cpu_grp = to_cpu_grp(hw);
+	unsigned long cpus_read_mask, tmp_mask;
+	call_single_data_t csd[NR_CPUS];
+	struct ipi_data ipd;
+	int cpu, this_cpu;
+
+	ipd.waiter_task = current;
+	ipd.cpu_grp = cpu_grp;
+
+	/* Dispatch asynchronous IPIs to each CPU to read the perf events */
+	cpus_read_lock();
+	migrate_disable();
+	this_cpu = raw_smp_processor_id();
+	cpus_read_mask = *cpumask_bits(&cpu_grp->cpus);
+	tmp_mask = cpus_read_mask & ~BIT(this_cpu);
+	ipd.cpus_left = (atomic_t)ATOMIC_INIT(tmp_mask);
+	for_each_cpu(cpu, to_cpumask(&tmp_mask)) {
+		/*
+		 * Some SCM calls take very long (20+ ms), so the IPI could lag
+		 * on the CPU running the SCM call. Skip offline CPUs too.
+		 */
+		csd[cpu].flags = 0;
+		if (under_scm_call(cpu) ||
+		    generic_exec_single(cpu, &csd[cpu], read_evs_ipi, &ipd))
+			cpus_read_mask &= ~BIT(cpu);
+	}
+	cpus_read_unlock();
+	/* Read this CPU's events while the IPIs run */
+	if (cpus_read_mask & BIT(this_cpu))
+		read_perf_counters(&ipd, this_cpu);
+	migrate_enable();
+
+	/* Bail out if there weren't any CPUs available */
+	if (!cpus_read_mask)
+		return 0;
+
+	/* Read any any-CPU events while the IPIs run */
+	read_any_cpu_events(&ipd, cpus_read_mask);
+
+	/* Clear out CPUs which were skipped */
+	atomic_andnot(cpus_read_mask ^ tmp_mask, &ipd.cpus_left);

 	/*
-	 * Some of SCM call is very heavy(+20ms) so perf IPI could
-	 * be stuck on the CPU which contributes long latency.
+	 * Wait until all the IPIs are done reading their events, and compute
+	 * each finished CPU's results while waiting since some CPUs may finish
+	 * reading their events faster than others.
 	 */
-	if (under_scm_call()) {
-		return 0;
-	}
-
-	for_each_cpu(cpu, &cpu_grp->cpus)
-		read_perf_counters(cpu, cpu_grp);
+	for (tmp_mask = cpus_read_mask;;) {
+		unsigned long cpus_done, cpus_left;
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		cpus_left = (unsigned int)atomic_read(&ipd.cpus_left);
+		if ((cpus_done = cpus_left ^ tmp_mask)) {
+			for_each_cpu(cpu, to_cpumask(&cpus_done))
+				compute_perf_counters(&ipd, cpu);
+			if (!cpus_left)
+				break;
+			tmp_mask = cpus_left;
+		} else {
+			schedule();
+		}
+	}
+	__set_current_state(TASK_RUNNING);

 	return 0;
 }
@@ -217,6 +327,8 @@ static int set_events(struct cpu_grp_info *cpu_grp, int cpu)
 			goto err_out;
 		cpustats->events[i].pevent = pevent;
 		perf_event_enable(pevent);
+		if (cpumask_equal(&pevent->readable_on_cpus, &CPU_MASK_ALL))
+			cpu_grp->any_cpu_ev_mask |= BIT(i);
 	}

 	kfree(attr);
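[Editor's note] An aside before the scm.c half below: the switch from perf_event_read_value() to perf_event_read_local() in read_event() is what makes the one-IPI-per-CPU scheme work. perf_event_read_value() may itself issue a synchronous IPI to whichever CPU owns the event, while perf_event_read_local() (two-argument form, as in this 4.14 tree) touches only the current CPU and fails fast otherwise. A hedged sketch of that contract, with a hypothetical demo_ name:

#include <linux/perf_event.h>
#include <linux/types.h>

static u64 demo_local_read(struct perf_event *ev)
{
	u64 total;

	/*
	 * Succeeds only when 'ev' is counting on this CPU; returns an
	 * error instead of IPI'ing the owning CPU the way the
	 * perf_event_read_value() path can.
	 */
	if (perf_event_read_local(ev, &total))
		return 0;

	return total;
}

This is also why events this tree flags as readable on any CPU (any_cpu_ev_mask, via readable_on_cpus above) are pulled out of the IPI path entirely and read by the dispatcher itself.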
@@ -36,7 +36,7 @@
 #define SCM_EBUSY		-55
 #define SCM_V2_EBUSY		-12

-static atomic_t scm_call_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(atomic_t, scm_call_count);
 static DEFINE_MUTEX(scm_lock);

 /*
@@ -433,11 +433,12 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
 static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
 			       u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;

-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);

 	return ret;
 }
@@ -495,11 +496,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 			       u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;

-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);

 	return ret;
 }
@@ -557,11 +559,12 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
 			       u64 *ret1, u64 *ret2, u64 *ret3)
 {
+	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
 	int ret;

-	atomic_inc(&scm_call_count);
+	atomic_inc(cnt);
 	ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
-	atomic_dec(&scm_call_count);
+	atomic_dec(cnt);

 	return ret;
 }
@@ -1352,7 +1355,7 @@ inline int scm_enable_mem_protection(void)
 #endif
 EXPORT_SYMBOL(scm_enable_mem_protection);

-bool under_scm_call(void)
+bool under_scm_call(int cpu)
 {
-	return atomic_read(&scm_call_count);
+	return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
 }
@@ -124,7 +124,7 @@ struct scm_hdcp_req {
 };

 extern struct mutex scm_lmh_lock;
-extern bool under_scm_call(void);
+extern bool under_scm_call(int cpu);
 #else

 static inline int scm_call2(u32 cmd_id, struct scm_desc *desc)
@@ -186,7 +186,7 @@ static inline int scm_enable_mem_protection(void)
 {
 	return 0;
 }
-extern bool under_scm_call(void)
+extern bool under_scm_call(int cpu)
 {
 	return false;
 }
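[Editor's note] Finally, a condensed view of the scm.c/scm.h side of the change: the global in-flight SCM counter becomes per-CPU, so the memlat dispatcher can skip only the CPU that is actually stuck in firmware instead of abandoning the whole read. A sketch under the same caveats as above, with demo_* names and a hypothetical demo_firmware_call() standing in for the real ___scm_call_armv8_*() plumbing:

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/types.h>

static DEFINE_PER_CPU(atomic_t, demo_scm_count);

static int demo_firmware_call(u64 x0, u64 x1);	/* hypothetical */

static int demo_scm_call(u64 x0, u64 x1)
{
	/* Charge the in-flight call to the CPU executing it. */
	atomic_t *cnt = per_cpu_ptr(&demo_scm_count, raw_smp_processor_id());
	int ret;

	atomic_inc(cnt);
	ret = demo_firmware_call(x0, x1);	/* may block for 20+ ms */
	atomic_dec(cnt);

	return ret;
}

/* Callers can now skip just the busy CPU rather than bailing out. */
static bool demo_under_scm_call(int cpu)
{
	return atomic_read(per_cpu_ptr(&demo_scm_count, cpu));
}

The old code returned early from get_cnt() whenever any CPU was inside an SCM call; with the per-CPU counter, only the busy CPU keeps its stale reading while every other CPU still gets fresh counts.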