From 4b1ae9fdf5d09172f1ea9eb4069aa46bb40d65af Mon Sep 17 00:00:00 2001 From: Richard Raya Date: Tue, 8 Oct 2024 22:34:01 -0300 Subject: [PATCH] msm-4.14: Revert some Sultan changes This reverts commits: - 2d26472b398e2850755e0f447f11762948876758 [memlat: Read perf counters in parallel and reduce system jitter] - 91164894044c9682faf3da668c06cd588975dc15 [soc: qcom: watchdog_v2: Optimize IPI pings to reduce system jitter] - f060136685304ae86aad3da35b1a098642d4e8de [smp: Use migrate disable/enable in smp_call_function_single_async()] - d1155d942a10a5fe166941db835aad60f8e7603d [smp: Make generic_exec_single() public] Change-Id: I8723ded1aaa7241ace7c56613a7abf6c46b7808d Signed-off-by: Richard Raya --- drivers/devfreq/arm-memlat-mon.c | 146 ++++--------------------------- drivers/soc/qcom/scm.c | 21 ++--- drivers/soc/qcom/watchdog_v2.c | 73 ++++++---------- include/linux/smp.h | 2 - include/soc/qcom/scm.h | 4 +- kernel/smp.c | 9 +- 6 files changed, 58 insertions(+), 197 deletions(-) diff --git a/drivers/devfreq/arm-memlat-mon.c b/drivers/devfreq/arm-memlat-mon.c index d845f953fb06..b82d54d47fd4 100644 --- a/drivers/devfreq/arm-memlat-mon.c +++ b/drivers/devfreq/arm-memlat-mon.c @@ -58,7 +58,6 @@ struct cpu_pmu_stats { struct cpu_grp_info { cpumask_t cpus; - unsigned long any_cpu_ev_mask; unsigned int event_ids[NUM_EVENTS]; struct cpu_pmu_stats *cpustats; struct memlat_hwmon hw; @@ -68,13 +67,6 @@ struct memlat_mon_spec { bool is_compute; }; -struct ipi_data { - unsigned long cnts[NR_CPUS][NUM_EVENTS]; - struct task_struct *waiter_task; - struct cpu_grp_info *cpu_grp; - atomic_t cpus_left; -}; - #define to_cpustats(cpu_grp, cpu) \ (&cpu_grp->cpustats[cpu - cpumask_first(&cpu_grp->cpus)]) #define to_devstats(cpu_grp, cpu) \ @@ -101,84 +93,32 @@ static unsigned long compute_freq(struct cpu_pmu_stats *cpustats, } #define MAX_COUNT_LIM 0xFFFFFFFFFFFFFFFF -static unsigned long read_event(struct cpu_pmu_stats *cpustats, int event_id) +static inline unsigned long read_event(struct event_data *event) { - struct event_data *event = &cpustats->events[event_id]; unsigned long ev_count; - u64 total; + u64 total, enabled, running; - if (!event->pevent || perf_event_read_local(event->pevent, &total)) + if (!event->pevent) return 0; + total = perf_event_read_value(event->pevent, &enabled, &running); ev_count = total - event->prev_count; event->prev_count = total; return ev_count; } -static void read_perf_counters(struct ipi_data *ipd, int cpu) +static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp) { - struct cpu_grp_info *cpu_grp = ipd->cpu_grp; - struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu); - int ev; - - for (ev = 0; ev < NUM_EVENTS; ev++) { - if (!(cpu_grp->any_cpu_ev_mask & BIT(ev))) - ipd->cnts[cpu][ev] = read_event(cpustats, ev); - } -} - -static void read_evs_ipi(void *info) -{ - int cpu = raw_smp_processor_id(); - struct ipi_data *ipd = info; - struct task_struct *waiter; - - read_perf_counters(ipd, cpu); - - /* - * Wake up the waiter task if we're the final CPU. The ipi_data pointer - * isn't safe to dereference once cpus_left reaches zero, so the waiter - * task_struct pointer must be cached before that. Also defend against - * the extremely unlikely possibility that the waiter task will have - * exited by the time wake_up_process() is reached. 
- */ - waiter = ipd->waiter_task; - get_task_struct(waiter); - if (atomic_fetch_andnot(BIT(cpu), &ipd->cpus_left) == BIT(cpu) && - waiter->state != TASK_RUNNING) - wake_up_process(waiter); - put_task_struct(waiter); -} - -static void read_any_cpu_events(struct ipi_data *ipd, unsigned long cpus) -{ - struct cpu_grp_info *cpu_grp = ipd->cpu_grp; - int cpu, ev; - - if (!cpu_grp->any_cpu_ev_mask) - return; - - for_each_cpu(cpu, to_cpumask(&cpus)) { - struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu); - - for_each_set_bit(ev, &cpu_grp->any_cpu_ev_mask, NUM_EVENTS) - ipd->cnts[cpu][ev] = read_event(cpustats, ev); - } -} - -static void compute_perf_counters(struct ipi_data *ipd, int cpu) -{ - struct cpu_grp_info *cpu_grp = ipd->cpu_grp; struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu); struct dev_stats *devstats = to_devstats(cpu_grp, cpu); unsigned long cyc_cnt, stall_cnt; - devstats->inst_count = ipd->cnts[cpu][INST_IDX]; - devstats->mem_count = ipd->cnts[cpu][CM_IDX]; - cyc_cnt = ipd->cnts[cpu][CYC_IDX]; + devstats->inst_count = read_event(&cpustats->events[INST_IDX]); + devstats->mem_count = read_event(&cpustats->events[CM_IDX]); + cyc_cnt = read_event(&cpustats->events[CYC_IDX]); devstats->freq = compute_freq(cpustats, cyc_cnt); if (cpustats->events[STALL_CYC_IDX].pevent) { - stall_cnt = ipd->cnts[cpu][STALL_CYC_IDX]; + stall_cnt = read_event(&cpustats->events[STALL_CYC_IDX]); stall_cnt = min(stall_cnt, cyc_cnt); devstats->stall_pct = mult_frac(100, stall_cnt, cyc_cnt); } else { @@ -188,69 +128,19 @@ static void compute_perf_counters(struct ipi_data *ipd, int cpu) static unsigned long get_cnt(struct memlat_hwmon *hw) { + int cpu; struct cpu_grp_info *cpu_grp = to_cpu_grp(hw); - unsigned long cpus_read_mask, tmp_mask; - call_single_data_t csd[NR_CPUS]; - struct ipi_data ipd; - int cpu, this_cpu; - - ipd.waiter_task = current; - ipd.cpu_grp = cpu_grp; - - /* Dispatch asynchronous IPIs to each CPU to read the perf events */ - cpus_read_lock(); - migrate_disable(); - this_cpu = raw_smp_processor_id(); - cpus_read_mask = *cpumask_bits(&cpu_grp->cpus); - tmp_mask = cpus_read_mask & ~BIT(this_cpu); - ipd.cpus_left = (atomic_t)ATOMIC_INIT(tmp_mask); - for_each_cpu(cpu, to_cpumask(&tmp_mask)) { - /* - * Some SCM calls take very long (20+ ms), so the IPI could lag - * on the CPU running the SCM call. Skip offline CPUs too. - */ - csd[cpu].flags = 0; - if (under_scm_call(cpu) || - generic_exec_single(cpu, &csd[cpu], read_evs_ipi, &ipd)) - cpus_read_mask &= ~BIT(cpu); - } - cpus_read_unlock(); - /* Read this CPU's events while the IPIs run */ - if (cpus_read_mask & BIT(this_cpu)) - read_perf_counters(&ipd, this_cpu); - migrate_enable(); - - /* Bail out if there weren't any CPUs available */ - if (!cpus_read_mask) - return 0; - - /* Read any any-CPU events while the IPIs run */ - read_any_cpu_events(&ipd, cpus_read_mask); - - /* Clear out CPUs which were skipped */ - atomic_andnot(cpus_read_mask ^ tmp_mask, &ipd.cpus_left); /* - * Wait until all the IPIs are done reading their events, and compute - * each finished CPU's results while waiting since some CPUs may finish - * reading their events faster than others. + * Some of SCM call is very heavy(+20ms) so perf IPI could + * be stuck on the CPU which contributes long latency. 
*/ - for (tmp_mask = cpus_read_mask;;) { - unsigned long cpus_done, cpus_left; - - set_current_state(TASK_UNINTERRUPTIBLE); - cpus_left = (unsigned int)atomic_read(&ipd.cpus_left); - if ((cpus_done = cpus_left ^ tmp_mask)) { - for_each_cpu(cpu, to_cpumask(&cpus_done)) - compute_perf_counters(&ipd, cpu); - if (!cpus_left) - break; - tmp_mask = cpus_left; - } else { - schedule(); - } + if (under_scm_call()) { + return 0; } - __set_current_state(TASK_RUNNING); + + for_each_cpu(cpu, &cpu_grp->cpus) + read_perf_counters(cpu, cpu_grp); return 0; } @@ -327,8 +217,6 @@ static int set_events(struct cpu_grp_info *cpu_grp, int cpu) goto err_out; cpustats->events[i].pevent = pevent; perf_event_enable(pevent); - if (cpumask_equal(&pevent->readable_on_cpus, &CPU_MASK_ALL)) - cpu_grp->any_cpu_ev_mask |= BIT(i); } kfree(attr); diff --git a/drivers/soc/qcom/scm.c b/drivers/soc/qcom/scm.c index fc36e8db1ab5..b067c9baf4e2 100644 --- a/drivers/soc/qcom/scm.c +++ b/drivers/soc/qcom/scm.c @@ -36,7 +36,7 @@ #define SCM_EBUSY -55 #define SCM_V2_EBUSY -12 -static DEFINE_PER_CPU(atomic_t, scm_call_count); +static atomic_t scm_call_count = ATOMIC_INIT(0); static DEFINE_MUTEX(scm_lock); /* @@ -433,12 +433,11 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5, static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5, u64 *ret1, u64 *ret2, u64 *ret3) { - atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id()); int ret; - atomic_inc(cnt); + atomic_inc(&scm_call_count); ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3); - atomic_dec(cnt); + atomic_dec(&scm_call_count); return ret; } @@ -496,12 +495,11 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, u64 *ret1, u64 *ret2, u64 *ret3) { - atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id()); int ret; - atomic_inc(cnt); + atomic_inc(&scm_call_count); ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3); - atomic_dec(cnt); + atomic_dec(&scm_call_count); return ret; } @@ -559,12 +557,11 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5, u64 *ret1, u64 *ret2, u64 *ret3) { - atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id()); int ret; - atomic_inc(cnt); + atomic_inc(&scm_call_count); ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3); - atomic_dec(cnt); + atomic_dec(&scm_call_count); return ret; } @@ -1355,7 +1352,7 @@ inline int scm_enable_mem_protection(void) #endif EXPORT_SYMBOL(scm_enable_mem_protection); -bool under_scm_call(int cpu) +bool under_scm_call(void) { - return atomic_read(per_cpu_ptr(&scm_call_count, cpu)); + return atomic_read(&scm_call_count); } diff --git a/drivers/soc/qcom/watchdog_v2.c b/drivers/soc/qcom/watchdog_v2.c index 331bf6ff6077..29955d82e92e 100644 --- a/drivers/soc/qcom/watchdog_v2.c +++ b/drivers/soc/qcom/watchdog_v2.c @@ -80,8 +80,7 @@ struct msm_watchdog_data { unsigned int min_slack_ticks; unsigned long long min_slack_ns; void *scm_regsave; - atomic_t alive_mask; - atomic_t pinged_mask; + cpumask_t alive_mask; struct mutex disable_lock; bool irq_ppi; struct msm_watchdog_data __percpu **wdog_cpu_dd; @@ -98,6 +97,8 @@ struct msm_watchdog_data { bool user_pet_complete; unsigned long long timer_fired; unsigned long long thread_start; + unsigned long long ping_start[NR_CPUS]; + 
unsigned long long ping_end[NR_CPUS]; unsigned int cpu_scandump_sizes[NR_CPUS]; /* When single buffer is used to collect Scandump */ @@ -142,8 +143,8 @@ static void dump_cpu_alive_mask(struct msm_watchdog_data *wdog_dd) { static char alive_mask_buf[MASK_SIZE]; - scnprintf(alive_mask_buf, MASK_SIZE, "%x", - atomic_read(&wdog_dd->alive_mask)); + scnprintf(alive_mask_buf, MASK_SIZE, "%*pb1", cpumask_pr_args( + &wdog_dd->alive_mask)); dev_info(wdog_dd->dev, "cpu alive mask from last pet %s\n", alive_mask_buf); } @@ -381,59 +382,33 @@ static void pet_watchdog(struct msm_watchdog_data *wdog_dd) static void keep_alive_response(void *info) { - struct msm_watchdog_data *wdog_dd = wdog_data; - unsigned int this_cpu_bit = (unsigned long)info >> 32; - unsigned int final_alive_mask = (unsigned int)(long)info; - unsigned int old; + int cpu = smp_processor_id(); + struct msm_watchdog_data *wdog_dd = (struct msm_watchdog_data *)info; - /* Wake up the watchdog task if we're the final pinged CPU */ - old = atomic_fetch_or_relaxed(this_cpu_bit, &wdog_data->alive_mask); - if (old == (final_alive_mask & ~this_cpu_bit)) - wake_up_process(wdog_dd->watchdog_task); + cpumask_set_cpu(cpu, &wdog_dd->alive_mask); + wdog_dd->ping_end[cpu] = sched_clock(); + /* Make sure alive mask is cleared and set in order */ + smp_mb(); } -static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); - /* * If this function does not return, it implies one of the * other cpu's is not responsive. */ static void ping_other_cpus(struct msm_watchdog_data *wdog_dd) { - unsigned long online_mask, ping_mask = 0; - unsigned int final_alive_mask; - int cpu, this_cpu; + int cpu; - /* - * Ping all CPUs other than the current one asynchronously so that we - * don't spend a lot of time spinning on the current CPU with IRQs - * disabled (which is what smp_call_function_single() does in - * synchronous mode). 
- */ - preempt_disable(); - this_cpu = raw_smp_processor_id(); - atomic_set(&wdog_dd->alive_mask, BIT(this_cpu)); - online_mask = *cpumask_bits(cpu_online_mask) & ~BIT(this_cpu); - for_each_cpu(cpu, to_cpumask(&online_mask)) { - if (!cpu_idle_pc_state[cpu] && !cpu_isolated(cpu)) - ping_mask |= BIT(cpu); + cpumask_clear(&wdog_dd->alive_mask); + /* Make sure alive mask is cleared and set in order */ + smp_mb(); + for_each_cpu(cpu, cpu_online_mask) { + if (!cpu_idle_pc_state[cpu] && !cpu_isolated(cpu)) { + wdog_dd->ping_start[cpu] = sched_clock(); + smp_call_function_single(cpu, keep_alive_response, + wdog_dd, 1); + } } - final_alive_mask = ping_mask | BIT(this_cpu); - for_each_cpu(cpu, to_cpumask(&ping_mask)) { - generic_exec_single(cpu, per_cpu_ptr(&csd_data, cpu), - keep_alive_response, - (void *)(BIT(cpu + 32) | final_alive_mask)); - } - preempt_enable(); - - atomic_set(&wdog_dd->pinged_mask, final_alive_mask); - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&wdog_dd->alive_mask) == final_alive_mask) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); } static void pet_task_wakeup(unsigned long data) @@ -451,7 +426,7 @@ static __ref int watchdog_kthread(void *arg) (struct msm_watchdog_data *)arg; unsigned long delay_time = 0; struct sched_param param = {.sched_priority = MAX_RT_PRIO-1}; - int ret; + int ret, cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); while (!kthread_should_stop()) { @@ -461,6 +436,9 @@ static __ref int watchdog_kthread(void *arg) } while (ret != 0); wdog_dd->thread_start = sched_clock(); + for_each_cpu(cpu, cpu_present_mask) + wdog_dd->ping_start[cpu] = wdog_dd->ping_end[cpu] = 0; + if (wdog_dd->do_ipi_ping) ping_other_cpus(wdog_dd); @@ -928,6 +906,7 @@ static int msm_watchdog_probe(struct platform_device *pdev) wdog_data = wdog_dd; wdog_dd->dev = &pdev->dev; platform_set_drvdata(pdev, wdog_dd); + cpumask_clear(&wdog_dd->alive_mask); wdog_dd->watchdog_task = kthread_create(watchdog_kthread, wdog_dd, "msm_watchdog"); if (IS_ERR(wdog_dd->watchdog_task)) { diff --git a/include/linux/smp.h b/include/linux/smp.h index a06b8da0822a..039da089482c 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -97,8 +97,6 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -int generic_exec_single(int cpu, struct __call_single_data *csd, - smp_call_func_t func, void *info); int smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); diff --git a/include/soc/qcom/scm.h b/include/soc/qcom/scm.h index 5e80139a9d84..e8d47986972a 100644 --- a/include/soc/qcom/scm.h +++ b/include/soc/qcom/scm.h @@ -124,7 +124,7 @@ struct scm_hdcp_req { }; extern struct mutex scm_lmh_lock; -extern bool under_scm_call(int cpu); +extern bool under_scm_call(void); #else static inline int scm_call2(u32 cmd_id, struct scm_desc *desc) @@ -186,7 +186,7 @@ static inline int scm_enable_mem_protection(void) { return 0; } -extern bool under_scm_call(int cpu) +extern bool under_scm_call(void) { return false; } diff --git a/kernel/smp.c b/kernel/smp.c index d836ec14b339..477434536a73 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -130,8 +130,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd) static __always_inline void csd_unlock(struct __call_single_data *csd) { - if (!(csd->flags & CSD_FLAG_LOCK)) - return; + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing 
data: @@ -148,7 +147,7 @@ extern void send_call_function_single_ipi(int cpu); * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -int generic_exec_single(int cpu, struct __call_single_data *csd, +static int generic_exec_single(int cpu, struct __call_single_data *csd, smp_call_func_t func, void *info) { if (cpu == smp_processor_id()) { @@ -371,7 +370,7 @@ int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; - migrate_disable(); + preempt_disable(); /* We could deadlock if we have to wait here with interrupts disabled! */ if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) @@ -381,7 +380,7 @@ int smp_call_function_single_async(int cpu, struct __call_single_data *csd) smp_wmb(); err = generic_exec_single(cpu, csd, csd->func, csd->info); - migrate_enable(); + preempt_enable(); return err; }
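
Note (not part of the diff): the watchdog hunks above drop the async generic_exec_single() ping-and-wait scheme and go back to pinging each CPU synchronously with smp_call_function_single(). For reference, a minimal sketch of that restored pattern follows, assuming a standalone GPL module; the demo_* names are made up for illustration, ktime_get_ns() stands in for the driver's sched_clock(), and the idle/isolated-CPU checks done by ping_other_cpus() are omitted.

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/smp.h>

static cpumask_t demo_alive_mask;
static u64 demo_ping_end[NR_CPUS];	/* mirrors wdog_dd->ping_end[] */

/* Runs on the pinged CPU (hard IRQ context for remote CPUs), like
 * keep_alive_response() after this revert. */
static void demo_keep_alive(void *info)
{
	int cpu = smp_processor_id();

	cpumask_set_cpu(cpu, &demo_alive_mask);
	demo_ping_end[cpu] = ktime_get_ns();
}

static int __init demo_ping_init(void)
{
	int cpu;

	cpumask_clear(&demo_alive_mask);

	cpus_read_lock();
	/*
	 * wait = 1: each call returns only after the target CPU has run the
	 * callback, so no completion tracking or waiter wake-up is needed.
	 */
	for_each_online_cpu(cpu)
		smp_call_function_single(cpu, demo_keep_alive, NULL, 1);
	cpus_read_unlock();

	pr_info("alive mask after ping: %*pb\n",
		cpumask_pr_args(&demo_alive_mask));
	return 0;
}

static void __exit demo_ping_exit(void)
{
}

module_init(demo_ping_init);
module_exit(demo_ping_exit);
MODULE_LICENSE("GPL");

The cost this revert accepts is the one the reverted commit called out: with wait = 1, smp_call_function_single() spins with IRQs disabled on the calling CPU until each target has responded, instead of firing asynchronous CSDs and sleeping until the last responder wakes the watchdog task.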