Revert "sched: Remove sched_ktime_clock()"

This reverts commit 24c18127e9ba ("sched: Remove sched_ktime_clock()").

WALT accounting uses ktime_get() as its time source to keep windows
aligned with the tick. The ktime_get() API must not be called while the
timekeeping subsystem is suspended during system suspend. Before the
reverted patch, the code had a wrapper around ktime_get() that avoided
calling it while the timekeeping subsystem was suspended.

The reverted patch removed this wrapper on the assumption that there
would be no scheduler activity while the timekeeping subsystem is
suspended. The timekeeping subsystem is indeed resumed very early, even
before non-boot CPUs are brought online. However, tasks can still be
woken up from the idle notifiers, which are called before the
timekeeping subsystem is resumed.

When this happens, the time read from ktime_get() is not consistent: it
jumps relative to the values that will be returned once the timekeeping
subsystem is resumed. The rq->window_start update then happens with an
incorrect time, leaving that rq->window_start inconsistent with the other
CPUs' rq->window_start and with the wallclock time after the timekeeping
subsystem is resumed. This results in WALT accounting bugs.

Change-Id: I9c3b2fb9ffbf1103d1bd78778882450560dac09f
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
[clingutla@codeaurora.org: Resolved trivial merge conflicts.]
Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
6 changed files with 65 additions and 22 deletions


@@ -785,7 +785,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	p->sched_class->dequeue_task(rq, p, flags);
 #ifdef CONFIG_SCHED_WALT
 	if (p == rq->ed_task)
-		early_detection_notify(rq, ktime_get_ns());
+		early_detection_notify(rq, sched_ktime_clock());
 #endif
 	trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]);
 }
@@ -2041,7 +2041,7 @@ static inline void walt_try_to_wake_up(struct task_struct *p)
 	rq_lock_irqsave(rq, &rf);
 	old_load = task_load(p);
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
 	rq_unlock_irqrestore(rq, &rf);
@@ -2175,7 +2175,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
 		set_task_cpu(p, cpu);
 	}
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	note_task_waking(p, wallclock);
 #else /* CONFIG_SMP */
@@ -2240,7 +2240,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
 	trace_sched_waking(p);
 	if (!task_on_rq_queued(p)) {
-		u64 wallclock = ktime_get_ns();
+		u64 wallclock = sched_ktime_clock();
 		update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 		update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
@@ -3163,7 +3163,7 @@ void scheduler_tick(void)
 	old_load = task_load(curr);
 	set_window_start(rq);
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
@@ -3547,7 +3547,7 @@ static void __sched notrace __schedule(bool preempt)
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	if (likely(prev != next)) {
 		if (!prev->on_rq)
 			prev->last_sleep_ts = wallclock;
@@ -7426,7 +7426,7 @@ void sched_exit(struct task_struct *p)
 	rq = task_rq_lock(p, &rf);
 	/* rq->curr == p */
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	dequeue_task(rq, p, 0);
 	/*


@@ -540,7 +540,7 @@ static void sugov_work(struct kthread_work *work)
 	mutex_lock(&sg_policy->work_lock);
 	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
 	sugov_track_cycles(sg_policy, sg_policy->policy->cur,
-			   ktime_get_ns());
+			   sched_ktime_clock());
 	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
 	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
 				CPUFREQ_RELATION_L);
@@ -993,7 +993,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
 	mutex_lock(&sg_policy->work_lock);
 	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
 	sugov_track_cycles(sg_policy, sg_policy->policy->cur,
-			   ktime_get_ns());
+			   sched_ktime_clock());
 	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
 	cpufreq_policy_apply_limits(policy);
 	mutex_unlock(&sg_policy->work_lock);


@@ -12750,7 +12750,7 @@ static void walt_check_for_rotation(struct rq *src_rq)
 	if (!is_min_capacity_cpu(src_cpu))
 		return;
-	wc = ktime_get_ns();
+	wc = sched_ktime_clock();
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);

@@ -1841,6 +1841,15 @@ static inline int hrtick_enabled(struct rq *rq)
 #endif /* CONFIG_SCHED_HRTICK */
 
+#ifdef CONFIG_SCHED_WALT
+u64 sched_ktime_clock(void);
+#else
+static inline u64 sched_ktime_clock(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SMP
 extern void sched_avg_update(struct rq *rq);
 extern unsigned long sched_get_rt_rq_util(int cpu);
@@ -2479,7 +2488,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
 						  cpu_of(rq)));
 	if (data)
-		data->func(data, ktime_get_ns(), flags);
+		data->func(data, sched_ktime_clock(), flags);
 }
 #else
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}


@@ -19,6 +19,7 @@
  * and Todd Kjos
  */
 
+#include <linux/syscore_ops.h>
 #include <linux/cpufreq.h>
 #include <linux/list_sort.h>
 #include <linux/jiffies.h>
@@ -42,6 +43,8 @@ const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
 #define EARLY_DETECTION_DURATION 9500000
 
+static ktime_t ktime_last;
+static bool sched_ktime_suspended;
 static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
 static bool use_cycle_counter;
 DEFINE_MUTEX(cluster_lock);
@@ -51,6 +54,37 @@ u64 walt_load_reported_window;
 static struct irq_work walt_cpufreq_irq_work;
 static struct irq_work walt_migration_irq_work;
 
+u64 sched_ktime_clock(void)
+{
+	if (unlikely(sched_ktime_suspended))
+		return ktime_to_ns(ktime_last);
+	return ktime_get_ns();
+}
+
+static void sched_resume(void)
+{
+	sched_ktime_suspended = false;
+}
+
+static int sched_suspend(void)
+{
+	ktime_last = ktime_get();
+	sched_ktime_suspended = true;
+	return 0;
+}
+
+static struct syscore_ops sched_syscore_ops = {
+	.resume = sched_resume,
+	.suspend = sched_suspend
+};
+
+static int __init sched_init_ops(void)
+{
+	register_syscore_ops(&sched_syscore_ops);
+	return 0;
+}
+late_initcall(sched_init_ops);
+
 static void acquire_rq_locks_irqsave(const cpumask_t *cpus,
 				     unsigned long *flags)
 {
@@ -361,7 +395,7 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
 	if (is_idle_task(curr)) {
 		/* We're here without rq->lock held, IRQ disabled */
 		raw_spin_lock(&rq->lock);
-		update_task_cpu_cycles(curr, cpu, ktime_get_ns());
+		update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
 		raw_spin_unlock(&rq->lock);
 	}
 }
@@ -416,7 +450,7 @@ void sched_account_irqtime(int cpu, struct task_struct *curr,
 	cur_jiffies_ts = get_jiffies_64();
 	if (is_idle_task(curr))
-		update_task_ravg(curr, rq, IRQ_UPDATE, ktime_get_ns(),
+		update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
 				 delta);
 	nr_windows = cur_jiffies_ts - rq->irqload_ts;
@@ -756,7 +790,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
 	if (sched_disable_window_stats)
 		goto done;
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	update_task_ravg(task_rq(p)->curr, task_rq(p),
 			 TASK_UPDATE,
@@ -2055,7 +2089,7 @@ void mark_task_starting(struct task_struct *p)
 		return;
 	}
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	p->ravg.mark_start = p->last_wake_ts = wallclock;
 	p->last_enqueued_ts = wallclock;
 	update_task_cpu_cycles(p, cpu_of(rq), wallclock);
@@ -2401,7 +2435,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		update_task_ravg(rq->curr, rq, TASK_UPDATE,
-				 ktime_get_ns(), 0);
+				 sched_ktime_clock(), 0);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
@@ -2561,7 +2595,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
 	if (list_empty(&grp->tasks))
 		return;
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	/*
 	 * wakeup of two or more related tasks could race with each other and
@@ -2587,7 +2621,7 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
 	grp->preferred_cluster = best_cluster(grp,
 			combined_demand, group_boost);
-	grp->last_update = ktime_get_ns();
+	grp->last_update = sched_ktime_clock();
 	trace_sched_set_preferred_cluster(grp, combined_demand);
 }
@@ -2611,7 +2645,7 @@ int update_preferred_cluster(struct related_thread_group *grp,
 	 * has passed since we last updated preference
 	 */
 	if (abs(new_load - old_load) > sched_ravg_window / 4 ||
-			ktime_get_ns() - grp->last_update > sched_ravg_window)
+			sched_ktime_clock() - grp->last_update > sched_ravg_window)
 		return 1;
 	return 0;
@@ -2994,7 +3028,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
 	bool new_task;
 	int i;
-	wallclock = ktime_get_ns();
+	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
@@ -3166,7 +3200,7 @@ void walt_irq_work(struct irq_work *irq_work)
 	for_each_cpu(cpu, cpu_possible_mask)
 		raw_spin_lock(&cpu_rq(cpu)->lock);
-	wc = ktime_get_ns();
+	wc = sched_ktime_clock();
 	walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
 	for_each_sched_cluster(cluster) {
 		u64 aggr_grp_load = 0;


@@ -299,7 +299,7 @@ void walt_sched_init_rq(struct rq *rq);
 static inline void walt_update_last_enqueue(struct task_struct *p)
 {
-	p->last_enqueued_ts = ktime_get_ns();
+	p->last_enqueued_ts = sched_ktime_clock();
 }
 extern void walt_rotate_work_init(void);
 extern void walt_rotation_checkpoint(int nr_big);