1/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 and
5 * only version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * Implementation credits: Srivatsa Vaddagiri, Steve Muckle
13 * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti
14 * and Vikram Mulukutla
15 */
16
17#include <linux/cpufreq.h>
18#include <linux/list_sort.h>
19#include <linux/syscore_ops.h>
20
21#include "sched.h"
22
23#include <trace/events/sched.h>
24
25const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
26 "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"};
27
28const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
29 "RQ_TO_RQ", "GROUP_TO_GROUP"};
30
31
32static ktime_t ktime_last;
33static bool sched_ktime_suspended;
34
35static bool use_cycle_counter;
36static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
37
38u64 sched_ktime_clock(void)
39{
40 if (unlikely(sched_ktime_suspended))
41 return ktime_to_ns(ktime_last);
42 return ktime_get_ns();
43}
44
45static void sched_resume(void)
46{
47 sched_ktime_suspended = false;
48}
49
50static int sched_suspend(void)
51{
52 ktime_last = ktime_get();
53 sched_ktime_suspended = true;
54 return 0;
55}
56
57static struct syscore_ops sched_syscore_ops = {
58 .resume = sched_resume,
59 .suspend = sched_suspend
60};
61
62static int __init sched_init_ops(void)
63{
64 register_syscore_ops(&sched_syscore_ops);
65 return 0;
66}
67late_initcall(sched_init_ops);
68
69inline void clear_ed_task(struct task_struct *p, struct rq *rq)
70{
71 if (p == rq->ed_task)
72 rq->ed_task = NULL;
73}
74
75inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
76{
77 p->last_wake_ts = wallclock;
78}
79
80inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
81{
82 p->last_switch_out_ts = wallclock;
83}
84
85/*
86 * Note C-state for (idle) cpus.
87 *
88 * @cstate = cstate index, 0 -> active state
89 * @wakeup_energy = energy spent in waking up cpu
90 * @wakeup_latency = latency to wakeup from cstate
91 *
92 */
93void
94sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
95{
96 struct rq *rq = cpu_rq(cpu);
97
98 rq->cstate = cstate; /* C1, C2 etc */
99 rq->wakeup_energy = wakeup_energy;
100 rq->wakeup_latency = wakeup_latency;
101}
102
103/*
104 * Note D-state for (idle) cluster.
105 *
106 * @dstate = dstate index, 0 -> active state
107 * @wakeup_energy = energy spent in waking up cluster
108 * @wakeup_latency = latency to wakeup from dstate
109 *
110 */
111void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
112 int wakeup_energy, int wakeup_latency)
113{
114 struct sched_cluster *cluster =
115 cpu_rq(cpumask_first(cluster_cpus))->cluster;
116 cluster->dstate = dstate;
117 cluster->dstate_wakeup_energy = wakeup_energy;
118 cluster->dstate_wakeup_latency = wakeup_latency;
119}
120
121u32 __weak get_freq_max_load(int cpu, u32 freq)
122{
123 /* 100% by default */
124 return 100;
125}
126
127struct freq_max_load_entry {
128 /* The maximum load which accounts for the governor's headroom. */
129 u64 hdemand;
130};
131
132struct freq_max_load {
133 struct rcu_head rcu;
134 int length;
135 struct freq_max_load_entry freqs[0];
136};
137
138static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
139static DEFINE_SPINLOCK(freq_max_load_lock);
140
141struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
142{
143 return NULL;
144}
145
146int sched_update_freq_max_load(const cpumask_t *cpumask)
147{
148 int i, cpu, ret;
149 unsigned int freq;
150 struct cpu_pstate_pwr *costs;
151 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
152 struct freq_max_load *max_load, *old_max_load;
153 struct freq_max_load_entry *entry;
154 u64 max_demand_capacity, max_demand;
155 unsigned long flags;
156 u32 hfreq;
157 int hpct;
158
159 if (!per_cpu_info)
160 return 0;
161
162 spin_lock_irqsave(&freq_max_load_lock, flags);
163 max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
164 for_each_cpu(cpu, cpumask) {
165 if (!per_cpu_info[cpu].ptable) {
166 ret = -EINVAL;
167 goto fail;
168 }
169
170 old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
171
172 /*
173 * Allocate len + 1 entries and leave the last power cost as 0 so
174 * that power_cost() can stop iterating when per_cpu_info[cpu].len
175 * is greater than the length of max_load, due to a race between the
176 * cpu power stats update and get_cpu_pwr_stats().
177 */
178 max_load = kzalloc(sizeof(struct freq_max_load) +
179 sizeof(struct freq_max_load_entry) *
180 (per_cpu_info[cpu].len + 1), GFP_ATOMIC);
181 if (unlikely(!max_load)) {
182 ret = -ENOMEM;
183 goto fail;
184 }
185
186 max_load->length = per_cpu_info[cpu].len;
187
188 max_demand = max_demand_capacity *
189 cpu_max_possible_capacity(cpu);
190
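		/*
		 * Each entry's hdemand is the demand that fully loads this CPU
		 * at that frequency once the per-frequency headroom from
		 * get_freq_max_load() is applied; power_cost() later compares
		 * task demand against these values.
		 */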
191 i = 0;
192 costs = per_cpu_info[cpu].ptable;
193 while (costs[i].freq) {
194 entry = &max_load->freqs[i];
195 freq = costs[i].freq;
196 hpct = get_freq_max_load(cpu, freq);
197 if (hpct <= 0 || hpct > 100)
198 hpct = 100;
199 hfreq = div64_u64((u64)freq * hpct, 100);
200 entry->hdemand =
201 div64_u64(max_demand * hfreq,
202 cpu_max_possible_freq(cpu));
203 i++;
204 }
205
206 rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
207 if (old_max_load)
208 kfree_rcu(old_max_load, rcu);
209 }
210
211 spin_unlock_irqrestore(&freq_max_load_lock, flags);
212 return 0;
213
214fail:
215 for_each_cpu(cpu, cpumask) {
216 max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
217 if (max_load) {
218 rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
219 kfree_rcu(max_load, rcu);
220 }
221 }
222
223 spin_unlock_irqrestore(&freq_max_load_lock, flags);
224 return ret;
225}
226
227unsigned int max_possible_efficiency = 1;
228unsigned int min_possible_efficiency = UINT_MAX;
229
230unsigned long __weak arch_get_cpu_efficiency(int cpu)
231{
232 return SCHED_CAPACITY_SCALE;
233}
234
235/* Keep track of max/min capacity possible across CPUs "currently" */
236static void __update_min_max_capacity(void)
237{
238 int i;
239 int max_cap = 0, min_cap = INT_MAX;
240
241 for_each_online_cpu(i) {
242 max_cap = max(max_cap, cpu_capacity(i));
243 min_cap = min(min_cap, cpu_capacity(i));
244 }
245
246 max_capacity = max_cap;
247 min_capacity = min_cap;
248}
249
250static void update_min_max_capacity(void)
251{
252 unsigned long flags;
253 int i;
254
255 local_irq_save(flags);
256 for_each_possible_cpu(i)
257 raw_spin_lock(&cpu_rq(i)->lock);
258
259 __update_min_max_capacity();
260
261 for_each_possible_cpu(i)
262 raw_spin_unlock(&cpu_rq(i)->lock);
263 local_irq_restore(flags);
264}
265
266/*
267 * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
268 * least efficient cpu gets capacity of 1024
269 */
270static unsigned long
271capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
272{
273 return (1024 * cluster->efficiency) / min_possible_efficiency;
274}
275
276/*
277 * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
278 * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
279 */
280static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
281{
282 return (1024 * cluster_max_freq(cluster)) / min_max_freq;
283}
284
285/*
286 * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
287 * that "most" efficient cpu gets a load_scale_factor of 1
288 */
289static inline unsigned long
290load_scale_cpu_efficiency(struct sched_cluster *cluster)
291{
292 return DIV_ROUND_UP(1024 * max_possible_efficiency,
293 cluster->efficiency);
294}
295
296/*
297 * Return load_scale_factor of a cpu in reference to cpu with best max_freq
298 * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
299 * of 1.
300 */
301static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
302{
303 return DIV_ROUND_UP(1024 * max_possible_freq,
304 cluster_max_freq(cluster));
305}
306
307static int compute_capacity(struct sched_cluster *cluster)
308{
309 int capacity = 1024;
310
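	/*
	 * Both scale factors are 1024-based fixed point, so each multiply is
	 * followed by a >> 10 to return to the 1024 scale.
	 */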
311 capacity *= capacity_scale_cpu_efficiency(cluster);
312 capacity >>= 10;
313
314 capacity *= capacity_scale_cpu_freq(cluster);
315 capacity >>= 10;
316
317 return capacity;
318}
319
320static int compute_max_possible_capacity(struct sched_cluster *cluster)
321{
322 int capacity = 1024;
323
324 capacity *= capacity_scale_cpu_efficiency(cluster);
325 capacity >>= 10;
326
327 capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
328 capacity >>= 10;
329
330 return capacity;
331}
332
333static int compute_load_scale_factor(struct sched_cluster *cluster)
334{
335 int load_scale = 1024;
336
337 /*
338 * load_scale_factor accounts for the fact that task load
339 * is in reference to "best" performing cpu. Task's load will need to be
340 * scaled (up) by a factor to determine suitability to be placed on a
341 * (little) cpu.
342 */
343 load_scale *= load_scale_cpu_efficiency(cluster);
344 load_scale >>= 10;
345
346 load_scale *= load_scale_cpu_freq(cluster);
347 load_scale >>= 10;
348
349 return load_scale;
350}
351
352struct list_head cluster_head;
353static DEFINE_MUTEX(cluster_lock);
354static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
355DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
356struct sched_cluster *sched_cluster[NR_CPUS];
357int num_clusters;
358
359struct sched_cluster init_cluster = {
360 .list = LIST_HEAD_INIT(init_cluster.list),
361 .id = 0,
362 .max_power_cost = 1,
363 .min_power_cost = 1,
364 .capacity = 1024,
365 .max_possible_capacity = 1024,
366 .efficiency = 1,
367 .load_scale_factor = 1024,
368 .cur_freq = 1,
369 .max_freq = 1,
370 .max_mitigated_freq = UINT_MAX,
371 .min_freq = 1,
372 .max_possible_freq = 1,
373 .dstate = 0,
374 .dstate_wakeup_energy = 0,
375 .dstate_wakeup_latency = 0,
376 .exec_scale_factor = 1024,
377 .notifier_sent = 0,
378};
379
380static void update_all_clusters_stats(void)
381{
382 struct sched_cluster *cluster;
383 u64 highest_mpc = 0, lowest_mpc = U64_MAX;
384
385 pre_big_task_count_change(cpu_possible_mask);
386
387 for_each_sched_cluster(cluster) {
388 u64 mpc;
389
390 cluster->capacity = compute_capacity(cluster);
391 mpc = cluster->max_possible_capacity =
392 compute_max_possible_capacity(cluster);
393 cluster->load_scale_factor = compute_load_scale_factor(cluster);
394
395 cluster->exec_scale_factor =
396 DIV_ROUND_UP(cluster->efficiency * 1024,
397 max_possible_efficiency);
398
399 if (mpc > highest_mpc)
400 highest_mpc = mpc;
401
402 if (mpc < lowest_mpc)
403 lowest_mpc = mpc;
404 }
405
406 max_possible_capacity = highest_mpc;
407 min_max_possible_capacity = lowest_mpc;
408
409 __update_min_max_capacity();
410 sched_update_freq_max_load(cpu_possible_mask);
411 post_big_task_count_change(cpu_possible_mask);
412}
413
414static void assign_cluster_ids(struct list_head *head)
415{
416 struct sched_cluster *cluster;
417 int pos = 0;
418
419 list_for_each_entry(cluster, head, list) {
420 cluster->id = pos;
421 sched_cluster[pos++] = cluster;
422 }
423}
424
425static void
426move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
427{
428 struct list_head *first, *last;
429
430 first = src->next;
431 last = src->prev;
432
433 if (sync_rcu) {
434 INIT_LIST_HEAD_RCU(src);
435 synchronize_rcu();
436 }
437
438 first->prev = dst;
439 dst->prev = last;
440 last->next = dst;
441
442 /* Ensure list sanity before making the head visible to all CPUs. */
443 smp_mb();
444 dst->next = first;
445}
446
447static int
448compare_clusters(void *priv, struct list_head *a, struct list_head *b)
449{
450 struct sched_cluster *cluster1, *cluster2;
451 int ret;
452
453 cluster1 = container_of(a, struct sched_cluster, list);
454 cluster2 = container_of(b, struct sched_cluster, list);
455
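	/*
	 * Sort clusters by ascending max_power_cost; on a tie, the cluster
	 * with the higher max_possible_capacity sorts first.
	 */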
456 ret = cluster1->max_power_cost > cluster2->max_power_cost ||
457 (cluster1->max_power_cost == cluster2->max_power_cost &&
458 cluster1->max_possible_capacity <
459 cluster2->max_possible_capacity);
460
461 return ret;
462}
463
464static void sort_clusters(void)
465{
466 struct sched_cluster *cluster;
467 struct list_head new_head;
468
469 INIT_LIST_HEAD(&new_head);
470
471 for_each_sched_cluster(cluster) {
472 cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
473 max_task_load());
474 cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
475 0);
476 }
477
478 move_list(&new_head, &cluster_head, true);
479
480 list_sort(NULL, &new_head, compare_clusters);
481 assign_cluster_ids(&new_head);
482
483 /*
484 * Ensure cluster ids are visible to all CPUs before making
485 * cluster_head visible.
486 */
487 move_list(&cluster_head, &new_head, false);
488}
489
490static void
491insert_cluster(struct sched_cluster *cluster, struct list_head *head)
492{
493 struct sched_cluster *tmp;
494 struct list_head *iter = head;
495
496 list_for_each_entry(tmp, head, list) {
497 if (cluster->max_power_cost < tmp->max_power_cost)
498 break;
499 iter = &tmp->list;
500 }
501
502 list_add(&cluster->list, iter);
503}
504
505static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
506{
507 struct sched_cluster *cluster = NULL;
508
509 cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
510 if (!cluster) {
511 __WARN_printf("Cluster allocation failed. "
512 "Possible bad scheduling\n");
513 return NULL;
514 }
515
516 INIT_LIST_HEAD(&cluster->list);
517 cluster->max_power_cost = 1;
518 cluster->min_power_cost = 1;
519 cluster->capacity = 1024;
520 cluster->max_possible_capacity = 1024;
521 cluster->efficiency = 1;
522 cluster->load_scale_factor = 1024;
523 cluster->cur_freq = 1;
524 cluster->max_freq = 1;
525 cluster->max_mitigated_freq = UINT_MAX;
526 cluster->min_freq = 1;
527 cluster->max_possible_freq = 1;
528 cluster->dstate = 0;
529 cluster->dstate_wakeup_energy = 0;
530 cluster->dstate_wakeup_latency = 0;
531 cluster->freq_init_done = false;
532
533 cluster->cpus = *cpus;
534 cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
535
536 if (cluster->efficiency > max_possible_efficiency)
537 max_possible_efficiency = cluster->efficiency;
538 if (cluster->efficiency < min_possible_efficiency)
539 min_possible_efficiency = cluster->efficiency;
540
541 cluster->notifier_sent = 0;
542 return cluster;
543}
544
545static void add_cluster(const struct cpumask *cpus, struct list_head *head)
546{
547 struct sched_cluster *cluster = alloc_new_cluster(cpus);
548 int i;
549
550 if (!cluster)
551 return;
552
553 for_each_cpu(i, cpus)
554 cpu_rq(i)->cluster = cluster;
555
556 insert_cluster(cluster, head);
557 set_bit(num_clusters, all_cluster_ids);
558 num_clusters++;
559}
560
561void update_cluster_topology(void)
562{
563 struct cpumask cpus = *cpu_possible_mask;
564 const struct cpumask *cluster_cpus;
565 struct list_head new_head;
566 int i;
567
568 INIT_LIST_HEAD(&new_head);
569
570 for_each_cpu(i, &cpus) {
571 cluster_cpus = cpu_coregroup_mask(i);
572 cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
573 cpumask_andnot(&cpus, &cpus, cluster_cpus);
574 add_cluster(cluster_cpus, &new_head);
575 }
576
577 assign_cluster_ids(&new_head);
578
579 /*
580 * Ensure cluster ids are visible to all CPUs before making
581 * cluster_head visible.
582 */
583 move_list(&cluster_head, &new_head, false);
584}
585
586void init_clusters(void)
587{
588 bitmap_clear(all_cluster_ids, 0, NR_CPUS);
589 init_cluster.cpus = *cpu_possible_mask;
590 INIT_LIST_HEAD(&cluster_head);
591}
592
593int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
594{
595 mutex_lock(&cluster_lock);
596 if (!cb->get_cpu_cycle_counter) {
597 mutex_unlock(&cluster_lock);
598 return -EINVAL;
599 }
600
601 cpu_cycle_counter_cb = *cb;
602 use_cycle_counter = true;
603 mutex_unlock(&cluster_lock);
604
605 return 0;
606}
607
608int got_boost_kick(void)
609{
610 int cpu = smp_processor_id();
611 struct rq *rq = cpu_rq(cpu);
612
613 return test_bit(BOOST_KICK, &rq->hmp_flags);
614}
615
616inline void clear_boost_kick(int cpu)
617{
618 struct rq *rq = cpu_rq(cpu);
619
620 clear_bit(BOOST_KICK, &rq->hmp_flags);
621}
622
623inline void boost_kick(int cpu)
624{
625 struct rq *rq = cpu_rq(cpu);
626
627 if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
628 smp_send_reschedule(cpu);
629}
630
631/* Clear any HMP scheduler related requests pending from or on cpu */
632void clear_hmp_request(int cpu)
633{
634 struct rq *rq = cpu_rq(cpu);
635 unsigned long flags;
636
637 clear_boost_kick(cpu);
638 clear_reserved(cpu);
639 if (rq->push_task) {
640 raw_spin_lock_irqsave(&rq->lock, flags);
641 if (rq->push_task) {
642 clear_reserved(rq->push_cpu);
643 put_task_struct(rq->push_task);
644 rq->push_task = NULL;
645 }
646 rq->active_balance = 0;
647 raw_spin_unlock_irqrestore(&rq->lock, flags);
648 }
649}
650
651int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
652{
653 struct rq *rq = cpu_rq(cpu);
654
655 rq->static_cpu_pwr_cost = cost;
656 return 0;
657}
658
659unsigned int sched_get_static_cpu_pwr_cost(int cpu)
660{
661 return cpu_rq(cpu)->static_cpu_pwr_cost;
662}
663
664int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
665{
666 struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
667
668 cluster->static_cluster_pwr_cost = cost;
669 return 0;
670}
671
672unsigned int sched_get_static_cluster_pwr_cost(int cpu)
673{
674 return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
675}
676
677/*
678 * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
679 * associated with them. This is required for atomic update of those variables
680 * when they are modified via the sysctl interface.
681 *
682 * IMPORTANT: Initialize both copies to the same value!!
683 */
684
685/*
686 * Tasks that are runnable continuously for a period greater than
687 * EARLY_DETECTION_DURATION can be flagged early as potential
688 * high load tasks.
689 */
690#define EARLY_DETECTION_DURATION 9500000
691
692static __read_mostly unsigned int sched_ravg_hist_size = 5;
693__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
694
695static __read_mostly unsigned int sched_window_stats_policy =
696 WINDOW_STATS_MAX_RECENT_AVG;
697__read_mostly unsigned int sysctl_sched_window_stats_policy =
698 WINDOW_STATS_MAX_RECENT_AVG;
699
700#define SCHED_ACCOUNT_WAIT_TIME 1
701
702__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
703
704unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
705
706/*
707 * Enable colocation and frequency aggregation for all threads in a process.
708 * Children inherit the group id from their parent.
709 */
710unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
711
712
713__read_mostly unsigned int sysctl_sched_new_task_windows = 5;
714
715#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
716
717/*
718 * For increase, send notification if
719 * freq_required - cur_freq > sysctl_sched_freq_inc_notify
720 */
721__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
722
723/*
724 * For decrease, send notification if
725 * cur_freq - freq_required > sysctl_sched_freq_dec_notify
726 */
727__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
728
729static __read_mostly unsigned int sched_io_is_busy;
730
731__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
732
733/*
734 * Maximum possible frequency across all cpus. Task demand and cpu
735 * capacity (cpu_power) metrics are scaled in reference to it.
736 */
737unsigned int max_possible_freq = 1;
738
739/*
740 * Minimum possible max_freq across all cpus. This will be the same as
741 * max_possible_freq on homogeneous systems and could be different from
742 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
743 * capacity (cpu_power) of cpus.
744 */
745unsigned int min_max_freq = 1;
746
747unsigned int max_capacity = 1024; /* max(rq->capacity) */
748unsigned int min_capacity = 1024; /* min(rq->capacity) */
749unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
750unsigned int
751min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
752
753/* Window size (in ns) */
754__read_mostly unsigned int sched_ravg_window = 10000000;
755
756/* Min window size (in ns) = 10ms */
757#define MIN_SCHED_RAVG_WINDOW 10000000
758
759/* Max window size (in ns) = 1s */
760#define MAX_SCHED_RAVG_WINDOW 1000000000
761
762/* Temporarily disable window-stats activity on all cpus */
763unsigned int __read_mostly sched_disable_window_stats;
764
765/*
766 * Major task runtime. If a task runs for more than sched_major_task_runtime
767 * in a window, it's considered to be generating the majority of the workload
768 * for this window. Prediction could be adjusted for such tasks.
769 */
770__read_mostly unsigned int sched_major_task_runtime = 10000000;
771
772static unsigned int sync_cpu;
773
774static LIST_HEAD(related_thread_groups);
775static DEFINE_RWLOCK(related_thread_group_lock);
776
777#define for_each_related_thread_group(grp) \
778 list_for_each_entry(grp, &related_thread_groups, list)
779
780/*
781 * Demand aggregation for frequency purpose:
782 *
783 * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
784 * for frequency determination purpose. This aggregation is done per-cluster.
785 *
786 * CPU demand of tasks from various related groups is aggregated per-cluster and
787 * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
788 * by just rq->prev_runnable_sum.
789 *
790 * Some examples follow, which assume:
791 * Cluster0 = CPU0-3, Cluster1 = CPU4-7
792 * One related thread group A that has tasks A0, A1, A2
793 *
794 * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
795 * tasks belonging to group A are accumulated when they run on cpu X.
796 *
797 * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
798 * not belonging to group A are accumulated when they run on cpu X
799 *
800 * Let's say the stats for window M were as below:
801 *
802 * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
803 * Task A0 ran 5ms on CPU0
804 * Task B0 ran 1ms on CPU0
805 *
806 * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
807 * Task A1 ran 4ms on CPU1
808 * Task A2 ran 2ms on CPU1
809 * Task B1 ran 5ms on CPU1
810 *
811 * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
812 * CPU2 idle
813 *
814 * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
815 * CPU3 idle
816 *
817 * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
818 * from all group A tasks is added to CPU1. IOW, at the end of window M, cpu busy
819 * time reported to the governor will be:
820 *
821 *
822 * C0 busy time = 1ms
823 * C1 busy time = 5 + 5 + 6 = 16ms
824 *
825 */
826static __read_mostly unsigned int sched_freq_aggregate;
827__read_mostly unsigned int sysctl_sched_freq_aggregate;
828
829unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
830static unsigned int __read_mostly sched_freq_aggregate_threshold;
831
832/* Initial task load. Newly created tasks are assigned this load. */
833unsigned int __read_mostly sched_init_task_load_windows;
834unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
835
836unsigned int max_task_load(void)
837{
838 return sched_ravg_window;
839}
840
841/*
842 * Scheduler boost is a mechanism to temporarily place tasks on CPUs
843 * with higher capacity than where they would normally have ended up
844 * given their load characteristics. Any entity enabling
845 * boost is responsible for disabling it as well.
846 */
847unsigned int sysctl_sched_boost;
848
849/* A cpu can no longer accommodate more tasks if:
850 *
851 * rq->nr_running > sysctl_sched_spill_nr_run ||
852 * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
853 */
854unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
855
856/*
857 * Place sync wakee tasks whose demand is less than the configured threshold
858 * on the waker's cluster.
859 */
860unsigned int __read_mostly sched_small_wakee_task_load;
861unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
862
863unsigned int __read_mostly sched_big_waker_task_load;
864unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
865
866/*
867 * CPUs with load greater than the sched_spill_load_threshold are not
868 * eligible for task placement. When all CPUs in a cluster achieve a
869 * load higher than this level, tasks become eligible for inter-cluster
870 * migration.
871 */
872unsigned int __read_mostly sched_spill_load;
873unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
874
875/*
876 * Tasks whose bandwidth consumption on a cpu is more than
877 * sched_upmigrate are considered "big" tasks. Big tasks will be
878 * considered for "up" migration, i.e migrating to a cpu with better
879 * capacity.
880 */
881unsigned int __read_mostly sched_upmigrate;
882unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
883
884/*
885 * Big tasks, once migrated, will need to drop their bandwidth
886 * consumption to less than sched_downmigrate before they are "down"
887 * migrated.
888 */
889unsigned int __read_mostly sched_downmigrate;
890unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
891
892/*
893 * The load scale factor of a CPU gets boosted when its max frequency
894 * is restricted, which causes tasks to migrate to higher capacity
895 * CPUs early. The sched_upmigrate threshold is auto-upgraded by
896 * rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
897 */
898unsigned int up_down_migrate_scale_factor = 1024;
899
900/*
901 * The scheduler selects and places a task on its previous CPU if its sleep
902 * time is less than sysctl_sched_select_prev_cpu_us.
903 */
904unsigned int __read_mostly
905sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
906
907unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
908
909unsigned int __read_mostly
910sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
911
912unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
913
914void update_up_down_migrate(void)
915{
916 unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
917 unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
918 unsigned int delta;
919
920 if (up_down_migrate_scale_factor == 1024)
921 goto done;
922
923 delta = up_migrate - down_migrate;
924
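	/*
	 * Apply the 1024-based scale factor in microsecond units so the
	 * 32-bit intermediate product cannot overflow, then convert back
	 * to nanoseconds.
	 */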
925 up_migrate /= NSEC_PER_USEC;
926 up_migrate *= up_down_migrate_scale_factor;
927 up_migrate >>= 10;
928 up_migrate *= NSEC_PER_USEC;
929
930 up_migrate = min(up_migrate, sched_ravg_window);
931
932 down_migrate /= NSEC_PER_USEC;
933 down_migrate *= up_down_migrate_scale_factor;
934 down_migrate >>= 10;
935 down_migrate *= NSEC_PER_USEC;
936
937 down_migrate = min(down_migrate, up_migrate - delta);
938done:
939 sched_upmigrate = up_migrate;
940 sched_downmigrate = down_migrate;
941}
942
943void set_hmp_defaults(void)
944{
945 sched_spill_load =
946 pct_to_real(sysctl_sched_spill_load_pct);
947
948 update_up_down_migrate();
949
950 sched_major_task_runtime =
951 mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100);
952
953 sched_init_task_load_windows =
954 div64_u64((u64)sysctl_sched_init_task_load_pct *
955 (u64)sched_ravg_window, 100);
956
957 sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
958 NSEC_PER_USEC;
959
960 sched_small_wakee_task_load =
961 div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
962 (u64)sched_ravg_window, 100);
963
964 sched_big_waker_task_load =
965 div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
966 (u64)sched_ravg_window, 100);
967
968 sched_freq_aggregate_threshold =
969 pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
970}
971
972u32 sched_get_init_task_load(struct task_struct *p)
973{
974 return p->init_load_pct;
975}
976
977int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
978{
979 if (init_load_pct < 0 || init_load_pct > 100)
980 return -EINVAL;
981
982 p->init_load_pct = init_load_pct;
983
984 return 0;
985}
986
987#ifdef CONFIG_CGROUP_SCHED
988
989int upmigrate_discouraged(struct task_struct *p)
990{
991 return task_group(p)->upmigrate_discouraged;
992}
993
994#else
995
996static inline int upmigrate_discouraged(struct task_struct *p)
997{
998 return 0;
999}
1000
1001#endif
1002
1003/* Is a task "big" on its current cpu */
1004static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
1005{
1006 int nice = task_nice(p);
1007
1008 if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
1009 return 0;
1010
1011 return scaled_load > sched_upmigrate;
1012}
1013
1014int is_big_task(struct task_struct *p)
1015{
1016 return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
1017}
1018
1019u64 cpu_load(int cpu)
1020{
1021 struct rq *rq = cpu_rq(cpu);
1022
1023 return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
1024}
1025
1026u64 cpu_load_sync(int cpu, int sync)
1027{
1028 return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
1029}
1030
1031static int boost_refcount;
1032static DEFINE_SPINLOCK(boost_lock);
1033static DEFINE_MUTEX(boost_mutex);
1034
1035static void boost_kick_cpus(void)
1036{
1037 int i;
1038
1039 for_each_online_cpu(i) {
1040 if (cpu_capacity(i) != max_capacity)
1041 boost_kick(i);
1042 }
1043}
1044
1045int sched_boost(void)
1046{
1047 return boost_refcount > 0;
1048}
1049
1050int sched_set_boost(int enable)
1051{
1052 unsigned long flags;
1053 int ret = 0;
1054 int old_refcount;
1055
1056 spin_lock_irqsave(&boost_lock, flags);
1057
1058 old_refcount = boost_refcount;
1059
1060 if (enable == 1) {
1061 boost_refcount++;
1062 } else if (!enable) {
1063 if (boost_refcount >= 1)
1064 boost_refcount--;
1065 else
1066 ret = -EINVAL;
1067 } else {
1068 ret = -EINVAL;
1069 }
1070
1071 if (!old_refcount && boost_refcount)
1072 boost_kick_cpus();
1073
1074 trace_sched_set_boost(boost_refcount);
1075 spin_unlock_irqrestore(&boost_lock, flags);
1076
1077 return ret;
1078}
1079
1080int sched_boost_handler(struct ctl_table *table, int write,
1081 void __user *buffer, size_t *lenp,
1082 loff_t *ppos)
1083{
1084 int ret;
1085
1086 mutex_lock(&boost_mutex);
1087 if (!write)
1088 sysctl_sched_boost = sched_boost();
1089
1090 ret = proc_dointvec(table, write, buffer, lenp, ppos);
1091 if (ret || !write)
1092 goto done;
1093
1094 ret = (sysctl_sched_boost <= 1) ?
1095 sched_set_boost(sysctl_sched_boost) : -EINVAL;
1096
1097done:
1098 mutex_unlock(&boost_mutex);
1099 return ret;
1100}
1101
1102/*
1103 * A task will fit on a cpu if its bandwidth consumption on that cpu
1104 * will be less than sched_upmigrate. A big task that was previously
1105 * "up" migrated will be considered to fit on a "little" cpu if its
1106 * bandwidth consumption on the "little" cpu will be less than
1107 * sched_downmigrate. This helps avoid frequent migrations for
1108 * tasks with load close to the upmigrate threshold.
1109 */
1110int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
1111 enum sched_boost_type boost_type)
1112{
1113 int upmigrate;
1114
1115 if (cpu_capacity(cpu) == max_capacity)
1116 return 1;
1117
1118 if (boost_type != SCHED_BOOST_ON_BIG) {
1119 if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
1120 upmigrate_discouraged(p))
1121 return 1;
1122
1123 upmigrate = sched_upmigrate;
1124 if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
1125 upmigrate = sched_downmigrate;
1126
1127 if (task_load < upmigrate)
1128 return 1;
1129 }
1130
1131 return 0;
1132}
1133
1134enum sched_boost_type sched_boost_type(void)
1135{
1136 if (sched_boost()) {
1137 if (min_possible_efficiency != max_possible_efficiency)
1138 return SCHED_BOOST_ON_BIG;
1139 else
1140 return SCHED_BOOST_ON_ALL;
1141 }
1142 return SCHED_BOOST_NONE;
1143}
1144
1145int task_will_fit(struct task_struct *p, int cpu)
1146{
1147 u64 tload = scale_load_to_cpu(task_load(p), cpu);
1148
1149 return task_load_will_fit(p, tload, cpu, sched_boost_type());
1150}
1151
1152int group_will_fit(struct sched_cluster *cluster,
1153 struct related_thread_group *grp, u64 demand)
1154{
1155 int cpu = cluster_first_cpu(cluster);
1156 int prev_capacity = 0;
1157 unsigned int threshold = sched_upmigrate;
1158 u64 load;
1159
1160 if (cluster->capacity == max_capacity)
1161 return 1;
1162
1163 if (grp->preferred_cluster)
1164 prev_capacity = grp->preferred_cluster->capacity;
1165
1166 if (cluster->capacity < prev_capacity)
1167 threshold = sched_downmigrate;
1168
1169 load = scale_load_to_cpu(demand, cpu);
1170 if (load < threshold)
1171 return 1;
1172
1173 return 0;
1174}
1175
1176/*
1177 * Return the cost of running task p on CPU cpu. This function
1178 * currently assumes that task p is the only task which will run on
1179 * the CPU.
1180 */
1181unsigned int power_cost(int cpu, u64 demand)
1182{
1183 int first, mid, last;
1184 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
1185 struct cpu_pstate_pwr *costs;
1186 struct freq_max_load *max_load;
1187 int total_static_pwr_cost = 0;
1188 struct rq *rq = cpu_rq(cpu);
1189 unsigned int pc;
1190
1191 if (!per_cpu_info || !per_cpu_info[cpu].ptable)
1192 /*
1193 * When power aware scheduling is not in use, or CPU
1194 * power data is not available, just use the CPU
1195 * capacity as a rough stand-in for real CPU power
1196 * numbers, assuming bigger CPUs are more power
1197 * hungry.
1198 */
1199 return cpu_max_possible_capacity(cpu);
1200
1201 rcu_read_lock();
1202 max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
1203 if (!max_load) {
1204 pc = cpu_max_possible_capacity(cpu);
1205 goto unlock;
1206 }
1207
1208 costs = per_cpu_info[cpu].ptable;
1209
1210 if (demand <= max_load->freqs[0].hdemand) {
1211 pc = costs[0].power;
1212 goto unlock;
1213 } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
1214 pc = costs[max_load->length - 1].power;
1215 goto unlock;
1216 }
1217
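	/*
	 * Binary search for the lowest-indexed frequency whose hdemand can
	 * accommodate the requested demand; costs[] and max_load->freqs[]
	 * share the same index.
	 */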
1218 first = 0;
1219 last = max_load->length - 1;
1220 mid = (last - first) >> 1;
1221 while (1) {
1222 if (demand <= max_load->freqs[mid].hdemand)
1223 last = mid;
1224 else
1225 first = mid;
1226
1227 if (last - first == 1)
1228 break;
1229 mid = first + ((last - first) >> 1);
1230 }
1231
1232 pc = costs[last].power;
1233
1234unlock:
1235 rcu_read_unlock();
1236
1237 if (idle_cpu(cpu) && rq->cstate) {
1238 total_static_pwr_cost += rq->static_cpu_pwr_cost;
1239 if (rq->cluster->dstate)
1240 total_static_pwr_cost +=
1241 rq->cluster->static_cluster_pwr_cost;
1242 }
1243
1244 return pc + total_static_pwr_cost;
1245
1246}
1247
1248void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
1249{
1250 if (sched_disable_window_stats)
1251 return;
1252
1253 if (is_big_task(p))
1254 stats->nr_big_tasks++;
1255}
1256
1257void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
1258{
1259 if (sched_disable_window_stats)
1260 return;
1261
1262 if (is_big_task(p))
1263 stats->nr_big_tasks--;
1264
1265 BUG_ON(stats->nr_big_tasks < 0);
1266}
1267
1268void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
1269{
1270 inc_nr_big_task(&rq->hmp_stats, p);
1271 if (change_cra)
1272 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
1273}
1274
1275void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
1276{
1277 dec_nr_big_task(&rq->hmp_stats, p);
1278 if (change_cra)
1279 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
1280}
1281
1282static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
1283{
1284 stats->nr_big_tasks = 0;
1285 if (reset_cra) {
1286 stats->cumulative_runnable_avg = 0;
1287 stats->pred_demands_sum = 0;
1288 }
1289}
1290
1291/*
1292 * Invoked from three places:
1293 * 1) try_to_wake_up() -> ... -> select_best_cpu()
1294 * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
1295 * 3) can_migrate_task()
1296 *
1297 * It's safe to de-reference p->grp in the first case (since p->pi_lock is
1298 * held) but not in other cases. p->grp is hence freed after an RCU grace
1299 * period and accessed under rcu_read_lock().
1300 */
1301int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
1302{
1303 struct related_thread_group *grp;
1304 int rc = 0;
1305
1306 rcu_read_lock();
1307
1308 grp = task_related_thread_group(p);
1309 if (!grp || !sysctl_sched_enable_colocation)
1310 rc = 1;
1311 else
1312 rc = (grp->preferred_cluster == cluster);
1313
1314 rcu_read_unlock();
1315 return rc;
1316}
1317
1318struct sched_cluster *rq_cluster(struct rq *rq)
1319{
1320 return rq->cluster;
1321}
1322
1323/*
1324 * reset_cpu_hmp_stats - reset HMP stats for a cpu
1325 * nr_big_tasks
1326 * cumulative_runnable_avg (iff reset_cra is true)
1327 */
1328void reset_cpu_hmp_stats(int cpu, int reset_cra)
1329{
1330 reset_cfs_rq_hmp_stats(cpu, reset_cra);
1331 reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
1332}
1333
1334void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
1335 struct task_struct *p, s64 delta)
1336{
1337 u64 new_task_load;
1338 u64 old_task_load;
1339
1340 if (sched_disable_window_stats)
1341 return;
1342
1343 old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
1344 new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
1345
1346 if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
1347 stats->nr_big_tasks--;
1348 else if (!__is_big_task(p, old_task_load) &&
1349 __is_big_task(p, new_task_load))
1350 stats->nr_big_tasks++;
1351
1352 BUG_ON(stats->nr_big_tasks < 0);
1353}
1354
1355/*
1356 * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
1357 */
1358static void update_nr_big_tasks(int cpu)
1359{
1360 struct rq *rq = cpu_rq(cpu);
1361 struct task_struct *p;
1362
1363 /* Do not reset cumulative_runnable_avg */
1364 reset_cpu_hmp_stats(cpu, 0);
1365
1366 list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
1367 inc_hmp_sched_stats_fair(rq, p, 0);
1368}
1369
1370/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
1371void pre_big_task_count_change(const struct cpumask *cpus)
1372{
1373 int i;
1374
1375 local_irq_disable();
1376
1377 for_each_cpu(i, cpus)
1378 raw_spin_lock(&cpu_rq(i)->lock);
1379}
1380
1381/*
1382 * Reinitialize 'nr_big_tasks' counters on all affected cpus
1383 */
1384void post_big_task_count_change(const struct cpumask *cpus)
1385{
1386 int i;
1387
1388 /* Assumes local_irq_disable() keeps online cpumap stable */
1389 for_each_cpu(i, cpus)
1390 update_nr_big_tasks(i);
1391
1392 for_each_cpu(i, cpus)
1393 raw_spin_unlock(&cpu_rq(i)->lock);
1394
1395 local_irq_enable();
1396}
1397
1398DEFINE_MUTEX(policy_mutex);
1399
1400static inline int invalid_value_freq_input(unsigned int *data)
1401{
1402 if (data == &sysctl_sched_freq_aggregate)
1403 return !(*data == 0 || *data == 1);
1404
1405 return 0;
1406}
1407
1408static inline int invalid_value(unsigned int *data)
1409{
1410 unsigned int val = *data;
1411
1412 if (data == &sysctl_sched_ravg_hist_size)
1413 return (val < 2 || val > RAVG_HIST_SIZE_MAX);
1414
1415 if (data == &sysctl_sched_window_stats_policy)
1416 return val >= WINDOW_STATS_INVALID_POLICY;
1417
1418 return invalid_value_freq_input(data);
1419}
1420
1421/*
1422 * Handle "atomic" update of sysctl_sched_window_stats_policy,
1423 * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
1424 */
1425int sched_window_update_handler(struct ctl_table *table, int write,
1426 void __user *buffer, size_t *lenp,
1427 loff_t *ppos)
1428{
1429 int ret;
1430 unsigned int *data = (unsigned int *)table->data;
1431 unsigned int old_val;
1432
1433 mutex_lock(&policy_mutex);
1434
1435 old_val = *data;
1436
1437 ret = proc_dointvec(table, write, buffer, lenp, ppos);
1438 if (ret || !write || (write && (old_val == *data)))
1439 goto done;
1440
1441 if (invalid_value(data)) {
1442 *data = old_val;
1443 ret = -EINVAL;
1444 goto done;
1445 }
1446
1447 reset_all_window_stats(0, 0);
1448
1449done:
1450 mutex_unlock(&policy_mutex);
1451
1452 return ret;
1453}
1454
1455/*
1456 * Convert percentage value into absolute form. This avoids a div() operation
1457 * in the fast path when converting task load to percentage scale.
1458 */
1459int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
1460 void __user *buffer, size_t *lenp,
1461 loff_t *ppos)
1462{
1463 int ret;
1464 unsigned int old_val;
1465 unsigned int *data = (unsigned int *)table->data;
1466 int update_min_nice = 0;
1467
1468 mutex_lock(&policy_mutex);
1469
1470 old_val = *data;
1471
1472 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
1473
1474 if (ret || !write)
1475 goto done;
1476
1477 if (write && (old_val == *data))
1478 goto done;
1479
1480 if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
1481 *data = old_val;
1482 ret = -EINVAL;
1483 goto done;
1484 }
1485
1486 /*
1487 * Big task tunable change will need to re-classify tasks on
1488 * runqueue as big and set their counters appropriately.
1489 * The sysctl interface affects secondary variables (*_pct), which are then
1490 * "atomically" carried over to the primary variables. The atomic change
1491 * includes taking the runqueue lock of all online cpus and re-initializing
1492 * their big counter values based on changed criteria.
1493 */
1494 if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
1495 get_online_cpus();
1496 pre_big_task_count_change(cpu_online_mask);
1497 }
1498
1499 set_hmp_defaults();
1500
1501 if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
1502 post_big_task_count_change(cpu_online_mask);
1503 put_online_cpus();
1504 }
1505
1506done:
1507 mutex_unlock(&policy_mutex);
1508 return ret;
1509}
1510
1511inline int nr_big_tasks(struct rq *rq)
1512{
1513 return rq->hmp_stats.nr_big_tasks;
1514}
1515
1516unsigned int cpu_temp(int cpu)
1517{
1518 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
1519
1520 if (per_cpu_info)
1521 return per_cpu_info[cpu].temp;
1522 else
1523 return 0;
1524}
1525
1526void init_new_task_load(struct task_struct *p)
1527{
1528 int i;
1529 u32 init_load_windows = sched_init_task_load_windows;
1530 u32 init_load_pct = current->init_load_pct;
1531
1532 p->init_load_pct = 0;
1533 rcu_assign_pointer(p->grp, NULL);
1534 INIT_LIST_HEAD(&p->grp_list);
1535 memset(&p->ravg, 0, sizeof(struct ravg));
1536 p->cpu_cycles = 0;
1537
1538 if (init_load_pct)
1539 init_load_windows = div64_u64((u64)init_load_pct *
1540 (u64)sched_ravg_window, 100);
1541
1542 p->ravg.demand = init_load_windows;
1543 p->ravg.pred_demand = 0;
1544 for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
1545 p->ravg.sum_history[i] = init_load_windows;
1546}
1547
1548/* Return task demand in percentage scale */
1549unsigned int pct_task_load(struct task_struct *p)
1550{
1551 unsigned int load;
1552
1553 load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
1554
1555 return load;
1556}
1557
1558/*
1559 * Return total number of tasks "eligible" to run on highest capacity cpu
1560 *
1561 * This is simply nr_big_tasks for cpus which are not of max_capacity and
1562 * nr_running for cpus of max_capacity
1563 */
1564unsigned int nr_eligible_big_tasks(int cpu)
1565{
1566 struct rq *rq = cpu_rq(cpu);
1567 int nr_big = rq->hmp_stats.nr_big_tasks;
1568 int nr = rq->nr_running;
1569
1570 if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
1571 return nr_big;
1572
1573 return nr;
1574}
1575
1576static inline int exiting_task(struct task_struct *p)
1577{
1578 return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
1579}
1580
1581static int __init set_sched_ravg_window(char *str)
1582{
1583 unsigned int window_size;
1584
1585 get_option(&str, &window_size);
1586
1587 if (window_size < MIN_SCHED_RAVG_WINDOW ||
1588 window_size > MAX_SCHED_RAVG_WINDOW) {
1589 WARN_ON(1);
1590 return -EINVAL;
1591 }
1592
1593 sched_ravg_window = window_size;
1594 return 0;
1595}
1596
1597early_param("sched_ravg_window", set_sched_ravg_window);
1598
1599static inline void
1600update_window_start(struct rq *rq, u64 wallclock)
1601{
1602 s64 delta;
1603 int nr_windows;
1604
1605 delta = wallclock - rq->window_start;
1606 BUG_ON(delta < 0);
1607 if (delta < sched_ravg_window)
1608 return;
1609
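	/* Advance window_start by whole windows so it stays window-aligned. */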
1610 nr_windows = div64_u64(delta, sched_ravg_window);
1611 rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
1612}
1613
1614#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
1615
1616static inline u64 scale_exec_time(u64 delta, struct rq *rq)
1617{
1618 u32 freq;
1619
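	/*
	 * Normalize execution time to max_possible_freq on the most
	 * efficient cluster so task demand is comparable across CPUs.
	 */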
1620 freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
1621 delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
1622 delta *= rq->cluster->exec_scale_factor;
1623 delta >>= 10;
1624
1625 return delta;
1626}
1627
1628static inline int cpu_is_waiting_on_io(struct rq *rq)
1629{
1630 if (!sched_io_is_busy)
1631 return 0;
1632
1633 return atomic_read(&rq->nr_iowait);
1634}
1635
1636/* Does freq_required sufficiently exceed or fall behind cur_freq? */
1637static inline int
1638nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
1639{
1640 int delta = freq_required - cur_freq;
1641
1642 if (freq_required > cur_freq)
1643 return delta < sysctl_sched_freq_inc_notify;
1644
1645 delta = -delta;
1646
1647 return delta < sysctl_sched_freq_dec_notify;
1648}
1649
1650/* Convert busy time to frequency equivalent */
1651static inline unsigned int load_to_freq(struct rq *rq, u64 load)
1652{
1653 unsigned int freq;
1654
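	/*
	 * Express the load as a 1/128 fraction of max_task_load() and map
	 * that fraction onto this CPU's maximum possible frequency.
	 */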
1655 load = scale_load_to_cpu(load, cpu_of(rq));
1656 load *= 128;
1657 load = div64_u64(load, max_task_load());
1658
1659 freq = load * cpu_max_possible_freq(cpu_of(rq));
1660 freq /= 128;
1661
1662 return freq;
1663}
1664
1665static inline struct group_cpu_time *
1666_group_cpu_time(struct related_thread_group *grp, int cpu);
1667
1668/*
1669 * Return load from all related groups on a given cpu.
1670 * Caller must ensure that related_thread_group_lock is held.
1671 */
1672static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
1673{
1674 struct related_thread_group *grp;
1675
1676 for_each_related_thread_group(grp) {
1677 struct group_cpu_time *cpu_time;
1678
1679 cpu_time = _group_cpu_time(grp, cpu);
1680 *grp_load += cpu_time->prev_runnable_sum;
1681 if (new_grp_load)
1682 *new_grp_load += cpu_time->nt_prev_runnable_sum;
1683 }
1684}
1685
1686/*
1687 * Return load from all related groups in a given frequency domain.
1688 * Caller must ensure that related_thread_group_lock is held.
1689 */
1690static void group_load_in_freq_domain(struct cpumask *cpus,
1691 u64 *grp_load, u64 *new_grp_load)
1692{
1693 struct related_thread_group *grp;
1694 int j;
1695
1696 for_each_related_thread_group(grp) {
1697 for_each_cpu(j, cpus) {
1698 struct group_cpu_time *cpu_time;
1699
1700 cpu_time = _group_cpu_time(grp, j);
1701 *grp_load += cpu_time->prev_runnable_sum;
1702 *new_grp_load += cpu_time->nt_prev_runnable_sum;
1703 }
1704 }
1705}
1706
1707/*
1708 * Should scheduler alert governor for changing frequency?
1709 *
1710 * @check_pred - evaluate frequency based on the predictive demand
1711 * @check_groups - add load from all related groups on given cpu
1712 *
1713 * check_groups is set to 1 if a "related" task movement/wakeup is triggering
1714 * the notification check. To avoid "re-aggregation" of demand in such cases,
1715 * we check whether the migrated/woken task's demand (along with demand from
1716 * existing tasks on the cpu) can be met on the target cpu.
1717 *
1718 */
1719
1720static int send_notification(struct rq *rq, int check_pred, int check_groups)
1721{
1722 unsigned int cur_freq, freq_required;
1723 unsigned long flags;
1724 int rc = 0;
1725 u64 group_load = 0, new_load = 0;
1726
1727 if (check_pred) {
1728 u64 prev = rq->old_busy_time;
1729 u64 predicted = rq->hmp_stats.pred_demands_sum;
1730
1731 if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
1732 return 0;
1733
1734 prev = max(prev, rq->old_estimated_time);
1735 if (prev > predicted)
1736 return 0;
1737
1738 cur_freq = load_to_freq(rq, prev);
1739 freq_required = load_to_freq(rq, predicted);
1740
1741 if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
1742 return 0;
1743 } else {
1744 read_lock(&related_thread_group_lock);
1745 /*
1746 * Protect from concurrent update of rq->prev_runnable_sum and
1747 * group cpu load
1748 */
1749 raw_spin_lock_irqsave(&rq->lock, flags);
1750 if (check_groups)
1751 _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
1752
1753 new_load = rq->prev_runnable_sum + group_load;
1754
1755 raw_spin_unlock_irqrestore(&rq->lock, flags);
1756 read_unlock(&related_thread_group_lock);
1757
1758 cur_freq = load_to_freq(rq, rq->old_busy_time);
1759 freq_required = load_to_freq(rq, new_load);
1760
1761 if (nearly_same_freq(cur_freq, freq_required))
1762 return 0;
1763 }
1764
1765 raw_spin_lock_irqsave(&rq->lock, flags);
1766 if (!rq->cluster->notifier_sent) {
1767 rq->cluster->notifier_sent = 1;
1768 rc = 1;
1769 trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
1770 new_load);
1771 }
1772 raw_spin_unlock_irqrestore(&rq->lock, flags);
1773
1774 return rc;
1775}
1776
1777/* Alert governor if there is a need to change frequency */
1778void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
1779{
1780 int cpu = cpu_of(rq);
1781
1782 if (!send_notification(rq, check_pred, check_groups))
1783 return;
1784
1785 atomic_notifier_call_chain(
1786 &load_alert_notifier_head, 0,
1787 (void *)(long)cpu);
1788}
1789
1790void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
1791 struct task_struct *p)
1792{
1793 bool check_groups;
1794
1795 rcu_read_lock();
1796 check_groups = task_in_related_thread_group(p);
1797 rcu_read_unlock();
1798
1799 if (!same_freq_domain(src_cpu, dest_cpu)) {
1800 if (!src_cpu_dead)
1801 check_for_freq_change(cpu_rq(src_cpu), false,
1802 check_groups);
1803 check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
1804 } else {
1805 check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
1806 }
1807}
1808
1809static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
1810 u64 irqtime, int event)
1811{
1812 if (is_idle_task(p)) {
1813 /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
1814 if (event == PICK_NEXT_TASK)
1815 return 0;
1816
1817 /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
1818 return irqtime || cpu_is_waiting_on_io(rq);
1819 }
1820
1821 if (event == TASK_WAKE)
1822 return 0;
1823
1824 if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
1825 return 1;
1826
1827 /*
1828 * TASK_UPDATE can be called on a sleeping task, when it is moved between
1829 * related groups.
1830 */
1831 if (event == TASK_UPDATE) {
1832 if (rq->curr == p)
1833 return 1;
1834
1835 return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
1836 }
1837
1838 /* TASK_MIGRATE, PICK_NEXT_TASK left */
1839 return SCHED_FREQ_ACCOUNT_WAIT_TIME;
1840}
1841
1842static inline bool is_new_task(struct task_struct *p)
1843{
1844 return p->ravg.active_windows < sysctl_sched_new_task_windows;
1845}
1846
1847#define INC_STEP 8
1848#define DEC_STEP 2
1849#define CONSISTENT_THRES 16
1850#define INC_STEP_BIG 16
1851/*
1852 * bucket_increase - update the count of all buckets
1853 *
1854 * @buckets: array of buckets tracking busy time of a task
1855 * @idx: the index of bucket to be incremented
1856 *
1857 * Each time a complete window finishes, count of bucket that runtime
1858 * falls in (@idx) is incremented. Counts of all other buckets are
1859 * decayed. The rate of increase and decay could be different based
1860 * on current count in the bucket.
1861 */
1862static inline void bucket_increase(u8 *buckets, int idx)
1863{
1864 int i, step;
1865
1866 for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
1867 if (idx != i) {
1868 if (buckets[i] > DEC_STEP)
1869 buckets[i] -= DEC_STEP;
1870 else
1871 buckets[i] = 0;
1872 } else {
1873 step = buckets[i] >= CONSISTENT_THRES ?
1874 INC_STEP_BIG : INC_STEP;
1875 if (buckets[i] > U8_MAX - step)
1876 buckets[i] = U8_MAX;
1877 else
1878 buckets[i] += step;
1879 }
1880 }
1881}
1882
1883static inline int busy_to_bucket(u32 normalized_rt)
1884{
1885 int bidx;
1886
1887 bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
1888 bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
1889
1890 /*
1891 * Combine the lowest two buckets. The lowest frequency already falls
1892 * into the 2nd bucket, so continually predicting the lowest bucket is
1893 * not useful.
1894 */
1895 if (!bidx)
1896 bidx++;
1897
1898 return bidx;
1899}
1900
1901static inline u64
1902scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
1903{
1904 return div64_u64(load * (u64)src_freq, (u64)dst_freq);
1905}
1906
1907#define HEAVY_TASK_SKIP 2
1908#define HEAVY_TASK_SKIP_LIMIT 4
1909/*
1910 * get_pred_busy - calculate predicted demand for a task on runqueue
1911 *
1912 * @rq: runqueue of task p
1913 * @p: task whose prediction is being updated
1914 * @start: starting bucket. returned prediction should not be lower than
1915 * this bucket.
1916 * @runtime: runtime of the task. returned prediction should not be lower
1917 * than this runtime.
1918 * Note: @start can be derived from @runtime. It's passed in only to
1919 * avoid duplicated calculation in some cases.
1920 *
1921 * A new predicted busy time is returned for task @p based on @runtime
1922 * passed in. The function searches through buckets that represent busy
1923 * time equal to or bigger than @runtime and attempts to find the bucket
1924 * to use for prediction. Once found, it searches through historical busy
1925 * time and returns the latest that falls into the bucket. If no such busy
1926 * time exists, it returns the midpoint of that bucket.
1927 */
1928static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
1929 int start, u32 runtime)
1930{
1931 int i;
1932 u8 *buckets = p->ravg.busy_buckets;
1933 u32 *hist = p->ravg.sum_history;
1934 u32 dmin, dmax;
1935 u64 cur_freq_runtime = 0;
1936 int first = NUM_BUSY_BUCKETS, final, skip_to;
1937 u32 ret = runtime;
1938
1939 /* skip prediction for new tasks due to lack of history */
1940 if (unlikely(is_new_task(p)))
1941 goto out;
1942
1943 /* find minimal bucket index to pick */
1944 for (i = start; i < NUM_BUSY_BUCKETS; i++) {
1945 if (buckets[i]) {
1946 first = i;
1947 break;
1948 }
1949 }
1950 /* if no higher buckets are filled, predict runtime */
1951 if (first >= NUM_BUSY_BUCKETS)
1952 goto out;
1953
1954 /* compute the bucket for prediction */
1955 final = first;
1956 if (first < HEAVY_TASK_SKIP_LIMIT) {
1957 /* compute runtime at current CPU frequency */
1958 cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
1959 rq->cluster->efficiency);
1960 cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
1961 max_possible_freq, rq->cluster->cur_freq);
1962 /*
1963 * if the task runs for majority of the window, try to
1964 * pick higher buckets.
1965 */
1966 if (cur_freq_runtime >= sched_major_task_runtime) {
1967 int next = NUM_BUSY_BUCKETS;
1968 /*
1969 * if there is a higher bucket that's consistently
1970 * hit, don't jump beyond that.
1971 */
1972 for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
1973 i < NUM_BUSY_BUCKETS; i++) {
1974 if (buckets[i] > CONSISTENT_THRES) {
1975 next = i;
1976 break;
1977 }
1978 }
1979 skip_to = min(next, start + HEAVY_TASK_SKIP);
1980 /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
1981 skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
1982 /* don't go below first non-empty bucket, if any */
1983 final = max(first, skip_to);
1984 }
1985 }
1986
1987 /* determine demand range for the predicted bucket */
1988 if (final < 2) {
1989 /* lowest two buckets are combined */
1990 dmin = 0;
1991 final = 1;
1992 } else {
1993 dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
1994 }
1995 dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
1996
1997 /*
1998 * search through runtime history and return first runtime that falls
1999 * into the range of predicted bucket.
2000 */
2001 for (i = 0; i < sched_ravg_hist_size; i++) {
2002 if (hist[i] >= dmin && hist[i] < dmax) {
2003 ret = hist[i];
2004 break;
2005 }
2006 }
2007 /* no historical runtime within the bucket found, use the bucket midpoint */
2008 if (ret < dmin)
2009 ret = (dmin + dmax) / 2;
2010 /*
2011 * when updating in the middle of a window, runtime could be higher
2012 * than all recorded history. Always predict at least @runtime.
2013 */
2014 ret = max(runtime, ret);
2015out:
2016 trace_sched_update_pred_demand(rq, p, runtime,
2017 mult_frac((unsigned int)cur_freq_runtime, 100,
2018 sched_ravg_window), ret);
2019 return ret;
2020}
2021
2022static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
2023{
2024 if (p->ravg.pred_demand >= p->ravg.curr_window)
2025 return p->ravg.pred_demand;
2026
2027 return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
2028 p->ravg.curr_window);
2029}
2030
2031/*
2032 * Predictive demand of a task is calculated at the window roll-over.
2033 * If the task's current window busy time exceeds the predicted
2034 * demand, update it here to reflect the task's needs.
2035 */
2036void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
2037{
2038 u32 new, old;
2039
2040 if (is_idle_task(p) || exiting_task(p))
2041 return;
2042
2043 if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
2044 (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
2045 (event != TASK_MIGRATE &&
2046 event != PICK_NEXT_TASK)))
2047 return;
2048
2049 /*
2050	 * TASK_UPDATE can be called on a sleeping task, when it is moved
2051	 * between related groups.
2052 */
2053 if (event == TASK_UPDATE) {
2054 if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
2055 return;
2056 }
2057
2058 new = calc_pred_demand(rq, p);
2059 old = p->ravg.pred_demand;
2060
2061 if (old >= new)
2062 return;
2063
2064 if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
2065 !p->dl.dl_throttled))
2066 p->sched_class->fixup_hmp_sched_stats(rq, p,
2067 p->ravg.demand,
2068 new);
2069
2070 p->ravg.pred_demand = new;
2071}
2072
2073/*
2074 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
2075 */
2076static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
2077 int event, u64 wallclock, u64 irqtime)
2078{
2079 int new_window, full_window = 0;
2080 int p_is_curr_task = (p == rq->curr);
2081 u64 mark_start = p->ravg.mark_start;
2082 u64 window_start = rq->window_start;
2083 u32 window_size = sched_ravg_window;
2084 u64 delta;
2085 u64 *curr_runnable_sum = &rq->curr_runnable_sum;
2086 u64 *prev_runnable_sum = &rq->prev_runnable_sum;
2087 u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
2088 u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
2089 int flip_counters = 0;
2090 int prev_sum_reset = 0;
2091 bool new_task;
2092 struct related_thread_group *grp;
2093
2094 new_window = mark_start < window_start;
2095 if (new_window) {
2096 full_window = (window_start - mark_start) >= window_size;
2097 if (p->ravg.active_windows < USHRT_MAX)
2098 p->ravg.active_windows++;
2099 }
2100
2101 new_task = is_new_task(p);
2102
2103 grp = p->grp;
2104 if (grp && sched_freq_aggregate) {
2105 /* cpu_time protected by rq_lock */
2106 struct group_cpu_time *cpu_time =
2107 _group_cpu_time(grp, cpu_of(rq));
2108
2109 curr_runnable_sum = &cpu_time->curr_runnable_sum;
2110 prev_runnable_sum = &cpu_time->prev_runnable_sum;
2111
2112 nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
2113 nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
2114
2115 if (cpu_time->window_start != rq->window_start) {
2116 int nr_windows;
2117
2118 delta = rq->window_start - cpu_time->window_start;
2119 nr_windows = div64_u64(delta, window_size);
2120 if (nr_windows > 1)
2121 prev_sum_reset = 1;
2122
2123 cpu_time->window_start = rq->window_start;
2124 flip_counters = 1;
2125 }
2126
2127 if (p_is_curr_task && new_window) {
2128 u64 curr_sum = rq->curr_runnable_sum;
2129 u64 nt_curr_sum = rq->nt_curr_runnable_sum;
2130
2131 if (full_window)
2132 curr_sum = nt_curr_sum = 0;
2133
2134 rq->prev_runnable_sum = curr_sum;
2135 rq->nt_prev_runnable_sum = nt_curr_sum;
2136
2137 rq->curr_runnable_sum = 0;
2138 rq->nt_curr_runnable_sum = 0;
2139 }
2140 } else {
2141 if (p_is_curr_task && new_window) {
2142 flip_counters = 1;
2143 if (full_window)
2144 prev_sum_reset = 1;
2145 }
2146 }
2147
2148 /*
2149 * Handle per-task window rollover. We don't care about the idle
2150 * task or exiting tasks.
2151 */
2152 if (new_window && !is_idle_task(p) && !exiting_task(p)) {
2153 u32 curr_window = 0;
2154
2155 if (!full_window)
2156 curr_window = p->ravg.curr_window;
2157
2158 p->ravg.prev_window = curr_window;
2159 p->ravg.curr_window = 0;
2160 }
2161
2162 if (flip_counters) {
2163 u64 curr_sum = *curr_runnable_sum;
2164 u64 nt_curr_sum = *nt_curr_runnable_sum;
2165
2166 if (prev_sum_reset)
2167 curr_sum = nt_curr_sum = 0;
2168
2169 *prev_runnable_sum = curr_sum;
2170 *nt_prev_runnable_sum = nt_curr_sum;
2171
2172 *curr_runnable_sum = 0;
2173 *nt_curr_runnable_sum = 0;
2174 }
2175
2176 if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
2177 /*
2178 * account_busy_for_cpu_time() = 0, so no update to the
2179 * task's current window needs to be made. This could be
2180 * for example
2181 *
2182 * - a wakeup event on a task within the current
2183 * window (!new_window below, no action required),
2184 * - switching to a new task from idle (PICK_NEXT_TASK)
2185 * in a new window where irqtime is 0 and we aren't
2186 * waiting on IO
2187 */
2188
2189 if (!new_window)
2190 return;
2191
2192 /*
2193 * A new window has started. The RQ demand must be rolled
2194 * over if p is the current task.
2195 */
2196 if (p_is_curr_task) {
2197 /* p is idle task */
2198 BUG_ON(p != rq->idle);
2199 }
2200
2201 return;
2202 }
2203
2204 if (!new_window) {
2205 /*
2206 * account_busy_for_cpu_time() = 1 so busy time needs
2207 * to be accounted to the current window. No rollover
2208 * since we didn't start a new window. An example of this is
2209 * when a task starts execution and then sleeps within the
2210 * same window.
2211 */
2212
2213 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
2214 delta = wallclock - mark_start;
2215 else
2216 delta = irqtime;
2217 delta = scale_exec_time(delta, rq);
2218 *curr_runnable_sum += delta;
2219 if (new_task)
2220 *nt_curr_runnable_sum += delta;
2221
2222 if (!is_idle_task(p) && !exiting_task(p))
2223 p->ravg.curr_window += delta;
2224
2225 return;
2226 }
2227
2228 if (!p_is_curr_task) {
2229 /*
2230 * account_busy_for_cpu_time() = 1 so busy time needs
2231 * to be accounted to the current window. A new window
2232 * has also started, but p is not the current task, so the
2233 * window is not rolled over - just split up and account
2234 * as necessary into curr and prev. The window is only
2235 * rolled over when a new window is processed for the current
2236 * task.
2237 *
2238 * Irqtime can't be accounted by a task that isn't the
2239 * currently running task.
2240 */
2241
2242 if (!full_window) {
2243 /*
2244 * A full window hasn't elapsed, account partial
2245 * contribution to previous completed window.
2246 */
2247 delta = scale_exec_time(window_start - mark_start, rq);
2248 if (!exiting_task(p))
2249 p->ravg.prev_window += delta;
2250 } else {
2251 /*
2252 * Since at least one full window has elapsed,
2253 * the contribution to the previous window is the
2254 * full window (window_size).
2255 */
2256 delta = scale_exec_time(window_size, rq);
2257 if (!exiting_task(p))
2258 p->ravg.prev_window = delta;
2259 }
2260
2261 *prev_runnable_sum += delta;
2262 if (new_task)
2263 *nt_prev_runnable_sum += delta;
2264
2265 /* Account piece of busy time in the current window. */
2266 delta = scale_exec_time(wallclock - window_start, rq);
2267 *curr_runnable_sum += delta;
2268 if (new_task)
2269 *nt_curr_runnable_sum += delta;
2270
2271 if (!exiting_task(p))
2272 p->ravg.curr_window = delta;
2273
2274 return;
2275 }
2276
2277 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
2278 /*
2279 * account_busy_for_cpu_time() = 1 so busy time needs
2280 * to be accounted to the current window. A new window
2281 * has started and p is the current task so rollover is
2282 * needed. If any of these three above conditions are true
2283 * then this busy time can't be accounted as irqtime.
2284 *
2285 * Busy time for the idle task or exiting tasks need not
2286 * be accounted.
2287 *
2288 * An example of this would be a task that starts execution
2289 * and then sleeps once a new window has begun.
2290 */
2291
2292 if (!full_window) {
2293 /*
2294 * A full window hasn't elapsed, account partial
2295 * contribution to previous completed window.
2296 */
2297 delta = scale_exec_time(window_start - mark_start, rq);
2298 if (!is_idle_task(p) && !exiting_task(p))
2299 p->ravg.prev_window += delta;
2300 } else {
2301 /*
2302 * Since at least one full window has elapsed,
2303 * the contribution to the previous window is the
2304 * full window (window_size).
2305 */
2306 delta = scale_exec_time(window_size, rq);
2307 if (!is_idle_task(p) && !exiting_task(p))
2308 p->ravg.prev_window = delta;
2309 }
2310
2311 /*
2312 * Rollover is done here by overwriting the values in
2313 * prev_runnable_sum and curr_runnable_sum.
2314 */
2315 *prev_runnable_sum += delta;
2316 if (new_task)
2317 *nt_prev_runnable_sum += delta;
2318
2319 /* Account piece of busy time in the current window. */
2320 delta = scale_exec_time(wallclock - window_start, rq);
2321 *curr_runnable_sum += delta;
2322 if (new_task)
2323 *nt_curr_runnable_sum += delta;
2324
2325 if (!is_idle_task(p) && !exiting_task(p))
2326 p->ravg.curr_window = delta;
2327
2328 return;
2329 }
2330
2331 if (irqtime) {
2332 /*
2333 * account_busy_for_cpu_time() = 1 so busy time needs
2334 * to be accounted to the current window. A new window
2335 * has started and p is the current task so rollover is
2336 * needed. The current task must be the idle task because
2337 * irqtime is not accounted for any other task.
2338 *
2339 * Irqtime will be accounted each time we process IRQ activity
2340 * after a period of idleness, so we know the IRQ busy time
2341 * started at wallclock - irqtime.
2342 */
2343
2344 BUG_ON(!is_idle_task(p));
2345 mark_start = wallclock - irqtime;
2346
2347 /*
2348 * Roll window over. If IRQ busy time was just in the current
2349 * window then that is all that need be accounted.
2350 */
2351 if (mark_start > window_start) {
2352 *curr_runnable_sum = scale_exec_time(irqtime, rq);
2353 return;
2354 }
2355
2356 /*
2357 * The IRQ busy time spanned multiple windows. Process the
2358 * busy time preceding the current window start first.
2359 */
2360 delta = window_start - mark_start;
2361 if (delta > window_size)
2362 delta = window_size;
2363 delta = scale_exec_time(delta, rq);
2364 *prev_runnable_sum += delta;
2365
2366 /* Process the remaining IRQ busy time in the current window. */
2367 delta = wallclock - window_start;
2368 rq->curr_runnable_sum = scale_exec_time(delta, rq);
2369
2370 return;
2371 }
2372
2373 BUG();
2374}
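/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * curr/prev counter flip performed on a window rollover in
 * update_cpu_busy_time(). When more than one full window has elapsed
 * (prev_sum_reset), the stale busy time is discarded instead of being
 * carried into the previous-window counter.
 */
#if 0
#include <stdint.h>

struct busy_sums {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
};

static void flip_counters(struct busy_sums *s, int prev_sum_reset)
{
	uint64_t curr = s->curr_runnable_sum;

	if (prev_sum_reset)
		curr = 0;

	s->prev_runnable_sum = curr;	/* completed window becomes "prev" */
	s->curr_runnable_sum = 0;	/* new window starts empty */
}
#endif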
2375
2376static inline u32 predict_and_update_buckets(struct rq *rq,
2377 struct task_struct *p, u32 runtime) {
2378
2379 int bidx;
2380 u32 pred_demand;
2381
2382 bidx = busy_to_bucket(runtime);
2383 pred_demand = get_pred_busy(rq, p, bidx, runtime);
2384 bucket_increase(p->ravg.busy_buckets, bidx);
2385
2386 return pred_demand;
2387}
2388
2389static void update_task_cpu_cycles(struct task_struct *p, int cpu)
2390{
2391 if (use_cycle_counter)
2392 p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
2393}
2394
2395static void
2396update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
2397 u64 wallclock, u64 irqtime)
2398{
2399 u64 cur_cycles;
2400 int cpu = cpu_of(rq);
2401
2402 lockdep_assert_held(&rq->lock);
2403
2404 if (!use_cycle_counter) {
2405 rq->cc.cycles = cpu_cur_freq(cpu);
2406 rq->cc.time = 1;
2407 return;
2408 }
2409
2410 cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
2411
2412 /*
2413	 * If the current task is the idle task and irqtime == 0, the CPU
2414	 * was indeed idle and its cycle counter was probably not
2415	 * increasing. We still need an estimated CPU frequency
2416	 * for IO wait time accounting. Use the previously
2417	 * calculated frequency in such a case.
2418 */
2419 if (!is_idle_task(rq->curr) || irqtime) {
2420 if (unlikely(cur_cycles < p->cpu_cycles))
2421 rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
2422 else
2423 rq->cc.cycles = cur_cycles - p->cpu_cycles;
2424 rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
2425
2426 if (event == IRQ_UPDATE && is_idle_task(p))
2427 /*
2428 * Time between mark_start of idle task and IRQ handler
2429 * entry time is CPU cycle counter stall period.
2430 * Upon IRQ handler entry sched_account_irqstart()
2431 * replenishes idle task's cpu cycle counter so
2432 * rq->cc.cycles now represents increased cycles during
2433 * IRQ handler rather than time between idle entry and
2434 * IRQ exit. Thus use irqtime as time delta.
2435 */
2436 rq->cc.time = irqtime;
2437 else
2438 rq->cc.time = wallclock - p->ravg.mark_start;
2439 BUG_ON((s64)rq->cc.time < 0);
2440 }
2441
2442 p->cpu_cycles = cur_cycles;
2443
2444 trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
2445}
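/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * frequency estimate implied by the rq->cc fields set above. The cycle
 * delta is pre-scaled by NSEC_PER_MSEC, so dividing by the elapsed time
 * in nanoseconds yields cycles per millisecond, i.e. a value in kHz.
 * The actual consumer, cpu_cycles_to_freq(), is defined elsewhere; this
 * only demonstrates the arithmetic as an assumption about its intent.
 */
#if 0
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL

static uint64_t est_freq_khz(uint64_t cycle_delta, uint64_t time_delta_ns)
{
	/* e.g. 1500000 cycles over 1000000 ns -> 1500000 kHz (1.5 GHz) */
	return cycle_delta * NSEC_PER_MSEC / time_delta_ns;
}
#endif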
2446
2447static int account_busy_for_task_demand(struct task_struct *p, int event)
2448{
2449 /*
2450 * No need to bother updating task demand for exiting tasks
2451 * or the idle task.
2452 */
2453 if (exiting_task(p) || is_idle_task(p))
2454 return 0;
2455
2456 /*
2457 * When a task is waking up it is completing a segment of non-busy
2458 * time. Likewise, if wait time is not treated as busy time, then
2459 * when a task begins to run or is migrated, it is not running and
2460 * is completing a segment of non-busy time.
2461 */
2462 if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
2463 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
2464 return 0;
2465
2466 return 1;
2467}
2468
2469/*
2470 * Called when new window is starting for a task, to record cpu usage over
2471 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
2472 * when, say, a real-time task runs without preemption for several windows at a
2473 * stretch.
2474 */
2475static void update_history(struct rq *rq, struct task_struct *p,
2476 u32 runtime, int samples, int event)
2477{
2478 u32 *hist = &p->ravg.sum_history[0];
2479 int ridx, widx;
2480 u32 max = 0, avg, demand, pred_demand;
2481 u64 sum = 0;
2482
2483 /* Ignore windows where task had no activity */
2484 if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
2485 goto done;
2486
2487 /* Push new 'runtime' value onto stack */
2488 widx = sched_ravg_hist_size - 1;
2489 ridx = widx - samples;
2490 for (; ridx >= 0; --widx, --ridx) {
2491 hist[widx] = hist[ridx];
2492 sum += hist[widx];
2493 if (hist[widx] > max)
2494 max = hist[widx];
2495 }
2496
2497 for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
2498 hist[widx] = runtime;
2499 sum += hist[widx];
2500 if (hist[widx] > max)
2501 max = hist[widx];
2502 }
2503
2504 p->ravg.sum = 0;
2505
2506 if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
2507 demand = runtime;
2508 } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
2509 demand = max;
2510 } else {
2511 avg = div64_u64(sum, sched_ravg_hist_size);
2512 if (sched_window_stats_policy == WINDOW_STATS_AVG)
2513 demand = avg;
2514 else
2515 demand = max(avg, runtime);
2516 }
2517 pred_demand = predict_and_update_buckets(rq, p, runtime);
2518
2519 /*
2520 * A throttled deadline sched class task gets dequeued without
2521	 * changing p->on_rq. Since the dequeue decrements hmp stats,
2522	 * avoid decrementing them here again.
2523 */
2524 if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
2525 !p->dl.dl_throttled))
2526 p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
2527 pred_demand);
2528
2529 p->ravg.demand = demand;
2530 p->ravg.pred_demand = pred_demand;
2531
2532done:
2533 trace_sched_update_history(rq, p, runtime, samples, event);
2534}
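/*
 * Illustrative sketch (standalone, not compiled in this file): how the
 * window-stats policy in update_history() turns a demand history into a
 * single demand value. The history is assumed to already contain the
 * newest window's busy time, which is also passed as 'runtime'; the
 * policy names mirror the WINDOW_STATS_* checks above, and HIST_SIZE is
 * a stand-in for sched_ravg_hist_size.
 */
#if 0
#include <stdint.h>

#define HIST_SIZE	5	/* stand-in for sched_ravg_hist_size */

enum policy { RECENT, MAX, AVG, MAX_RECENT_AVG };

static uint32_t pick_demand(const uint32_t hist[HIST_SIZE], uint32_t runtime,
			    enum policy policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = (uint32_t)(sum / HIST_SIZE);

	switch (policy) {
	case RECENT:
		return runtime;		/* most recent window only */
	case MAX:
		return max;		/* largest window in the history */
	case AVG:
		return avg;		/* plain average of the history */
	default:
		return avg > runtime ? avg : runtime;	/* MAX_RECENT_AVG */
	}
}
#endif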
2535
2536static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
2537{
2538 delta = scale_exec_time(delta, rq);
2539 p->ravg.sum += delta;
2540 if (unlikely(p->ravg.sum > sched_ravg_window))
2541 p->ravg.sum = sched_ravg_window;
2542}
2543
2544/*
2545 * Account cpu demand of task and/or update task's cpu demand history
2546 *
2547 * ms = p->ravg.mark_start;
2548 * wc = wallclock
2549 * ws = rq->window_start
2550 *
2551 * Three possibilities:
2552 *
2553 * a) Task event is contained within one window.
2554 * window_start < mark_start < wallclock
2555 *
2556 * ws ms wc
2557 * | | |
2558 * V V V
2559 * |---------------|
2560 *
2561 * In this case, p->ravg.sum is updated *iff* event is appropriate
2562 * (ex: event == PUT_PREV_TASK)
2563 *
2564 * b) Task event spans two windows.
2565 * mark_start < window_start < wallclock
2566 *
2567 * ms ws wc
2568 * | | |
2569 * V V V
2570 * -----|-------------------
2571 *
2572 * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
2573 * is appropriate, then a new window sample is recorded followed
2574 * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
2575 *
2576 * c) Task event spans more than two windows.
2577 *
2578 * ms ws_tmp ws wc
2579 * | | | |
2580 * V V V V
2581 * ---|-------|-------|-------|-------|------
2582 * | |
2583 * |<------ nr_full_windows ------>|
2584 *
2585 * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
2586 * event is appropriate, window sample of p->ravg.sum is recorded,
2587 * 'nr_full_windows' samples of window_size are also recorded *iff*
2588 * event is appropriate and finally p->ravg.sum is set to (wc - ws)
2589 * *iff* event is appropriate.
2590 *
2591 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
2592 * depends on it!
2593 */
2594static void update_task_demand(struct task_struct *p, struct rq *rq,
2595 int event, u64 wallclock)
2596{
2597 u64 mark_start = p->ravg.mark_start;
2598 u64 delta, window_start = rq->window_start;
2599 int new_window, nr_full_windows;
2600 u32 window_size = sched_ravg_window;
2601
2602 new_window = mark_start < window_start;
2603 if (!account_busy_for_task_demand(p, event)) {
2604 if (new_window)
2605 /*
2606			 * If the time isn't being accounted as busy time, and
2607			 * a new window has started, only the previous window
2608			 * needs to be closed out with the pre-existing demand.
2609			 * Multiple windows may have
2610 * elapsed, but since empty windows are dropped,
2611 * it is not necessary to account those.
2612 */
2613 update_history(rq, p, p->ravg.sum, 1, event);
2614 return;
2615 }
2616
2617 if (!new_window) {
2618 /*
2619 * The simple case - busy time contained within the existing
2620 * window.
2621 */
2622 add_to_task_demand(rq, p, wallclock - mark_start);
2623 return;
2624 }
2625
2626 /*
2627 * Busy time spans at least two windows. Temporarily rewind
2628 * window_start to first window boundary after mark_start.
2629 */
2630 delta = window_start - mark_start;
2631 nr_full_windows = div64_u64(delta, window_size);
2632 window_start -= (u64)nr_full_windows * (u64)window_size;
2633
2634 /* Process (window_start - mark_start) first */
2635 add_to_task_demand(rq, p, window_start - mark_start);
2636
2637 /* Push new sample(s) into task's demand history */
2638 update_history(rq, p, p->ravg.sum, 1, event);
2639 if (nr_full_windows)
2640 update_history(rq, p, scale_exec_time(window_size, rq),
2641 nr_full_windows, event);
2642
2643 /*
2644 * Roll window_start back to current to process any remainder
2645 * in current window.
2646 */
2647 window_start += (u64)nr_full_windows * (u64)window_size;
2648
2649 /* Process (wallclock - window_start) next */
2650 mark_start = window_start;
2651 add_to_task_demand(rq, p, wallclock - mark_start);
2652}
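/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * case (c) decomposition described in the comment above update_task_demand().
 * Given mark_start (ms), the current window_start (ws) and wallclock (wc),
 * the busy span splits into a head piece completing the first window,
 * nr_full_windows whole windows, and a tail in the current window.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void split_span(uint64_t ms, uint64_t ws, uint64_t wc,
		       uint64_t window_size)
{
	uint64_t nr_full = (ws - ms) / window_size;
	uint64_t first_ws = ws - nr_full * window_size;

	printf("head %llu ns, %llu full windows, tail %llu ns\n",
	       (unsigned long long)(first_ws - ms),
	       (unsigned long long)nr_full,
	       (unsigned long long)(wc - ws));
}

int main(void)
{
	/* 10 ms windows: ms=3ms, ws=40ms, wc=42ms -> head 7ms, 3 full, tail 2ms */
	split_span(3000000ULL, 40000000ULL, 42000000ULL, 10000000ULL);
	return 0;
}
#endif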
2653
2654/* Reflect task activity on its demand and cpu's busy time statistics */
2655void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
2656 u64 wallclock, u64 irqtime)
2657{
2658 if (!rq->window_start || sched_disable_window_stats)
2659 return;
2660
2661 lockdep_assert_held(&rq->lock);
2662
2663 update_window_start(rq, wallclock);
2664
2665 if (!p->ravg.mark_start) {
2666 update_task_cpu_cycles(p, cpu_of(rq));
2667 goto done;
2668 }
2669
2670 update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
2671 update_task_demand(p, rq, event, wallclock);
2672 update_cpu_busy_time(p, rq, event, wallclock, irqtime);
2673 update_task_pred_demand(rq, p, event);
2674done:
2675 trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
2676 rq->cc.cycles, rq->cc.time,
2677 _group_cpu_time(p->grp, cpu_of(rq)));
2678
2679 p->ravg.mark_start = wallclock;
2680}
2681
2682void sched_account_irqtime(int cpu, struct task_struct *curr,
2683 u64 delta, u64 wallclock)
2684{
2685 struct rq *rq = cpu_rq(cpu);
2686 unsigned long flags, nr_windows;
2687 u64 cur_jiffies_ts;
2688
2689 raw_spin_lock_irqsave(&rq->lock, flags);
2690
2691 /*
2692 * cputime (wallclock) uses sched_clock so use the same here for
2693 * consistency.
2694 */
2695 delta += sched_clock() - wallclock;
2696 cur_jiffies_ts = get_jiffies_64();
2697
2698 if (is_idle_task(curr))
2699 update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
2700 delta);
2701
2702 nr_windows = cur_jiffies_ts - rq->irqload_ts;
2703
2704 if (nr_windows) {
2705 if (nr_windows < 10) {
2706 /* Decay CPU's irqload by 3/4 for each window. */
2707 rq->avg_irqload *= (3 * nr_windows);
2708 rq->avg_irqload = div64_u64(rq->avg_irqload,
2709 4 * nr_windows);
2710 } else {
2711 rq->avg_irqload = 0;
2712 }
2713 rq->avg_irqload += rq->cur_irqload;
2714 rq->cur_irqload = 0;
2715 }
2716
2717 rq->cur_irqload += delta;
2718 rq->irqload_ts = cur_jiffies_ts;
2719 raw_spin_unlock_irqrestore(&rq->lock, flags);
2720}
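/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * average-irqload update in sched_account_irqtime(). For one to nine
 * elapsed windows the scale factor 3*n/(4*n) reduces to 3/4; for ten or
 * more windows the history is dropped entirely before the current
 * window's irq time is folded in.
 */
#if 0
#include <stdint.h>

static uint64_t decay_irqload(uint64_t avg_irqload, uint64_t cur_irqload,
			      uint64_t nr_windows)
{
	if (nr_windows) {
		if (nr_windows < 10)
			avg_irqload = avg_irqload * 3 / 4;
		else
			avg_irqload = 0;
		avg_irqload += cur_irqload;
	}
	return avg_irqload;
}
#endif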
2721
2722void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
2723{
2724 struct rq *rq = cpu_rq(cpu);
2725
2726 if (!rq->window_start || sched_disable_window_stats)
2727 return;
2728
2729 if (is_idle_task(curr)) {
2730 /* We're here without rq->lock held, IRQ disabled */
2731 raw_spin_lock(&rq->lock);
2732 update_task_cpu_cycles(curr, cpu);
2733 raw_spin_unlock(&rq->lock);
2734 }
2735}
2736
2737void reset_task_stats(struct task_struct *p)
2738{
2739 u32 sum = 0;
2740
2741 if (exiting_task(p))
2742 sum = EXITING_TASK_MARKER;
2743
2744 memset(&p->ravg, 0, sizeof(struct ravg));
2745 /* Retain EXITING_TASK marker */
2746 p->ravg.sum_history[0] = sum;
2747}
2748
2749void mark_task_starting(struct task_struct *p)
2750{
2751 u64 wallclock;
2752 struct rq *rq = task_rq(p);
2753
2754 if (!rq->window_start || sched_disable_window_stats) {
2755 reset_task_stats(p);
2756 return;
2757 }
2758
2759 wallclock = sched_ktime_clock();
2760 p->ravg.mark_start = p->last_wake_ts = wallclock;
2761 p->last_cpu_selected_ts = wallclock;
2762 p->last_switch_out_ts = 0;
2763 update_task_cpu_cycles(p, cpu_of(rq));
2764}
2765
2766void set_window_start(struct rq *rq)
2767{
2768 int cpu = cpu_of(rq);
2769 struct rq *sync_rq = cpu_rq(sync_cpu);
2770
2771 if (rq->window_start)
2772 return;
2773
2774 if (cpu == sync_cpu) {
2775 rq->window_start = sched_ktime_clock();
2776 } else {
2777 raw_spin_unlock(&rq->lock);
2778 double_rq_lock(rq, sync_rq);
2779 rq->window_start = cpu_rq(sync_cpu)->window_start;
2780 rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
2781 rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
2782 raw_spin_unlock(&sync_rq->lock);
2783 }
2784
2785 rq->curr->ravg.mark_start = rq->window_start;
2786}
2787
2788void migrate_sync_cpu(int cpu)
2789{
2790 if (cpu == sync_cpu)
2791 sync_cpu = smp_processor_id();
2792}
2793
2794static void reset_all_task_stats(void)
2795{
2796 struct task_struct *g, *p;
2797
2798 read_lock(&tasklist_lock);
2799 do_each_thread(g, p) {
2800 reset_task_stats(p);
2801 } while_each_thread(g, p);
2802 read_unlock(&tasklist_lock);
2803}
2804
2805static void disable_window_stats(void)
2806{
2807 unsigned long flags;
2808 int i;
2809
2810 local_irq_save(flags);
2811 for_each_possible_cpu(i)
2812 raw_spin_lock(&cpu_rq(i)->lock);
2813
2814 sched_disable_window_stats = 1;
2815
2816 for_each_possible_cpu(i)
2817 raw_spin_unlock(&cpu_rq(i)->lock);
2818
2819 local_irq_restore(flags);
2820}
2821
2822/* Called with all cpu's rq->lock held */
2823static void enable_window_stats(void)
2824{
2825 sched_disable_window_stats = 0;
2827}
2828
2829enum reset_reason_code {
2830 WINDOW_CHANGE,
2831 POLICY_CHANGE,
2832 HIST_SIZE_CHANGE,
2833 FREQ_AGGREGATE_CHANGE,
2834};
2835
2836const char *sched_window_reset_reasons[] = {
2837 "WINDOW_CHANGE",
2838 "POLICY_CHANGE",
2839	"HIST_SIZE_CHANGE",
	"FREQ_AGGREGATE_CHANGE",
2840};
2841
2842/* Called with IRQs enabled */
2843void reset_all_window_stats(u64 window_start, unsigned int window_size)
2844{
2845 int cpu;
2846 unsigned long flags;
2847 u64 start_ts = sched_ktime_clock();
2848 int reason = WINDOW_CHANGE;
2849 unsigned int old = 0, new = 0;
2850 struct related_thread_group *grp;
2851
2852 disable_window_stats();
2853
2854 reset_all_task_stats();
2855
2856 local_irq_save(flags);
2857
2858 read_lock(&related_thread_group_lock);
2859
2860 for_each_possible_cpu(cpu)
2861 raw_spin_lock(&cpu_rq(cpu)->lock);
2862
2863 list_for_each_entry(grp, &related_thread_groups, list) {
2864 int j;
2865
2866 for_each_possible_cpu(j) {
2867 struct group_cpu_time *cpu_time;
2868 /* Protected by rq lock */
2869 cpu_time = _group_cpu_time(grp, j);
2870 memset(cpu_time, 0, sizeof(struct group_cpu_time));
2871 if (window_start)
2872 cpu_time->window_start = window_start;
2873 }
2874 }
2875
2876 if (window_size) {
2877 sched_ravg_window = window_size * TICK_NSEC;
2878 set_hmp_defaults();
2879 }
2880
2881 enable_window_stats();
2882
2883 for_each_possible_cpu(cpu) {
2884 struct rq *rq = cpu_rq(cpu);
2885
2886 if (window_start)
2887 rq->window_start = window_start;
2888 rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
2889 rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
2890 reset_cpu_hmp_stats(cpu, 1);
2891 }
2892
2893 if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
2894 reason = POLICY_CHANGE;
2895 old = sched_window_stats_policy;
2896 new = sysctl_sched_window_stats_policy;
2897 sched_window_stats_policy = sysctl_sched_window_stats_policy;
2898 } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
2899 reason = HIST_SIZE_CHANGE;
2900 old = sched_ravg_hist_size;
2901 new = sysctl_sched_ravg_hist_size;
2902 sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
2903 } else if (sched_freq_aggregate !=
2904 sysctl_sched_freq_aggregate) {
2905 reason = FREQ_AGGREGATE_CHANGE;
2906 old = sched_freq_aggregate;
2907 new = sysctl_sched_freq_aggregate;
2908 sched_freq_aggregate = sysctl_sched_freq_aggregate;
2909 }
2910
2911 for_each_possible_cpu(cpu)
2912 raw_spin_unlock(&cpu_rq(cpu)->lock);
2913
2914 read_unlock(&related_thread_group_lock);
2915
2916 local_irq_restore(flags);
2917
2918 trace_sched_reset_all_window_stats(window_start, window_size,
2919 sched_ktime_clock() - start_ts, reason, old, new);
2920}
2921
2922static inline void
2923sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
2924
2925void sched_get_cpus_busy(struct sched_load *busy,
2926 const struct cpumask *query_cpus)
2927{
2928 unsigned long flags;
2929 struct rq *rq;
2930 const int cpus = cpumask_weight(query_cpus);
2931 u64 load[cpus], group_load[cpus];
2932 u64 nload[cpus], ngload[cpus];
2933 u64 pload[cpus];
2934 unsigned int cur_freq[cpus], max_freq[cpus];
2935 int notifier_sent = 0;
2936 int early_detection[cpus];
2937 int cpu, i = 0;
2938 unsigned int window_size;
2939 u64 max_prev_sum = 0;
2940 int max_busy_cpu = cpumask_first(query_cpus);
2941 struct related_thread_group *grp;
2942 u64 total_group_load = 0, total_ngload = 0;
2943 bool aggregate_load = false;
2944
2945 if (unlikely(cpus == 0))
2946 return;
2947
2948 /*
2949 * This function could be called in timer context, and the
2950 * current task may have been executing for a long time. Ensure
2951 * that the window stats are current by doing an update.
2952 */
2953 read_lock(&related_thread_group_lock);
2954
2955 local_irq_save(flags);
2956 for_each_cpu(cpu, query_cpus)
2957 raw_spin_lock(&cpu_rq(cpu)->lock);
2958
2959 window_size = sched_ravg_window;
2960
2961 for_each_cpu(cpu, query_cpus) {
2962 rq = cpu_rq(cpu);
2963
2964 update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
2965 0);
2966 cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
2967
2968 load[i] = rq->old_busy_time = rq->prev_runnable_sum;
2969 nload[i] = rq->nt_prev_runnable_sum;
2970 pload[i] = rq->hmp_stats.pred_demands_sum;
2971 rq->old_estimated_time = pload[i];
2972
2973 if (load[i] > max_prev_sum) {
2974 max_prev_sum = load[i];
2975 max_busy_cpu = cpu;
2976 }
2977
2978 /*
2979 * sched_get_cpus_busy() is called for all CPUs in a
2980 * frequency domain. So the notifier_sent flag per
2981 * cluster works even when a frequency domain spans
2982 * more than 1 cluster.
2983 */
2984 if (rq->cluster->notifier_sent) {
2985 notifier_sent = 1;
2986 rq->cluster->notifier_sent = 0;
2987 }
2988 early_detection[i] = (rq->ed_task != NULL);
2989 cur_freq[i] = cpu_cur_freq(cpu);
2990 max_freq[i] = cpu_max_freq(cpu);
2991 i++;
2992 }
2993
2994 for_each_related_thread_group(grp) {
2995 for_each_cpu(cpu, query_cpus) {
2996 /* Protected by rq_lock */
2997 struct group_cpu_time *cpu_time =
2998 _group_cpu_time(grp, cpu);
2999 sync_window_start(cpu_rq(cpu), cpu_time);
3000 }
3001 }
3002
3003 group_load_in_freq_domain(
3004 &cpu_rq(max_busy_cpu)->freq_domain_cpumask,
3005 &total_group_load, &total_ngload);
3006 aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
3007
3008 i = 0;
3009 for_each_cpu(cpu, query_cpus) {
3010 group_load[i] = 0;
3011 ngload[i] = 0;
3012
3013 if (early_detection[i])
3014 goto skip_early;
3015
3016 rq = cpu_rq(cpu);
3017 if (aggregate_load) {
3018 if (cpu == max_busy_cpu) {
3019 group_load[i] = total_group_load;
3020 ngload[i] = total_ngload;
3021 }
3022 } else {
3023 _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
3024 }
3025
3026 load[i] += group_load[i];
3027 nload[i] += ngload[i];
3028 /*
3029 * Scale load in reference to cluster max_possible_freq.
3030 *
3031 * Note that scale_load_to_cpu() scales load in reference to
3032 * the cluster max_freq.
3033 */
3034 load[i] = scale_load_to_cpu(load[i], cpu);
3035 nload[i] = scale_load_to_cpu(nload[i], cpu);
3036 pload[i] = scale_load_to_cpu(pload[i], cpu);
3037skip_early:
3038 i++;
3039 }
3040
3041 for_each_cpu(cpu, query_cpus)
3042 raw_spin_unlock(&(cpu_rq(cpu))->lock);
3043 local_irq_restore(flags);
3044
3045 read_unlock(&related_thread_group_lock);
3046
3047 i = 0;
3048 for_each_cpu(cpu, query_cpus) {
3049 rq = cpu_rq(cpu);
3050
3051 if (early_detection[i]) {
3052 busy[i].prev_load = div64_u64(sched_ravg_window,
3053 NSEC_PER_USEC);
3054 busy[i].new_task_load = 0;
3055 goto exit_early;
3056 }
3057
3058 /*
3059 * When the load aggregation is controlled by
3060 * sched_freq_aggregate_threshold, allow reporting loads
3061 * greater than 100 @ Fcur to ramp up the frequency
3062 * faster.
3063 */
3064 if (notifier_sent || (aggregate_load &&
3065 sched_freq_aggregate_threshold)) {
3066 load[i] = scale_load_to_freq(load[i], max_freq[i],
3067 cpu_max_possible_freq(cpu));
3068 nload[i] = scale_load_to_freq(nload[i], max_freq[i],
3069 cpu_max_possible_freq(cpu));
3070 } else {
3071 load[i] = scale_load_to_freq(load[i], max_freq[i],
3072 cur_freq[i]);
3073 nload[i] = scale_load_to_freq(nload[i], max_freq[i],
3074 cur_freq[i]);
3075 if (load[i] > window_size)
3076 load[i] = window_size;
3077 if (nload[i] > window_size)
3078 nload[i] = window_size;
3079
3080 load[i] = scale_load_to_freq(load[i], cur_freq[i],
3081 cpu_max_possible_freq(cpu));
3082 nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
3083 cpu_max_possible_freq(cpu));
3084 }
3085 pload[i] = scale_load_to_freq(pload[i], max_freq[i],
3086 rq->cluster->max_possible_freq);
3087
3088 busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
3089 busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
3090 busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
3091
3092exit_early:
3093 trace_sched_get_busy(cpu, busy[i].prev_load,
3094 busy[i].new_task_load,
3095 busy[i].predicted_load,
3096 early_detection[i]);
3097 i++;
3098 }
3099}
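/*
 * Illustrative sketch (standalone, not compiled in this file): the load
 * reporting path in sched_get_cpus_busy() when no notifier is pending and
 * aggregation is not in effect. scale() models scale_load_to_freq() under
 * the assumption that it rescales a busy time from one frequency reference
 * to another (load * src / dst); the actual helper is defined elsewhere in
 * this scheduler code.
 */
#if 0
#include <stdint.h>

static uint64_t scale(uint64_t load, uint64_t src_freq, uint64_t dst_freq)
{
	return load * src_freq / dst_freq;	/* assumed helper behaviour */
}

static uint64_t report_load(uint64_t load_ns, uint64_t max_freq,
			    uint64_t cur_freq, uint64_t max_possible_freq,
			    uint64_t window_ns)
{
	/* busy time at Fmax -> wall time at Fcur, capped at one window ... */
	load_ns = scale(load_ns, max_freq, cur_freq);
	if (load_ns > window_ns)
		load_ns = window_ns;

	/* ... then re-normalised to the max-possible-frequency reference */
	return scale(load_ns, cur_freq, max_possible_freq);
}
#endif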
3100
3101void sched_set_io_is_busy(int val)
3102{
3103 sched_io_is_busy = val;
3104}
3105
3106int sched_set_window(u64 window_start, unsigned int window_size)
3107{
3108 u64 now, cur_jiffies, jiffy_ktime_ns;
3109 s64 ws;
3110 unsigned long flags;
3111
3112 if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)
3113 return -EINVAL;
3114
3115 mutex_lock(&policy_mutex);
3116
3117 /*
3118 * Get a consistent view of ktime, jiffies, and the time
3119 * since the last jiffy (based on last_jiffies_update).
3120 */
3121 local_irq_save(flags);
3122 cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
3123 local_irq_restore(flags);
3124
3125 /* translate window_start from jiffies to nanoseconds */
3126 ws = (window_start - cur_jiffies); /* jiffy difference */
3127 ws *= TICK_NSEC;
3128 ws += jiffy_ktime_ns;
3129
3130 /*
3131 * Roll back calculated window start so that it is in
3132 * the past (window stats must have a current window).
3133 */
3134 while (ws > now)
3135 ws -= (window_size * TICK_NSEC);
3136
3137 BUG_ON(sched_ktime_clock() < ws);
3138
3139 reset_all_window_stats(ws, window_size);
3140
3141 sched_update_freq_max_load(cpu_possible_mask);
3142
3143 mutex_unlock(&policy_mutex);
3144
3145 return 0;
3146}
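/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * jiffies-to-nanoseconds translation and roll-back performed in
 * sched_set_window(). TICK_NS is a stand-in for TICK_NSEC (10 ms here,
 * i.e. HZ=100, purely as an example value).
 */
#if 0
#include <stdint.h>

#define TICK_NS		10000000LL	/* stand-in for TICK_NSEC at HZ=100 */

static int64_t window_start_ns(uint64_t window_start_jiffies,
			       uint64_t cur_jiffies, uint64_t jiffy_ktime_ns,
			       uint64_t now_ns, uint64_t window_size_jiffies)
{
	/* jiffy difference converted to ns, anchored at the last tick edge */
	int64_t ws = (int64_t)(window_start_jiffies - cur_jiffies) * TICK_NS
			+ (int64_t)jiffy_ktime_ns;

	/* roll back until the window start lies in the past */
	while (ws > (int64_t)now_ns)
		ws -= (int64_t)window_size_jiffies * TICK_NS;

	return ws;
}
#endif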
3147
3148void fixup_busy_time(struct task_struct *p, int new_cpu)
3149{
3150 struct rq *src_rq = task_rq(p);
3151 struct rq *dest_rq = cpu_rq(new_cpu);
3152 u64 wallclock;
3153 u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
3154 u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
3155 u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
3156 u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
3157 int migrate_type;
3158 struct migration_sum_data d;
3159 bool new_task;
3160 struct related_thread_group *grp;
3161
3162 if (!p->on_rq && p->state != TASK_WAKING)
3163 return;
3164
3165 if (exiting_task(p)) {
3166 clear_ed_task(p, src_rq);
3167 return;
3168 }
3169
3170 if (p->state == TASK_WAKING)
3171 double_rq_lock(src_rq, dest_rq);
3172
3173 if (sched_disable_window_stats)
3174 goto done;
3175
3176 wallclock = sched_ktime_clock();
3177
3178 update_task_ravg(task_rq(p)->curr, task_rq(p),
3179 TASK_UPDATE,
3180 wallclock, 0);
3181 update_task_ravg(dest_rq->curr, dest_rq,
3182 TASK_UPDATE, wallclock, 0);
3183
3184 update_task_ravg(p, task_rq(p), TASK_MIGRATE,
3185 wallclock, 0);
3186
3187 update_task_cpu_cycles(p, new_cpu);
3188
3189 new_task = is_new_task(p);
3190 /* Protected by rq_lock */
3191 grp = p->grp;
3192 if (grp && sched_freq_aggregate) {
3193 struct group_cpu_time *cpu_time;
3194
3195 migrate_type = GROUP_TO_GROUP;
3196 /* Protected by rq_lock */
3197 cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
3198 d.src_rq = NULL;
3199 d.src_cpu_time = cpu_time;
3200 src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3201 src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3202 src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3203 src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3204
3205 /* Protected by rq_lock */
3206 cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
3207 d.dst_rq = NULL;
3208 d.dst_cpu_time = cpu_time;
3209 dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3210 dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3211 dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3212 dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3213 sync_window_start(dest_rq, cpu_time);
3214 } else {
3215 migrate_type = RQ_TO_RQ;
3216 d.src_rq = src_rq;
3217 d.src_cpu_time = NULL;
3218 d.dst_rq = dest_rq;
3219 d.dst_cpu_time = NULL;
3220 src_curr_runnable_sum = &src_rq->curr_runnable_sum;
3221 src_prev_runnable_sum = &src_rq->prev_runnable_sum;
3222 src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
3223 src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
3224
3225 dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
3226 dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
3227 dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
3228 dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
3229 }
3230
3231 if (p->ravg.curr_window) {
3232 *src_curr_runnable_sum -= p->ravg.curr_window;
3233 *dst_curr_runnable_sum += p->ravg.curr_window;
3234 if (new_task) {
3235 *src_nt_curr_runnable_sum -= p->ravg.curr_window;
3236 *dst_nt_curr_runnable_sum += p->ravg.curr_window;
3237 }
3238 }
3239
3240 if (p->ravg.prev_window) {
3241 *src_prev_runnable_sum -= p->ravg.prev_window;
3242 *dst_prev_runnable_sum += p->ravg.prev_window;
3243 if (new_task) {
3244 *src_nt_prev_runnable_sum -= p->ravg.prev_window;
3245 *dst_nt_prev_runnable_sum += p->ravg.prev_window;
3246 }
3247 }
3248
3249 if (p == src_rq->ed_task) {
3250 src_rq->ed_task = NULL;
3251 if (!dest_rq->ed_task)
3252 dest_rq->ed_task = p;
3253 }
3254
3255 trace_sched_migration_update_sum(p, migrate_type, &d);
3256 BUG_ON((s64)*src_prev_runnable_sum < 0);
3257 BUG_ON((s64)*src_curr_runnable_sum < 0);
3258 BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
3259 BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
3260
3261done:
3262 if (p->state == TASK_WAKING)
3263 double_rq_unlock(src_rq, dest_rq);
3264}
3265
3266#define sched_up_down_migrate_auto_update 1
3267static void check_for_up_down_migrate_update(const struct cpumask *cpus)
3268{
3269 int i = cpumask_first(cpus);
3270
3271 if (!sched_up_down_migrate_auto_update)
3272 return;
3273
3274 if (cpu_max_possible_capacity(i) == max_possible_capacity)
3275 return;
3276
3277 if (cpu_max_possible_freq(i) == cpu_max_freq(i))
3278 up_down_migrate_scale_factor = 1024;
3279 else
3280 up_down_migrate_scale_factor = (1024 *
3281 cpu_max_possible_freq(i)) / cpu_max_freq(i);
3282
3283 update_up_down_migrate();
3284}
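/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * up/down-migrate scale factor computed above. With a 1.8 GHz hardware
 * ceiling reduced to a 1.5 GHz max frequency (example values only), the
 * migration thresholds get scaled by 1024 * 1800000 / 1500000 = 1228.
 */
#if 0
static unsigned int migrate_scale_factor(unsigned int max_possible_freq,
					 unsigned int max_freq)
{
	if (max_possible_freq == max_freq)
		return 1024;

	return (1024 * max_possible_freq) / max_freq;
}
#endif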
3285
3286/* Return cluster which can offer required capacity for group */
3287static struct sched_cluster *
3288best_cluster(struct related_thread_group *grp, u64 total_demand)
3289{
3290 struct sched_cluster *cluster = NULL;
3291
3292 for_each_sched_cluster(cluster) {
3293 if (group_will_fit(cluster, grp, total_demand))
3294 return cluster;
3295 }
3296
3297 return NULL;
3298}
3299
3300static void _set_preferred_cluster(struct related_thread_group *grp)
3301{
3302 struct task_struct *p;
3303 u64 combined_demand = 0;
3304
3305 if (!sysctl_sched_enable_colocation) {
3306 grp->last_update = sched_ktime_clock();
3307 grp->preferred_cluster = NULL;
3308 return;
3309 }
3310
3311 /*
3312	 * Wakeup of two or more related tasks could race with each other and
3313	 * could result in multiple calls to _set_preferred_cluster being issued
3314	 * at the same time. Avoid the overhead of rechecking the preferred
3315	 * cluster in such cases.
3316 */
3317 if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
3318 return;
3319
3320 list_for_each_entry(p, &grp->tasks, grp_list)
3321 combined_demand += p->ravg.demand;
3322
3323 grp->preferred_cluster = best_cluster(grp, combined_demand);
3324 grp->last_update = sched_ktime_clock();
3325 trace_sched_set_preferred_cluster(grp, combined_demand);
3326}
3327
3328void set_preferred_cluster(struct related_thread_group *grp)
3329{
3330 raw_spin_lock(&grp->lock);
3331 _set_preferred_cluster(grp);
3332 raw_spin_unlock(&grp->lock);
3333}
3334
3335#define ADD_TASK 0
3336#define REM_TASK 1
3337
3338static inline void free_group_cputime(struct related_thread_group *grp)
3339{
3340 free_percpu(grp->cpu_time);
3341}
3342
3343static int alloc_group_cputime(struct related_thread_group *grp)
3344{
3345 int i;
3346 struct group_cpu_time *cpu_time;
3347 int cpu = raw_smp_processor_id();
3348 struct rq *rq = cpu_rq(cpu);
3349 u64 window_start = rq->window_start;
3350
3351 grp->cpu_time = alloc_percpu(struct group_cpu_time);
3352 if (!grp->cpu_time)
3353 return -ENOMEM;
3354
3355 for_each_possible_cpu(i) {
3356 cpu_time = per_cpu_ptr(grp->cpu_time, i);
3357 memset(cpu_time, 0, sizeof(struct group_cpu_time));
3358 cpu_time->window_start = window_start;
3359 }
3360
3361 return 0;
3362}
3363
3364/*
3365 * A group's window_start may be behind. When moving it forward, flip prev/curr
3366 * counters. When moving forward > 1 window, the prev counter is set to 0.
3367 */
3368static inline void
3369sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
3370{
3371 u64 delta;
3372 int nr_windows;
3373 u64 curr_sum = cpu_time->curr_runnable_sum;
3374 u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
3375
3376 delta = rq->window_start - cpu_time->window_start;
3377 if (!delta)
3378 return;
3379
3380 nr_windows = div64_u64(delta, sched_ravg_window);
3381 if (nr_windows > 1)
3382 curr_sum = nt_curr_sum = 0;
3383
3384 cpu_time->prev_runnable_sum = curr_sum;
3385 cpu_time->curr_runnable_sum = 0;
3386
3387 cpu_time->nt_prev_runnable_sum = nt_curr_sum;
3388 cpu_time->nt_curr_runnable_sum = 0;
3389
3390 cpu_time->window_start = rq->window_start;
3391}
3392
3393/*
3394 * Task's cpu usage is accounted in:
3395 * rq->curr/prev_runnable_sum, when its ->grp is NULL
3396 * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
3397 *
3398 * Transfer task's cpu usage between those counters when transitioning between
3399 * groups
3400 */
3401static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
3402 struct task_struct *p, int event)
3403{
3404 u64 wallclock;
3405 struct group_cpu_time *cpu_time;
3406 u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
3407 u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
3408 u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
3409 u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
3410 struct migration_sum_data d;
3411 int migrate_type;
3412
3413 if (!sched_freq_aggregate)
3414 return;
3415
3416 wallclock = sched_ktime_clock();
3417
3418 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
3419 update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
3420
3421	/* cpu_time protected by related_thread_group_lock, grp->lock and rq_lock */
3422 cpu_time = _group_cpu_time(grp, cpu_of(rq));
3423 if (event == ADD_TASK) {
3424 sync_window_start(rq, cpu_time);
3425 migrate_type = RQ_TO_GROUP;
3426 d.src_rq = rq;
3427 d.src_cpu_time = NULL;
3428 d.dst_rq = NULL;
3429 d.dst_cpu_time = cpu_time;
3430 src_curr_runnable_sum = &rq->curr_runnable_sum;
3431 dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3432 src_prev_runnable_sum = &rq->prev_runnable_sum;
3433 dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3434
3435 src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
3436 dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3437 src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
3438 dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3439 } else {
3440 migrate_type = GROUP_TO_RQ;
3441 d.src_rq = NULL;
3442 d.src_cpu_time = cpu_time;
3443 d.dst_rq = rq;
3444 d.dst_cpu_time = NULL;
3445
3446 /*
3447		 * In case of REM_TASK, cpu_time->window_start would be
3448		 * up to date because of the update_task_ravg() we called
3449		 * above on the moving task. Hence there is no need for
3450		 * sync_window_start().
3451 */
3452 src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3453 dst_curr_runnable_sum = &rq->curr_runnable_sum;
3454 src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3455 dst_prev_runnable_sum = &rq->prev_runnable_sum;
3456
3457 src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3458 dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
3459 src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3460 dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
3461 }
3462
3463 *src_curr_runnable_sum -= p->ravg.curr_window;
3464 *dst_curr_runnable_sum += p->ravg.curr_window;
3465
3466 *src_prev_runnable_sum -= p->ravg.prev_window;
3467 *dst_prev_runnable_sum += p->ravg.prev_window;
3468
3469 if (is_new_task(p)) {
3470 *src_nt_curr_runnable_sum -= p->ravg.curr_window;
3471 *dst_nt_curr_runnable_sum += p->ravg.curr_window;
3472 *src_nt_prev_runnable_sum -= p->ravg.prev_window;
3473 *dst_nt_prev_runnable_sum += p->ravg.prev_window;
3474 }
3475
3476 trace_sched_migration_update_sum(p, migrate_type, &d);
3477
3478 BUG_ON((s64)*src_curr_runnable_sum < 0);
3479 BUG_ON((s64)*src_prev_runnable_sum < 0);
3480}
3481
3482static inline struct group_cpu_time *
3483task_group_cpu_time(struct task_struct *p, int cpu)
3484{
3485 return _group_cpu_time(rcu_dereference(p->grp), cpu);
3486}
3487
3488static inline struct group_cpu_time *
3489_group_cpu_time(struct related_thread_group *grp, int cpu)
3490{
3491 return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
3492}
3493
3494struct related_thread_group *alloc_related_thread_group(int group_id)
3495{
3496 struct related_thread_group *grp;
3497
3498 grp = kzalloc(sizeof(*grp), GFP_KERNEL);
3499 if (!grp)
3500 return ERR_PTR(-ENOMEM);
3501
3502 if (alloc_group_cputime(grp)) {
3503 kfree(grp);
3504 return ERR_PTR(-ENOMEM);
3505 }
3506
3507 grp->id = group_id;
3508 INIT_LIST_HEAD(&grp->tasks);
3509 INIT_LIST_HEAD(&grp->list);
3510 raw_spin_lock_init(&grp->lock);
3511
3512 return grp;
3513}
3514
3515struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
3516{
3517 struct related_thread_group *grp;
3518
3519 list_for_each_entry(grp, &related_thread_groups, list) {
3520 if (grp->id == group_id)
3521 return grp;
3522 }
3523
3524 return NULL;
3525}
3526
3527/* See comments before preferred_cluster() */
3528static void free_related_thread_group(struct rcu_head *rcu)
3529{
3530 struct related_thread_group *grp = container_of(rcu, struct
3531 related_thread_group, rcu);
3532
3533 free_group_cputime(grp);
3534 kfree(grp);
3535}
3536
3537static void remove_task_from_group(struct task_struct *p)
3538{
3539 struct related_thread_group *grp = p->grp;
3540 struct rq *rq;
3541 int empty_group = 1;
3542 struct rq_flags rf;
3543
3544 raw_spin_lock(&grp->lock);
3545
3546 rq = __task_rq_lock(p, &rf);
3547 transfer_busy_time(rq, p->grp, p, REM_TASK);
3548 list_del_init(&p->grp_list);
3549 rcu_assign_pointer(p->grp, NULL);
3550 __task_rq_unlock(rq, &rf);
3551
3552 if (!list_empty(&grp->tasks)) {
3553 empty_group = 0;
3554 _set_preferred_cluster(grp);
3555 }
3556
3557 raw_spin_unlock(&grp->lock);
3558
3559 if (empty_group) {
3560 list_del(&grp->list);
3561 call_rcu(&grp->rcu, free_related_thread_group);
3562 }
3563}
3564
3565static int
3566add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
3567{
3568 struct rq *rq;
3569 struct rq_flags rf;
3570
3571 raw_spin_lock(&grp->lock);
3572
3573 /*
3574 * Change p->grp under rq->lock. Will prevent races with read-side
3575 * reference of p->grp in various hot-paths
3576 */
3577 rq = __task_rq_lock(p, &rf);
3578 transfer_busy_time(rq, grp, p, ADD_TASK);
3579 list_add(&p->grp_list, &grp->tasks);
3580 rcu_assign_pointer(p->grp, grp);
3581 __task_rq_unlock(rq, &rf);
3582
3583 _set_preferred_cluster(grp);
3584
3585 raw_spin_unlock(&grp->lock);
3586
3587 return 0;
3588}
3589
3590void add_new_task_to_grp(struct task_struct *new)
3591{
3592 unsigned long flags;
3593 struct related_thread_group *grp;
3594 struct task_struct *parent;
3595
3596 if (!sysctl_sched_enable_thread_grouping)
3597 return;
3598
3599 if (thread_group_leader(new))
3600 return;
3601
3602 parent = new->group_leader;
3603
3604 /*
3605	 * The parent's pi_lock is required here to protect against a
3606	 * race with the parent task being removed from the
3607	 * group.
3608 */
3609 raw_spin_lock_irqsave(&parent->pi_lock, flags);
3610
3611 /* protected by pi_lock. */
3612 grp = task_related_thread_group(parent);
3613 if (!grp) {
3614 raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
3615 return;
3616 }
3617 raw_spin_lock(&grp->lock);
3618
3619 rcu_assign_pointer(new->grp, grp);
3620 list_add(&new->grp_list, &grp->tasks);
3621
3622 raw_spin_unlock(&grp->lock);
3623 raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
3624}
3625
3626int sched_set_group_id(struct task_struct *p, unsigned int group_id)
3627{
3628 int rc = 0, destroy = 0;
3629 unsigned long flags;
3630 struct related_thread_group *grp = NULL, *new = NULL;
3631
3632redo:
3633 raw_spin_lock_irqsave(&p->pi_lock, flags);
3634
3635 if ((current != p && p->flags & PF_EXITING) ||
3636 (!p->grp && !group_id) ||
3637 (p->grp && p->grp->id == group_id))
3638 goto done;
3639
3640 write_lock(&related_thread_group_lock);
3641
3642 if (!group_id) {
3643 remove_task_from_group(p);
3644 write_unlock(&related_thread_group_lock);
3645 goto done;
3646 }
3647
3648 if (p->grp && p->grp->id != group_id)
3649 remove_task_from_group(p);
3650
3651 grp = lookup_related_thread_group(group_id);
3652 if (!grp && !new) {
3653 /* New group */
3654 write_unlock(&related_thread_group_lock);
3655 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3656 new = alloc_related_thread_group(group_id);
3657 if (IS_ERR(new))
3658 return -ENOMEM;
3659 destroy = 1;
3660 /* Rerun checks (like task exiting), since we dropped pi_lock */
3661 goto redo;
3662 } else if (!grp && new) {
3663 /* New group - use object allocated before */
3664 destroy = 0;
3665 list_add(&new->list, &related_thread_groups);
3666 grp = new;
3667 }
3668
3669 BUG_ON(!grp);
3670 rc = add_task_to_group(p, grp);
3671 write_unlock(&related_thread_group_lock);
3672done:
3673 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3674
3675 if (new && destroy) {
3676 free_group_cputime(new);
3677 kfree(new);
3678 }
3679
3680 return rc;
3681}
3682
3683unsigned int sched_get_group_id(struct task_struct *p)
3684{
3685 unsigned int group_id;
3686 struct related_thread_group *grp;
3687
3688 rcu_read_lock();
3689 grp = task_related_thread_group(p);
3690 group_id = grp ? grp->id : 0;
3691 rcu_read_unlock();
3692
3693 return group_id;
3694}
3695
3696static void update_cpu_cluster_capacity(const cpumask_t *cpus)
3697{
3698 int i;
3699 struct sched_cluster *cluster;
3700 struct cpumask cpumask;
3701
3702 cpumask_copy(&cpumask, cpus);
3703 pre_big_task_count_change(cpu_possible_mask);
3704
3705 for_each_cpu(i, &cpumask) {
3706 cluster = cpu_rq(i)->cluster;
3707 cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
3708
3709 cluster->capacity = compute_capacity(cluster);
3710 cluster->load_scale_factor = compute_load_scale_factor(cluster);
3711
3712		/* the 'cpus' cpumask can span more than one cluster */
3713 check_for_up_down_migrate_update(&cluster->cpus);
3714 }
3715
3716 __update_min_max_capacity();
3717
3718 post_big_task_count_change(cpu_possible_mask);
3719}
3720
3721static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
3722void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
3723{
3724 struct cpumask cpumask;
3725 struct sched_cluster *cluster;
3726 int i, update_capacity = 0;
3727 unsigned long flags;
3728
3729 spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
3730 cpumask_copy(&cpumask, cpus);
3731 for_each_cpu(i, &cpumask) {
3732 cluster = cpu_rq(i)->cluster;
3733 cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
3734
3735 update_capacity += (cluster->max_mitigated_freq != fmax);
3736 cluster->max_mitigated_freq = fmax;
3737 }
3738 spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
3739
3740 if (update_capacity)
3741 update_cpu_cluster_capacity(cpus);
3742}
3743
3744static int cpufreq_notifier_policy(struct notifier_block *nb,
3745 unsigned long val, void *data)
3746{
3747 struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
3748 struct sched_cluster *cluster = NULL;
3749 struct cpumask policy_cluster = *policy->related_cpus;
3750 unsigned int orig_max_freq = 0;
3751 int i, j, update_capacity = 0;
3752
3753 if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
3754 val != CPUFREQ_CREATE_POLICY)
3755 return 0;
3756
3757 if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
3758 update_min_max_capacity();
3759 return 0;
3760 }
3761
3762 max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
3763 if (min_max_freq == 1)
3764 min_max_freq = UINT_MAX;
3765 min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
3766 BUG_ON(!min_max_freq);
3767 BUG_ON(!policy->max);
3768
3769 for_each_cpu(i, &policy_cluster) {
3770 cluster = cpu_rq(i)->cluster;
3771 cpumask_andnot(&policy_cluster, &policy_cluster,
3772 &cluster->cpus);
3773
3774 orig_max_freq = cluster->max_freq;
3775 cluster->min_freq = policy->min;
3776 cluster->max_freq = policy->max;
3777 cluster->cur_freq = policy->cur;
3778
3779 if (!cluster->freq_init_done) {
3780 mutex_lock(&cluster_lock);
3781 for_each_cpu(j, &cluster->cpus)
3782 cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
3783 policy->related_cpus);
3784 cluster->max_possible_freq = policy->cpuinfo.max_freq;
3785 cluster->max_possible_capacity =
3786 compute_max_possible_capacity(cluster);
3787 cluster->freq_init_done = true;
3788
3789 sort_clusters();
3790 update_all_clusters_stats();
3791 mutex_unlock(&cluster_lock);
3792 continue;
3793 }
3794
3795 update_capacity += (orig_max_freq != cluster->max_freq);
3796 }
3797
3798 if (update_capacity)
3799 update_cpu_cluster_capacity(policy->related_cpus);
3800
3801 return 0;
3802}
3803
3804static int cpufreq_notifier_trans(struct notifier_block *nb,
3805 unsigned long val, void *data)
3806{
3807 struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
3808 unsigned int cpu = freq->cpu, new_freq = freq->new;
3809 unsigned long flags;
3810 struct sched_cluster *cluster;
3811 struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
3812 int i, j;
3813
3814 if (val != CPUFREQ_POSTCHANGE)
3815 return 0;
3816
3817 BUG_ON(!new_freq);
3818
3819 if (cpu_cur_freq(cpu) == new_freq)
3820 return 0;
3821
3822 for_each_cpu(i, &policy_cpus) {
3823 cluster = cpu_rq(i)->cluster;
3824
3825 for_each_cpu(j, &cluster->cpus) {
3826 struct rq *rq = cpu_rq(j);
3827
3828 raw_spin_lock_irqsave(&rq->lock, flags);
3829 update_task_ravg(rq->curr, rq, TASK_UPDATE,
3830 sched_ktime_clock(), 0);
3831 raw_spin_unlock_irqrestore(&rq->lock, flags);
3832 }
3833
3834 cluster->cur_freq = new_freq;
3835 cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
3836 }
3837
3838 return 0;
3839}
3840
3841static int pwr_stats_ready_notifier(struct notifier_block *nb,
3842 unsigned long cpu, void *data)
3843{
3844 cpumask_t mask = CPU_MASK_NONE;
3845
3846 cpumask_set_cpu(cpu, &mask);
3847 sched_update_freq_max_load(&mask);
3848
3849 mutex_lock(&cluster_lock);
3850 sort_clusters();
3851 mutex_unlock(&cluster_lock);
3852
3853 return 0;
3854}
3855
3856static struct notifier_block notifier_policy_block = {
3857 .notifier_call = cpufreq_notifier_policy
3858};
3859
3860static struct notifier_block notifier_trans_block = {
3861 .notifier_call = cpufreq_notifier_trans
3862};
3863
3864static struct notifier_block notifier_pwr_stats_ready = {
3865 .notifier_call = pwr_stats_ready_notifier
3866};
3867
3868int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
3869{
3870 return -EINVAL;
3871}
3872
3873static int register_sched_callback(void)
3874{
3875 int ret;
3876
3877 ret = cpufreq_register_notifier(&notifier_policy_block,
3878 CPUFREQ_POLICY_NOTIFIER);
3879
3880 if (!ret)
3881 ret = cpufreq_register_notifier(&notifier_trans_block,
3882 CPUFREQ_TRANSITION_NOTIFIER);
3883
3884 register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
3885
3886 return 0;
3887}
3888
3889/*
3890 * cpufreq callbacks can be registered at core_initcall time or later.
3891 * Any registration done prior to that is "forgotten" by cpufreq. See
3892 * initialization of variable init_cpufreq_transition_notifier_list_called
3893 * for further information.
3894 */
3895core_initcall(register_sched_callback);
3896
3897int update_preferred_cluster(struct related_thread_group *grp,
3898 struct task_struct *p, u32 old_load)
3899{
3900 u32 new_load = task_load(p);
3901
3902 if (!grp)
3903 return 0;
3904
3905 /*
3906 * Update if task's load has changed significantly or a complete window
3907 * has passed since we last updated preference
3908 */
3909 if (abs(new_load - old_load) > sched_ravg_window / 4 ||
3910 sched_ktime_clock() - grp->last_update > sched_ravg_window)
3911 return 1;
3912
3913 return 0;
3914}
3915
3916bool early_detection_notify(struct rq *rq, u64 wallclock)
3917{
3918 struct task_struct *p;
3919 int loop_max = 10;
3920
3921 if (!sched_boost() || !rq->cfs.h_nr_running)
3922 return 0;
3923
3924 rq->ed_task = NULL;
3925 list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
3926 if (!loop_max)
3927 break;
3928
3929 if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
3930 rq->ed_task = p;
3931 return 1;
3932 }
3933
3934 loop_max--;
3935 }
3936
3937 return 0;
3938}
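/*
 * Illustrative sketch (standalone, not compiled in this file): the per-task
 * test used by early_detection_notify() above. A runnable cfs task whose
 * last wakeup was more than the early-detection threshold ago flags the
 * runqueue. EARLY_DETECTION_NS is a stand-in value; the kernel uses
 * EARLY_DETECTION_DURATION.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define EARLY_DETECTION_NS	9500000ULL	/* stand-in threshold, ns */

static bool is_early_detection_task(uint64_t wallclock, uint64_t last_wake_ts)
{
	return wallclock - last_wake_ts >= EARLY_DETECTION_NS;
}
#endif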
3939
3940#ifdef CONFIG_CGROUP_SCHED
3941u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
3942 struct cftype *cft)
3943{
3944 struct task_group *tg = css_tg(css);
3945
3946 return tg->upmigrate_discouraged;
3947}
3948
3949int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
3950 struct cftype *cft, u64 upmigrate_discourage)
3951{
3952 struct task_group *tg = css_tg(css);
3953 int discourage = upmigrate_discourage > 0;
3954
3955 if (tg->upmigrate_discouraged == discourage)
3956 return 0;
3957
3958 /*
3959 * Revisit big-task classification for tasks of this cgroup. It would
3960 * have been efficient to walk tasks of just this cgroup in running
3961 * state, but we don't have easy means to do that. Walk all tasks in
3962 * running state on all cpus instead and re-visit their big task
3963 * classification.
3964 */
3965 get_online_cpus();
3966 pre_big_task_count_change(cpu_online_mask);
3967
3968 tg->upmigrate_discouraged = discourage;
3969
3970 post_big_task_count_change(cpu_online_mask);
3971 put_online_cpus();
3972
3973 return 0;
3974}
3975#endif /* CONFIG_CGROUP_SCHED */