Blame - kernel/sched/topology.c - SHIFTPHONES/mainline/linux

blob: 49b835f1305f8c1cfe659bb466a00c4b71767185 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2	/*
				3	* Scheduler topology setup/handling methods
				4	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	5	#include "sched.h"
				6
				7	DEFINE_MUTEX(sched_domains_mutex);
				8
				9	/* Protected by sched_domains_mutex: */
zhong jiang	ace8031	2018-08-03 20:37:32 +0800	[diff] [blame]	10	static cpumask_var_t sched_domains_tmpmask;
				11	static cpumask_var_t sched_domains_tmpmask2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	12
				13	#ifdef CONFIG_SCHED_DEBUG
				14
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	15	static int __init sched_debug_setup(char *str)
				16	{
Peter Zijlstra	9469eb0	2017-09-07 17:03:53 +0200	[diff] [blame]	17	sched_debug_enabled = true;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	18
				19	return 0;
				20	}
				21	early_param("sched_debug", sched_debug_setup);
				22
				23	static inline bool sched_debug(void)
				24	{
				25	return sched_debug_enabled;
				26	}
				27
				28	static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				29	struct cpumask *groupmask)
				30	{
				31	struct sched_group *group = sd->groups;
				32
				33	cpumask_clear(groupmask);
				34
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	35	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	36
				37	if (!(sd->flags & SD_LOAD_BALANCE)) {
				38	printk("does not load-balance\n");
				39	if (sd->parent)
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	40	printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	41	return -1;
				42	}
				43
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	44	printk(KERN_CONT "span=%*pbl level=%s\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	45	cpumask_pr_args(sched_domain_span(sd)), sd->name);
				46
				47	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	48	printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	49	}
Yi Wang	6cd0c58	2018-07-23 12:19:07 +0800	[diff] [blame]	50	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	51	printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	52	}
				53
				54	printk(KERN_DEBUG "%*s groups:", level + 1, "");
				55	do {
				56	if (!group) {
				57	printk("\n");
				58	printk(KERN_ERR "ERROR: group is NULL\n");
				59	break;
				60	}
				61
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	62	if (!cpumask_weight(sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	63	printk(KERN_CONT "\n");
				64	printk(KERN_ERR "ERROR: empty group\n");
				65	break;
				66	}
				67
				68	if (!(sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	69	cpumask_intersects(groupmask, sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	70	printk(KERN_CONT "\n");
				71	printk(KERN_ERR "ERROR: repeated CPUs\n");
				72	break;
				73	}
				74
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	75	cpumask_or(groupmask, groupmask, sched_group_span(group));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	76
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	77	printk(KERN_CONT " %d:{ span=%*pbl",
				78	group->sgc->id,
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	79	cpumask_pr_args(sched_group_span(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	80
Peter Zijlstra	af21812	2017-05-01 08:51:05 +0200	[diff] [blame]	81	if ((sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	82	!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	83	printk(KERN_CONT " mask=%*pbl",
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	84	cpumask_pr_args(group_balance_mask(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	85	}
				86
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	87	if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
				88	printk(KERN_CONT " cap=%lu", group->sgc->capacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	89
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	90	if (group == sd->groups && sd->child &&
				91	!cpumask_equal(sched_domain_span(sd->child),
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	92	sched_group_span(group))) {
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	93	printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
				94	}
				95
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	96	printk(KERN_CONT " }");
				97
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	98	group = group->next;
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	99
				100	if (group != sd->groups)
				101	printk(KERN_CONT ",");
				102
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	103	} while (group != sd->groups);
				104	printk(KERN_CONT "\n");
				105
				106	if (!cpumask_equal(sched_domain_span(sd), groupmask))
				107	printk(KERN_ERR "ERROR: groups don't span domain->span\n");
				108
				109	if (sd->parent &&
				110	!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	111	printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	112	return 0;
				113	}
				114
				115	static void sched_domain_debug(struct sched_domain *sd, int cpu)
				116	{
				117	int level = 0;
				118
				119	if (!sched_debug_enabled)
				120	return;
				121
				122	if (!sd) {
				123	printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
				124	return;
				125	}
				126
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	127	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	128
				129	for (;;) {
				130	if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
				131	break;
				132	level++;
				133	sd = sd->parent;
				134	if (!sd)
				135	break;
				136	}
				137	}
				138	#else /* !CONFIG_SCHED_DEBUG */
				139
				140	# define sched_debug_enabled 0
				141	# define sched_domain_debug(sd, cpu) do { } while (0)
				142	static inline bool sched_debug(void)
				143	{
				144	return false;
				145	}
				146	#endif /* CONFIG_SCHED_DEBUG */
				147
				148	static int sd_degenerate(struct sched_domain *sd)
				149	{
				150	if (cpumask_weight(sched_domain_span(sd)) == 1)
				151	return 1;
				152
				153	/* Following flags need at least 2 groups */
				154	if (sd->flags & (SD_LOAD_BALANCE \|
				155	SD_BALANCE_NEWIDLE \|
				156	SD_BALANCE_FORK \|
				157	SD_BALANCE_EXEC \|
				158	SD_SHARE_CPUCAPACITY \|
				159	SD_ASYM_CPUCAPACITY \|
				160	SD_SHARE_PKG_RESOURCES \|
				161	SD_SHARE_POWERDOMAIN)) {
				162	if (sd->groups != sd->groups->next)
				163	return 0;
				164	}
				165
				166	/* Following flags don't use groups */
				167	if (sd->flags & (SD_WAKE_AFFINE))
				168	return 0;
				169
				170	return 1;
				171	}
				172
				173	static int
				174	sd_parent_degenerate(struct sched_domain sd, struct sched_domain parent)
				175	{
				176	unsigned long cflags = sd->flags, pflags = parent->flags;
				177
				178	if (sd_degenerate(parent))
				179	return 1;
				180
				181	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
				182	return 0;
				183
				184	/* Flags needing groups don't count if only 1 group in parent */
				185	if (parent->groups == parent->groups->next) {
				186	pflags &= ~(SD_LOAD_BALANCE \|
				187	SD_BALANCE_NEWIDLE \|
				188	SD_BALANCE_FORK \|
				189	SD_BALANCE_EXEC \|
				190	SD_ASYM_CPUCAPACITY \|
				191	SD_SHARE_CPUCAPACITY \|
				192	SD_SHARE_PKG_RESOURCES \|
				193	SD_PREFER_SIBLING \|
				194	SD_SHARE_POWERDOMAIN);
				195	if (nr_node_ids == 1)
				196	pflags &= ~SD_SERIALIZE;
				197	}
				198	if (~cflags & pflags)
				199	return 0;
				200
				201	return 1;
				202	}
				203
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	204	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
/* Static key, off by default; switched on and off by sched_energy_set(). */
Peter Zijlstra	f8a696f	2018-12-05 11:23:56 +0100	[diff] [blame]	205	DEFINE_STATIC_KEY_FALSE(sched_energy_present);
/* sysctl value, default 1; build_perf_domains() bails out when it is 0. */
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	206	unsigned int sysctl_sched_energy_aware = 1;
/* Held by sched_energy_aware_handler() around its rebuild_sched_domains() call. */
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	207	DEFINE_MUTEX(sched_energy_mutex);
/* Set to 1 by sched_energy_aware_handler() for the duration of that rebuild. */
				208	bool sched_energy_update;
				209
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	210	#ifdef CONFIG_PROC_SYSCTL
				211	int sched_energy_aware_handler(struct ctl_table *table, int write,
				212	void __user buffer, size_t lenp, loff_t *ppos)
				213	{
				214	int ret, state;
				215
				216	if (write && !capable(CAP_SYS_ADMIN))
				217	return -EPERM;
				218
				219	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				220	if (!ret && write) {
				221	state = static_branch_unlikely(&sched_energy_present);
				222	if (state != sysctl_sched_energy_aware) {
				223	mutex_lock(&sched_energy_mutex);
				224	sched_energy_update = 1;
				225	rebuild_sched_domains();
				226	sched_energy_update = 0;
				227	mutex_unlock(&sched_energy_mutex);
				228	}
				229	}
				230
				231	return ret;
				232	}
				233	#endif
				234
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	235	static void free_pd(struct perf_domain *pd)
				236	{
				237	struct perf_domain *tmp;
				238
				239	while (pd) {
				240	tmp = pd->next;
				241	kfree(pd);
				242	pd = tmp;
				243	}
				244	}
				245
				246	static struct perf_domain find_pd(struct perf_domain pd, int cpu)
				247	{
				248	while (pd) {
				249	if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
				250	return pd;
				251	pd = pd->next;
				252	}
				253
				254	return NULL;
				255	}
				256
				257	static struct perf_domain *pd_init(int cpu)
				258	{
				259	struct em_perf_domain *obj = em_cpu_get(cpu);
				260	struct perf_domain *pd;
				261
				262	if (!obj) {
				263	if (sched_debug())
				264	pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
				265	return NULL;
				266	}
				267
				268	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
				269	if (!pd)
				270	return NULL;
				271	pd->em_pd = obj;
				272
				273	return pd;
				274	}
				275
				276	static void perf_domain_debug(const struct cpumask *cpu_map,
				277	struct perf_domain *pd)
				278	{
				279	if (!sched_debug() \|\| !pd)
				280	return;
				281
				282	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
				283
				284	while (pd) {
				285	printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
				286	cpumask_first(perf_domain_span(pd)),
				287	cpumask_pr_args(perf_domain_span(pd)),
				288	em_pd_nr_cap_states(pd->em_pd));
				289	pd = pd->next;
				290	}
				291
				292	printk(KERN_CONT "\n");
				293	}
				294
				295	static void destroy_perf_domain_rcu(struct rcu_head *rp)
				296	{
				297	struct perf_domain *pd;
				298
				299	pd = container_of(rp, struct perf_domain, rcu);
				300	free_pd(pd);
				301	}
				302
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	303	static void sched_energy_set(bool has_eas)
				304	{
				305	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
				306	if (sched_debug())
				307	pr_info("%s: stopping EAS\n", __func__);
				308	static_branch_disable_cpuslocked(&sched_energy_present);
				309	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
				310	if (sched_debug())
				311	pr_info("%s: starting EAS\n", __func__);
				312	static_branch_enable_cpuslocked(&sched_energy_present);
				313	}
				314	}
				315
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	316	/*
				317	* EAS can be used on a root domain if it meets all the following conditions:
				318	* 1. an Energy Model (EM) is available;
				319	* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
				320	* 3. the EM complexity is low enough to keep scheduling overheads low;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	321	* 4. schedutil is driving the frequency of all CPUs of the rd;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	322	*
				323	* The complexity of the Energy Model is defined as:
				324	*
				325	* C = nr_pd * (nr_cpus + nr_cs)
				326	*
				327	* with parameters defined as:
				328	* - nr_pd: the number of performance domains
				329	* - nr_cpus: the number of CPUs
				330	* - nr_cs: the sum of the number of capacity states of all performance
				331	* domains (for example, on a system with 2 performance domains,
				332	* with 10 capacity states each, nr_cs = 2 * 10 = 20).
				333	*
				334	* It is generally not a good idea to use such a model in the wake-up path on
				335	* very complex platforms because of the associated scheduling overheads. The
				336	* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
				337	* with per-CPU DVFS and less than 8 capacity states each, for example.
				338	*/
				339	#define EM_MAX_COMPLEXITY 2048
				340
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	341	extern struct cpufreq_governor schedutil_gov;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	342	static bool build_perf_domains(const struct cpumask *cpu_map)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	343	{
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	344	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	345	struct perf_domain pd = NULL, tmp;
				346	int cpu = cpumask_first(cpu_map);
				347	struct root_domain *rd = cpu_rq(cpu)->rd;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	348	struct cpufreq_policy *policy;
				349	struct cpufreq_governor *gov;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	350
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	351	if (!sysctl_sched_energy_aware)
				352	goto free;
				353
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	354	/* EAS is enabled for asymmetric CPU capacity topologies. */
				355	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
				356	if (sched_debug()) {
				357	pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				358	cpumask_pr_args(cpu_map));
				359	}
				360	goto free;
				361	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	362
				363	for_each_cpu(i, cpu_map) {
				364	/* Skip already covered CPUs. */
				365	if (find_pd(pd, i))
				366	continue;
				367
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	368	/* Do not attempt EAS if schedutil is not being used. */
				369	policy = cpufreq_cpu_get(i);
				370	if (!policy)
				371	goto free;
				372	gov = policy->governor;
				373	cpufreq_cpu_put(policy);
				374	if (gov != &schedutil_gov) {
				375	if (rd->pd)
				376	pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
				377	cpumask_pr_args(cpu_map));
				378	goto free;
				379	}
				380
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	381	/* Create the new pd and add it to the local list. */
				382	tmp = pd_init(i);
				383	if (!tmp)
				384	goto free;
				385	tmp->next = pd;
				386	pd = tmp;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	387
				388	/*
				389	* Count performance domains and capacity states for the
				390	* complexity check.
				391	*/
				392	nr_pd++;
				393	nr_cs += em_pd_nr_cap_states(pd->em_pd);
				394	}
				395
				396	/* Bail out if the Energy Model complexity is too high. */
				397	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
				398	WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
				399	cpumask_pr_args(cpu_map));
				400	goto free;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	401	}
				402
				403	perf_domain_debug(cpu_map, pd);
				404
				405	/* Attach the new list of performance domains to the root domain. */
				406	tmp = rd->pd;
				407	rcu_assign_pointer(rd->pd, pd);
				408	if (tmp)
				409	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				410
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	411	return !!pd;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	412
				413	free:
				414	free_pd(pd);
				415	tmp = rd->pd;
				416	rcu_assign_pointer(rd->pd, NULL);
				417	if (tmp)
				418	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	419
				420	return false;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	421	}
				422	#else
				423	static void free_pd(struct perf_domain *pd) { }
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	424	#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	425
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	426	static void free_rootdomain(struct rcu_head *rcu)
				427	{
				428	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
				429
				430	cpupri_cleanup(&rd->cpupri);
				431	cpudl_cleanup(&rd->cpudl);
				432	free_cpumask_var(rd->dlo_mask);
				433	free_cpumask_var(rd->rto_mask);
				434	free_cpumask_var(rd->online);
				435	free_cpumask_var(rd->span);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	436	free_pd(rd->pd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	437	kfree(rd);
				438	}
				439
				440	void rq_attach_root(struct rq rq, struct root_domain rd)
				441	{
				442	struct root_domain *old_rd = NULL;
				443	unsigned long flags;
				444
				445	raw_spin_lock_irqsave(&rq->lock, flags);
				446
				447	if (rq->rd) {
				448	old_rd = rq->rd;
				449
				450	if (cpumask_test_cpu(rq->cpu, old_rd->online))
				451	set_rq_offline(rq);
				452
				453	cpumask_clear_cpu(rq->cpu, old_rd->span);
				454
				455	/*
				456	* If we dont want to free the old_rd yet then
				457	* set old_rd to NULL to skip the freeing later
				458	* in this function:
				459	*/
				460	if (!atomic_dec_and_test(&old_rd->refcount))
				461	old_rd = NULL;
				462	}
				463
				464	atomic_inc(&rd->refcount);
				465	rq->rd = rd;
				466
				467	cpumask_set_cpu(rq->cpu, rd->span);
				468	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
				469	set_rq_online(rq);
				470
				471	raw_spin_unlock_irqrestore(&rq->lock, flags);
				472
				473	if (old_rd)
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	474	call_rcu(&old_rd->rcu, free_rootdomain);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	475	}
				476
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	477	void sched_get_rd(struct root_domain *rd)
				478	{
				479	atomic_inc(&rd->refcount);
				480	}
				481
				482	void sched_put_rd(struct root_domain *rd)
				483	{
				484	if (!atomic_dec_and_test(&rd->refcount))
				485	return;
				486
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	487	call_rcu(&rd->rcu, free_rootdomain);
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	488	}
				489
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	490	static int init_rootdomain(struct root_domain *rd)
				491	{
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	492	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
				493	goto out;
				494	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
				495	goto free_span;
				496	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
				497	goto free_online;
				498	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
				499	goto free_dlo_mask;
				500
Steven Rostedt (Red Hat)	4bdced5	2017-10-06 14:05:04 -0400	[diff] [blame]	501	#ifdef HAVE_RT_PUSH_IPI
				502	rd->rto_cpu = -1;
				503	raw_spin_lock_init(&rd->rto_lock);
				504	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
				505	#endif
				506
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	507	init_dl_bw(&rd->dl_bw);
				508	if (cpudl_init(&rd->cpudl) != 0)
				509	goto free_rto_mask;
				510
				511	if (cpupri_init(&rd->cpupri) != 0)
				512	goto free_cpudl;
				513	return 0;
				514
				515	free_cpudl:
				516	cpudl_cleanup(&rd->cpudl);
				517	free_rto_mask:
				518	free_cpumask_var(rd->rto_mask);
				519	free_dlo_mask:
				520	free_cpumask_var(rd->dlo_mask);
				521	free_online:
				522	free_cpumask_var(rd->online);
				523	free_span:
				524	free_cpumask_var(rd->span);
				525	out:
				526	return -ENOMEM;
				527	}
				528
				529	/*
				530	* By default the system creates a single root-domain with all CPUs as
				531	* members (mimicking the global state we have today).
				532	*/
				533	struct root_domain def_root_domain;
				534
				535	void init_defrootdomain(void)
				536	{
				537	init_rootdomain(&def_root_domain);
				538
				539	atomic_set(&def_root_domain.refcount, 1);
				540	}
				541
				542	static struct root_domain *alloc_rootdomain(void)
				543	{
				544	struct root_domain *rd;
				545
Viresh Kumar	4d13a06	2017-04-13 14:45:48 +0530	[diff] [blame]	546	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	547	if (!rd)
				548	return NULL;
				549
				550	if (init_rootdomain(rd) != 0) {
				551	kfree(rd);
				552	return NULL;
				553	}
				554
				555	return rd;
				556	}
				557
				558	static void free_sched_groups(struct sched_group *sg, int free_sgc)
				559	{
				560	struct sched_group tmp, first;
				561
				562	if (!sg)
				563	return;
				564
				565	first = sg;
				566	do {
				567	tmp = sg->next;
				568
				569	if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
				570	kfree(sg->sgc);
				571
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	572	if (atomic_dec_and_test(&sg->ref))
				573	kfree(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	574	sg = tmp;
				575	} while (sg != first);
				576	}
				577
				578	static void destroy_sched_domain(struct sched_domain *sd)
				579	{
				580	/*
Peter Zijlstra	a090c4f	2017-08-21 15:42:52 +0200	[diff] [blame]	581	* A normal sched domain may have multiple group references, an
				582	* overlapping domain, having private groups, only one. Iterate,
				583	* dropping group/capacity references, freeing where none remain.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	584	*/
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	585	free_sched_groups(sd->groups, 1);
				586
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	587	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
				588	kfree(sd->shared);
				589	kfree(sd);
				590	}
				591
				592	static void destroy_sched_domains_rcu(struct rcu_head *rcu)
				593	{
				594	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
				595
				596	while (sd) {
				597	struct sched_domain *parent = sd->parent;
				598	destroy_sched_domain(sd);
				599	sd = parent;
				600	}
				601	}
				602
				603	static void destroy_sched_domains(struct sched_domain *sd)
				604	{
				605	if (sd)
				606	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
				607	}
				608
				609	/*
				610	* Keep a special pointer to the highest sched_domain that has
				611	* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain), as this
				612	* allows us to avoid some pointer chasing in select_idle_sibling().
				613	*
				614	* Also keep a unique ID per domain (we use the first CPU number in
				615	* the cpumask of the domain), this allows us to quickly tell if
				616	* two CPUs are in the same cache domain, see cpus_share_cache().
				617	*/
/* Per-CPU shortcuts below are (re)published by update_top_cache_domain(). */
/* Highest domain with SD_SHARE_PKG_RESOURCES, plus its CPU count and first-CPU id. */
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	618	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	619	DEFINE_PER_CPU(int, sd_llc_size);
				620	DEFINE_PER_CPU(int, sd_llc_id);
/* The ->shared data of that same domain. */
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	621	DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
/* Lowest domain with SD_NUMA. */
				622	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
/* Highest domain with SD_ASYM_PACKING. */
				623	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
/* Lowest domain with SD_ASYM_CPUCAPACITY. */
				624	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
/* Static key, off by default; NOTE(review): not toggled in the visible part of the file. */
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	625	DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	626
				627	static void update_top_cache_domain(int cpu)
				628	{
				629	struct sched_domain_shared *sds = NULL;
				630	struct sched_domain *sd;
				631	int id = cpu;
				632	int size = 1;
				633
				634	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
				635	if (sd) {
				636	id = cpumask_first(sched_domain_span(sd));
				637	size = cpumask_weight(sched_domain_span(sd));
				638	sds = sd->shared;
				639	}
				640
				641	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
				642	per_cpu(sd_llc_size, cpu) = size;
				643	per_cpu(sd_llc_id, cpu) = id;
				644	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
				645
				646	sd = lowest_flag_domain(cpu, SD_NUMA);
				647	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
				648
				649	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
Quentin Perret	011b27b	2018-12-03 09:56:19 +0000	[diff] [blame]	650	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
				651
				652	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
				653	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	654	}
				655
				656	/*
				657	* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
				658	* hold the hotplug lock.
				659	*/
				660	static void
				661	cpu_attach_domain(struct sched_domain sd, struct root_domain rd, int cpu)
				662	{
				663	struct rq *rq = cpu_rq(cpu);
				664	struct sched_domain *tmp;
				665
				666	/* Remove the sched domains which do not contribute to scheduling. */
				667	for (tmp = sd; tmp; ) {
				668	struct sched_domain *parent = tmp->parent;
				669	if (!parent)
				670	break;
				671
				672	if (sd_parent_degenerate(tmp, parent)) {
				673	tmp->parent = parent->parent;
				674	if (parent->parent)
				675	parent->parent->child = tmp;
				676	/*
				677	* Transfer SD_PREFER_SIBLING down in case of a
				678	* degenerate parent; the spans match for this
				679	* so the property transfers.
				680	*/
				681	if (parent->flags & SD_PREFER_SIBLING)
				682	tmp->flags \|= SD_PREFER_SIBLING;
				683	destroy_sched_domain(parent);
				684	} else
				685	tmp = tmp->parent;
				686	}
				687
				688	if (sd && sd_degenerate(sd)) {
				689	tmp = sd;
				690	sd = sd->parent;
				691	destroy_sched_domain(tmp);
				692	if (sd)
				693	sd->child = NULL;
				694	}
				695
				696	sched_domain_debug(sd, cpu);
				697
				698	rq_attach_root(rq, rd);
				699	tmp = rq->sd;
				700	rcu_assign_pointer(rq->sd, sd);
Peter Zijlstra	bbdacdf	2017-08-10 17:10:26 +0200	[diff] [blame]	701	dirty_sched_domain_sysctl(cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	702	destroy_sched_domains(tmp);
				703
				704	update_top_cache_domain(cpu);
				705	}
				706
/* Pairs a per-CPU array of sched_domain pointers with a root domain. */
/* NOTE(review): presumably scratch state for a domain build; users are not visible here. */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	707	struct s_data {
Luc Van Oostenryck	99687cd	2019-01-18 15:49:36 +0100	[diff] [blame]	708	struct sched_domain * __percpu *sd;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	709	struct root_domain *rd;
				710	};
				711
/* NOTE(review): presumably names the allocation stages of struct s_data; users are not visible here, confirm. */
				712	enum s_alloc {
				713	sa_rootdomain,
				714	sa_sd,
				715	sa_sd_storage,
				716	sa_none,
				717	};
				718
				719	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	720	* Return the canonical balance CPU for this group, this is the first CPU
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	721	* of this group that's also in the balance mask.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	722	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	723	* The balance mask are all those CPUs that could actually end up at this
				724	* group. See build_balance_mask().
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	725	*
				726	* Also see should_we_balance().
				727	*/
				728	int group_balance_cpu(struct sched_group *sg)
				729	{
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	730	return cpumask_first(group_balance_mask(sg));
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	731	}
				732
				733
				734	/*
				735	* NUMA topology (first read the regular topology blurb below)
				736	*
				737	* Given a node-distance table, for example:
				738	*
				739	* node 0 1 2 3
				740	* 0: 10 20 30 20
				741	* 1: 20 10 20 30
				742	* 2: 30 20 10 20
				743	* 3: 20 30 20 10
				744	*
				745	* which represents a 4 node ring topology like:
				746	*
				747	* 0 ----- 1
				748	* \| \|
				749	* \| \|
				750	* \| \|
				751	* 3 ----- 2
				752	*
				753	* We want to construct domains and groups to represent this. The way we go
				754	* about doing this is to build the domains on 'hops'. For each NUMA level we
				755	* construct the mask of all nodes reachable in @level hops.
				756	*
				757	* For the above NUMA topology that gives 3 levels:
				758	*
				759	* NUMA-2 0-3 0-3 0-3 0-3
				760	* groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
				761	*
				762	* NUMA-1 0-1,3 0-2 1-3 0,2-3
				763	* groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
				764	*
				765	* NUMA-0 0 1 2 3
				766	*
				767	*
				768	* As can be seen; things don't nicely line up as with the regular topology.
				769	* When we iterate a domain in child domain chunks some nodes can be
				770	* represented multiple times -- hence the "overlap" naming for this part of
				771	* the topology.
				772	*
				773	* In order to minimize this overlap, we only build enough groups to cover the
				774	* domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
				775	*
				776	* Because:
				777	*
				778	* - the first group of each domain is its child domain; this
				779	* gets us the first 0-1,3
				780	* - the only uncovered node is 2, whose child domain is 1-3.
				781	*
				782	* However, because of the overlap, computing a unique CPU for each group is
				783	* more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
				784	* groups include the CPUs of Node-0, while those CPUs would not in fact ever
				785	* end up at those groups (they would end up in group: 0-1,3).
				786	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	787	* To correct this we have to introduce the group balance mask. This mask
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	788	* will contain those CPUs in the group that can reach this group given the
				789	* (child) domain tree.
				790	*
				791	* With this we can once again compute balance_cpu and sched_group_capacity
				792	* relations.
				793	*
				794	* XXX include words on how balance_cpu is unique and therefore can be
				795	* used for sched_group_capacity links.
				796	*
				797	*
				798	* Another 'interesting' topology is:
				799	*
				800	* node 0 1 2 3
				801	* 0: 10 20 20 30
				802	* 1: 20 10 20 20
				803	* 2: 20 20 10 20
				804	* 3: 30 20 20 10
				805	*
				806	* Which looks a little like:
				807	*
				808	*      0 ----- 1
				809	*      |     / |
				810	*      |   /   |
				811	*      | /     |
				812	*      2 ----- 3
				813	*
				814	* This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
				815	* are not.
				816	*
				817	* This leads to a few particularly weird cases where the sched_domain's are
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	818	* not of the same number for each CPU. Consider:
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	819	*
				820	* NUMA-2 0-3 0-3
				821	* groups: {0-2},{1-3} {1-3},{0-2}
				822	*
				823	* NUMA-1 0-2 0-3 0-3 1-3
				824	*
				825	* NUMA-0 0 1 2 3
				826	*
				827	*/
				828
				829
				830	/*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	831	* Build the balance mask; it contains only those CPUs that can arrive at this
				832	* group and should be considered to continue balancing.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	833	*
				834	* We do this during the group creation pass, therefore the group information
				835	* isn't complete yet, however since each group represents a (child) domain we
				836	* can fully construct this using the sched_domain bits (which are already
				837	* complete).
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	838	*/
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	839	static void
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	840	build_balance_mask(struct sched_domain sd, struct sched_group sg, struct cpumask *mask)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	841	{
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	842	const struct cpumask *sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	843	struct sd_data *sdd = sd->private;
				844	struct sched_domain *sibling;
				845	int i;
				846
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	847	cpumask_clear(mask);
				848
Lauro Ramos Venancio	f32d782	2017-04-20 16:51:40 -0300	[diff] [blame]	849	for_each_cpu(i, sg_span) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	850	sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	851
				852	/*
				853	* Can happen in the asymmetric case, where these siblings are
				854	* unused. The mask will not be empty because those CPUs that
				855	* do have the top domain _should_ span the domain.
				856	*/
				857	if (!sibling->child)
				858	continue;
				859
				860	/* If we would not end up here, we can't continue from here */
				861	if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	862	continue;
				863
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	864	cpumask_set_cpu(i, mask);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	865	}
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	866
				867	/* We must not have empty masks here */
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	868	WARN_ON_ONCE(cpumask_empty(mask));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	869	}
				870
				871	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	872	* XXX: This creates per-node group entries; since the load-balancer will
				873	* immediately access remote memory to construct this group's load-balance
				874	* statistics having the groups node local is of dubious benefit.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	875	*/
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	876	static struct sched_group *
				877	build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
				878	{
				879	struct sched_group *sg;
				880	struct cpumask *sg_span;
				881
				882	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				883	GFP_KERNEL, cpu_to_node(cpu));
				884
				885	if (!sg)
				886	return NULL;
				887
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	888	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	889	if (sd->child)
				890	cpumask_copy(sg_span, sched_domain_span(sd->child));
				891	else
				892	cpumask_copy(sg_span, sched_domain_span(sd));
				893
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	894	atomic_inc(&sg->ref);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	895	return sg;
				896	}
				897
				898	static void init_overlap_sched_group(struct sched_domain *sd,
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	899	struct sched_group *sg)
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	900	{
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	901	struct cpumask *mask = sched_domains_tmpmask2;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	902	struct sd_data *sdd = sd->private;
				903	struct cpumask *sg_span;
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	904	int cpu;
				905
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	906	build_balance_mask(sd, sg, mask);
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	907	cpu = cpumask_first_and(sched_group_span(sg), mask);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	908
				909	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				910	if (atomic_inc_return(&sg->sgc->ref) == 1)
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	911	cpumask_copy(group_balance_mask(sg), mask);
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	912	else
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	913	WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	914
				915	/*
				916	* Initialize sgc->capacity such that even if we mess up the
				917	* domains and no possible iteration will get us here, we won't
				918	* die on a /0 trap.
				919	*/
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	920	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	921	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
				922	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	923	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	924	}
				925
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	926	static int
				927	build_overlap_sched_groups(struct sched_domain *sd, int cpu)
				928	{
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	929	struct sched_group first = NULL, last = NULL, *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	930	const struct cpumask *span = sched_domain_span(sd);
				931	struct cpumask *covered = sched_domains_tmpmask;
				932	struct sd_data *sdd = sd->private;
				933	struct sched_domain *sibling;
				934	int i;
				935
				936	cpumask_clear(covered);
				937
Peter Zijlstra	0372dd2	2017-04-14 17:24:02 +0200	[diff] [blame]	938	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	939	struct cpumask *sg_span;
				940
				941	if (cpumask_test_cpu(i, covered))
				942	continue;
				943
				944	sibling = *per_cpu_ptr(sdd->sd, i);
				945
Lauro Ramos Venancio	c20e1ea	2017-04-20 16:51:42 -0300	[diff] [blame]	946	/*
				947	* Asymmetric node setups can result in situations where the
				948	* domain tree is of unequal depth, make sure to skip domains
				949	* that already cover the entire range.
				950	*
				951	* In that case build_sched_domains() will have terminated the
				952	* iteration early and our sibling sd spans will be empty.
				953	* Domains should always include the CPU they're built on, so
				954	* check that.
				955	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	956	if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
				957	continue;
				958
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	959	sg = build_group_from_child_sched_domain(sibling, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	960	if (!sg)
				961	goto fail;
				962
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	963	sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	964	cpumask_or(covered, covered, sg_span);
				965
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	966	init_overlap_sched_group(sd, sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	967
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	968	if (!first)
				969	first = sg;
				970	if (last)
				971	last->next = sg;
				972	last = sg;
				973	last->next = first;
				974	}
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	975	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	976
				977	return 0;
				978
				979	fail:
				980	free_sched_groups(first, 0);
				981
				982	return -ENOMEM;
				983	}
				984
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	985
				986	/*
				987	* Package topology (also see the load-balance blurb in fair.c)
				988	*
				989	* The scheduler builds a tree structure to represent a number of important
				990	* topology features. By default (default_topology[]) these include:
				991	*
				992	* - Simultaneous multithreading (SMT)
				993	* - Multi-Core Cache (MC)
				994	* - Package (DIE)
				995	*
				996	* Where the last one more or less denotes everything up to a NUMA node.
				997	*
				998	* The tree consists of 3 primary data structures:
				999	*
				1000	* sched_domain -> sched_group -> sched_group_capacity
				1001	* ^ ^ ^ ^
				1002	* `-' `-'
				1003	*
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1004	* The sched_domains are per-CPU and have a two way link (parent & child) and
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	1005	* denote the ever growing mask of CPUs belonging to that level of topology.
				1006	*
				1007	* Each sched_domain has a circular (double) linked list of sched_group's, each
				1008	* denoting the domains of the level below (or individual CPUs in case of the
				1009	* first domain level). The sched_group linked by a sched_domain includes the
				1010	* CPU of that sched_domain [*].
				1011	*
				1012	* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
				1013	*
				1014	* CPU 0 1 2 3 4 5 6 7
				1015	*
				1016	* DIE [ ]
				1017	* MC [ ] [ ]
				1018	* SMT [ ] [ ] [ ] [ ]
				1019	*
				1020	* - or -
				1021	*
				1022	* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
				1023	* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
				1024	* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
				1025	*
				1026	* CPU 0 1 2 3 4 5 6 7
				1027	*
				1028	* One way to think about it is: sched_domain moves you up and down among these
				1029	* topology levels, while sched_group moves you sideways through it, at child
				1030	* domain granularity.
				1031	*
				1032	* sched_group_capacity ensures each unique sched_group has shared storage.
				1033	*
				1034	* There are two related construction problems, both require a CPU that
				1035	* uniquely identifies each group (for a given domain):
				1036	*
				1037	* - The first is the balance_cpu (see should_we_balance() and the
				1038	* load-balance blurb in fair.c); for each group we only want 1 CPU to
				1039	* continue balancing at a higher domain.
				1040	*
				1041	* - The second is the sched_group_capacity; we want all identical groups
				1042	* to share a single sched_group_capacity.
				1043	*
				1044	* Since these topologies are exclusive by construction. That is, it's
				1045	* impossible for an SMT thread to belong to multiple cores, and cores to
				1046	* be part of multiple caches. There is a very clear and unique location
				1047	* for each CPU in the hierarchy.
				1048	*
				1049	* Therefore computing a unique CPU for each group is trivial (the iteration
				1050	* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
				1051	* group), we can simply pick the first CPU in each group.
				1052	*
				1053	*
				1054	* [*] in other words, the first group of each domain is its child domain.
				1055	*/
				1056
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1057	static struct sched_group get_group(int cpu, struct sd_data sdd)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1058	{
				1059	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1060	struct sched_domain *child = sd->child;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1061	struct sched_group *sg;
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1062	bool already_visited;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1063
				1064	if (child)
				1065	cpu = cpumask_first(sched_domain_span(child));
				1066
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1067	sg = *per_cpu_ptr(sdd->sg, cpu);
				1068	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1069
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1070	/* Increase refcounts for claim_allocations: */
				1071	already_visited = atomic_inc_return(&sg->ref) > 1;
				1072	/* sgc visits should follow a similar trend as sg */
				1073	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
				1074
				1075	/* If we have already visited that group, it's already initialized. */
				1076	if (already_visited)
				1077	return sg;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1078
				1079	if (child) {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1080	cpumask_copy(sched_group_span(sg), sched_domain_span(child));
				1081	cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1082	} else {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1083	cpumask_set_cpu(cpu, sched_group_span(sg));
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	1084	cpumask_set_cpu(cpu, group_balance_mask(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1085	}
				1086
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1087	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1088	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	1089	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1090
				1091	return sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1092	}
				1093
				1094	/*
				1095	* build_sched_groups will build a circular linked list of the groups
Valentin Schneider	d874323	2019-04-09 18:35:46 +0100	[diff] [blame]	1096	* covered by the given span, will set each group's ->cpumask correctly,
				1097	* and will initialize their ->sgc.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1098	*
				1099	* Assumes the sched_domain tree is fully constructed
				1100	*/
				1101	static int
				1102	build_sched_groups(struct sched_domain *sd, int cpu)
				1103	{
				1104	struct sched_group first = NULL, last = NULL;
				1105	struct sd_data *sdd = sd->private;
				1106	const struct cpumask *span = sched_domain_span(sd);
				1107	struct cpumask *covered;
				1108	int i;
				1109
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1110	lockdep_assert_held(&sched_domains_mutex);
				1111	covered = sched_domains_tmpmask;
				1112
				1113	cpumask_clear(covered);
				1114
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1115	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1116	struct sched_group *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1117
				1118	if (cpumask_test_cpu(i, covered))
				1119	continue;
				1120
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1121	sg = get_group(i, sdd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1122
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1123	cpumask_or(covered, covered, sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1124
				1125	if (!first)
				1126	first = sg;
				1127	if (last)
				1128	last->next = sg;
				1129	last = sg;
				1130	}
				1131	last->next = first;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1132	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1133
				1134	return 0;
				1135	}
				1136
				1137	/*
				1138	* Initialize sched groups cpu_capacity.
				1139	*
				1140	* cpu_capacity indicates the capacity of sched group, which is used while
				1141	* distributing the load between different sched groups in a sched domain.
				1142	* Typically cpu_capacity for all the groups in a sched domain will be same
				1143	* unless there are asymmetries in the topology. If there are asymmetries,
				1144	* group having more cpu_capacity will pickup more load compared to the
				1145	* group having less cpu_capacity.
				1146	*/
				1147	static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
				1148	{
				1149	struct sched_group *sg = sd->groups;
				1150
				1151	WARN_ON(!sg);
				1152
				1153	do {
				1154	int cpu, max_cpu = -1;
				1155
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1156	sg->group_weight = cpumask_weight(sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1157
				1158	if (!(sd->flags & SD_ASYM_PACKING))
				1159	goto next;
				1160
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1161	for_each_cpu(cpu, sched_group_span(sg)) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1162	if (max_cpu < 0)
				1163	max_cpu = cpu;
				1164	else if (sched_asym_prefer(cpu, max_cpu))
				1165	max_cpu = cpu;
				1166	}
				1167	sg->asym_prefer_cpu = max_cpu;
				1168
				1169	next:
				1170	sg = sg->next;
				1171	} while (sg != sd->groups);
				1172
				1173	if (cpu != group_balance_cpu(sg))
				1174	return;
				1175
				1176	update_group_capacity(sd, cpu);
				1177	}
				1178
				1179	/*
				1180	* Initializers for schedule domains
				1181	* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
				1182	*/
				1183
				1184	static int default_relax_domain_level = -1;
				1185	int sched_domain_level_max;
				1186
				1187	static int __init setup_relax_domain_level(char *str)
				1188	{
				1189	if (kstrtoint(str, 0, &default_relax_domain_level))
				1190	pr_warn("Unable to set relax_domain_level\n");
				1191
				1192	return 1;
				1193	}
				1194	__setup("relax_domain_level=", setup_relax_domain_level);
				1195
				1196	static void set_domain_attribute(struct sched_domain *sd,
				1197	struct sched_domain_attr *attr)
				1198	{
				1199	int request;
				1200
				1201	if (!attr \|\| attr->relax_domain_level < 0) {
				1202	if (default_relax_domain_level < 0)
				1203	return;
				1204	else
				1205	request = default_relax_domain_level;
				1206	} else
				1207	request = attr->relax_domain_level;
				1208	if (request < sd->level) {
				1209	/* Turn off idle balance on this domain: */
				1210	sd->flags &= ~(SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
				1211	} else {
				1212	/* Turn on idle balance on this domain: */
				1213	sd->flags \|= (SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
				1214	}
				1215	}
				1216
				1217	static void __sdt_free(const struct cpumask *cpu_map);
				1218	static int __sdt_alloc(const struct cpumask *cpu_map);
				1219
				1220	static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				1221	const struct cpumask *cpu_map)
				1222	{
				1223	switch (what) {
				1224	case sa_rootdomain:
				1225	if (!atomic_read(&d->rd->refcount))
				1226	free_rootdomain(&d->rd->rcu);
				1227	/* Fall through */
				1228	case sa_sd:
				1229	free_percpu(d->sd);
				1230	/* Fall through */
				1231	case sa_sd_storage:
				1232	__sdt_free(cpu_map);
				1233	/* Fall through */
				1234	case sa_none:
				1235	break;
				1236	}
				1237	}
				1238
				1239	static enum s_alloc
				1240	__visit_domain_allocation_hell(struct s_data d, const struct cpumask cpu_map)
				1241	{
				1242	memset(d, 0, sizeof(*d));
				1243
				1244	if (__sdt_alloc(cpu_map))
				1245	return sa_sd_storage;
				1246	d->sd = alloc_percpu(struct sched_domain *);
				1247	if (!d->sd)
				1248	return sa_sd_storage;
				1249	d->rd = alloc_rootdomain();
				1250	if (!d->rd)
				1251	return sa_sd;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1252
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1253	return sa_rootdomain;
				1254	}
				1255
				1256	/*
				1257	* NULL the sd_data elements we've used to build the sched_domain and
				1258	* sched_group structure so that the subsequent __free_domain_allocs()
				1259	* will not free the data we're using.
				1260	*/
				1261	static void claim_allocations(int cpu, struct sched_domain *sd)
				1262	{
				1263	struct sd_data *sdd = sd->private;
				1264
				1265	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
				1266	*per_cpu_ptr(sdd->sd, cpu) = NULL;
				1267
				1268	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
				1269	*per_cpu_ptr(sdd->sds, cpu) = NULL;
				1270
				1271	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
				1272	*per_cpu_ptr(sdd->sg, cpu) = NULL;
				1273
				1274	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
				1275	*per_cpu_ptr(sdd->sgc, cpu) = NULL;
				1276	}
				1277
				1278	#ifdef CONFIG_NUMA
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1279	enum numa_topology_type sched_numa_topology_type;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1280
				1281	static int sched_domains_numa_levels;
				1282	static int sched_domains_curr_level;
				1283
				1284	int sched_max_numa_distance;
				1285	static int *sched_domains_numa_distance;
				1286	static struct cpumask ***sched_domains_numa_masks;
Matt Fleming	a55c745	2019-08-08 20:53:01 +0100	[diff] [blame]	1287	int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1288	#endif
				1289
				1290	/*
				1291	* SD_flags allowed in topology descriptions.
				1292	*
				1293	* These flags are purely descriptive of the topology and do not prescribe
				1294	* behaviour. Behaviour is artificial and mapped in the below sd_init()
				1295	* function:
				1296	*
				1297	* SD_SHARE_CPUCAPACITY - describes SMT topologies
				1298	* SD_SHARE_PKG_RESOURCES - describes shared caches
				1299	* SD_NUMA - describes NUMA topologies
				1300	* SD_SHARE_POWERDOMAIN - describes shared power domain
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1301	*
				1302	* Odd one out, which beside describing the topology has a quirk also
				1303	* prescribes the desired behaviour that goes along with it:
				1304	*
				1305	* SD_ASYM_PACKING - describes SMT quirks
				1306	*/
				1307	#define TOPOLOGY_SD_FLAGS \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1308	(SD_SHARE_CPUCAPACITY \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1309	SD_SHARE_PKG_RESOURCES \| \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1310	SD_NUMA \| \
				1311	SD_ASYM_PACKING \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1312	SD_SHARE_POWERDOMAIN)
				1313
				1314	static struct sched_domain *
				1315	sd_init(struct sched_domain_topology_level *tl,
				1316	const struct cpumask *cpu_map,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1317	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1318	{
				1319	struct sd_data *sdd = &tl->data;
				1320	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1321	int sd_id, sd_weight, sd_flags = 0;
				1322
				1323	#ifdef CONFIG_NUMA
				1324	/*
				1325	* Ugly hack to pass state to sd_numa_mask()...
				1326	*/
				1327	sched_domains_curr_level = tl->numa_level;
				1328	#endif
				1329
				1330	sd_weight = cpumask_weight(tl->mask(cpu));
				1331
				1332	if (tl->sd_flags)
				1333	sd_flags = (*tl->sd_flags)();
				1334	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
				1335	"wrong sd_flags in topology description\n"))
				1336	sd_flags &= ~TOPOLOGY_SD_FLAGS;
				1337
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1338	/* Apply detected topology flags */
				1339	sd_flags \|= dflags;
				1340
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1341	*sd = (struct sched_domain){
				1342	.min_interval = sd_weight,
				1343	.max_interval = 2*sd_weight,
				1344	.busy_factor = 32,
				1345	.imbalance_pct = 125,
				1346
				1347	.cache_nice_tries = 0,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1348
				1349	.flags = 1*SD_LOAD_BALANCE
				1350	\| 1*SD_BALANCE_NEWIDLE
				1351	\| 1*SD_BALANCE_EXEC
				1352	\| 1*SD_BALANCE_FORK
				1353	\| 0*SD_BALANCE_WAKE
				1354	\| 1*SD_WAKE_AFFINE
				1355	\| 0*SD_SHARE_CPUCAPACITY
				1356	\| 0*SD_SHARE_PKG_RESOURCES
				1357	\| 0*SD_SERIALIZE
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1358	\| 1*SD_PREFER_SIBLING
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1359	\| 0*SD_NUMA
				1360	\| sd_flags
				1361	,
				1362
				1363	.last_balance = jiffies,
				1364	.balance_interval = sd_weight,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1365	.max_newidle_lb_cost = 0,
				1366	.next_decay_max_lb_cost = jiffies,
				1367	.child = child,
				1368	#ifdef CONFIG_SCHED_DEBUG
				1369	.name = tl->name,
				1370	#endif
				1371	};
				1372
				1373	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
				1374	sd_id = cpumask_first(sched_domain_span(sd));
				1375
				1376	/*
				1377	* Convert topological properties into behaviour.
				1378	*/
				1379
				1380	if (sd->flags & SD_ASYM_CPUCAPACITY) {
				1381	struct sched_domain *t = sd;
				1382
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1383	/*
				1384	* Don't attempt to spread across CPUs of different capacities.
				1385	*/
				1386	if (sd->child)
				1387	sd->child->flags &= ~SD_PREFER_SIBLING;
				1388
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1389	for_each_lower_domain(t)
				1390	t->flags \|= SD_BALANCE_WAKE;
				1391	}
				1392
				1393	if (sd->flags & SD_SHARE_CPUCAPACITY) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1394	sd->imbalance_pct = 110;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1395
				1396	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1397	sd->imbalance_pct = 117;
				1398	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1399
				1400	#ifdef CONFIG_NUMA
				1401	} else if (sd->flags & SD_NUMA) {
				1402	sd->cache_nice_tries = 2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1403
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1404	sd->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1405	sd->flags \|= SD_SERIALIZE;
Matt Fleming	a55c745	2019-08-08 20:53:01 +0100	[diff] [blame]	1406	if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1407	sd->flags &= ~(SD_BALANCE_EXEC \|
				1408	SD_BALANCE_FORK \|
				1409	SD_WAKE_AFFINE);
				1410	}
				1411
				1412	#endif
				1413	} else {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1414	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1415	}
				1416
				1417	/*
				1418	* For all levels sharing cache; connect a sched_domain_shared
				1419	* instance.
				1420	*/
				1421	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1422	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
				1423	atomic_inc(&sd->shared->ref);
				1424	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
				1425	}
				1426
				1427	sd->private = sdd;
				1428
				1429	return sd;
				1430	}
				1431
				1432	/*
				1433	* Topology list, bottom-up.
				1434	*/
				1435	static struct sched_domain_topology_level default_topology[] = {
				1436	#ifdef CONFIG_SCHED_SMT
				1437	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
				1438	#endif
				1439	#ifdef CONFIG_SCHED_MC
				1440	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
				1441	#endif
				1442	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
				1443	{ NULL, },
				1444	};
				1445
				1446	static struct sched_domain_topology_level *sched_domain_topology =
				1447	default_topology;
				1448
				1449	#define for_each_sd_topology(tl) \
				1450	for (tl = sched_domain_topology; tl->mask; tl++)
				1451
				1452	void set_sched_topology(struct sched_domain_topology_level *tl)
				1453	{
				1454	if (WARN_ON_ONCE(sched_smp_initialized))
				1455	return;
				1456
				1457	sched_domain_topology = tl;
				1458	}
				1459
				1460	#ifdef CONFIG_NUMA
				1461
				1462	static const struct cpumask *sd_numa_mask(int cpu)
				1463	{
				1464	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
				1465	}
				1466
				1467	static void sched_numa_warn(const char *str)
				1468	{
				1469	static int done = false;
				1470	int i,j;
				1471
				1472	if (done)
				1473	return;
				1474
				1475	done = true;
				1476
				1477	printk(KERN_WARNING "ERROR: %s\n\n", str);
				1478
				1479	for (i = 0; i < nr_node_ids; i++) {
				1480	printk(KERN_WARNING " ");
				1481	for (j = 0; j < nr_node_ids; j++)
				1482	printk(KERN_CONT "%02d ", node_distance(i,j));
				1483	printk(KERN_CONT "\n");
				1484	}
				1485	printk(KERN_WARNING "\n");
				1486	}
				1487
				1488	bool find_numa_distance(int distance)
				1489	{
				1490	int i;
				1491
				1492	if (distance == node_distance(0, 0))
				1493	return true;
				1494
				1495	for (i = 0; i < sched_domains_numa_levels; i++) {
				1496	if (sched_domains_numa_distance[i] == distance)
				1497	return true;
				1498	}
				1499
				1500	return false;
				1501	}
				1502
				1503	/*
				1504	* A system can have three types of NUMA topology:
				1505	* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
				1506	* NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
				1507	* NUMA_BACKPLANE: nodes can reach other nodes through a backplane
				1508	*
				1509	* The difference between a glueless mesh topology and a backplane
				1510	* topology lies in whether communication between not directly
				1511	* connected nodes goes through intermediary nodes (where programs
				1512	* could run), or through backplane controllers. This affects
				1513	* placement of programs.
				1514	*
				1515	* The type of topology can be discerned with the following tests:
				1516	* - If the maximum distance between any nodes is 1 hop, the system
				1517	* is directly connected.
				1518	* - If for two nodes A and B, located N > 1 hops away from each other,
				1519	* there is an intermediary node C, which is < N hops away from both
				1520	* nodes A and B, the system is a glueless mesh.
				1521	*/
				1522	static void init_numa_topology_type(void)
				1523	{
				1524	int a, b, c, n;
				1525
				1526	n = sched_max_numa_distance;
				1527
Srikar Dronamraju	e5e96fa	2018-08-10 22:30:18 +0530	[diff] [blame]	1528	if (sched_domains_numa_levels <= 2) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1529	sched_numa_topology_type = NUMA_DIRECT;
				1530	return;
				1531	}
				1532
				1533	for_each_online_node(a) {
				1534	for_each_online_node(b) {
				1535	/* Find two nodes furthest removed from each other. */
				1536	if (node_distance(a, b) < n)
				1537	continue;
				1538
				1539	/* Is there an intermediary node between a and b? */
				1540	for_each_online_node(c) {
				1541	if (node_distance(a, c) < n &&
				1542	node_distance(b, c) < n) {
				1543	sched_numa_topology_type =
				1544	NUMA_GLUELESS_MESH;
				1545	return;
				1546	}
				1547	}
				1548
				1549	sched_numa_topology_type = NUMA_BACKPLANE;
				1550	return;
				1551	}
				1552	}
				1553	}
				1554
				1555	void sched_init_numa(void)
				1556	{
				1557	int next_distance, curr_distance = node_distance(0, 0);
				1558	struct sched_domain_topology_level *tl;
				1559	int level = 0;
				1560	int i, j, k;
				1561
Peter Zijlstra	993f0b0	2018-11-02 14:22:25 +0100	[diff] [blame]	1562	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1563	if (!sched_domains_numa_distance)
				1564	return;
				1565
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1566	/* Includes NUMA identity node at level 0. */
				1567	sched_domains_numa_distance[level++] = curr_distance;
				1568	sched_domains_numa_levels = level;
				1569
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1570	/*
				1571	* O(nr_nodes^2) deduplicating selection sort -- in order to find the
				1572	* unique distances in the node_distance() table.
				1573	*
				1574	* Assumes node_distance(0,j) includes all distances in
				1575	* node_distance(i,j) in order to avoid cubic time.
				1576	*/
				1577	next_distance = curr_distance;
				1578	for (i = 0; i < nr_node_ids; i++) {
				1579	for (j = 0; j < nr_node_ids; j++) {
				1580	for (k = 0; k < nr_node_ids; k++) {
				1581	int distance = node_distance(i, k);
				1582
				1583	if (distance > curr_distance &&
				1584	(distance < next_distance \|\|
				1585	next_distance == curr_distance))
				1586	next_distance = distance;
				1587
				1588	/*
				1589	* While not a strong assumption it would be nice to know
				1590	* about cases where if node A is connected to B, B is not
				1591	* equally connected to A.
				1592	*/
				1593	if (sched_debug() && node_distance(k, i) != distance)
				1594	sched_numa_warn("Node-distance not symmetric");
				1595
				1596	if (sched_debug() && i && !find_numa_distance(distance))
				1597	sched_numa_warn("Node-0 not representative");
				1598	}
				1599	if (next_distance != curr_distance) {
				1600	sched_domains_numa_distance[level++] = next_distance;
				1601	sched_domains_numa_levels = level;
				1602	curr_distance = next_distance;
				1603	} else break;
				1604	}
				1605
				1606	/*
				1607	* In case of sched_debug() we verify the above assumption.
				1608	*/
				1609	if (!sched_debug())
				1610	break;
				1611	}
				1612
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1613	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1614	* 'level' contains the number of unique distances
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1615	*
				1616	* The sched_domains_numa_distance[] array includes the actual distance
				1617	* numbers.
				1618	*/
				1619
				1620	/*
				1621	* Here, we should temporarily reset sched_domains_numa_levels to 0.
				1622	* If it fails to allocate memory for array sched_domains_numa_masks[][],
				1623	* the array will contain less then 'level' members. This could be
				1624	* dangerous when we use it to iterate array sched_domains_numa_masks[][]
				1625	* in other functions.
				1626	*
				1627	* We reset it to 'level' at the end of this function.
				1628	*/
				1629	sched_domains_numa_levels = 0;
				1630
				1631	sched_domains_numa_masks = kzalloc(sizeof(void ) level, GFP_KERNEL);
				1632	if (!sched_domains_numa_masks)
				1633	return;
				1634
				1635	/*
				1636	* Now for each level, construct a mask per node which contains all
				1637	* CPUs of nodes that are that many hops away from us.
				1638	*/
				1639	for (i = 0; i < level; i++) {
				1640	sched_domains_numa_masks[i] =
				1641	kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
				1642	if (!sched_domains_numa_masks[i])
				1643	return;
				1644
				1645	for (j = 0; j < nr_node_ids; j++) {
				1646	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
				1647	if (!mask)
				1648	return;
				1649
				1650	sched_domains_numa_masks[i][j] = mask;
				1651
				1652	for_each_node(k) {
				1653	if (node_distance(j, k) > sched_domains_numa_distance[i])
				1654	continue;
				1655
				1656	cpumask_or(mask, mask, cpumask_of_node(k));
				1657	}
				1658	}
				1659	}
				1660
				1661	/* Compute default topology size */
				1662	for (i = 0; sched_domain_topology[i].mask; i++);
				1663
				1664	tl = kzalloc((i + level + 1) *
				1665	sizeof(struct sched_domain_topology_level), GFP_KERNEL);
				1666	if (!tl)
				1667	return;
				1668
				1669	/*
				1670	* Copy the default topology bits..
				1671	*/
				1672	for (i = 0; sched_domain_topology[i].mask; i++)
				1673	tl[i] = sched_domain_topology[i];
				1674
				1675	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1676	* Add the NUMA identity distance, aka single NODE.
				1677	*/
				1678	tl[i++] = (struct sched_domain_topology_level){
				1679	.mask = sd_numa_mask,
				1680	.numa_level = 0,
				1681	SD_INIT_NAME(NODE)
				1682	};
				1683
				1684	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1685	* .. and append 'j' levels of NUMA goodness.
				1686	*/
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1687	for (j = 1; j < level; i++, j++) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1688	tl[i] = (struct sched_domain_topology_level){
				1689	.mask = sd_numa_mask,
				1690	.sd_flags = cpu_numa_flags,
				1691	.flags = SDTL_OVERLAP,
				1692	.numa_level = j,
				1693	SD_INIT_NAME(NUMA)
				1694	};
				1695	}
				1696
				1697	sched_domain_topology = tl;
				1698
				1699	sched_domains_numa_levels = level;
				1700	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
				1701
				1702	init_numa_topology_type();
				1703	}
				1704
				1705	void sched_domains_numa_masks_set(unsigned int cpu)
				1706	{
				1707	int node = cpu_to_node(cpu);
				1708	int i, j;
				1709
				1710	for (i = 0; i < sched_domains_numa_levels; i++) {
				1711	for (j = 0; j < nr_node_ids; j++) {
				1712	if (node_distance(j, node) <= sched_domains_numa_distance[i])
				1713	cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
				1714	}
				1715	}
				1716	}
				1717
				1718	void sched_domains_numa_masks_clear(unsigned int cpu)
				1719	{
				1720	int i, j;
				1721
				1722	for (i = 0; i < sched_domains_numa_levels; i++) {
				1723	for (j = 0; j < nr_node_ids; j++)
				1724	cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
				1725	}
				1726	}
				1727
Wanpeng Li	e0e8d49	2019-06-28 16:51:41 +0800	[diff] [blame]	1728	/*
				1729	* sched_numa_find_closest() - given the NUMA topology, find the cpu
				1730	* closest to @cpu from @cpumask.
				1731	* cpumask: cpumask to find a cpu from
				1732	* cpu: cpu to be close to
				1733	*
				1734	* returns: cpu, or nr_cpu_ids when nothing found.
				1735	*/
				1736	int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
				1737	{
				1738	int i, j = cpu_to_node(cpu);
				1739
				1740	for (i = 0; i < sched_domains_numa_levels; i++) {
				1741	cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
				1742	if (cpu < nr_cpu_ids)
				1743	return cpu;
				1744	}
				1745	return nr_cpu_ids;
				1746	}
				1747
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1748	#endif /* CONFIG_NUMA */
				1749
				1750	static int __sdt_alloc(const struct cpumask *cpu_map)
				1751	{
				1752	struct sched_domain_topology_level *tl;
				1753	int j;
				1754
				1755	for_each_sd_topology(tl) {
				1756	struct sd_data *sdd = &tl->data;
				1757
				1758	sdd->sd = alloc_percpu(struct sched_domain *);
				1759	if (!sdd->sd)
				1760	return -ENOMEM;
				1761
				1762	sdd->sds = alloc_percpu(struct sched_domain_shared *);
				1763	if (!sdd->sds)
				1764	return -ENOMEM;
				1765
				1766	sdd->sg = alloc_percpu(struct sched_group *);
				1767	if (!sdd->sg)
				1768	return -ENOMEM;
				1769
				1770	sdd->sgc = alloc_percpu(struct sched_group_capacity *);
				1771	if (!sdd->sgc)
				1772	return -ENOMEM;
				1773
				1774	for_each_cpu(j, cpu_map) {
				1775	struct sched_domain *sd;
				1776	struct sched_domain_shared *sds;
				1777	struct sched_group *sg;
				1778	struct sched_group_capacity *sgc;
				1779
				1780	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
				1781	GFP_KERNEL, cpu_to_node(j));
				1782	if (!sd)
				1783	return -ENOMEM;
				1784
				1785	*per_cpu_ptr(sdd->sd, j) = sd;
				1786
				1787	sds = kzalloc_node(sizeof(struct sched_domain_shared),
				1788	GFP_KERNEL, cpu_to_node(j));
				1789	if (!sds)
				1790	return -ENOMEM;
				1791
				1792	*per_cpu_ptr(sdd->sds, j) = sds;
				1793
				1794	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				1795	GFP_KERNEL, cpu_to_node(j));
				1796	if (!sg)
				1797	return -ENOMEM;
				1798
				1799	sg->next = sg;
				1800
				1801	*per_cpu_ptr(sdd->sg, j) = sg;
				1802
				1803	sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
				1804	GFP_KERNEL, cpu_to_node(j));
				1805	if (!sgc)
				1806	return -ENOMEM;
				1807
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	1808	#ifdef CONFIG_SCHED_DEBUG
				1809	sgc->id = j;
				1810	#endif
				1811
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1812	*per_cpu_ptr(sdd->sgc, j) = sgc;
				1813	}
				1814	}
				1815
				1816	return 0;
				1817	}
				1818
				1819	static void __sdt_free(const struct cpumask *cpu_map)
				1820	{
				1821	struct sched_domain_topology_level *tl;
				1822	int j;
				1823
				1824	for_each_sd_topology(tl) {
				1825	struct sd_data *sdd = &tl->data;
				1826
				1827	for_each_cpu(j, cpu_map) {
				1828	struct sched_domain *sd;
				1829
				1830	if (sdd->sd) {
				1831	sd = *per_cpu_ptr(sdd->sd, j);
				1832	if (sd && (sd->flags & SD_OVERLAP))
				1833	free_sched_groups(sd->groups, 0);
				1834	kfree(*per_cpu_ptr(sdd->sd, j));
				1835	}
				1836
				1837	if (sdd->sds)
				1838	kfree(*per_cpu_ptr(sdd->sds, j));
				1839	if (sdd->sg)
				1840	kfree(*per_cpu_ptr(sdd->sg, j));
				1841	if (sdd->sgc)
				1842	kfree(*per_cpu_ptr(sdd->sgc, j));
				1843	}
				1844	free_percpu(sdd->sd);
				1845	sdd->sd = NULL;
				1846	free_percpu(sdd->sds);
				1847	sdd->sds = NULL;
				1848	free_percpu(sdd->sg);
				1849	sdd->sg = NULL;
				1850	free_percpu(sdd->sgc);
				1851	sdd->sgc = NULL;
				1852	}
				1853	}
				1854
Viresh Kumar	181a80d1	2017-04-27 13:58:59 +0530	[diff] [blame]	1855	static struct sched_domain build_sched_domain(struct sched_domain_topology_level tl,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1856	const struct cpumask cpu_map, struct sched_domain_attr attr,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1857	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1858	{
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1859	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1860
				1861	if (child) {
				1862	sd->level = child->level + 1;
				1863	sched_domain_level_max = max(sched_domain_level_max, sd->level);
				1864	child->parent = sd;
				1865
				1866	if (!cpumask_subset(sched_domain_span(child),
				1867	sched_domain_span(sd))) {
				1868	pr_err("BUG: arch topology borken\n");
				1869	#ifdef CONFIG_SCHED_DEBUG
				1870	pr_err(" the %s domain not a subset of the %s domain\n",
				1871	child->name, sd->name);
				1872	#endif
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1873	/* Fixup, ensure @sd has at least @child CPUs. */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1874	cpumask_or(sched_domain_span(sd),
				1875	sched_domain_span(sd),
				1876	sched_domain_span(child));
				1877	}
				1878
				1879	}
				1880	set_domain_attribute(sd, attr);
				1881
				1882	return sd;
				1883	}
				1884
				1885	/*
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1886	* Find the sched_domain_topology_level where all CPU capacities are visible
				1887	* for all CPUs.
				1888	*/
				1889	static struct sched_domain_topology_level
				1890	asym_cpu_capacity_level(const struct cpumask cpu_map)
				1891	{
				1892	int i, j, asym_level = 0;
				1893	bool asym = false;
				1894	struct sched_domain_topology_level tl, asym_tl = NULL;
				1895	unsigned long cap;
				1896
				1897	/* Is there any asymmetry? */
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1898	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1899
				1900	for_each_cpu(i, cpu_map) {
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1901	if (arch_scale_cpu_capacity(i) != cap) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1902	asym = true;
				1903	break;
				1904	}
				1905	}
				1906
				1907	if (!asym)
				1908	return NULL;
				1909
				1910	/*
				1911	* Examine topology from all CPU's point of views to detect the lowest
				1912	* sched_domain_topology_level where a highest capacity CPU is visible
				1913	* to everyone.
				1914	*/
				1915	for_each_cpu(i, cpu_map) {
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1916	unsigned long max_capacity = arch_scale_cpu_capacity(i);
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1917	int tl_id = 0;
				1918
				1919	for_each_sd_topology(tl) {
				1920	if (tl_id < asym_level)
				1921	goto next_level;
				1922
				1923	for_each_cpu_and(j, tl->mask(i), cpu_map) {
				1924	unsigned long capacity;
				1925
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1926	capacity = arch_scale_cpu_capacity(j);
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1927
				1928	if (capacity <= max_capacity)
				1929	continue;
				1930
				1931	max_capacity = capacity;
				1932	asym_level = tl_id;
				1933	asym_tl = tl;
				1934	}
				1935	next_level:
				1936	tl_id++;
				1937	}
				1938	}
				1939
				1940	return asym_tl;
				1941	}
				1942
				1943
				1944	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1945	* Build sched domains for a given set of CPUs and attach the sched domains
				1946	* to the individual CPUs
				1947	*/
				1948	static int
				1949	build_sched_domains(const struct cpumask cpu_map, struct sched_domain_attr attr)
				1950	{
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	1951	enum s_alloc alloc_state = sa_none;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1952	struct sched_domain *sd;
				1953	struct s_data d;
				1954	struct rq *rq = NULL;
				1955	int i, ret = -ENOMEM;
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1956	struct sched_domain_topology_level *tl_asym;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1957	bool has_asym = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1958
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	1959	if (WARN_ON(cpumask_empty(cpu_map)))
				1960	goto error;
				1961
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1962	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
				1963	if (alloc_state != sa_rootdomain)
				1964	goto error;
				1965
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1966	tl_asym = asym_cpu_capacity_level(cpu_map);
				1967
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1968	/* Set up domains for CPUs specified by the cpu_map: */
				1969	for_each_cpu(i, cpu_map) {
				1970	struct sched_domain_topology_level *tl;
				1971
				1972	sd = NULL;
				1973	for_each_sd_topology(tl) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1974	int dflags = 0;
				1975
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1976	if (tl == tl_asym) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1977	dflags \|= SD_ASYM_CPUCAPACITY;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1978	has_asym = true;
				1979	}
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1980
				1981	sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
				1982
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1983	if (tl == sched_domain_topology)
				1984	*per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstra	af85596	2017-04-26 17:36:41 +0200	[diff] [blame]	1985	if (tl->flags & SDTL_OVERLAP)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1986	sd->flags \|= SD_OVERLAP;
				1987	if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				1988	break;
				1989	}
				1990	}
				1991
				1992	/* Build the groups for the domains */
				1993	for_each_cpu(i, cpu_map) {
				1994	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				1995	sd->span_weight = cpumask_weight(sched_domain_span(sd));
				1996	if (sd->flags & SD_OVERLAP) {
				1997	if (build_overlap_sched_groups(sd, i))
				1998	goto error;
				1999	} else {
				2000	if (build_sched_groups(sd, i))
				2001	goto error;
				2002	}
				2003	}
				2004	}
				2005
				2006	/* Calculate CPU capacity for physical packages and nodes */
				2007	for (i = nr_cpumask_bits-1; i >= 0; i--) {
				2008	if (!cpumask_test_cpu(i, cpu_map))
				2009	continue;
				2010
				2011	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2012	claim_allocations(i, sd);
				2013	init_sched_groups_capacity(i, sd);
				2014	}
				2015	}
				2016
				2017	/* Attach the domains */
				2018	rcu_read_lock();
				2019	for_each_cpu(i, cpu_map) {
				2020	rq = cpu_rq(i);
				2021	sd = *per_cpu_ptr(d.sd, i);
				2022
				2023	/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
				2024	if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
				2025	WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
				2026
				2027	cpu_attach_domain(sd, d.rd, i);
				2028	}
				2029	rcu_read_unlock();
				2030
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2031	if (has_asym)
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame^]	2032	static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2033
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2034	if (rq && sched_debug_enabled) {
Juri Lelli	bf5015a	2018-05-24 17:29:36 +0200	[diff] [blame]	2035	pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2036	cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
				2037	}
				2038
				2039	ret = 0;
				2040	error:
				2041	__free_domain_allocs(&d, alloc_state, cpu_map);
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2042
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2043	return ret;
				2044	}
				2045
				2046	/* Current sched domains: */
				2047	static cpumask_var_t *doms_cur;
				2048
				2049	/* Number of sched domains in 'doms_cur': */
				2050	static int ndoms_cur;
				2051
				2052	/* Attribues of custom domains in 'doms_cur' */
				2053	static struct sched_domain_attr *dattr_cur;
				2054
				2055	/*
				2056	* Special case: If a kmalloc() of a doms_cur partition (array of
				2057	* cpumask) fails, then fallback to a single sched domain,
				2058	* as determined by the single cpumask fallback_doms.
				2059	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2060	static cpumask_var_t fallback_doms;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2061
				2062	/*
				2063	* arch_update_cpu_topology lets virtualized architectures update the
				2064	* CPU core maps. It is supposed to return 1 if the topology changed
				2065	* or 0 if it stayed the same.
				2066	*/
				2067	int __weak arch_update_cpu_topology(void)
				2068	{
				2069	return 0;
				2070	}
				2071
				2072	cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
				2073	{
				2074	int i;
				2075	cpumask_var_t *doms;
				2076
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	2077	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2078	if (!doms)
				2079	return NULL;
				2080	for (i = 0; i < ndoms; i++) {
				2081	if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
				2082	free_sched_domains(doms, i);
				2083	return NULL;
				2084	}
				2085	}
				2086	return doms;
				2087	}
				2088
				2089	void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
				2090	{
				2091	unsigned int i;
				2092	for (i = 0; i < ndoms; i++)
				2093	free_cpumask_var(doms[i]);
				2094	kfree(doms);
				2095	}
				2096
				2097	/*
Juri Lelli	cb0c041	2018-12-19 14:34:45 +0100	[diff] [blame]	2098	* Set up scheduler domains and groups. For now this just excludes isolated
				2099	* CPUs, but could be used to exclude other special cases in the future.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2100	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2101	int sched_init_domains(const struct cpumask *cpu_map)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2102	{
				2103	int err;
				2104
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2105	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	2106	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2107	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
				2108
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2109	arch_update_cpu_topology();
				2110	ndoms_cur = 1;
				2111	doms_cur = alloc_sched_domains(ndoms_cur);
				2112	if (!doms_cur)
				2113	doms_cur = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2114	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2115	err = build_sched_domains(doms_cur[0], NULL);
				2116	register_sched_domain_sysctl();
				2117
				2118	return err;
				2119	}
				2120
				2121	/*
				2122	* Detach sched domains from a group of CPUs specified in cpu_map
				2123	* These CPUs will now be attached to the NULL domain
				2124	*/
				2125	static void detach_destroy_domains(const struct cpumask *cpu_map)
				2126	{
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame^]	2127	unsigned int cpu = cpumask_any(cpu_map);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2128	int i;
				2129
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame^]	2130	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
				2131	static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
				2132
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2133	rcu_read_lock();
				2134	for_each_cpu(i, cpu_map)
				2135	cpu_attach_domain(NULL, &def_root_domain, i);
				2136	rcu_read_unlock();
				2137	}
				2138
				2139	/* handle null as "default" */
				2140	static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
				2141	struct sched_domain_attr *new, int idx_new)
				2142	{
				2143	struct sched_domain_attr tmp;
				2144
				2145	/* Fast path: */
				2146	if (!new && !cur)
				2147	return 1;
				2148
				2149	tmp = SD_ATTR_INIT;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2150
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2151	return !memcmp(cur ? (cur + idx_cur) : &tmp,
				2152	new ? (new + idx_new) : &tmp,
				2153	sizeof(struct sched_domain_attr));
				2154	}
				2155
				2156	/*
				2157	* Partition sched domains as specified by the 'ndoms_new'
				2158	* cpumasks in the array doms_new[] of cpumasks. This compares
				2159	* doms_new[] to the current sched domain partitioning, doms_cur[].
				2160	* It destroys each deleted domain and builds each new domain.
				2161	*
				2162	* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
				2163	* The masks don't intersect (don't overlap.) We should setup one
				2164	* sched domain for each mask. CPUs not in any of the cpumasks will
				2165	* not be load balanced. If the same cpumask appears both in the
				2166	* current 'doms_cur' domains and in the new 'doms_new', we can leave
				2167	* it as it is.
				2168	*
				2169	* The passed in 'doms_new' should be allocated using
				2170	* alloc_sched_domains. This routine takes ownership of it and will
				2171	* free_sched_domains it when done with it. If the caller failed the
				2172	* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
				2173	* and partition_sched_domains() will fallback to the single partition
				2174	* 'fallback_doms', it also forces the domains to be rebuilt.
				2175	*
				2176	* If doms_new == NULL it will be replaced with cpu_online_mask.
				2177	* ndoms_new == 0 is a special case for destroying existing domains,
				2178	* and it will not create the default domain.
				2179	*
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2180	* Call with hotplug lock and sched_domains_mutex held
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2181	*/
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2182	void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				2183	struct sched_domain_attr *dattr_new)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2184	{
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2185	bool __maybe_unused has_eas = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2186	int i, j, n;
				2187	int new_topology;
				2188
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2189	lockdep_assert_held(&sched_domains_mutex);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2190
				2191	/* Always unregister in case we don't destroy any domains: */
				2192	unregister_sched_domain_sysctl();
				2193
				2194	/* Let the architecture update CPU core mappings: */
				2195	new_topology = arch_update_cpu_topology();
				2196
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2197	if (!doms_new) {
				2198	WARN_ON_ONCE(dattr_new);
				2199	n = 0;
				2200	doms_new = alloc_sched_domains(1);
				2201	if (doms_new) {
				2202	n = 1;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2203	cpumask_and(doms_new[0], cpu_active_mask,
				2204	housekeeping_cpumask(HK_FLAG_DOMAIN));
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2205	}
				2206	} else {
				2207	n = ndoms_new;
				2208	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2209
				2210	/* Destroy deleted domains: */
				2211	for (i = 0; i < ndoms_cur; i++) {
				2212	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2213	if (cpumask_equal(doms_cur[i], doms_new[j]) &&
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2214	dattrs_equal(dattr_cur, i, dattr_new, j)) {
				2215	struct root_domain *rd;
				2216
				2217	/*
				2218	* This domain won't be destroyed and as such
				2219	* its dl_bw->total_bw needs to be cleared. It
				2220	* will be recomputed in function
				2221	* update_tasks_root_domain().
				2222	*/
				2223	rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				2224	dl_clear_root_domain(rd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2225	goto match1;
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2226	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2227	}
				2228	/* No match - a current sched domain not in new doms_new[] */
				2229	detach_destroy_domains(doms_cur[i]);
				2230	match1:
				2231	;
				2232	}
				2233
				2234	n = ndoms_cur;
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2235	if (!doms_new) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2236	n = 0;
				2237	doms_new = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2238	cpumask_and(doms_new[0], cpu_active_mask,
				2239	housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2240	}
				2241
				2242	/* Build new domains: */
				2243	for (i = 0; i < ndoms_new; i++) {
				2244	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2245	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2246	dattrs_equal(dattr_new, i, dattr_cur, j))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2247	goto match2;
				2248	}
				2249	/* No match - add a new doms_new */
				2250	build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
				2251	match2:
				2252	;
				2253	}
				2254
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2255	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2256	/* Build perf. domains: */
				2257	for (i = 0; i < ndoms_new; i++) {
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2258	for (j = 0; j < n && !sched_energy_update; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2259	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2260	cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				2261	has_eas = true;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2262	goto match3;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2263	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2264	}
				2265	/* No match - add perf. domains for a new rd */
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2266	has_eas \|= build_perf_domains(doms_new[i]);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2267	match3:
				2268	;
				2269	}
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2270	sched_energy_set(has_eas);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2271	#endif
				2272
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2273	/* Remember the new sched domains: */
				2274	if (doms_cur != &fallback_doms)
				2275	free_sched_domains(doms_cur, ndoms_cur);
				2276
				2277	kfree(dattr_cur);
				2278	doms_cur = doms_new;
				2279	dattr_cur = dattr_new;
				2280	ndoms_cur = ndoms_new;
				2281
				2282	register_sched_domain_sysctl();
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2283	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2284
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2285	/*
				2286	* Call with hotplug lock held
				2287	*/
				2288	void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				2289	struct sched_domain_attr *dattr_new)
				2290	{
				2291	mutex_lock(&sched_domains_mutex);
				2292	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2293	mutex_unlock(&sched_domains_mutex);
				2294	}