Blame - kernel/sched/topology.c - SHIFTPHONES/mainline/linux

blob: f53f89df837d84786635a302209e45f4cc72da85 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2	/*
				3	* Scheduler topology setup/handling methods
				4	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	5	#include "sched.h"
				6
				7	DEFINE_MUTEX(sched_domains_mutex);
				8
				9	/* Protected by sched_domains_mutex: */
zhong jiang	ace8031	2018-08-03 20:37:32 +0800	[diff] [blame]	10	static cpumask_var_t sched_domains_tmpmask;
				11	static cpumask_var_t sched_domains_tmpmask2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	12
				13	#ifdef CONFIG_SCHED_DEBUG
				14
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	15	static int __init sched_debug_setup(char *str)
				16	{
Peter Zijlstra	9469eb0	2017-09-07 17:03:53 +0200	[diff] [blame]	17	sched_debug_enabled = true;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	18
				19	return 0;
				20	}
				21	early_param("sched_debug", sched_debug_setup);
				22
				23	static inline bool sched_debug(void)
				24	{
				25	return sched_debug_enabled;
				26	}
				27
				28	static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				29	struct cpumask *groupmask)
				30	{
				31	struct sched_group *group = sd->groups;
				32
				33	cpumask_clear(groupmask);
				34
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	35	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	36
				37	if (!(sd->flags & SD_LOAD_BALANCE)) {
				38	printk("does not load-balance\n");
				39	if (sd->parent)
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	40	printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	41	return -1;
				42	}
				43
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	44	printk(KERN_CONT "span=%*pbl level=%s\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	45	cpumask_pr_args(sched_domain_span(sd)), sd->name);
				46
				47	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	48	printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	49	}
Yi Wang	6cd0c58	2018-07-23 12:19:07 +0800	[diff] [blame]	50	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	51	printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	52	}
				53
				54	printk(KERN_DEBUG "%*s groups:", level + 1, "");
				55	do {
				56	if (!group) {
				57	printk("\n");
				58	printk(KERN_ERR "ERROR: group is NULL\n");
				59	break;
				60	}
				61
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	62	if (!cpumask_weight(sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	63	printk(KERN_CONT "\n");
				64	printk(KERN_ERR "ERROR: empty group\n");
				65	break;
				66	}
				67
				68	if (!(sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	69	cpumask_intersects(groupmask, sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	70	printk(KERN_CONT "\n");
				71	printk(KERN_ERR "ERROR: repeated CPUs\n");
				72	break;
				73	}
				74
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	75	cpumask_or(groupmask, groupmask, sched_group_span(group));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	76
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	77	printk(KERN_CONT " %d:{ span=%*pbl",
				78	group->sgc->id,
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	79	cpumask_pr_args(sched_group_span(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	80
Peter Zijlstra	af21812	2017-05-01 08:51:05 +0200	[diff] [blame]	81	if ((sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	82	!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	83	printk(KERN_CONT " mask=%*pbl",
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	84	cpumask_pr_args(group_balance_mask(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	85	}
				86
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	87	if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
				88	printk(KERN_CONT " cap=%lu", group->sgc->capacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	89
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	90	if (group == sd->groups && sd->child &&
				91	!cpumask_equal(sched_domain_span(sd->child),
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	92	sched_group_span(group))) {
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	93	printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
				94	}
				95
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	96	printk(KERN_CONT " }");
				97
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	98	group = group->next;
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	99
				100	if (group != sd->groups)
				101	printk(KERN_CONT ",");
				102
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	103	} while (group != sd->groups);
				104	printk(KERN_CONT "\n");
				105
				106	if (!cpumask_equal(sched_domain_span(sd), groupmask))
				107	printk(KERN_ERR "ERROR: groups don't span domain->span\n");
				108
				109	if (sd->parent &&
				110	!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	111	printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	112	return 0;
				113	}
				114
				115	static void sched_domain_debug(struct sched_domain *sd, int cpu)
				116	{
				117	int level = 0;
				118
				119	if (!sched_debug_enabled)
				120	return;
				121
				122	if (!sd) {
				123	printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
				124	return;
				125	}
				126
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	127	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	128
				129	for (;;) {
				130	if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
				131	break;
				132	level++;
				133	sd = sd->parent;
				134	if (!sd)
				135	break;
				136	}
				137	}
				138	#else /* !CONFIG_SCHED_DEBUG */
				139
				140	# define sched_debug_enabled 0
				141	# define sched_domain_debug(sd, cpu) do { } while (0)
				142	static inline bool sched_debug(void)
				143	{
				144	return false;
				145	}
				146	#endif /* CONFIG_SCHED_DEBUG */
				147
				148	static int sd_degenerate(struct sched_domain *sd)
				149	{
				150	if (cpumask_weight(sched_domain_span(sd)) == 1)
				151	return 1;
				152
				153	/* Following flags need at least 2 groups */
				154	if (sd->flags & (SD_LOAD_BALANCE \|
				155	SD_BALANCE_NEWIDLE \|
				156	SD_BALANCE_FORK \|
				157	SD_BALANCE_EXEC \|
				158	SD_SHARE_CPUCAPACITY \|
				159	SD_ASYM_CPUCAPACITY \|
				160	SD_SHARE_PKG_RESOURCES \|
				161	SD_SHARE_POWERDOMAIN)) {
				162	if (sd->groups != sd->groups->next)
				163	return 0;
				164	}
				165
				166	/* Following flags don't use groups */
				167	if (sd->flags & (SD_WAKE_AFFINE))
				168	return 0;
				169
				170	return 1;
				171	}
				172
				173	static int
				174	sd_parent_degenerate(struct sched_domain sd, struct sched_domain parent)
				175	{
				176	unsigned long cflags = sd->flags, pflags = parent->flags;
				177
				178	if (sd_degenerate(parent))
				179	return 1;
				180
				181	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
				182	return 0;
				183
				184	/* Flags needing groups don't count if only 1 group in parent */
				185	if (parent->groups == parent->groups->next) {
				186	pflags &= ~(SD_LOAD_BALANCE \|
				187	SD_BALANCE_NEWIDLE \|
				188	SD_BALANCE_FORK \|
				189	SD_BALANCE_EXEC \|
				190	SD_ASYM_CPUCAPACITY \|
				191	SD_SHARE_CPUCAPACITY \|
				192	SD_SHARE_PKG_RESOURCES \|
				193	SD_PREFER_SIBLING \|
				194	SD_SHARE_POWERDOMAIN);
				195	if (nr_node_ids == 1)
				196	pflags &= ~SD_SERIALIZE;
				197	}
				198	if (~cflags & pflags)
				199	return 0;
				200
				201	return 1;
				202	}
				203
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	204	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Peter Zijlstra	f8a696f	2018-12-05 11:23:56 +0100	[diff] [blame]	205	DEFINE_STATIC_KEY_FALSE(sched_energy_present);
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	206	unsigned int sysctl_sched_energy_aware = 1;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	207	DEFINE_MUTEX(sched_energy_mutex);
				208	bool sched_energy_update;
				209
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	210	#ifdef CONFIG_PROC_SYSCTL
				211	int sched_energy_aware_handler(struct ctl_table *table, int write,
				212	void __user buffer, size_t lenp, loff_t *ppos)
				213	{
				214	int ret, state;
				215
				216	if (write && !capable(CAP_SYS_ADMIN))
				217	return -EPERM;
				218
				219	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				220	if (!ret && write) {
				221	state = static_branch_unlikely(&sched_energy_present);
				222	if (state != sysctl_sched_energy_aware) {
				223	mutex_lock(&sched_energy_mutex);
				224	sched_energy_update = 1;
				225	rebuild_sched_domains();
				226	sched_energy_update = 0;
				227	mutex_unlock(&sched_energy_mutex);
				228	}
				229	}
				230
				231	return ret;
				232	}
				233	#endif
				234
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	235	static void free_pd(struct perf_domain *pd)
				236	{
				237	struct perf_domain *tmp;
				238
				239	while (pd) {
				240	tmp = pd->next;
				241	kfree(pd);
				242	pd = tmp;
				243	}
				244	}
				245
				246	static struct perf_domain find_pd(struct perf_domain pd, int cpu)
				247	{
				248	while (pd) {
				249	if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
				250	return pd;
				251	pd = pd->next;
				252	}
				253
				254	return NULL;
				255	}
				256
				257	static struct perf_domain *pd_init(int cpu)
				258	{
				259	struct em_perf_domain *obj = em_cpu_get(cpu);
				260	struct perf_domain *pd;
				261
				262	if (!obj) {
				263	if (sched_debug())
				264	pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
				265	return NULL;
				266	}
				267
				268	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
				269	if (!pd)
				270	return NULL;
				271	pd->em_pd = obj;
				272
				273	return pd;
				274	}
				275
				276	static void perf_domain_debug(const struct cpumask *cpu_map,
				277	struct perf_domain *pd)
				278	{
				279	if (!sched_debug() \|\| !pd)
				280	return;
				281
				282	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
				283
				284	while (pd) {
				285	printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
				286	cpumask_first(perf_domain_span(pd)),
				287	cpumask_pr_args(perf_domain_span(pd)),
				288	em_pd_nr_cap_states(pd->em_pd));
				289	pd = pd->next;
				290	}
				291
				292	printk(KERN_CONT "\n");
				293	}
				294
				295	static void destroy_perf_domain_rcu(struct rcu_head *rp)
				296	{
				297	struct perf_domain *pd;
				298
				299	pd = container_of(rp, struct perf_domain, rcu);
				300	free_pd(pd);
				301	}
				302
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	303	static void sched_energy_set(bool has_eas)
				304	{
				305	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
				306	if (sched_debug())
				307	pr_info("%s: stopping EAS\n", __func__);
				308	static_branch_disable_cpuslocked(&sched_energy_present);
				309	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
				310	if (sched_debug())
				311	pr_info("%s: starting EAS\n", __func__);
				312	static_branch_enable_cpuslocked(&sched_energy_present);
				313	}
				314	}
				315
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	316	/*
				317	* EAS can be used on a root domain if it meets all the following conditions:
				318	* 1. an Energy Model (EM) is available;
				319	* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
				320	* 3. the EM complexity is low enough to keep scheduling overheads low;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	321	* 4. schedutil is driving the frequency of all CPUs of the rd;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	322	*
				323	* The complexity of the Energy Model is defined as:
				324	*
				325	* C = nr_pd * (nr_cpus + nr_cs)
				326	*
				327	* with parameters defined as:
				328	* - nr_pd: the number of performance domains
				329	* - nr_cpus: the number of CPUs
				330	* - nr_cs: the sum of the number of capacity states of all performance
				331	* domains (for example, on a system with 2 performance domains,
				332	* with 10 capacity states each, nr_cs = 2 * 10 = 20).
				333	*
				334	* It is generally not a good idea to use such a model in the wake-up path on
				335	* very complex platforms because of the associated scheduling overheads. The
				336	* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
				337	* with per-CPU DVFS and less than 8 capacity states each, for example.
				338	*/
				339	#define EM_MAX_COMPLEXITY 2048
				340
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	341	extern struct cpufreq_governor schedutil_gov;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	342	static bool build_perf_domains(const struct cpumask *cpu_map)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	343	{
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	344	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	345	struct perf_domain pd = NULL, tmp;
				346	int cpu = cpumask_first(cpu_map);
				347	struct root_domain *rd = cpu_rq(cpu)->rd;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	348	struct cpufreq_policy *policy;
				349	struct cpufreq_governor *gov;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	350
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	351	if (!sysctl_sched_energy_aware)
				352	goto free;
				353
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	354	/* EAS is enabled for asymmetric CPU capacity topologies. */
				355	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
				356	if (sched_debug()) {
				357	pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				358	cpumask_pr_args(cpu_map));
				359	}
				360	goto free;
				361	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	362
				363	for_each_cpu(i, cpu_map) {
				364	/* Skip already covered CPUs. */
				365	if (find_pd(pd, i))
				366	continue;
				367
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	368	/* Do not attempt EAS if schedutil is not being used. */
				369	policy = cpufreq_cpu_get(i);
				370	if (!policy)
				371	goto free;
				372	gov = policy->governor;
				373	cpufreq_cpu_put(policy);
				374	if (gov != &schedutil_gov) {
				375	if (rd->pd)
				376	pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
				377	cpumask_pr_args(cpu_map));
				378	goto free;
				379	}
				380
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	381	/* Create the new pd and add it to the local list. */
				382	tmp = pd_init(i);
				383	if (!tmp)
				384	goto free;
				385	tmp->next = pd;
				386	pd = tmp;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	387
				388	/*
				389	* Count performance domains and capacity states for the
				390	* complexity check.
				391	*/
				392	nr_pd++;
				393	nr_cs += em_pd_nr_cap_states(pd->em_pd);
				394	}
				395
				396	/* Bail out if the Energy Model complexity is too high. */
				397	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
				398	WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
				399	cpumask_pr_args(cpu_map));
				400	goto free;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	401	}
				402
				403	perf_domain_debug(cpu_map, pd);
				404
				405	/* Attach the new list of performance domains to the root domain. */
				406	tmp = rd->pd;
				407	rcu_assign_pointer(rd->pd, pd);
				408	if (tmp)
				409	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				410
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	411	return !!pd;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	412
				413	free:
				414	free_pd(pd);
				415	tmp = rd->pd;
				416	rcu_assign_pointer(rd->pd, NULL);
				417	if (tmp)
				418	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	419
				420	return false;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	421	}
				422	#else
				423	static void free_pd(struct perf_domain *pd) { }
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	424	#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	425
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	426	static void free_rootdomain(struct rcu_head *rcu)
				427	{
				428	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
				429
				430	cpupri_cleanup(&rd->cpupri);
				431	cpudl_cleanup(&rd->cpudl);
				432	free_cpumask_var(rd->dlo_mask);
				433	free_cpumask_var(rd->rto_mask);
				434	free_cpumask_var(rd->online);
				435	free_cpumask_var(rd->span);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	436	free_pd(rd->pd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	437	kfree(rd);
				438	}
				439
				440	void rq_attach_root(struct rq rq, struct root_domain rd)
				441	{
				442	struct root_domain *old_rd = NULL;
				443	unsigned long flags;
				444
				445	raw_spin_lock_irqsave(&rq->lock, flags);
				446
				447	if (rq->rd) {
				448	old_rd = rq->rd;
				449
				450	if (cpumask_test_cpu(rq->cpu, old_rd->online))
				451	set_rq_offline(rq);
				452
				453	cpumask_clear_cpu(rq->cpu, old_rd->span);
				454
				455	/*
				456	* If we dont want to free the old_rd yet then
				457	* set old_rd to NULL to skip the freeing later
				458	* in this function:
				459	*/
				460	if (!atomic_dec_and_test(&old_rd->refcount))
				461	old_rd = NULL;
				462	}
				463
				464	atomic_inc(&rd->refcount);
				465	rq->rd = rd;
				466
				467	cpumask_set_cpu(rq->cpu, rd->span);
				468	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
				469	set_rq_online(rq);
				470
				471	raw_spin_unlock_irqrestore(&rq->lock, flags);
				472
				473	if (old_rd)
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	474	call_rcu(&old_rd->rcu, free_rootdomain);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	475	}
				476
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	477	void sched_get_rd(struct root_domain *rd)
				478	{
				479	atomic_inc(&rd->refcount);
				480	}
				481
				482	void sched_put_rd(struct root_domain *rd)
				483	{
				484	if (!atomic_dec_and_test(&rd->refcount))
				485	return;
				486
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	487	call_rcu(&rd->rcu, free_rootdomain);
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	488	}
				489
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	490	static int init_rootdomain(struct root_domain *rd)
				491	{
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	492	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
				493	goto out;
				494	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
				495	goto free_span;
				496	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
				497	goto free_online;
				498	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
				499	goto free_dlo_mask;
				500
Steven Rostedt (Red Hat)	4bdced5	2017-10-06 14:05:04 -0400	[diff] [blame]	501	#ifdef HAVE_RT_PUSH_IPI
				502	rd->rto_cpu = -1;
				503	raw_spin_lock_init(&rd->rto_lock);
				504	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
				505	#endif
				506
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	507	init_dl_bw(&rd->dl_bw);
				508	if (cpudl_init(&rd->cpudl) != 0)
				509	goto free_rto_mask;
				510
				511	if (cpupri_init(&rd->cpupri) != 0)
				512	goto free_cpudl;
				513	return 0;
				514
				515	free_cpudl:
				516	cpudl_cleanup(&rd->cpudl);
				517	free_rto_mask:
				518	free_cpumask_var(rd->rto_mask);
				519	free_dlo_mask:
				520	free_cpumask_var(rd->dlo_mask);
				521	free_online:
				522	free_cpumask_var(rd->online);
				523	free_span:
				524	free_cpumask_var(rd->span);
				525	out:
				526	return -ENOMEM;
				527	}
				528
				529	/*
				530	* By default the system creates a single root-domain with all CPUs as
				531	* members (mimicking the global state we have today).
				532	*/
				533	struct root_domain def_root_domain;
				534
				535	void init_defrootdomain(void)
				536	{
				537	init_rootdomain(&def_root_domain);
				538
				539	atomic_set(&def_root_domain.refcount, 1);
				540	}
				541
				542	static struct root_domain *alloc_rootdomain(void)
				543	{
				544	struct root_domain *rd;
				545
Viresh Kumar	4d13a06	2017-04-13 14:45:48 +0530	[diff] [blame]	546	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	547	if (!rd)
				548	return NULL;
				549
				550	if (init_rootdomain(rd) != 0) {
				551	kfree(rd);
				552	return NULL;
				553	}
				554
				555	return rd;
				556	}
				557
				558	static void free_sched_groups(struct sched_group *sg, int free_sgc)
				559	{
				560	struct sched_group tmp, first;
				561
				562	if (!sg)
				563	return;
				564
				565	first = sg;
				566	do {
				567	tmp = sg->next;
				568
				569	if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
				570	kfree(sg->sgc);
				571
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	572	if (atomic_dec_and_test(&sg->ref))
				573	kfree(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	574	sg = tmp;
				575	} while (sg != first);
				576	}
				577
				578	static void destroy_sched_domain(struct sched_domain *sd)
				579	{
				580	/*
Peter Zijlstra	a090c4f	2017-08-21 15:42:52 +0200	[diff] [blame]	581	* A normal sched domain may have multiple group references, an
				582	* overlapping domain, having private groups, only one. Iterate,
				583	* dropping group/capacity references, freeing where none remain.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	584	*/
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	585	free_sched_groups(sd->groups, 1);
				586
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	587	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
				588	kfree(sd->shared);
				589	kfree(sd);
				590	}
				591
				592	static void destroy_sched_domains_rcu(struct rcu_head *rcu)
				593	{
				594	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
				595
				596	while (sd) {
				597	struct sched_domain *parent = sd->parent;
				598	destroy_sched_domain(sd);
				599	sd = parent;
				600	}
				601	}
				602
				603	static void destroy_sched_domains(struct sched_domain *sd)
				604	{
				605	if (sd)
				606	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
				607	}
				608
				609	/*
				610	* Keep a special pointer to the highest sched_domain that has
				611	* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
				612	* allows us to avoid some pointer chasing in select_idle_sibling().
				613	*
				614	* Also keep a unique ID per domain (we use the first CPU number in
				615	* the cpumask of the domain), this allows us to quickly tell if
				616	* two CPUs are in the same cache domain, see cpus_share_cache().
				617	*/
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	618	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	619	DEFINE_PER_CPU(int, sd_llc_size);
				620	DEFINE_PER_CPU(int, sd_llc_id);
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	621	DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
				622	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
				623	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
				624	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	625	DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	626
				627	static void update_top_cache_domain(int cpu)
				628	{
				629	struct sched_domain_shared *sds = NULL;
				630	struct sched_domain *sd;
				631	int id = cpu;
				632	int size = 1;
				633
				634	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
				635	if (sd) {
				636	id = cpumask_first(sched_domain_span(sd));
				637	size = cpumask_weight(sched_domain_span(sd));
				638	sds = sd->shared;
				639	}
				640
				641	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
				642	per_cpu(sd_llc_size, cpu) = size;
				643	per_cpu(sd_llc_id, cpu) = id;
				644	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
				645
				646	sd = lowest_flag_domain(cpu, SD_NUMA);
				647	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
				648
				649	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
Quentin Perret	011b27b	2018-12-03 09:56:19 +0000	[diff] [blame]	650	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
				651
				652	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
				653	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	654	}
				655
				656	/*
				657	* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
				658	* hold the hotplug lock.
				659	*/
				660	static void
				661	cpu_attach_domain(struct sched_domain sd, struct root_domain rd, int cpu)
				662	{
				663	struct rq *rq = cpu_rq(cpu);
				664	struct sched_domain *tmp;
				665
				666	/* Remove the sched domains which do not contribute to scheduling. */
				667	for (tmp = sd; tmp; ) {
				668	struct sched_domain *parent = tmp->parent;
				669	if (!parent)
				670	break;
				671
				672	if (sd_parent_degenerate(tmp, parent)) {
				673	tmp->parent = parent->parent;
				674	if (parent->parent)
				675	parent->parent->child = tmp;
				676	/*
				677	* Transfer SD_PREFER_SIBLING down in case of a
				678	* degenerate parent; the spans match for this
				679	* so the property transfers.
				680	*/
				681	if (parent->flags & SD_PREFER_SIBLING)
				682	tmp->flags \|= SD_PREFER_SIBLING;
				683	destroy_sched_domain(parent);
				684	} else
				685	tmp = tmp->parent;
				686	}
				687
				688	if (sd && sd_degenerate(sd)) {
				689	tmp = sd;
				690	sd = sd->parent;
				691	destroy_sched_domain(tmp);
				692	if (sd)
				693	sd->child = NULL;
				694	}
				695
				696	sched_domain_debug(sd, cpu);
				697
				698	rq_attach_root(rq, rd);
				699	tmp = rq->sd;
				700	rcu_assign_pointer(rq->sd, sd);
Peter Zijlstra	bbdacdf	2017-08-10 17:10:26 +0200	[diff] [blame]	701	dirty_sched_domain_sysctl(cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	702	destroy_sched_domains(tmp);
				703
				704	update_top_cache_domain(cpu);
				705	}
				706
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	707	struct s_data {
Luc Van Oostenryck	99687cd	2019-01-18 15:49:36 +0100	[diff] [blame]	708	struct sched_domain * __percpu *sd;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	709	struct root_domain *rd;
				710	};
				711
				712	enum s_alloc {
				713	sa_rootdomain,
				714	sa_sd,
				715	sa_sd_storage,
				716	sa_none,
				717	};
				718
				719	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	720	* Return the canonical balance CPU for this group, this is the first CPU
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	721	* of this group that's also in the balance mask.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	722	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	723	* The balance mask are all those CPUs that could actually end up at this
				724	* group. See build_balance_mask().
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	725	*
				726	* Also see should_we_balance().
				727	*/
				728	int group_balance_cpu(struct sched_group *sg)
				729	{
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	730	return cpumask_first(group_balance_mask(sg));
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	731	}
				732
				733
				734	/*
				735	* NUMA topology (first read the regular topology blurb below)
				736	*
				737	* Given a node-distance table, for example:
				738	*
				739	*   node   0   1   2   3
				740	*     0:  10  20  30  20
				741	*     1:  20  10  20  30
				742	*     2:  30  20  10  20
				743	*     3:  20  30  20  10
				744	*
				745	* which represents a 4 node ring topology like:
				746	*
				747	*   0 ----- 1
				748	*   |       |
				749	*   |       |
				750	*   |       |
				751	*   3 ----- 2
				752	*
				753	* We want to construct domains and groups to represent this. The way we go
				754	* about doing this is to build the domains on 'hops'. For each NUMA level we
				755	* construct the mask of all nodes reachable in @level hops.
				756	*
				757	* For the above NUMA topology that gives 3 levels:
				758	*
				759	* NUMA-2 0-3 0-3 0-3 0-3
				760	* groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
				761	*
				762	* NUMA-1 0-1,3 0-2 1-3 0,2-3
				763	* groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
				764	*
				765	* NUMA-0 0 1 2 3
				766	*
				767	*
				768	* As can be seen; things don't nicely line up as with the regular topology.
				769	* When we iterate a domain in child domain chunks some nodes can be
				770	* represented multiple times -- hence the "overlap" naming for this part of
				771	* the topology.
				772	*
				773	* In order to minimize this overlap, we only build enough groups to cover the
				774	* domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
				775	*
				776	* Because:
				777	*
				778	* - the first group of each domain is its child domain; this
				779	* gets us the first 0-1,3
				780	* - the only uncovered node is 2, whose child domain is 1-3.
				781	*
				782	* However, because of the overlap, computing a unique CPU for each group is
				783	* more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
				784	* groups include the CPUs of Node-0, while those CPUs would not in fact ever
				785	* end up at those groups (they would end up in group: 0-1,3).
				786	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	787	* To correct this we have to introduce the group balance mask. This mask
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	788	* will contain those CPUs in the group that can reach this group given the
				789	* (child) domain tree.
				790	*
				791	* With this we can once again compute balance_cpu and sched_group_capacity
				792	* relations.
				793	*
				794	* XXX include words on how balance_cpu is unique and therefore can be
				795	* used for sched_group_capacity links.
				796	*
				797	*
				798	* Another 'interesting' topology is:
				799	*
				800	* node 0 1 2 3
				801	* 0: 10 20 20 30
				802	* 1: 20 10 20 20
				803	* 2: 20 20 10 20
				804	* 3: 30 20 20 10
				805	*
				806	* Which looks a little like:
				807	*
				808	*      0 ----- 1
				809	*      |     / |
				810	*      |   /   |
				811	*      | /     |
				812	*      2 ----- 3
				813	*
				814	* This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
				815	* are not.
				816	*
				817	* This leads to a few particularly weird cases where the sched_domain's are
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	818	* not of the same number for each CPU. Consider:
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	819	*
				820	* NUMA-2 0-3 0-3
				821	* groups: {0-2},{1-3} {1-3},{0-2}
				822	*
				823	* NUMA-1 0-2 0-3 0-3 1-3
				824	*
				825	* NUMA-0 0 1 2 3
				826	*
				827	*/
				828
				829
				830	/*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	831	* Build the balance mask; it contains only those CPUs that can arrive at this
				832	* group and should be considered to continue balancing.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	833	*
				834	* We do this during the group creation pass, therefore the group information
				835	* isn't complete yet, however since each group represents a (child) domain we
				836	* can fully construct this using the sched_domain bits (which are already
				837	* complete).
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	838	*/
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	839	static void
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	840	build_balance_mask(struct sched_domain sd, struct sched_group sg, struct cpumask *mask)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	841	{
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	842	const struct cpumask *sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	843	struct sd_data *sdd = sd->private;
				844	struct sched_domain *sibling;
				845	int i;
				846
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	847	cpumask_clear(mask);
				848
Lauro Ramos Venancio	f32d782	2017-04-20 16:51:40 -0300	[diff] [blame]	849	for_each_cpu(i, sg_span) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	850	sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	851
				852	/*
				853	* Can happen in the asymmetric case, where these siblings are
				854	* unused. The mask will not be empty because those CPUs that
				855	* do have the top domain _should_ span the domain.
				856	*/
				857	if (!sibling->child)
				858	continue;
				859
				860	/* If we would not end up here, we can't continue from here */
				861	if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	862	continue;
				863
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	864	cpumask_set_cpu(i, mask);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	865	}
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	866
				867	/* We must not have empty masks here */
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	868	WARN_ON_ONCE(cpumask_empty(mask));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	869	}
				870
				871	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	872	* XXX: This creates per-node group entries; since the load-balancer will
				873	* immediately access remote memory to construct this group's load-balance
				874	* statistics having the groups node local is of dubious benefit.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	875	*/
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	876	static struct sched_group *
				877	build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
				878	{
				879	struct sched_group *sg;
				880	struct cpumask *sg_span;
				881
				882	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				883	GFP_KERNEL, cpu_to_node(cpu));
				884
				885	if (!sg)
				886	return NULL;
				887
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	888	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	889	if (sd->child)
				890	cpumask_copy(sg_span, sched_domain_span(sd->child));
				891	else
				892	cpumask_copy(sg_span, sched_domain_span(sd));
				893
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	894	atomic_inc(&sg->ref);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	895	return sg;
				896	}
				897
				898	static void init_overlap_sched_group(struct sched_domain *sd,
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	899	struct sched_group *sg)
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	900	{
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	901	struct cpumask *mask = sched_domains_tmpmask2;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	902	struct sd_data *sdd = sd->private;
				903	struct cpumask *sg_span;
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	904	int cpu;
				905
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	906	build_balance_mask(sd, sg, mask);
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	907	cpu = cpumask_first_and(sched_group_span(sg), mask);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	908
				909	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				910	if (atomic_inc_return(&sg->sgc->ref) == 1)
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	911	cpumask_copy(group_balance_mask(sg), mask);
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	912	else
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	913	WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	914
				915	/*
				916	* Initialize sgc->capacity such that even if we mess up the
				917	* domains and no possible iteration will get us here, we won't
				918	* die on a /0 trap.
				919	*/
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	920	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	921	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
				922	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	923	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	924	}
				925
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	926	static int
				927	build_overlap_sched_groups(struct sched_domain *sd, int cpu)
				928	{
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	929	struct sched_group first = NULL, last = NULL, *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	930	const struct cpumask *span = sched_domain_span(sd);
				931	struct cpumask *covered = sched_domains_tmpmask;
				932	struct sd_data *sdd = sd->private;
				933	struct sched_domain *sibling;
				934	int i;
				935
				936	cpumask_clear(covered);
				937
Peter Zijlstra	0372dd2	2017-04-14 17:24:02 +0200	[diff] [blame]	938	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	939	struct cpumask *sg_span;
				940
				941	if (cpumask_test_cpu(i, covered))
				942	continue;
				943
				944	sibling = *per_cpu_ptr(sdd->sd, i);
				945
Lauro Ramos Venancio	c20e1ea	2017-04-20 16:51:42 -0300	[diff] [blame]	946	/*
				947	* Asymmetric node setups can result in situations where the
				948	* domain tree is of unequal depth, make sure to skip domains
				949	* that already cover the entire range.
				950	*
				951	* In that case build_sched_domains() will have terminated the
				952	* iteration early and our sibling sd spans will be empty.
				953	* Domains should always include the CPU they're built on, so
				954	* check that.
				955	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	956	if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
				957	continue;
				958
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	959	sg = build_group_from_child_sched_domain(sibling, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	960	if (!sg)
				961	goto fail;
				962
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	963	sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	964	cpumask_or(covered, covered, sg_span);
				965
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	966	init_overlap_sched_group(sd, sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	967
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	968	if (!first)
				969	first = sg;
				970	if (last)
				971	last->next = sg;
				972	last = sg;
				973	last->next = first;
				974	}
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	975	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	976
				977	return 0;
				978
				979	fail:
				980	free_sched_groups(first, 0);
				981
				982	return -ENOMEM;
				983	}
				984
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	985
				986	/*
				987	* Package topology (also see the load-balance blurb in fair.c)
				988	*
				989	* The scheduler builds a tree structure to represent a number of important
				990	* topology features. By default (default_topology[]) these include:
				991	*
				992	* - Simultaneous multithreading (SMT)
				993	* - Multi-Core Cache (MC)
				994	* - Package (DIE)
				995	*
				996	* Where the last one more or less denotes everything up to a NUMA node.
				997	*
				998	* The tree consists of 3 primary data structures:
				999	*
				1000	*   sched_domain -> sched_group -> sched_group_capacity
				1001	*       ^ ^             ^ ^
				1002	*       `-'             `-'
				1003	*
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1004	* The sched_domains are per-CPU and have a two way link (parent & child) and
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	1005	* denote the ever growing mask of CPUs belonging to that level of topology.
				1006	*
				1007	* Each sched_domain has a circular (double) linked list of sched_group's, each
				1008	* denoting the domains of the level below (or individual CPUs in case of the
				1009	* first domain level). The sched_group linked by a sched_domain includes the
				1010	* CPU of that sched_domain [*].
				1011	*
				1012	* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
				1013	*
				1014	* CPU 0 1 2 3 4 5 6 7
				1015	*
				1016	* DIE [ ]
				1017	* MC [ ] [ ]
				1018	* SMT [ ] [ ] [ ] [ ]
				1019	*
				1020	* - or -
				1021	*
				1022	* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
				1023	* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
				1024	* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
				1025	*
				1026	* CPU 0 1 2 3 4 5 6 7
				1027	*
				1028	* One way to think about it is: sched_domain moves you up and down among these
				1029	* topology levels, while sched_group moves you sideways through it, at child
				1030	* domain granularity.
				1031	*
				1032	* sched_group_capacity ensures each unique sched_group has shared storage.
				1033	*
				1034	* There are two related construction problems, both require a CPU that
				1035	* uniquely identifies each group (for a given domain):
				1036	*
				1037	* - The first is the balance_cpu (see should_we_balance() and the
				1038	* load-balance blurb in fair.c); for each group we only want 1 CPU to
				1039	* continue balancing at a higher domain.
				1040	*
				1041	* - The second is the sched_group_capacity; we want all identical groups
				1042	* to share a single sched_group_capacity.
				1043	*
				1044	* These topologies are exclusive by construction; that is, it's
				1045	* impossible for an SMT thread to belong to multiple cores, and cores to
				1046	* be part of multiple caches. There is a very clear and unique location
				1047	* for each CPU in the hierarchy.
				1048	*
				1049	* Therefore computing a unique CPU for each group is trivial (the iteration
				1050	* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
				1051	* group), we can simply pick the first CPU in each group.
				1052	*
				1053	*
				1054	* [*] in other words, the first group of each domain is its child domain.
				1055	*/
				1056
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1057	static struct sched_group get_group(int cpu, struct sd_data sdd)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1058	{
				1059	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1060	struct sched_domain *child = sd->child;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1061	struct sched_group *sg;
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1062	bool already_visited;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1063
				1064	if (child)
				1065	cpu = cpumask_first(sched_domain_span(child));
				1066
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1067	sg = *per_cpu_ptr(sdd->sg, cpu);
				1068	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1069
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1070	/* Increase refcounts for claim_allocations: */
				1071	already_visited = atomic_inc_return(&sg->ref) > 1;
				1072	/* sgc visits should follow a similar trend as sg */
				1073	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
				1074
				1075	/* If we have already visited that group, it's already initialized. */
				1076	if (already_visited)
				1077	return sg;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1078
				1079	if (child) {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1080	cpumask_copy(sched_group_span(sg), sched_domain_span(child));
				1081	cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1082	} else {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1083	cpumask_set_cpu(cpu, sched_group_span(sg));
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	1084	cpumask_set_cpu(cpu, group_balance_mask(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1085	}
				1086
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1087	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1088	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	1089	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1090
				1091	return sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1092	}
				1093
				1094	/*
				1095	* build_sched_groups will build a circular linked list of the groups
Valentin Schneider	d874323	2019-04-09 18:35:46 +0100	[diff] [blame]	1096	* covered by the given span, will set each group's ->cpumask correctly,
				1097	* and will initialize their ->sgc.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1098	*
				1099	* Assumes the sched_domain tree is fully constructed
				1100	*/
				1101	static int
				1102	build_sched_groups(struct sched_domain *sd, int cpu)
				1103	{
				1104	struct sched_group first = NULL, last = NULL;
				1105	struct sd_data *sdd = sd->private;
				1106	const struct cpumask *span = sched_domain_span(sd);
				1107	struct cpumask *covered;
				1108	int i;
				1109
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1110	lockdep_assert_held(&sched_domains_mutex);
				1111	covered = sched_domains_tmpmask;
				1112
				1113	cpumask_clear(covered);
				1114
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1115	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1116	struct sched_group *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1117
				1118	if (cpumask_test_cpu(i, covered))
				1119	continue;
				1120
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1121	sg = get_group(i, sdd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1122
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1123	cpumask_or(covered, covered, sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1124
				1125	if (!first)
				1126	first = sg;
				1127	if (last)
				1128	last->next = sg;
				1129	last = sg;
				1130	}
				1131	last->next = first;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1132	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1133
				1134	return 0;
				1135	}
				1136
				1137	/*
				1138	* Initialize sched groups cpu_capacity.
				1139	*
				1140	* cpu_capacity indicates the capacity of sched group, which is used while
				1141	* distributing the load between different sched groups in a sched domain.
				1142	* Typically cpu_capacity for all the groups in a sched domain will be same
				1143	* unless there are asymmetries in the topology. If there are asymmetries,
				1144	* group having more cpu_capacity will pickup more load compared to the
				1145	* group having less cpu_capacity.
				1146	*/
				1147	static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
				1148	{
				1149	struct sched_group *sg = sd->groups;
				1150
				1151	WARN_ON(!sg);
				1152
				1153	do {
				1154	int cpu, max_cpu = -1;
				1155
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1156	sg->group_weight = cpumask_weight(sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1157
				1158	if (!(sd->flags & SD_ASYM_PACKING))
				1159	goto next;
				1160
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1161	for_each_cpu(cpu, sched_group_span(sg)) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1162	if (max_cpu < 0)
				1163	max_cpu = cpu;
				1164	else if (sched_asym_prefer(cpu, max_cpu))
				1165	max_cpu = cpu;
				1166	}
				1167	sg->asym_prefer_cpu = max_cpu;
				1168
				1169	next:
				1170	sg = sg->next;
				1171	} while (sg != sd->groups);
				1172
				1173	if (cpu != group_balance_cpu(sg))
				1174	return;
				1175
				1176	update_group_capacity(sd, cpu);
				1177	}
				1178
				1179	/*
				1180	* Initializers for schedule domains
				1181	* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
				1182	*/
				1183
				1184	static int default_relax_domain_level = -1;
				1185	int sched_domain_level_max;
				1186
				1187	static int __init setup_relax_domain_level(char *str)
				1188	{
				1189	if (kstrtoint(str, 0, &default_relax_domain_level))
				1190	pr_warn("Unable to set relax_domain_level\n");
				1191
				1192	return 1;
				1193	}
				1194	__setup("relax_domain_level=", setup_relax_domain_level);
				1195
				1196	static void set_domain_attribute(struct sched_domain *sd,
				1197	struct sched_domain_attr *attr)
				1198	{
				1199	int request;
				1200
				1201	if (!attr \|\| attr->relax_domain_level < 0) {
				1202	if (default_relax_domain_level < 0)
				1203	return;
				1204	else
				1205	request = default_relax_domain_level;
				1206	} else
				1207	request = attr->relax_domain_level;
				1208	if (request < sd->level) {
				1209	/* Turn off idle balance on this domain: */
				1210	sd->flags &= ~(SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
				1211	} else {
				1212	/* Turn on idle balance on this domain: */
				1213	sd->flags \|= (SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
				1214	}
				1215	}
				1216
				1217	static void __sdt_free(const struct cpumask *cpu_map);
				1218	static int __sdt_alloc(const struct cpumask *cpu_map);
				1219
				1220	static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				1221	const struct cpumask *cpu_map)
				1222	{
				1223	switch (what) {
				1224	case sa_rootdomain:
				1225	if (!atomic_read(&d->rd->refcount))
				1226	free_rootdomain(&d->rd->rcu);
				1227	/* Fall through */
				1228	case sa_sd:
				1229	free_percpu(d->sd);
				1230	/* Fall through */
				1231	case sa_sd_storage:
				1232	__sdt_free(cpu_map);
				1233	/* Fall through */
				1234	case sa_none:
				1235	break;
				1236	}
				1237	}
				1238
				1239	static enum s_alloc
				1240	__visit_domain_allocation_hell(struct s_data d, const struct cpumask cpu_map)
				1241	{
				1242	memset(d, 0, sizeof(*d));
				1243
				1244	if (__sdt_alloc(cpu_map))
				1245	return sa_sd_storage;
				1246	d->sd = alloc_percpu(struct sched_domain *);
				1247	if (!d->sd)
				1248	return sa_sd_storage;
				1249	d->rd = alloc_rootdomain();
				1250	if (!d->rd)
				1251	return sa_sd;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1252
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1253	return sa_rootdomain;
				1254	}
				1255
				1256	/*
				1257	* NULL the sd_data elements we've used to build the sched_domain and
				1258	* sched_group structure so that the subsequent __free_domain_allocs()
				1259	* will not free the data we're using.
				1260	*/
				1261	static void claim_allocations(int cpu, struct sched_domain *sd)
				1262	{
				1263	struct sd_data *sdd = sd->private;
				1264
				1265	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
				1266	*per_cpu_ptr(sdd->sd, cpu) = NULL;
				1267
				1268	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
				1269	*per_cpu_ptr(sdd->sds, cpu) = NULL;
				1270
				1271	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
				1272	*per_cpu_ptr(sdd->sg, cpu) = NULL;
				1273
				1274	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
				1275	*per_cpu_ptr(sdd->sgc, cpu) = NULL;
				1276	}
				1277
#ifdef CONFIG_NUMA
/* Classification chosen by init_numa_topology_type(). */
enum numa_topology_type sched_numa_topology_type;

/* Number of valid entries in sched_domains_numa_distance[]. */
static int sched_domains_numa_levels;
/* NUMA level currently being set up; written by sd_init() for sd_numa_mask(). */
static int sched_domains_curr_level;

/* Used by init_numa_topology_type() as the "furthest apart" threshold. */
int sched_max_numa_distance;
/* Distance recorded for each NUMA level; allocated by sched_init_numa(). */
static int *sched_domains_numa_distance;
/* CPU masks indexed as [numa level][node]; see sd_numa_mask(). */
static struct cpumask ***sched_domains_numa_masks;
#endif
				1288
				1289	/*
				1290	* SD_flags allowed in topology descriptions.
				1291	*
				1292	* These flags are purely descriptive of the topology and do not prescribe
				1293	* behaviour. Behaviour is artificial and mapped in the below sd_init()
				1294	* function:
				1295	*
				1296	* SD_SHARE_CPUCAPACITY - describes SMT topologies
				1297	* SD_SHARE_PKG_RESOURCES - describes shared caches
				1298	* SD_NUMA - describes NUMA topologies
				1299	* SD_SHARE_POWERDOMAIN - describes shared power domain
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1300	*
				1301	* Odd one out, which beside describing the topology has a quirk also
				1302	* prescribes the desired behaviour that goes along with it:
				1303	*
				1304	* SD_ASYM_PACKING - describes SMT quirks
				1305	*/
				1306	#define TOPOLOGY_SD_FLAGS \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1307	(SD_SHARE_CPUCAPACITY \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1308	SD_SHARE_PKG_RESOURCES \| \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1309	SD_NUMA \| \
				1310	SD_ASYM_PACKING \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1311	SD_SHARE_POWERDOMAIN)
				1312
				1313	static struct sched_domain *
				1314	sd_init(struct sched_domain_topology_level *tl,
				1315	const struct cpumask *cpu_map,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1316	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1317	{
				1318	struct sd_data *sdd = &tl->data;
				1319	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1320	int sd_id, sd_weight, sd_flags = 0;
				1321
				1322	#ifdef CONFIG_NUMA
				1323	/*
				1324	* Ugly hack to pass state to sd_numa_mask()...
				1325	*/
				1326	sched_domains_curr_level = tl->numa_level;
				1327	#endif
				1328
				1329	sd_weight = cpumask_weight(tl->mask(cpu));
				1330
				1331	if (tl->sd_flags)
				1332	sd_flags = (*tl->sd_flags)();
				1333	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
				1334	"wrong sd_flags in topology description\n"))
				1335	sd_flags &= ~TOPOLOGY_SD_FLAGS;
				1336
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1337	/* Apply detected topology flags */
				1338	sd_flags \|= dflags;
				1339
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1340	*sd = (struct sched_domain){
				1341	.min_interval = sd_weight,
				1342	.max_interval = 2*sd_weight,
				1343	.busy_factor = 32,
				1344	.imbalance_pct = 125,
				1345
				1346	.cache_nice_tries = 0,
				1347	.busy_idx = 0,
				1348	.idle_idx = 0,
				1349	.newidle_idx = 0,
				1350	.wake_idx = 0,
				1351	.forkexec_idx = 0,
				1352
				1353	.flags = 1*SD_LOAD_BALANCE
				1354	\| 1*SD_BALANCE_NEWIDLE
				1355	\| 1*SD_BALANCE_EXEC
				1356	\| 1*SD_BALANCE_FORK
				1357	\| 0*SD_BALANCE_WAKE
				1358	\| 1*SD_WAKE_AFFINE
				1359	\| 0*SD_SHARE_CPUCAPACITY
				1360	\| 0*SD_SHARE_PKG_RESOURCES
				1361	\| 0*SD_SERIALIZE
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1362	\| 1*SD_PREFER_SIBLING
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1363	\| 0*SD_NUMA
				1364	\| sd_flags
				1365	,
				1366
				1367	.last_balance = jiffies,
				1368	.balance_interval = sd_weight,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1369	.max_newidle_lb_cost = 0,
				1370	.next_decay_max_lb_cost = jiffies,
				1371	.child = child,
				1372	#ifdef CONFIG_SCHED_DEBUG
				1373	.name = tl->name,
				1374	#endif
				1375	};
				1376
				1377	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
				1378	sd_id = cpumask_first(sched_domain_span(sd));
				1379
				1380	/*
				1381	* Convert topological properties into behaviour.
				1382	*/
				1383
				1384	if (sd->flags & SD_ASYM_CPUCAPACITY) {
				1385	struct sched_domain *t = sd;
				1386
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1387	/*
				1388	* Don't attempt to spread across CPUs of different capacities.
				1389	*/
				1390	if (sd->child)
				1391	sd->child->flags &= ~SD_PREFER_SIBLING;
				1392
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1393	for_each_lower_domain(t)
				1394	t->flags \|= SD_BALANCE_WAKE;
				1395	}
				1396
				1397	if (sd->flags & SD_SHARE_CPUCAPACITY) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1398	sd->imbalance_pct = 110;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1399
				1400	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1401	sd->imbalance_pct = 117;
				1402	sd->cache_nice_tries = 1;
				1403	sd->busy_idx = 2;
				1404
				1405	#ifdef CONFIG_NUMA
				1406	} else if (sd->flags & SD_NUMA) {
				1407	sd->cache_nice_tries = 2;
				1408	sd->busy_idx = 3;
				1409	sd->idle_idx = 2;
				1410
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1411	sd->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1412	sd->flags \|= SD_SERIALIZE;
				1413	if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
				1414	sd->flags &= ~(SD_BALANCE_EXEC \|
				1415	SD_BALANCE_FORK \|
				1416	SD_WAKE_AFFINE);
				1417	}
				1418
				1419	#endif
				1420	} else {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1421	sd->cache_nice_tries = 1;
				1422	sd->busy_idx = 2;
				1423	sd->idle_idx = 1;
				1424	}
				1425
				1426	/*
				1427	* For all levels sharing cache; connect a sched_domain_shared
				1428	* instance.
				1429	*/
				1430	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1431	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
				1432	atomic_inc(&sd->shared->ref);
				1433	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
				1434	}
				1435
				1436	sd->private = sdd;
				1437
				1438	return sd;
				1439	}
				1440
				1441	/*
				1442	* Topology list, bottom-up.
				1443	*/
				1444	static struct sched_domain_topology_level default_topology[] = {
				1445	#ifdef CONFIG_SCHED_SMT
				1446	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
				1447	#endif
				1448	#ifdef CONFIG_SCHED_MC
				1449	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
				1450	#endif
				1451	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
				1452	{ NULL, },
				1453	};
				1454
				1455	static struct sched_domain_topology_level *sched_domain_topology =
				1456	default_topology;
				1457
				1458	#define for_each_sd_topology(tl) \
				1459	for (tl = sched_domain_topology; tl->mask; tl++)
				1460
				1461	void set_sched_topology(struct sched_domain_topology_level *tl)
				1462	{
				1463	if (WARN_ON_ONCE(sched_smp_initialized))
				1464	return;
				1465
				1466	sched_domain_topology = tl;
				1467	}
				1468
				1469	#ifdef CONFIG_NUMA
				1470
				1471	static const struct cpumask *sd_numa_mask(int cpu)
				1472	{
				1473	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
				1474	}
				1475
				1476	static void sched_numa_warn(const char *str)
				1477	{
				1478	static int done = false;
				1479	int i,j;
				1480
				1481	if (done)
				1482	return;
				1483
				1484	done = true;
				1485
				1486	printk(KERN_WARNING "ERROR: %s\n\n", str);
				1487
				1488	for (i = 0; i < nr_node_ids; i++) {
				1489	printk(KERN_WARNING " ");
				1490	for (j = 0; j < nr_node_ids; j++)
				1491	printk(KERN_CONT "%02d ", node_distance(i,j));
				1492	printk(KERN_CONT "\n");
				1493	}
				1494	printk(KERN_WARNING "\n");
				1495	}
				1496
				1497	bool find_numa_distance(int distance)
				1498	{
				1499	int i;
				1500
				1501	if (distance == node_distance(0, 0))
				1502	return true;
				1503
				1504	for (i = 0; i < sched_domains_numa_levels; i++) {
				1505	if (sched_domains_numa_distance[i] == distance)
				1506	return true;
				1507	}
				1508
				1509	return false;
				1510	}
				1511
				1512	/*
				1513	* A system can have three types of NUMA topology:
				1514	* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
				1515	* NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
				1516	* NUMA_BACKPLANE: nodes can reach other nodes through a backplane
				1517	*
				1518	* The difference between a glueless mesh topology and a backplane
				1519	* topology lies in whether communication between not directly
				1520	* connected nodes goes through intermediary nodes (where programs
				1521	* could run), or through backplane controllers. This affects
				1522	* placement of programs.
				1523	*
				1524	* The type of topology can be discerned with the following tests:
				1525	* - If the maximum distance between any nodes is 1 hop, the system
				1526	* is directly connected.
				1527	* - If for two nodes A and B, located N > 1 hops away from each other,
				1528	* there is an intermediary node C, which is < N hops away from both
				1529	* nodes A and B, the system is a glueless mesh.
				1530	*/
				1531	static void init_numa_topology_type(void)
				1532	{
				1533	int a, b, c, n;
				1534
				1535	n = sched_max_numa_distance;
				1536
Srikar Dronamraju	e5e96fa	2018-08-10 22:30:18 +0530	[diff] [blame]	1537	if (sched_domains_numa_levels <= 2) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1538	sched_numa_topology_type = NUMA_DIRECT;
				1539	return;
				1540	}
				1541
				1542	for_each_online_node(a) {
				1543	for_each_online_node(b) {
				1544	/* Find two nodes furthest removed from each other. */
				1545	if (node_distance(a, b) < n)
				1546	continue;
				1547
				1548	/* Is there an intermediary node between a and b? */
				1549	for_each_online_node(c) {
				1550	if (node_distance(a, c) < n &&
				1551	node_distance(b, c) < n) {
				1552	sched_numa_topology_type =
				1553	NUMA_GLUELESS_MESH;
				1554	return;
				1555	}
				1556	}
				1557
				1558	sched_numa_topology_type = NUMA_BACKPLANE;
				1559	return;
				1560	}
				1561	}
				1562	}
				1563
				1564	void sched_init_numa(void)
				1565	{
				1566	int next_distance, curr_distance = node_distance(0, 0);
				1567	struct sched_domain_topology_level *tl;
				1568	int level = 0;
				1569	int i, j, k;
				1570
Peter Zijlstra	993f0b0	2018-11-02 14:22:25 +0100	[diff] [blame]	1571	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1572	if (!sched_domains_numa_distance)
				1573	return;
				1574
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1575	/* Includes NUMA identity node at level 0. */
				1576	sched_domains_numa_distance[level++] = curr_distance;
				1577	sched_domains_numa_levels = level;
				1578
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1579	/*
				1580	* O(nr_nodes^2) deduplicating selection sort -- in order to find the
				1581	* unique distances in the node_distance() table.
				1582	*
				1583	* Assumes node_distance(0,j) includes all distances in
				1584	* node_distance(i,j) in order to avoid cubic time.
				1585	*/
				1586	next_distance = curr_distance;
				1587	for (i = 0; i < nr_node_ids; i++) {
				1588	for (j = 0; j < nr_node_ids; j++) {
				1589	for (k = 0; k < nr_node_ids; k++) {
				1590	int distance = node_distance(i, k);
				1591
				1592	if (distance > curr_distance &&
				1593	(distance < next_distance \|\|
				1594	next_distance == curr_distance))
				1595	next_distance = distance;
				1596
				1597	/*
				1598	* While not a strong assumption it would be nice to know
				1599	* about cases where if node A is connected to B, B is not
				1600	* equally connected to A.
				1601	*/
				1602	if (sched_debug() && node_distance(k, i) != distance)
				1603	sched_numa_warn("Node-distance not symmetric");
				1604
				1605	if (sched_debug() && i && !find_numa_distance(distance))
				1606	sched_numa_warn("Node-0 not representative");
				1607	}
				1608	if (next_distance != curr_distance) {
				1609	sched_domains_numa_distance[level++] = next_distance;
				1610	sched_domains_numa_levels = level;
				1611	curr_distance = next_distance;
				1612	} else break;
				1613	}
				1614
				1615	/*
				1616	* In case of sched_debug() we verify the above assumption.
				1617	*/
				1618	if (!sched_debug())
				1619	break;
				1620	}
				1621
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1622	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1623	* 'level' contains the number of unique distances
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1624	*
				1625	* The sched_domains_numa_distance[] array includes the actual distance
				1626	* numbers.
				1627	*/
				1628
				1629	/*
				1630	* Here, we should temporarily reset sched_domains_numa_levels to 0.
				1631	* If it fails to allocate memory for array sched_domains_numa_masks[][],
				1632	* the array will contain less then 'level' members. This could be
				1633	* dangerous when we use it to iterate array sched_domains_numa_masks[][]
				1634	* in other functions.
				1635	*
				1636	* We reset it to 'level' at the end of this function.
				1637	*/
				1638	sched_domains_numa_levels = 0;
				1639
				1640	sched_domains_numa_masks = kzalloc(sizeof(void ) level, GFP_KERNEL);
				1641	if (!sched_domains_numa_masks)
				1642	return;
				1643
				1644	/*
				1645	* Now for each level, construct a mask per node which contains all
				1646	* CPUs of nodes that are that many hops away from us.
				1647	*/
				1648	for (i = 0; i < level; i++) {
				1649	sched_domains_numa_masks[i] =
				1650	kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
				1651	if (!sched_domains_numa_masks[i])
				1652	return;
				1653
				1654	for (j = 0; j < nr_node_ids; j++) {
				1655	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
				1656	if (!mask)
				1657	return;
				1658
				1659	sched_domains_numa_masks[i][j] = mask;
				1660
				1661	for_each_node(k) {
				1662	if (node_distance(j, k) > sched_domains_numa_distance[i])
				1663	continue;
				1664
				1665	cpumask_or(mask, mask, cpumask_of_node(k));
				1666	}
				1667	}
				1668	}
				1669
				1670	/* Compute default topology size */
				1671	for (i = 0; sched_domain_topology[i].mask; i++);
				1672
				1673	tl = kzalloc((i + level + 1) *
				1674	sizeof(struct sched_domain_topology_level), GFP_KERNEL);
				1675	if (!tl)
				1676	return;
				1677
				1678	/*
				1679	* Copy the default topology bits..
				1680	*/
				1681	for (i = 0; sched_domain_topology[i].mask; i++)
				1682	tl[i] = sched_domain_topology[i];
				1683
				1684	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1685	* Add the NUMA identity distance, aka single NODE.
				1686	*/
				1687	tl[i++] = (struct sched_domain_topology_level){
				1688	.mask = sd_numa_mask,
				1689	.numa_level = 0,
				1690	SD_INIT_NAME(NODE)
				1691	};
				1692
				1693	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1694	* .. and append 'j' levels of NUMA goodness.
				1695	*/
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1696	for (j = 1; j < level; i++, j++) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1697	tl[i] = (struct sched_domain_topology_level){
				1698	.mask = sd_numa_mask,
				1699	.sd_flags = cpu_numa_flags,
				1700	.flags = SDTL_OVERLAP,
				1701	.numa_level = j,
				1702	SD_INIT_NAME(NUMA)
				1703	};
				1704	}
				1705
				1706	sched_domain_topology = tl;
				1707
				1708	sched_domains_numa_levels = level;
				1709	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
				1710
				1711	init_numa_topology_type();
				1712	}
				1713
				1714	void sched_domains_numa_masks_set(unsigned int cpu)
				1715	{
				1716	int node = cpu_to_node(cpu);
				1717	int i, j;
				1718
				1719	for (i = 0; i < sched_domains_numa_levels; i++) {
				1720	for (j = 0; j < nr_node_ids; j++) {
				1721	if (node_distance(j, node) <= sched_domains_numa_distance[i])
				1722	cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
				1723	}
				1724	}
				1725	}
				1726
				1727	void sched_domains_numa_masks_clear(unsigned int cpu)
				1728	{
				1729	int i, j;
				1730
				1731	for (i = 0; i < sched_domains_numa_levels; i++) {
				1732	for (j = 0; j < nr_node_ids; j++)
				1733	cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
				1734	}
				1735	}
				1736
				1737	#endif /* CONFIG_NUMA */
				1738
				1739	static int __sdt_alloc(const struct cpumask *cpu_map)
				1740	{
				1741	struct sched_domain_topology_level *tl;
				1742	int j;
				1743
				1744	for_each_sd_topology(tl) {
				1745	struct sd_data *sdd = &tl->data;
				1746
				1747	sdd->sd = alloc_percpu(struct sched_domain *);
				1748	if (!sdd->sd)
				1749	return -ENOMEM;
				1750
				1751	sdd->sds = alloc_percpu(struct sched_domain_shared *);
				1752	if (!sdd->sds)
				1753	return -ENOMEM;
				1754
				1755	sdd->sg = alloc_percpu(struct sched_group *);
				1756	if (!sdd->sg)
				1757	return -ENOMEM;
				1758
				1759	sdd->sgc = alloc_percpu(struct sched_group_capacity *);
				1760	if (!sdd->sgc)
				1761	return -ENOMEM;
				1762
				1763	for_each_cpu(j, cpu_map) {
				1764	struct sched_domain *sd;
				1765	struct sched_domain_shared *sds;
				1766	struct sched_group *sg;
				1767	struct sched_group_capacity *sgc;
				1768
				1769	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
				1770	GFP_KERNEL, cpu_to_node(j));
				1771	if (!sd)
				1772	return -ENOMEM;
				1773
				1774	*per_cpu_ptr(sdd->sd, j) = sd;
				1775
				1776	sds = kzalloc_node(sizeof(struct sched_domain_shared),
				1777	GFP_KERNEL, cpu_to_node(j));
				1778	if (!sds)
				1779	return -ENOMEM;
				1780
				1781	*per_cpu_ptr(sdd->sds, j) = sds;
				1782
				1783	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				1784	GFP_KERNEL, cpu_to_node(j));
				1785	if (!sg)
				1786	return -ENOMEM;
				1787
				1788	sg->next = sg;
				1789
				1790	*per_cpu_ptr(sdd->sg, j) = sg;
				1791
				1792	sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
				1793	GFP_KERNEL, cpu_to_node(j));
				1794	if (!sgc)
				1795	return -ENOMEM;
				1796
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	1797	#ifdef CONFIG_SCHED_DEBUG
				1798	sgc->id = j;
				1799	#endif
				1800
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1801	*per_cpu_ptr(sdd->sgc, j) = sgc;
				1802	}
				1803	}
				1804
				1805	return 0;
				1806	}
				1807
				1808	static void __sdt_free(const struct cpumask *cpu_map)
				1809	{
				1810	struct sched_domain_topology_level *tl;
				1811	int j;
				1812
				1813	for_each_sd_topology(tl) {
				1814	struct sd_data *sdd = &tl->data;
				1815
				1816	for_each_cpu(j, cpu_map) {
				1817	struct sched_domain *sd;
				1818
				1819	if (sdd->sd) {
				1820	sd = *per_cpu_ptr(sdd->sd, j);
				1821	if (sd && (sd->flags & SD_OVERLAP))
				1822	free_sched_groups(sd->groups, 0);
				1823	kfree(*per_cpu_ptr(sdd->sd, j));
				1824	}
				1825
				1826	if (sdd->sds)
				1827	kfree(*per_cpu_ptr(sdd->sds, j));
				1828	if (sdd->sg)
				1829	kfree(*per_cpu_ptr(sdd->sg, j));
				1830	if (sdd->sgc)
				1831	kfree(*per_cpu_ptr(sdd->sgc, j));
				1832	}
				1833	free_percpu(sdd->sd);
				1834	sdd->sd = NULL;
				1835	free_percpu(sdd->sds);
				1836	sdd->sds = NULL;
				1837	free_percpu(sdd->sg);
				1838	sdd->sg = NULL;
				1839	free_percpu(sdd->sgc);
				1840	sdd->sgc = NULL;
				1841	}
				1842	}
				1843
Viresh Kumar	181a80d1	2017-04-27 13:58:59 +0530	[diff] [blame]	1844	static struct sched_domain build_sched_domain(struct sched_domain_topology_level tl,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1845	const struct cpumask cpu_map, struct sched_domain_attr attr,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1846	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1847	{
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1848	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1849
				1850	if (child) {
				1851	sd->level = child->level + 1;
				1852	sched_domain_level_max = max(sched_domain_level_max, sd->level);
				1853	child->parent = sd;
				1854
				1855	if (!cpumask_subset(sched_domain_span(child),
				1856	sched_domain_span(sd))) {
				1857	pr_err("BUG: arch topology borken\n");
				1858	#ifdef CONFIG_SCHED_DEBUG
				1859	pr_err(" the %s domain not a subset of the %s domain\n",
				1860	child->name, sd->name);
				1861	#endif
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1862	/* Fixup, ensure @sd has at least @child CPUs. */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1863	cpumask_or(sched_domain_span(sd),
				1864	sched_domain_span(sd),
				1865	sched_domain_span(child));
				1866	}
				1867
				1868	}
				1869	set_domain_attribute(sd, attr);
				1870
				1871	return sd;
				1872	}
				1873
				1874	/*
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1875	* Find the sched_domain_topology_level where all CPU capacities are visible
				1876	* for all CPUs.
				1877	*/
				1878	static struct sched_domain_topology_level
				1879	asym_cpu_capacity_level(const struct cpumask cpu_map)
				1880	{
				1881	int i, j, asym_level = 0;
				1882	bool asym = false;
				1883	struct sched_domain_topology_level tl, asym_tl = NULL;
				1884	unsigned long cap;
				1885
				1886	/* Is there any asymmetry? */
				1887	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
				1888
				1889	for_each_cpu(i, cpu_map) {
				1890	if (arch_scale_cpu_capacity(NULL, i) != cap) {
				1891	asym = true;
				1892	break;
				1893	}
				1894	}
				1895
				1896	if (!asym)
				1897	return NULL;
				1898
				1899	/*
				1900	* Examine topology from all CPU's point of views to detect the lowest
				1901	* sched_domain_topology_level where a highest capacity CPU is visible
				1902	* to everyone.
				1903	*/
				1904	for_each_cpu(i, cpu_map) {
				1905	unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
				1906	int tl_id = 0;
				1907
				1908	for_each_sd_topology(tl) {
				1909	if (tl_id < asym_level)
				1910	goto next_level;
				1911
				1912	for_each_cpu_and(j, tl->mask(i), cpu_map) {
				1913	unsigned long capacity;
				1914
				1915	capacity = arch_scale_cpu_capacity(NULL, j);
				1916
				1917	if (capacity <= max_capacity)
				1918	continue;
				1919
				1920	max_capacity = capacity;
				1921	asym_level = tl_id;
				1922	asym_tl = tl;
				1923	}
				1924	next_level:
				1925	tl_id++;
				1926	}
				1927	}
				1928
				1929	return asym_tl;
				1930	}
				1931
				1932
				1933	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1934	* Build sched domains for a given set of CPUs and attach the sched domains
				1935	* to the individual CPUs
				1936	*/
				1937	static int
				1938	build_sched_domains(const struct cpumask cpu_map, struct sched_domain_attr attr)
				1939	{
				1940	enum s_alloc alloc_state;
				1941	struct sched_domain *sd;
				1942	struct s_data d;
				1943	struct rq *rq = NULL;
				1944	int i, ret = -ENOMEM;
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1945	struct sched_domain_topology_level *tl_asym;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1946	bool has_asym = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1947
				1948	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
				1949	if (alloc_state != sa_rootdomain)
				1950	goto error;
				1951
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1952	tl_asym = asym_cpu_capacity_level(cpu_map);
				1953
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1954	/* Set up domains for CPUs specified by the cpu_map: */
				1955	for_each_cpu(i, cpu_map) {
				1956	struct sched_domain_topology_level *tl;
				1957
				1958	sd = NULL;
				1959	for_each_sd_topology(tl) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1960	int dflags = 0;
				1961
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1962	if (tl == tl_asym) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1963	dflags \|= SD_ASYM_CPUCAPACITY;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1964	has_asym = true;
				1965	}
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1966
				1967	sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
				1968
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1969	if (tl == sched_domain_topology)
				1970	*per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstra	af85596	2017-04-26 17:36:41 +0200	[diff] [blame]	1971	if (tl->flags & SDTL_OVERLAP)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1972	sd->flags \|= SD_OVERLAP;
				1973	if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				1974	break;
				1975	}
				1976	}
				1977
				1978	/* Build the groups for the domains */
				1979	for_each_cpu(i, cpu_map) {
				1980	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				1981	sd->span_weight = cpumask_weight(sched_domain_span(sd));
				1982	if (sd->flags & SD_OVERLAP) {
				1983	if (build_overlap_sched_groups(sd, i))
				1984	goto error;
				1985	} else {
				1986	if (build_sched_groups(sd, i))
				1987	goto error;
				1988	}
				1989	}
				1990	}
				1991
				1992	/* Calculate CPU capacity for physical packages and nodes */
				1993	for (i = nr_cpumask_bits-1; i >= 0; i--) {
				1994	if (!cpumask_test_cpu(i, cpu_map))
				1995	continue;
				1996
				1997	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				1998	claim_allocations(i, sd);
				1999	init_sched_groups_capacity(i, sd);
				2000	}
				2001	}
				2002
				2003	/* Attach the domains */
				2004	rcu_read_lock();
				2005	for_each_cpu(i, cpu_map) {
				2006	rq = cpu_rq(i);
				2007	sd = *per_cpu_ptr(d.sd, i);
				2008
				2009	/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
				2010	if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
				2011	WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
				2012
				2013	cpu_attach_domain(sd, d.rd, i);
				2014	}
				2015	rcu_read_unlock();
				2016
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2017	if (has_asym)
				2018	static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
				2019
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2020	if (rq && sched_debug_enabled) {
Juri Lelli	bf5015a	2018-05-24 17:29:36 +0200	[diff] [blame]	2021	pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2022	cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
				2023	}
				2024
				2025	ret = 0;
				2026	error:
				2027	__free_domain_allocs(&d, alloc_state, cpu_map);
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2028
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2029	return ret;
				2030	}
				2031
				2032	/* Current sched domains: */
				2033	static cpumask_var_t *doms_cur;
				2034
				2035	/* Number of sched domains in 'doms_cur': */
				2036	static int ndoms_cur;
				2037
				2038	/* Attributes of custom domains in 'doms_cur' */
				2039	static struct sched_domain_attr *dattr_cur;
				2040
				2041	/*
				2042	* Special case: If a kmalloc() of a doms_cur partition (array of
				2043	* cpumask) fails, then fallback to a single sched domain,
				2044	* as determined by the single cpumask fallback_doms.
				2045	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2046	static cpumask_var_t fallback_doms;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2047
				2048	/*
				2049	* arch_update_cpu_topology lets virtualized architectures update the
				2050	* CPU core maps. It is supposed to return 1 if the topology changed
				2051	* or 0 if it stayed the same.
				2052	*/
				2053	int __weak arch_update_cpu_topology(void)
				2054	{
				2055	return 0;
				2056	}
				2057
				2058	cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
				2059	{
				2060	int i;
				2061	cpumask_var_t *doms;
				2062
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	2063	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2064	if (!doms)
				2065	return NULL;
				2066	for (i = 0; i < ndoms; i++) {
				2067	if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
				2068	free_sched_domains(doms, i);
				2069	return NULL;
				2070	}
				2071	}
				2072	return doms;
				2073	}
				2074
				2075	void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
				2076	{
				2077	unsigned int i;
				2078	for (i = 0; i < ndoms; i++)
				2079	free_cpumask_var(doms[i]);
				2080	kfree(doms);
				2081	}
				2082
				2083	/*
Juri Lelli	cb0c041	2018-12-19 14:34:45 +0100	[diff] [blame^]	2084	* Set up scheduler domains and groups. For now this just excludes isolated
				2085	* CPUs, but could be used to exclude other special cases in the future.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2086	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2087	int sched_init_domains(const struct cpumask *cpu_map)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2088	{
				2089	int err;
				2090
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2091	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	2092	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2093	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
				2094
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2095	arch_update_cpu_topology();
				2096	ndoms_cur = 1;
				2097	doms_cur = alloc_sched_domains(ndoms_cur);
				2098	if (!doms_cur)
				2099	doms_cur = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2100	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2101	err = build_sched_domains(doms_cur[0], NULL);
				2102	register_sched_domain_sysctl();
				2103
				2104	return err;
				2105	}
				2106
				2107	/*
				2108	* Detach sched domains from a group of CPUs specified in cpu_map
				2109	* These CPUs will now be attached to the NULL domain
				2110	*/
				2111	static void detach_destroy_domains(const struct cpumask *cpu_map)
				2112	{
				2113	int i;
				2114
				2115	rcu_read_lock();
				2116	for_each_cpu(i, cpu_map)
				2117	cpu_attach_domain(NULL, &def_root_domain, i);
				2118	rcu_read_unlock();
				2119	}
				2120
				2121	/* handle null as "default" */
				2122	static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
				2123	struct sched_domain_attr *new, int idx_new)
				2124	{
				2125	struct sched_domain_attr tmp;
				2126
				2127	/* Fast path: */
				2128	if (!new && !cur)
				2129	return 1;
				2130
				2131	tmp = SD_ATTR_INIT;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2132
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2133	return !memcmp(cur ? (cur + idx_cur) : &tmp,
				2134	new ? (new + idx_new) : &tmp,
				2135	sizeof(struct sched_domain_attr));
				2136	}
				2137
				2138	/*
				2139	* Partition sched domains as specified by the 'ndoms_new'
				2140	* cpumasks in the array doms_new[] of cpumasks. This compares
				2141	* doms_new[] to the current sched domain partitioning, doms_cur[].
				2142	* It destroys each deleted domain and builds each new domain.
				2143	*
				2144	* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
				2145	* The masks don't intersect (don't overlap.) We should setup one
				2146	* sched domain for each mask. CPUs not in any of the cpumasks will
				2147	* not be load balanced. If the same cpumask appears both in the
				2148	* current 'doms_cur' domains and in the new 'doms_new', we can leave
				2149	* it as it is.
				2150	*
				2151	* The passed in 'doms_new' should be allocated using
				2152	* alloc_sched_domains. This routine takes ownership of it and will
				2153	* free_sched_domains it when done with it. If the caller failed the
				2154	* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
				2155	* and partition_sched_domains() will fallback to the single partition
				2156	* 'fallback_doms', it also forces the domains to be rebuilt.
				2157	*
				2158	* If doms_new == NULL it will be replaced with cpu_online_mask.
				2159	* ndoms_new == 0 is a special case for destroying existing domains,
				2160	* and it will not create the default domain.
				2161	*
				2162	* Call with hotplug lock held
				2163	*/
				2164	void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				2165	struct sched_domain_attr *dattr_new)
				2166	{
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2167	bool __maybe_unused has_eas = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2168	int i, j, n;
				2169	int new_topology;
				2170
				2171	mutex_lock(&sched_domains_mutex);
				2172
				2173	/* Always unregister in case we don't destroy any domains: */
				2174	unregister_sched_domain_sysctl();
				2175
				2176	/* Let the architecture update CPU core mappings: */
				2177	new_topology = arch_update_cpu_topology();
				2178
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2179	if (!doms_new) {
				2180	WARN_ON_ONCE(dattr_new);
				2181	n = 0;
				2182	doms_new = alloc_sched_domains(1);
				2183	if (doms_new) {
				2184	n = 1;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2185	cpumask_and(doms_new[0], cpu_active_mask,
				2186	housekeeping_cpumask(HK_FLAG_DOMAIN));
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2187	}
				2188	} else {
				2189	n = ndoms_new;
				2190	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2191
				2192	/* Destroy deleted domains: */
				2193	for (i = 0; i < ndoms_cur; i++) {
				2194	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2195	if (cpumask_equal(doms_cur[i], doms_new[j]) &&
				2196	dattrs_equal(dattr_cur, i, dattr_new, j))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2197	goto match1;
				2198	}
				2199	/* No match - a current sched domain not in new doms_new[] */
				2200	detach_destroy_domains(doms_cur[i]);
				2201	match1:
				2202	;
				2203	}
				2204
				2205	n = ndoms_cur;
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2206	if (!doms_new) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2207	n = 0;
				2208	doms_new = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2209	cpumask_and(doms_new[0], cpu_active_mask,
				2210	housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2211	}
				2212
				2213	/* Build new domains: */
				2214	for (i = 0; i < ndoms_new; i++) {
				2215	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2216	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2217	dattrs_equal(dattr_new, i, dattr_cur, j))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2218	goto match2;
				2219	}
				2220	/* No match - add a new doms_new */
				2221	build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
				2222	match2:
				2223	;
				2224	}
				2225
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2226	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2227	/* Build perf. domains: */
				2228	for (i = 0; i < ndoms_new; i++) {
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2229	for (j = 0; j < n && !sched_energy_update; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2230	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2231	cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				2232	has_eas = true;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2233	goto match3;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2234	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2235	}
				2236	/* No match - add perf. domains for a new rd */
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2237	has_eas \|= build_perf_domains(doms_new[i]);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2238	match3:
				2239	;
				2240	}
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2241	sched_energy_set(has_eas);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2242	#endif
				2243
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2244	/* Remember the new sched domains: */
				2245	if (doms_cur != &fallback_doms)
				2246	free_sched_domains(doms_cur, ndoms_cur);
				2247
				2248	kfree(dattr_cur);
				2249	doms_cur = doms_new;
				2250	dattr_cur = dattr_new;
				2251	ndoms_cur = ndoms_new;
				2252
				2253	register_sched_domain_sysctl();
				2254
				2255	mutex_unlock(&sched_domains_mutex);
				2256	}