// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

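/*
 * Booting with "sched_verbose" on the kernel command line sets
 * sched_debug_verbose via the early_param() below, enabling the
 * sched_domain_debug() dump when domains are (re)attached.
 */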
static int __init sched_debug_setup(char *str)
{
	sched_debug_verbose = true;

	return 0;
}
early_param("sched_verbose", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_verbose;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
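/*
 * Illustrative expansion of the table above (a sketch; assumes
 * <linux/sched/sd_flags.h> carries an entry such as
 * SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD)): the include then emits
 *
 *   [__SD_BALANCE_NEWIDLE] = { .meta_flags = SDF_SHARED_CHILD, .name = "SD_BALANCE_NEWIDLE" },
 *
 * i.e. a table of flag metadata indexed by the flag's bit number.
 */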

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;
	unsigned long flags = sd->flags;
	unsigned int idx;

	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
	printk(KERN_CONT "span=%*pbl level=%s\n",
	       cpumask_pr_args(sched_domain_span(sd)), sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
	}
	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
	}

	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
		unsigned int flag = BIT(idx);
		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;

		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
		    !(sd->child->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
			       sd_flag_debug[idx].name);

		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
		    !(sd->parent->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
			       sd_flag_debug[idx].name);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		if (!cpumask_weight(sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		if (!(sd->flags & SD_OVERLAP) &&
		    cpumask_intersects(groupmask, sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_span(group));

		printk(KERN_CONT " %d:{ span=%*pbl",
		       group->sgc->id,
		       cpumask_pr_args(sched_group_span(group)));

		if ((sd->flags & SD_OVERLAP) &&
		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
			printk(KERN_CONT " mask=%*pbl",
			       cpumask_pr_args(group_balance_mask(group)));
		}

		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
			printk(KERN_CONT " cap=%lu", group->sgc->capacity);

		if (group == sd->groups && sd->child &&
		    !cpumask_equal(sched_domain_span(sd->child),
				   sched_group_span(group))) {
			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
		}

		printk(KERN_CONT " }");

		group = group->next;

		if (group != sd->groups)
			printk(KERN_CONT ",");

	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
	return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_verbose)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG
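/*
 * Illustrative expansion: each SD_FLAG(name, mflags) entry in sd_flags.h
 * becomes "(name * !!((mflags) & SDF_NEEDS_GROUPS)) |", i.e. "name |" when
 * the flag is tagged SDF_NEEDS_GROUPS and "0 |" otherwise, so the
 * initializer collapses to the OR of all flags that need groups, terminated
 * by the trailing 0.
 */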

static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
	    (sd->groups != sd->groups->next))
		return 0;

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next)
		pflags &= ~SD_DEGENERATE_GROUPS_MASK;

	if (~cflags & pflags)
		return 0;

	return 1;
}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;

void rebuild_sched_domains_energy(void)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}

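/*
 * Handler for the sched_energy_aware sysctl (typically exposed as
 * /proc/sys/kernel/sched_energy_aware, assuming the usual sysctl table
 * registration elsewhere): writing a value that disagrees with the current
 * EAS state rebuilds the sched domains via rebuild_sched_domains_energy().
 */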
#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, state;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write) {
		state = static_branch_unlikely(&sched_energy_present);
		if (state != sysctl_sched_energy_aware)
			rebuild_sched_domains_energy();
	}

	return ret;
}
#endif

static void free_pd(struct perf_domain *pd)
{
	struct perf_domain *tmp;

	while (pd) {
		tmp = pd->next;
		kfree(pd);
		pd = tmp;
	}
}

static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
	while (pd) {
		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
			return pd;
		pd = pd->next;
	}

	return NULL;
}

static struct perf_domain *pd_init(int cpu)
{
	struct em_perf_domain *obj = em_cpu_get(cpu);
	struct perf_domain *pd;

	if (!obj) {
		if (sched_debug())
			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
		return NULL;
	}

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return NULL;
	pd->em_pd = obj;

	return pd;
}

static void perf_domain_debug(const struct cpumask *cpu_map,
						struct perf_domain *pd)
{
	if (!sched_debug() || !pd)
		return;

	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

	while (pd) {
		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
				cpumask_first(perf_domain_span(pd)),
				cpumask_pr_args(perf_domain_span(pd)),
				em_pd_nr_perf_states(pd->em_pd));
		pd = pd->next;
	}

	printk(KERN_CONT "\n");
}

static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
	struct perf_domain *pd;

	pd = container_of(rp, struct perf_domain, rcu);
	free_pd(pd);
}

static void sched_energy_set(bool has_eas)
{
	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: stopping EAS\n", __func__);
		static_branch_disable_cpuslocked(&sched_energy_present);
	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: starting EAS\n", __func__);
		static_branch_enable_cpuslocked(&sched_energy_present);
	}
}

/*
 * EAS can be used on a root domain if it meets all the following conditions:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
 *    3. no SMT is detected.
 *    4. the EM complexity is low enough to keep scheduling overheads low;
 *    5. schedutil is driving the frequency of all CPUs of the rd;
 *    6. frequency invariance support is present;
 *
 * The complexity of the Energy Model is defined as:
 *
 *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with parameters defined as:
 *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
 *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
 *              with 10 performance states each, nr_ps = 2 * 10 = 20).
 *
 * It is generally not a good idea to use such a model in the wake-up path on
 * very complex platforms because of the associated scheduling overheads. The
 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
 * with per-CPU DVFS and less than 8 performance states each, for example.
 */
#define EM_MAX_COMPLEXITY 2048
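/*
 * Worked example (illustrative): 16 CPUs with per-CPU DVFS gives nr_pd = 16
 * and nr_cpus = 16; with 8 performance states each, nr_ps = 16 * 8 = 128 and
 * C = 16 * (16 + 128) = 2304 > 2048, so EAS would be rejected, whereas 7
 * states each gives C = 16 * (16 + 112) = 2048 and passes the check below.
 */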

extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
	struct perf_domain *pd = NULL, *tmp;
	int cpu = cpumask_first(cpu_map);
	struct root_domain *rd = cpu_rq(cpu)->rd;
	struct cpufreq_policy *policy;
	struct cpufreq_governor *gov;

	if (!sysctl_sched_energy_aware)
		goto free;

	/* EAS is enabled for asymmetric CPU capacity topologies. */
	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
		if (sched_debug()) {
			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
					cpumask_pr_args(cpu_map));
		}
		goto free;
	}

	/* EAS definitely does *not* handle SMT */
	if (sched_smt_active()) {
		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
			cpumask_pr_args(cpu_map));
		goto free;
	}

	if (!arch_scale_freq_invariant()) {
		if (sched_debug()) {
			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
				cpumask_pr_args(cpu_map));
		}
		goto free;
	}

	for_each_cpu(i, cpu_map) {
		/* Skip already covered CPUs. */
		if (find_pd(pd, i))
			continue;

		/* Do not attempt EAS if schedutil is not being used. */
		policy = cpufreq_cpu_get(i);
		if (!policy)
			goto free;
		gov = policy->governor;
		cpufreq_cpu_put(policy);
		if (gov != &schedutil_gov) {
			if (rd->pd)
				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
						cpumask_pr_args(cpu_map));
			goto free;
		}

		/* Create the new pd and add it to the local list. */
		tmp = pd_init(i);
		if (!tmp)
			goto free;
		tmp->next = pd;
		pd = tmp;

		/*
		 * Count performance domains and performance states for the
		 * complexity check.
		 */
		nr_pd++;
		nr_ps += em_pd_nr_perf_states(pd->em_pd);
	}

	/* Bail out if the Energy Model complexity is too high. */
	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
						cpumask_pr_args(cpu_map));
		goto free;
	}

	perf_domain_debug(cpu_map, pd);

	/* Attach the new list of performance domains to the root domain. */
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, pd);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return !!pd;

free:
	free_pd(pd);
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, NULL);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return false;
}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	cpudl_cleanup(&rd->cpudl);
	free_cpumask_var(rd->dlo_mask);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	free_pd(rd->pd);
	kfree(rd);
}

void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_rq_unlock_irqrestore(rq, flags);

	if (old_rd)
		call_rcu(&old_rd->rcu, free_rootdomain);
}

void sched_get_rd(struct root_domain *rd)
{
	atomic_inc(&rd->refcount);
}

void sched_put_rd(struct root_domain *rd)
{
	if (!atomic_dec_and_test(&rd->refcount))
		return;

	call_rcu(&rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
		goto free_online;
	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_dlo_mask;

#ifdef HAVE_RT_PUSH_IPI
	rd->rto_cpu = -1;
	raw_spin_lock_init(&rd->rto_lock);
	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif

	rd->visit_gen = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_cpudl;
	return 0;

free_cpudl:
	cpudl_cleanup(&rd->cpudl);
free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_dlo_mask:
	free_cpumask_var(rd->dlo_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		tmp = sg->next;

		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
			kfree(sg->sgc);

		if (atomic_dec_and_test(&sg->ref))
			kfree(sg);
		sg = tmp;
	} while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references, an
	 * overlapping domain, having private groups, only one.  Iterate,
	 * dropping group/capacity references, freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}

static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	while (sd) {
		struct sched_domain *parent = sd->parent;
		destroy_sched_domain(sd);
		sd = parent;
	}
}

static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}

/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this allows us to
 * avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;
	int numa_distance = 0;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd)
			sd->child = NULL;
	}

	for (tmp = sd; tmp; tmp = tmp->parent)
		numa_distance += !!(tmp->flags & SD_NUMA);

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	dirty_sched_domain_sysctl(cpu);
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

struct s_data {
	struct sched_domain * __percpu *sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask contains all those CPUs that could actually end up at
 * this group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}


/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2	0-3		0-3		0-3		0-3
 *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
 *
 * NUMA-1	0-1,3		0-2		1-3		0,2-3
 *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
 *
 * NUMA-0	0		1		2		3
 *
 *
 * As can be seen; things don't nicely line up as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the number of
 * sched_domain levels is not the same for each CPU. Consider:
 *
 * NUMA-2	0-3						0-3
 *  groups:	{0-2},{1-3}					{1-3},{0-2}
 *
 * NUMA-1	0-2		0-3		0-3		1-3
 *
 * NUMA-0	0		1		2		3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
 */
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *sg_span = sched_group_span(sg);
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(mask);

	for_each_cpu(i, sg_span) {
		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. The mask will not be empty because those CPUs that
		 * do have the top domain _should_ span the domain.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}

/*
 * XXX: This creates per-node group entries; since the load-balancer will
 * immediately access remote memory to construct this group's load-balance
 * statistics having the groups node local is of dubious benefit.
 */
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
	struct sched_group *sg;
	struct cpumask *sg_span;

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			GFP_KERNEL, cpu_to_node(cpu));

	if (!sg)
		return NULL;

	sg_span = sched_group_span(sg);
	if (sd->child)
		cpumask_copy(sg_span, sched_domain_span(sd->child));
	else
		cpumask_copy(sg_span, sched_domain_span(sd));

	atomic_inc(&sg->ref);
	return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
				     struct sched_group *sg)
{
	struct cpumask *mask = sched_domains_tmpmask2;
	struct sd_data *sdd = sd->private;
	struct cpumask *sg_span;
	int cpu;

	build_balance_mask(sd, sg, mask);
	cpu = cpumask_first(mask);

	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
		cpumask_copy(group_balance_mask(sg), mask);
	else
		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));

	/*
	 * Initialize sgc->capacity such that even if we mess up the
	 * domains and no possible iteration will get us here, we won't
	 * die on a /0 trap.
	 */
	sg_span = sched_group_span(sg);
	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}

static struct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{
	/*
	 * The proper descendant would be the one whose child won't span out
	 * of sd
	 */
	while (sibling->child &&
	       !cpumask_subset(sched_domain_span(sibling->child),
			       sched_domain_span(sd)))
		sibling = sibling->child;

	/*
	 * As we are referencing sgc across different topology levels, we need
	 * to go down to skip those sched_domains which don't contribute to
	 * scheduling because they will be degenerated in cpu_attach_domain().
	 */
	while (sibling->child &&
	       cpumask_equal(sched_domain_span(sibling->child),
			     sched_domain_span(sibling)))
		sibling = sibling->child;

	return sibling;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth, make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;

		/*
		 * Usually we build the sched_group from the sibling's child
		 * sched_domain. But for machines whose NUMA diameter is 3 or
		 * above, we move to build the sched_group from the sibling's
		 * proper descendant's child domain, because the sibling's
		 * child sched_domain will span out of the sched_domain being
		 * built, as below.
		 *
		 * Smallest diameter=3 topology is:
		 *
		 *   node	0	1	2	3
		 *     0:	10	20	30	40
		 *     1:	20	10	20	30
		 *     2:	30	20	10	20
		 *     3:	40	30	20	10
		 *
		 *   0 --- 1 --- 2 --- 3
		 *
		 * NUMA-3	0-3		N/A		N/A		0-3
		 *  groups:	{0-2},{1-3}					{1-3},{0-2}
		 *
		 * NUMA-2	0-2		0-3		0-3		1-3
		 *  groups:	{0-1},{1-3}	{0-2},{2-3}	{1-3},{0-1}	{2-3},{0-2}
		 *
		 * NUMA-1	0-1		0-2		1-3		2-3
		 *  groups:	{0},{1}		{1},{2},{0}	{2},{3},{1}	{3},{2}
		 *
		 * NUMA-0	0		1		2		3
		 *
		 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
		 * group span isn't a subset of the domain span.
		 */
		if (sibling->child &&
		    !cpumask_subset(sched_domain_span(sibling->child), span))
			sibling = find_descended_sibling(sd, sibling);

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sibling, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}


/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (DIE)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *	sched_domain -> sched_group -> sched_group_capacity
 *	    ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-CPU and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * DIE  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both of which require a CPU
 * that uniquely identifies each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * These topologies are exclusive by construction: it is impossible for an
 * SMT thread to belong to multiple cores, and for cores to be part of
 * multiple caches. There is a very clear and unique location for each CPU
 * in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
 * group), we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
 */

static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;
	bool already_visited;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

	/* Increase refcounts for claim_allocations: */
	already_visited = atomic_inc_return(&sg->ref) > 1;
	/* sgc visits should follow a similar trend as sg */
	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));

	/* If we have already visited that group, it's already initialized. */
	if (already_visited)
		return sg;

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
	} else {
		cpumask_set_cpu(cpu, sched_group_span(sg));
		cpumask_set_cpu(cpu, group_balance_mask(sg));
	}

	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

	return sg;
}

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, will set each group's ->cpumask correctly,
 * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct sched_group *sg;

		if (cpumask_test_cpu(i, covered))
			continue;

		sg = get_group(i, sdd);

		cpumask_or(covered, covered, sched_group_span(sg));

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
	sd->groups = first;

	return 0;
}

/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries, the
 * group having more cpu_capacity will pick up more load compared to the
 * group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;

	WARN_ON(!sg);

	do {
		int cpu, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_span(sg));

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		for_each_cpu(cpu, sched_group_span(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}

/*
 * Asymmetric CPU capacity bits
 */
struct asym_cap_data {
	struct list_head link;
	unsigned long capacity;
	unsigned long cpus[];
};

/*
 * Set of available CPUs grouped by their corresponding capacities
 * Each list entry contains a CPU mask reflecting CPUs that share the same
 * capacity.
 * The lifespan of data is unlimited.
 */
static LIST_HEAD(asym_cap_list);

#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)

/*
 * Verify whether there is any CPU capacity asymmetry in a given sched domain.
 * Provides sd_flags reflecting the asymmetry scope.
 */
static inline int
asym_cpu_capacity_classify(const struct cpumask *sd_span,
			   const struct cpumask *cpu_map)
{
	struct asym_cap_data *entry;
	int count = 0, miss = 0;

	/*
	 * Count how many unique CPU capacities this domain spans across
	 * (compare sched_domain CPUs mask with ones representing available
	 * CPUs capacities). Take into account CPUs that might be offline:
	 * skip those.
	 */
	list_for_each_entry(entry, &asym_cap_list, link) {
		if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
			++count;
		else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
			++miss;
	}

	WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));

	/* No asymmetry detected */
	if (count < 2)
		return 0;
	/* Some of the available CPU capacity values have not been detected */
	if (miss)
		return SD_ASYM_CPUCAPACITY;

	/* Full asymmetry */
	return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;

}

static inline void asym_cpu_capacity_update_data(int cpu)
{
	unsigned long capacity = arch_scale_cpu_capacity(cpu);
	struct asym_cap_data *entry = NULL;

	list_for_each_entry(entry, &asym_cap_list, link) {
		if (capacity == entry->capacity)
			goto done;
	}

	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
		return;
	entry->capacity = capacity;
	list_add(&entry->link, &asym_cap_list);
done:
	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}

/*
 * Build-up/update list of CPUs grouped by their capacities
 * An update requires explicit request to rebuild sched domains
 * with state indicating CPU topology changes.
 */
static void asym_cpu_capacity_scan(void)
{
	struct asym_cap_data *entry, *next;
	int cpu;

	list_for_each_entry(entry, &asym_cap_list, link)
		cpumask_clear(cpu_capacity_span(entry));

	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
		asym_cpu_capacity_update_data(cpu);

	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
		if (cpumask_empty(cpu_capacity_span(entry))) {
			list_del(&entry->link);
			kfree(entry);
		}
	}

	/*
	 * Only one capacity value has been detected i.e. this system is symmetric.
	 * No need to keep this data around.
	 */
	if (list_is_singular(&asym_cap_list)) {
		entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
		list_del(&entry->link);
		kfree(entry);
	}
}

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
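/* e.g. booting with "relax_domain_level=2" selects that default level. */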

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;

	if (sd->level > request) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		fallthrough;
	case sa_sd:
		free_percpu(d->sd);
		fallthrough;
	case sa_sd_storage:
		__sdt_free(cpu_map);
		fallthrough;
	case sa_none:
		break;
	}
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;

	return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

static int			sched_domains_numa_levels;
static int			sched_domains_curr_level;

int				sched_max_numa_distance;
static int			*sched_domains_numa_distance;
static struct cpumask		***sched_domains_numa_masks;
int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE;

static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif

/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *
 * The odd one out, which besides describing the topology also prescribes the
 * desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
1505#define TOPOLOGY_SD_FLAGS \
Ingo Molnar97fb7a02018-03-03 14:01:12 +01001506 (SD_SHARE_CPUCAPACITY | \
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001507 SD_SHARE_PKG_RESOURCES | \
Ingo Molnar97fb7a02018-03-03 14:01:12 +01001508 SD_NUMA | \
Valentin Schneidercfe7ddc2020-08-17 12:29:47 +01001509 SD_ASYM_PACKING)
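/*
 * Illustrative sketch (not part of this file): a topology level's ->sd_flags()
 * callback is expected to return only bits from TOPOLOGY_SD_FLAGS, e.g. a
 * hypothetical level describing a cache-sharing cluster could use:
 *
 *	static int my_cluster_flags(void)
 *	{
 *		return SD_SHARE_PKG_RESOURCES;
 *	}
 *
 * Any other bit returned here trips the WARN_ONCE() in sd_init() below and
 * is masked off.
 */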
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001510
1511static struct sched_domain *
1512sd_init(struct sched_domain_topology_level *tl,
1513 const struct cpumask *cpu_map,
Beata Michalskac744dc42021-06-03 15:06:26 +01001514 struct sched_domain *child, int cpu)
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001515{
1516 struct sd_data *sdd = &tl->data;
1517 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1518 int sd_id, sd_weight, sd_flags = 0;
Beata Michalskac744dc42021-06-03 15:06:26 +01001519 struct cpumask *sd_span;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001520
1521#ifdef CONFIG_NUMA
1522 /*
1523 * Ugly hack to pass state to sd_numa_mask()...
1524 */
1525 sched_domains_curr_level = tl->numa_level;
1526#endif
1527
1528 sd_weight = cpumask_weight(tl->mask(cpu));
1529
1530 if (tl->sd_flags)
1531 sd_flags = (*tl->sd_flags)();
1532 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1533 "wrong sd_flags in topology description\n"))
Peng Liu9b1b2342020-06-09 23:09:36 +08001534 sd_flags &= TOPOLOGY_SD_FLAGS;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001535
1536 *sd = (struct sched_domain){
1537 .min_interval = sd_weight,
1538 .max_interval = 2*sd_weight,
Vincent Guittot6e749912020-09-21 09:24:24 +02001539 .busy_factor = 16,
Vincent Guittot2208cda2020-09-21 09:24:22 +02001540 .imbalance_pct = 117,
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001541
1542 .cache_nice_tries = 0,
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001543
Valentin Schneider36c5bdc2020-04-15 22:05:07 +01001544 .flags = 1*SD_BALANCE_NEWIDLE
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001545 | 1*SD_BALANCE_EXEC
1546 | 1*SD_BALANCE_FORK
1547 | 0*SD_BALANCE_WAKE
1548 | 1*SD_WAKE_AFFINE
1549 | 0*SD_SHARE_CPUCAPACITY
1550 | 0*SD_SHARE_PKG_RESOURCES
1551 | 0*SD_SERIALIZE
Morten Rasmussen9c63e842018-07-04 11:17:50 +01001552 | 1*SD_PREFER_SIBLING
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001553 | 0*SD_NUMA
1554 | sd_flags
1555 ,
1556
1557 .last_balance = jiffies,
1558 .balance_interval = sd_weight,
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001559 .max_newidle_lb_cost = 0,
1560 .next_decay_max_lb_cost = jiffies,
1561 .child = child,
1562#ifdef CONFIG_SCHED_DEBUG
1563 .name = tl->name,
1564#endif
1565 };
1566
Beata Michalskac744dc42021-06-03 15:06:26 +01001567 sd_span = sched_domain_span(sd);
1568 cpumask_and(sd_span, cpu_map, tl->mask(cpu));
1569 sd_id = cpumask_first(sd_span);
1570
1571 sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
1572
1573 WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
1574 (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
1575 "CPU capacity asymmetry not supported on SMT\n");
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001576
1577 /*
1578 * Convert topological properties into behaviour.
1579 */
Morten Rasmussena526d462020-02-06 19:19:55 +00001580 /* Don't attempt to spread across CPUs of different capacities. */
1581 if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
1582 sd->child->flags &= ~SD_PREFER_SIBLING;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001583
1584 if (sd->flags & SD_SHARE_CPUCAPACITY) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001585 sd->imbalance_pct = 110;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001586
1587 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1588 sd->imbalance_pct = 117;
1589 sd->cache_nice_tries = 1;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001590
1591#ifdef CONFIG_NUMA
1592 } else if (sd->flags & SD_NUMA) {
1593 sd->cache_nice_tries = 2;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001594
Morten Rasmussen9c63e842018-07-04 11:17:50 +01001595 sd->flags &= ~SD_PREFER_SIBLING;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001596 sd->flags |= SD_SERIALIZE;
Matt Fleminga55c7452019-08-08 20:53:01 +01001597 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001598 sd->flags &= ~(SD_BALANCE_EXEC |
1599 SD_BALANCE_FORK |
1600 SD_WAKE_AFFINE);
1601 }
1602
1603#endif
1604 } else {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001605 sd->cache_nice_tries = 1;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001606 }
1607
1608 /*
1609	 * For all levels sharing cache, connect a sched_domain_shared
1610 * instance.
1611 */
1612 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1613 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1614 atomic_inc(&sd->shared->ref);
1615 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1616 }
1617
1618 sd->private = sdd;
1619
1620 return sd;
1621}
1622
1623/*
1624 * Topology list, bottom-up.
1625 */
1626static struct sched_domain_topology_level default_topology[] = {
1627#ifdef CONFIG_SCHED_SMT
1628 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1629#endif
1630#ifdef CONFIG_SCHED_MC
1631 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1632#endif
1633 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1634 { NULL, },
1635};
1636
1637static struct sched_domain_topology_level *sched_domain_topology =
1638 default_topology;
1639
1640#define for_each_sd_topology(tl) \
1641 for (tl = sched_domain_topology; tl->mask; tl++)
1642
1643void set_sched_topology(struct sched_domain_topology_level *tl)
1644{
1645 if (WARN_ON_ONCE(sched_smp_initialized))
1646 return;
1647
1648 sched_domain_topology = tl;
1649}
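/*
 * Illustrative usage (a sketch, with a hypothetical table name): an
 * architecture can install its own bottom-up table early in boot, before
 * sched_smp_initialized is set, for example:
 *
 *	static struct sched_domain_topology_level arch_topology[] = {
 *	#ifdef CONFIG_SCHED_SMT
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *	#endif
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(arch_topology);
 */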
1650
1651#ifdef CONFIG_NUMA
1652
1653static const struct cpumask *sd_numa_mask(int cpu)
1654{
1655 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1656}
1657
1658static void sched_numa_warn(const char *str)
1659{
1660	static bool done = false;
1661	int i, j;
1662
1663 if (done)
1664 return;
1665
1666 done = true;
1667
1668 printk(KERN_WARNING "ERROR: %s\n\n", str);
1669
1670 for (i = 0; i < nr_node_ids; i++) {
1671 printk(KERN_WARNING " ");
1672 for (j = 0; j < nr_node_ids; j++)
1673			printk(KERN_CONT "%02d ", node_distance(i, j));
1674 printk(KERN_CONT "\n");
1675 }
1676 printk(KERN_WARNING "\n");
1677}
1678
1679bool find_numa_distance(int distance)
1680{
1681 int i;
1682
1683 if (distance == node_distance(0, 0))
1684 return true;
1685
1686 for (i = 0; i < sched_domains_numa_levels; i++) {
1687 if (sched_domains_numa_distance[i] == distance)
1688 return true;
1689 }
1690
1691 return false;
1692}
1693
1694/*
1695 * A system can have three types of NUMA topology:
1696 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1697 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1698 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1699 *
1700 * The difference between a glueless mesh topology and a backplane
1701 * topology lies in whether communication between not directly
1702 * connected nodes goes through intermediary nodes (where programs
1703 * could run), or through backplane controllers. This affects
1704 * placement of programs.
1705 *
1706 * The type of topology can be discerned with the following tests:
1707 * - If the maximum distance between any nodes is 1 hop, the system
1708 * is directly connected.
1709 * - If for two nodes A and B, located N > 1 hops away from each other,
1710 * there is an intermediary node C, which is < N hops away from both
1711 * nodes A and B, the system is a glueless mesh.
1712 */
1713static void init_numa_topology_type(void)
1714{
1715 int a, b, c, n;
1716
1717 n = sched_max_numa_distance;
1718
Srikar Dronamrajue5e96fa2018-08-10 22:30:18 +05301719 if (sched_domains_numa_levels <= 2) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001720 sched_numa_topology_type = NUMA_DIRECT;
1721 return;
1722 }
1723
1724 for_each_online_node(a) {
1725 for_each_online_node(b) {
1726 /* Find two nodes furthest removed from each other. */
1727 if (node_distance(a, b) < n)
1728 continue;
1729
1730 /* Is there an intermediary node between a and b? */
1731 for_each_online_node(c) {
1732 if (node_distance(a, c) < n &&
1733 node_distance(b, c) < n) {
1734 sched_numa_topology_type =
1735 NUMA_GLUELESS_MESH;
1736 return;
1737 }
1738 }
1739
1740 sched_numa_topology_type = NUMA_BACKPLANE;
1741 return;
1742 }
1743 }
1744}
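/*
 * Worked example (distances are made up): with a three-node node_distance()
 * table of
 *
 *	10 20 30
 *	20 10 20
 *	30 20 10
 *
 * there are three unique distances, so the "<= 2 levels" shortcut above does
 * not apply. The furthest pair is (0, 2) at distance 30 and node 1 is closer
 * than 30 to both of them, so the type is NUMA_GLUELESS_MESH. Were there no
 * such intermediary node (traffic between the far nodes only crossing a
 * backplane controller), the type would be NUMA_BACKPLANE.
 */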
1745
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001746
1747#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
1748
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001749void sched_init_numa(void)
1750{
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001751 struct sched_domain_topology_level *tl;
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001752 unsigned long *distance_map;
1753 int nr_levels = 0;
1754 int i, j;
Suravee Suthikulpanit051f3ca2017-09-07 02:20:05 -05001755
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001756 /*
1757	 * O(nr_nodes^2) de-duplication pass -- scan the node_distance() table and
1758	 * record each value in a bitmap to find the unique distances.
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001759 */
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001760 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
1761 if (!distance_map)
1762 return;
1763
1764 bitmap_zero(distance_map, NR_DISTANCE_VALUES);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001765 for (i = 0; i < nr_node_ids; i++) {
1766 for (j = 0; j < nr_node_ids; j++) {
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001767 int distance = node_distance(i, j);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001768
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001769 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
1770 sched_numa_warn("Invalid distance value range");
1771 return;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001772 }
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001773
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001774 bitmap_set(distance_map, distance, 1);
1775 }
1776 }
1777 /*
1778 * We can now figure out how many unique distance values there are and
1779 * allocate memory accordingly.
1780 */
1781 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
1782
1783 sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
1784 if (!sched_domains_numa_distance) {
1785 bitmap_free(distance_map);
1786 return;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001787 }
1788
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001789 for (i = 0, j = 0; i < nr_levels; i++, j++) {
1790 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
1791 sched_domains_numa_distance[i] = j;
1792 }
1793
1794 bitmap_free(distance_map);
1795
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001796 /*
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001797 * 'nr_levels' contains the number of unique distances
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001798 *
1799 * The sched_domains_numa_distance[] array includes the actual distance
1800 * numbers.
1801 */
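/*
 * For example (illustrative numbers): if node_distance() only ever returns
 * 10, 20 and 30, exactly three bits end up set in the bitmap, so nr_levels
 * is 3 and sched_domains_numa_distance[] = { 10, 20, 30 }, sorted ascending
 * because the bits are walked with find_next_bit().
 */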
1802
1803 /*
1804	 * Here, we temporarily reset sched_domains_numa_levels to 0.
1805 * If it fails to allocate memory for array sched_domains_numa_masks[][],
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001806	 * the array will contain fewer than 'nr_levels' members. This could be
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001807 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
1808 * in other functions.
1809 *
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001810 * We reset it to 'nr_levels' at the end of this function.
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001811 */
1812 sched_domains_numa_levels = 0;
1813
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001814 sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001815 if (!sched_domains_numa_masks)
1816 return;
1817
1818 /*
1819 * Now for each level, construct a mask per node which contains all
1820	 * CPUs of nodes that are at most that many hops away from us.
1821 */
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001822 for (i = 0; i < nr_levels; i++) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001823 sched_domains_numa_masks[i] =
1824 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1825 if (!sched_domains_numa_masks[i])
1826 return;
1827
1828 for (j = 0; j < nr_node_ids; j++) {
1829 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001830 int k;
1831
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001832 if (!mask)
1833 return;
1834
1835 sched_domains_numa_masks[i][j] = mask;
1836
1837 for_each_node(k) {
Valentin Schneider00832422021-08-18 13:13:33 +05301838 /*
1839				 * Distance information can be unreliable for
1840				 * offline nodes; defer building an offline
1841				 * node's masks until its bringup.
1842 * This relies on all unique distance values
1843 * still being visible at init time.
1844 */
1845 if (!node_online(j))
1846 continue;
1847
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001848 if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
1849 sched_numa_warn("Node-distance not symmetric");
1850
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001851 if (node_distance(j, k) > sched_domains_numa_distance[i])
1852 continue;
1853
1854 cpumask_or(mask, mask, cpumask_of_node(k));
1855 }
1856 }
1857 }
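/*
 * Continuing the illustrative three-node example: for node 0,
 * sched_domains_numa_masks[0][0] covers only node 0's CPUs (distance 10),
 * sched_domains_numa_masks[1][0] adds node 1 (distance <= 20) and
 * sched_domains_numa_masks[2][0] covers all three nodes (distance <= 30).
 * Masks of nodes that were offline at this point stay empty until the node
 * is brought up, see __sched_domains_numa_masks_set().
 */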
1858
1859 /* Compute default topology size */
1860 for (i = 0; sched_domain_topology[i].mask; i++);
1861
Dietmar Eggemann71e5f662021-02-01 10:53:53 +01001862 tl = kzalloc((i + nr_levels + 1) *
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001863 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1864 if (!tl)
1865 return;
1866
1867 /*
1868 * Copy the default topology bits..
1869 */
1870 for (i = 0; sched_domain_topology[i].mask; i++)
1871 tl[i] = sched_domain_topology[i];
1872
1873 /*
Suravee Suthikulpanit051f3ca2017-09-07 02:20:05 -05001874 * Add the NUMA identity distance, aka single NODE.
1875 */
1876 tl[i++] = (struct sched_domain_topology_level){
1877 .mask = sd_numa_mask,
1878 .numa_level = 0,
1879 SD_INIT_NAME(NODE)
1880 };
1881
1882 /*
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001883 * .. and append 'j' levels of NUMA goodness.
1884 */
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001885 for (j = 1; j < nr_levels; i++, j++) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001886 tl[i] = (struct sched_domain_topology_level){
1887 .mask = sd_numa_mask,
1888 .sd_flags = cpu_numa_flags,
1889 .flags = SDTL_OVERLAP,
1890 .numa_level = j,
1891 SD_INIT_NAME(NUMA)
1892 };
1893 }
1894
1895 sched_domain_topology = tl;
1896
Valentin Schneider620a6dc2021-01-22 12:39:43 +00001897 sched_domains_numa_levels = nr_levels;
1898 sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001899
1900 init_numa_topology_type();
Valentin Schneider00832422021-08-18 13:13:33 +05301901
1902 sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
1903 if (!sched_numa_onlined_nodes)
1904 return;
1905
1906 bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
1907 for_each_online_node(i)
1908 bitmap_set(sched_numa_onlined_nodes, i, 1);
1909}
1910
1911static void __sched_domains_numa_masks_set(unsigned int node)
1912{
1913 int i, j;
1914
1915 /*
1916 * NUMA masks are not built for offline nodes in sched_init_numa().
1917 * Thus, when a CPU of a never-onlined-before node gets plugged in,
1918 * adding that new CPU to the right NUMA masks is not sufficient: the
1919 * masks of that CPU's node must also be updated.
1920 */
1921 if (test_bit(node, sched_numa_onlined_nodes))
1922 return;
1923
1924 bitmap_set(sched_numa_onlined_nodes, node, 1);
1925
1926 for (i = 0; i < sched_domains_numa_levels; i++) {
1927 for (j = 0; j < nr_node_ids; j++) {
1928 if (!node_online(j) || node == j)
1929 continue;
1930
1931 if (node_distance(j, node) > sched_domains_numa_distance[i])
1932 continue;
1933
1934 /* Add remote nodes in our masks */
1935 cpumask_or(sched_domains_numa_masks[i][node],
1936 sched_domains_numa_masks[i][node],
1937 sched_domains_numa_masks[0][j]);
1938 }
1939 }
1940
1941 /*
1942 * A new node has been brought up, potentially changing the topology
1943 * classification.
1944 *
1945 * Note that this is racy vs any use of sched_numa_topology_type :/
1946 */
1947 init_numa_topology_type();
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001948}
1949
1950void sched_domains_numa_masks_set(unsigned int cpu)
1951{
1952 int node = cpu_to_node(cpu);
1953 int i, j;
1954
Valentin Schneider00832422021-08-18 13:13:33 +05301955 __sched_domains_numa_masks_set(node);
1956
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001957 for (i = 0; i < sched_domains_numa_levels; i++) {
1958 for (j = 0; j < nr_node_ids; j++) {
Valentin Schneider00832422021-08-18 13:13:33 +05301959 if (!node_online(j))
1960 continue;
1961
1962 /* Set ourselves in the remote node's masks */
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001963 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1964 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1965 }
1966 }
1967}
1968
1969void sched_domains_numa_masks_clear(unsigned int cpu)
1970{
1971 int i, j;
1972
1973 for (i = 0; i < sched_domains_numa_levels; i++) {
1974 for (j = 0; j < nr_node_ids; j++)
1975 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1976 }
1977}
1978
Wanpeng Lie0e8d492019-06-28 16:51:41 +08001979/*
1980 * sched_numa_find_closest() - given the NUMA topology, find the cpu
1981 * closest to @cpu from @cpus.
1982 * cpus: cpumask to find a cpu from
1983 * cpu: cpu to be close to
1984 *
1985 * returns: cpu, or nr_cpu_ids when nothing found.
1986 */
1987int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1988{
1989 int i, j = cpu_to_node(cpu);
1990
1991 for (i = 0; i < sched_domains_numa_levels; i++) {
1992 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1993 if (cpu < nr_cpu_ids)
1994 return cpu;
1995 }
1996 return nr_cpu_ids;
1997}
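/*
 * The levels are walked from nearest to furthest, so the first hit is the
 * closest eligible CPU; the housekeeping code, for instance, uses this to
 * prefer a housekeeping CPU on the local node before falling back to more
 * distant ones.
 */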
1998
Ingo Molnarf2cb1362017-02-01 13:10:18 +01001999#endif /* CONFIG_NUMA */
2000
2001static int __sdt_alloc(const struct cpumask *cpu_map)
2002{
2003 struct sched_domain_topology_level *tl;
2004 int j;
2005
2006 for_each_sd_topology(tl) {
2007 struct sd_data *sdd = &tl->data;
2008
2009 sdd->sd = alloc_percpu(struct sched_domain *);
2010 if (!sdd->sd)
2011 return -ENOMEM;
2012
2013 sdd->sds = alloc_percpu(struct sched_domain_shared *);
2014 if (!sdd->sds)
2015 return -ENOMEM;
2016
2017 sdd->sg = alloc_percpu(struct sched_group *);
2018 if (!sdd->sg)
2019 return -ENOMEM;
2020
2021 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
2022 if (!sdd->sgc)
2023 return -ENOMEM;
2024
2025 for_each_cpu(j, cpu_map) {
2026 struct sched_domain *sd;
2027 struct sched_domain_shared *sds;
2028 struct sched_group *sg;
2029 struct sched_group_capacity *sgc;
2030
2031 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
2032 GFP_KERNEL, cpu_to_node(j));
2033 if (!sd)
2034 return -ENOMEM;
2035
2036 *per_cpu_ptr(sdd->sd, j) = sd;
2037
2038 sds = kzalloc_node(sizeof(struct sched_domain_shared),
2039 GFP_KERNEL, cpu_to_node(j));
2040 if (!sds)
2041 return -ENOMEM;
2042
2043 *per_cpu_ptr(sdd->sds, j) = sds;
2044
2045 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
2046 GFP_KERNEL, cpu_to_node(j));
2047 if (!sg)
2048 return -ENOMEM;
2049
2050 sg->next = sg;
2051
2052 *per_cpu_ptr(sdd->sg, j) = sg;
2053
2054 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
2055 GFP_KERNEL, cpu_to_node(j));
2056 if (!sgc)
2057 return -ENOMEM;
2058
Peter Zijlstra005f8742017-04-26 17:35:35 +02002059#ifdef CONFIG_SCHED_DEBUG
2060 sgc->id = j;
2061#endif
2062
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002063 *per_cpu_ptr(sdd->sgc, j) = sgc;
2064 }
2065 }
2066
2067 return 0;
2068}
2069
2070static void __sdt_free(const struct cpumask *cpu_map)
2071{
2072 struct sched_domain_topology_level *tl;
2073 int j;
2074
2075 for_each_sd_topology(tl) {
2076 struct sd_data *sdd = &tl->data;
2077
2078 for_each_cpu(j, cpu_map) {
2079 struct sched_domain *sd;
2080
2081 if (sdd->sd) {
2082 sd = *per_cpu_ptr(sdd->sd, j);
2083 if (sd && (sd->flags & SD_OVERLAP))
2084 free_sched_groups(sd->groups, 0);
2085 kfree(*per_cpu_ptr(sdd->sd, j));
2086 }
2087
2088 if (sdd->sds)
2089 kfree(*per_cpu_ptr(sdd->sds, j));
2090 if (sdd->sg)
2091 kfree(*per_cpu_ptr(sdd->sg, j));
2092 if (sdd->sgc)
2093 kfree(*per_cpu_ptr(sdd->sgc, j));
2094 }
2095 free_percpu(sdd->sd);
2096 sdd->sd = NULL;
2097 free_percpu(sdd->sds);
2098 sdd->sds = NULL;
2099 free_percpu(sdd->sg);
2100 sdd->sg = NULL;
2101 free_percpu(sdd->sgc);
2102 sdd->sgc = NULL;
2103 }
2104}
2105
Viresh Kumar181a80d12017-04-27 13:58:59 +05302106static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002107 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
Beata Michalskac744dc42021-06-03 15:06:26 +01002108 struct sched_domain *child, int cpu)
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002109{
Beata Michalskac744dc42021-06-03 15:06:26 +01002110 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002111
2112 if (child) {
2113 sd->level = child->level + 1;
2114 sched_domain_level_max = max(sched_domain_level_max, sd->level);
2115 child->parent = sd;
2116
2117 if (!cpumask_subset(sched_domain_span(child),
2118 sched_domain_span(sd))) {
2119 pr_err("BUG: arch topology borken\n");
2120#ifdef CONFIG_SCHED_DEBUG
2121 pr_err(" the %s domain not a subset of the %s domain\n",
2122 child->name, sd->name);
2123#endif
Ingo Molnar97fb7a02018-03-03 14:01:12 +01002124			/* Fixup: ensure @sd spans all of @child's CPUs. */
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002125 cpumask_or(sched_domain_span(sd),
2126 sched_domain_span(sd),
2127 sched_domain_span(child));
2128 }
2129
2130 }
2131 set_domain_attribute(sd, attr);
2132
2133 return sd;
2134}
2135
2136/*
Valentin Schneiderccf74122020-01-15 16:09:15 +00002137 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
2138 * any two given CPUs at this (non-NUMA) topology level.
2139 */
2140static bool topology_span_sane(struct sched_domain_topology_level *tl,
2141 const struct cpumask *cpu_map, int cpu)
2142{
2143 int i;
2144
2145 /* NUMA levels are allowed to overlap */
2146 if (tl->flags & SDTL_OVERLAP)
2147 return true;
2148
2149 /*
2150 * Non-NUMA levels cannot partially overlap - they must be either
2151 * completely equal or completely disjoint. Otherwise we can end up
2152 * breaking the sched_group lists - i.e. a later get_group() pass
2153 * breaks the linking done for an earlier span.
2154 */
2155 for_each_cpu(i, cpu_map) {
2156 if (i == cpu)
2157 continue;
2158 /*
2159 * We should 'and' all those masks with 'cpu_map' to exactly
2160 * match the topology we're about to build, but that can only
2161 * remove CPUs, which only lessens our ability to detect
2162		 * overlaps.
2163 */
2164 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
2165 cpumask_intersects(tl->mask(cpu), tl->mask(i)))
2166 return false;
2167 }
2168
2169 return true;
2170}
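/*
 * Illustrative failure case (made-up masks): if a non-NUMA level reported
 * tl->mask(0) == { 0, 1, 2 } but tl->mask(3) == { 2, 3 }, the two spans are
 * neither equal nor disjoint, so this returns false and build_sched_domains()
 * below warns and aborts instead of corrupting the sched_group lists.
 */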
2171
2172/*
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002173 * Build sched domains for a given set of CPUs and attach the sched domains
2174 * to the individual CPUs
2175 */
2176static int
2177build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2178{
Valentin Schneidercd1cb332019-10-23 16:37:44 +01002179 enum s_alloc alloc_state = sa_none;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002180 struct sched_domain *sd;
2181 struct s_data d;
2182 struct rq *rq = NULL;
2183 int i, ret = -ENOMEM;
Morten Rasmussendf054e82018-07-04 11:17:39 +01002184 bool has_asym = false;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002185
Valentin Schneidercd1cb332019-10-23 16:37:44 +01002186 if (WARN_ON(cpumask_empty(cpu_map)))
2187 goto error;
2188
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002189 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2190 if (alloc_state != sa_rootdomain)
2191 goto error;
2192
2193 /* Set up domains for CPUs specified by the cpu_map: */
2194 for_each_cpu(i, cpu_map) {
2195 struct sched_domain_topology_level *tl;
2196
2197 sd = NULL;
2198 for_each_sd_topology(tl) {
Morten Rasmussen05484e02018-07-20 14:32:31 +01002199
Valentin Schneiderccf74122020-01-15 16:09:15 +00002200 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2201 goto error;
2202
Beata Michalskac744dc42021-06-03 15:06:26 +01002203 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
2204
2205 has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
Morten Rasmussen05484e02018-07-20 14:32:31 +01002206
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002207 if (tl == sched_domain_topology)
2208 *per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstraaf855962017-04-26 17:36:41 +02002209 if (tl->flags & SDTL_OVERLAP)
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002210 sd->flags |= SD_OVERLAP;
2211 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2212 break;
2213 }
2214 }
2215
2216 /* Build the groups for the domains */
2217 for_each_cpu(i, cpu_map) {
2218 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2219 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2220 if (sd->flags & SD_OVERLAP) {
2221 if (build_overlap_sched_groups(sd, i))
2222 goto error;
2223 } else {
2224 if (build_sched_groups(sd, i))
2225 goto error;
2226 }
2227 }
2228 }
2229
2230 /* Calculate CPU capacity for physical packages and nodes */
2231 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2232 if (!cpumask_test_cpu(i, cpu_map))
2233 continue;
2234
2235 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2236 claim_allocations(i, sd);
2237 init_sched_groups_capacity(i, sd);
2238 }
2239 }
2240
2241 /* Attach the domains */
2242 rcu_read_lock();
2243 for_each_cpu(i, cpu_map) {
2244 rq = cpu_rq(i);
2245 sd = *per_cpu_ptr(d.sd, i);
2246
2247 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
2248 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
2249 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2250
2251 cpu_attach_domain(sd, d.rd, i);
2252 }
2253 rcu_read_unlock();
2254
Morten Rasmussendf054e82018-07-04 11:17:39 +01002255 if (has_asym)
Valentin Schneidere284df72019-10-23 16:37:45 +01002256 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
Morten Rasmussendf054e82018-07-04 11:17:39 +01002257
Peter Zijlstra94064152021-04-15 18:23:17 +02002258 if (rq && sched_debug_verbose) {
Juri Lellibf5015a2018-05-24 17:29:36 +02002259 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002260 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
2261 }
2262
2263 ret = 0;
2264error:
2265 __free_domain_allocs(&d, alloc_state, cpu_map);
Ingo Molnar97fb7a02018-03-03 14:01:12 +01002266
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002267 return ret;
2268}
2269
2270/* Current sched domains: */
2271static cpumask_var_t *doms_cur;
2272
2273/* Number of sched domains in 'doms_cur': */
2274static int ndoms_cur;
2275
Ingo Molnar3b037062021-03-18 13:38:50 +01002276/* Attributes of custom domains in 'doms_cur' */
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002277static struct sched_domain_attr *dattr_cur;
2278
2279/*
2280 * Special case: If a kmalloc() of a doms_cur partition (array of
2281 * cpumask) fails, then fall back to a single sched domain,
2282 * as determined by the single cpumask fallback_doms.
2283 */
Peter Zijlstra8d5dc512017-04-25 15:29:40 +02002284static cpumask_var_t fallback_doms;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002285
2286/*
2287 * arch_update_cpu_topology lets virtualized architectures update the
2288 * CPU core maps. It is supposed to return 1 if the topology changed
2289 * or 0 if it stayed the same.
2290 */
2291int __weak arch_update_cpu_topology(void)
2292{
2293 return 0;
2294}
2295
2296cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2297{
2298 int i;
2299 cpumask_var_t *doms;
2300
Kees Cook6da2ec52018-06-12 13:55:00 -07002301 doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002302 if (!doms)
2303 return NULL;
2304 for (i = 0; i < ndoms; i++) {
2305 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2306 free_sched_domains(doms, i);
2307 return NULL;
2308 }
2309 }
2310 return doms;
2311}
2312
2313void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2314{
2315 unsigned int i;
2316 for (i = 0; i < ndoms; i++)
2317 free_cpumask_var(doms[i]);
2318 kfree(doms);
2319}
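/*
 * Typical usage sketch (error handling elided, masks are hypothetical): a
 * caller such as the cpuset code builds an array of non-overlapping masks
 * and hands ownership to partition_sched_domains():
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		cpumask_copy(doms[0], some_mask);
 *		cpumask_copy(doms[1], other_mask);
 *		partition_sched_domains(2, doms, NULL);
 *	}
 *
 * The previous 'doms_cur' is freed by partition_sched_domains() itself; the
 * newly passed array must not be freed by the caller.
 */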
2320
2321/*
Juri Lellicb0c0412018-12-19 14:34:45 +01002322 * Set up scheduler domains and groups. For now this just excludes isolated
2323 * CPUs, but could be used to exclude other special cases in the future.
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002324 */
Peter Zijlstra8d5dc512017-04-25 15:29:40 +02002325int sched_init_domains(const struct cpumask *cpu_map)
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002326{
2327 int err;
2328
Peter Zijlstra8d5dc512017-04-25 15:29:40 +02002329 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
Peter Zijlstra16763302017-04-25 14:31:11 +02002330 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
Peter Zijlstra8d5dc512017-04-25 15:29:40 +02002331 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2332
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002333 arch_update_cpu_topology();
Beata Michalskac744dc42021-06-03 15:06:26 +01002334 asym_cpu_capacity_scan();
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002335 ndoms_cur = 1;
2336 doms_cur = alloc_sched_domains(ndoms_cur);
2337 if (!doms_cur)
2338 doms_cur = &fallback_doms;
Frederic Weisbeckeredb93822017-10-27 04:42:37 +02002339 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002340 err = build_sched_domains(doms_cur[0], NULL);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002341
2342 return err;
2343}
2344
2345/*
2346 * Detach sched domains from a group of CPUs specified in cpu_map.
2347 * These CPUs will now be attached to the NULL domain.
2348 */
2349static void detach_destroy_domains(const struct cpumask *cpu_map)
2350{
Valentin Schneidere284df72019-10-23 16:37:45 +01002351 unsigned int cpu = cpumask_any(cpu_map);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002352 int i;
2353
Valentin Schneidere284df72019-10-23 16:37:45 +01002354 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
2355 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2356
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002357 rcu_read_lock();
2358 for_each_cpu(i, cpu_map)
2359 cpu_attach_domain(NULL, &def_root_domain, i);
2360 rcu_read_unlock();
2361}
2362
2363/* handle null as "default" */
2364static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2365 struct sched_domain_attr *new, int idx_new)
2366{
2367 struct sched_domain_attr tmp;
2368
2369 /* Fast path: */
2370 if (!new && !cur)
2371 return 1;
2372
2373 tmp = SD_ATTR_INIT;
Ingo Molnar97fb7a02018-03-03 14:01:12 +01002374
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002375 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2376 new ? (new + idx_new) : &tmp,
2377 sizeof(struct sched_domain_attr));
2378}
2379
2380/*
2381 * Partition sched domains as specified by the 'ndoms_new'
2382 * cpumasks in the array doms_new[]. This compares
2383 * doms_new[] to the current sched domain partitioning, doms_cur[].
2384 * It destroys each deleted domain and builds each new domain.
2385 *
2386 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
2387 * The masks don't intersect (don't overlap). We should set up one
2388 * sched domain for each mask. CPUs not in any of the cpumasks will
2389 * not be load balanced. If the same cpumask appears both in the
2390 * current 'doms_cur' domains and in the new 'doms_new', we can leave
2391 * it as it is.
2392 *
2393 * The passed-in 'doms_new' should be allocated using
2394 * alloc_sched_domains. This routine takes ownership of it and will
2395 * free_sched_domains it when done with it. If the caller failed the
2396 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
2397 * and partition_sched_domains() will fall back to the single partition
2398 * 'fallback_doms'; this also forces the domains to be rebuilt.
2399 *
2400 * If doms_new == NULL it will be replaced with cpu_online_mask.
2401 * ndoms_new == 0 is a special case for destroying existing domains,
2402 * and it will not create the default domain.
2403 *
Mathieu Poirierc22645f2019-07-19 15:59:53 +02002404 * Call with hotplug lock and sched_domains_mutex held
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002405 */
Mathieu Poirierc22645f2019-07-19 15:59:53 +02002406void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2407 struct sched_domain_attr *dattr_new)
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002408{
Quentin Perret1f74de82018-12-03 09:56:22 +00002409 bool __maybe_unused has_eas = false;
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002410 int i, j, n;
2411 int new_topology;
2412
Mathieu Poirierc22645f2019-07-19 15:59:53 +02002413 lockdep_assert_held(&sched_domains_mutex);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002414
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002415 /* Let the architecture update CPU core mappings: */
2416 new_topology = arch_update_cpu_topology();
Beata Michalskac744dc42021-06-03 15:06:26 +01002417 /* Trigger rebuilding CPU capacity asymmetry data */
2418 if (new_topology)
2419 asym_cpu_capacity_scan();
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002420
Peter Zijlstra09e0dd82017-08-08 12:16:24 +02002421 if (!doms_new) {
2422 WARN_ON_ONCE(dattr_new);
2423 n = 0;
2424 doms_new = alloc_sched_domains(1);
2425 if (doms_new) {
2426 n = 1;
Frederic Weisbeckeredb93822017-10-27 04:42:37 +02002427 cpumask_and(doms_new[0], cpu_active_mask,
2428 housekeeping_cpumask(HK_FLAG_DOMAIN));
Peter Zijlstra09e0dd82017-08-08 12:16:24 +02002429 }
2430 } else {
2431 n = ndoms_new;
2432 }
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002433
2434 /* Destroy deleted domains: */
2435 for (i = 0; i < ndoms_cur; i++) {
2436 for (j = 0; j < n && !new_topology; j++) {
Quentin Perret6aa140f2018-12-03 09:56:18 +00002437 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
Mathieu Poirierf9a25f72019-07-19 15:59:55 +02002438 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2439 struct root_domain *rd;
2440
2441 /*
2442 * This domain won't be destroyed and as such
2443 * its dl_bw->total_bw needs to be cleared. It
2444 * will be recomputed in function
2445 * update_tasks_root_domain().
2446 */
2447 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2448 dl_clear_root_domain(rd);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002449 goto match1;
Mathieu Poirierf9a25f72019-07-19 15:59:55 +02002450 }
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002451 }
2452 /* No match - a current sched domain not in new doms_new[] */
2453 detach_destroy_domains(doms_cur[i]);
2454match1:
2455 ;
2456 }
2457
2458 n = ndoms_cur;
Peter Zijlstra09e0dd82017-08-08 12:16:24 +02002459 if (!doms_new) {
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002460 n = 0;
2461 doms_new = &fallback_doms;
Frederic Weisbeckeredb93822017-10-27 04:42:37 +02002462 cpumask_and(doms_new[0], cpu_active_mask,
2463 housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002464 }
2465
2466 /* Build new domains: */
2467 for (i = 0; i < ndoms_new; i++) {
2468 for (j = 0; j < n && !new_topology; j++) {
Quentin Perret6aa140f2018-12-03 09:56:18 +00002469 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2470 dattrs_equal(dattr_new, i, dattr_cur, j))
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002471 goto match2;
2472 }
2473 /* No match - add a new doms_new */
2474 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2475match2:
2476 ;
2477 }
2478
Quentin Perret531b5c92018-12-03 09:56:21 +00002479#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Quentin Perret6aa140f2018-12-03 09:56:18 +00002480 /* Build perf. domains: */
2481 for (i = 0; i < ndoms_new; i++) {
Quentin Perret531b5c92018-12-03 09:56:21 +00002482 for (j = 0; j < n && !sched_energy_update; j++) {
Quentin Perret6aa140f2018-12-03 09:56:18 +00002483 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
Quentin Perret1f74de82018-12-03 09:56:22 +00002484 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2485 has_eas = true;
Quentin Perret6aa140f2018-12-03 09:56:18 +00002486 goto match3;
Quentin Perret1f74de82018-12-03 09:56:22 +00002487 }
Quentin Perret6aa140f2018-12-03 09:56:18 +00002488 }
2489 /* No match - add perf. domains for a new rd */
Quentin Perret1f74de82018-12-03 09:56:22 +00002490 has_eas |= build_perf_domains(doms_new[i]);
Quentin Perret6aa140f2018-12-03 09:56:18 +00002491match3:
2492 ;
2493 }
Quentin Perret1f74de82018-12-03 09:56:22 +00002494 sched_energy_set(has_eas);
Quentin Perret6aa140f2018-12-03 09:56:18 +00002495#endif
2496
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002497 /* Remember the new sched domains: */
2498 if (doms_cur != &fallback_doms)
2499 free_sched_domains(doms_cur, ndoms_cur);
2500
2501 kfree(dattr_cur);
2502 doms_cur = doms_new;
2503 dattr_cur = dattr_new;
2504 ndoms_cur = ndoms_new;
2505
Peter Zijlstra3b87f132021-03-25 11:31:20 +01002506 update_sched_domain_debugfs();
Mathieu Poirierc22645f2019-07-19 15:59:53 +02002507}
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002508
Mathieu Poirierc22645f2019-07-19 15:59:53 +02002509/*
2510 * Call with hotplug lock held
2511 */
2512void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2513 struct sched_domain_attr *dattr_new)
2514{
2515 mutex_lock(&sched_domains_mutex);
2516 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
Ingo Molnarf2cb1362017-02-01 13:10:18 +01002517 mutex_unlock(&sched_domains_mutex);
2518}