Blame - kernel/sched/topology.c - SHIFTPHONES/mainline/linux

blob: d201a7052a299fbc231f79ad5d98ebde3073bbe4 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2	/*
				3	* Scheduler topology setup/handling methods
				4	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	5	#include "sched.h"
				6
				7	DEFINE_MUTEX(sched_domains_mutex);
				8
				9	/* Protected by sched_domains_mutex: */
zhong jiang	ace8031	2018-08-03 20:37:32 +0800	[diff] [blame]	10	static cpumask_var_t sched_domains_tmpmask;
				11	static cpumask_var_t sched_domains_tmpmask2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	12
				13	#ifdef CONFIG_SCHED_DEBUG
				14
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	15	static int __init sched_debug_setup(char *str)
				16	{
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	17	sched_debug_verbose = true;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	18
				19	return 0;
				20	}
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	21	early_param("sched_verbose", sched_debug_setup);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	22
				23	static inline bool sched_debug(void)
				24	{
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	25	return sched_debug_verbose;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	26	}
				27
Valentin Schneider	848785d	2020-09-08 19:49:56 +0100	[diff] [blame]	28	#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
				29	const struct sd_flag_debug sd_flag_debug[] = {
				30	#include <linux/sched/sd_flags.h>
				31	};
				32	#undef SD_FLAG
				33
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	34	static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				35	struct cpumask *groupmask)
				36	{
				37	struct sched_group *group = sd->groups;
Valentin Schneider	65c5e25	2020-08-17 12:29:51 +0100	[diff] [blame]	38	unsigned long flags = sd->flags;
				39	unsigned int idx;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	40
				41	cpumask_clear(groupmask);
				42
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	43	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	44	printk(KERN_CONT "span=%*pbl level=%s\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	45	cpumask_pr_args(sched_domain_span(sd)), sd->name);
				46
				47	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	48	printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	49	}
Yi Wang	6cd0c58	2018-07-23 12:19:07 +0800	[diff] [blame]	50	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	51	printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	52	}
				53
Valentin Schneider	65c5e25	2020-08-17 12:29:51 +0100	[diff] [blame]	54	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
				55	unsigned int flag = BIT(idx);
				56	unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
				57
				58	if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
				59	!(sd->child->flags & flag))
				60	printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
				61	sd_flag_debug[idx].name);
				62
				63	if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
				64	!(sd->parent->flags & flag))
				65	printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
				66	sd_flag_debug[idx].name);
				67	}
				68
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	69	printk(KERN_DEBUG "%*s groups:", level + 1, "");
				70	do {
				71	if (!group) {
				72	printk("\n");
				73	printk(KERN_ERR "ERROR: group is NULL\n");
				74	break;
				75	}
				76
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	77	if (!cpumask_weight(sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	78	printk(KERN_CONT "\n");
				79	printk(KERN_ERR "ERROR: empty group\n");
				80	break;
				81	}
				82
				83	if (!(sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	84	cpumask_intersects(groupmask, sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	85	printk(KERN_CONT "\n");
				86	printk(KERN_ERR "ERROR: repeated CPUs\n");
				87	break;
				88	}
				89
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	90	cpumask_or(groupmask, groupmask, sched_group_span(group));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	91
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	92	printk(KERN_CONT " %d:{ span=%*pbl",
				93	group->sgc->id,
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	94	cpumask_pr_args(sched_group_span(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	95
Peter Zijlstra	af21812	2017-05-01 08:51:05 +0200	[diff] [blame]	96	if ((sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	97	!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	98	printk(KERN_CONT " mask=%*pbl",
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	99	cpumask_pr_args(group_balance_mask(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	100	}
				101
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	102	if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
				103	printk(KERN_CONT " cap=%lu", group->sgc->capacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	104
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	105	if (group == sd->groups && sd->child &&
				106	!cpumask_equal(sched_domain_span(sd->child),
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	107	sched_group_span(group))) {
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	108	printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
				109	}
				110
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	111	printk(KERN_CONT " }");
				112
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	113	group = group->next;
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	114
				115	if (group != sd->groups)
				116	printk(KERN_CONT ",");
				117
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	118	} while (group != sd->groups);
				119	printk(KERN_CONT "\n");
				120
				121	if (!cpumask_equal(sched_domain_span(sd), groupmask))
				122	printk(KERN_ERR "ERROR: groups don't span domain->span\n");
				123
				124	if (sd->parent &&
				125	!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	126	printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	127	return 0;
				128	}
				129
				130	static void sched_domain_debug(struct sched_domain *sd, int cpu)
				131	{
				132	int level = 0;
				133
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	134	if (!sched_debug_verbose)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	135	return;
				136
				137	if (!sd) {
				138	printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
				139	return;
				140	}
				141
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	142	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	143
				144	for (;;) {
				145	if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
				146	break;
				147	level++;
				148	sd = sd->parent;
				149	if (!sd)
				150	break;
				151	}
				152	}
				153	#else /* !CONFIG_SCHED_DEBUG */
				154
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	155	# define sched_debug_verbose 0
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	156	# define sched_domain_debug(sd, cpu) do { } while (0)
				157	static inline bool sched_debug(void)
				158	{
				159	return false;
				160	}
				161	#endif /* CONFIG_SCHED_DEBUG */
				162
Valentin Schneider	4fc472f	2020-08-25 14:32:16 +0100	[diff] [blame]	163	/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
				164	#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) \|
				165	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
				166	#include <linux/sched/sd_flags.h>
				167	0;
				168	#undef SD_FLAG
				169
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	170	static int sd_degenerate(struct sched_domain *sd)
				171	{
				172	if (cpumask_weight(sched_domain_span(sd)) == 1)
				173	return 1;
				174
				175	/* Following flags need at least 2 groups */
Valentin Schneider	6f34981	2020-08-17 12:29:54 +0100	[diff] [blame]	176	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
				177	(sd->groups != sd->groups->next))
				178	return 0;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	179
				180	/* Following flags don't use groups */
				181	if (sd->flags & (SD_WAKE_AFFINE))
				182	return 0;
				183
				184	return 1;
				185	}
				186
				187	static int
				188	sd_parent_degenerate(struct sched_domain sd, struct sched_domain parent)
				189	{
				190	unsigned long cflags = sd->flags, pflags = parent->flags;
				191
				192	if (sd_degenerate(parent))
				193	return 1;
				194
				195	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
				196	return 0;
				197
				198	/* Flags needing groups don't count if only 1 group in parent */
Valentin Schneider	ab65afb	2020-08-17 12:29:55 +0100	[diff] [blame]	199	if (parent->groups == parent->groups->next)
Valentin Schneider	3a6712c	2020-08-17 12:29:57 +0100	[diff] [blame]	200	pflags &= ~SD_DEGENERATE_GROUPS_MASK;
Valentin Schneider	ab65afb	2020-08-17 12:29:55 +0100	[diff] [blame]	201
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	202	if (~cflags & pflags)
				203	return 0;
				204
				205	return 1;
				206	}
				207
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	208	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Peter Zijlstra	f8a696f	2018-12-05 11:23:56 +0100	[diff] [blame]	209	DEFINE_STATIC_KEY_FALSE(sched_energy_present);
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	210	unsigned int sysctl_sched_energy_aware = 1;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	211	DEFINE_MUTEX(sched_energy_mutex);
				212	bool sched_energy_update;
				213
Ionela Voinescu	31f6a8c	2020-10-27 18:07:11 +0000	[diff] [blame]	214	void rebuild_sched_domains_energy(void)
				215	{
				216	mutex_lock(&sched_energy_mutex);
				217	sched_energy_update = true;
				218	rebuild_sched_domains();
				219	sched_energy_update = false;
				220	mutex_unlock(&sched_energy_mutex);
				221	}
				222
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	223	#ifdef CONFIG_PROC_SYSCTL
				224	int sched_energy_aware_handler(struct ctl_table *table, int write,
Christoph Hellwig	3292739	2020-04-24 08:43:38 +0200	[diff] [blame]	225	void buffer, size_t lenp, loff_t *ppos)
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	226	{
				227	int ret, state;
				228
				229	if (write && !capable(CAP_SYS_ADMIN))
				230	return -EPERM;
				231
				232	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				233	if (!ret && write) {
				234	state = static_branch_unlikely(&sched_energy_present);
Ionela Voinescu	31f6a8c	2020-10-27 18:07:11 +0000	[diff] [blame]	235	if (state != sysctl_sched_energy_aware)
				236	rebuild_sched_domains_energy();
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	237	}
				238
				239	return ret;
				240	}
				241	#endif
				242
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	243	static void free_pd(struct perf_domain *pd)
				244	{
				245	struct perf_domain *tmp;
				246
				247	while (pd) {
				248	tmp = pd->next;
				249	kfree(pd);
				250	pd = tmp;
				251	}
				252	}
				253
				254	static struct perf_domain find_pd(struct perf_domain pd, int cpu)
				255	{
				256	while (pd) {
				257	if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
				258	return pd;
				259	pd = pd->next;
				260	}
				261
				262	return NULL;
				263	}
				264
				265	static struct perf_domain *pd_init(int cpu)
				266	{
				267	struct em_perf_domain *obj = em_cpu_get(cpu);
				268	struct perf_domain *pd;
				269
				270	if (!obj) {
				271	if (sched_debug())
				272	pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
				273	return NULL;
				274	}
				275
				276	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
				277	if (!pd)
				278	return NULL;
				279	pd->em_pd = obj;
				280
				281	return pd;
				282	}
				283
				284	static void perf_domain_debug(const struct cpumask *cpu_map,
				285	struct perf_domain *pd)
				286	{
				287	if (!sched_debug() \|\| !pd)
				288	return;
				289
				290	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
				291
				292	while (pd) {
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	293	printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	294	cpumask_first(perf_domain_span(pd)),
				295	cpumask_pr_args(perf_domain_span(pd)),
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	296	em_pd_nr_perf_states(pd->em_pd));
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	297	pd = pd->next;
				298	}
				299
				300	printk(KERN_CONT "\n");
				301	}
				302
				303	static void destroy_perf_domain_rcu(struct rcu_head *rp)
				304	{
				305	struct perf_domain *pd;
				306
				307	pd = container_of(rp, struct perf_domain, rcu);
				308	free_pd(pd);
				309	}
				310
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	311	static void sched_energy_set(bool has_eas)
				312	{
				313	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
				314	if (sched_debug())
				315	pr_info("%s: stopping EAS\n", __func__);
				316	static_branch_disable_cpuslocked(&sched_energy_present);
				317	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
				318	if (sched_debug())
				319	pr_info("%s: starting EAS\n", __func__);
				320	static_branch_enable_cpuslocked(&sched_energy_present);
				321	}
				322	}
				323
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	324	/*
				325	* EAS can be used on a root domain if it meets all the following conditions:
				326	* 1. an Energy Model (EM) is available;
				327	* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
Valentin Schneider	38502ab	2020-02-27 19:14:32 +0000	[diff] [blame]	328	* 3. no SMT is detected.
				329	* 4. the EM complexity is low enough to keep scheduling overheads low;
				330	* 5. schedutil is driving the frequency of all CPUs of the rd;
Ionela Voinescu	fa50e2b	2020-10-27 18:07:13 +0000	[diff] [blame]	331	* 6. frequency invariance support is present;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	332	*
				333	* The complexity of the Energy Model is defined as:
				334	*
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	335	* C = nr_pd * (nr_cpus + nr_ps)
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	336	*
				337	* with parameters defined as:
				338	* - nr_pd: the number of performance domains
				339	* - nr_cpus: the number of CPUs
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	340	* - nr_ps: the sum of the number of performance states of all performance
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	341	* domains (for example, on a system with 2 performance domains,
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	342	* with 10 performance states each, nr_ps = 2 * 10 = 20).
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	343	*
				344	* It is generally not a good idea to use such a model in the wake-up path on
				345	* very complex platforms because of the associated scheduling overheads. The
				346	* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	347	* with per-CPU DVFS and less than 8 performance states each, for example.
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	348	*/
				349	#define EM_MAX_COMPLEXITY 2048
				350
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	351	extern struct cpufreq_governor schedutil_gov;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	352	static bool build_perf_domains(const struct cpumask *cpu_map)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	353	{
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	354	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	355	struct perf_domain pd = NULL, tmp;
				356	int cpu = cpumask_first(cpu_map);
				357	struct root_domain *rd = cpu_rq(cpu)->rd;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	358	struct cpufreq_policy *policy;
				359	struct cpufreq_governor *gov;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	360
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	361	if (!sysctl_sched_energy_aware)
				362	goto free;
				363
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	364	/* EAS is enabled for asymmetric CPU capacity topologies. */
				365	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
				366	if (sched_debug()) {
				367	pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				368	cpumask_pr_args(cpu_map));
				369	}
				370	goto free;
				371	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	372
Valentin Schneider	38502ab	2020-02-27 19:14:32 +0000	[diff] [blame]	373	/* EAS definitely does not handle SMT */
				374	if (sched_smt_active()) {
				375	pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
				376	cpumask_pr_args(cpu_map));
				377	goto free;
				378	}
				379
Ionela Voinescu	fa50e2b	2020-10-27 18:07:13 +0000	[diff] [blame]	380	if (!arch_scale_freq_invariant()) {
				381	if (sched_debug()) {
				382	pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
				383	cpumask_pr_args(cpu_map));
				384	}
				385	goto free;
				386	}
				387
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	388	for_each_cpu(i, cpu_map) {
				389	/* Skip already covered CPUs. */
				390	if (find_pd(pd, i))
				391	continue;
				392
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	393	/* Do not attempt EAS if schedutil is not being used. */
				394	policy = cpufreq_cpu_get(i);
				395	if (!policy)
				396	goto free;
				397	gov = policy->governor;
				398	cpufreq_cpu_put(policy);
				399	if (gov != &schedutil_gov) {
				400	if (rd->pd)
				401	pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
				402	cpumask_pr_args(cpu_map));
				403	goto free;
				404	}
				405
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	406	/* Create the new pd and add it to the local list. */
				407	tmp = pd_init(i);
				408	if (!tmp)
				409	goto free;
				410	tmp->next = pd;
				411	pd = tmp;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	412
				413	/*
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	414	* Count performance domains and performance states for the
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	415	* complexity check.
				416	*/
				417	nr_pd++;
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	418	nr_ps += em_pd_nr_perf_states(pd->em_pd);
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	419	}
				420
				421	/* Bail out if the Energy Model complexity is too high. */
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	422	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	423	WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
				424	cpumask_pr_args(cpu_map));
				425	goto free;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	426	}
				427
				428	perf_domain_debug(cpu_map, pd);
				429
				430	/* Attach the new list of performance domains to the root domain. */
				431	tmp = rd->pd;
				432	rcu_assign_pointer(rd->pd, pd);
				433	if (tmp)
				434	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				435
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	436	return !!pd;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	437
				438	free:
				439	free_pd(pd);
				440	tmp = rd->pd;
				441	rcu_assign_pointer(rd->pd, NULL);
				442	if (tmp)
				443	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	444
				445	return false;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	446	}
				447	#else
				448	static void free_pd(struct perf_domain *pd) { }
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	449	#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	450
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	451	static void free_rootdomain(struct rcu_head *rcu)
				452	{
				453	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
				454
				455	cpupri_cleanup(&rd->cpupri);
				456	cpudl_cleanup(&rd->cpudl);
				457	free_cpumask_var(rd->dlo_mask);
				458	free_cpumask_var(rd->rto_mask);
				459	free_cpumask_var(rd->online);
				460	free_cpumask_var(rd->span);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	461	free_pd(rd->pd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	462	kfree(rd);
				463	}
				464
				465	void rq_attach_root(struct rq rq, struct root_domain rd)
				466	{
				467	struct root_domain *old_rd = NULL;
				468	unsigned long flags;
				469
Peter Zijlstra	5cb9eaa	2020-11-17 18:19:31 -0500	[diff] [blame]	470	raw_spin_rq_lock_irqsave(rq, flags);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	471
				472	if (rq->rd) {
				473	old_rd = rq->rd;
				474
				475	if (cpumask_test_cpu(rq->cpu, old_rd->online))
				476	set_rq_offline(rq);
				477
				478	cpumask_clear_cpu(rq->cpu, old_rd->span);
				479
				480	/*
				481	* If we dont want to free the old_rd yet then
				482	* set old_rd to NULL to skip the freeing later
				483	* in this function:
				484	*/
				485	if (!atomic_dec_and_test(&old_rd->refcount))
				486	old_rd = NULL;
				487	}
				488
				489	atomic_inc(&rd->refcount);
				490	rq->rd = rd;
				491
				492	cpumask_set_cpu(rq->cpu, rd->span);
				493	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
				494	set_rq_online(rq);
				495
Peter Zijlstra	5cb9eaa	2020-11-17 18:19:31 -0500	[diff] [blame]	496	raw_spin_rq_unlock_irqrestore(rq, flags);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	497
				498	if (old_rd)
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	499	call_rcu(&old_rd->rcu, free_rootdomain);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	500	}
				501
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	502	void sched_get_rd(struct root_domain *rd)
				503	{
				504	atomic_inc(&rd->refcount);
				505	}
				506
				507	void sched_put_rd(struct root_domain *rd)
				508	{
				509	if (!atomic_dec_and_test(&rd->refcount))
				510	return;
				511
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	512	call_rcu(&rd->rcu, free_rootdomain);
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	513	}
				514
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	515	static int init_rootdomain(struct root_domain *rd)
				516	{
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	517	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
				518	goto out;
				519	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
				520	goto free_span;
				521	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
				522	goto free_online;
				523	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
				524	goto free_dlo_mask;
				525
Steven Rostedt (Red Hat)	4bdced5	2017-10-06 14:05:04 -0400	[diff] [blame]	526	#ifdef HAVE_RT_PUSH_IPI
				527	rd->rto_cpu = -1;
				528	raw_spin_lock_init(&rd->rto_lock);
Sebastian Andrzej Siewior	da6ff09	2021-10-06 13:18:49 +0200	[diff] [blame]	529	rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
Steven Rostedt (Red Hat)	4bdced5	2017-10-06 14:05:04 -0400	[diff] [blame]	530	#endif
				531
Peng Liu	2676242	2020-10-08 23:48:46 +0800	[diff] [blame]	532	rd->visit_gen = 0;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	533	init_dl_bw(&rd->dl_bw);
				534	if (cpudl_init(&rd->cpudl) != 0)
				535	goto free_rto_mask;
				536
				537	if (cpupri_init(&rd->cpupri) != 0)
				538	goto free_cpudl;
				539	return 0;
				540
				541	free_cpudl:
				542	cpudl_cleanup(&rd->cpudl);
				543	free_rto_mask:
				544	free_cpumask_var(rd->rto_mask);
				545	free_dlo_mask:
				546	free_cpumask_var(rd->dlo_mask);
				547	free_online:
				548	free_cpumask_var(rd->online);
				549	free_span:
				550	free_cpumask_var(rd->span);
				551	out:
				552	return -ENOMEM;
				553	}
				554
				555	/*
				556	* By default the system creates a single root-domain with all CPUs as
				557	* members (mimicking the global state we have today).
				558	*/
				559	struct root_domain def_root_domain;
				560
				561	void init_defrootdomain(void)
				562	{
				563	init_rootdomain(&def_root_domain);
				564
				565	atomic_set(&def_root_domain.refcount, 1);
				566	}
				567
				568	static struct root_domain *alloc_rootdomain(void)
				569	{
				570	struct root_domain *rd;
				571
Viresh Kumar	4d13a06	2017-04-13 14:45:48 +0530	[diff] [blame]	572	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	573	if (!rd)
				574	return NULL;
				575
				576	if (init_rootdomain(rd) != 0) {
				577	kfree(rd);
				578	return NULL;
				579	}
				580
				581	return rd;
				582	}
				583
				584	static void free_sched_groups(struct sched_group *sg, int free_sgc)
				585	{
				586	struct sched_group tmp, first;
				587
				588	if (!sg)
				589	return;
				590
				591	first = sg;
				592	do {
				593	tmp = sg->next;
				594
				595	if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
				596	kfree(sg->sgc);
				597
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	598	if (atomic_dec_and_test(&sg->ref))
				599	kfree(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	600	sg = tmp;
				601	} while (sg != first);
				602	}
				603
				604	static void destroy_sched_domain(struct sched_domain *sd)
				605	{
				606	/*
Peter Zijlstra	a090c4f	2017-08-21 15:42:52 +0200	[diff] [blame]	607	* A normal sched domain may have multiple group references, an
				608	* overlapping domain, having private groups, only one. Iterate,
				609	* dropping group/capacity references, freeing where none remain.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	610	*/
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	611	free_sched_groups(sd->groups, 1);
				612
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	613	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
				614	kfree(sd->shared);
				615	kfree(sd);
				616	}
				617
				618	static void destroy_sched_domains_rcu(struct rcu_head *rcu)
				619	{
				620	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
				621
				622	while (sd) {
				623	struct sched_domain *parent = sd->parent;
				624	destroy_sched_domain(sd);
				625	sd = parent;
				626	}
				627	}
				628
				629	static void destroy_sched_domains(struct sched_domain *sd)
				630	{
				631	if (sd)
				632	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
				633	}
				634
				635	/*
				636	* Keep a special pointer to the highest sched_domain that has
				637	* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
				638	* allows us to avoid some pointer chasing select_idle_sibling().
				639	*
				640	* Also keep a unique ID per domain (we use the first CPU number in
				641	* the cpumask of the domain), this allows us to quickly tell if
				642	* two CPUs are in the same cache domain, see cpus_share_cache().
				643	*/
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	644	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	645	DEFINE_PER_CPU(int, sd_llc_size);
				646	DEFINE_PER_CPU(int, sd_llc_id);
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	647	DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
				648	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
				649	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
				650	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	651	DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	652
				653	static void update_top_cache_domain(int cpu)
				654	{
				655	struct sched_domain_shared *sds = NULL;
				656	struct sched_domain *sd;
				657	int id = cpu;
				658	int size = 1;
				659
				660	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
				661	if (sd) {
				662	id = cpumask_first(sched_domain_span(sd));
				663	size = cpumask_weight(sched_domain_span(sd));
				664	sds = sd->shared;
				665	}
				666
				667	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
				668	per_cpu(sd_llc_size, cpu) = size;
				669	per_cpu(sd_llc_id, cpu) = id;
				670	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
				671
				672	sd = lowest_flag_domain(cpu, SD_NUMA);
				673	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
				674
				675	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
Quentin Perret	011b27b	2018-12-03 09:56:19 +0000	[diff] [blame]	676	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
				677
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	678	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
Quentin Perret	011b27b	2018-12-03 09:56:19 +0000	[diff] [blame]	679	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	680	}
				681
				682	/*
				683	* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
				684	* hold the hotplug lock.
				685	*/
				686	static void
				687	cpu_attach_domain(struct sched_domain sd, struct root_domain rd, int cpu)
				688	{
				689	struct rq *rq = cpu_rq(cpu);
				690	struct sched_domain *tmp;
				691
				692	/* Remove the sched domains which do not contribute to scheduling. */
				693	for (tmp = sd; tmp; ) {
				694	struct sched_domain *parent = tmp->parent;
				695	if (!parent)
				696	break;
				697
				698	if (sd_parent_degenerate(tmp, parent)) {
				699	tmp->parent = parent->parent;
				700	if (parent->parent)
				701	parent->parent->child = tmp;
				702	/*
				703	* Transfer SD_PREFER_SIBLING down in case of a
				704	* degenerate parent; the spans match for this
				705	* so the property transfers.
				706	*/
				707	if (parent->flags & SD_PREFER_SIBLING)
				708	tmp->flags \|= SD_PREFER_SIBLING;
				709	destroy_sched_domain(parent);
				710	} else
				711	tmp = tmp->parent;
				712	}
				713
				714	if (sd && sd_degenerate(sd)) {
				715	tmp = sd;
				716	sd = sd->parent;
				717	destroy_sched_domain(tmp);
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	718	if (sd) {
				719	struct sched_group *sg = sd->groups;
				720
				721	/*
				722	* sched groups hold the flags of the child sched
				723	* domain for convenience. Clear such flags since
				724	* the child is being destroyed.
				725	*/
				726	do {
				727	sg->flags = 0;
				728	} while (sg != sd->groups);
				729
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	730	sd->child = NULL;
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	731	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	732	}
				733
				734	sched_domain_debug(sd, cpu);
				735
				736	rq_attach_root(rq, rd);
				737	tmp = rq->sd;
				738	rcu_assign_pointer(rq->sd, sd);
Peter Zijlstra	bbdacdf	2017-08-10 17:10:26 +0200	[diff] [blame]	739	dirty_sched_domain_sysctl(cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	740	destroy_sched_domains(tmp);
				741
				742	update_top_cache_domain(cpu);
				743	}
				744
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	745	struct s_data {
Luc Van Oostenryck	99687cd	2019-01-18 15:49:36 +0100	[diff] [blame]	746	struct sched_domain * __percpu *sd;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	747	struct root_domain *rd;
				748	};
				749
				750	enum s_alloc {
				751	sa_rootdomain,
				752	sa_sd,
				753	sa_sd_storage,
				754	sa_none,
				755	};
				756
				757	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	758	* Return the canonical balance CPU for this group, this is the first CPU
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	759	* of this group that's also in the balance mask.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	760	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	761	* The balance mask are all those CPUs that could actually end up at this
				762	* group. See build_balance_mask().
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	763	*
				764	* Also see should_we_balance().
				765	*/
				766	int group_balance_cpu(struct sched_group *sg)
				767	{
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	768	return cpumask_first(group_balance_mask(sg));
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	769	}
				770
				771
				772	/*
				773	* NUMA topology (first read the regular topology blurb below)
				774	*
				775	* Given a node-distance table, for example:
				776	*
				777	* node 0 1 2 3
				778	* 0: 10 20 30 20
				779	* 1: 20 10 20 30
				780	* 2: 30 20 10 20
				781	* 3: 20 30 20 10
				782	*
				783	* which represents a 4 node ring topology like:
				784	*
				785	* 0 ----- 1
				786	* |       |
				787	* |       |
				788	* |       |
				789	* 3 ----- 2
				790	*
				791	* We want to construct domains and groups to represent this. The way we go
				792	* about doing this is to build the domains on 'hops'. For each NUMA level we
				793	* construct the mask of all nodes reachable in @level hops.
				794	*
				795	* For the above NUMA topology that gives 3 levels:
				796	*
				797	* NUMA-2 0-3 0-3 0-3 0-3
				798	* groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
				799	*
				800	* NUMA-1 0-1,3 0-2 1-3 0,2-3
				801	* groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
				802	*
				803	* NUMA-0 0 1 2 3
				804	*
				805	*
				806	* As can be seen; things don't nicely line up as with the regular topology.
				807	* When we iterate a domain in child domain chunks some nodes can be
				808	* represented multiple times -- hence the "overlap" naming for this part of
				809	* the topology.
				810	*
				811	* In order to minimize this overlap, we only build enough groups to cover the
				812	* domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
				813	*
				814	* Because:
				815	*
				816	* - the first group of each domain is its child domain; this
				817	* gets us the first 0-1,3
				818	* - the only uncovered node is 2, whose child domain is 1-3.
				819	*
				820	* However, because of the overlap, computing a unique CPU for each group is
				821	* more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
				822	* groups include the CPUs of Node-0, while those CPUs would not in fact ever
				823	* end up at those groups (they would end up in group: 0-1,3).
				824	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	825	* To correct this we have to introduce the group balance mask. This mask
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	826	* will contain those CPUs in the group that can reach this group given the
				827	* (child) domain tree.
				828	*
				829	* With this we can once again compute balance_cpu and sched_group_capacity
				830	* relations.
				831	*
				832	* XXX include words on how balance_cpu is unique and therefore can be
				833	* used for sched_group_capacity links.
				834	*
				835	*
				836	* Another 'interesting' topology is:
				837	*
				838	* node 0 1 2 3
				839	* 0: 10 20 20 30
				840	* 1: 20 10 20 20
				841	* 2: 20 20 10 20
				842	* 3: 30 20 20 10
				843	*
				844	* Which looks a little like:
				845	*
				846	* 0 ----- 1
				847	* |     / |
				848	* |   /   |
				849	* | /     |
				850	* 2 ----- 3
				851	*
				852	* This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
				853	* are not.
				854	*
				855	* This leads to a few particularly weird cases where the sched_domain's are
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	856	* not of the same number for each CPU. Consider:
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	857	*
				858	* NUMA-2 0-3 0-3
				859	* groups: {0-2},{1-3} {1-3},{0-2}
				860	*
				861	* NUMA-1 0-2 0-3 0-3 1-3
				862	*
				863	* NUMA-0 0 1 2 3
				864	*
				865	*/
				866
				867
				868	/*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	869	* Build the balance mask; it contains only those CPUs that can arrive at this
				870	* group and should be considered to continue balancing.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	871	*
				872	* We do this during the group creation pass, therefore the group information
				873	* isn't complete yet, however since each group represents a (child) domain we
				874	* can fully construct this using the sched_domain bits (which are already
				875	* complete).
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	876	*/
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	877	static void
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	878	build_balance_mask(struct sched_domain sd, struct sched_group sg, struct cpumask *mask)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	879	{
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	880	const struct cpumask *sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	881	struct sd_data *sdd = sd->private;
				882	struct sched_domain *sibling;
				883	int i;
				884
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	885	cpumask_clear(mask);
				886
Lauro Ramos Venancio	f32d782	2017-04-20 16:51:40 -0300	[diff] [blame]	887	for_each_cpu(i, sg_span) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	888	sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	889
				890	/*
				891	* Can happen in the asymmetric case, where these siblings are
				892	* unused. The mask will not be empty because those CPUs that
				893	* do have the top domain _should_ span the domain.
				894	*/
				895	if (!sibling->child)
				896	continue;
				897
				898	/* If we would not end up here, we can't continue from here */
				899	if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	900	continue;
				901
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	902	cpumask_set_cpu(i, mask);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	903	}
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	904
				905	/* We must not have empty masks here */
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	906	WARN_ON_ONCE(cpumask_empty(mask));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	907	}
				908
				909	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	910	* XXX: This creates per-node group entries; since the load-balancer will
				911	* immediately access remote memory to construct this group's load-balance
				912	* statistics having the groups node local is of dubious benefit.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	913	*/
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	914	static struct sched_group *
				915	build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
				916	{
				917	struct sched_group *sg;
				918	struct cpumask *sg_span;
				919
				920	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				921	GFP_KERNEL, cpu_to_node(cpu));
				922
				923	if (!sg)
				924	return NULL;
				925
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	926	sg_span = sched_group_span(sg);
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	927	if (sd->child) {
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	928	cpumask_copy(sg_span, sched_domain_span(sd->child));
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	929	sg->flags = sd->child->flags;
				930	} else {
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	931	cpumask_copy(sg_span, sched_domain_span(sd));
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	932	}
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	933
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	934	atomic_inc(&sg->ref);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	935	return sg;
				936	}
				937
				938	static void init_overlap_sched_group(struct sched_domain *sd,
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	939	struct sched_group *sg)
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	940	{
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	941	struct cpumask *mask = sched_domains_tmpmask2;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	942	struct sd_data *sdd = sd->private;
				943	struct cpumask *sg_span;
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	944	int cpu;
				945
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	946	build_balance_mask(sd, sg, mask);
Barry Song	0a2b65c	2021-03-25 15:31:40 +1300	[diff] [blame]	947	cpu = cpumask_first(mask);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	948
				949	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				950	if (atomic_inc_return(&sg->sgc->ref) == 1)
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	951	cpumask_copy(group_balance_mask(sg), mask);
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	952	else
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	953	WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	954
				955	/*
				956	* Initialize sgc->capacity such that even if we mess up the
				957	* domains and no possible iteration will get us here, we won't
				958	* die on a /0 trap.
				959	*/
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	960	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	961	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
				962	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	963	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	964	}
				965
Barry Song	585b6d2	2021-02-24 16:09:44 +1300	[diff] [blame]	966	static struct sched_domain *
				967	find_descended_sibling(struct sched_domain sd, struct sched_domain sibling)
				968	{
				969	/*
				970	* The proper descendant would be the one whose child won't span out
				971	* of sd
				972	*/
				973	while (sibling->child &&
				974	!cpumask_subset(sched_domain_span(sibling->child),
				975	sched_domain_span(sd)))
				976	sibling = sibling->child;
				977
				978	/*
				979	* As we are referencing sgc across different topology level, we need
				980	* to go down to skip those sched_domains which don't contribute to
				981	* scheduling because they will be degenerated in cpu_attach_domain
				982	*/
				983	while (sibling->child &&
				984	cpumask_equal(sched_domain_span(sibling->child),
				985	sched_domain_span(sibling)))
				986	sibling = sibling->child;
				987
				988	return sibling;
				989	}
				990
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	991	static int
				992	build_overlap_sched_groups(struct sched_domain *sd, int cpu)
				993	{
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	994	struct sched_group first = NULL, last = NULL, *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	995	const struct cpumask *span = sched_domain_span(sd);
				996	struct cpumask *covered = sched_domains_tmpmask;
				997	struct sd_data *sdd = sd->private;
				998	struct sched_domain *sibling;
				999	int i;
				1000
				1001	cpumask_clear(covered);
				1002
Peter Zijlstra	0372dd2	2017-04-14 17:24:02 +0200	[diff] [blame]	1003	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1004	struct cpumask *sg_span;
				1005
				1006	if (cpumask_test_cpu(i, covered))
				1007	continue;
				1008
				1009	sibling = *per_cpu_ptr(sdd->sd, i);
				1010
Lauro Ramos Venancio	c20e1ea	2017-04-20 16:51:42 -0300	[diff] [blame]	1011	/*
				1012	* Asymmetric node setups can result in situations where the
				1013	* domain tree is of unequal depth, make sure to skip domains
				1014	* that already cover the entire range.
				1015	*
				1016	* In that case build_sched_domains() will have terminated the
				1017	* iteration early and our sibling sd spans will be empty.
				1018	* Domains should always include the CPU they're built on, so
				1019	* check that.
				1020	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1021	if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
				1022	continue;
				1023
Barry Song	585b6d2	2021-02-24 16:09:44 +1300	[diff] [blame]	1024	/*
				1025	* Usually we build sched_group by sibling's child sched_domain
				1026	* But for machines whose NUMA diameter are 3 or above, we move
				1027	* to build sched_group by sibling's proper descendant's child
				1028	* domain because sibling's child sched_domain will span out of
				1029	* the sched_domain being built as below.
				1030	*
				1031	* Smallest diameter=3 topology is:
				1032	*
				1033	* node 0 1 2 3
				1034	* 0: 10 20 30 40
				1035	* 1: 20 10 20 30
				1036	* 2: 30 20 10 20
				1037	* 3: 40 30 20 10
				1038	*
				1039	* 0 --- 1 --- 2 --- 3
				1040	*
				1041	* NUMA-3 0-3 N/A N/A 0-3
				1042	* groups: {0-2},{1-3} {1-3},{0-2}
				1043	*
				1044	* NUMA-2 0-2 0-3 0-3 1-3
				1045	* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
				1046	*
				1047	* NUMA-1 0-1 0-2 1-3 2-3
				1048	* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
				1049	*
				1050	* NUMA-0 0 1 2 3
				1051	*
				1052	* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
				1053	* group span isn't a subset of the domain span.
				1054	*/
				1055	if (sibling->child &&
				1056	!cpumask_subset(sched_domain_span(sibling->child), span))
				1057	sibling = find_descended_sibling(sd, sibling);
				1058
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	1059	sg = build_group_from_child_sched_domain(sibling, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1060	if (!sg)
				1061	goto fail;
				1062
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1063	sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1064	cpumask_or(covered, covered, sg_span);
				1065
Barry Song	585b6d2	2021-02-24 16:09:44 +1300	[diff] [blame]	1066	init_overlap_sched_group(sibling, sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1067
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1068	if (!first)
				1069	first = sg;
				1070	if (last)
				1071	last->next = sg;
				1072	last = sg;
				1073	last->next = first;
				1074	}
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	1075	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1076
				1077	return 0;
				1078
				1079	fail:
				1080	free_sched_groups(first, 0);
				1081
				1082	return -ENOMEM;
				1083	}
				1084
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	1085
				1086	/*
				1087	* Package topology (also see the load-balance blurb in fair.c)
				1088	*
				1089	* The scheduler builds a tree structure to represent a number of important
				1090	* topology features. By default (default_topology[]) these include:
				1091	*
				1092	* - Simultaneous multithreading (SMT)
				1093	* - Multi-Core Cache (MC)
				1094	* - Package (DIE)
				1095	*
				1096	* Where the last one more or less denotes everything up to a NUMA node.
				1097	*
				1098	* The tree consists of 3 primary data structures:
				1099	*
				1100	* sched_domain -> sched_group -> sched_group_capacity
				1101	* ^ ^ ^ ^
				1102	* `-' `-'
				1103	*
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1104	* The sched_domains are per-CPU and have a two way link (parent & child) and
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	1105	* denote the ever growing mask of CPUs belonging to that level of topology.
				1106	*
				1107	* Each sched_domain has a circular (double) linked list of sched_group's, each
				1108	* denoting the domains of the level below (or individual CPUs in case of the
				1109	* first domain level). The sched_group linked by a sched_domain includes the
				1110	* CPU of that sched_domain [*].
				1111	*
				1112	* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
				1113	*
				1114	* CPU 0 1 2 3 4 5 6 7
				1115	*
				1116	* DIE [ ]
				1117	* MC [ ] [ ]
				1118	* SMT [ ] [ ] [ ] [ ]
				1119	*
				1120	* - or -
				1121	*
				1122	* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
				1123	* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
				1124	* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
				1125	*
				1126	* CPU 0 1 2 3 4 5 6 7
				1127	*
				1128	* One way to think about it is: sched_domain moves you up and down among these
				1129	* topology levels, while sched_group moves you sideways through it, at child
				1130	* domain granularity.
				1131	*
				1132	* sched_group_capacity ensures each unique sched_group has shared storage.
				1133	*
				1134	* There are two related construction problems, both require a CPU that
				1135	* uniquely identify each group (for a given domain):
				1136	*
				1137	* - The first is the balance_cpu (see should_we_balance() and the
				1138	* load-balance blurb in fair.c); for each group we only want 1 CPU to
				1139	* continue balancing at a higher domain.
				1140	*
				1141	* - The second is the sched_group_capacity; we want all identical groups
				1142	* to share a single sched_group_capacity.
				1143	*
				1144	* Since these topologies are exclusive by construction. That is, it's
				1145	* impossible for an SMT thread to belong to multiple cores, and cores to
				1146	* be part of multiple caches. There is a very clear and unique location
				1147	* for each CPU in the hierarchy.
				1148	*
				1149	* Therefore computing a unique CPU for each group is trivial (the iteration
				1150	* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
				1151	* group), we can simply pick the first CPU in each group.
				1152	*
				1153	*
				1154	* [*] in other words, the first group of each domain is its child domain.
				1155	*/
				1156
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1157	static struct sched_group get_group(int cpu, struct sd_data sdd)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1158	{
				1159	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1160	struct sched_domain *child = sd->child;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1161	struct sched_group *sg;
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1162	bool already_visited;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1163
				1164	if (child)
				1165	cpu = cpumask_first(sched_domain_span(child));
				1166
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1167	sg = *per_cpu_ptr(sdd->sg, cpu);
				1168	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1169
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1170	/* Increase refcounts for claim_allocations: */
				1171	already_visited = atomic_inc_return(&sg->ref) > 1;
				1172	/* sgc visits should follow a similar trend as sg */
				1173	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
				1174
				1175	/* If we have already visited that group, it's already initialized. */
				1176	if (already_visited)
				1177	return sg;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1178
				1179	if (child) {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1180	cpumask_copy(sched_group_span(sg), sched_domain_span(child));
				1181	cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
Ricardo Neri	16d364b	2021-09-10 18:18:15 -0700	[diff] [blame]	1182	sg->flags = child->flags;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1183	} else {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1184	cpumask_set_cpu(cpu, sched_group_span(sg));
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	1185	cpumask_set_cpu(cpu, group_balance_mask(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1186	}
				1187
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1188	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1189	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	1190	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1191
				1192	return sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1193	}
				1194
				1195	/*
				1196	* build_sched_groups will build a circular linked list of the groups
Valentin Schneider	d874323	2019-04-09 18:35:46 +0100	[diff] [blame]	1197	* covered by the given span, will set each group's ->cpumask correctly,
				1198	* and will initialize their ->sgc.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1199	*
				1200	* Assumes the sched_domain tree is fully constructed
				1201	*/
				1202	static int
				1203	build_sched_groups(struct sched_domain *sd, int cpu)
				1204	{
				1205	struct sched_group first = NULL, last = NULL;
				1206	struct sd_data *sdd = sd->private;
				1207	const struct cpumask *span = sched_domain_span(sd);
				1208	struct cpumask *covered;
				1209	int i;
				1210
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1211	lockdep_assert_held(&sched_domains_mutex);
				1212	covered = sched_domains_tmpmask;
				1213
				1214	cpumask_clear(covered);
				1215
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1216	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1217	struct sched_group *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1218
				1219	if (cpumask_test_cpu(i, covered))
				1220	continue;
				1221
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1222	sg = get_group(i, sdd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1223
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1224	cpumask_or(covered, covered, sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1225
				1226	if (!first)
				1227	first = sg;
				1228	if (last)
				1229	last->next = sg;
				1230	last = sg;
				1231	}
				1232	last->next = first;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1233	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1234
				1235	return 0;
				1236	}
				1237
				1238	/*
				1239	* Initialize sched groups cpu_capacity.
				1240	*
				1241	* cpu_capacity indicates the capacity of sched group, which is used while
				1242	* distributing the load between different sched groups in a sched domain.
				1243	* Typically cpu_capacity for all the groups in a sched domain will be same
				1244	* unless there are asymmetries in the topology. If there are asymmetries,
				1245	* group having more cpu_capacity will pickup more load compared to the
				1246	* group having less cpu_capacity.
				1247	*/
				1248	static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
				1249	{
				1250	struct sched_group *sg = sd->groups;
				1251
				1252	WARN_ON(!sg);
				1253
				1254	do {
				1255	int cpu, max_cpu = -1;
				1256
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1257	sg->group_weight = cpumask_weight(sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1258
				1259	if (!(sd->flags & SD_ASYM_PACKING))
				1260	goto next;
				1261
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1262	for_each_cpu(cpu, sched_group_span(sg)) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1263	if (max_cpu < 0)
				1264	max_cpu = cpu;
				1265	else if (sched_asym_prefer(cpu, max_cpu))
				1266	max_cpu = cpu;
				1267	}
				1268	sg->asym_prefer_cpu = max_cpu;
				1269
				1270	next:
				1271	sg = sg->next;
				1272	} while (sg != sd->groups);
				1273
				1274	if (cpu != group_balance_cpu(sg))
				1275	return;
				1276
				1277	update_group_capacity(sd, cpu);
				1278	}
				1279
				1280	/*
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	1281	* Asymmetric CPU capacity bits
				1282	*/
				1283	struct asym_cap_data {
				1284	struct list_head link;
				1285	unsigned long capacity;
				1286	unsigned long cpus[];
				1287	};
				1288
				1289	/*
				1290	* Set of available CPUs grouped by their corresponding capacities
				1291	* Each list entry contains a CPU mask reflecting CPUs that share the same
				1292	* capacity.
				1293	* The lifespan of data is unlimited.
				1294	*/
				1295	static LIST_HEAD(asym_cap_list);
				1296
				1297	#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
				1298
				1299	/*
				1300	* Verify whether there is any CPU capacity asymmetry in a given sched domain.
				1301	* Provides sd_flags reflecting the asymmetry scope.
				1302	*/
				1303	static inline int
				1304	asym_cpu_capacity_classify(const struct cpumask *sd_span,
				1305	const struct cpumask *cpu_map)
				1306	{
				1307	struct asym_cap_data *entry;
				1308	int count = 0, miss = 0;
				1309
				1310	/*
				1311	* Count how many unique CPU capacities this domain spans across
				1312	* (compare sched_domain CPUs mask with ones representing available
				1313	* CPUs capacities). Take into account CPUs that might be offline:
				1314	* skip those.
				1315	*/
				1316	list_for_each_entry(entry, &asym_cap_list, link) {
				1317	if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
				1318	++count;
				1319	else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
				1320	++miss;
				1321	}
				1322
				1323	WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
				1324
				1325	/* No asymmetry detected */
				1326	if (count < 2)
				1327	return 0;
				1328	/* Some of the available CPU capacity values have not been detected */
				1329	if (miss)
				1330	return SD_ASYM_CPUCAPACITY;
				1331
				1332	/* Full asymmetry */
				1333	return SD_ASYM_CPUCAPACITY \| SD_ASYM_CPUCAPACITY_FULL;
				1334
				1335	}
				1336
				1337	static inline void asym_cpu_capacity_update_data(int cpu)
				1338	{
				1339	unsigned long capacity = arch_scale_cpu_capacity(cpu);
				1340	struct asym_cap_data *entry = NULL;
				1341
				1342	list_for_each_entry(entry, &asym_cap_list, link) {
				1343	if (capacity == entry->capacity)
				1344	goto done;
				1345	}
				1346
				1347	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
				1348	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
				1349	return;
				1350	entry->capacity = capacity;
				1351	list_add(&entry->link, &asym_cap_list);
				1352	done:
				1353	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
				1354	}
				1355
				1356	/*
				1357	* Build-up/update list of CPUs grouped by their capacities
				1358	* An update requires explicit request to rebuild sched domains
				1359	* with state indicating CPU topology changes.
				1360	*/
				1361	static void asym_cpu_capacity_scan(void)
				1362	{
				1363	struct asym_cap_data entry, next;
				1364	int cpu;
				1365
				1366	list_for_each_entry(entry, &asym_cap_list, link)
				1367	cpumask_clear(cpu_capacity_span(entry));
				1368
				1369	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
				1370	asym_cpu_capacity_update_data(cpu);
				1371
				1372	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
				1373	if (cpumask_empty(cpu_capacity_span(entry))) {
				1374	list_del(&entry->link);
				1375	kfree(entry);
				1376	}
				1377	}
				1378
				1379	/*
				1380	* Only one capacity value has been detected i.e. this system is symmetric.
				1381	* No need to keep this data around.
				1382	*/
				1383	if (list_is_singular(&asym_cap_list)) {
				1384	entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
				1385	list_del(&entry->link);
				1386	kfree(entry);
				1387	}
				1388	}
				1389
				1390	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1391	* Initializers for schedule domains
				1392	* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
				1393	*/
				1394
				1395	static int default_relax_domain_level = -1;
				1396	int sched_domain_level_max;
				1397
				1398	static int __init setup_relax_domain_level(char *str)
				1399	{
				1400	if (kstrtoint(str, 0, &default_relax_domain_level))
				1401	pr_warn("Unable to set relax_domain_level\n");
				1402
				1403	return 1;
				1404	}
				1405	__setup("relax_domain_level=", setup_relax_domain_level);
				1406
				1407	static void set_domain_attribute(struct sched_domain *sd,
				1408	struct sched_domain_attr *attr)
				1409	{
				1410	int request;
				1411
				1412	if (!attr \|\| attr->relax_domain_level < 0) {
				1413	if (default_relax_domain_level < 0)
				1414	return;
Valentin Schneider	9ae7ab2	2019-10-14 17:44:08 +0100	[diff] [blame]	1415	request = default_relax_domain_level;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1416	} else
				1417	request = attr->relax_domain_level;
Valentin Schneider	9ae7ab2	2019-10-14 17:44:08 +0100	[diff] [blame]	1418
				1419	if (sd->level > request) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1420	/* Turn off idle balance on this domain: */
				1421	sd->flags &= ~(SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1422	}
				1423	}
				1424
				1425	static void __sdt_free(const struct cpumask *cpu_map);
				1426	static int __sdt_alloc(const struct cpumask *cpu_map);
				1427
				1428	static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				1429	const struct cpumask *cpu_map)
				1430	{
				1431	switch (what) {
				1432	case sa_rootdomain:
				1433	if (!atomic_read(&d->rd->refcount))
				1434	free_rootdomain(&d->rd->rcu);
Gustavo A. R. Silva	df561f66	2020-08-23 17:36:59 -0500	[diff] [blame]	1435	fallthrough;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1436	case sa_sd:
				1437	free_percpu(d->sd);
Gustavo A. R. Silva	df561f66	2020-08-23 17:36:59 -0500	[diff] [blame]	1438	fallthrough;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1439	case sa_sd_storage:
				1440	__sdt_free(cpu_map);
Gustavo A. R. Silva	df561f66	2020-08-23 17:36:59 -0500	[diff] [blame]	1441	fallthrough;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1442	case sa_none:
				1443	break;
				1444	}
				1445	}
				1446
				1447	static enum s_alloc
				1448	__visit_domain_allocation_hell(struct s_data d, const struct cpumask cpu_map)
				1449	{
				1450	memset(d, 0, sizeof(*d));
				1451
				1452	if (__sdt_alloc(cpu_map))
				1453	return sa_sd_storage;
				1454	d->sd = alloc_percpu(struct sched_domain *);
				1455	if (!d->sd)
				1456	return sa_sd_storage;
				1457	d->rd = alloc_rootdomain();
				1458	if (!d->rd)
				1459	return sa_sd;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1460
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1461	return sa_rootdomain;
				1462	}
				1463
				1464	/*
				1465	* NULL the sd_data elements we've used to build the sched_domain and
				1466	* sched_group structure so that the subsequent __free_domain_allocs()
				1467	* will not free the data we're using.
				1468	*/
				1469	static void claim_allocations(int cpu, struct sched_domain *sd)
				1470	{
				1471	struct sd_data *sdd = sd->private;
				1472
				1473	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
				1474	*per_cpu_ptr(sdd->sd, cpu) = NULL;
				1475
				1476	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
				1477	*per_cpu_ptr(sdd->sds, cpu) = NULL;
				1478
				1479	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
				1480	*per_cpu_ptr(sdd->sg, cpu) = NULL;
				1481
				1482	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
				1483	*per_cpu_ptr(sdd->sgc, cpu) = NULL;
				1484	}
				1485
				1486	#ifdef CONFIG_NUMA
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1487	enum numa_topology_type sched_numa_topology_type;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1488
				1489	static int sched_domains_numa_levels;
				1490	static int sched_domains_curr_level;
				1491
				1492	int sched_max_numa_distance;
				1493	static int *sched_domains_numa_distance;
				1494	static struct cpumask ***sched_domains_numa_masks;
Valentin Schneider	0083242	2021-08-18 13:13:33 +0530	[diff] [blame]	1495
				1496	static unsigned long __read_mostly *sched_numa_onlined_nodes;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1497	#endif
				1498
				1499	/*
				1500	* SD_flags allowed in topology descriptions.
				1501	*
				1502	* These flags are purely descriptive of the topology and do not prescribe
				1503	* behaviour. Behaviour is artificial and mapped in the below sd_init()
				1504	* function:
				1505	*
				1506	* SD_SHARE_CPUCAPACITY - describes SMT topologies
				1507	* SD_SHARE_PKG_RESOURCES - describes shared caches
				1508	* SD_NUMA - describes NUMA topologies
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1509	*
				1510	* Odd one out, which beside describing the topology has a quirk also
				1511	* prescribes the desired behaviour that goes along with it:
				1512	*
				1513	* SD_ASYM_PACKING - describes SMT quirks
				1514	*/
				1515	#define TOPOLOGY_SD_FLAGS \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1516	(SD_SHARE_CPUCAPACITY \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1517	SD_SHARE_PKG_RESOURCES \| \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1518	SD_NUMA \| \
Valentin Schneider	cfe7ddc	2020-08-17 12:29:47 +0100	[diff] [blame]	1519	SD_ASYM_PACKING)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1520
				1521	static struct sched_domain *
				1522	sd_init(struct sched_domain_topology_level *tl,
				1523	const struct cpumask *cpu_map,
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	1524	struct sched_domain *child, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1525	{
				1526	struct sd_data *sdd = &tl->data;
				1527	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1528	int sd_id, sd_weight, sd_flags = 0;
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	1529	struct cpumask *sd_span;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1530
				1531	#ifdef CONFIG_NUMA
				1532	/*
				1533	* Ugly hack to pass state to sd_numa_mask()...
				1534	*/
				1535	sched_domains_curr_level = tl->numa_level;
				1536	#endif
				1537
				1538	sd_weight = cpumask_weight(tl->mask(cpu));
				1539
				1540	if (tl->sd_flags)
				1541	sd_flags = (*tl->sd_flags)();
				1542	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
				1543	"wrong sd_flags in topology description\n"))
Peng Liu	9b1b234	2020-06-09 23:09:36 +0800	[diff] [blame]	1544	sd_flags &= TOPOLOGY_SD_FLAGS;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1545
				1546	*sd = (struct sched_domain){
				1547	.min_interval = sd_weight,
				1548	.max_interval = 2*sd_weight,
Vincent Guittot	6e74991	2020-09-21 09:24:24 +0200	[diff] [blame]	1549	.busy_factor = 16,
Vincent Guittot	2208cda	2020-09-21 09:24:22 +0200	[diff] [blame]	1550	.imbalance_pct = 117,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1551
				1552	.cache_nice_tries = 0,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1553
Valentin Schneider	36c5bdc	2020-04-15 22:05:07 +0100	[diff] [blame]	1554	.flags = 1*SD_BALANCE_NEWIDLE
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1555	\| 1*SD_BALANCE_EXEC
				1556	\| 1*SD_BALANCE_FORK
				1557	\| 0*SD_BALANCE_WAKE
				1558	\| 1*SD_WAKE_AFFINE
				1559	\| 0*SD_SHARE_CPUCAPACITY
				1560	\| 0*SD_SHARE_PKG_RESOURCES
				1561	\| 0*SD_SERIALIZE
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1562	\| 1*SD_PREFER_SIBLING
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1563	\| 0*SD_NUMA
				1564	\| sd_flags
				1565	,
				1566
				1567	.last_balance = jiffies,
				1568	.balance_interval = sd_weight,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1569	.max_newidle_lb_cost = 0,
Vincent Guittot	e60b56e	2021-10-19 14:35:35 +0200	[diff] [blame]	1570	.last_decay_max_lb_cost = jiffies,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1571	.child = child,
				1572	#ifdef CONFIG_SCHED_DEBUG
				1573	.name = tl->name,
				1574	#endif
				1575	};
				1576
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	1577	sd_span = sched_domain_span(sd);
				1578	cpumask_and(sd_span, cpu_map, tl->mask(cpu));
				1579	sd_id = cpumask_first(sd_span);
				1580
				1581	sd->flags \|= asym_cpu_capacity_classify(sd_span, cpu_map);
				1582
				1583	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY \| SD_ASYM_CPUCAPACITY)) ==
				1584	(SD_SHARE_CPUCAPACITY \| SD_ASYM_CPUCAPACITY),
				1585	"CPU capacity asymmetry not supported on SMT\n");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1586
				1587	/*
				1588	* Convert topological properties into behaviour.
				1589	*/
Morten Rasmussen	a526d46	2020-02-06 19:19:55 +0000	[diff] [blame]	1590	/* Don't attempt to spread across CPUs of different capacities. */
				1591	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
				1592	sd->child->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1593
				1594	if (sd->flags & SD_SHARE_CPUCAPACITY) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1595	sd->imbalance_pct = 110;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1596
				1597	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1598	sd->imbalance_pct = 117;
				1599	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1600
				1601	#ifdef CONFIG_NUMA
				1602	} else if (sd->flags & SD_NUMA) {
				1603	sd->cache_nice_tries = 2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1604
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1605	sd->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1606	sd->flags \|= SD_SERIALIZE;
Matt Fleming	a55c745	2019-08-08 20:53:01 +0100	[diff] [blame]	1607	if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1608	sd->flags &= ~(SD_BALANCE_EXEC \|
				1609	SD_BALANCE_FORK \|
				1610	SD_WAKE_AFFINE);
				1611	}
				1612
				1613	#endif
				1614	} else {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1615	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1616	}
				1617
				1618	/*
				1619	* For all levels sharing cache; connect a sched_domain_shared
				1620	* instance.
				1621	*/
				1622	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1623	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
				1624	atomic_inc(&sd->shared->ref);
				1625	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
				1626	}
				1627
				1628	sd->private = sdd;
				1629
				1630	return sd;
				1631	}
				1632
				1633	/*
				1634	* Topology list, bottom-up.
				1635	*/
				1636	static struct sched_domain_topology_level default_topology[] = {
				1637	#ifdef CONFIG_SCHED_SMT
				1638	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
				1639	#endif
Barry Song	778c558	2021-09-24 20:51:03 +1200	[diff] [blame]	1640
				1641	#ifdef CONFIG_SCHED_CLUSTER
				1642	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
				1643	#endif
				1644
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1645	#ifdef CONFIG_SCHED_MC
				1646	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
				1647	#endif
				1648	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
				1649	{ NULL, },
				1650	};
				1651
				1652	static struct sched_domain_topology_level *sched_domain_topology =
				1653	default_topology;
				1654
				1655	#define for_each_sd_topology(tl) \
				1656	for (tl = sched_domain_topology; tl->mask; tl++)
				1657
				1658	void set_sched_topology(struct sched_domain_topology_level *tl)
				1659	{
				1660	if (WARN_ON_ONCE(sched_smp_initialized))
				1661	return;
				1662
				1663	sched_domain_topology = tl;
				1664	}
				1665
				1666	#ifdef CONFIG_NUMA
				1667
				1668	static const struct cpumask *sd_numa_mask(int cpu)
				1669	{
				1670	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
				1671	}
				1672
				1673	static void sched_numa_warn(const char *str)
				1674	{
				1675	static int done = false;
				1676	int i,j;
				1677
				1678	if (done)
				1679	return;
				1680
				1681	done = true;
				1682
				1683	printk(KERN_WARNING "ERROR: %s\n\n", str);
				1684
				1685	for (i = 0; i < nr_node_ids; i++) {
				1686	printk(KERN_WARNING " ");
				1687	for (j = 0; j < nr_node_ids; j++)
				1688	printk(KERN_CONT "%02d ", node_distance(i,j));
				1689	printk(KERN_CONT "\n");
				1690	}
				1691	printk(KERN_WARNING "\n");
				1692	}
				1693
				1694	bool find_numa_distance(int distance)
				1695	{
				1696	int i;
				1697
				1698	if (distance == node_distance(0, 0))
				1699	return true;
				1700
				1701	for (i = 0; i < sched_domains_numa_levels; i++) {
				1702	if (sched_domains_numa_distance[i] == distance)
				1703	return true;
				1704	}
				1705
				1706	return false;
				1707	}
				1708
				1709	/*
				1710	* A system can have three types of NUMA topology:
				1711	* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
				1712	* NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
				1713	* NUMA_BACKPLANE: nodes can reach other nodes through a backplane
				1714	*
				1715	* The difference between a glueless mesh topology and a backplane
				1716	* topology lies in whether communication between not directly
				1717	* connected nodes goes through intermediary nodes (where programs
				1718	* could run), or through backplane controllers. This affects
				1719	* placement of programs.
				1720	*
				1721	* The type of topology can be discerned with the following tests:
				1722	* - If the maximum distance between any nodes is 1 hop, the system
				1723	* is directly connected.
				1724	* - If for two nodes A and B, located N > 1 hops away from each other,
				1725	* there is an intermediary node C, which is < N hops away from both
				1726	* nodes A and B, the system is a glueless mesh.
				1727	*/
				1728	static void init_numa_topology_type(void)
				1729	{
				1730	int a, b, c, n;
				1731
				1732	n = sched_max_numa_distance;
				1733
Srikar Dronamraju	e5e96fa	2018-08-10 22:30:18 +0530	[diff] [blame]	1734	if (sched_domains_numa_levels <= 2) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1735	sched_numa_topology_type = NUMA_DIRECT;
				1736	return;
				1737	}
				1738
				1739	for_each_online_node(a) {
				1740	for_each_online_node(b) {
				1741	/* Find two nodes furthest removed from each other. */
				1742	if (node_distance(a, b) < n)
				1743	continue;
				1744
				1745	/* Is there an intermediary node between a and b? */
				1746	for_each_online_node(c) {
				1747	if (node_distance(a, c) < n &&
				1748	node_distance(b, c) < n) {
				1749	sched_numa_topology_type =
				1750	NUMA_GLUELESS_MESH;
				1751	return;
				1752	}
				1753	}
				1754
				1755	sched_numa_topology_type = NUMA_BACKPLANE;
				1756	return;
				1757	}
				1758	}
				1759	}
				1760
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1761
				1762	#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
				1763
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1764	void sched_init_numa(void)
				1765	{
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1766	struct sched_domain_topology_level *tl;
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1767	unsigned long *distance_map;
				1768	int nr_levels = 0;
				1769	int i, j;
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1770
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1771	/*
				1772	* O(nr_nodes^2) deduplicating selection sort -- in order to find the
				1773	* unique distances in the node_distance() table.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1774	*/
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1775	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
				1776	if (!distance_map)
				1777	return;
				1778
				1779	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1780	for (i = 0; i < nr_node_ids; i++) {
				1781	for (j = 0; j < nr_node_ids; j++) {
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1782	int distance = node_distance(i, j);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1783
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1784	if (distance < LOCAL_DISTANCE \|\| distance >= NR_DISTANCE_VALUES) {
				1785	sched_numa_warn("Invalid distance value range");
				1786	return;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1787	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1788
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1789	bitmap_set(distance_map, distance, 1);
				1790	}
				1791	}
				1792	/*
				1793	* We can now figure out how many unique distance values there are and
				1794	* allocate memory accordingly.
				1795	*/
				1796	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
				1797
				1798	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
				1799	if (!sched_domains_numa_distance) {
				1800	bitmap_free(distance_map);
				1801	return;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1802	}
				1803
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1804	for (i = 0, j = 0; i < nr_levels; i++, j++) {
				1805	j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
				1806	sched_domains_numa_distance[i] = j;
				1807	}
				1808
				1809	bitmap_free(distance_map);
				1810
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1811	/*
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1812	* 'nr_levels' contains the number of unique distances
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1813	*
				1814	* The sched_domains_numa_distance[] array includes the actual distance
				1815	* numbers.
				1816	*/
				1817
				1818	/*
				1819	* Here, we should temporarily reset sched_domains_numa_levels to 0.
				1820	* If it fails to allocate memory for array sched_domains_numa_masks[][],
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1821	* the array will contain less then 'nr_levels' members. This could be
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1822	* dangerous when we use it to iterate array sched_domains_numa_masks[][]
				1823	* in other functions.
				1824	*
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1825	* We reset it to 'nr_levels' at the end of this function.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1826	*/
				1827	sched_domains_numa_levels = 0;
				1828
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1829	sched_domains_numa_masks = kzalloc(sizeof(void ) nr_levels, GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1830	if (!sched_domains_numa_masks)
				1831	return;
				1832
				1833	/*
				1834	* Now for each level, construct a mask per node which contains all
				1835	* CPUs of nodes that are that many hops away from us.
				1836	*/
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1837	for (i = 0; i < nr_levels; i++) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1838	sched_domains_numa_masks[i] =
				1839	kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
				1840	if (!sched_domains_numa_masks[i])
				1841	return;
				1842
				1843	for (j = 0; j < nr_node_ids; j++) {
				1844	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1845	int k;
				1846
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1847	if (!mask)
				1848	return;
				1849
				1850	sched_domains_numa_masks[i][j] = mask;
				1851
				1852	for_each_node(k) {
Valentin Schneider	0083242	2021-08-18 13:13:33 +0530	[diff] [blame]	1853	/*
				1854	* Distance information can be unreliable for
				1855	* offline nodes, defer building the node
				1856	* masks to its bringup.
				1857	* This relies on all unique distance values
				1858	* still being visible at init time.
				1859	*/
				1860	if (!node_online(j))
				1861	continue;
				1862
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1863	if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
				1864	sched_numa_warn("Node-distance not symmetric");
				1865
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1866	if (node_distance(j, k) > sched_domains_numa_distance[i])
				1867	continue;
				1868
				1869	cpumask_or(mask, mask, cpumask_of_node(k));
				1870	}
				1871	}
				1872	}
				1873
				1874	/* Compute default topology size */
				1875	for (i = 0; sched_domain_topology[i].mask; i++);
				1876
Dietmar Eggemann	71e5f66	2021-02-01 10:53:53 +0100	[diff] [blame]	1877	tl = kzalloc((i + nr_levels + 1) *
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1878	sizeof(struct sched_domain_topology_level), GFP_KERNEL);
				1879	if (!tl)
				1880	return;
				1881
				1882	/*
				1883	* Copy the default topology bits..
				1884	*/
				1885	for (i = 0; sched_domain_topology[i].mask; i++)
				1886	tl[i] = sched_domain_topology[i];
				1887
				1888	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1889	* Add the NUMA identity distance, aka single NODE.
				1890	*/
				1891	tl[i++] = (struct sched_domain_topology_level){
				1892	.mask = sd_numa_mask,
				1893	.numa_level = 0,
				1894	SD_INIT_NAME(NODE)
				1895	};
				1896
				1897	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1898	* .. and append 'j' levels of NUMA goodness.
				1899	*/
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1900	for (j = 1; j < nr_levels; i++, j++) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1901	tl[i] = (struct sched_domain_topology_level){
				1902	.mask = sd_numa_mask,
				1903	.sd_flags = cpu_numa_flags,
				1904	.flags = SDTL_OVERLAP,
				1905	.numa_level = j,
				1906	SD_INIT_NAME(NUMA)
				1907	};
				1908	}
				1909
				1910	sched_domain_topology = tl;
				1911
Valentin Schneider	620a6dc	2021-01-22 12:39:43 +0000	[diff] [blame]	1912	sched_domains_numa_levels = nr_levels;
				1913	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1914
				1915	init_numa_topology_type();
Valentin Schneider	0083242	2021-08-18 13:13:33 +0530	[diff] [blame]	1916
				1917	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
				1918	if (!sched_numa_onlined_nodes)
				1919	return;
				1920
				1921	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
				1922	for_each_online_node(i)
				1923	bitmap_set(sched_numa_onlined_nodes, i, 1);
				1924	}
				1925
				1926	static void __sched_domains_numa_masks_set(unsigned int node)
				1927	{
				1928	int i, j;
				1929
				1930	/*
				1931	* NUMA masks are not built for offline nodes in sched_init_numa().
				1932	* Thus, when a CPU of a never-onlined-before node gets plugged in,
				1933	* adding that new CPU to the right NUMA masks is not sufficient: the
				1934	* masks of that CPU's node must also be updated.
				1935	*/
				1936	if (test_bit(node, sched_numa_onlined_nodes))
				1937	return;
				1938
				1939	bitmap_set(sched_numa_onlined_nodes, node, 1);
				1940
				1941	for (i = 0; i < sched_domains_numa_levels; i++) {
				1942	for (j = 0; j < nr_node_ids; j++) {
				1943	if (!node_online(j) \|\| node == j)
				1944	continue;
				1945
				1946	if (node_distance(j, node) > sched_domains_numa_distance[i])
				1947	continue;
				1948
				1949	/* Add remote nodes in our masks */
				1950	cpumask_or(sched_domains_numa_masks[i][node],
				1951	sched_domains_numa_masks[i][node],
				1952	sched_domains_numa_masks[0][j]);
				1953	}
				1954	}
				1955
				1956	/*
				1957	* A new node has been brought up, potentially changing the topology
				1958	* classification.
				1959	*
				1960	* Note that this is racy vs any use of sched_numa_topology_type :/
				1961	*/
				1962	init_numa_topology_type();
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1963	}
				1964
				1965	void sched_domains_numa_masks_set(unsigned int cpu)
				1966	{
				1967	int node = cpu_to_node(cpu);
				1968	int i, j;
				1969
Valentin Schneider	0083242	2021-08-18 13:13:33 +0530	[diff] [blame]	1970	__sched_domains_numa_masks_set(node);
				1971
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1972	for (i = 0; i < sched_domains_numa_levels; i++) {
				1973	for (j = 0; j < nr_node_ids; j++) {
Valentin Schneider	0083242	2021-08-18 13:13:33 +0530	[diff] [blame]	1974	if (!node_online(j))
				1975	continue;
				1976
				1977	/* Set ourselves in the remote node's masks */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1978	if (node_distance(j, node) <= sched_domains_numa_distance[i])
				1979	cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
				1980	}
				1981	}
				1982	}
				1983
				1984	void sched_domains_numa_masks_clear(unsigned int cpu)
				1985	{
				1986	int i, j;
				1987
				1988	for (i = 0; i < sched_domains_numa_levels; i++) {
				1989	for (j = 0; j < nr_node_ids; j++)
				1990	cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
				1991	}
				1992	}
				1993
Wanpeng Li	e0e8d49	2019-06-28 16:51:41 +0800	[diff] [blame]	1994	/*
				1995	* sched_numa_find_closest() - given the NUMA topology, find the cpu
				1996	* closest to @cpu from @cpumask.
				1997	* cpumask: cpumask to find a cpu from
				1998	* cpu: cpu to be close to
				1999	*
				2000	* returns: cpu, or nr_cpu_ids when nothing found.
				2001	*/
				2002	int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
				2003	{
				2004	int i, j = cpu_to_node(cpu);
				2005
				2006	for (i = 0; i < sched_domains_numa_levels; i++) {
				2007	cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
				2008	if (cpu < nr_cpu_ids)
				2009	return cpu;
				2010	}
				2011	return nr_cpu_ids;
				2012	}
				2013
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2014	#endif /* CONFIG_NUMA */
				2015
				2016	static int __sdt_alloc(const struct cpumask *cpu_map)
				2017	{
				2018	struct sched_domain_topology_level *tl;
				2019	int j;
				2020
				2021	for_each_sd_topology(tl) {
				2022	struct sd_data *sdd = &tl->data;
				2023
				2024	sdd->sd = alloc_percpu(struct sched_domain *);
				2025	if (!sdd->sd)
				2026	return -ENOMEM;
				2027
				2028	sdd->sds = alloc_percpu(struct sched_domain_shared *);
				2029	if (!sdd->sds)
				2030	return -ENOMEM;
				2031
				2032	sdd->sg = alloc_percpu(struct sched_group *);
				2033	if (!sdd->sg)
				2034	return -ENOMEM;
				2035
				2036	sdd->sgc = alloc_percpu(struct sched_group_capacity *);
				2037	if (!sdd->sgc)
				2038	return -ENOMEM;
				2039
				2040	for_each_cpu(j, cpu_map) {
				2041	struct sched_domain *sd;
				2042	struct sched_domain_shared *sds;
				2043	struct sched_group *sg;
				2044	struct sched_group_capacity *sgc;
				2045
				2046	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
				2047	GFP_KERNEL, cpu_to_node(j));
				2048	if (!sd)
				2049	return -ENOMEM;
				2050
				2051	*per_cpu_ptr(sdd->sd, j) = sd;
				2052
				2053	sds = kzalloc_node(sizeof(struct sched_domain_shared),
				2054	GFP_KERNEL, cpu_to_node(j));
				2055	if (!sds)
				2056	return -ENOMEM;
				2057
				2058	*per_cpu_ptr(sdd->sds, j) = sds;
				2059
				2060	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				2061	GFP_KERNEL, cpu_to_node(j));
				2062	if (!sg)
				2063	return -ENOMEM;
				2064
				2065	sg->next = sg;
				2066
				2067	*per_cpu_ptr(sdd->sg, j) = sg;
				2068
				2069	sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
				2070	GFP_KERNEL, cpu_to_node(j));
				2071	if (!sgc)
				2072	return -ENOMEM;
				2073
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	2074	#ifdef CONFIG_SCHED_DEBUG
				2075	sgc->id = j;
				2076	#endif
				2077
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2078	*per_cpu_ptr(sdd->sgc, j) = sgc;
				2079	}
				2080	}
				2081
				2082	return 0;
				2083	}
				2084
				2085	static void __sdt_free(const struct cpumask *cpu_map)
				2086	{
				2087	struct sched_domain_topology_level *tl;
				2088	int j;
				2089
				2090	for_each_sd_topology(tl) {
				2091	struct sd_data *sdd = &tl->data;
				2092
				2093	for_each_cpu(j, cpu_map) {
				2094	struct sched_domain *sd;
				2095
				2096	if (sdd->sd) {
				2097	sd = *per_cpu_ptr(sdd->sd, j);
				2098	if (sd && (sd->flags & SD_OVERLAP))
				2099	free_sched_groups(sd->groups, 0);
				2100	kfree(*per_cpu_ptr(sdd->sd, j));
				2101	}
				2102
				2103	if (sdd->sds)
				2104	kfree(*per_cpu_ptr(sdd->sds, j));
				2105	if (sdd->sg)
				2106	kfree(*per_cpu_ptr(sdd->sg, j));
				2107	if (sdd->sgc)
				2108	kfree(*per_cpu_ptr(sdd->sgc, j));
				2109	}
				2110	free_percpu(sdd->sd);
				2111	sdd->sd = NULL;
				2112	free_percpu(sdd->sds);
				2113	sdd->sds = NULL;
				2114	free_percpu(sdd->sg);
				2115	sdd->sg = NULL;
				2116	free_percpu(sdd->sgc);
				2117	sdd->sgc = NULL;
				2118	}
				2119	}
				2120
Viresh Kumar	181a80d1	2017-04-27 13:58:59 +0530	[diff] [blame]	2121	static struct sched_domain build_sched_domain(struct sched_domain_topology_level tl,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2122	const struct cpumask cpu_map, struct sched_domain_attr attr,
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	2123	struct sched_domain *child, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2124	{
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	2125	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2126
				2127	if (child) {
				2128	sd->level = child->level + 1;
				2129	sched_domain_level_max = max(sched_domain_level_max, sd->level);
				2130	child->parent = sd;
				2131
				2132	if (!cpumask_subset(sched_domain_span(child),
				2133	sched_domain_span(sd))) {
				2134	pr_err("BUG: arch topology borken\n");
				2135	#ifdef CONFIG_SCHED_DEBUG
				2136	pr_err(" the %s domain not a subset of the %s domain\n",
				2137	child->name, sd->name);
				2138	#endif
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2139	/* Fixup, ensure @sd has at least @child CPUs. */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2140	cpumask_or(sched_domain_span(sd),
				2141	sched_domain_span(sd),
				2142	sched_domain_span(child));
				2143	}
				2144
				2145	}
				2146	set_domain_attribute(sd, attr);
				2147
				2148	return sd;
				2149	}
				2150
				2151	/*
Valentin Schneider	ccf7412	2020-01-15 16:09:15 +0000	[diff] [blame]	2152	* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
				2153	* any two given CPUs at this (non-NUMA) topology level.
				2154	*/
				2155	static bool topology_span_sane(struct sched_domain_topology_level *tl,
				2156	const struct cpumask *cpu_map, int cpu)
				2157	{
				2158	int i;
				2159
				2160	/* NUMA levels are allowed to overlap */
				2161	if (tl->flags & SDTL_OVERLAP)
				2162	return true;
				2163
				2164	/*
				2165	* Non-NUMA levels cannot partially overlap - they must be either
				2166	* completely equal or completely disjoint. Otherwise we can end up
				2167	* breaking the sched_group lists - i.e. a later get_group() pass
				2168	* breaks the linking done for an earlier span.
				2169	*/
				2170	for_each_cpu(i, cpu_map) {
				2171	if (i == cpu)
				2172	continue;
				2173	/*
				2174	* We should 'and' all those masks with 'cpu_map' to exactly
				2175	* match the topology we're about to build, but that can only
				2176	* remove CPUs, which only lessens our ability to detect
				2177	* overlaps
				2178	*/
				2179	if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
				2180	cpumask_intersects(tl->mask(cpu), tl->mask(i)))
				2181	return false;
				2182	}
				2183
				2184	return true;
				2185	}
				2186
				2187	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2188	* Build sched domains for a given set of CPUs and attach the sched domains
				2189	* to the individual CPUs
				2190	*/
				2191	static int
				2192	build_sched_domains(const struct cpumask cpu_map, struct sched_domain_attr attr)
				2193	{
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	2194	enum s_alloc alloc_state = sa_none;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2195	struct sched_domain *sd;
				2196	struct s_data d;
				2197	struct rq *rq = NULL;
				2198	int i, ret = -ENOMEM;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2199	bool has_asym = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2200
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	2201	if (WARN_ON(cpumask_empty(cpu_map)))
				2202	goto error;
				2203
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2204	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
				2205	if (alloc_state != sa_rootdomain)
				2206	goto error;
				2207
				2208	/* Set up domains for CPUs specified by the cpu_map: */
				2209	for_each_cpu(i, cpu_map) {
				2210	struct sched_domain_topology_level *tl;
				2211
				2212	sd = NULL;
				2213	for_each_sd_topology(tl) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	2214
Valentin Schneider	ccf7412	2020-01-15 16:09:15 +0000	[diff] [blame]	2215	if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				2216	goto error;
				2217
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	2218	sd = build_sched_domain(tl, cpu_map, attr, sd, i);
				2219
				2220	has_asym \|= sd->flags & SD_ASYM_CPUCAPACITY;
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	2221
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2222	if (tl == sched_domain_topology)
				2223	*per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstra	af85596	2017-04-26 17:36:41 +0200	[diff] [blame]	2224	if (tl->flags & SDTL_OVERLAP)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2225	sd->flags \|= SD_OVERLAP;
				2226	if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				2227	break;
				2228	}
				2229	}
				2230
				2231	/* Build the groups for the domains */
				2232	for_each_cpu(i, cpu_map) {
				2233	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2234	sd->span_weight = cpumask_weight(sched_domain_span(sd));
				2235	if (sd->flags & SD_OVERLAP) {
				2236	if (build_overlap_sched_groups(sd, i))
				2237	goto error;
				2238	} else {
				2239	if (build_sched_groups(sd, i))
				2240	goto error;
				2241	}
				2242	}
				2243	}
				2244
				2245	/* Calculate CPU capacity for physical packages and nodes */
				2246	for (i = nr_cpumask_bits-1; i >= 0; i--) {
				2247	if (!cpumask_test_cpu(i, cpu_map))
				2248	continue;
				2249
				2250	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2251	claim_allocations(i, sd);
				2252	init_sched_groups_capacity(i, sd);
				2253	}
				2254	}
				2255
				2256	/* Attach the domains */
				2257	rcu_read_lock();
				2258	for_each_cpu(i, cpu_map) {
				2259	rq = cpu_rq(i);
				2260	sd = *per_cpu_ptr(d.sd, i);
				2261
				2262	/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
				2263	if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
				2264	WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
				2265
				2266	cpu_attach_domain(sd, d.rd, i);
				2267	}
				2268	rcu_read_unlock();
				2269
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2270	if (has_asym)
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2271	static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2272
Peter Zijlstra	9406415	2021-04-15 18:23:17 +0200	[diff] [blame]	2273	if (rq && sched_debug_verbose) {
Juri Lelli	bf5015a	2018-05-24 17:29:36 +0200	[diff] [blame]	2274	pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2275	cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
				2276	}
				2277
				2278	ret = 0;
				2279	error:
				2280	__free_domain_allocs(&d, alloc_state, cpu_map);
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2281
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2282	return ret;
				2283	}
				2284
				2285	/* Current sched domains: */
				2286	static cpumask_var_t *doms_cur;
				2287
				2288	/* Number of sched domains in 'doms_cur': */
				2289	static int ndoms_cur;
				2290
Ingo Molnar	3b03706	2021-03-18 13:38:50 +0100	[diff] [blame]	2291	/* Attributes of custom domains in 'doms_cur' */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2292	static struct sched_domain_attr *dattr_cur;
				2293
				2294	/*
				2295	* Special case: If a kmalloc() of a doms_cur partition (array of
				2296	* cpumask) fails, then fallback to a single sched domain,
				2297	* as determined by the single cpumask fallback_doms.
				2298	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2299	static cpumask_var_t fallback_doms;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2300
				2301	/*
				2302	* arch_update_cpu_topology lets virtualized architectures update the
				2303	* CPU core maps. It is supposed to return 1 if the topology changed
				2304	* or 0 if it stayed the same.
				2305	*/
				2306	int __weak arch_update_cpu_topology(void)
				2307	{
				2308	return 0;
				2309	}
				2310
				2311	cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
				2312	{
				2313	int i;
				2314	cpumask_var_t *doms;
				2315
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	2316	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2317	if (!doms)
				2318	return NULL;
				2319	for (i = 0; i < ndoms; i++) {
				2320	if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
				2321	free_sched_domains(doms, i);
				2322	return NULL;
				2323	}
				2324	}
				2325	return doms;
				2326	}
				2327
				2328	void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
				2329	{
				2330	unsigned int i;
				2331	for (i = 0; i < ndoms; i++)
				2332	free_cpumask_var(doms[i]);
				2333	kfree(doms);
				2334	}
				2335
				2336	/*
Juri Lelli	cb0c041	2018-12-19 14:34:45 +0100	[diff] [blame]	2337	* Set up scheduler domains and groups. For now this just excludes isolated
				2338	* CPUs, but could be used to exclude other special cases in the future.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2339	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2340	int sched_init_domains(const struct cpumask *cpu_map)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2341	{
				2342	int err;
				2343
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2344	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	2345	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2346	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
				2347
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2348	arch_update_cpu_topology();
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	2349	asym_cpu_capacity_scan();
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2350	ndoms_cur = 1;
				2351	doms_cur = alloc_sched_domains(ndoms_cur);
				2352	if (!doms_cur)
				2353	doms_cur = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2354	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2355	err = build_sched_domains(doms_cur[0], NULL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2356
				2357	return err;
				2358	}
				2359
				2360	/*
				2361	* Detach sched domains from a group of CPUs specified in cpu_map
				2362	* These CPUs will now be attached to the NULL domain
				2363	*/
				2364	static void detach_destroy_domains(const struct cpumask *cpu_map)
				2365	{
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2366	unsigned int cpu = cpumask_any(cpu_map);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2367	int i;
				2368
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2369	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
				2370	static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
				2371
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2372	rcu_read_lock();
				2373	for_each_cpu(i, cpu_map)
				2374	cpu_attach_domain(NULL, &def_root_domain, i);
				2375	rcu_read_unlock();
				2376	}
				2377
				2378	/* handle null as "default" */
				2379	static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
				2380	struct sched_domain_attr *new, int idx_new)
				2381	{
				2382	struct sched_domain_attr tmp;
				2383
				2384	/* Fast path: */
				2385	if (!new && !cur)
				2386	return 1;
				2387
				2388	tmp = SD_ATTR_INIT;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2389
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2390	return !memcmp(cur ? (cur + idx_cur) : &tmp,
				2391	new ? (new + idx_new) : &tmp,
				2392	sizeof(struct sched_domain_attr));
				2393	}
				2394
				2395	/*
				2396	* Partition sched domains as specified by the 'ndoms_new'
				2397	* cpumasks in the array doms_new[] of cpumasks. This compares
				2398	* doms_new[] to the current sched domain partitioning, doms_cur[].
				2399	* It destroys each deleted domain and builds each new domain.
				2400	*
				2401	* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
				2402	* The masks don't intersect (don't overlap.) We should setup one
				2403	* sched domain for each mask. CPUs not in any of the cpumasks will
				2404	* not be load balanced. If the same cpumask appears both in the
				2405	* current 'doms_cur' domains and in the new 'doms_new', we can leave
				2406	* it as it is.
				2407	*
				2408	* The passed in 'doms_new' should be allocated using
				2409	* alloc_sched_domains. This routine takes ownership of it and will
				2410	* free_sched_domains it when done with it. If the caller failed the
				2411	* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
				2412	* and partition_sched_domains() will fallback to the single partition
				2413	* 'fallback_doms', it also forces the domains to be rebuilt.
				2414	*
				2415	* If doms_new == NULL it will be replaced with cpu_online_mask.
				2416	* ndoms_new == 0 is a special case for destroying existing domains,
				2417	* and it will not create the default domain.
				2418	*
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2419	* Call with hotplug lock and sched_domains_mutex held
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2420	*/
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2421	void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				2422	struct sched_domain_attr *dattr_new)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2423	{
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2424	bool __maybe_unused has_eas = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2425	int i, j, n;
				2426	int new_topology;
				2427
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2428	lockdep_assert_held(&sched_domains_mutex);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2429
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2430	/* Let the architecture update CPU core mappings: */
				2431	new_topology = arch_update_cpu_topology();
Beata Michalska	c744dc4	2021-06-03 15:06:26 +0100	[diff] [blame]	2432	/* Trigger rebuilding CPU capacity asymmetry data */
				2433	if (new_topology)
				2434	asym_cpu_capacity_scan();
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2435
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2436	if (!doms_new) {
				2437	WARN_ON_ONCE(dattr_new);
				2438	n = 0;
				2439	doms_new = alloc_sched_domains(1);
				2440	if (doms_new) {
				2441	n = 1;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2442	cpumask_and(doms_new[0], cpu_active_mask,
				2443	housekeeping_cpumask(HK_FLAG_DOMAIN));
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2444	}
				2445	} else {
				2446	n = ndoms_new;
				2447	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2448
				2449	/* Destroy deleted domains: */
				2450	for (i = 0; i < ndoms_cur; i++) {
				2451	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2452	if (cpumask_equal(doms_cur[i], doms_new[j]) &&
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2453	dattrs_equal(dattr_cur, i, dattr_new, j)) {
				2454	struct root_domain *rd;
				2455
				2456	/*
				2457	* This domain won't be destroyed and as such
				2458	* its dl_bw->total_bw needs to be cleared. It
				2459	* will be recomputed in function
				2460	* update_tasks_root_domain().
				2461	*/
				2462	rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				2463	dl_clear_root_domain(rd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2464	goto match1;
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2465	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2466	}
				2467	/* No match - a current sched domain not in new doms_new[] */
				2468	detach_destroy_domains(doms_cur[i]);
				2469	match1:
				2470	;
				2471	}
				2472
				2473	n = ndoms_cur;
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2474	if (!doms_new) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2475	n = 0;
				2476	doms_new = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2477	cpumask_and(doms_new[0], cpu_active_mask,
				2478	housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2479	}
				2480
				2481	/* Build new domains: */
				2482	for (i = 0; i < ndoms_new; i++) {
				2483	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2484	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2485	dattrs_equal(dattr_new, i, dattr_cur, j))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2486	goto match2;
				2487	}
				2488	/* No match - add a new doms_new */
				2489	build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
				2490	match2:
				2491	;
				2492	}
				2493
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2494	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2495	/* Build perf. domains: */
				2496	for (i = 0; i < ndoms_new; i++) {
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2497	for (j = 0; j < n && !sched_energy_update; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2498	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2499	cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				2500	has_eas = true;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2501	goto match3;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2502	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2503	}
				2504	/* No match - add perf. domains for a new rd */
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2505	has_eas \|= build_perf_domains(doms_new[i]);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2506	match3:
				2507	;
				2508	}
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2509	sched_energy_set(has_eas);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2510	#endif
				2511
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2512	/* Remember the new sched domains: */
				2513	if (doms_cur != &fallback_doms)
				2514	free_sched_domains(doms_cur, ndoms_cur);
				2515
				2516	kfree(dattr_cur);
				2517	doms_cur = doms_new;
				2518	dattr_cur = dattr_new;
				2519	ndoms_cur = ndoms_new;
				2520
Peter Zijlstra	3b87f13	2021-03-25 11:31:20 +0100	[diff] [blame]	2521	update_sched_domain_debugfs();
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2522	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2523
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2524	/*
				2525	* Call with hotplug lock held
				2526	*/
				2527	void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				2528	struct sched_domain_attr *dattr_new)
				2529	{
				2530	mutex_lock(&sched_domains_mutex);
				2531	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2532	mutex_unlock(&sched_domains_mutex);
				2533	}