// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

static int __init sched_debug_setup(char *str)
{
        sched_debug_enabled = true;

        return 0;
}
early_param("sched_debug", sched_debug_setup);

static inline bool sched_debug(void)
{
        return sched_debug_enabled;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
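
/*
 * Illustrative expansion (a sketch, not generated output): an sd_flags.h
 * entry of the form SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 * would expand to
 *
 *   [__SD_BALANCE_NEWIDLE] = {
 *           .meta_flags = SDF_SHARED_CHILD | SDF_NEEDS_GROUPS,
 *           .name       = "SD_BALANCE_NEWIDLE",
 *   },
 *
 * giving each flag a printable name plus its metaflags for the consistency
 * checks in sched_domain_debug_one() below.
 */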

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
{
        struct sched_group *group = sd->groups;
        unsigned long flags = sd->flags;
        unsigned int idx;

        cpumask_clear(groupmask);

        printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
        printk(KERN_CONT "span=%*pbl level=%s\n",
               cpumask_pr_args(sched_domain_span(sd)), sd->name);

        if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
        }
        if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
        }

        for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
                unsigned int flag = BIT(idx);
                unsigned int meta_flags = sd_flag_debug[idx].meta_flags;

                if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
                    !(sd->child->flags & flag))
                        printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
                               sd_flag_debug[idx].name);

                if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
                    !(sd->parent->flags & flag))
                        printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
                               sd_flag_debug[idx].name);
        }

        printk(KERN_DEBUG "%*s groups:", level + 1, "");
        do {
                if (!group) {
                        printk("\n");
                        printk(KERN_ERR "ERROR: group is NULL\n");
                        break;
                }

                if (!cpumask_weight(sched_group_span(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }

                if (!(sd->flags & SD_OVERLAP) &&
                    cpumask_intersects(groupmask, sched_group_span(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }

                cpumask_or(groupmask, groupmask, sched_group_span(group));

                printk(KERN_CONT " %d:{ span=%*pbl",
                       group->sgc->id,
                       cpumask_pr_args(sched_group_span(group)));

                if ((sd->flags & SD_OVERLAP) &&
                    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
                        printk(KERN_CONT " mask=%*pbl",
                               cpumask_pr_args(group_balance_mask(group)));
                }

                if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
                        printk(KERN_CONT " cap=%lu", group->sgc->capacity);

                if (group == sd->groups && sd->child &&
                    !cpumask_equal(sched_domain_span(sd->child),
                                   sched_group_span(group))) {
                        printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
                }

                printk(KERN_CONT " }");

                group = group->next;

                if (group != sd->groups)
                        printk(KERN_CONT ",");

        } while (group != sd->groups);
        printk(KERN_CONT "\n");

        if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");

        if (sd->parent &&
            !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
        return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
        int level = 0;

        if (!sched_debug_enabled)
                return;

        if (!sd) {
                printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
                return;
        }

        printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

        for (;;) {
                if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
                        break;
                level++;
                sd = sd->parent;
                if (!sd)
                        break;
        }
}
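
/*
 * Example output (a sketch for an SMT-2, four-core machine booted with
 * "sched_debug"; spans, group IDs and exact spacing are illustrative, the
 * real layout comes from the printk()s above):
 *
 *   CPU0 attaching sched-domain(s):
 *    domain-0: span=0-1 level=SMT
 *     groups: 0:{ span=0 }, 1:{ span=1 }
 *    domain-1: span=0-7 level=MC
 *     groups: 0:{ span=0-1 }, 2:{ span=2-3 }, 4:{ span=4-5 }, 6:{ span=6-7 }
 */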
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
        return false;
}
#endif /* CONFIG_SCHED_DEBUG */

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG
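
/*
 * Illustrative expansion (a sketch): each SD_FLAG(name, mflags) entry
 * contributes "name" to the OR chain when SDF_NEEDS_GROUPS is among its
 * metaflags, and 0 otherwise, e.g.
 *
 *   static const unsigned int SD_DEGENERATE_GROUPS_MASK =
 *       (SD_BALANCE_NEWIDLE * !!((SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) |
 *       ...
 *       0;
 *
 * The trailing "0;" above terminates the chain.
 */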

static int sd_degenerate(struct sched_domain *sd)
{
        if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;

        /* Following flags need at least 2 groups */
        if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
            (sd->groups != sd->groups->next))
                return 0;

        /* Following flags don't use groups */
        if (sd->flags & (SD_WAKE_AFFINE))
                return 0;

        return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
        unsigned long cflags = sd->flags, pflags = parent->flags;

        if (sd_degenerate(parent))
                return 1;

        if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;

        /* Flags needing groups don't count if only 1 group in parent */
        if (parent->groups == parent->groups->next)
                pflags &= ~SD_DEGENERATE_GROUPS_MASK;

        if (~cflags & pflags)
                return 0;

        return 1;
}
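
/*
 * Example (a sketch): on a machine without SMT, each SMT-level domain spans
 * a single CPU, so sd_degenerate() folds that level away. Likewise, if an
 * MC domain spans the same CPUs as its DIE parent and the parent has a
 * single group, every group-needing flag is masked out of pflags above and
 * the parent collapses in sd_parent_degenerate().
 */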

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;

void rebuild_sched_domains_energy(void)
{
        mutex_lock(&sched_energy_mutex);
        sched_energy_update = true;
        rebuild_sched_domains();
        sched_energy_update = false;
        mutex_unlock(&sched_energy_mutex);
}

#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret, state;

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (!ret && write) {
                state = static_branch_unlikely(&sched_energy_present);
                if (state != sysctl_sched_energy_aware)
                        rebuild_sched_domains_energy();
        }

        return ret;
}
#endif
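/*
 * Example (an illustrative admin-side usage): writing the sysctl, e.g.
 * "sysctl kernel.sched_energy_aware=0", lands in the handler above; if the
 * requested value disagrees with the current state of the
 * sched_energy_present static key, the sched domains are rebuilt.
 */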

static void free_pd(struct perf_domain *pd)
{
        struct perf_domain *tmp;

        while (pd) {
                tmp = pd->next;
                kfree(pd);
                pd = tmp;
        }
}

static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
        while (pd) {
                if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
                        return pd;
                pd = pd->next;
        }

        return NULL;
}

static struct perf_domain *pd_init(int cpu)
{
        struct em_perf_domain *obj = em_cpu_get(cpu);
        struct perf_domain *pd;

        if (!obj) {
                if (sched_debug())
                        pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
                return NULL;
        }

        pd = kzalloc(sizeof(*pd), GFP_KERNEL);
        if (!pd)
                return NULL;
        pd->em_pd = obj;

        return pd;
}

static void perf_domain_debug(const struct cpumask *cpu_map,
                              struct perf_domain *pd)
{
        if (!sched_debug() || !pd)
                return;

        printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

        while (pd) {
                printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
                       cpumask_first(perf_domain_span(pd)),
                       cpumask_pr_args(perf_domain_span(pd)),
                       em_pd_nr_perf_states(pd->em_pd));
                pd = pd->next;
        }

        printk(KERN_CONT "\n");
}

static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
        struct perf_domain *pd;

        pd = container_of(rp, struct perf_domain, rcu);
        free_pd(pd);
}

static void sched_energy_set(bool has_eas)
{
        if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
                if (sched_debug())
                        pr_info("%s: stopping EAS\n", __func__);
                static_branch_disable_cpuslocked(&sched_energy_present);
        } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
                if (sched_debug())
                        pr_info("%s: starting EAS\n", __func__);
                static_branch_enable_cpuslocked(&sched_energy_present);
        }
}

/*
 * EAS can be used on a root domain if it meets all the following conditions:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. the EM complexity is low enough to keep scheduling overheads low;
 *    5. schedutil is driving the frequency of all CPUs of the rd;
 *    6. frequency invariance support is present.
 *
 * The complexity of the Energy Model is defined as:
 *
 *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with parameters defined as:
 *  - nr_pd:   the number of performance domains
 *  - nr_cpus: the number of CPUs
 *  - nr_ps:   the sum of the number of performance states of all performance
 *             domains (for example, on a system with 2 performance domains,
 *             with 10 performance states each, nr_ps = 2 * 10 = 20).
 *
 * It is generally not a good idea to use such a model in the wake-up path on
 * very complex platforms because of the associated scheduling overheads. The
 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
 * with per-CPU DVFS and less than 8 performance states each, for example.
 */
#define EM_MAX_COMPLEXITY 2048
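
/*
 * Worked example of the bound above (a sketch of the check performed in
 * build_perf_domains()): 16 CPUs with per-CPU DVFS gives nr_pd = 16 and
 * nr_cpus = 16. With 7 performance states per domain, nr_ps = 16 * 7 = 112
 * and C = 16 * (16 + 112) = 2048, which does not exceed EM_MAX_COMPLEXITY,
 * so EAS can start. With 8 states each, nr_ps = 128 and
 * C = 16 * (16 + 128) = 2304 > 2048, so EAS is rejected.
 */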

extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
        int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
        struct perf_domain *pd = NULL, *tmp;
        int cpu = cpumask_first(cpu_map);
        struct root_domain *rd = cpu_rq(cpu)->rd;
        struct cpufreq_policy *policy;
        struct cpufreq_governor *gov;

        if (!sysctl_sched_energy_aware)
                goto free;

        /* EAS is enabled for asymmetric CPU capacity topologies. */
        if (!per_cpu(sd_asym_cpucapacity, cpu)) {
                if (sched_debug()) {
                        pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
                                cpumask_pr_args(cpu_map));
                }
                goto free;
        }

        /* EAS definitely does *not* handle SMT */
        if (sched_smt_active()) {
                pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
                        cpumask_pr_args(cpu_map));
                goto free;
        }

        if (!arch_scale_freq_invariant()) {
                if (sched_debug()) {
                        pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported\n",
                                cpumask_pr_args(cpu_map));
                }
                goto free;
        }

        for_each_cpu(i, cpu_map) {
                /* Skip already covered CPUs. */
                if (find_pd(pd, i))
                        continue;

                /* Do not attempt EAS if schedutil is not being used. */
                policy = cpufreq_cpu_get(i);
                if (!policy)
                        goto free;
                gov = policy->governor;
                cpufreq_cpu_put(policy);
                if (gov != &schedutil_gov) {
                        if (rd->pd)
                                pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
                                        cpumask_pr_args(cpu_map));
                        goto free;
                }

                /* Create the new pd and add it to the local list. */
                tmp = pd_init(i);
                if (!tmp)
                        goto free;
                tmp->next = pd;
                pd = tmp;

                /*
                 * Count performance domains and performance states for the
                 * complexity check.
                 */
                nr_pd++;
                nr_ps += em_pd_nr_perf_states(pd->em_pd);
        }

        /* Bail out if the Energy Model complexity is too high. */
        if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
                WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
                     cpumask_pr_args(cpu_map));
                goto free;
        }

        perf_domain_debug(cpu_map, pd);

        /* Attach the new list of performance domains to the root domain. */
        tmp = rd->pd;
        rcu_assign_pointer(rd->pd, pd);
        if (tmp)
                call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

        return !!pd;

free:
        free_pd(pd);
        tmp = rd->pd;
        rcu_assign_pointer(rd->pd, NULL);
        if (tmp)
                call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

        return false;
}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */

static void free_rootdomain(struct rcu_head *rcu)
{
        struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

        cpupri_cleanup(&rd->cpupri);
        cpudl_cleanup(&rd->cpudl);
        free_cpumask_var(rd->dlo_mask);
        free_cpumask_var(rd->rto_mask);
        free_cpumask_var(rd->online);
        free_cpumask_var(rd->span);
        free_pd(rd->pd);
        kfree(rd);
}

void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
        struct root_domain *old_rd = NULL;
        unsigned long flags;

        raw_spin_lock_irqsave(&rq->lock, flags);

        if (rq->rd) {
                old_rd = rq->rd;

                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);

                cpumask_clear_cpu(rq->cpu, old_rd->span);

                /*
                 * If we don't want to free the old_rd yet then
                 * set old_rd to NULL to skip the freeing later
                 * in this function:
                 */
                if (!atomic_dec_and_test(&old_rd->refcount))
                        old_rd = NULL;
        }

        atomic_inc(&rd->refcount);
        rq->rd = rd;

        cpumask_set_cpu(rq->cpu, rd->span);
        if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);

        raw_spin_unlock_irqrestore(&rq->lock, flags);

        if (old_rd)
                call_rcu(&old_rd->rcu, free_rootdomain);
}

void sched_get_rd(struct root_domain *rd)
{
        atomic_inc(&rd->refcount);
}

void sched_put_rd(struct root_domain *rd)
{
        if (!atomic_dec_and_test(&rd->refcount))
                return;

        call_rcu(&rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
        if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
                goto out;
        if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
                goto free_span;
        if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
                goto free_online;
        if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                goto free_dlo_mask;

#ifdef HAVE_RT_PUSH_IPI
        rd->rto_cpu = -1;
        raw_spin_lock_init(&rd->rto_lock);
        init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif

        rd->visit_gen = 0;
        init_dl_bw(&rd->dl_bw);
        if (cpudl_init(&rd->cpudl) != 0)
                goto free_rto_mask;

        if (cpupri_init(&rd->cpupri) != 0)
                goto free_cpudl;
        return 0;

free_cpudl:
        cpudl_cleanup(&rd->cpudl);
free_rto_mask:
        free_cpumask_var(rd->rto_mask);
free_dlo_mask:
        free_cpumask_var(rd->dlo_mask);
free_online:
        free_cpumask_var(rd->online);
free_span:
        free_cpumask_var(rd->span);
out:
        return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void init_defrootdomain(void)
{
        init_rootdomain(&def_root_domain);

        atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
        struct root_domain *rd;

        rd = kzalloc(sizeof(*rd), GFP_KERNEL);
        if (!rd)
                return NULL;

        if (init_rootdomain(rd) != 0) {
                kfree(rd);
                return NULL;
        }

        return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
        struct sched_group *tmp, *first;

        if (!sg)
                return;

        first = sg;
        do {
                tmp = sg->next;

                if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
                        kfree(sg->sgc);

                if (atomic_dec_and_test(&sg->ref))
                        kfree(sg);
                sg = tmp;
        } while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
        /*
         * A normal sched domain may have multiple group references, while an
         * overlapping domain, having private groups, has only one. Iterate,
         * dropping group/capacity references, freeing where none remain.
         */
        free_sched_groups(sd->groups, 1);

        if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
                kfree(sd->shared);
        kfree(sd);
}

static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
        struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

        while (sd) {
                struct sched_domain *parent = sd->parent;
                destroy_sched_domain(sd);
                sd = parent;
        }
}

static void destroy_sched_domains(struct sched_domain *sd)
{
        if (sd)
                call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}

/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this allows us to
 * avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
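/*
 * For example (a sketch of the consumer side): with the per-CPU IDs below
 * in place, cpus_share_cache() reduces to an integer compare along the
 * lines of:
 *
 *   return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 */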
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

static void update_top_cache_domain(int cpu)
{
        struct sched_domain_shared *sds = NULL;
        struct sched_domain *sd;
        int id = cpu;
        int size = 1;

        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
        if (sd) {
                id = cpumask_first(sched_domain_span(sd));
                size = cpumask_weight(sched_domain_span(sd));
                sds = sd->shared;
        }

        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
        rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

        sd = lowest_flag_domain(cpu, SD_NUMA);
        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

        sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
        rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

        sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
        rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
        int numa_distance = 0;

        /* Remove the sched domains which do not contribute to scheduling. */
        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
                if (!parent)
                        break;

                if (sd_parent_degenerate(tmp, parent)) {
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
                        /*
                         * Transfer SD_PREFER_SIBLING down in case of a
                         * degenerate parent; the spans match for this
                         * so the property transfers.
                         */
                        if (parent->flags & SD_PREFER_SIBLING)
                                tmp->flags |= SD_PREFER_SIBLING;
                        destroy_sched_domain(parent);
                } else
                        tmp = tmp->parent;
        }

        if (sd && sd_degenerate(sd)) {
                tmp = sd;
                sd = sd->parent;
                destroy_sched_domain(tmp);
                if (sd)
                        sd->child = NULL;
        }

        for (tmp = sd; tmp; tmp = tmp->parent)
                numa_distance += !!(tmp->flags & SD_NUMA);

        /*
         * FIXME: Diameter >=3 is misrepresented.
         *
         * Smallest diameter=3 topology is:
         *
         *   node   0   1   2   3
         *     0:  10  20  30  40
         *     1:  20  10  20  30
         *     2:  30  20  10  20
         *     3:  40  30  20  10
         *
         *   0 --- 1 --- 2 --- 3
         *
         * NUMA-3       0-3             N/A             N/A             0-3
         *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
         *
         * NUMA-2       0-2             0-3             0-3             1-3
         *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
         *
         * NUMA-1       0-1             0-2             1-3             2-3
         *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
         *
         * NUMA-0       0               1               2               3
         *
         * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
         * group span isn't a subset of the domain span.
         */
        WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");

        sched_domain_debug(sd, cpu);

        rq_attach_root(rq, rd);
        tmp = rq->sd;
        rcu_assign_pointer(rq->sd, sd);
        dirty_sched_domain_sysctl(cpu);
        destroy_sched_domains(tmp);

        update_top_cache_domain(cpu);
}

struct s_data {
        struct sched_domain * __percpu *sd;
        struct root_domain      *rd;
};

enum s_alloc {
        sa_rootdomain,
        sa_sd,
        sa_sd_storage,
        sa_none,
};

/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask is all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
        return cpumask_first(group_balance_mask(sg));
}


/*
 * NUMA topology (first read the regular topology blurb below)
 *
 * Given a node-distance table, for example:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 * which represents a 4 node ring topology like:
 *
 *   0 ----- 1
 *   |       |
 *   |       |
 *   |       |
 *   3 ----- 2
 *
 * We want to construct domains and groups to represent this. The way we go
 * about doing this is to build the domains on 'hops'. For each NUMA level we
 * construct the mask of all nodes reachable in @level hops.
 *
 * For the above NUMA topology that gives 3 levels:
 *
 * NUMA-2       0-3             0-3             0-3             0-3
 *  groups:     {0-1,3},{1-3}   {0-2},{0,2-3}   {1-3},{0-1,3}   {0,2-3},{0-2}
 *
 * NUMA-1       0-1,3           0-2             1-3             0,2-3
 *  groups:     {0},{1},{3}     {0},{1},{2}     {1},{2},{3}     {0},{2},{3}
 *
 * NUMA-0       0               1               2               3
 *
 *
 * As can be seen; things don't nicely line up as with the regular topology.
 * When we iterate a domain in child domain chunks some nodes can be
 * represented multiple times -- hence the "overlap" naming for this part of
 * the topology.
 *
 * In order to minimize this overlap, we only build enough groups to cover the
 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
 *
 * Because:
 *
 *  - the first group of each domain is its child domain; this
 *    gets us the first 0-1,3
 *  - the only uncovered node is 2, whose child domain is 1-3.
 *
 * However, because of the overlap, computing a unique CPU for each group is
 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where not every CPU ends up
 * with the same number of sched_domain levels. Consider:
 *
 * NUMA-2       0-3                                             0-3
 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
 *
 * NUMA-1       0-2             0-3             0-3             1-3
 *
 * NUMA-0       0               1               2               3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
 */
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
        const struct cpumask *sg_span = sched_group_span(sg);
        struct sd_data *sdd = sd->private;
        struct sched_domain *sibling;
        int i;

        cpumask_clear(mask);

        for_each_cpu(i, sg_span) {
                sibling = *per_cpu_ptr(sdd->sd, i);

                /*
                 * Can happen in the asymmetric case, where these siblings are
                 * unused. The mask will not be empty because those CPUs that
                 * do have the top domain _should_ span the domain.
                 */
                if (!sibling->child)
                        continue;

                /* If we would not end up here, we can't continue from here */
                if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
                        continue;

                cpumask_set_cpu(i, mask);
        }

        /* We must not have empty masks here */
        WARN_ON_ONCE(cpumask_empty(mask));
}

/*
 * XXX: This creates per-node group entries; since the load-balancer will
 * immediately access remote memory to construct this group's load-balance
 * statistics, having the groups node-local is of dubious benefit.
 */
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
        struct sched_group *sg;
        struct cpumask *sg_span;

        sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                          GFP_KERNEL, cpu_to_node(cpu));

        if (!sg)
                return NULL;

        sg_span = sched_group_span(sg);
        if (sd->child)
                cpumask_copy(sg_span, sched_domain_span(sd->child));
        else
                cpumask_copy(sg_span, sched_domain_span(sd));

        atomic_inc(&sg->ref);
        return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
                                     struct sched_group *sg)
{
        struct cpumask *mask = sched_domains_tmpmask2;
        struct sd_data *sdd = sd->private;
        struct cpumask *sg_span;
        int cpu;

        build_balance_mask(sd, sg, mask);
        cpu = cpumask_first_and(sched_group_span(sg), mask);

        sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
        if (atomic_inc_return(&sg->sgc->ref) == 1)
                cpumask_copy(group_balance_mask(sg), mask);
        else
                WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));

        /*
         * Initialize sgc->capacity such that even if we mess up the
         * domains and no possible iteration will get us here, we won't
         * die on a /0 trap.
         */
        sg_span = sched_group_span(sg);
        sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
        sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
        sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
        struct sched_group *first = NULL, *last = NULL, *sg;
        const struct cpumask *span = sched_domain_span(sd);
        struct cpumask *covered = sched_domains_tmpmask;
        struct sd_data *sdd = sd->private;
        struct sched_domain *sibling;
        int i;

        cpumask_clear(covered);

        for_each_cpu_wrap(i, span, cpu) {
                struct cpumask *sg_span;

                if (cpumask_test_cpu(i, covered))
                        continue;

                sibling = *per_cpu_ptr(sdd->sd, i);

                /*
                 * Asymmetric node setups can result in situations where the
                 * domain tree is of unequal depth, make sure to skip domains
                 * that already cover the entire range.
                 *
                 * In that case build_sched_domains() will have terminated the
                 * iteration early and our sibling sd spans will be empty.
                 * Domains should always include the CPU they're built on, so
                 * check that.
                 */
                if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
                        continue;

                sg = build_group_from_child_sched_domain(sibling, cpu);
                if (!sg)
                        goto fail;

                sg_span = sched_group_span(sg);
                cpumask_or(covered, covered, sg_span);

                init_overlap_sched_group(sd, sg);

                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
                last->next = first;
        }
        sd->groups = first;

        return 0;

fail:
        free_sched_groups(first, 0);

        return -ENOMEM;
}


/*
 * Package topology (also see the load-balance blurb in fair.c)
 *
 * The scheduler builds a tree structure to represent a number of important
 * topology features. By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (DIE)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *      sched_domain -> sched_group -> sched_group_capacity
 *          ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-CPU and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_group's, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * DIE  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both of which require a CPU
 * that uniquely identifies each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
 *
 * These topologies are exclusive by construction: it is impossible for an
 * SMT thread to belong to multiple cores, and for cores to be part of
 * multiple caches. There is a very clear and unique location for each CPU
 * in the hierarchy.
 *
 * Therefore computing a unique CPU for each group is trivial (the iteration
 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
 * group), we can simply pick the first CPU in each group.
 *
 *
 * [*] in other words, the first group of each domain is its child domain.
 */

static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
        struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
        struct sched_domain *child = sd->child;
        struct sched_group *sg;
        bool already_visited;

        if (child)
                cpu = cpumask_first(sched_domain_span(child));

        sg = *per_cpu_ptr(sdd->sg, cpu);
        sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

        /* Increase refcounts for claim_allocations: */
        already_visited = atomic_inc_return(&sg->ref) > 1;
        /* sgc visits should follow a similar trend as sg */
        WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));

        /* If we have already visited that group, it's already initialized. */
        if (already_visited)
                return sg;

        if (child) {
                cpumask_copy(sched_group_span(sg), sched_domain_span(child));
                cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
        } else {
                cpumask_set_cpu(cpu, sched_group_span(sg));
                cpumask_set_cpu(cpu, group_balance_mask(sg));
        }

        sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
        sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
        sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

        return sg;
}

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, will set each group's ->cpumask correctly,
 * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
        struct sched_group *first = NULL, *last = NULL;
        struct sd_data *sdd = sd->private;
        const struct cpumask *span = sched_domain_span(sd);
        struct cpumask *covered;
        int i;

        lockdep_assert_held(&sched_domains_mutex);
        covered = sched_domains_tmpmask;

        cpumask_clear(covered);

        for_each_cpu_wrap(i, span, cpu) {
                struct sched_group *sg;

                if (cpumask_test_cpu(i, covered))
                        continue;

                sg = get_group(i, sdd);

                cpumask_or(covered, covered, sched_group_span(sg));

                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
        }
        last->next = first;
        sd->groups = first;

        return 0;
}

/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be the
 * same unless there are asymmetries in the topology. If there are
 * asymmetries, the group having more cpu_capacity will pick up more load
 * compared to the group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
        struct sched_group *sg = sd->groups;

        WARN_ON(!sg);

        do {
                int cpu, max_cpu = -1;

                sg->group_weight = cpumask_weight(sched_group_span(sg));

                if (!(sd->flags & SD_ASYM_PACKING))
                        goto next;

                for_each_cpu(cpu, sched_group_span(sg)) {
                        if (max_cpu < 0)
                                max_cpu = cpu;
                        else if (sched_asym_prefer(cpu, max_cpu))
                                max_cpu = cpu;
                }
                sg->asym_prefer_cpu = max_cpu;

next:
                sg = sg->next;
        } while (sg != sd->groups);

        if (cpu != group_balance_cpu(sg))
                return;

        update_group_capacity(sd, cpu);
}

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
        if (kstrtoint(str, 0, &default_relax_domain_level))
                pr_warn("Unable to set relax_domain_level\n");

        return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
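
/*
 * Example (illustrative): booting with "relax_domain_level=1" requests that
 * domains above level 1 (e.g. everything above SMT on a default topology)
 * have SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE cleared by
 * set_domain_attribute() below.
 */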

static void set_domain_attribute(struct sched_domain *sd,
                                 struct sched_domain_attr *attr)
{
        int request;

        if (!attr || attr->relax_domain_level < 0) {
                if (default_relax_domain_level < 0)
                        return;
                request = default_relax_domain_level;
        } else
                request = attr->relax_domain_level;

        if (sd->level > request) {
                /* Turn off idle balance on this domain: */
                sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
        }
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                                 const struct cpumask *cpu_map)
{
        switch (what) {
        case sa_rootdomain:
                if (!atomic_read(&d->rd->refcount))
                        free_rootdomain(&d->rd->rcu);
                fallthrough;
        case sa_sd:
                free_percpu(d->sd);
                fallthrough;
        case sa_sd_storage:
                __sdt_free(cpu_map);
                fallthrough;
        case sa_none:
                break;
        }
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
        memset(d, 0, sizeof(*d));

        if (__sdt_alloc(cpu_map))
                return sa_sd_storage;
        d->sd = alloc_percpu(struct sched_domain *);
        if (!d->sd)
                return sa_sd_storage;
        d->rd = alloc_rootdomain();
        if (!d->rd)
                return sa_sd;

        return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
        struct sd_data *sdd = sd->private;

        WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
        *per_cpu_ptr(sdd->sd, cpu) = NULL;

        if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
                *per_cpu_ptr(sdd->sds, cpu) = NULL;

        if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
                *per_cpu_ptr(sdd->sg, cpu) = NULL;

        if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
                *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

static int sched_domains_numa_levels;
static int sched_domains_curr_level;

int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif

/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *
 * Odd one out, which besides describing the topology also
 * prescribes the desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS               \
        (SD_SHARE_CPUCAPACITY   |       \
         SD_SHARE_PKG_RESOURCES |       \
         SD_NUMA                |       \
         SD_ASYM_PACKING)

static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
        const struct cpumask *cpu_map,
        struct sched_domain *child, int dflags, int cpu)
{
        struct sd_data *sdd = &tl->data;
        struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
        int sd_id, sd_weight, sd_flags = 0;

#ifdef CONFIG_NUMA
        /*
         * Ugly hack to pass state to sd_numa_mask()...
         */
        sched_domains_curr_level = tl->numa_level;
#endif

        sd_weight = cpumask_weight(tl->mask(cpu));

        if (tl->sd_flags)
                sd_flags = (*tl->sd_flags)();
        if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
                      "wrong sd_flags in topology description\n"))
                sd_flags &= TOPOLOGY_SD_FLAGS;

        /* Apply detected topology flags */
        sd_flags |= dflags;

        *sd = (struct sched_domain){
                .min_interval           = sd_weight,
                .max_interval           = 2*sd_weight,
                .busy_factor            = 16,
                .imbalance_pct          = 117,

                .cache_nice_tries       = 0,

                .flags                  = 1*SD_BALANCE_NEWIDLE
                                        | 1*SD_BALANCE_EXEC
                                        | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
                                        | 1*SD_WAKE_AFFINE
                                        | 0*SD_SHARE_CPUCAPACITY
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 0*SD_SERIALIZE
                                        | 1*SD_PREFER_SIBLING
                                        | 0*SD_NUMA
                                        | sd_flags
                                        ,

                .last_balance           = jiffies,
                .balance_interval       = sd_weight,
                .max_newidle_lb_cost    = 0,
                .next_decay_max_lb_cost = jiffies,
                .child                  = child,
#ifdef CONFIG_SCHED_DEBUG
                .name                   = tl->name,
#endif
        };

        cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
        sd_id = cpumask_first(sched_domain_span(sd));

        /*
         * Convert topological properties into behaviour.
         */

        /* Don't attempt to spread across CPUs of different capacities. */
        if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
                sd->child->flags &= ~SD_PREFER_SIBLING;

        if (sd->flags & SD_SHARE_CPUCAPACITY) {
                sd->imbalance_pct = 110;

        } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
                sd->imbalance_pct = 117;
                sd->cache_nice_tries = 1;

#ifdef CONFIG_NUMA
        } else if (sd->flags & SD_NUMA) {
                sd->cache_nice_tries = 2;

                sd->flags &= ~SD_PREFER_SIBLING;
                sd->flags |= SD_SERIALIZE;
                if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
                        sd->flags &= ~(SD_BALANCE_EXEC |
                                       SD_BALANCE_FORK |
                                       SD_WAKE_AFFINE);
                }

#endif
        } else {
                sd->cache_nice_tries = 1;
        }

        /*
         * For all levels sharing cache; connect a sched_domain_shared
         * instance.
         */
        if (sd->flags & SD_SHARE_PKG_RESOURCES) {
                sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
                atomic_inc(&sd->shared->ref);
                atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
        }

        sd->private = sdd;

        return sd;
}

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
        { NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
        default_topology;

#define for_each_sd_topology(tl)                        \
        for (tl = sched_domain_topology; tl->mask; tl++)

void set_sched_topology(struct sched_domain_topology_level *tl)
{
        if (WARN_ON_ONCE(sched_smp_initialized))
                return;

        sched_domain_topology = tl;
}
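
/*
 * Illustrative use of the hook above (a sketch modelled on arch code; the
 * table name below is an assumption, not a prescription): an architecture
 * can install its own bottom-up level list before SMP init, e.g.
 *
 *   static struct sched_domain_topology_level my_topology[] = {
 *   #ifdef CONFIG_SCHED_SMT
 *           { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *   #endif
 *           { cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *           { NULL, },
 *   };
 *
 *   set_sched_topology(my_topology);
 *
 * This must run before SMP init, as enforced by the WARN_ON_ONCE() above.
 */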
1503
1504#ifdef CONFIG_NUMA
1505
1506static const struct cpumask *sd_numa_mask(int cpu)
1507{
1508 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1509}
1510
static void sched_numa_warn(const char *str)
{
	static bool done = false;
	int i, j;

	if (done)
		return;

	done = true;

	printk(KERN_WARNING "ERROR: %s\n\n", str);

	for (i = 0; i < nr_node_ids; i++) {
		printk(KERN_WARNING "  ");
		for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
		printk(KERN_CONT "\n");
	}
	printk(KERN_WARNING "\n");
}

bool find_numa_distance(int distance)
{
	int i;

	if (distance == node_distance(0, 0))
		return true;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (sched_domains_numa_distance[i] == distance)
			return true;
	}

	return false;
}

/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication between not directly
 * connected nodes goes through intermediary nodes (where programs
 * could run), or through backplane controllers. This affects
 * placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 * - Otherwise, the system is a backplane topology.
 */
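/*
 * Illustrative node_distance() tables for the three types (hypothetical
 * numbers, not taken from any particular machine):
 *
 *	NUMA_DIRECT	NUMA_GLUELESS_MESH	NUMA_BACKPLANE
 *	10 20 20 20	10 20 20 30		10 40 80 80
 *	20 10 20 20	20 10 30 20		40 10 80 80
 *	20 20 10 20	20 30 10 20		80 80 10 40
 *	20 20 20 10	30 20 20 10		80 80 40 10
 *
 * In the mesh, nodes 0 and 3 are the furthest apart (30) but node 1 is
 * closer than that to both, so tasks can run on an intermediary node; in
 * the backplane table no such node exists for the pairs at distance 80.
 */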
static void init_numa_topology_type(void)
{
	int a, b, c, n;

	n = sched_max_numa_distance;

	if (sched_domains_numa_levels <= 2) {
		sched_numa_topology_type = NUMA_DIRECT;
		return;
	}

	for_each_online_node(a) {
		for_each_online_node(b) {
			/* Find two nodes furthest removed from each other. */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
			for_each_online_node(c) {
				if (node_distance(a, c) < n &&
				    node_distance(b, c) < n) {
					sched_numa_topology_type =
							NUMA_GLUELESS_MESH;
					return;
				}
			}

			sched_numa_topology_type = NUMA_BACKPLANE;
			return;
		}
	}
}

#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)

void sched_init_numa(void)
{
	struct sched_domain_topology_level *tl;
	unsigned long *distance_map;
	int nr_levels = 0;
	int i, j;

	/*
	 * O(nr_nodes^2) pass over the node_distance() table, collecting the
	 * unique distance values into a bitmap.
	 */
	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
	if (!distance_map)
		return;

	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
	for (i = 0; i < nr_node_ids; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			int distance = node_distance(i, j);

			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
				sched_numa_warn("Invalid distance value range");
				bitmap_free(distance_map);
				return;
			}

			bitmap_set(distance_map, distance, 1);
		}
	}
	/*
	 * We can now figure out how many unique distance values there are and
	 * allocate memory accordingly.
	 */
	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);

	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
	if (!sched_domains_numa_distance) {
		bitmap_free(distance_map);
		return;
	}

	for (i = 0, j = 0; i < nr_levels; i++, j++) {
		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
		sched_domains_numa_distance[i] = j;
	}
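
	/*
	 * Hypothetical example: if node_distance() only ever returns 10, 20
	 * and 30, bits 10, 20 and 30 are set in distance_map, nr_levels is 3
	 * and sched_domains_numa_distance[] ends up { 10, 20, 30 }.
	 */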

	bitmap_free(distance_map);

	/*
	 * 'nr_levels' contains the number of unique distances.
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
	 * If the allocation of the sched_domains_numa_masks[][] array below
	 * fails, it will contain fewer than 'nr_levels' members. That would
	 * be dangerous for code that iterates sched_domains_numa_masks[][]
	 * elsewhere.
	 *
	 * We reset it to 'nr_levels' at the end of this function.
	 */
	sched_domains_numa_levels = 0;

	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
	if (!sched_domains_numa_masks)
		return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes that are that many hops away from us.
	 */
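	/*
	 * Continuing the hypothetical { 10, 20, 30 } example: level 0 masks
	 * cover just the local node's CPUs, level 1 adds every node within
	 * distance 20, and the last level spans all online nodes.
	 */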
	for (i = 0; i < nr_levels; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;

		for (j = 0; j < nr_node_ids; j++) {
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
			int k;

			if (!mask)
				return;

			sched_domains_numa_masks[i][j] = mask;

			for_each_node(k) {
				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
					sched_numa_warn("Node-distance not symmetric");

				if (node_distance(j, k) > sched_domains_numa_distance[i])
					continue;

				cpumask_or(mask, mask, cpumask_of_node(k));
			}
		}
	}

	/* Compute default topology size */
	for (i = 0; sched_domain_topology[i].mask; i++);

	tl = kzalloc((i + nr_levels + 1) *
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
	if (!tl)
		return;

	/*
	 * Copy the default topology bits..
	 */
	for (i = 0; sched_domain_topology[i].mask; i++)
		tl[i] = sched_domain_topology[i];

	/*
	 * Add the NUMA identity distance, aka single NODE.
	 */
	tl[i++] = (struct sched_domain_topology_level){
		.mask = sd_numa_mask,
		.numa_level = 0,
		SD_INIT_NAME(NODE)
	};

	/*
	 * .. and append the remaining 'nr_levels - 1' levels of NUMA goodness.
	 */
	for (j = 1; j < nr_levels; i++, j++) {
		tl[i] = (struct sched_domain_topology_level){
			.mask = sd_numa_mask,
			.sd_flags = cpu_numa_flags,
			.flags = SDTL_OVERLAP,
			.numa_level = j,
			SD_INIT_NAME(NUMA)
		};
	}
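
	/*
	 * With the hypothetical { 10, 20, 30 } distances and the default
	 * table, 'tl' would now read: SMT, MC, DIE, NODE, NUMA (level 1),
	 * NUMA (level 2), followed by the empty terminator.
	 */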

	sched_domain_topology = tl;

	sched_domains_numa_levels = nr_levels;
	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];

	init_numa_topology_type();
}

void sched_domains_numa_masks_set(unsigned int cpu)
{
	int node = cpu_to_node(cpu);
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
	}
}

void sched_domains_numa_masks_clear(unsigned int cpu)
{
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++)
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
	}
}

/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 * closest to @cpu from @cpus.
 * @cpus: cpumask to find a cpu from
 * @cpu: cpu to be close to
 *
 * Returns: cpu, or nr_cpu_ids when nothing is found.
 */
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
	int i, j = cpu_to_node(cpu);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
		if (cpu < nr_cpu_ids)
			return cpu;
	}
	return nr_cpu_ids;
}
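
/*
 * Hypothetical usage sketch: a caller looking for a worker CPU near an
 * interrupt's current CPU might do
 *
 *	cpu = sched_numa_find_closest(wq_cpus, irq_cpu);
 *	if (cpu >= nr_cpu_ids)
 *		cpu = cpumask_any(wq_cpus);
 *
 * where 'wq_cpus' and 'irq_cpu' are made-up names for illustration.
 */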

#endif /* CONFIG_NUMA */

static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

#ifdef CONFIG_SCHED_DEBUG
			sgc->id = j;
#endif

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}

static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				if (sd && (sd->flags & SD_OVERLAP))
					free_sched_groups(sd->groups, 0);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}

			if (sdd->sds)
				kfree(*per_cpu_ptr(sdd->sds, j));
			if (sdd->sg)
				kfree(*per_cpu_ptr(sdd->sg, j));
			if (sdd->sgc)
				kfree(*per_cpu_ptr(sdd->sgc, j));
		}
		free_percpu(sdd->sd);
		sdd->sd = NULL;
		free_percpu(sdd->sds);
		sdd->sds = NULL;
		free_percpu(sdd->sg);
		sdd->sg = NULL;
		free_percpu(sdd->sgc);
		sdd->sgc = NULL;
	}
}

static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int dflags, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology broken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain is not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child CPUs. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}
	}
	set_domain_attribute(sd, attr);

	return sd;
}

/*
 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
 * any two given CPUs at this (non-NUMA) topology level.
 */
static bool topology_span_sane(struct sched_domain_topology_level *tl,
			       const struct cpumask *cpu_map, int cpu)
{
	int i;

	/* NUMA levels are allowed to overlap */
	if (tl->flags & SDTL_OVERLAP)
		return true;

	/*
	 * Non-NUMA levels cannot partially overlap - they must be either
	 * completely equal or completely disjoint. Otherwise we can end up
	 * breaking the sched_group lists - i.e. a later get_group() pass
	 * breaks the linking done for an earlier span.
	 */
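	/*
	 * E.g. (hypothetical masks): {0,1} vs {0,1} is fine (equal),
	 * {0,1} vs {2,3} is fine (disjoint), but {0,1} vs {1,2} partially
	 * overlaps and gets rejected below.
	 */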
	for_each_cpu(i, cpu_map) {
		if (i == cpu)
			continue;
		/*
		 * We should 'and' all those masks with 'cpu_map' to exactly
		 * match the topology we're about to build, but that can only
		 * remove CPUs, which only lessens our ability to detect
		 * overlaps.
		 */
		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
			return false;
	}

	return true;
}

/*
 * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs.
 */
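/*
 * Hypothetical big.LITTLE example: CPUs 0-3 are little and CPUs 4-7 are
 * big, with MC spanning each cluster and DIE spanning all eight CPUs.
 * Seen from a little CPU, a highest-capacity CPU only appears at the DIE
 * level, so DIE is returned and later tagged SD_ASYM_CPUCAPACITY.
 */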
static struct sched_domain_topology_level
*asym_cpu_capacity_level(const struct cpumask *cpu_map)
{
	int i, j, asym_level = 0;
	bool asym = false;
	struct sched_domain_topology_level *tl, *asym_tl = NULL;
	unsigned long cap;

	/* Is there any asymmetry? */
	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));

	for_each_cpu(i, cpu_map) {
		if (arch_scale_cpu_capacity(i) != cap) {
			asym = true;
			break;
		}
	}

	if (!asym)
		return NULL;

	/*
	 * Examine the topology from each CPU's point of view to detect the
	 * lowest sched_domain_topology_level where a highest capacity CPU is
	 * visible to everyone.
	 */
	for_each_cpu(i, cpu_map) {
		unsigned long max_capacity = arch_scale_cpu_capacity(i);
		int tl_id = 0;

		for_each_sd_topology(tl) {
			if (tl_id < asym_level)
				goto next_level;

			for_each_cpu_and(j, tl->mask(i), cpu_map) {
				unsigned long capacity;

				capacity = arch_scale_cpu_capacity(j);

				if (capacity <= max_capacity)
					continue;

				max_capacity = capacity;
				asym_level = tl_id;
				asym_tl = tl;
			}
next_level:
			tl_id++;
		}
	}

	return asym_tl;
}

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state = sa_none;
	struct sched_domain *sd;
	struct s_data d;
	struct rq *rq = NULL;
	int i, ret = -ENOMEM;
	struct sched_domain_topology_level *tl_asym;
	bool has_asym = false;

	if (WARN_ON(cpumask_empty(cpu_map)))
		goto error;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	tl_asym = asym_cpu_capacity_level(cpu_map);

	/* Set up domains for CPUs specified by the cpu_map: */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;
		int dflags = 0;

		sd = NULL;
		for_each_sd_topology(tl) {
			if (tl == tl_asym) {
				dflags |= SD_ASYM_CPUCAPACITY;
				has_asym = true;
			}

			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				goto error;

			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);

			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}
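
	/*
	 * Each CPU now has a bottom-up chain of domains; on a hypothetical
	 * two-socket SMT machine that would be SMT -> MC -> DIE -> NUMA,
	 * with *per_cpu_ptr(d.sd, i) pointing at the lowest level.
	 */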

	/* Build the groups for the domains */
	for_each_cpu(i, cpu_map) {
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			sd->span_weight = cpumask_weight(sched_domain_span(sd));
			if (sd->flags & SD_OVERLAP) {
				if (build_overlap_sched_groups(sd, i))
					goto error;
			} else {
				if (build_sched_groups(sd, i))
					goto error;
			}
		}
	}

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits - 1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
			init_sched_groups_capacity(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);

		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	if (has_asym)
		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);

	if (rq && sched_debug_enabled) {
		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);

	return ret;
}

/* Current sched domains: */
static cpumask_var_t			*doms_cur;

/* Number of sched domains in 'doms_cur': */
static int				ndoms_cur;

/* Attributes of custom domains in 'doms_cur': */
static struct sched_domain_attr		*dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fall back to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t			fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;

	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

/*
 * Set up scheduler domains and groups.  For now this just excludes isolated
 * CPUs, but could be used to exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
	err = build_sched_domains(doms_cur[0], NULL);
	register_sched_domain_sysctl();

	return err;
}

/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	unsigned int cpu = cpumask_any(cpu_map);
	int i;

	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* Handle NULL as "default". */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* Fast path: */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;

	return !memcmp(cur ? (cur + idx_cur) : &tmp,
		       new ? (new + idx_new) : &tmp,
		       sizeof(struct sched_domain_attr));
}

/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap). We set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; this also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with the hotplug lock and sched_domains_mutex held.
 */
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	bool __maybe_unused has_eas = false;
	int i, j, n;
	int new_topology;

	lockdep_assert_held(&sched_domains_mutex);

	/* Always unregister in case we don't destroy any domains: */
	unregister_sched_domain_sysctl();

	/* Let the architecture update CPU core mappings: */
	new_topology = arch_update_cpu_topology();

	if (!doms_new) {
		WARN_ON_ONCE(dattr_new);
		n = 0;
		doms_new = alloc_sched_domains(1);
		if (doms_new) {
			n = 1;
			cpumask_and(doms_new[0], cpu_active_mask,
				    housekeeping_cpumask(HK_FLAG_DOMAIN));
		}
	} else {
		n = ndoms_new;
	}

	/* Destroy deleted domains: */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
				struct root_domain *rd;

				/*
				 * This domain won't be destroyed and as such
				 * its dl_bw->total_bw needs to be cleared. It
				 * will be recomputed in function
				 * update_tasks_root_domain().
				 */
				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				dl_clear_root_domain(rd);
				goto match1;
			}
		}
		/* No match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
match1:
		;
	}

	n = ndoms_cur;
	if (!doms_new) {
		n = 0;
		doms_new = &fallback_doms;
		cpumask_and(doms_new[0], cpu_active_mask,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));
	}

	/* Build new domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
		}
		/* No match - add a new doms_new */
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
		;
	}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
	/* Build perf. domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !sched_energy_update; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				has_eas = true;
				goto match3;
			}
		}
		/* No match - add perf. domains for a new rd */
		has_eas |= build_perf_domains(doms_new[i]);
match3:
		;
	}
	sched_energy_set(has_eas);
#endif

	/* Remember the new sched domains: */
	if (doms_cur != &fallback_doms)
		free_sched_domains(doms_cur, ndoms_cur);

	kfree(dattr_cur);
	doms_cur = doms_new;
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;

	register_sched_domain_sysctl();
}

/*
 * Call with the hotplug lock held.
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	mutex_unlock(&sched_domains_mutex);
}
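
/*
 * Hypothetical caller sketch (in the spirit of the cpuset code): build two
 * disjoint partitions and hand them over; ownership of the array passes to
 * partition_sched_domains(). 'part0_cpus' and 'part1_cpus' are made-up
 * names for illustration.
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		cpumask_copy(doms[0], part0_cpus);
 *		cpumask_copy(doms[1], part1_cpus);
 *		partition_sched_domains(2, doms, NULL);
 *	}
 */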