Blame - kernel/sched/topology.c - SHIFTPHONES/mainline/linux

blob: a3a2417fec5473386af0a2b1ea415740f0689e9e [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2	/*
				3	* Scheduler topology setup/handling methods
				4	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	5	#include "sched.h"
				6
/*
 * Mutex guarding the scratch cpumasks declared below.
 * NOTE(review): presumably it also serializes sched-domain rebuilds in
 * general; that usage is not visible in this part of the file.
 */
				7	DEFINE_MUTEX(sched_domains_mutex);
				8
				9	/* Protected by sched_domains_mutex: */
zhong jiang	ace8031	2018-08-03 20:37:32 +0800	[diff] [blame]	10	static cpumask_var_t sched_domains_tmpmask;
				11	static cpumask_var_t sched_domains_tmpmask2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	12
				13	#ifdef CONFIG_SCHED_DEBUG
				14
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	15	static int __init sched_debug_setup(char *str)
				16	{
Peter Zijlstra	9469eb0	2017-09-07 17:03:53 +0200	[diff] [blame]	17	sched_debug_enabled = true;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	18
				19	return 0;
				20	}
				21	early_param("sched_debug", sched_debug_setup);
				22
				23	static inline bool sched_debug(void)
				24	{
				25	return sched_debug_enabled;
				26	}
				27
/*
 * Debug table of SD flags, built by re-including <linux/sched/sd_flags.h>
 * with SD_FLAG() temporarily defined as a designated initializer: the entry
 * at index __<flag> records the flag's metaflags and its name as a string.
 */
Valentin Schneider	848785d	2020-09-08 19:49:56 +0100	[diff] [blame]	28	#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
				29	const struct sd_flag_debug sd_flag_debug[] = {
				30	#include <linux/sched/sd_flags.h>
				31	};
				32	#undef SD_FLAG
				33
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	34	static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				35	struct cpumask *groupmask)
				36	{
				37	struct sched_group *group = sd->groups;
Valentin Schneider	65c5e25	2020-08-17 12:29:51 +0100	[diff] [blame]	38	unsigned long flags = sd->flags;
				39	unsigned int idx;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	40
				41	cpumask_clear(groupmask);
				42
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	43	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	44	printk(KERN_CONT "span=%*pbl level=%s\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	45	cpumask_pr_args(sched_domain_span(sd)), sd->name);
				46
				47	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	48	printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	49	}
Yi Wang	6cd0c58	2018-07-23 12:19:07 +0800	[diff] [blame]	50	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	51	printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	52	}
				53
Valentin Schneider	65c5e25	2020-08-17 12:29:51 +0100	[diff] [blame]	54	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
				55	unsigned int flag = BIT(idx);
				56	unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
				57
				58	if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
				59	!(sd->child->flags & flag))
				60	printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
				61	sd_flag_debug[idx].name);
				62
				63	if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
				64	!(sd->parent->flags & flag))
				65	printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
				66	sd_flag_debug[idx].name);
				67	}
				68
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	69	printk(KERN_DEBUG "%*s groups:", level + 1, "");
				70	do {
				71	if (!group) {
				72	printk("\n");
				73	printk(KERN_ERR "ERROR: group is NULL\n");
				74	break;
				75	}
				76
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	77	if (!cpumask_weight(sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	78	printk(KERN_CONT "\n");
				79	printk(KERN_ERR "ERROR: empty group\n");
				80	break;
				81	}
				82
				83	if (!(sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	84	cpumask_intersects(groupmask, sched_group_span(group))) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	85	printk(KERN_CONT "\n");
				86	printk(KERN_ERR "ERROR: repeated CPUs\n");
				87	break;
				88	}
				89
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	90	cpumask_or(groupmask, groupmask, sched_group_span(group));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	91
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	92	printk(KERN_CONT " %d:{ span=%*pbl",
				93	group->sgc->id,
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	94	cpumask_pr_args(sched_group_span(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	95
Peter Zijlstra	af21812	2017-05-01 08:51:05 +0200	[diff] [blame]	96	if ((sd->flags & SD_OVERLAP) &&
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	97	!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	98	printk(KERN_CONT " mask=%*pbl",
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	99	cpumask_pr_args(group_balance_mask(group)));
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	100	}
				101
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	102	if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
				103	printk(KERN_CONT " cap=%lu", group->sgc->capacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	104
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	105	if (group == sd->groups && sd->child &&
				106	!cpumask_equal(sched_domain_span(sd->child),
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	107	sched_group_span(group))) {
Peter Zijlstra	a420b06	2017-04-14 18:20:48 +0200	[diff] [blame]	108	printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
				109	}
				110
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	111	printk(KERN_CONT " }");
				112
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	113	group = group->next;
Peter Zijlstra	b0151c2	2017-04-14 17:29:16 +0200	[diff] [blame]	114
				115	if (group != sd->groups)
				116	printk(KERN_CONT ",");
				117
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	118	} while (group != sd->groups);
				119	printk(KERN_CONT "\n");
				120
				121	if (!cpumask_equal(sched_domain_span(sd), groupmask))
				122	printk(KERN_ERR "ERROR: groups don't span domain->span\n");
				123
				124	if (sd->parent &&
				125	!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	126	printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	127	return 0;
				128	}
				129
				130	static void sched_domain_debug(struct sched_domain *sd, int cpu)
				131	{
				132	int level = 0;
				133
				134	if (!sched_debug_enabled)
				135	return;
				136
				137	if (!sd) {
				138	printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
				139	return;
				140	}
				141
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	142	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	143
				144	for (;;) {
				145	if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
				146	break;
				147	level++;
				148	sd = sd->parent;
				149	if (!sd)
				150	break;
				151	}
				152	}
				153	#else /* !CONFIG_SCHED_DEBUG */
				154
/*
 * Stubs used when CONFIG_SCHED_DEBUG is off: the enable flag is the
 * constant 0, sched_domain_debug() expands to an empty statement and
 * sched_debug() always reports false.
 */
				155	# define sched_debug_enabled 0
				156	# define sched_domain_debug(sd, cpu) do { } while (0)
				157	static inline bool sched_debug(void)
				158	{
				159	return false;
				160	}
				161	#endif /* CONFIG_SCHED_DEBUG */
				162
Valentin Schneider	4fc472f	2020-08-25 14:32:16 +0100	[diff] [blame]	163	/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
				164	#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) \|
				165	static const unsigned int SD_DEGENERATE_GROUPS_MASK =
				166	#include <linux/sched/sd_flags.h>
				167	0;
				168	#undef SD_FLAG
				169
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	170	static int sd_degenerate(struct sched_domain *sd)
				171	{
				172	if (cpumask_weight(sched_domain_span(sd)) == 1)
				173	return 1;
				174
				175	/* Following flags need at least 2 groups */
Valentin Schneider	6f34981	2020-08-17 12:29:54 +0100	[diff] [blame]	176	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
				177	(sd->groups != sd->groups->next))
				178	return 0;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	179
				180	/* Following flags don't use groups */
				181	if (sd->flags & (SD_WAKE_AFFINE))
				182	return 0;
				183
				184	return 1;
				185	}
				186
				187	static int
				188	sd_parent_degenerate(struct sched_domain sd, struct sched_domain parent)
				189	{
				190	unsigned long cflags = sd->flags, pflags = parent->flags;
				191
				192	if (sd_degenerate(parent))
				193	return 1;
				194
				195	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
				196	return 0;
				197
				198	/* Flags needing groups don't count if only 1 group in parent */
Valentin Schneider	ab65afb	2020-08-17 12:29:55 +0100	[diff] [blame]	199	if (parent->groups == parent->groups->next)
Valentin Schneider	3a6712c	2020-08-17 12:29:57 +0100	[diff] [blame]	200	pflags &= ~SD_DEGENERATE_GROUPS_MASK;
Valentin Schneider	ab65afb	2020-08-17 12:29:55 +0100	[diff] [blame]	201
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	202	if (~cflags & pflags)
				203	return 0;
				204
				205	return 1;
				206	}
				207
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	208	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
/*
 * Energy Aware Scheduling state:
 *  - sched_energy_present: static key flipped by sched_energy_set();
 *  - sysctl_sched_energy_aware: sysctl knob, on (1) by default; when it is 0
 *    build_perf_domains() builds no performance domains;
 *  - sched_energy_mutex: held by the sysctl handler around the rebuild;
 *  - sched_energy_update: set by the sysctl handler for the duration of
 *    rebuild_sched_domains().
 */
Peter Zijlstra	f8a696f	2018-12-05 11:23:56 +0100	[diff] [blame]	209	DEFINE_STATIC_KEY_FALSE(sched_energy_present);
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	210	unsigned int sysctl_sched_energy_aware = 1;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	211	DEFINE_MUTEX(sched_energy_mutex);
				212	bool sched_energy_update;
				213
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	214	#ifdef CONFIG_PROC_SYSCTL
				215	int sched_energy_aware_handler(struct ctl_table *table, int write,
Christoph Hellwig	3292739	2020-04-24 08:43:38 +0200	[diff] [blame]	216	void buffer, size_t lenp, loff_t *ppos)
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	217	{
				218	int ret, state;
				219
				220	if (write && !capable(CAP_SYS_ADMIN))
				221	return -EPERM;
				222
				223	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
				224	if (!ret && write) {
				225	state = static_branch_unlikely(&sched_energy_present);
				226	if (state != sysctl_sched_energy_aware) {
				227	mutex_lock(&sched_energy_mutex);
				228	sched_energy_update = 1;
				229	rebuild_sched_domains();
				230	sched_energy_update = 0;
				231	mutex_unlock(&sched_energy_mutex);
				232	}
				233	}
				234
				235	return ret;
				236	}
				237	#endif
				238
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	239	static void free_pd(struct perf_domain *pd)
				240	{
				241	struct perf_domain *tmp;
				242
				243	while (pd) {
				244	tmp = pd->next;
				245	kfree(pd);
				246	pd = tmp;
				247	}
				248	}
				249
				250	static struct perf_domain find_pd(struct perf_domain pd, int cpu)
				251	{
				252	while (pd) {
				253	if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
				254	return pd;
				255	pd = pd->next;
				256	}
				257
				258	return NULL;
				259	}
				260
				261	static struct perf_domain *pd_init(int cpu)
				262	{
				263	struct em_perf_domain *obj = em_cpu_get(cpu);
				264	struct perf_domain *pd;
				265
				266	if (!obj) {
				267	if (sched_debug())
				268	pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
				269	return NULL;
				270	}
				271
				272	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
				273	if (!pd)
				274	return NULL;
				275	pd->em_pd = obj;
				276
				277	return pd;
				278	}
				279
				280	static void perf_domain_debug(const struct cpumask *cpu_map,
				281	struct perf_domain *pd)
				282	{
				283	if (!sched_debug() \|\| !pd)
				284	return;
				285
				286	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
				287
				288	while (pd) {
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	289	printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	290	cpumask_first(perf_domain_span(pd)),
				291	cpumask_pr_args(perf_domain_span(pd)),
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	292	em_pd_nr_perf_states(pd->em_pd));
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	293	pd = pd->next;
				294	}
				295
				296	printk(KERN_CONT "\n");
				297	}
				298
				299	static void destroy_perf_domain_rcu(struct rcu_head *rp)
				300	{
				301	struct perf_domain *pd;
				302
				303	pd = container_of(rp, struct perf_domain, rcu);
				304	free_pd(pd);
				305	}
				306
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	307	static void sched_energy_set(bool has_eas)
				308	{
				309	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
				310	if (sched_debug())
				311	pr_info("%s: stopping EAS\n", __func__);
				312	static_branch_disable_cpuslocked(&sched_energy_present);
				313	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
				314	if (sched_debug())
				315	pr_info("%s: starting EAS\n", __func__);
				316	static_branch_enable_cpuslocked(&sched_energy_present);
				317	}
				318	}
				319
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	320	/*
				321	* EAS can be used on a root domain if it meets all the following conditions:
				322	* 1. an Energy Model (EM) is available;
				323	* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
Valentin Schneider	38502ab	2020-02-27 19:14:32 +0000	[diff] [blame]	324	* 3. no SMT is detected;
				325	* 4. the EM complexity is low enough to keep scheduling overheads low;
				326	* 5. schedutil is driving the frequency of all CPUs of the rd.
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	327	*
				328	* The complexity of the Energy Model is defined as:
				329	*
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	330	* C = nr_pd * (nr_cpus + nr_ps)
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	331	*
				332	* with parameters defined as:
				333	* - nr_pd: the number of performance domains
				334	* - nr_cpus: the number of CPUs
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	335	* - nr_ps: the sum of the number of performance states of all performance
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	336	* domains (for example, on a system with 2 performance domains,
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	337	* with 10 performance states each, nr_ps = 2 * 10 = 20).
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	338	*
				339	* It is generally not a good idea to use such a model in the wake-up path on
				340	* very complex platforms because of the associated scheduling overheads. The
				341	* arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	342	* with per-CPU DVFS and less than 8 performance states each, for example.
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	343	*/
				344	#define EM_MAX_COMPLEXITY 2048
				345
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	346	extern struct cpufreq_governor schedutil_gov;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	347	static bool build_perf_domains(const struct cpumask *cpu_map)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	348	{
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	349	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	350	struct perf_domain pd = NULL, tmp;
				351	int cpu = cpumask_first(cpu_map);
				352	struct root_domain *rd = cpu_rq(cpu)->rd;
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	353	struct cpufreq_policy *policy;
				354	struct cpufreq_governor *gov;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	355
Quentin Perret	8d5d0cf	2018-12-03 09:56:23 +0000	[diff] [blame]	356	if (!sysctl_sched_energy_aware)
				357	goto free;
				358
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	359	/* EAS is enabled for asymmetric CPU capacity topologies. */
				360	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
				361	if (sched_debug()) {
				362	pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				363	cpumask_pr_args(cpu_map));
				364	}
				365	goto free;
				366	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	367
Valentin Schneider	38502ab	2020-02-27 19:14:32 +0000	[diff] [blame]	368	/* EAS definitely does not handle SMT */
				369	if (sched_smt_active()) {
				370	pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
				371	cpumask_pr_args(cpu_map));
				372	goto free;
				373	}
				374
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	375	for_each_cpu(i, cpu_map) {
				376	/* Skip already covered CPUs. */
				377	if (find_pd(pd, i))
				378	continue;
				379
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	380	/* Do not attempt EAS if schedutil is not being used. */
				381	policy = cpufreq_cpu_get(i);
				382	if (!policy)
				383	goto free;
				384	gov = policy->governor;
				385	cpufreq_cpu_put(policy);
				386	if (gov != &schedutil_gov) {
				387	if (rd->pd)
				388	pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
				389	cpumask_pr_args(cpu_map));
				390	goto free;
				391	}
				392
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	393	/* Create the new pd and add it to the local list. */
				394	tmp = pd_init(i);
				395	if (!tmp)
				396	goto free;
				397	tmp->next = pd;
				398	pd = tmp;
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	399
				400	/*
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	401	* Count performance domains and performance states for the
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	402	* complexity check.
				403	*/
				404	nr_pd++;
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	405	nr_ps += em_pd_nr_perf_states(pd->em_pd);
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	406	}
				407
				408	/* Bail out if the Energy Model complexity is too high. */
Lukasz Luba	521b512	2020-05-27 10:58:47 +0100	[diff] [blame]	409	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
Quentin Perret	b68a4c0	2018-12-03 09:56:20 +0000	[diff] [blame]	410	WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
				411	cpumask_pr_args(cpu_map));
				412	goto free;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	413	}
				414
				415	perf_domain_debug(cpu_map, pd);
				416
				417	/* Attach the new list of performance domains to the root domain. */
				418	tmp = rd->pd;
				419	rcu_assign_pointer(rd->pd, pd);
				420	if (tmp)
				421	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
				422
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	423	return !!pd;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	424
				425	free:
				426	free_pd(pd);
				427	tmp = rd->pd;
				428	rcu_assign_pointer(rd->pd, NULL);
				429	if (tmp)
				430	call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	431
				432	return false;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	433	}
				434	#else
				435	static void free_pd(struct perf_domain *pd) { }
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	436	#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	437
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	438	static void free_rootdomain(struct rcu_head *rcu)
				439	{
				440	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
				441
				442	cpupri_cleanup(&rd->cpupri);
				443	cpudl_cleanup(&rd->cpudl);
				444	free_cpumask_var(rd->dlo_mask);
				445	free_cpumask_var(rd->rto_mask);
				446	free_cpumask_var(rd->online);
				447	free_cpumask_var(rd->span);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	448	free_pd(rd->pd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	449	kfree(rd);
				450	}
				451
				452	void rq_attach_root(struct rq rq, struct root_domain rd)
				453	{
				454	struct root_domain *old_rd = NULL;
				455	unsigned long flags;
				456
				457	raw_spin_lock_irqsave(&rq->lock, flags);
				458
				459	if (rq->rd) {
				460	old_rd = rq->rd;
				461
				462	if (cpumask_test_cpu(rq->cpu, old_rd->online))
				463	set_rq_offline(rq);
				464
				465	cpumask_clear_cpu(rq->cpu, old_rd->span);
				466
				467	/*
				468	* If we dont want to free the old_rd yet then
				469	* set old_rd to NULL to skip the freeing later
				470	* in this function:
				471	*/
				472	if (!atomic_dec_and_test(&old_rd->refcount))
				473	old_rd = NULL;
				474	}
				475
				476	atomic_inc(&rd->refcount);
				477	rq->rd = rd;
				478
				479	cpumask_set_cpu(rq->cpu, rd->span);
				480	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
				481	set_rq_online(rq);
				482
				483	raw_spin_unlock_irqrestore(&rq->lock, flags);
				484
				485	if (old_rd)
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	486	call_rcu(&old_rd->rcu, free_rootdomain);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	487	}
				488
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	489	void sched_get_rd(struct root_domain *rd)
				490	{
				491	atomic_inc(&rd->refcount);
				492	}
				493
				494	void sched_put_rd(struct root_domain *rd)
				495	{
				496	if (!atomic_dec_and_test(&rd->refcount))
				497	return;
				498
Paul E. McKenney	337e9b0	2018-11-06 19:10:53 -0800	[diff] [blame]	499	call_rcu(&rd->rcu, free_rootdomain);
Steven Rostedt (VMware)	364f566	2018-01-23 20:45:38 -0500	[diff] [blame]	500	}
				501
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	502	static int init_rootdomain(struct root_domain *rd)
				503	{
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	504	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
				505	goto out;
				506	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
				507	goto free_span;
				508	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
				509	goto free_online;
				510	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
				511	goto free_dlo_mask;
				512
Steven Rostedt (Red Hat)	4bdced5	2017-10-06 14:05:04 -0400	[diff] [blame]	513	#ifdef HAVE_RT_PUSH_IPI
				514	rd->rto_cpu = -1;
				515	raw_spin_lock_init(&rd->rto_lock);
				516	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
				517	#endif
				518
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	519	init_dl_bw(&rd->dl_bw);
				520	if (cpudl_init(&rd->cpudl) != 0)
				521	goto free_rto_mask;
				522
				523	if (cpupri_init(&rd->cpupri) != 0)
				524	goto free_cpudl;
				525	return 0;
				526
				527	free_cpudl:
				528	cpudl_cleanup(&rd->cpudl);
				529	free_rto_mask:
				530	free_cpumask_var(rd->rto_mask);
				531	free_dlo_mask:
				532	free_cpumask_var(rd->dlo_mask);
				533	free_online:
				534	free_cpumask_var(rd->online);
				535	free_span:
				536	free_cpumask_var(rd->span);
				537	out:
				538	return -ENOMEM;
				539	}
				540
				541	/*
				542	* By default the system creates a single root-domain with all CPUs as
				543	* members (mimicking the global state we have today).
				544	*/
				545	struct root_domain def_root_domain;
				546
				547	void init_defrootdomain(void)
				548	{
				549	init_rootdomain(&def_root_domain);
				550
				551	atomic_set(&def_root_domain.refcount, 1);
				552	}
				553
				554	static struct root_domain *alloc_rootdomain(void)
				555	{
				556	struct root_domain *rd;
				557
Viresh Kumar	4d13a06	2017-04-13 14:45:48 +0530	[diff] [blame]	558	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	559	if (!rd)
				560	return NULL;
				561
				562	if (init_rootdomain(rd) != 0) {
				563	kfree(rd);
				564	return NULL;
				565	}
				566
				567	return rd;
				568	}
				569
				570	static void free_sched_groups(struct sched_group *sg, int free_sgc)
				571	{
				572	struct sched_group tmp, first;
				573
				574	if (!sg)
				575	return;
				576
				577	first = sg;
				578	do {
				579	tmp = sg->next;
				580
				581	if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
				582	kfree(sg->sgc);
				583
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	584	if (atomic_dec_and_test(&sg->ref))
				585	kfree(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	586	sg = tmp;
				587	} while (sg != first);
				588	}
				589
				590	static void destroy_sched_domain(struct sched_domain *sd)
				591	{
				592	/*
Peter Zijlstra	a090c4f	2017-08-21 15:42:52 +0200	[diff] [blame]	593	* A normal sched domain may have multiple group references, an
				594	* overlapping domain, having private groups, only one. Iterate,
				595	* dropping group/capacity references, freeing where none remain.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	596	*/
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	597	free_sched_groups(sd->groups, 1);
				598
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	599	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
				600	kfree(sd->shared);
				601	kfree(sd);
				602	}
				603
				604	static void destroy_sched_domains_rcu(struct rcu_head *rcu)
				605	{
				606	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
				607
				608	while (sd) {
				609	struct sched_domain *parent = sd->parent;
				610	destroy_sched_domain(sd);
				611	sd = parent;
				612	}
				613	}
				614
				615	static void destroy_sched_domains(struct sched_domain *sd)
				616	{
				617	if (sd)
				618	call_rcu(&sd->rcu, destroy_sched_domains_rcu);
				619	}
				620
				621	/*
				622	* Keep a special pointer to the highest sched_domain that has
				623	* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
				624	* allows us to avoid some pointer chasing in select_idle_sibling().
				625	*
				626	* Also keep a unique ID per domain (we use the first CPU number in
				627	* the cpumask of the domain), this allows us to quickly tell if
				628	* two CPUs are in the same cache domain, see cpus_share_cache().
				629	*/
/*
 * Per-CPU shortcuts into the sched_domain hierarchy; all but the static key
 * are (re)computed by update_top_cache_domain():
 *  - sd_llc / sd_llc_size / sd_llc_id / sd_llc_shared: highest domain with
 *    SD_SHARE_PKG_RESOURCES, its CPU count, its first CPU and its ->shared;
 *  - sd_numa: lowest domain with SD_NUMA;
 *  - sd_asym_packing: highest domain with SD_ASYM_PACKING;
 *  - sd_asym_cpucapacity: lowest domain with SD_ASYM_CPUCAPACITY.
 */
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	630	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	631	DEFINE_PER_CPU(int, sd_llc_size);
				632	DEFINE_PER_CPU(int, sd_llc_id);
Joel Fernandes (Google)	994aeb7	2019-03-20 20:34:24 -0400	[diff] [blame]	633	DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
				634	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
				635	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
				636	DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	637	DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	638
				639	static void update_top_cache_domain(int cpu)
				640	{
				641	struct sched_domain_shared *sds = NULL;
				642	struct sched_domain *sd;
				643	int id = cpu;
				644	int size = 1;
				645
				646	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
				647	if (sd) {
				648	id = cpumask_first(sched_domain_span(sd));
				649	size = cpumask_weight(sched_domain_span(sd));
				650	sds = sd->shared;
				651	}
				652
				653	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
				654	per_cpu(sd_llc_size, cpu) = size;
				655	per_cpu(sd_llc_id, cpu) = id;
				656	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
				657
				658	sd = lowest_flag_domain(cpu, SD_NUMA);
				659	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
				660
				661	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
Quentin Perret	011b27b	2018-12-03 09:56:19 +0000	[diff] [blame]	662	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
				663
				664	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
				665	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	666	}
				667
				668	/*
				669	* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
				670	* hold the hotplug lock.
				671	*/
				672	static void
				673	cpu_attach_domain(struct sched_domain sd, struct root_domain rd, int cpu)
				674	{
				675	struct rq *rq = cpu_rq(cpu);
				676	struct sched_domain *tmp;
				677
				678	/* Remove the sched domains which do not contribute to scheduling. */
				679	for (tmp = sd; tmp; ) {
				680	struct sched_domain *parent = tmp->parent;
				681	if (!parent)
				682	break;
				683
				684	if (sd_parent_degenerate(tmp, parent)) {
				685	tmp->parent = parent->parent;
				686	if (parent->parent)
				687	parent->parent->child = tmp;
				688	/*
				689	* Transfer SD_PREFER_SIBLING down in case of a
				690	* degenerate parent; the spans match for this
				691	* so the property transfers.
				692	*/
				693	if (parent->flags & SD_PREFER_SIBLING)
				694	tmp->flags \|= SD_PREFER_SIBLING;
				695	destroy_sched_domain(parent);
				696	} else
				697	tmp = tmp->parent;
				698	}
				699
				700	if (sd && sd_degenerate(sd)) {
				701	tmp = sd;
				702	sd = sd->parent;
				703	destroy_sched_domain(tmp);
				704	if (sd)
				705	sd->child = NULL;
				706	}
				707
				708	sched_domain_debug(sd, cpu);
				709
				710	rq_attach_root(rq, rd);
				711	tmp = rq->sd;
				712	rcu_assign_pointer(rq->sd, sd);
Peter Zijlstra	bbdacdf	2017-08-10 17:10:26 +0200	[diff] [blame]	713	dirty_sched_domain_sysctl(cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	714	destroy_sched_domains(tmp);
				715
				716	update_top_cache_domain(cpu);
				717	}
				718
/*
 * Bundle of a per-CPU array of sched_domain pointers and a root domain
 * pointer.
 * NOTE(review): presumably the working state of a domain build; its users
 * are not visible in this part of the file.
 */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	719	struct s_data {
Luc Van Oostenryck	99687cd	2019-01-18 15:49:36 +0100	[diff] [blame]	720	struct sched_domain * __percpu *sd;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	721	struct root_domain *rd;
				722	};
				723
/*
 * NOTE(review): the names suggest these identify how far the allocation of
 * a struct s_data got (root domain, sd array, sd storage, nothing); the
 * code using them is not visible in this part of the file - confirm.
 */
				724	enum s_alloc {
				725	sa_rootdomain,
				726	sa_sd,
				727	sa_sd_storage,
				728	sa_none,
				729	};
				730
				731	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	732	* Return the canonical balance CPU for this group, this is the first CPU
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	733	* of this group that's also in the balance mask.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	734	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	735	* The balance mask are all those CPUs that could actually end up at this
				736	* group. See build_balance_mask().
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	737	*
				738	* Also see should_we_balance().
				739	*/
				740	int group_balance_cpu(struct sched_group *sg)
				741	{
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	742	return cpumask_first(group_balance_mask(sg));
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	743	}
				744
				745
				746	/*
				747	* NUMA topology (first read the regular topology blurb below)
				748	*
				749	* Given a node-distance table, for example:
				750	*
				751	* node 0 1 2 3
				752	* 0: 10 20 30 20
				753	* 1: 20 10 20 30
				754	* 2: 30 20 10 20
				755	* 3: 20 30 20 10
				756	*
				757	* which represents a 4 node ring topology like:
				758	*
				759	*   0 ----- 1
				760	*   |       |
				761	*   |       |
				762	*   |       |
				763	*   3 ----- 2
				764	*
				765	* We want to construct domains and groups to represent this. The way we go
				766	* about doing this is to build the domains on 'hops'. For each NUMA level we
				767	* construct the mask of all nodes reachable in @level hops.
				768	*
				769	* For the above NUMA topology that gives 3 levels:
				770	*
				771	* NUMA-2 0-3 0-3 0-3 0-3
				772	* groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
				773	*
				774	* NUMA-1 0-1,3 0-2 1-3 0,2-3
				775	* groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
				776	*
				777	* NUMA-0 0 1 2 3
				778	*
				779	*
				780	* As can be seen; things don't nicely line up as with the regular topology.
				781	* When we iterate a domain in child domain chunks some nodes can be
				782	* represented multiple times -- hence the "overlap" naming for this part of
				783	* the topology.
				784	*
				785	* In order to minimize this overlap, we only build enough groups to cover the
				786	* domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
				787	*
				788	* Because:
				789	*
				790	* - the first group of each domain is its child domain; this
				791	* gets us the first 0-1,3
				792	* - the only uncovered node is 2, whose child domain is 1-3.
				793	*
				794	* However, because of the overlap, computing a unique CPU for each group is
				795	* more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
				796	* groups include the CPUs of Node-0, while those CPUs would not in fact ever
				797	* end up at those groups (they would end up in group: 0-1,3).
				798	*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	799	* To correct this we have to introduce the group balance mask. This mask
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	800	* will contain those CPUs in the group that can reach this group given the
				801	* (child) domain tree.
				802	*
				803	* With this we can once again compute balance_cpu and sched_group_capacity
				804	* relations.
				805	*
				806	* XXX include words on how balance_cpu is unique and therefore can be
				807	* used for sched_group_capacity links.
				808	*
				809	*
				810	* Another 'interesting' topology is:
				811	*
				812	* node 0 1 2 3
				813	* 0: 10 20 20 30
				814	* 1: 20 10 20 20
				815	* 2: 20 20 10 20
				816	* 3: 30 20 20 10
				817	*
				818	* Which looks a little like:
				819	*
				820	*   0 ----- 1
				821	*   |     / |
				822	*   |   /   |
				823	*   | /     |
				824	*   2 ----- 3
				825	*
				826	* This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
				827	* are not.
				828	*
				829	* This leads to a few particularly weird cases where the sched_domain's are
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	830	* not of the same number for each CPU. Consider:
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	831	*
				832	* NUMA-2 0-3 0-3
				833	* groups: {0-2},{1-3} {1-3},{0-2}
				834	*
				835	* NUMA-1 0-2 0-3 0-3 1-3
				836	*
				837	* NUMA-0 0 1 2 3
				838	*
				839	*/
				840
				841
				842	/*
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	843	* Build the balance mask; it contains only those CPUs that can arrive at this
				844	* group and should be considered to continue balancing.
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	845	*
				846	* We do this during the group creation pass, therefore the group information
				847	* isn't complete yet, however since each group represents a (child) domain we
				848	* can fully construct this using the sched_domain bits (which are already
				849	* complete).
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	850	*/
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	851	static void
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	852	build_balance_mask(struct sched_domain sd, struct sched_group sg, struct cpumask *mask)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	853	{
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	854	const struct cpumask *sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	855	struct sd_data *sdd = sd->private;
				856	struct sched_domain *sibling;
				857	int i;
				858
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	859	cpumask_clear(mask);
				860
Lauro Ramos Venancio	f32d782	2017-04-20 16:51:40 -0300	[diff] [blame]	861	for_each_cpu(i, sg_span) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	862	sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	863
				864	/*
				865	* Can happen in the asymmetric case, where these siblings are
				866	* unused. The mask will not be empty because those CPUs that
				867	* do have the top domain _should_ span the domain.
				868	*/
				869	if (!sibling->child)
				870	continue;
				871
				872	/* If we would not end up here, we can't continue from here */
				873	if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	874	continue;
				875
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	876	cpumask_set_cpu(i, mask);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	877	}
Peter Zijlstra	73bb059	2017-04-25 14:00:49 +0200	[diff] [blame]	878
				879	/* We must not have empty masks here */
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	880	WARN_ON_ONCE(cpumask_empty(mask));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	881	}
				882
				883	/*
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	884	* XXX: This creates per-node group entries; since the load-balancer will
				885	* immediately access remote memory to construct this group's load-balance
				886	* statistics having the groups node local is of dubious benefit.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	887	*/
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	888	static struct sched_group *
				889	build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
				890	{
				891	struct sched_group *sg;
				892	struct cpumask *sg_span;
				893
				894	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				895	GFP_KERNEL, cpu_to_node(cpu));
				896
				897	if (!sg)
				898	return NULL;
				899
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	900	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	901	if (sd->child)
				902	cpumask_copy(sg_span, sched_domain_span(sd->child));
				903	else
				904	cpumask_copy(sg_span, sched_domain_span(sd));
				905
Shu Wang	213c5a4	2017-08-10 15:52:16 +0800	[diff] [blame]	906	atomic_inc(&sg->ref);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	907	return sg;
				908	}
				909
				910	static void init_overlap_sched_group(struct sched_domain *sd,
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	911	struct sched_group *sg)
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	912	{
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	913	struct cpumask *mask = sched_domains_tmpmask2;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	914	struct sd_data *sdd = sd->private;
				915	struct cpumask *sg_span;
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	916	int cpu;
				917
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	918	build_balance_mask(sd, sg, mask);
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	919	cpu = cpumask_first_and(sched_group_span(sg), mask);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	920
				921	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
				922	if (atomic_inc_return(&sg->sgc->ref) == 1)
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	923	cpumask_copy(group_balance_mask(sg), mask);
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	924	else
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	925	WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	926
				927	/*
				928	* Initialize sgc->capacity such that even if we mess up the
				929	* domains and no possible iteration will get us here, we won't
				930	* die on a /0 trap.
				931	*/
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	932	sg_span = sched_group_span(sg);
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	933	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
				934	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	935	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	936	}
				937
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	938	static int
				939	build_overlap_sched_groups(struct sched_domain *sd, int cpu)
				940	{
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	941	struct sched_group first = NULL, last = NULL, *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	942	const struct cpumask *span = sched_domain_span(sd);
				943	struct cpumask *covered = sched_domains_tmpmask;
				944	struct sd_data *sdd = sd->private;
				945	struct sched_domain *sibling;
				946	int i;
				947
				948	cpumask_clear(covered);
				949
Peter Zijlstra	0372dd2	2017-04-14 17:24:02 +0200	[diff] [blame]	950	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	951	struct cpumask *sg_span;
				952
				953	if (cpumask_test_cpu(i, covered))
				954	continue;
				955
				956	sibling = *per_cpu_ptr(sdd->sd, i);
				957
Lauro Ramos Venancio	c20e1ea	2017-04-20 16:51:42 -0300	[diff] [blame]	958	/*
				959	* Asymmetric node setups can result in situations where the
				960	* domain tree is of unequal depth, make sure to skip domains
				961	* that already cover the entire range.
				962	*
				963	* In that case build_sched_domains() will have terminated the
				964	* iteration early and our sibling sd spans will be empty.
				965	* Domains should always include the CPU they're built on, so
				966	* check that.
				967	*/
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	968	if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
				969	continue;
				970
Lauro Ramos Venancio	8c03346	2017-04-13 10:56:07 -0300	[diff] [blame]	971	sg = build_group_from_child_sched_domain(sibling, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	972	if (!sg)
				973	goto fail;
				974
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	975	sg_span = sched_group_span(sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	976	cpumask_or(covered, covered, sg_span);
				977
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	978	init_overlap_sched_group(sd, sg);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	979
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	980	if (!first)
				981	first = sg;
				982	if (last)
				983	last->next = sg;
				984	last = sg;
				985	last->next = first;
				986	}
Peter Zijlstra	91eaed0	2017-04-14 17:32:07 +0200	[diff] [blame]	987	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	988
				989	return 0;
				990
				991	fail:
				992	free_sched_groups(first, 0);
				993
				994	return -ENOMEM;
				995	}
				996
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	997
				998	/*
				999	* Package topology (also see the load-balance blurb in fair.c)
				1000	*
				1001	* The scheduler builds a tree structure to represent a number of important
				1002	* topology features. By default (default_topology[]) these include:
				1003	*
				1004	* - Simultaneous multithreading (SMT)
				1005	* - Multi-Core Cache (MC)
				1006	* - Package (DIE)
				1007	*
				1008	* Where the last one more or less denotes everything up to a NUMA node.
				1009	*
				1010	* The tree consists of 3 primary data structures:
				1011	*
				1012	*   sched_domain -> sched_group -> sched_group_capacity
				1013	*       ^ ^             ^ ^
				1014	*       `-'             `-'
				1015	*
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1016	* The sched_domains are per-CPU and have a two way link (parent & child) and
Peter Zijlstra	35a566e	2017-04-28 10:54:26 +0200	[diff] [blame]	1017	* denote the ever growing mask of CPUs belonging to that level of topology.
				1018	*
				1019	* Each sched_domain has a circular (double) linked list of sched_group's, each
				1020	* denoting the domains of the level below (or individual CPUs in case of the
				1021	* first domain level). The sched_group linked by a sched_domain includes the
				1022	* CPU of that sched_domain [*].
				1023	*
				1024	* Take for instance a 2 threaded, 2 core, 2 cache cluster part:
				1025	*
				1026	* CPU   0   1   2   3   4   5   6   7
				1027	*
				1028	* DIE  [                             ]
				1029	* MC   [             ] [             ]
				1030	* SMT  [     ] [     ] [     ] [     ]
				1031	*
				1032	* - or -
				1033	*
				1034	* DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
				1035	* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
				1036	* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
				1037	*
				1038	* CPU 0 1 2 3 4 5 6 7
				1039	*
				1040	* One way to think about it is: sched_domain moves you up and down among these
				1041	* topology levels, while sched_group moves you sideways through it, at child
				1042	* domain granularity.
				1043	*
				1044	* sched_group_capacity ensures each unique sched_group has shared storage.
				1045	*
				1046	* There are two related construction problems, both require a CPU that
				1047	* uniquely identifies each group (for a given domain):
				1048	*
				1049	* - The first is the balance_cpu (see should_we_balance() and the
				1050	* load-balance blurb in fair.c); for each group we only want 1 CPU to
				1051	* continue balancing at a higher domain.
				1052	*
				1053	* - The second is the sched_group_capacity; we want all identical groups
				1054	* to share a single sched_group_capacity.
				1055	*
				1056	* Since these topologies are exclusive by construction. That is, it's
				1057	* impossible for an SMT thread to belong to multiple cores, and cores to
				1058	* be part of multiple caches. There is a very clear and unique location
				1059	* for each CPU in the hierarchy.
				1060	*
				1061	* Therefore computing a unique CPU for each group is trivial (the iteration
				1062	* mask is redundant and set all 1s; all CPUs in a group will end up at _that_
				1063	* group), we can simply pick the first CPU in each group.
				1064	*
				1065	*
				1066	* [*] in other words, the first group of each domain is its child domain.
				1067	*/
				1068
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1069	static struct sched_group get_group(int cpu, struct sd_data sdd)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1070	{
				1071	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1072	struct sched_domain *child = sd->child;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1073	struct sched_group *sg;
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1074	bool already_visited;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1075
				1076	if (child)
				1077	cpu = cpumask_first(sched_domain_span(child));
				1078
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1079	sg = *per_cpu_ptr(sdd->sg, cpu);
				1080	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1081
Valentin Schneider	67d4f6f	2019-04-09 18:35:45 +0100	[diff] [blame]	1082	/* Increase refcounts for claim_allocations: */
				1083	already_visited = atomic_inc_return(&sg->ref) > 1;
				1084	/* sgc visits should follow a similar trend as sg */
				1085	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
				1086
				1087	/* If we have already visited that group, it's already initialized. */
				1088	if (already_visited)
				1089	return sg;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1090
				1091	if (child) {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1092	cpumask_copy(sched_group_span(sg), sched_domain_span(child));
				1093	cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1094	} else {
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1095	cpumask_set_cpu(cpu, sched_group_span(sg));
Peter Zijlstra	e5c14b1	2017-05-01 10:47:02 +0200	[diff] [blame]	1096	cpumask_set_cpu(cpu, group_balance_mask(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1097	}
				1098
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1099	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1100	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen	e3d6d0c	2018-07-04 11:17:41 +0100	[diff] [blame]	1101	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1102
				1103	return sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1104	}
				1105
				1106	/*
				1107	* build_sched_groups will build a circular linked list of the groups
Valentin Schneider	d874323	2019-04-09 18:35:46 +0100	[diff] [blame]	1108	* covered by the given span, will set each group's ->cpumask correctly,
				1109	* and will initialize their ->sgc.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1110	*
				1111	* Assumes the sched_domain tree is fully constructed
				1112	*/
				1113	static int
				1114	build_sched_groups(struct sched_domain *sd, int cpu)
				1115	{
				1116	struct sched_group first = NULL, last = NULL;
				1117	struct sd_data *sdd = sd->private;
				1118	const struct cpumask *span = sched_domain_span(sd);
				1119	struct cpumask *covered;
				1120	int i;
				1121
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1122	lockdep_assert_held(&sched_domains_mutex);
				1123	covered = sched_domains_tmpmask;
				1124
				1125	cpumask_clear(covered);
				1126
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1127	for_each_cpu_wrap(i, span, cpu) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1128	struct sched_group *sg;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1129
				1130	if (cpumask_test_cpu(i, covered))
				1131	continue;
				1132
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1133	sg = get_group(i, sdd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1134
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1135	cpumask_or(covered, covered, sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1136
				1137	if (!first)
				1138	first = sg;
				1139	if (last)
				1140	last->next = sg;
				1141	last = sg;
				1142	}
				1143	last->next = first;
Peter Zijlstra	0c0e776	2017-05-03 14:18:06 +0200	[diff] [blame]	1144	sd->groups = first;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1145
				1146	return 0;
				1147	}
				1148
				1149	/*
				1150	* Initialize sched groups cpu_capacity.
				1151	*
				1152	* cpu_capacity indicates the capacity of sched group, which is used while
				1153	* distributing the load between different sched groups in a sched domain.
				1154	* Typically cpu_capacity for all the groups in a sched domain will be same
				1155	* unless there are asymmetries in the topology. If there are asymmetries,
				1156	* group having more cpu_capacity will pickup more load compared to the
				1157	* group having less cpu_capacity.
				1158	*/
				1159	static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
				1160	{
				1161	struct sched_group *sg = sd->groups;
				1162
				1163	WARN_ON(!sg);
				1164
				1165	do {
				1166	int cpu, max_cpu = -1;
				1167
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1168	sg->group_weight = cpumask_weight(sched_group_span(sg));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1169
				1170	if (!(sd->flags & SD_ASYM_PACKING))
				1171	goto next;
				1172
Peter Zijlstra	ae4df9d	2017-05-01 11:03:12 +0200	[diff] [blame]	1173	for_each_cpu(cpu, sched_group_span(sg)) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1174	if (max_cpu < 0)
				1175	max_cpu = cpu;
				1176	else if (sched_asym_prefer(cpu, max_cpu))
				1177	max_cpu = cpu;
				1178	}
				1179	sg->asym_prefer_cpu = max_cpu;
				1180
				1181	next:
				1182	sg = sg->next;
				1183	} while (sg != sd->groups);
				1184
				1185	if (cpu != group_balance_cpu(sg))
				1186	return;
				1187
				1188	update_group_capacity(sd, cpu);
				1189	}
				1190
				1191	/*
				1192	* Initializers for schedule domains
				1193	* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
				1194	*/
				1195
				1196	static int default_relax_domain_level = -1;
				1197	int sched_domain_level_max;
				1198
				1199	static int __init setup_relax_domain_level(char *str)
				1200	{
				1201	if (kstrtoint(str, 0, &default_relax_domain_level))
				1202	pr_warn("Unable to set relax_domain_level\n");
				1203
				1204	return 1;
				1205	}
				1206	__setup("relax_domain_level=", setup_relax_domain_level);
				1207
				1208	static void set_domain_attribute(struct sched_domain *sd,
				1209	struct sched_domain_attr *attr)
				1210	{
				1211	int request;
				1212
				1213	if (!attr \|\| attr->relax_domain_level < 0) {
				1214	if (default_relax_domain_level < 0)
				1215	return;
Valentin Schneider	9ae7ab2	2019-10-14 17:44:08 +0100	[diff] [blame]	1216	request = default_relax_domain_level;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1217	} else
				1218	request = attr->relax_domain_level;
Valentin Schneider	9ae7ab2	2019-10-14 17:44:08 +0100	[diff] [blame]	1219
				1220	if (sd->level > request) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1221	/* Turn off idle balance on this domain: */
				1222	sd->flags &= ~(SD_BALANCE_WAKE\|SD_BALANCE_NEWIDLE);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1223	}
				1224	}
				1225
				1226	static void __sdt_free(const struct cpumask *cpu_map);
				1227	static int __sdt_alloc(const struct cpumask *cpu_map);
				1228
				1229	static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				1230	const struct cpumask *cpu_map)
				1231	{
				1232	switch (what) {
				1233	case sa_rootdomain:
				1234	if (!atomic_read(&d->rd->refcount))
				1235	free_rootdomain(&d->rd->rcu);
				1236	/* Fall through */
				1237	case sa_sd:
				1238	free_percpu(d->sd);
				1239	/* Fall through */
				1240	case sa_sd_storage:
				1241	__sdt_free(cpu_map);
				1242	/* Fall through */
				1243	case sa_none:
				1244	break;
				1245	}
				1246	}
				1247
				1248	static enum s_alloc
				1249	__visit_domain_allocation_hell(struct s_data d, const struct cpumask cpu_map)
				1250	{
				1251	memset(d, 0, sizeof(*d));
				1252
				1253	if (__sdt_alloc(cpu_map))
				1254	return sa_sd_storage;
				1255	d->sd = alloc_percpu(struct sched_domain *);
				1256	if (!d->sd)
				1257	return sa_sd_storage;
				1258	d->rd = alloc_rootdomain();
				1259	if (!d->rd)
				1260	return sa_sd;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1261
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1262	return sa_rootdomain;
				1263	}
				1264
				1265	/*
				1266	* NULL the sd_data elements we've used to build the sched_domain and
				1267	* sched_group structure so that the subsequent __free_domain_allocs()
				1268	* will not free the data we're using.
				1269	*/
				1270	static void claim_allocations(int cpu, struct sched_domain *sd)
				1271	{
				1272	struct sd_data *sdd = sd->private;
				1273
				1274	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
				1275	*per_cpu_ptr(sdd->sd, cpu) = NULL;
				1276
				1277	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
				1278	*per_cpu_ptr(sdd->sds, cpu) = NULL;
				1279
				1280	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
				1281	*per_cpu_ptr(sdd->sg, cpu) = NULL;
				1282
				1283	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
				1284	*per_cpu_ptr(sdd->sgc, cpu) = NULL;
				1285	}
				1286
				1287	#ifdef CONFIG_NUMA
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1288	enum numa_topology_type sched_numa_topology_type;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1289
				1290	static int sched_domains_numa_levels;
				1291	static int sched_domains_curr_level;
				1292
				1293	int sched_max_numa_distance;
				1294	static int *sched_domains_numa_distance;
				1295	static struct cpumask ***sched_domains_numa_masks;
Matt Fleming	a55c745	2019-08-08 20:53:01 +0100	[diff] [blame]	1296	int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1297	#endif
				1298
				1299	/*
				1300	* SD_flags allowed in topology descriptions.
				1301	*
				1302	* These flags are purely descriptive of the topology and do not prescribe
				1303	* behaviour. Behaviour is artificial and mapped in the below sd_init()
				1304	* function:
				1305	*
				1306	* SD_SHARE_CPUCAPACITY - describes SMT topologies
				1307	* SD_SHARE_PKG_RESOURCES - describes shared caches
				1308	* SD_NUMA - describes NUMA topologies
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1309	*
				1310	* Odd one out, which beside describing the topology has a quirk also
				1311	* prescribes the desired behaviour that goes along with it:
				1312	*
				1313	* SD_ASYM_PACKING - describes SMT quirks
				1314	*/
				1315	#define TOPOLOGY_SD_FLAGS \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1316	(SD_SHARE_CPUCAPACITY \| \
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1317	SD_SHARE_PKG_RESOURCES \| \
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1318	SD_NUMA \| \
Valentin Schneider	cfe7ddc	2020-08-17 12:29:47 +0100	[diff] [blame]	1319	SD_ASYM_PACKING)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1320
				1321	static struct sched_domain *
				1322	sd_init(struct sched_domain_topology_level *tl,
				1323	const struct cpumask *cpu_map,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1324	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1325	{
				1326	struct sd_data *sdd = &tl->data;
				1327	struct sched_domain sd = per_cpu_ptr(sdd->sd, cpu);
				1328	int sd_id, sd_weight, sd_flags = 0;
				1329
				1330	#ifdef CONFIG_NUMA
				1331	/*
				1332	* Ugly hack to pass state to sd_numa_mask()...
				1333	*/
				1334	sched_domains_curr_level = tl->numa_level;
				1335	#endif
				1336
				1337	sd_weight = cpumask_weight(tl->mask(cpu));
				1338
				1339	if (tl->sd_flags)
				1340	sd_flags = (*tl->sd_flags)();
				1341	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
				1342	"wrong sd_flags in topology description\n"))
Peng Liu	9b1b234	2020-06-09 23:09:36 +0800	[diff] [blame]	1343	sd_flags &= TOPOLOGY_SD_FLAGS;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1344
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1345	/* Apply detected topology flags */
				1346	sd_flags \|= dflags;
				1347
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1348	*sd = (struct sched_domain){
				1349	.min_interval = sd_weight,
				1350	.max_interval = 2*sd_weight,
Vincent Guittot	6e74991	2020-09-21 09:24:24 +0200	[diff] [blame^]	1351	.busy_factor = 16,
Vincent Guittot	2208cda	2020-09-21 09:24:22 +0200	[diff] [blame]	1352	.imbalance_pct = 117,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1353
				1354	.cache_nice_tries = 0,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1355
Valentin Schneider	36c5bdc	2020-04-15 22:05:07 +0100	[diff] [blame]	1356	.flags = 1*SD_BALANCE_NEWIDLE
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1357	\| 1*SD_BALANCE_EXEC
				1358	\| 1*SD_BALANCE_FORK
				1359	\| 0*SD_BALANCE_WAKE
				1360	\| 1*SD_WAKE_AFFINE
				1361	\| 0*SD_SHARE_CPUCAPACITY
				1362	\| 0*SD_SHARE_PKG_RESOURCES
				1363	\| 0*SD_SERIALIZE
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1364	\| 1*SD_PREFER_SIBLING
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1365	\| 0*SD_NUMA
				1366	\| sd_flags
				1367	,
				1368
				1369	.last_balance = jiffies,
				1370	.balance_interval = sd_weight,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1371	.max_newidle_lb_cost = 0,
				1372	.next_decay_max_lb_cost = jiffies,
				1373	.child = child,
				1374	#ifdef CONFIG_SCHED_DEBUG
				1375	.name = tl->name,
				1376	#endif
				1377	};
				1378
				1379	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
				1380	sd_id = cpumask_first(sched_domain_span(sd));
				1381
				1382	/*
				1383	* Convert topological properties into behaviour.
				1384	*/
				1385
Morten Rasmussen	a526d46	2020-02-06 19:19:55 +0000	[diff] [blame]	1386	/* Don't attempt to spread across CPUs of different capacities. */
				1387	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
				1388	sd->child->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1389
				1390	if (sd->flags & SD_SHARE_CPUCAPACITY) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1391	sd->imbalance_pct = 110;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1392
				1393	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1394	sd->imbalance_pct = 117;
				1395	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1396
				1397	#ifdef CONFIG_NUMA
				1398	} else if (sd->flags & SD_NUMA) {
				1399	sd->cache_nice_tries = 2;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1400
Morten Rasmussen	9c63e84	2018-07-04 11:17:50 +0100	[diff] [blame]	1401	sd->flags &= ~SD_PREFER_SIBLING;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1402	sd->flags \|= SD_SERIALIZE;
Matt Fleming	a55c745	2019-08-08 20:53:01 +0100	[diff] [blame]	1403	if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1404	sd->flags &= ~(SD_BALANCE_EXEC \|
				1405	SD_BALANCE_FORK \|
				1406	SD_WAKE_AFFINE);
				1407	}
				1408
				1409	#endif
				1410	} else {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1411	sd->cache_nice_tries = 1;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1412	}
				1413
				1414	/*
				1415	* For all levels sharing cache; connect a sched_domain_shared
				1416	* instance.
				1417	*/
				1418	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
				1419	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
				1420	atomic_inc(&sd->shared->ref);
				1421	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
				1422	}
				1423
				1424	sd->private = sdd;
				1425
				1426	return sd;
				1427	}
				1428
				1429	/*
				1430	* Topology list, bottom-up.
				1431	*/
				1432	static struct sched_domain_topology_level default_topology[] = {
				1433	#ifdef CONFIG_SCHED_SMT
				1434	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
				1435	#endif
				1436	#ifdef CONFIG_SCHED_MC
				1437	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
				1438	#endif
				1439	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
				1440	{ NULL, },
				1441	};
				1442
				1443	static struct sched_domain_topology_level *sched_domain_topology =
				1444	default_topology;
				1445
				1446	#define for_each_sd_topology(tl) \
				1447	for (tl = sched_domain_topology; tl->mask; tl++)
				1448
				1449	void set_sched_topology(struct sched_domain_topology_level *tl)
				1450	{
				1451	if (WARN_ON_ONCE(sched_smp_initialized))
				1452	return;
				1453
				1454	sched_domain_topology = tl;
				1455	}
				1456
				1457	#ifdef CONFIG_NUMA
				1458
				1459	static const struct cpumask *sd_numa_mask(int cpu)
				1460	{
				1461	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
				1462	}
				1463
				1464	static void sched_numa_warn(const char *str)
				1465	{
				1466	static int done = false;
				1467	int i,j;
				1468
				1469	if (done)
				1470	return;
				1471
				1472	done = true;
				1473
				1474	printk(KERN_WARNING "ERROR: %s\n\n", str);
				1475
				1476	for (i = 0; i < nr_node_ids; i++) {
				1477	printk(KERN_WARNING " ");
				1478	for (j = 0; j < nr_node_ids; j++)
				1479	printk(KERN_CONT "%02d ", node_distance(i,j));
				1480	printk(KERN_CONT "\n");
				1481	}
				1482	printk(KERN_WARNING "\n");
				1483	}
				1484
				1485	bool find_numa_distance(int distance)
				1486	{
				1487	int i;
				1488
				1489	if (distance == node_distance(0, 0))
				1490	return true;
				1491
				1492	for (i = 0; i < sched_domains_numa_levels; i++) {
				1493	if (sched_domains_numa_distance[i] == distance)
				1494	return true;
				1495	}
				1496
				1497	return false;
				1498	}
				1499
				1500	/*
				1501	* A system can have three types of NUMA topology:
				1502	* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
				1503	* NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
				1504	* NUMA_BACKPLANE: nodes can reach other nodes through a backplane
				1505	*
				1506	* The difference between a glueless mesh topology and a backplane
				1507	* topology lies in whether communication between not directly
				1508	* connected nodes goes through intermediary nodes (where programs
				1509	* could run), or through backplane controllers. This affects
				1510	* placement of programs.
				1511	*
				1512	* The type of topology can be discerned with the following tests:
				1513	* - If the maximum distance between any nodes is 1 hop, the system
				1514	* is directly connected.
				1515	* - If for two nodes A and B, located N > 1 hops away from each other,
				1516	* there is an intermediary node C, which is < N hops away from both
				1517	* nodes A and B, the system is a glueless mesh.
				1518	*/
				1519	static void init_numa_topology_type(void)
				1520	{
				1521	int a, b, c, n;
				1522
				1523	n = sched_max_numa_distance;
				1524
Srikar Dronamraju	e5e96fa	2018-08-10 22:30:18 +0530	[diff] [blame]	1525	if (sched_domains_numa_levels <= 2) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1526	sched_numa_topology_type = NUMA_DIRECT;
				1527	return;
				1528	}
				1529
				1530	for_each_online_node(a) {
				1531	for_each_online_node(b) {
				1532	/* Find two nodes furthest removed from each other. */
				1533	if (node_distance(a, b) < n)
				1534	continue;
				1535
				1536	/* Is there an intermediary node between a and b? */
				1537	for_each_online_node(c) {
				1538	if (node_distance(a, c) < n &&
				1539	node_distance(b, c) < n) {
				1540	sched_numa_topology_type =
				1541	NUMA_GLUELESS_MESH;
				1542	return;
				1543	}
				1544	}
				1545
				1546	sched_numa_topology_type = NUMA_BACKPLANE;
				1547	return;
				1548	}
				1549	}
				1550	}
				1551
				1552	void sched_init_numa(void)
				1553	{
				1554	int next_distance, curr_distance = node_distance(0, 0);
				1555	struct sched_domain_topology_level *tl;
				1556	int level = 0;
				1557	int i, j, k;
				1558
Peter Zijlstra	993f0b0	2018-11-02 14:22:25 +0100	[diff] [blame]	1559	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1560	if (!sched_domains_numa_distance)
				1561	return;
				1562
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1563	/* Includes NUMA identity node at level 0. */
				1564	sched_domains_numa_distance[level++] = curr_distance;
				1565	sched_domains_numa_levels = level;
				1566
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1567	/*
				1568	* O(nr_nodes^2) deduplicating selection sort -- in order to find the
				1569	* unique distances in the node_distance() table.
				1570	*
				1571	* Assumes node_distance(0,j) includes all distances in
				1572	* node_distance(i,j) in order to avoid cubic time.
				1573	*/
				1574	next_distance = curr_distance;
				1575	for (i = 0; i < nr_node_ids; i++) {
				1576	for (j = 0; j < nr_node_ids; j++) {
				1577	for (k = 0; k < nr_node_ids; k++) {
				1578	int distance = node_distance(i, k);
				1579
				1580	if (distance > curr_distance &&
				1581	(distance < next_distance \|\|
				1582	next_distance == curr_distance))
				1583	next_distance = distance;
				1584
				1585	/*
				1586	* While not a strong assumption it would be nice to know
				1587	* about cases where if node A is connected to B, B is not
				1588	* equally connected to A.
				1589	*/
				1590	if (sched_debug() && node_distance(k, i) != distance)
				1591	sched_numa_warn("Node-distance not symmetric");
				1592
				1593	if (sched_debug() && i && !find_numa_distance(distance))
				1594	sched_numa_warn("Node-0 not representative");
				1595	}
				1596	if (next_distance != curr_distance) {
				1597	sched_domains_numa_distance[level++] = next_distance;
				1598	sched_domains_numa_levels = level;
				1599	curr_distance = next_distance;
				1600	} else break;
				1601	}
				1602
				1603	/*
				1604	* In case of sched_debug() we verify the above assumption.
				1605	*/
				1606	if (!sched_debug())
				1607	break;
				1608	}
				1609
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1610	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1611	* 'level' contains the number of unique distances
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1612	*
				1613	* The sched_domains_numa_distance[] array includes the actual distance
				1614	* numbers.
				1615	*/
				1616
				1617	/*
				1618	* Here, we should temporarily reset sched_domains_numa_levels to 0.
				1619	* If it fails to allocate memory for array sched_domains_numa_masks[][],
				1620	* the array will contain less then 'level' members. This could be
				1621	* dangerous when we use it to iterate array sched_domains_numa_masks[][]
				1622	* in other functions.
				1623	*
				1624	* We reset it to 'level' at the end of this function.
				1625	*/
				1626	sched_domains_numa_levels = 0;
				1627
				1628	sched_domains_numa_masks = kzalloc(sizeof(void ) level, GFP_KERNEL);
				1629	if (!sched_domains_numa_masks)
				1630	return;
				1631
				1632	/*
				1633	* Now for each level, construct a mask per node which contains all
				1634	* CPUs of nodes that are that many hops away from us.
				1635	*/
				1636	for (i = 0; i < level; i++) {
				1637	sched_domains_numa_masks[i] =
				1638	kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
				1639	if (!sched_domains_numa_masks[i])
				1640	return;
				1641
				1642	for (j = 0; j < nr_node_ids; j++) {
				1643	struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
				1644	if (!mask)
				1645	return;
				1646
				1647	sched_domains_numa_masks[i][j] = mask;
				1648
				1649	for_each_node(k) {
				1650	if (node_distance(j, k) > sched_domains_numa_distance[i])
				1651	continue;
				1652
				1653	cpumask_or(mask, mask, cpumask_of_node(k));
				1654	}
				1655	}
				1656	}
				1657
				1658	/* Compute default topology size */
				1659	for (i = 0; sched_domain_topology[i].mask; i++);
				1660
				1661	tl = kzalloc((i + level + 1) *
				1662	sizeof(struct sched_domain_topology_level), GFP_KERNEL);
				1663	if (!tl)
				1664	return;
				1665
				1666	/*
				1667	* Copy the default topology bits..
				1668	*/
				1669	for (i = 0; sched_domain_topology[i].mask; i++)
				1670	tl[i] = sched_domain_topology[i];
				1671
				1672	/*
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1673	* Add the NUMA identity distance, aka single NODE.
				1674	*/
				1675	tl[i++] = (struct sched_domain_topology_level){
				1676	.mask = sd_numa_mask,
				1677	.numa_level = 0,
				1678	SD_INIT_NAME(NODE)
				1679	};
				1680
				1681	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1682	* .. and append 'j' levels of NUMA goodness.
				1683	*/
Suravee Suthikulpanit	051f3ca	2017-09-07 02:20:05 -0500	[diff] [blame]	1684	for (j = 1; j < level; i++, j++) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1685	tl[i] = (struct sched_domain_topology_level){
				1686	.mask = sd_numa_mask,
				1687	.sd_flags = cpu_numa_flags,
				1688	.flags = SDTL_OVERLAP,
				1689	.numa_level = j,
				1690	SD_INIT_NAME(NUMA)
				1691	};
				1692	}
				1693
				1694	sched_domain_topology = tl;
				1695
				1696	sched_domains_numa_levels = level;
				1697	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
				1698
				1699	init_numa_topology_type();
				1700	}
				1701
				1702	void sched_domains_numa_masks_set(unsigned int cpu)
				1703	{
				1704	int node = cpu_to_node(cpu);
				1705	int i, j;
				1706
				1707	for (i = 0; i < sched_domains_numa_levels; i++) {
				1708	for (j = 0; j < nr_node_ids; j++) {
				1709	if (node_distance(j, node) <= sched_domains_numa_distance[i])
				1710	cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
				1711	}
				1712	}
				1713	}
				1714
				1715	void sched_domains_numa_masks_clear(unsigned int cpu)
				1716	{
				1717	int i, j;
				1718
				1719	for (i = 0; i < sched_domains_numa_levels; i++) {
				1720	for (j = 0; j < nr_node_ids; j++)
				1721	cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
				1722	}
				1723	}
				1724
Wanpeng Li	e0e8d49	2019-06-28 16:51:41 +0800	[diff] [blame]	1725	/*
				1726	* sched_numa_find_closest() - given the NUMA topology, find the cpu
				1727	* closest to @cpu from @cpumask.
				1728	* cpumask: cpumask to find a cpu from
				1729	* cpu: cpu to be close to
				1730	*
				1731	* returns: cpu, or nr_cpu_ids when nothing found.
				1732	*/
				1733	int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
				1734	{
				1735	int i, j = cpu_to_node(cpu);
				1736
				1737	for (i = 0; i < sched_domains_numa_levels; i++) {
				1738	cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
				1739	if (cpu < nr_cpu_ids)
				1740	return cpu;
				1741	}
				1742	return nr_cpu_ids;
				1743	}
				1744
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1745	#endif /* CONFIG_NUMA */
				1746
				1747	static int __sdt_alloc(const struct cpumask *cpu_map)
				1748	{
				1749	struct sched_domain_topology_level *tl;
				1750	int j;
				1751
				1752	for_each_sd_topology(tl) {
				1753	struct sd_data *sdd = &tl->data;
				1754
				1755	sdd->sd = alloc_percpu(struct sched_domain *);
				1756	if (!sdd->sd)
				1757	return -ENOMEM;
				1758
				1759	sdd->sds = alloc_percpu(struct sched_domain_shared *);
				1760	if (!sdd->sds)
				1761	return -ENOMEM;
				1762
				1763	sdd->sg = alloc_percpu(struct sched_group *);
				1764	if (!sdd->sg)
				1765	return -ENOMEM;
				1766
				1767	sdd->sgc = alloc_percpu(struct sched_group_capacity *);
				1768	if (!sdd->sgc)
				1769	return -ENOMEM;
				1770
				1771	for_each_cpu(j, cpu_map) {
				1772	struct sched_domain *sd;
				1773	struct sched_domain_shared *sds;
				1774	struct sched_group *sg;
				1775	struct sched_group_capacity *sgc;
				1776
				1777	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
				1778	GFP_KERNEL, cpu_to_node(j));
				1779	if (!sd)
				1780	return -ENOMEM;
				1781
				1782	*per_cpu_ptr(sdd->sd, j) = sd;
				1783
				1784	sds = kzalloc_node(sizeof(struct sched_domain_shared),
				1785	GFP_KERNEL, cpu_to_node(j));
				1786	if (!sds)
				1787	return -ENOMEM;
				1788
				1789	*per_cpu_ptr(sdd->sds, j) = sds;
				1790
				1791	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				1792	GFP_KERNEL, cpu_to_node(j));
				1793	if (!sg)
				1794	return -ENOMEM;
				1795
				1796	sg->next = sg;
				1797
				1798	*per_cpu_ptr(sdd->sg, j) = sg;
				1799
				1800	sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
				1801	GFP_KERNEL, cpu_to_node(j));
				1802	if (!sgc)
				1803	return -ENOMEM;
				1804
Peter Zijlstra	005f874	2017-04-26 17:35:35 +0200	[diff] [blame]	1805	#ifdef CONFIG_SCHED_DEBUG
				1806	sgc->id = j;
				1807	#endif
				1808
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1809	*per_cpu_ptr(sdd->sgc, j) = sgc;
				1810	}
				1811	}
				1812
				1813	return 0;
				1814	}
				1815
				1816	static void __sdt_free(const struct cpumask *cpu_map)
				1817	{
				1818	struct sched_domain_topology_level *tl;
				1819	int j;
				1820
				1821	for_each_sd_topology(tl) {
				1822	struct sd_data *sdd = &tl->data;
				1823
				1824	for_each_cpu(j, cpu_map) {
				1825	struct sched_domain *sd;
				1826
				1827	if (sdd->sd) {
				1828	sd = *per_cpu_ptr(sdd->sd, j);
				1829	if (sd && (sd->flags & SD_OVERLAP))
				1830	free_sched_groups(sd->groups, 0);
				1831	kfree(*per_cpu_ptr(sdd->sd, j));
				1832	}
				1833
				1834	if (sdd->sds)
				1835	kfree(*per_cpu_ptr(sdd->sds, j));
				1836	if (sdd->sg)
				1837	kfree(*per_cpu_ptr(sdd->sg, j));
				1838	if (sdd->sgc)
				1839	kfree(*per_cpu_ptr(sdd->sgc, j));
				1840	}
				1841	free_percpu(sdd->sd);
				1842	sdd->sd = NULL;
				1843	free_percpu(sdd->sds);
				1844	sdd->sds = NULL;
				1845	free_percpu(sdd->sg);
				1846	sdd->sg = NULL;
				1847	free_percpu(sdd->sgc);
				1848	sdd->sgc = NULL;
				1849	}
				1850	}
				1851
Viresh Kumar	181a80d1	2017-04-27 13:58:59 +0530	[diff] [blame]	1852	static struct sched_domain build_sched_domain(struct sched_domain_topology_level tl,
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1853	const struct cpumask cpu_map, struct sched_domain_attr attr,
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1854	struct sched_domain *child, int dflags, int cpu)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1855	{
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1856	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1857
				1858	if (child) {
				1859	sd->level = child->level + 1;
				1860	sched_domain_level_max = max(sched_domain_level_max, sd->level);
				1861	child->parent = sd;
				1862
				1863	if (!cpumask_subset(sched_domain_span(child),
				1864	sched_domain_span(sd))) {
				1865	pr_err("BUG: arch topology borken\n");
				1866	#ifdef CONFIG_SCHED_DEBUG
				1867	pr_err(" the %s domain not a subset of the %s domain\n",
				1868	child->name, sd->name);
				1869	#endif
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	1870	/* Fixup, ensure @sd has at least @child CPUs. */
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1871	cpumask_or(sched_domain_span(sd),
				1872	sched_domain_span(sd),
				1873	sched_domain_span(child));
				1874	}
				1875
				1876	}
				1877	set_domain_attribute(sd, attr);
				1878
				1879	return sd;
				1880	}
				1881
				1882	/*
Valentin Schneider	ccf7412	2020-01-15 16:09:15 +0000	[diff] [blame]	1883	* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
				1884	* any two given CPUs at this (non-NUMA) topology level.
				1885	*/
				1886	static bool topology_span_sane(struct sched_domain_topology_level *tl,
				1887	const struct cpumask *cpu_map, int cpu)
				1888	{
				1889	int i;
				1890
				1891	/* NUMA levels are allowed to overlap */
				1892	if (tl->flags & SDTL_OVERLAP)
				1893	return true;
				1894
				1895	/*
				1896	* Non-NUMA levels cannot partially overlap - they must be either
				1897	* completely equal or completely disjoint. Otherwise we can end up
				1898	* breaking the sched_group lists - i.e. a later get_group() pass
				1899	* breaks the linking done for an earlier span.
				1900	*/
				1901	for_each_cpu(i, cpu_map) {
				1902	if (i == cpu)
				1903	continue;
				1904	/*
				1905	* We should 'and' all those masks with 'cpu_map' to exactly
				1906	* match the topology we're about to build, but that can only
				1907	* remove CPUs, which only lessens our ability to detect
				1908	* overlaps
				1909	*/
				1910	if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
				1911	cpumask_intersects(tl->mask(cpu), tl->mask(i)))
				1912	return false;
				1913	}
				1914
				1915	return true;
				1916	}
				1917
				1918	/*
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1919	* Find the sched_domain_topology_level where all CPU capacities are visible
				1920	* for all CPUs.
				1921	*/
				1922	static struct sched_domain_topology_level
				1923	asym_cpu_capacity_level(const struct cpumask cpu_map)
				1924	{
				1925	int i, j, asym_level = 0;
				1926	bool asym = false;
				1927	struct sched_domain_topology_level tl, asym_tl = NULL;
				1928	unsigned long cap;
				1929
				1930	/* Is there any asymmetry? */
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1931	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1932
				1933	for_each_cpu(i, cpu_map) {
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1934	if (arch_scale_cpu_capacity(i) != cap) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1935	asym = true;
				1936	break;
				1937	}
				1938	}
				1939
				1940	if (!asym)
				1941	return NULL;
				1942
				1943	/*
				1944	* Examine topology from all CPU's point of views to detect the lowest
				1945	* sched_domain_topology_level where a highest capacity CPU is visible
				1946	* to everyone.
				1947	*/
				1948	for_each_cpu(i, cpu_map) {
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1949	unsigned long max_capacity = arch_scale_cpu_capacity(i);
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1950	int tl_id = 0;
				1951
				1952	for_each_sd_topology(tl) {
				1953	if (tl_id < asym_level)
				1954	goto next_level;
				1955
				1956	for_each_cpu_and(j, tl->mask(i), cpu_map) {
				1957	unsigned long capacity;
				1958
Vincent Guittot	8ec59c0	2019-06-17 17:00:17 +0200	[diff] [blame]	1959	capacity = arch_scale_cpu_capacity(j);
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1960
				1961	if (capacity <= max_capacity)
				1962	continue;
				1963
				1964	max_capacity = capacity;
				1965	asym_level = tl_id;
				1966	asym_tl = tl;
				1967	}
				1968	next_level:
				1969	tl_id++;
				1970	}
				1971	}
				1972
				1973	return asym_tl;
				1974	}
				1975
				1976
				1977	/*
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1978	* Build sched domains for a given set of CPUs and attach the sched domains
				1979	* to the individual CPUs
				1980	*/
				1981	static int
				1982	build_sched_domains(const struct cpumask cpu_map, struct sched_domain_attr attr)
				1983	{
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	1984	enum s_alloc alloc_state = sa_none;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1985	struct sched_domain *sd;
				1986	struct s_data d;
				1987	struct rq *rq = NULL;
				1988	int i, ret = -ENOMEM;
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1989	struct sched_domain_topology_level *tl_asym;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	1990	bool has_asym = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1991
Valentin Schneider	cd1cb33	2019-10-23 16:37:44 +0100	[diff] [blame]	1992	if (WARN_ON(cpumask_empty(cpu_map)))
				1993	goto error;
				1994
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	1995	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
				1996	if (alloc_state != sa_rootdomain)
				1997	goto error;
				1998
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	1999	tl_asym = asym_cpu_capacity_level(cpu_map);
				2000
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2001	/* Set up domains for CPUs specified by the cpu_map: */
				2002	for_each_cpu(i, cpu_map) {
				2003	struct sched_domain_topology_level *tl;
Valentin Schneider	c200191	2020-08-17 12:29:56 +0100	[diff] [blame]	2004	int dflags = 0;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2005
				2006	sd = NULL;
				2007	for_each_sd_topology(tl) {
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2008	if (tl == tl_asym) {
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	2009	dflags \|= SD_ASYM_CPUCAPACITY;
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2010	has_asym = true;
				2011	}
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	2012
Valentin Schneider	ccf7412	2020-01-15 16:09:15 +0000	[diff] [blame]	2013	if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				2014	goto error;
				2015
Morten Rasmussen	05484e0	2018-07-20 14:32:31 +0100	[diff] [blame]	2016	sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
				2017
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2018	if (tl == sched_domain_topology)
				2019	*per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstra	af85596	2017-04-26 17:36:41 +0200	[diff] [blame]	2020	if (tl->flags & SDTL_OVERLAP)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2021	sd->flags \|= SD_OVERLAP;
				2022	if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				2023	break;
				2024	}
				2025	}
				2026
				2027	/* Build the groups for the domains */
				2028	for_each_cpu(i, cpu_map) {
				2029	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2030	sd->span_weight = cpumask_weight(sched_domain_span(sd));
				2031	if (sd->flags & SD_OVERLAP) {
				2032	if (build_overlap_sched_groups(sd, i))
				2033	goto error;
				2034	} else {
				2035	if (build_sched_groups(sd, i))
				2036	goto error;
				2037	}
				2038	}
				2039	}
				2040
				2041	/* Calculate CPU capacity for physical packages and nodes */
				2042	for (i = nr_cpumask_bits-1; i >= 0; i--) {
				2043	if (!cpumask_test_cpu(i, cpu_map))
				2044	continue;
				2045
				2046	for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
				2047	claim_allocations(i, sd);
				2048	init_sched_groups_capacity(i, sd);
				2049	}
				2050	}
				2051
				2052	/* Attach the domains */
				2053	rcu_read_lock();
				2054	for_each_cpu(i, cpu_map) {
				2055	rq = cpu_rq(i);
				2056	sd = *per_cpu_ptr(d.sd, i);
				2057
				2058	/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
				2059	if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
				2060	WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
				2061
				2062	cpu_attach_domain(sd, d.rd, i);
				2063	}
				2064	rcu_read_unlock();
				2065
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2066	if (has_asym)
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2067	static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
Morten Rasmussen	df054e8	2018-07-04 11:17:39 +0100	[diff] [blame]	2068
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2069	if (rq && sched_debug_enabled) {
Juri Lelli	bf5015a	2018-05-24 17:29:36 +0200	[diff] [blame]	2070	pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2071	cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
				2072	}
				2073
				2074	ret = 0;
				2075	error:
				2076	__free_domain_allocs(&d, alloc_state, cpu_map);
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2077
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2078	return ret;
				2079	}
				2080
				2081	/* Current sched domains: */
				2082	static cpumask_var_t *doms_cur;
				2083
				2084	/* Number of sched domains in 'doms_cur': */
				2085	static int ndoms_cur;
				2086
				2087	/* Attribues of custom domains in 'doms_cur' */
				2088	static struct sched_domain_attr *dattr_cur;
				2089
				2090	/*
				2091	* Special case: If a kmalloc() of a doms_cur partition (array of
				2092	* cpumask) fails, then fallback to a single sched domain,
				2093	* as determined by the single cpumask fallback_doms.
				2094	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2095	static cpumask_var_t fallback_doms;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2096
				2097	/*
				2098	* arch_update_cpu_topology lets virtualized architectures update the
				2099	* CPU core maps. It is supposed to return 1 if the topology changed
				2100	* or 0 if it stayed the same.
				2101	*/
				2102	int __weak arch_update_cpu_topology(void)
				2103	{
				2104	return 0;
				2105	}
				2106
				2107	cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
				2108	{
				2109	int i;
				2110	cpumask_var_t *doms;
				2111
Kees Cook	6da2ec5	2018-06-12 13:55:00 -0700	[diff] [blame]	2112	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2113	if (!doms)
				2114	return NULL;
				2115	for (i = 0; i < ndoms; i++) {
				2116	if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
				2117	free_sched_domains(doms, i);
				2118	return NULL;
				2119	}
				2120	}
				2121	return doms;
				2122	}
				2123
				2124	void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
				2125	{
				2126	unsigned int i;
				2127	for (i = 0; i < ndoms; i++)
				2128	free_cpumask_var(doms[i]);
				2129	kfree(doms);
				2130	}
				2131
				2132	/*
Juri Lelli	cb0c041	2018-12-19 14:34:45 +0100	[diff] [blame]	2133	* Set up scheduler domains and groups. For now this just excludes isolated
				2134	* CPUs, but could be used to exclude other special cases in the future.
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2135	*/
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2136	int sched_init_domains(const struct cpumask *cpu_map)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2137	{
				2138	int err;
				2139
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2140	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
Peter Zijlstra	1676330	2017-04-25 14:31:11 +0200	[diff] [blame]	2141	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
Peter Zijlstra	8d5dc51	2017-04-25 15:29:40 +0200	[diff] [blame]	2142	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
				2143
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2144	arch_update_cpu_topology();
				2145	ndoms_cur = 1;
				2146	doms_cur = alloc_sched_domains(ndoms_cur);
				2147	if (!doms_cur)
				2148	doms_cur = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2149	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2150	err = build_sched_domains(doms_cur[0], NULL);
				2151	register_sched_domain_sysctl();
				2152
				2153	return err;
				2154	}
				2155
				2156	/*
				2157	* Detach sched domains from a group of CPUs specified in cpu_map
				2158	* These CPUs will now be attached to the NULL domain
				2159	*/
				2160	static void detach_destroy_domains(const struct cpumask *cpu_map)
				2161	{
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2162	unsigned int cpu = cpumask_any(cpu_map);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2163	int i;
				2164
Valentin Schneider	e284df7	2019-10-23 16:37:45 +0100	[diff] [blame]	2165	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
				2166	static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
				2167
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2168	rcu_read_lock();
				2169	for_each_cpu(i, cpu_map)
				2170	cpu_attach_domain(NULL, &def_root_domain, i);
				2171	rcu_read_unlock();
				2172	}
				2173
				2174	/* handle null as "default" */
				2175	static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
				2176	struct sched_domain_attr *new, int idx_new)
				2177	{
				2178	struct sched_domain_attr tmp;
				2179
				2180	/* Fast path: */
				2181	if (!new && !cur)
				2182	return 1;
				2183
				2184	tmp = SD_ATTR_INIT;
Ingo Molnar	97fb7a0	2018-03-03 14:01:12 +0100	[diff] [blame]	2185
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2186	return !memcmp(cur ? (cur + idx_cur) : &tmp,
				2187	new ? (new + idx_new) : &tmp,
				2188	sizeof(struct sched_domain_attr));
				2189	}
				2190
				2191	/*
				2192	* Partition sched domains as specified by the 'ndoms_new'
				2193	* cpumasks in the array doms_new[] of cpumasks. This compares
				2194	* doms_new[] to the current sched domain partitioning, doms_cur[].
				2195	* It destroys each deleted domain and builds each new domain.
				2196	*
				2197	* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
				2198	* The masks don't intersect (don't overlap.) We should setup one
				2199	* sched domain for each mask. CPUs not in any of the cpumasks will
				2200	* not be load balanced. If the same cpumask appears both in the
				2201	* current 'doms_cur' domains and in the new 'doms_new', we can leave
				2202	* it as it is.
				2203	*
				2204	* The passed in 'doms_new' should be allocated using
				2205	* alloc_sched_domains. This routine takes ownership of it and will
				2206	* free_sched_domains it when done with it. If the caller failed the
				2207	* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
				2208	* and partition_sched_domains() will fallback to the single partition
				2209	* 'fallback_doms', it also forces the domains to be rebuilt.
				2210	*
				2211	* If doms_new == NULL it will be replaced with cpu_online_mask.
				2212	* ndoms_new == 0 is a special case for destroying existing domains,
				2213	* and it will not create the default domain.
				2214	*
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2215	* Call with hotplug lock and sched_domains_mutex held
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2216	*/
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2217	void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				2218	struct sched_domain_attr *dattr_new)
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2219	{
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2220	bool __maybe_unused has_eas = false;
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2221	int i, j, n;
				2222	int new_topology;
				2223
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2224	lockdep_assert_held(&sched_domains_mutex);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2225
				2226	/* Always unregister in case we don't destroy any domains: */
				2227	unregister_sched_domain_sysctl();
				2228
				2229	/* Let the architecture update CPU core mappings: */
				2230	new_topology = arch_update_cpu_topology();
				2231
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2232	if (!doms_new) {
				2233	WARN_ON_ONCE(dattr_new);
				2234	n = 0;
				2235	doms_new = alloc_sched_domains(1);
				2236	if (doms_new) {
				2237	n = 1;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2238	cpumask_and(doms_new[0], cpu_active_mask,
				2239	housekeeping_cpumask(HK_FLAG_DOMAIN));
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2240	}
				2241	} else {
				2242	n = ndoms_new;
				2243	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2244
				2245	/* Destroy deleted domains: */
				2246	for (i = 0; i < ndoms_cur; i++) {
				2247	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2248	if (cpumask_equal(doms_cur[i], doms_new[j]) &&
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2249	dattrs_equal(dattr_cur, i, dattr_new, j)) {
				2250	struct root_domain *rd;
				2251
				2252	/*
				2253	* This domain won't be destroyed and as such
				2254	* its dl_bw->total_bw needs to be cleared. It
				2255	* will be recomputed in function
				2256	* update_tasks_root_domain().
				2257	*/
				2258	rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				2259	dl_clear_root_domain(rd);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2260	goto match1;
Mathieu Poirier	f9a25f7	2019-07-19 15:59:55 +0200	[diff] [blame]	2261	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2262	}
				2263	/* No match - a current sched domain not in new doms_new[] */
				2264	detach_destroy_domains(doms_cur[i]);
				2265	match1:
				2266	;
				2267	}
				2268
				2269	n = ndoms_cur;
Peter Zijlstra	09e0dd8	2017-08-08 12:16:24 +0200	[diff] [blame]	2270	if (!doms_new) {
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2271	n = 0;
				2272	doms_new = &fallback_doms;
Frederic Weisbecker	edb9382	2017-10-27 04:42:37 +0200	[diff] [blame]	2273	cpumask_and(doms_new[0], cpu_active_mask,
				2274	housekeeping_cpumask(HK_FLAG_DOMAIN));
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2275	}
				2276
				2277	/* Build new domains: */
				2278	for (i = 0; i < ndoms_new; i++) {
				2279	for (j = 0; j < n && !new_topology; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2280	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
				2281	dattrs_equal(dattr_new, i, dattr_cur, j))
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2282	goto match2;
				2283	}
				2284	/* No match - add a new doms_new */
				2285	build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
				2286	match2:
				2287	;
				2288	}
				2289
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2290	#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2291	/* Build perf. domains: */
				2292	for (i = 0; i < ndoms_new; i++) {
Quentin Perret	531b5c9	2018-12-03 09:56:21 +0000	[diff] [blame]	2293	for (j = 0; j < n && !sched_energy_update; j++) {
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2294	if (cpumask_equal(doms_new[i], doms_cur[j]) &&
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2295	cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				2296	has_eas = true;
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2297	goto match3;
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2298	}
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2299	}
				2300	/* No match - add perf. domains for a new rd */
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2301	has_eas \|= build_perf_domains(doms_new[i]);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2302	match3:
				2303	;
				2304	}
Quentin Perret	1f74de8	2018-12-03 09:56:22 +0000	[diff] [blame]	2305	sched_energy_set(has_eas);
Quentin Perret	6aa140f	2018-12-03 09:56:18 +0000	[diff] [blame]	2306	#endif
				2307
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2308	/* Remember the new sched domains: */
				2309	if (doms_cur != &fallback_doms)
				2310	free_sched_domains(doms_cur, ndoms_cur);
				2311
				2312	kfree(dattr_cur);
				2313	doms_cur = doms_new;
				2314	dattr_cur = dattr_new;
				2315	ndoms_cur = ndoms_new;
				2316
				2317	register_sched_domain_sysctl();
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2318	}
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2319
Mathieu Poirier	c22645f	2019-07-19 15:59:53 +0200	[diff] [blame]	2320	/*
				2321	* Call with hotplug lock held
				2322	*/
				2323	void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				2324	struct sched_domain_attr *dattr_new)
				2325	{
				2326	mutex_lock(&sched_domains_mutex);
				2327	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
Ingo Molnar	f2cb136	2017-02-01 13:10:18 +0100	[diff] [blame]	2328	mutex_unlock(&sched_domains_mutex);
				2329	}