sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd
Currently update_sd_pick_busiest only identifies the busiest sd
that is either overloaded, or has a group imbalance. When no
sd is imbalanced or overloaded, the load balancer fails to find
the busiest domain.
This breaks load balancing between domains that are not overloaded,
in the !SD_ASYM_PACKING case. This patch makes update_sd_pick_busiest
return true when the busiest sd yet is encountered.
Groups are ranked in the order overloaded > imbalanced > other,
with higher ranked groups getting priority even when their load
is lower. This is necessary due to the possibility of unequal
capacities and cpumasks between domains within a sched group.
Behaviour for SD_ASYM_PACKING does not seem to match the comment,
but I have no hardware to test that so I have left the behaviour
of that code unchanged.
Enum for group classification suggested by Peter Zijlstra.
Signed-off-by: Rik van Riel <riel@redhat.com>
[peterz: replaced sg_lb_stats::group_imb with the new enum group_type
in an attempt to avoid endless recalculation]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Michael Neuling <mikey@neuling.org>
Cc: ktkhai@parallels.com
Cc: tim.c.chen@linux.intel.com
Cc: nicolas.pitre@linaro.org
Cc: jhladky@redhat.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20140729152743.GI3935@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9477e6..9437725 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5559,6 +5559,13 @@
#endif
/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -5572,7 +5579,7 @@
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
+ enum group_type group_type;
int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -5610,6 +5617,8 @@
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
},
};
}
@@ -5891,6 +5900,18 @@
return capacity_factor;
}
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -5942,9 +5963,8 @@
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
-
- sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
+ sgs->group_type = group_classify(group, sgs);
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
@@ -5968,13 +5988,19 @@
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->busiest_stat.avg_load)
- return false;
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_type > busiest->group_type)
return true;
- if (sgs->group_imb)
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/*
@@ -5982,8 +6008,7 @@
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -6228,7 +6253,7 @@
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (busiest->group_imb) {
+ if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,7 +6273,7 @@
return fix_small_imbalance(env, sds);
}
- if (busiest->sum_nr_running > busiest->group_capacity_factor) {
+ if (busiest->group_type == group_overloaded) {
/*
* Don't want to pull so many tasks that a group would go idle.
* Except of course for the group_imb case, since then we might
@@ -6337,7 +6362,7 @@
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (busiest->group_imb)
+ if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */