blob: 6c28298b391af56b0ba8c1250e5b23461c04713d [file] [log] [blame]
/* Copyright (c) 2012-2017, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Implementation credits: Srivatsa Vaddagiri, Steve Muckle
* Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti
* and Vikram Mulukutla
*/
#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/syscore_ops.h>
#include "sched.h"
#include "walt.h"
#include <trace/events/sched.h>
#define CSTATE_LATENCY_GRANULARITY_SHIFT (6)
inline void clear_ed_task(struct task_struct *p, struct rq *rq)
{
if (p == rq->ed_task)
rq->ed_task = NULL;
}
inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
{
p->last_switch_out_ts = wallclock;
}
/*
* Note C-state for (idle) cpus.
*
* @cstate = cstate index, 0 -> active state
* @wakeup_energy = energy spent in waking up cpu
* @wakeup_latency = latency to wakeup from cstate
*
*/
void
sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
{
struct rq *rq = cpu_rq(cpu);
rq->cstate = cstate; /* C1, C2 etc */
rq->wakeup_energy = wakeup_energy;
/* disregard small latency delta (64 us). */
rq->wakeup_latency = ((wakeup_latency >>
CSTATE_LATENCY_GRANULARITY_SHIFT) <<
CSTATE_LATENCY_GRANULARITY_SHIFT);
}
/*
* Note D-state for (idle) cluster.
*
* @dstate = dstate index, 0 -> active state
* @wakeup_energy = energy spent in waking up cluster
* @wakeup_latency = latency to wakeup from cluster
*
*/
void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
int wakeup_energy, int wakeup_latency)
{
struct sched_cluster *cluster =
cpu_rq(cpumask_first(cluster_cpus))->cluster;
cluster->dstate = dstate;
cluster->dstate_wakeup_energy = wakeup_energy;
cluster->dstate_wakeup_latency = wakeup_latency;
}
u32 __weak get_freq_max_load(int cpu, u32 freq)
{
/* 100% by default */
return 100;
}
struct freq_max_load_entry {
/* The maximum load which has accounted governor's headroom. */
u64 hdemand;
};
struct freq_max_load {
struct rcu_head rcu;
int length;
struct freq_max_load_entry freqs[0];
};
static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
static DEFINE_SPINLOCK(freq_max_load_lock);
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
}
int sched_update_freq_max_load(const cpumask_t *cpumask)
{
int i, cpu, ret;
unsigned int freq;
struct cpu_pstate_pwr *costs;
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
struct freq_max_load *max_load, *old_max_load;
struct freq_max_load_entry *entry;
u64 max_demand_capacity, max_demand;
unsigned long flags;
u32 hfreq;
int hpct;
if (!per_cpu_info)
return 0;
spin_lock_irqsave(&freq_max_load_lock, flags);
max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
for_each_cpu(cpu, cpumask) {
if (!per_cpu_info[cpu].ptable) {
ret = -EINVAL;
goto fail;
}
old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
/*
* allocate len + 1 and leave the last power cost as 0 for
* power_cost() can stop iterating index when
* per_cpu_info[cpu].len > len of max_load due to race between
* cpu power stats update and get_cpu_pwr_stats().
*/
max_load = kzalloc(sizeof(struct freq_max_load) +
sizeof(struct freq_max_load_entry) *
(per_cpu_info[cpu].len + 1), GFP_ATOMIC);
if (unlikely(!max_load)) {
ret = -ENOMEM;
goto fail;
}
max_load->length = per_cpu_info[cpu].len;
max_demand = max_demand_capacity *
cpu_max_possible_capacity(cpu);
i = 0;
costs = per_cpu_info[cpu].ptable;
while (costs[i].freq) {
entry = &max_load->freqs[i];
freq = costs[i].freq;
hpct = get_freq_max_load(cpu, freq);
if (hpct <= 0 || hpct > 100)
hpct = 100;
hfreq = div64_u64((u64)freq * hpct, 100);
entry->hdemand =
div64_u64(max_demand * hfreq,
cpu_max_possible_freq(cpu));
i++;
}
rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
if (old_max_load)
kfree_rcu(old_max_load, rcu);
}
spin_unlock_irqrestore(&freq_max_load_lock, flags);
return 0;
fail:
for_each_cpu(cpu, cpumask) {
max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
if (max_load) {
rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
kfree_rcu(max_load, rcu);
}
}
spin_unlock_irqrestore(&freq_max_load_lock, flags);
return ret;
}
unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
{
struct rq *rq = cpu_rq(cpu);
rq->static_cpu_pwr_cost = cost;
return 0;
}
unsigned int sched_get_static_cpu_pwr_cost(int cpu)
{
return cpu_rq(cpu)->static_cpu_pwr_cost;
}
int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
{
struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
cluster->static_cluster_pwr_cost = cost;
return 0;
}
unsigned int sched_get_static_cluster_pwr_cost(int cpu)
{
return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
}
int sched_set_cluster_wake_idle(int cpu, unsigned int wake_idle)
{
struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
cluster->wake_up_idle = !!wake_idle;
return 0;
}
unsigned int sched_get_cluster_wake_idle(int cpu)
{
return cpu_rq(cpu)->cluster->wake_up_idle;
}
/*
* Tasks that are runnable continuously for a period greather than
* EARLY_DETECTION_DURATION can be flagged early as potential
* high load tasks.
*/
#define EARLY_DETECTION_DURATION 9500000
/*
* For increase, send notification if
* freq_required - cur_freq > sysctl_sched_freq_inc_notify
*/
__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
/*
* For decrease, send notification if
* cur_freq - freq_required > sysctl_sched_freq_dec_notify
*/
__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
/* Maximum allowed threshold before freq aggregation must be enabled */
#define MAX_FREQ_AGGR_THRESH 1000
#define for_each_related_thread_group(grp) \
list_for_each_entry(grp, &active_related_thread_groups, list)
/* Size of bitmaps maintained to track top tasks */
static const unsigned int top_tasks_bitmap_size =
BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
* rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
*/
unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
/*
* Place sync wakee tasks those have less than configured demand to the waker's
* cluster.
*/
unsigned int __read_mostly sched_small_wakee_task_load;
unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
unsigned int __read_mostly sched_big_waker_task_load;
unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
* load higher than this level, tasks becomes eligible for inter
* cluster migration.
*/
unsigned int __read_mostly sched_spill_load;
unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
/*
* Prefer the waker CPU for sync wakee task, if the CPU has only 1 runnable
* task. This eliminates the LPM exit latency associated with the idle
* CPUs in the waker cluster.
*/
unsigned int __read_mostly sysctl_sched_prefer_sync_wakee_to_waker;
/*
* Tasks whose bandwidth consumption on a cpu is more than
* sched_upmigrate are considered "big" tasks. Big tasks will be
* considered for "up" migration, i.e migrating to a cpu with better
* capacity.
*/
unsigned int __read_mostly sched_upmigrate;
unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
/*
* Big tasks, once migrated, will need to drop their bandwidth
* consumption to less than sched_downmigrate before they are "down"
* migrated.
*/
unsigned int __read_mostly sched_downmigrate;
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
/*
* The load scale factor of a CPU gets boosted when its max frequency
* is restricted due to which the tasks are migrating to higher capacity
* CPUs early. The sched_upmigrate threshold is auto-upgraded by
* rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
*/
unsigned int up_down_migrate_scale_factor = 1024;
/*
* Scheduler selects and places task to its previous CPU if sleep time is
* less than sysctl_sched_select_prev_cpu_us.
*/
unsigned int __read_mostly
sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
unsigned int __read_mostly
sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
/*
* Scheduler tries to avoid waking up idle CPUs for tasks running
* in short bursts. If the task average burst is less than
* sysctl_sched_short_burst nanoseconds and it sleeps on an average
* for more than sysctl_sched_short_sleep nanoseconds, then the
* task is eligible for packing.
*/
unsigned int __read_mostly sysctl_sched_short_burst;
unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
static void _update_up_down_migrate(unsigned int *up_migrate,
unsigned int *down_migrate, bool is_group)
{
unsigned int delta;
if (up_down_migrate_scale_factor == 1024)
return;
delta = *up_migrate - *down_migrate;
*up_migrate /= NSEC_PER_USEC;
*up_migrate *= up_down_migrate_scale_factor;
*up_migrate >>= 10;
*up_migrate *= NSEC_PER_USEC;
if (!is_group)
*up_migrate = min(*up_migrate, sched_ravg_window);
*down_migrate /= NSEC_PER_USEC;
*down_migrate *= up_down_migrate_scale_factor;
*down_migrate >>= 10;
*down_migrate *= NSEC_PER_USEC;
*down_migrate = min(*down_migrate, *up_migrate - delta);
}
static void update_up_down_migrate(void)
{
unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
_update_up_down_migrate(&up_migrate, &down_migrate, false);
sched_upmigrate = up_migrate;
sched_downmigrate = down_migrate;
up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
_update_up_down_migrate(&up_migrate, &down_migrate, true);
sched_group_upmigrate = up_migrate;
sched_group_downmigrate = down_migrate;
}
void set_hmp_defaults(void)
{
sched_spill_load =
pct_to_real(sysctl_sched_spill_load_pct);
update_up_down_migrate();
sched_init_task_load_windows =
div64_u64((u64)sysctl_sched_init_task_load_pct *
(u64)sched_ravg_window, 100);
sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
NSEC_PER_USEC;
sched_small_wakee_task_load =
div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
(u64)sched_ravg_window, 100);
sched_big_waker_task_load =
div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
(u64)sched_ravg_window, 100);
sched_freq_aggregate_threshold =
pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
}
#ifdef CONFIG_CGROUP_SCHED
int upmigrate_discouraged(struct task_struct *p)
{
return task_group(p)->upmigrate_discouraged;
}
#else
static inline int upmigrate_discouraged(struct task_struct *p)
{
return 0;
}
#endif
/* Is a task "big" on its current cpu */
static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
{
int nice = task_nice(p);
if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
return 0;
return scaled_load > sched_upmigrate;
}
int is_big_task(struct task_struct *p)
{
return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
}
u64 cpu_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
}
u64 cpu_load_sync(int cpu, int sync)
{
return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
}
/*
* Task will fit on a cpu if it's bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
* "up" migrated will be considered fitting on "little" cpu if its
* bandwidth consumption on "little" cpu will be less than
* sched_downmigrate. This will help avoid frequenty migrations for
* tasks with load close to the upmigrate threshold
*/
int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
enum sched_boost_policy boost_policy)
{
int upmigrate = sched_upmigrate;
if (cpu_capacity(cpu) == max_capacity)
return 1;
if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
upmigrate = sched_downmigrate;
if (boost_policy != SCHED_BOOST_ON_BIG) {
if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
upmigrate_discouraged(p))
return 1;
if (task_load < upmigrate)
return 1;
} else {
if (task_sched_boost(p) || task_load >= upmigrate)
return 0;
return 1;
}
return 0;
}
int task_will_fit(struct task_struct *p, int cpu)
{
u64 tload = scale_load_to_cpu(task_load(p), cpu);
return task_load_will_fit(p, tload, cpu, sched_boost_policy());
}
/*
* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
* the CPU.
*/
unsigned int power_cost(int cpu, u64 demand)
{
int first, mid, last;
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
struct cpu_pstate_pwr *costs;
struct freq_max_load *max_load;
int total_static_pwr_cost = 0;
struct rq *rq = cpu_rq(cpu);
unsigned int pc;
if (!per_cpu_info || !per_cpu_info[cpu].ptable)
/*
* When power aware scheduling is not in use, or CPU
* power data is not available, just use the CPU
* capacity as a rough stand-in for real CPU power
* numbers, assuming bigger CPUs are more power
* hungry.
*/
return cpu_max_possible_capacity(cpu);
rcu_read_lock();
max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
if (!max_load) {
pc = cpu_max_possible_capacity(cpu);
goto unlock;
}
costs = per_cpu_info[cpu].ptable;
if (demand <= max_load->freqs[0].hdemand) {
pc = costs[0].power;
goto unlock;
} else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
pc = costs[max_load->length - 1].power;
goto unlock;
}
first = 0;
last = max_load->length - 1;
mid = (last - first) >> 1;
while (1) {
if (demand <= max_load->freqs[mid].hdemand)
last = mid;
else
first = mid;
if (last - first == 1)
break;
mid = first + ((last - first) >> 1);
}
pc = costs[last].power;
unlock:
rcu_read_unlock();
if (idle_cpu(cpu) && rq->cstate) {
total_static_pwr_cost += rq->static_cpu_pwr_cost;
if (rq->cluster->dstate)
total_static_pwr_cost +=
rq->cluster->static_cluster_pwr_cost;
}
return pc + total_static_pwr_cost;
}
struct sched_cluster *rq_cluster(struct rq *rq)
{
return rq->cluster;
}
/*
* reset_cpu_hmp_stats - reset HMP stats for a cpu
* nr_big_tasks
* cumulative_runnable_avg (iff reset_cra is true)
*/
void reset_cpu_hmp_stats(int cpu, int reset_cra)
{
reset_cfs_rq_hmp_stats(cpu, reset_cra);
reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
}
void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
struct task_struct *p, s64 delta)
{
u64 new_task_load;
u64 old_task_load;
if (sched_disable_window_stats)
return;
old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
stats->nr_big_tasks--;
else if (!__is_big_task(p, old_task_load) &&
__is_big_task(p, new_task_load))
stats->nr_big_tasks++;
BUG_ON(stats->nr_big_tasks < 0);
}
/*
* Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
*/
static void update_nr_big_tasks(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *p;
/* Do not reset cumulative_runnable_avg */
reset_cpu_hmp_stats(cpu, 0);
list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
inc_hmp_sched_stats_fair(rq, p, 0);
}
/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
void pre_big_task_count_change(const struct cpumask *cpus)
{
int i;
local_irq_disable();
for_each_cpu(i, cpus)
raw_spin_lock(&cpu_rq(i)->lock);
}
/*
* Reinitialize 'nr_big_tasks' counters on all affected cpus
*/
void post_big_task_count_change(const struct cpumask *cpus)
{
int i;
/* Assumes local_irq_disable() keeps online cpumap stable */
for_each_cpu(i, cpus)
update_nr_big_tasks(i);
for_each_cpu(i, cpus)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_enable();
}
static inline int invalid_value_freq_input(unsigned int *data)
{
if (data == &sysctl_sched_freq_aggregate)
return !(*data == 0 || *data == 1);
return 0;
}
static inline int invalid_value(unsigned int *data)
{
unsigned int val = *data;
if (data == &sysctl_sched_ravg_hist_size)
return (val < 2 || val > RAVG_HIST_SIZE_MAX);
if (data == &sysctl_sched_window_stats_policy)
return val >= WINDOW_STATS_INVALID_POLICY;
return invalid_value_freq_input(data);
}
/*
* Handle "atomic" update of sysctl_sched_window_stats_policy,
* sysctl_sched_ravg_hist_size variables.
*/
int sched_window_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
unsigned int old_val;
mutex_lock(&policy_mutex);
old_val = *data;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (write && (old_val == *data)))
goto done;
if (invalid_value(data)) {
*data = old_val;
ret = -EINVAL;
goto done;
}
reset_all_window_stats(0, 0);
done:
mutex_unlock(&policy_mutex);
return ret;
}
/*
* Convert percentage value into absolute form. This will avoid div() operation
* in fast path, to convert task load in percentage scale.
*/
int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int old_val;
unsigned int *data = (unsigned int *)table->data;
int update_task_count = 0;
/*
* The policy mutex is acquired with cpu_hotplug.lock
* held from cpu_up()->cpufreq_governor_interactive()->
* sched_set_window(). So enforce the same order here.
*/
if (write && (data == &sysctl_sched_upmigrate_pct)) {
update_task_count = 1;
get_online_cpus();
}
mutex_lock(&policy_mutex);
old_val = *data;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (write && (old_val == *data))
goto done;
if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
sysctl_sched_group_downmigrate_pct >
sysctl_sched_group_upmigrate_pct) {
*data = old_val;
ret = -EINVAL;
goto done;
}
/*
* Big task tunable change will need to re-classify tasks on
* runqueue as big and set their counters appropriately.
* sysctl interface affects secondary variables (*_pct), which is then
* "atomically" carried over to the primary variables. Atomic change
* includes taking runqueue lock of all online cpus and re-initiatizing
* their big counter values based on changed criteria.
*/
if (update_task_count)
pre_big_task_count_change(cpu_online_mask);
set_hmp_defaults();
if (update_task_count)
post_big_task_count_change(cpu_online_mask);
done:
mutex_unlock(&policy_mutex);
if (update_task_count)
put_online_cpus();
return ret;
}
inline int nr_big_tasks(struct rq *rq)
{
return rq->hmp_stats.nr_big_tasks;
}
unsigned int cpu_temp(int cpu)
{
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
if (per_cpu_info)
return per_cpu_info[cpu].temp;
else
return 0;
}
void free_task_load_ptrs(struct task_struct *p)
{
kfree(p->ravg.curr_window_cpu);
kfree(p->ravg.prev_window_cpu);
/*
* update_task_ravg() can be called for exiting tasks. While the
* function itself ensures correct behavior, the corresponding
* trace event requires that these pointers be NULL.
*/
p->ravg.curr_window_cpu = NULL;
p->ravg.prev_window_cpu = NULL;
}
/* Return task demand in percentage scale */
unsigned int pct_task_load(struct task_struct *p)
{
unsigned int load;
load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
return load;
}
static int __init set_sched_ravg_window(char *str)
{
unsigned int window_size;
get_option(&str, &window_size);
if (window_size < MIN_SCHED_RAVG_WINDOW ||
window_size > MAX_SCHED_RAVG_WINDOW) {
WARN_ON(1);
return -EINVAL;
}
sched_ravg_window = window_size;
return 0;
}
early_param("sched_ravg_window", set_sched_ravg_window);
#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
{
u32 freq;
freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
delta *= rq->cluster->exec_scale_factor;
delta >>= 10;
return delta;
}
/* Does freq_required sufficiently exceed or fall behind cur_freq? */
static inline int
nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
int delta = freq_required - cur_freq;
if (freq_required > cur_freq)
return delta < sysctl_sched_freq_inc_notify;
delta = -delta;
return delta < sysctl_sched_freq_dec_notify;
}
/* Convert busy time to frequency equivalent */
static inline unsigned int load_to_freq(struct rq *rq, u64 load)
{
unsigned int freq;
load = scale_load_to_cpu(load, cpu_of(rq));
load *= 128;
load = div64_u64(load, max_task_load());
freq = load * cpu_max_possible_freq(cpu_of(rq));
freq /= 128;
return freq;
}
/*
* Return load from all related groups in given frequency domain.
*/
static void group_load_in_freq_domain(struct cpumask *cpus,
u64 *grp_load, u64 *new_grp_load)
{
int j;
for_each_cpu(j, cpus) {
struct rq *rq = cpu_rq(j);
*grp_load += rq->grp_time.prev_runnable_sum;
*new_grp_load += rq->grp_time.nt_prev_runnable_sum;
}
}
/*
* Should scheduler alert governor for changing frequency?
*
* @check_pred - evaluate frequency based on the predictive demand
* @check_groups - add load from all related groups on given cpu
*
* check_groups is set to 1 if a "related" task movement/wakeup is triggering
* the notification check. To avoid "re-aggregation" of demand in such cases,
* we check whether the migrated/woken tasks demand (along with demand from
* existing tasks on the cpu) can be met on target cpu
*
*/
static int send_notification(struct rq *rq, int check_pred, int check_groups)
{
unsigned int cur_freq, freq_required;
unsigned long flags;
int rc = 0;
u64 group_load = 0, new_load = 0;
if (check_pred) {
u64 prev = rq->old_busy_time;
u64 predicted = rq->hmp_stats.pred_demands_sum;
if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
return 0;
prev = max(prev, rq->old_estimated_time);
if (prev > predicted)
return 0;
cur_freq = load_to_freq(rq, prev);
freq_required = load_to_freq(rq, predicted);
if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
return 0;
} else {
/*
* Protect from concurrent update of rq->prev_runnable_sum and
* group cpu load
*/
raw_spin_lock_irqsave(&rq->lock, flags);
if (check_groups)
group_load = rq->grp_time.prev_runnable_sum;
new_load = rq->prev_runnable_sum + group_load;
new_load = freq_policy_load(rq, new_load);
raw_spin_unlock_irqrestore(&rq->lock, flags);
cur_freq = load_to_freq(rq, rq->old_busy_time);
freq_required = load_to_freq(rq, new_load);
if (nearly_same_freq(cur_freq, freq_required))
return 0;
}
raw_spin_lock_irqsave(&rq->lock, flags);
if (!rq->cluster->notifier_sent) {
rq->cluster->notifier_sent = 1;
rc = 1;
trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
new_load);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
return rc;
}
/* Alert governor if there is a need to change frequency */
void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
{
int cpu = cpu_of(rq);
if (!send_notification(rq, check_pred, check_groups))
return;
atomic_notifier_call_chain(
&load_alert_notifier_head, 0,
(void *)(long)cpu);
}
void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
struct task_struct *p)
{
bool check_groups;
rcu_read_lock();
check_groups = task_in_related_thread_group(p);
rcu_read_unlock();
if (!same_freq_domain(src_cpu, dest_cpu)) {
if (!src_cpu_dead)
check_for_freq_change(cpu_rq(src_cpu), false,
check_groups);
check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
} else {
check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
}
}
#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
#define INC_STEP_BIG 16
/*
* bucket_increase - update the count of all buckets
*
* @buckets: array of buckets tracking busy time of a task
* @idx: the index of bucket to be incremented
*
* Each time a complete window finishes, count of bucket that runtime
* falls in (@idx) is incremented. Counts of all other buckets are
* decayed. The rate of increase and decay could be different based
* on current count in the bucket.
*/
static inline void bucket_increase(u8 *buckets, int idx)
{
int i, step;
for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
if (idx != i) {
if (buckets[i] > DEC_STEP)
buckets[i] -= DEC_STEP;
else
buckets[i] = 0;
} else {
step = buckets[i] >= CONSISTENT_THRES ?
INC_STEP_BIG : INC_STEP;
if (buckets[i] > U8_MAX - step)
buckets[i] = U8_MAX;
else
buckets[i] += step;
}
}
}
static inline int busy_to_bucket(u32 normalized_rt)
{
int bidx;
bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
/*
* Combine lowest two buckets. The lowest frequency falls into
* 2nd bucket and thus keep predicting lowest bucket is not
* useful.
*/
if (!bidx)
bidx++;
return bidx;
}
/*
* get_pred_busy - calculate predicted demand for a task on runqueue
*
* @rq: runqueue of task p
* @p: task whose prediction is being updated
* @start: starting bucket. returned prediction should not be lower than
* this bucket.
* @runtime: runtime of the task. returned prediction should not be lower
* than this runtime.
* Note: @start can be derived from @runtime. It's passed in only to
* avoid duplicated calculation in some cases.
*
* A new predicted busy time is returned for task @p based on @runtime
* passed in. The function searches through buckets that represent busy
* time equal to or bigger than @runtime and attempts to find the bucket to
* to use for prediction. Once found, it searches through historical busy
* time and returns the latest that falls into the bucket. If no such busy
* time exists, it returns the medium of that bucket.
*/
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
int start, u32 runtime)
{
int i;
u8 *buckets = p->ravg.busy_buckets;
u32 *hist = p->ravg.sum_history;
u32 dmin, dmax;
u64 cur_freq_runtime = 0;
int first = NUM_BUSY_BUCKETS, final;
u32 ret = runtime;
/* skip prediction for new tasks due to lack of history */
if (unlikely(is_new_task(p)))
goto out;
/* find minimal bucket index to pick */
for (i = start; i < NUM_BUSY_BUCKETS; i++) {
if (buckets[i]) {
first = i;
break;
}
}
/* if no higher buckets are filled, predict runtime */
if (first >= NUM_BUSY_BUCKETS)
goto out;
/* compute the bucket for prediction */
final = first;
/* determine demand range for the predicted bucket */
if (final < 2) {
/* lowest two buckets are combined */
dmin = 0;
final = 1;
} else {
dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
}
dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
/*
* search through runtime history and return first runtime that falls
* into the range of predicted bucket.
*/
for (i = 0; i < sched_ravg_hist_size; i++) {
if (hist[i] >= dmin && hist[i] < dmax) {
ret = hist[i];
break;
}
}
/* no historical runtime within bucket found, use average of the bin */
if (ret < dmin)
ret = (dmin + dmax) / 2;
/*
* when updating in middle of a window, runtime could be higher
* than all recorded history. Always predict at least runtime.
*/
ret = max(runtime, ret);
out:
trace_sched_update_pred_demand(rq, p, runtime,
mult_frac((unsigned int)cur_freq_runtime, 100,
sched_ravg_window), ret);
return ret;
}
static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
{
if (p->ravg.pred_demand >= p->ravg.curr_window)
return p->ravg.pred_demand;
return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
p->ravg.curr_window);
}
static void reset_all_task_stats(void)
{
struct task_struct *g, *p;
do_each_thread(g, p) {
reset_task_stats(p);
} while_each_thread(g, p);
}
enum reset_reason_code {
WINDOW_CHANGE,
POLICY_CHANGE,
HIST_SIZE_CHANGE,
FREQ_AGGREGATE_CHANGE,
};
const char *sched_window_reset_reasons[] = {
"WINDOW_CHANGE",
"POLICY_CHANGE",
"HIST_SIZE_CHANGE",
};
/* Called with IRQs enabled */
void reset_all_window_stats(u64 window_start, unsigned int window_size)
{
int cpu, i;
unsigned long flags;
u64 start_ts = sched_ktime_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
local_irq_save(flags);
read_lock(&tasklist_lock);
read_lock(&related_thread_group_lock);
/* Taking all runqueue locks prevents race with sched_exit(). */
for_each_possible_cpu(cpu)
raw_spin_lock(&cpu_rq(cpu)->lock);
sched_disable_window_stats = 1;
reset_all_task_stats();
read_unlock(&tasklist_lock);
if (window_size) {
sched_ravg_window = window_size * TICK_NSEC;
set_hmp_defaults();
sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
}
sched_disable_window_stats = 0;
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
if (window_start)
rq->window_start = window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
memset(&rq->load_subs[i], 0,
sizeof(struct load_subtractions));
clear_top_tasks_table(rq->top_tasks[i]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
}
rq->curr_table = 0;
rq->curr_top = 0;
rq->prev_top = 0;
reset_cpu_hmp_stats(cpu, 1);
}
if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
reason = POLICY_CHANGE;
old = sched_window_stats_policy;
new = sysctl_sched_window_stats_policy;
sched_window_stats_policy = sysctl_sched_window_stats_policy;
} else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
reason = HIST_SIZE_CHANGE;
old = sched_ravg_hist_size;
new = sysctl_sched_ravg_hist_size;
sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
} else if (sched_freq_aggregate !=
sysctl_sched_freq_aggregate) {
reason = FREQ_AGGREGATE_CHANGE;
old = sched_freq_aggregate;
new = sysctl_sched_freq_aggregate;
sched_freq_aggregate = sysctl_sched_freq_aggregate;
}
for_each_possible_cpu(cpu)
raw_spin_unlock(&cpu_rq(cpu)->lock);
read_unlock(&related_thread_group_lock);
local_irq_restore(flags);
trace_sched_reset_all_window_stats(window_start, window_size,
sched_ktime_clock() - start_ts, reason, old, new);
}
void sched_get_cpus_busy(struct sched_load *busy,
const struct cpumask *query_cpus)
{
unsigned long flags;
struct rq *rq;
const int cpus = cpumask_weight(query_cpus);
u64 load[cpus], group_load[cpus];
u64 nload[cpus], ngload[cpus];
u64 pload[cpus];
unsigned int max_freq[cpus];
int notifier_sent = 0;
int early_detection[cpus];
int cpu, i = 0;
unsigned int window_size;
u64 max_prev_sum = 0;
int max_busy_cpu = cpumask_first(query_cpus);
u64 total_group_load = 0, total_ngload = 0;
bool aggregate_load = false;
struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));
if (unlikely(cpus == 0))
return;
local_irq_save(flags);
/*
* This function could be called in timer context, and the
* current task may have been executing for a long time. Ensure
* that the window stats are current by doing an update.
*/
for_each_cpu(cpu, query_cpus)
raw_spin_lock(&cpu_rq(cpu)->lock);
window_size = sched_ravg_window;
/*
* We don't really need the cluster lock for this entire for loop
* block. However, there is no advantage in optimizing this as rq
* locks are held regardless and would prevent migration anyways
*/
raw_spin_lock(&cluster->load_lock);
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
0);
/*
* Ensure that we don't report load for 'cpu' again via the
* cpufreq_update_util path in the window that started at
* rq->window_start
*/
rq->load_reported_window = rq->window_start;
account_load_subtractions(rq);
load[i] = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
pload[i] = rq->hmp_stats.pred_demands_sum;
rq->old_estimated_time = pload[i];
if (load[i] > max_prev_sum) {
max_prev_sum = load[i];
max_busy_cpu = cpu;
}
/*
* sched_get_cpus_busy() is called for all CPUs in a
* frequency domain. So the notifier_sent flag per
* cluster works even when a frequency domain spans
* more than 1 cluster.
*/
if (rq->cluster->notifier_sent) {
notifier_sent = 1;
rq->cluster->notifier_sent = 0;
}
early_detection[i] = (rq->ed_task != NULL);
max_freq[i] = cpu_max_freq(cpu);
i++;
}
raw_spin_unlock(&cluster->load_lock);
group_load_in_freq_domain(
&cpu_rq(max_busy_cpu)->freq_domain_cpumask,
&total_group_load, &total_ngload);
aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
i = 0;
for_each_cpu(cpu, query_cpus) {
group_load[i] = 0;
ngload[i] = 0;
if (early_detection[i])
goto skip_early;
rq = cpu_rq(cpu);
if (aggregate_load) {
if (cpu == max_busy_cpu) {
group_load[i] = total_group_load;
ngload[i] = total_ngload;
}
} else {
group_load[i] = rq->grp_time.prev_runnable_sum;
ngload[i] = rq->grp_time.nt_prev_runnable_sum;
}
load[i] += group_load[i];
nload[i] += ngload[i];
load[i] = freq_policy_load(rq, load[i]);
rq->old_busy_time = load[i];
/*
* Scale load in reference to cluster max_possible_freq.
*
* Note that scale_load_to_cpu() scales load in reference to
* the cluster max_freq.
*/
load[i] = scale_load_to_cpu(load[i], cpu);
nload[i] = scale_load_to_cpu(nload[i], cpu);
pload[i] = scale_load_to_cpu(pload[i], cpu);
skip_early:
i++;
}
for_each_cpu(cpu, query_cpus)
raw_spin_unlock(&(cpu_rq(cpu))->lock);
local_irq_restore(flags);
i = 0;
for_each_cpu(cpu, query_cpus) {
rq = cpu_rq(cpu);
if (early_detection[i]) {
busy[i].prev_load = div64_u64(sched_ravg_window,
NSEC_PER_USEC);
busy[i].new_task_load = 0;
busy[i].predicted_load = 0;
goto exit_early;
}
load[i] = scale_load_to_freq(load[i], max_freq[i],
cpu_max_possible_freq(cpu));
nload[i] = scale_load_to_freq(nload[i], max_freq[i],
cpu_max_possible_freq(cpu));
pload[i] = scale_load_to_freq(pload[i], max_freq[i],
rq->cluster->max_possible_freq);
busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
exit_early:
trace_sched_get_busy(cpu, busy[i].prev_load,
busy[i].new_task_load,
busy[i].predicted_load,
early_detection[i]);
i++;
}
}
int sched_set_window(u64 window_start, unsigned int window_size)
{
u64 now, cur_jiffies, jiffy_ktime_ns;
s64 ws;
unsigned long flags;
if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)
return -EINVAL;
mutex_lock(&policy_mutex);
/*
* Get a consistent view of ktime, jiffies, and the time
* since the last jiffy (based on last_jiffies_update).
*/
local_irq_save(flags);
cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
local_irq_restore(flags);
/* translate window_start from jiffies to nanoseconds */
ws = (window_start - cur_jiffies); /* jiffy difference */
ws *= TICK_NSEC;
ws += jiffy_ktime_ns;
/*
* Roll back calculated window start so that it is in
* the past (window stats must have a current window).
*/
while (ws > now)
ws -= (window_size * TICK_NSEC);
BUG_ON(sched_ktime_clock() < ws);
reset_all_window_stats(ws, window_size);
sched_update_freq_max_load(cpu_possible_mask);
mutex_unlock(&policy_mutex);
return 0;
}
static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
{
rq->load_subs[index].window_start = ws;
rq->load_subs[index].subs = 0;
rq->load_subs[index].new_subs = 0;
}
#define sched_up_down_migrate_auto_update 1
static void check_for_up_down_migrate_update(const struct cpumask *cpus)
{
int i = cpumask_first(cpus);
if (!sched_up_down_migrate_auto_update)
return;
if (cpu_max_possible_capacity(i) == max_possible_capacity)
return;
if (cpu_max_possible_freq(i) == cpu_max_freq(i))
up_down_migrate_scale_factor = 1024;
else
up_down_migrate_scale_factor = (1024 *
cpu_max_possible_freq(i)) / cpu_max_freq(i);
update_up_down_migrate();
}
void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
struct sched_cluster *cluster;
struct cpumask cpumask;
cpumask_copy(&cpumask, cpus);
pre_big_task_count_change(cpu_possible_mask);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
cluster->capacity = compute_capacity(cluster);
cluster->load_scale_factor = compute_load_scale_factor(cluster);
/* 'cpus' can contain cpumask more than one cluster */
check_for_up_down_migrate_update(&cluster->cpus);
}
__update_min_max_capacity();
post_big_task_count_change(cpu_possible_mask);
}
static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
{
struct cpumask cpumask;
struct sched_cluster *cluster;
int i, update_capacity = 0;
unsigned long flags;
spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
cpumask_copy(&cpumask, cpus);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
update_capacity += (cluster->max_mitigated_freq != fmax);
cluster->max_mitigated_freq = fmax;
}
spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
if (update_capacity)
update_cpu_cluster_capacity(cpus);
}
static int cpufreq_notifier_trans(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
unsigned int cpu = freq->cpu, new_freq = freq->new;
unsigned long flags;
struct sched_cluster *cluster;
struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
int i, j;
if (val != CPUFREQ_POSTCHANGE)
return 0;
BUG_ON(!new_freq);
if (cpu_cur_freq(cpu) == new_freq)
return 0;
for_each_cpu(i, &policy_cpus) {
cluster = cpu_rq(i)->cluster;
for_each_cpu(j, &cluster->cpus) {
struct rq *rq = cpu_rq(j);
raw_spin_lock_irqsave(&rq->lock, flags);
update_task_ravg(rq->curr, rq, TASK_UPDATE,
sched_ktime_clock(), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
cluster->cur_freq = new_freq;
cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
}
return 0;
}
static int pwr_stats_ready_notifier(struct notifier_block *nb,
unsigned long cpu, void *data)
{
cpumask_t mask = CPU_MASK_NONE;
cpumask_set_cpu(cpu, &mask);
sched_update_freq_max_load(&mask);
mutex_lock(&cluster_lock);
sort_clusters();
mutex_unlock(&cluster_lock);
return 0;
}
static struct notifier_block notifier_trans_block = {
.notifier_call = cpufreq_notifier_trans
};
static struct notifier_block notifier_pwr_stats_ready = {
.notifier_call = pwr_stats_ready_notifier
};
int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
{
return -EINVAL;
}
static int register_sched_callback(void)
{
cpufreq_register_notifier(&notifier_trans_block,
CPUFREQ_TRANSITION_NOTIFIER);
register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
return 0;
}
/*
* cpufreq callbacks can be registered at core_initcall or later time.
* Any registration done prior to that is "forgotten" by cpufreq. See
* initialization of variable init_cpufreq_transition_notifier_list_called
* for further information.
*/
core_initcall(register_sched_callback);
void update_avg_burst(struct task_struct *p)
{
update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
p->ravg.curr_burst = 0;
}
void note_task_waking(struct task_struct *p, u64 wallclock)
{
u64 sleep_time = wallclock - p->last_switch_out_ts;
p->last_wake_ts = wallclock;
update_avg(&p->ravg.avg_sleep_time, sleep_time);
}
#ifdef CONFIG_CGROUP_SCHED
u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct task_group *tg = css_tg(css);
return tg->upmigrate_discouraged;
}
int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 upmigrate_discourage)
{
struct task_group *tg = css_tg(css);
int discourage = upmigrate_discourage > 0;
if (tg->upmigrate_discouraged == discourage)
return 0;
/*
* Revisit big-task classification for tasks of this cgroup. It would
* have been efficient to walk tasks of just this cgroup in running
* state, but we don't have easy means to do that. Walk all tasks in
* running state on all cpus instead and re-visit their big task
* classification.
*/
get_online_cpus();
pre_big_task_count_change(cpu_online_mask);
tg->upmigrate_discouraged = discourage;
post_big_task_count_change(cpu_online_mask);
put_online_cpus();
return 0;
}
#endif /* CONFIG_CGROUP_SCHED */