| /* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License version 2 and |
| * only version 2 as published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * Implementation credits: Srivatsa Vaddagiri, Steve Muckle |
| * Syed Rameez Mustafa, Olav Haugan, Joonwoo Park, Pavan Kumar Kondeti |
| * and Vikram Mulukutla |
| */ |
| |
| #include <linux/cpufreq.h> |
| #include <linux/list_sort.h> |
| #include <linux/syscore_ops.h> |
| |
| #include "sched.h" |
| |
| #include <trace/events/sched.h> |
| |
| const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", |
| "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"}; |
| |
| const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP", |
| "RQ_TO_RQ", "GROUP_TO_GROUP"}; |
| |
| |
| static ktime_t ktime_last; |
| static bool sched_ktime_suspended; |
| |
| static bool use_cycle_counter; |
| static struct cpu_cycle_counter_cb cpu_cycle_counter_cb; |
| |
| u64 sched_ktime_clock(void) |
| { |
| if (unlikely(sched_ktime_suspended)) |
| return ktime_to_ns(ktime_last); |
| return ktime_get_ns(); |
| } |
| |
| static void sched_resume(void) |
| { |
| sched_ktime_suspended = false; |
| } |
| |
| static int sched_suspend(void) |
| { |
| ktime_last = ktime_get(); |
| sched_ktime_suspended = true; |
| return 0; |
| } |
| |
| static struct syscore_ops sched_syscore_ops = { |
| .resume = sched_resume, |
| .suspend = sched_suspend |
| }; |
| |
| static int __init sched_init_ops(void) |
| { |
| register_syscore_ops(&sched_syscore_ops); |
| return 0; |
| } |
| late_initcall(sched_init_ops); |
| |
| inline void clear_ed_task(struct task_struct *p, struct rq *rq) |
| { |
| if (p == rq->ed_task) |
| rq->ed_task = NULL; |
| } |
| |
| inline void set_task_last_wake(struct task_struct *p, u64 wallclock) |
| { |
| p->last_wake_ts = wallclock; |
| } |
| |
| inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock) |
| { |
| p->last_switch_out_ts = wallclock; |
| } |
| |
| /* |
| * Note C-state for (idle) cpus. |
| * |
| * @cstate = cstate index, 0 -> active state |
| * @wakeup_energy = energy spent in waking up cpu |
| * @wakeup_latency = latency to wakeup from cstate |
| * |
| */ |
| void |
| sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| rq->cstate = cstate; /* C1, C2 etc */ |
| rq->wakeup_energy = wakeup_energy; |
| rq->wakeup_latency = wakeup_latency; |
| } |
| |
| /* |
| * Note D-state for (idle) cluster. |
| * |
| * @dstate = dstate index, 0 -> active state |
| * @wakeup_energy = energy spent in waking up cluster |
| * @wakeup_latency = latency to wake the cluster up from dstate |
| * |
| */ |
| void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate, |
| int wakeup_energy, int wakeup_latency) |
| { |
| struct sched_cluster *cluster = |
| cpu_rq(cpumask_first(cluster_cpus))->cluster; |
| cluster->dstate = dstate; |
| cluster->dstate_wakeup_energy = wakeup_energy; |
| cluster->dstate_wakeup_latency = wakeup_latency; |
| } |
| |
| u32 __weak get_freq_max_load(int cpu, u32 freq) |
| { |
| /* 100% by default */ |
| return 100; |
| } |
| |
| struct freq_max_load_entry { |
| /* The maximum load which has accounted governor's headroom. */ |
| u64 hdemand; |
| }; |
| |
| struct freq_max_load { |
| struct rcu_head rcu; |
| int length; |
| struct freq_max_load_entry freqs[0]; |
| }; |
| |
| static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load); |
| static DEFINE_SPINLOCK(freq_max_load_lock); |
| |
| struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void) |
| { |
| return NULL; |
| } |
| |
| int sched_update_freq_max_load(const cpumask_t *cpumask) |
| { |
| int i, cpu, ret; |
| unsigned int freq; |
| struct cpu_pstate_pwr *costs; |
| struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); |
| struct freq_max_load *max_load, *old_max_load; |
| struct freq_max_load_entry *entry; |
| u64 max_demand_capacity, max_demand; |
| unsigned long flags; |
| u32 hfreq; |
| int hpct; |
| |
| if (!per_cpu_info) |
| return 0; |
| |
| spin_lock_irqsave(&freq_max_load_lock, flags); |
| max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity); |
| for_each_cpu(cpu, cpumask) { |
| if (!per_cpu_info[cpu].ptable) { |
| ret = -EINVAL; |
| goto fail; |
| } |
| |
| old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); |
| |
| /* |
| * Allocate len + 1 entries and leave the last power cost as 0 so that |
| * power_cost() can stop iterating when per_cpu_info[cpu].len exceeds |
| * the length of max_load due to a race between the cpu power stats |
| * update and get_cpu_pwr_stats(). |
| */ |
| max_load = kzalloc(sizeof(struct freq_max_load) + |
| sizeof(struct freq_max_load_entry) * |
| (per_cpu_info[cpu].len + 1), GFP_ATOMIC); |
| if (unlikely(!max_load)) { |
| ret = -ENOMEM; |
| goto fail; |
| } |
| |
| max_load->length = per_cpu_info[cpu].len; |
| |
| max_demand = max_demand_capacity * |
| cpu_max_possible_capacity(cpu); |
| |
| i = 0; |
| costs = per_cpu_info[cpu].ptable; |
| while (costs[i].freq) { |
| entry = &max_load->freqs[i]; |
| freq = costs[i].freq; |
| hpct = get_freq_max_load(cpu, freq); |
| if (hpct <= 0 || hpct > 100) |
| hpct = 100; |
| hfreq = div64_u64((u64)freq * hpct, 100); |
| entry->hdemand = |
| div64_u64(max_demand * hfreq, |
| cpu_max_possible_freq(cpu)); |
| i++; |
| } |
| |
| rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load); |
| if (old_max_load) |
| kfree_rcu(old_max_load, rcu); |
| } |
| |
| spin_unlock_irqrestore(&freq_max_load_lock, flags); |
| return 0; |
| |
| fail: |
| for_each_cpu(cpu, cpumask) { |
| max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); |
| if (max_load) { |
| rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL); |
| kfree_rcu(max_load, rcu); |
| } |
| } |
| |
| spin_unlock_irqrestore(&freq_max_load_lock, flags); |
| return ret; |
| } |
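| |
| /* |
| * Illustrative example with hypothetical numbers (not from a real power |
| * table): with max_task_load() == sched_ravg_window == 10000000 ns, |
| * max_possible_capacity == 1024 and cpu_max_possible_capacity(cpu) == 1024, |
| * max_demand works out to roughly 10000000. For a 1 GHz OPP (1000000 kHz) on |
| * a cpu whose max possible frequency is 2 GHz and whose governor headroom is |
| * 80% (get_freq_max_load() == 80), hfreq = 1000000 * 80 / 100 = 800000 and |
| * hdemand = 10000000 * 800000 / 2000000 ~= 4000000, i.e. about 4 ms of demand |
| * per 10 ms window can be served at that OPP once the headroom is accounted |
| * for. |
| */ |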
| |
| unsigned int max_possible_efficiency = 1; |
| unsigned int min_possible_efficiency = UINT_MAX; |
| |
| unsigned long __weak arch_get_cpu_efficiency(int cpu) |
| { |
| return SCHED_CAPACITY_SCALE; |
| } |
| |
| /* Keep track of max/min capacity possible across CPUs "currently" */ |
| static void __update_min_max_capacity(void) |
| { |
| int i; |
| int max_cap = 0, min_cap = INT_MAX; |
| |
| for_each_online_cpu(i) { |
| max_cap = max(max_cap, cpu_capacity(i)); |
| min_cap = min(min_cap, cpu_capacity(i)); |
| } |
| |
| max_capacity = max_cap; |
| min_capacity = min_cap; |
| } |
| |
| static void update_min_max_capacity(void) |
| { |
| unsigned long flags; |
| int i; |
| |
| local_irq_save(flags); |
| for_each_possible_cpu(i) |
| raw_spin_lock(&cpu_rq(i)->lock); |
| |
| __update_min_max_capacity(); |
| |
| for_each_possible_cpu(i) |
| raw_spin_unlock(&cpu_rq(i)->lock); |
| local_irq_restore(flags); |
| } |
| |
| /* |
| * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that |
| * least efficient cpu gets capacity of 1024 |
| */ |
| static unsigned long |
| capacity_scale_cpu_efficiency(struct sched_cluster *cluster) |
| { |
| return (1024 * cluster->efficiency) / min_possible_efficiency; |
| } |
| |
| /* |
| * Return 'capacity' of a cpu in reference to cpu with lowest max_freq |
| * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. |
| */ |
| static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster) |
| { |
| return (1024 * cluster_max_freq(cluster)) / min_max_freq; |
| } |
| |
| /* |
| * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so |
| * that "most" efficient cpu gets a load_scale_factor of 1 |
| */ |
| static inline unsigned long |
| load_scale_cpu_efficiency(struct sched_cluster *cluster) |
| { |
| return DIV_ROUND_UP(1024 * max_possible_efficiency, |
| cluster->efficiency); |
| } |
| |
| /* |
| * Return load_scale_factor of a cpu in reference to cpu with best max_freq |
| * (max_possible_freq), so that one with best max_freq gets a load_scale_factor |
| * of 1. |
| */ |
| static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster) |
| { |
| return DIV_ROUND_UP(1024 * max_possible_freq, |
| cluster_max_freq(cluster)); |
| } |
| |
| static int compute_capacity(struct sched_cluster *cluster) |
| { |
| int capacity = 1024; |
| |
| capacity *= capacity_scale_cpu_efficiency(cluster); |
| capacity >>= 10; |
| |
| capacity *= capacity_scale_cpu_freq(cluster); |
| capacity >>= 10; |
| |
| return capacity; |
| } |
| |
| static int compute_max_possible_capacity(struct sched_cluster *cluster) |
| { |
| int capacity = 1024; |
| |
| capacity *= capacity_scale_cpu_efficiency(cluster); |
| capacity >>= 10; |
| |
| capacity *= (1024 * cluster->max_possible_freq) / min_max_freq; |
| capacity >>= 10; |
| |
| return capacity; |
| } |
| |
| static int compute_load_scale_factor(struct sched_cluster *cluster) |
| { |
| int load_scale = 1024; |
| |
| /* |
| * load_scale_factor accounts for the fact that task load is in |
| * reference to the "best" performing cpu. A task's load needs to be |
| * scaled (up) by this factor to determine its suitability for |
| * placement on a (little) cpu. |
| */ |
| load_scale *= load_scale_cpu_efficiency(cluster); |
| load_scale >>= 10; |
| |
| load_scale *= load_scale_cpu_freq(cluster); |
| load_scale >>= 10; |
| |
| return load_scale; |
| } |
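| |
| /* |
| * Worked example with hypothetical cluster values: a little cluster with |
| * efficiency 1024 and max_freq 1.5 GHz plus a big cluster with efficiency |
| * 2048 and max_freq 2.0 GHz give min_possible_efficiency = 1024, |
| * max_possible_efficiency = 2048, min_max_freq = 1500000 and |
| * max_possible_freq = 2000000. For the little cluster both capacity scale |
| * helpers return 1024, so compute_capacity() == 1024, while its |
| * load_scale_factor is (1024 * 2048 >> 10) scaled by |
| * DIV_ROUND_UP(1024 * 2000000, 1500000) = 1366, i.e. 2048 * 1366 >> 10 = 2732. |
| * A task's load (already normalized to the best cpu) is thus inflated ~2.67x |
| * when judged against the little cluster's thresholds. |
| */ |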
| |
| struct list_head cluster_head; |
| static DEFINE_MUTEX(cluster_lock); |
| static cpumask_t all_cluster_cpus = CPU_MASK_NONE; |
| DECLARE_BITMAP(all_cluster_ids, NR_CPUS); |
| struct sched_cluster *sched_cluster[NR_CPUS]; |
| int num_clusters; |
| |
| struct sched_cluster init_cluster = { |
| .list = LIST_HEAD_INIT(init_cluster.list), |
| .id = 0, |
| .max_power_cost = 1, |
| .min_power_cost = 1, |
| .capacity = 1024, |
| .max_possible_capacity = 1024, |
| .efficiency = 1, |
| .load_scale_factor = 1024, |
| .cur_freq = 1, |
| .max_freq = 1, |
| .max_mitigated_freq = UINT_MAX, |
| .min_freq = 1, |
| .max_possible_freq = 1, |
| .dstate = 0, |
| .dstate_wakeup_energy = 0, |
| .dstate_wakeup_latency = 0, |
| .exec_scale_factor = 1024, |
| .notifier_sent = 0, |
| }; |
| |
| static void update_all_clusters_stats(void) |
| { |
| struct sched_cluster *cluster; |
| u64 highest_mpc = 0, lowest_mpc = U64_MAX; |
| |
| pre_big_task_count_change(cpu_possible_mask); |
| |
| for_each_sched_cluster(cluster) { |
| u64 mpc; |
| |
| cluster->capacity = compute_capacity(cluster); |
| mpc = cluster->max_possible_capacity = |
| compute_max_possible_capacity(cluster); |
| cluster->load_scale_factor = compute_load_scale_factor(cluster); |
| |
| cluster->exec_scale_factor = |
| DIV_ROUND_UP(cluster->efficiency * 1024, |
| max_possible_efficiency); |
| |
| if (mpc > highest_mpc) |
| highest_mpc = mpc; |
| |
| if (mpc < lowest_mpc) |
| lowest_mpc = mpc; |
| } |
| |
| max_possible_capacity = highest_mpc; |
| min_max_possible_capacity = lowest_mpc; |
| |
| __update_min_max_capacity(); |
| sched_update_freq_max_load(cpu_possible_mask); |
| post_big_task_count_change(cpu_possible_mask); |
| } |
| |
| static void assign_cluster_ids(struct list_head *head) |
| { |
| struct sched_cluster *cluster; |
| int pos = 0; |
| |
| list_for_each_entry(cluster, head, list) { |
| cluster->id = pos; |
| sched_cluster[pos++] = cluster; |
| } |
| } |
| |
| static void |
| move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) |
| { |
| struct list_head *first, *last; |
| |
| first = src->next; |
| last = src->prev; |
| |
| if (sync_rcu) { |
| INIT_LIST_HEAD_RCU(src); |
| synchronize_rcu(); |
| } |
| |
| first->prev = dst; |
| dst->prev = last; |
| last->next = dst; |
| |
| /* Ensure list sanity before making the head visible to all CPUs. */ |
| smp_mb(); |
| dst->next = first; |
| } |
| |
| static int |
| compare_clusters(void *priv, struct list_head *a, struct list_head *b) |
| { |
| struct sched_cluster *cluster1, *cluster2; |
| int ret; |
| |
| cluster1 = container_of(a, struct sched_cluster, list); |
| cluster2 = container_of(b, struct sched_cluster, list); |
| |
| ret = cluster1->max_power_cost > cluster2->max_power_cost || |
| (cluster1->max_power_cost == cluster2->max_power_cost && |
| cluster1->max_possible_capacity < |
| cluster2->max_possible_capacity); |
| |
| return ret; |
| } |
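| |
| /* |
| * For example, with hypothetical costs, a little cluster with max_power_cost |
| * 50 sorts ahead of a big cluster with max_power_cost 400; when two clusters |
| * tie on max_power_cost, the one with the larger max_possible_capacity comes |
| * first. list_sort() treats a non-zero return as "a sorts after b". |
| */ |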
| |
| static void sort_clusters(void) |
| { |
| struct sched_cluster *cluster; |
| struct list_head new_head; |
| |
| INIT_LIST_HEAD(&new_head); |
| |
| for_each_sched_cluster(cluster) { |
| cluster->max_power_cost = power_cost(cluster_first_cpu(cluster), |
| max_task_load()); |
| cluster->min_power_cost = power_cost(cluster_first_cpu(cluster), |
| 0); |
| } |
| |
| move_list(&new_head, &cluster_head, true); |
| |
| list_sort(NULL, &new_head, compare_clusters); |
| assign_cluster_ids(&new_head); |
| |
| /* |
| * Ensure cluster ids are visible to all CPUs before making |
| * cluster_head visible. |
| */ |
| move_list(&cluster_head, &new_head, false); |
| } |
| |
| static void |
| insert_cluster(struct sched_cluster *cluster, struct list_head *head) |
| { |
| struct sched_cluster *tmp; |
| struct list_head *iter = head; |
| |
| list_for_each_entry(tmp, head, list) { |
| if (cluster->max_power_cost < tmp->max_power_cost) |
| break; |
| iter = &tmp->list; |
| } |
| |
| list_add(&cluster->list, iter); |
| } |
| |
| static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus) |
| { |
| struct sched_cluster *cluster = NULL; |
| |
| cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC); |
| if (!cluster) { |
| __WARN_printf("Cluster allocation failed. Possible bad scheduling\n"); |
| return NULL; |
| } |
| |
| INIT_LIST_HEAD(&cluster->list); |
| cluster->max_power_cost = 1; |
| cluster->min_power_cost = 1; |
| cluster->capacity = 1024; |
| cluster->max_possible_capacity = 1024; |
| cluster->efficiency = 1; |
| cluster->load_scale_factor = 1024; |
| cluster->cur_freq = 1; |
| cluster->max_freq = 1; |
| cluster->max_mitigated_freq = UINT_MAX; |
| cluster->min_freq = 1; |
| cluster->max_possible_freq = 1; |
| cluster->dstate = 0; |
| cluster->dstate_wakeup_energy = 0; |
| cluster->dstate_wakeup_latency = 0; |
| cluster->freq_init_done = false; |
| |
| cluster->cpus = *cpus; |
| cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus)); |
| |
| if (cluster->efficiency > max_possible_efficiency) |
| max_possible_efficiency = cluster->efficiency; |
| if (cluster->efficiency < min_possible_efficiency) |
| min_possible_efficiency = cluster->efficiency; |
| |
| cluster->notifier_sent = 0; |
| return cluster; |
| } |
| |
| static void add_cluster(const struct cpumask *cpus, struct list_head *head) |
| { |
| struct sched_cluster *cluster = alloc_new_cluster(cpus); |
| int i; |
| |
| if (!cluster) |
| return; |
| |
| for_each_cpu(i, cpus) |
| cpu_rq(i)->cluster = cluster; |
| |
| insert_cluster(cluster, head); |
| set_bit(num_clusters, all_cluster_ids); |
| num_clusters++; |
| } |
| |
| void update_cluster_topology(void) |
| { |
| struct cpumask cpus = *cpu_possible_mask; |
| const struct cpumask *cluster_cpus; |
| struct list_head new_head; |
| int i; |
| |
| INIT_LIST_HEAD(&new_head); |
| |
| for_each_cpu(i, &cpus) { |
| cluster_cpus = cpu_coregroup_mask(i); |
| cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus); |
| cpumask_andnot(&cpus, &cpus, cluster_cpus); |
| add_cluster(cluster_cpus, &new_head); |
| } |
| |
| assign_cluster_ids(&new_head); |
| |
| /* |
| * Ensure cluster ids are visible to all CPUs before making |
| * cluster_head visible. |
| */ |
| move_list(&cluster_head, &new_head, false); |
| } |
| |
| void init_clusters(void) |
| { |
| bitmap_clear(all_cluster_ids, 0, NR_CPUS); |
| init_cluster.cpus = *cpu_possible_mask; |
| INIT_LIST_HEAD(&cluster_head); |
| } |
| |
| int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) |
| { |
| mutex_lock(&cluster_lock); |
| if (!cb->get_cpu_cycle_counter) { |
| mutex_unlock(&cluster_lock); |
| return -EINVAL; |
| } |
| |
| cpu_cycle_counter_cb = *cb; |
| use_cycle_counter = true; |
| mutex_unlock(&cluster_lock); |
| |
| return 0; |
| } |
| |
| int got_boost_kick(void) |
| { |
| int cpu = smp_processor_id(); |
| struct rq *rq = cpu_rq(cpu); |
| |
| return test_bit(BOOST_KICK, &rq->hmp_flags); |
| } |
| |
| inline void clear_boost_kick(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| clear_bit(BOOST_KICK, &rq->hmp_flags); |
| } |
| |
| inline void boost_kick(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags)) |
| smp_send_reschedule(cpu); |
| } |
| |
| /* Clear any HMP scheduler related requests pending from or on cpu */ |
| void clear_hmp_request(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| unsigned long flags; |
| |
| clear_boost_kick(cpu); |
| clear_reserved(cpu); |
| if (rq->push_task) { |
| raw_spin_lock_irqsave(&rq->lock, flags); |
| if (rq->push_task) { |
| clear_reserved(rq->push_cpu); |
| put_task_struct(rq->push_task); |
| rq->push_task = NULL; |
| } |
| rq->active_balance = 0; |
| raw_spin_unlock_irqrestore(&rq->lock, flags); |
| } |
| } |
| |
| int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| rq->static_cpu_pwr_cost = cost; |
| return 0; |
| } |
| |
| unsigned int sched_get_static_cpu_pwr_cost(int cpu) |
| { |
| return cpu_rq(cpu)->static_cpu_pwr_cost; |
| } |
| |
| int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost) |
| { |
| struct sched_cluster *cluster = cpu_rq(cpu)->cluster; |
| |
| cluster->static_cluster_pwr_cost = cost; |
| return 0; |
| } |
| |
| unsigned int sched_get_static_cluster_pwr_cost(int cpu) |
| { |
| return cpu_rq(cpu)->cluster->static_cluster_pwr_cost; |
| } |
| |
| /* |
| * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy |
| * associated with them. This is required for atomic update of those variables |
| * when being modified via the sysctl interface. |
| * |
| * IMPORTANT: Initialize both copies to same value!! |
| */ |
| |
| /* |
| * Tasks that are runnable continuously for a period greater than |
| * EARLY_DETECTION_DURATION can be flagged early as potential |
| * high load tasks. |
| */ |
| #define EARLY_DETECTION_DURATION 9500000 |
| |
| static __read_mostly unsigned int sched_ravg_hist_size = 5; |
| __read_mostly unsigned int sysctl_sched_ravg_hist_size = 5; |
| |
| static __read_mostly unsigned int sched_window_stats_policy = |
| WINDOW_STATS_MAX_RECENT_AVG; |
| __read_mostly unsigned int sysctl_sched_window_stats_policy = |
| WINDOW_STATS_MAX_RECENT_AVG; |
| |
| #define SCHED_ACCOUNT_WAIT_TIME 1 |
| |
| __read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC); |
| |
| unsigned int __read_mostly sysctl_sched_enable_colocation = 1; |
| |
| /* |
| * Enable colocation and frequency aggregation for all threads in a process. |
| * Children inherit the group id from the parent. |
| */ |
| unsigned int __read_mostly sysctl_sched_enable_thread_grouping; |
| |
| |
| __read_mostly unsigned int sysctl_sched_new_task_windows = 5; |
| |
| #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 |
| |
| /* |
| * For increase, send notification if |
| * freq_required - cur_freq > sysctl_sched_freq_inc_notify |
| */ |
| __read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */ |
| |
| /* |
| * For decrease, send notification if |
| * cur_freq - freq_required > sysctl_sched_freq_dec_notify |
| */ |
| __read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */ |
| |
| static __read_mostly unsigned int sched_io_is_busy; |
| |
| __read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024; |
| |
| /* |
| * Maximum possible frequency across all cpus. Task demand and cpu |
| * capacity (cpu_power) metrics are scaled in reference to it. |
| */ |
| unsigned int max_possible_freq = 1; |
| |
| /* |
| * Minimum possible max_freq across all cpus. This will be same as |
| * max_possible_freq on homogeneous systems and could be different from |
| * max_possible_freq on heterogeneous systems. min_max_freq is used to derive |
| * capacity (cpu_power) of cpus. |
| */ |
| unsigned int min_max_freq = 1; |
| |
| unsigned int max_capacity = 1024; /* max(rq->capacity) */ |
| unsigned int min_capacity = 1024; /* min(rq->capacity) */ |
| unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ |
| unsigned int |
| min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ |
| |
| /* Window size (in ns) */ |
| __read_mostly unsigned int sched_ravg_window = 10000000; |
| |
| /* Min window size (in ns) = 10ms */ |
| #define MIN_SCHED_RAVG_WINDOW 10000000 |
| |
| /* Max window size (in ns) = 1s */ |
| #define MAX_SCHED_RAVG_WINDOW 1000000000 |
| |
| /* Temporarily disable window-stats activity on all cpus */ |
| unsigned int __read_mostly sched_disable_window_stats; |
| |
| /* |
| * Major task runtime. If a task runs for more than sched_major_task_runtime |
| * in a window, it's considered to be generating majority of workload |
| * for this window. Prediction could be adjusted for such tasks. |
| */ |
| __read_mostly unsigned int sched_major_task_runtime = 10000000; |
| |
| static unsigned int sync_cpu; |
| |
| static LIST_HEAD(related_thread_groups); |
| static DEFINE_RWLOCK(related_thread_group_lock); |
| |
| #define for_each_related_thread_group(grp) \ |
| list_for_each_entry(grp, &related_thread_groups, list) |
| |
| /* |
| * Demand aggregation for frequency purpose: |
| * |
| * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads |
| * for frequency determination purpose. This aggregation is done per-cluster. |
| * |
| * CPU demand of tasks from various related groups is aggregated per-cluster and |
| * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined |
| * by just rq->prev_runnable_sum. |
| * |
| * Some examples follow, which assume: |
| * Cluster0 = CPU0-3, Cluster1 = CPU4-7 |
| * One related thread group A that has tasks A0, A1, A2 |
| * |
| * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of |
| * tasks belonging to group A are accumulated when they run on cpu X. |
| * |
| * CX->curr/prev_sum = counters in which cpu execution stats of all tasks |
| * not belonging to group A are accumulated when they run on cpu X |
| * |
| * Let's say the stats for window M were as below: |
| * |
| * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms |
| * Task A0 ran 5ms on CPU0 |
| * Task B0 ran 1ms on CPU0 |
| * |
| * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms |
| * Task A1 ran 4ms on CPU1 |
| * Task A2 ran 2ms on CPU1 |
| * Task B1 ran 5ms on CPU1 |
| * |
| * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 |
| * CPU2 idle |
| * |
| * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 |
| * CPU3 idle |
| * |
| * In this case, CPU1 was the most busy going by just its prev_sum counter. |
| * Demand from all group A tasks is added to CPU1. IOW, at the end of window M, |
| * the cpu busy time reported to the governor will be: |
| * |
| * |
| * C0 busy time = 1ms |
| * C1 busy time = 5 + 5 + 6 = 16ms |
| * |
| */ |
| static __read_mostly unsigned int sched_freq_aggregate; |
| __read_mostly unsigned int sysctl_sched_freq_aggregate; |
| |
| unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct; |
| static unsigned int __read_mostly sched_freq_aggregate_threshold; |
| |
| /* Initial task load. Newly created tasks are assigned this load. */ |
| unsigned int __read_mostly sched_init_task_load_windows; |
| unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; |
| |
| unsigned int max_task_load(void) |
| { |
| return sched_ravg_window; |
| } |
| |
| /* |
| * Scheduler boost is a mechanism to temporarily place tasks on CPUs |
| * with higher capacity than those where a task would have normally |
| * ended up with their load characteristics. Any entity enabling |
| * boost is responsible for disabling it as well. |
| */ |
| unsigned int sysctl_sched_boost; |
| |
| /* |
| * A cpu can no longer accommodate more tasks if: |
| * |
| * rq->nr_running > sysctl_sched_spill_nr_run || |
| * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load |
| */ |
| unsigned int __read_mostly sysctl_sched_spill_nr_run = 10; |
| |
| /* |
| * Place sync wakee tasks that have less than the configured demand on the |
| * waker's cluster. |
| */ |
| unsigned int __read_mostly sched_small_wakee_task_load; |
| unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10; |
| |
| unsigned int __read_mostly sched_big_waker_task_load; |
| unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25; |
| |
| /* |
| * CPUs with load greater than the sched_spill_load_threshold are not |
| * eligible for task placement. When all CPUs in a cluster achieve a |
| * load higher than this level, tasks become eligible for inter |
| * cluster migration. |
| */ |
| unsigned int __read_mostly sched_spill_load; |
| unsigned int __read_mostly sysctl_sched_spill_load_pct = 100; |
| |
| /* |
| * Tasks whose bandwidth consumption on a cpu is more than |
| * sched_upmigrate are considered "big" tasks. Big tasks will be |
| * considered for "up" migration, i.e migrating to a cpu with better |
| * capacity. |
| */ |
| unsigned int __read_mostly sched_upmigrate; |
| unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80; |
| |
| /* |
| * Big tasks, once migrated, will need to drop their bandwidth |
| * consumption to less than sched_downmigrate before they are "down" |
| * migrated. |
| */ |
| unsigned int __read_mostly sched_downmigrate; |
| unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60; |
| |
| /* |
| * The load scale factor of a CPU gets boosted when its max frequency |
| * is restricted, which causes tasks to migrate to higher capacity |
| * CPUs early. The sched_upmigrate threshold is auto-upgraded by |
| * rq->max_possible_freq/rq->max_freq of a lower capacity CPU. |
| */ |
| unsigned int up_down_migrate_scale_factor = 1024; |
| |
| /* |
| * The scheduler selects and places a task on its previous CPU if its sleep |
| * time is less than sysctl_sched_select_prev_cpu_us. |
| */ |
| unsigned int __read_mostly |
| sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC; |
| |
| unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000; |
| |
| unsigned int __read_mostly |
| sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC; |
| |
| unsigned int __read_mostly sysctl_sched_restrict_cluster_spill; |
| |
| void update_up_down_migrate(void) |
| { |
| unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct); |
| unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct); |
| unsigned int delta; |
| |
| if (up_down_migrate_scale_factor == 1024) |
| goto done; |
| |
| delta = up_migrate - down_migrate; |
| |
| up_migrate /= NSEC_PER_USEC; |
| up_migrate *= up_down_migrate_scale_factor; |
| up_migrate >>= 10; |
| up_migrate *= NSEC_PER_USEC; |
| |
| up_migrate = min(up_migrate, sched_ravg_window); |
| |
| down_migrate /= NSEC_PER_USEC; |
| down_migrate *= up_down_migrate_scale_factor; |
| down_migrate >>= 10; |
| down_migrate *= NSEC_PER_USEC; |
| |
| down_migrate = min(down_migrate, up_migrate - delta); |
| done: |
| sched_upmigrate = up_migrate; |
| sched_downmigrate = down_migrate; |
| } |
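| |
| /* |
| * Illustrative example with assumed values: with sysctl_sched_upmigrate_pct = |
| * 80, sysctl_sched_downmigrate_pct = 60 and a 10 ms window, up_migrate starts |
| * at 8 ms and down_migrate at 6 ms (delta = 2 ms). If a frequency-restricted |
| * little CPU yields up_down_migrate_scale_factor = 2048, up_migrate doubles to |
| * 16 ms but is clamped to the 10 ms window, and down_migrate is then capped at |
| * up_migrate - delta = 8 ms, preserving the upmigrate/downmigrate hysteresis |
| * gap. |
| */ |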
| |
| void set_hmp_defaults(void) |
| { |
| sched_spill_load = |
| pct_to_real(sysctl_sched_spill_load_pct); |
| |
| update_up_down_migrate(); |
| |
| sched_major_task_runtime = |
| mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100); |
| |
| sched_init_task_load_windows = |
| div64_u64((u64)sysctl_sched_init_task_load_pct * |
| (u64)sched_ravg_window, 100); |
| |
| sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us * |
| NSEC_PER_USEC; |
| |
| sched_small_wakee_task_load = |
| div64_u64((u64)sysctl_sched_small_wakee_task_load_pct * |
| (u64)sched_ravg_window, 100); |
| |
| sched_big_waker_task_load = |
| div64_u64((u64)sysctl_sched_big_waker_task_load_pct * |
| (u64)sched_ravg_window, 100); |
| |
| sched_freq_aggregate_threshold = |
| pct_to_real(sysctl_sched_freq_aggregate_threshold_pct); |
| } |
| |
| u32 sched_get_init_task_load(struct task_struct *p) |
| { |
| return p->init_load_pct; |
| } |
| |
| int sched_set_init_task_load(struct task_struct *p, int init_load_pct) |
| { |
| if (init_load_pct < 0 || init_load_pct > 100) |
| return -EINVAL; |
| |
| p->init_load_pct = init_load_pct; |
| |
| return 0; |
| } |
| |
| #ifdef CONFIG_CGROUP_SCHED |
| |
| int upmigrate_discouraged(struct task_struct *p) |
| { |
| return task_group(p)->upmigrate_discouraged; |
| } |
| |
| #else |
| |
| static inline int upmigrate_discouraged(struct task_struct *p) |
| { |
| return 0; |
| } |
| |
| #endif |
| |
| /* Is a task "big" on its current cpu */ |
| static inline int __is_big_task(struct task_struct *p, u64 scaled_load) |
| { |
| int nice = task_nice(p); |
| |
| if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p)) |
| return 0; |
| |
| return scaled_load > sched_upmigrate; |
| } |
| |
| int is_big_task(struct task_struct *p) |
| { |
| return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p))); |
| } |
| |
| u64 cpu_load(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu); |
| } |
| |
| u64 cpu_load_sync(int cpu, int sync) |
| { |
| return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu); |
| } |
| |
| static int boost_refcount; |
| static DEFINE_SPINLOCK(boost_lock); |
| static DEFINE_MUTEX(boost_mutex); |
| |
| static void boost_kick_cpus(void) |
| { |
| int i; |
| |
| for_each_online_cpu(i) { |
| if (cpu_capacity(i) != max_capacity) |
| boost_kick(i); |
| } |
| } |
| |
| int sched_boost(void) |
| { |
| return boost_refcount > 0; |
| } |
| |
| int sched_set_boost(int enable) |
| { |
| unsigned long flags; |
| int ret = 0; |
| int old_refcount; |
| |
| spin_lock_irqsave(&boost_lock, flags); |
| |
| old_refcount = boost_refcount; |
| |
| if (enable == 1) { |
| boost_refcount++; |
| } else if (!enable) { |
| if (boost_refcount >= 1) |
| boost_refcount--; |
| else |
| ret = -EINVAL; |
| } else { |
| ret = -EINVAL; |
| } |
| |
| if (!old_refcount && boost_refcount) |
| boost_kick_cpus(); |
| |
| trace_sched_set_boost(boost_refcount); |
| spin_unlock_irqrestore(&boost_lock, flags); |
| |
| return ret; |
| } |
| |
| int sched_boost_handler(struct ctl_table *table, int write, |
| void __user *buffer, size_t *lenp, |
| loff_t *ppos) |
| { |
| int ret; |
| |
| mutex_lock(&boost_mutex); |
| if (!write) |
| sysctl_sched_boost = sched_boost(); |
| |
| ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| if (ret || !write) |
| goto done; |
| |
| ret = (sysctl_sched_boost <= 1) ? |
| sched_set_boost(sysctl_sched_boost) : -EINVAL; |
| |
| done: |
| mutex_unlock(&boost_mutex); |
| return ret; |
| } |
| |
| /* |
| * A task will fit on a cpu if its bandwidth consumption on that cpu |
| * will be less than sched_upmigrate. A big task that was previously |
| * "up" migrated will be considered to fit on a "little" cpu if its |
| * bandwidth consumption on the "little" cpu will be less than |
| * sched_downmigrate. This helps avoid frequent migrations for |
| * tasks with load close to the upmigrate threshold. |
| */ |
| int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu, |
| enum sched_boost_type boost_type) |
| { |
| int upmigrate; |
| |
| if (cpu_capacity(cpu) == max_capacity) |
| return 1; |
| |
| if (boost_type != SCHED_BOOST_ON_BIG) { |
| if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE || |
| upmigrate_discouraged(p)) |
| return 1; |
| |
| upmigrate = sched_upmigrate; |
| if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu)) |
| upmigrate = sched_downmigrate; |
| |
| if (task_load < upmigrate) |
| return 1; |
| } |
| |
| return 0; |
| } |
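| |
| /* |
| * Example of the hysteresis with hypothetical loads: with sched_upmigrate at |
| * 8 ms and sched_downmigrate at 6 ms, a task with a scaled load of 7 ms keeps |
| * fitting on a little cpu (7 < 8). Once its load grows past 8 ms and it is |
| * up-migrated, it is only considered to fit on a little cpu again after its |
| * load drops below 6 ms, because the downmigrate threshold is used whenever |
| * the candidate cpu has lower capacity than the task's current cpu. |
| */ |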
| |
| enum sched_boost_type sched_boost_type(void) |
| { |
| if (sched_boost()) { |
| if (min_possible_efficiency != max_possible_efficiency) |
| return SCHED_BOOST_ON_BIG; |
| else |
| return SCHED_BOOST_ON_ALL; |
| } |
| return SCHED_BOOST_NONE; |
| } |
| |
| int task_will_fit(struct task_struct *p, int cpu) |
| { |
| u64 tload = scale_load_to_cpu(task_load(p), cpu); |
| |
| return task_load_will_fit(p, tload, cpu, sched_boost_type()); |
| } |
| |
| int group_will_fit(struct sched_cluster *cluster, |
| struct related_thread_group *grp, u64 demand) |
| { |
| int cpu = cluster_first_cpu(cluster); |
| int prev_capacity = 0; |
| unsigned int threshold = sched_upmigrate; |
| u64 load; |
| |
| if (cluster->capacity == max_capacity) |
| return 1; |
| |
| if (grp->preferred_cluster) |
| prev_capacity = grp->preferred_cluster->capacity; |
| |
| if (cluster->capacity < prev_capacity) |
| threshold = sched_downmigrate; |
| |
| load = scale_load_to_cpu(demand, cpu); |
| if (load < threshold) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* |
| * Return the cost of running task p on CPU cpu. This function |
| * currently assumes that task p is the only task which will run on |
| * the CPU. |
| */ |
| unsigned int power_cost(int cpu, u64 demand) |
| { |
| int first, mid, last; |
| struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); |
| struct cpu_pstate_pwr *costs; |
| struct freq_max_load *max_load; |
| int total_static_pwr_cost = 0; |
| struct rq *rq = cpu_rq(cpu); |
| unsigned int pc; |
| |
| if (!per_cpu_info || !per_cpu_info[cpu].ptable) |
| /* |
| * When power aware scheduling is not in use, or CPU |
| * power data is not available, just use the CPU |
| * capacity as a rough stand-in for real CPU power |
| * numbers, assuming bigger CPUs are more power |
| * hungry. |
| */ |
| return cpu_max_possible_capacity(cpu); |
| |
| rcu_read_lock(); |
| max_load = rcu_dereference(per_cpu(freq_max_load, cpu)); |
| if (!max_load) { |
| pc = cpu_max_possible_capacity(cpu); |
| goto unlock; |
| } |
| |
| costs = per_cpu_info[cpu].ptable; |
| |
| if (demand <= max_load->freqs[0].hdemand) { |
| pc = costs[0].power; |
| goto unlock; |
| } else if (demand > max_load->freqs[max_load->length - 1].hdemand) { |
| pc = costs[max_load->length - 1].power; |
| goto unlock; |
| } |
| |
| first = 0; |
| last = max_load->length - 1; |
| mid = (last - first) >> 1; |
| while (1) { |
| if (demand <= max_load->freqs[mid].hdemand) |
| last = mid; |
| else |
| first = mid; |
| |
| if (last - first == 1) |
| break; |
| mid = first + ((last - first) >> 1); |
| } |
| |
| pc = costs[last].power; |
| |
| unlock: |
| rcu_read_unlock(); |
| |
| if (idle_cpu(cpu) && rq->cstate) { |
| total_static_pwr_cost += rq->static_cpu_pwr_cost; |
| if (rq->cluster->dstate) |
| total_static_pwr_cost += |
| rq->cluster->static_cluster_pwr_cost; |
| } |
| |
| return pc + total_static_pwr_cost; |
| |
| } |
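| |
| /* |
| * Sketch of the lookup above with an assumed table: for hdemand thresholds |
| * {2 ms, 4 ms, 6 ms, 8 ms} and a demand of 5 ms, the bisection narrows |
| * [first, last] from [0, 3] to [1, 3] and then [1, 2], stopping with |
| * last == 2, so the power of the 6 ms entry (the lowest OPP whose |
| * headroom-adjusted capacity covers the demand) is returned, plus any static |
| * cpu/cluster power cost if the cpu is idling in a low power state. |
| */ |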
| |
| void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) |
| { |
| if (sched_disable_window_stats) |
| return; |
| |
| if (is_big_task(p)) |
| stats->nr_big_tasks++; |
| } |
| |
| void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p) |
| { |
| if (sched_disable_window_stats) |
| return; |
| |
| if (is_big_task(p)) |
| stats->nr_big_tasks--; |
| |
| BUG_ON(stats->nr_big_tasks < 0); |
| } |
| |
| void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) |
| { |
| inc_nr_big_task(&rq->hmp_stats, p); |
| if (change_cra) |
| inc_cumulative_runnable_avg(&rq->hmp_stats, p); |
| } |
| |
| void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra) |
| { |
| dec_nr_big_task(&rq->hmp_stats, p); |
| if (change_cra) |
| dec_cumulative_runnable_avg(&rq->hmp_stats, p); |
| } |
| |
| static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra) |
| { |
| stats->nr_big_tasks = 0; |
| if (reset_cra) { |
| stats->cumulative_runnable_avg = 0; |
| stats->pred_demands_sum = 0; |
| } |
| } |
| |
| /* |
| * Invoked from three places: |
| * 1) try_to_wake_up() -> ... -> select_best_cpu() |
| * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu() |
| * 3) can_migrate_task() |
| * |
| * It's safe to dereference p->grp in the first case (since p->pi_lock is held) |
| * but not in the other cases. p->grp is hence freed after an RCU grace period |
| * and accessed under rcu_read_lock(). |
| */ |
| int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) |
| { |
| struct related_thread_group *grp; |
| int rc = 0; |
| |
| rcu_read_lock(); |
| |
| grp = task_related_thread_group(p); |
| if (!grp || !sysctl_sched_enable_colocation) |
| rc = 1; |
| else |
| rc = (grp->preferred_cluster == cluster); |
| |
| rcu_read_unlock(); |
| return rc; |
| } |
| |
| struct sched_cluster *rq_cluster(struct rq *rq) |
| { |
| return rq->cluster; |
| } |
| |
| /* |
| * reset_cpu_hmp_stats - reset HMP stats for a cpu |
| * nr_big_tasks |
| * cumulative_runnable_avg (iff reset_cra is true) |
| */ |
| void reset_cpu_hmp_stats(int cpu, int reset_cra) |
| { |
| reset_cfs_rq_hmp_stats(cpu, reset_cra); |
| reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra); |
| } |
| |
| void fixup_nr_big_tasks(struct hmp_sched_stats *stats, |
| struct task_struct *p, s64 delta) |
| { |
| u64 new_task_load; |
| u64 old_task_load; |
| |
| if (sched_disable_window_stats) |
| return; |
| |
| old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p)); |
| new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p)); |
| |
| if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load)) |
| stats->nr_big_tasks--; |
| else if (!__is_big_task(p, old_task_load) && |
| __is_big_task(p, new_task_load)) |
| stats->nr_big_tasks++; |
| |
| BUG_ON(stats->nr_big_tasks < 0); |
| } |
| |
| /* |
| * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters. |
| */ |
| static void update_nr_big_tasks(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| struct task_struct *p; |
| |
| /* Do not reset cumulative_runnable_avg */ |
| reset_cpu_hmp_stats(cpu, 0); |
| |
| list_for_each_entry(p, &rq->cfs_tasks, se.group_node) |
| inc_hmp_sched_stats_fair(rq, p, 0); |
| } |
| |
| /* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */ |
| void pre_big_task_count_change(const struct cpumask *cpus) |
| { |
| int i; |
| |
| local_irq_disable(); |
| |
| for_each_cpu(i, cpus) |
| raw_spin_lock(&cpu_rq(i)->lock); |
| } |
| |
| /* |
| * Reinitialize 'nr_big_tasks' counters on all affected cpus |
| */ |
| void post_big_task_count_change(const struct cpumask *cpus) |
| { |
| int i; |
| |
| /* Assumes local_irq_disable() keeps online cpumap stable */ |
| for_each_cpu(i, cpus) |
| update_nr_big_tasks(i); |
| |
| for_each_cpu(i, cpus) |
| raw_spin_unlock(&cpu_rq(i)->lock); |
| |
| local_irq_enable(); |
| } |
| |
| DEFINE_MUTEX(policy_mutex); |
| |
| static inline int invalid_value_freq_input(unsigned int *data) |
| { |
| if (data == &sysctl_sched_freq_aggregate) |
| return !(*data == 0 || *data == 1); |
| |
| return 0; |
| } |
| |
| static inline int invalid_value(unsigned int *data) |
| { |
| unsigned int val = *data; |
| |
| if (data == &sysctl_sched_ravg_hist_size) |
| return (val < 2 || val > RAVG_HIST_SIZE_MAX); |
| |
| if (data == &sysctl_sched_window_stats_policy) |
| return val >= WINDOW_STATS_INVALID_POLICY; |
| |
| return invalid_value_freq_input(data); |
| } |
| |
| /* |
| * Handle "atomic" update of sysctl_sched_window_stats_policy, |
| * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables. |
| */ |
| int sched_window_update_handler(struct ctl_table *table, int write, |
| void __user *buffer, size_t *lenp, |
| loff_t *ppos) |
| { |
| int ret; |
| unsigned int *data = (unsigned int *)table->data; |
| unsigned int old_val; |
| |
| mutex_lock(&policy_mutex); |
| |
| old_val = *data; |
| |
| ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| if (ret || !write || (write && (old_val == *data))) |
| goto done; |
| |
| if (invalid_value(data)) { |
| *data = old_val; |
| ret = -EINVAL; |
| goto done; |
| } |
| |
| reset_all_window_stats(0, 0); |
| |
| done: |
| mutex_unlock(&policy_mutex); |
| |
| return ret; |
| } |
| |
| /* |
| * Convert percentage values into absolute form. This avoids a div() operation |
| * in the fast path that would otherwise be needed to convert task load to a |
| * percentage scale. |
| */ |
| int sched_hmp_proc_update_handler(struct ctl_table *table, int write, |
| void __user *buffer, size_t *lenp, |
| loff_t *ppos) |
| { |
| int ret; |
| unsigned int old_val; |
| unsigned int *data = (unsigned int *)table->data; |
| int update_min_nice = 0; |
| |
| mutex_lock(&policy_mutex); |
| |
| old_val = *data; |
| |
| ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| |
| if (ret || !write) |
| goto done; |
| |
| if (write && (old_val == *data)) |
| goto done; |
| |
| if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) { |
| *data = old_val; |
| ret = -EINVAL; |
| goto done; |
| } |
| |
| /* |
| * A big-task tunable change requires re-classifying tasks on the |
| * runqueue as big and setting their counters appropriately. The |
| * sysctl interface affects secondary variables (*_pct), which are then |
| * "atomically" carried over to the primary variables. The atomic change |
| * includes taking the runqueue lock of all online cpus and re-initializing |
| * their big counter values based on the changed criteria. |
| */ |
| if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { |
| get_online_cpus(); |
| pre_big_task_count_change(cpu_online_mask); |
| } |
| |
| set_hmp_defaults(); |
| |
| if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) { |
| post_big_task_count_change(cpu_online_mask); |
| put_online_cpus(); |
| } |
| |
| done: |
| mutex_unlock(&policy_mutex); |
| return ret; |
| } |
| |
| inline int nr_big_tasks(struct rq *rq) |
| { |
| return rq->hmp_stats.nr_big_tasks; |
| } |
| |
| unsigned int cpu_temp(int cpu) |
| { |
| struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats(); |
| |
| if (per_cpu_info) |
| return per_cpu_info[cpu].temp; |
| else |
| return 0; |
| } |
| |
| void init_new_task_load(struct task_struct *p) |
| { |
| int i; |
| u32 init_load_windows = sched_init_task_load_windows; |
| u32 init_load_pct = current->init_load_pct; |
| |
| p->init_load_pct = 0; |
| rcu_assign_pointer(p->grp, NULL); |
| INIT_LIST_HEAD(&p->grp_list); |
| memset(&p->ravg, 0, sizeof(struct ravg)); |
| p->cpu_cycles = 0; |
| |
| if (init_load_pct) |
| init_load_windows = div64_u64((u64)init_load_pct * |
| (u64)sched_ravg_window, 100); |
| |
| p->ravg.demand = init_load_windows; |
| p->ravg.pred_demand = 0; |
| for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) |
| p->ravg.sum_history[i] = init_load_windows; |
| } |
| |
| /* Return task demand in percentage scale */ |
| unsigned int pct_task_load(struct task_struct *p) |
| { |
| unsigned int load; |
| |
| load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load()); |
| |
| return load; |
| } |
| |
| /* |
| * Return total number of tasks "eligible" to run on highest capacity cpu |
| * |
| * This is simply nr_big_tasks for cpus which are not of max_capacity and |
| * nr_running for cpus of max_capacity |
| */ |
| unsigned int nr_eligible_big_tasks(int cpu) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| int nr_big = rq->hmp_stats.nr_big_tasks; |
| int nr = rq->nr_running; |
| |
| if (cpu_max_possible_capacity(cpu) != max_possible_capacity) |
| return nr_big; |
| |
| return nr; |
| } |
| |
| static inline int exiting_task(struct task_struct *p) |
| { |
| return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); |
| } |
| |
| static int __init set_sched_ravg_window(char *str) |
| { |
| unsigned int window_size; |
| |
| get_option(&str, &window_size); |
| |
| if (window_size < MIN_SCHED_RAVG_WINDOW || |
| window_size > MAX_SCHED_RAVG_WINDOW) { |
| WARN_ON(1); |
| return -EINVAL; |
| } |
| |
| sched_ravg_window = window_size; |
| return 0; |
| } |
| |
| early_param("sched_ravg_window", set_sched_ravg_window); |
| |
| static inline void |
| update_window_start(struct rq *rq, u64 wallclock) |
| { |
| s64 delta; |
| int nr_windows; |
| |
| delta = wallclock - rq->window_start; |
| BUG_ON(delta < 0); |
| if (delta < sched_ravg_window) |
| return; |
| |
| nr_windows = div64_u64(delta, sched_ravg_window); |
| rq->window_start += (u64)nr_windows * (u64)sched_ravg_window; |
| } |
| |
| #define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) |
| |
| static inline u64 scale_exec_time(u64 delta, struct rq *rq) |
| { |
| u32 freq; |
| |
| freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); |
| delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq); |
| delta *= rq->cluster->exec_scale_factor; |
| delta >>= 10; |
| |
| return delta; |
| } |
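| |
| /* |
| * Example of the scaling above with assumed values: a 1 ms raw delta on a cpu |
| * running at 1 GHz, with max_possible_freq = 2 GHz and an exec_scale_factor |
| * of 512 (half the efficiency of the best cluster), becomes |
| * 1 ms * 1000000 / 2000000 = 0.5 ms, then 0.5 ms * 512 >> 10 = 0.25 ms of |
| * scaled execution time, i.e. what the same work would have taken on the most |
| * capable cpu at its maximum frequency. |
| */ |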
| |
| static inline int cpu_is_waiting_on_io(struct rq *rq) |
| { |
| if (!sched_io_is_busy) |
| return 0; |
| |
| return atomic_read(&rq->nr_iowait); |
| } |
| |
| /* Does freq_required sufficiently exceed or fall behind cur_freq? */ |
| static inline int |
| nearly_same_freq(unsigned int cur_freq, unsigned int freq_required) |
| { |
| int delta = freq_required - cur_freq; |
| |
| if (freq_required > cur_freq) |
| return delta < sysctl_sched_freq_inc_notify; |
| |
| delta = -delta; |
| |
| return delta < sysctl_sched_freq_dec_notify; |
| } |
| |
| /* Convert busy time to frequency equivalent */ |
| static inline unsigned int load_to_freq(struct rq *rq, u64 load) |
| { |
| unsigned int freq; |
| |
| load = scale_load_to_cpu(load, cpu_of(rq)); |
| load *= 128; |
| load = div64_u64(load, max_task_load()); |
| |
| freq = load * cpu_max_possible_freq(cpu_of(rq)); |
| freq /= 128; |
| |
| return freq; |
| } |
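| |
| /* |
| * For instance, with hypothetical numbers: a load of 5 ms in a 10 ms window |
| * on a cpu whose max possible frequency is 2 GHz converts to roughly |
| * (5 / 10) * 2000000 = 1000000 kHz, i.e. the busy time is expressed as the |
| * frequency it would fully occupy. Scaling by 128 before the division keeps |
| * fractional utilization (in 1/128 steps) from being lost to integer math. |
| */ |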
| |
| static inline struct group_cpu_time * |
| _group_cpu_time(struct related_thread_group *grp, int cpu); |
| |
| /* |
| * Return load from all related groups on the given cpu. |
| * Caller must ensure that related_thread_group_lock is held. |
| */ |
| static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load) |
| { |
| struct related_thread_group *grp; |
| |
| for_each_related_thread_group(grp) { |
| struct group_cpu_time *cpu_time; |
| |
| cpu_time = _group_cpu_time(grp, cpu); |
| *grp_load += cpu_time->prev_runnable_sum; |
| if (new_grp_load) |
| *new_grp_load += cpu_time->nt_prev_runnable_sum; |
| } |
| } |
| |
| /* |
| * Return load from all related groups in given frequency domain. |
| * Caller must ensure that related_thread_group_lock is held. |
| */ |
| static void group_load_in_freq_domain(struct cpumask *cpus, |
| u64 *grp_load, u64 *new_grp_load) |
| { |
| struct related_thread_group *grp; |
| int j; |
| |
| for_each_related_thread_group(grp) { |
| for_each_cpu(j, cpus) { |
| struct group_cpu_time *cpu_time; |
| |
| cpu_time = _group_cpu_time(grp, j); |
| *grp_load += cpu_time->prev_runnable_sum; |
| *new_grp_load += cpu_time->nt_prev_runnable_sum; |
| } |
| } |
| } |
| |
| /* |
| * Should scheduler alert governor for changing frequency? |
| * |
| * @check_pred - evaluate frequency based on the predictive demand |
| * @check_groups - add load from all related groups on given cpu |
| * |
| * check_groups is set to 1 if a "related" task movement/wakeup is triggering |
| * the notification check. To avoid "re-aggregation" of demand in such cases, |
| * we check whether the migrated/woken task's demand (along with demand from |
| * existing tasks on the cpu) can be met on the target cpu. |
| * |
| */ |
| |
| static int send_notification(struct rq *rq, int check_pred, int check_groups) |
| { |
| unsigned int cur_freq, freq_required; |
| unsigned long flags; |
| int rc = 0; |
| u64 group_load = 0, new_load = 0; |
| |
| if (check_pred) { |
| u64 prev = rq->old_busy_time; |
| u64 predicted = rq->hmp_stats.pred_demands_sum; |
| |
| if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq))) |
| return 0; |
| |
| prev = max(prev, rq->old_estimated_time); |
| if (prev > predicted) |
| return 0; |
| |
| cur_freq = load_to_freq(rq, prev); |
| freq_required = load_to_freq(rq, predicted); |
| |
| if (freq_required < cur_freq + sysctl_sched_pred_alert_freq) |
| return 0; |
| } else { |
| read_lock(&related_thread_group_lock); |
| /* |
| * Protect from concurrent update of rq->prev_runnable_sum and |
| * group cpu load |
| */ |
| raw_spin_lock_irqsave(&rq->lock, flags); |
| if (check_groups) |
| _group_load_in_cpu(cpu_of(rq), &group_load, NULL); |
| |
| new_load = rq->prev_runnable_sum + group_load; |
| |
| raw_spin_unlock_irqrestore(&rq->lock, flags); |
| read_unlock(&related_thread_group_lock); |
| |
| cur_freq = load_to_freq(rq, rq->old_busy_time); |
| freq_required = load_to_freq(rq, new_load); |
| |
| if (nearly_same_freq(cur_freq, freq_required)) |
| return 0; |
| } |
| |
| raw_spin_lock_irqsave(&rq->lock, flags); |
| if (!rq->cluster->notifier_sent) { |
| rq->cluster->notifier_sent = 1; |
| rc = 1; |
| trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq, |
| new_load); |
| } |
| raw_spin_unlock_irqrestore(&rq->lock, flags); |
| |
| return rc; |
| } |
| |
| /* Alert governor if there is a need to change frequency */ |
| void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups) |
| { |
| int cpu = cpu_of(rq); |
| |
| if (!send_notification(rq, check_pred, check_groups)) |
| return; |
| |
| atomic_notifier_call_chain( |
| &load_alert_notifier_head, 0, |
| (void *)(long)cpu); |
| } |
| |
| void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead, |
| struct task_struct *p) |
| { |
| bool check_groups; |
| |
| rcu_read_lock(); |
| check_groups = task_in_related_thread_group(p); |
| rcu_read_unlock(); |
| |
| if (!same_freq_domain(src_cpu, dest_cpu)) { |
| if (!src_cpu_dead) |
| check_for_freq_change(cpu_rq(src_cpu), false, |
| check_groups); |
| check_for_freq_change(cpu_rq(dest_cpu), false, check_groups); |
| } else { |
| check_for_freq_change(cpu_rq(dest_cpu), true, check_groups); |
| } |
| } |
| |
| static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, |
| u64 irqtime, int event) |
| { |
| if (is_idle_task(p)) { |
| /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ |
| if (event == PICK_NEXT_TASK) |
| return 0; |
| |
| /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ |
| return irqtime || cpu_is_waiting_on_io(rq); |
| } |
| |
| if (event == TASK_WAKE) |
| return 0; |
| |
| if (event == PUT_PREV_TASK || event == IRQ_UPDATE) |
| return 1; |
| |
| /* |
| * TASK_UPDATE can be called on sleeping task, when its moved between |
| * related groups |
| */ |
| if (event == TASK_UPDATE) { |
| if (rq->curr == p) |
| return 1; |
| |
| return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; |
| } |
| |
| /* TASK_MIGRATE, PICK_NEXT_TASK left */ |
| return SCHED_FREQ_ACCOUNT_WAIT_TIME; |
| } |
| |
| static inline bool is_new_task(struct task_struct *p) |
| { |
| return p->ravg.active_windows < sysctl_sched_new_task_windows; |
| } |
| |
| #define INC_STEP 8 |
| #define DEC_STEP 2 |
| #define CONSISTENT_THRES 16 |
| #define INC_STEP_BIG 16 |
| /* |
| * bucket_increase - update the count of all buckets |
| * |
| * @buckets: array of buckets tracking busy time of a task |
| * @idx: the index of bucket to be incremented |
| * |
| * Each time a complete window finishes, the count of the bucket that the |
| * runtime falls in (@idx) is incremented. Counts of all other buckets are |
| * decayed. The rate of increase and decay could be different based |
| * on current count in the bucket. |
| */ |
| static inline void bucket_increase(u8 *buckets, int idx) |
| { |
| int i, step; |
| |
| for (i = 0; i < NUM_BUSY_BUCKETS; i++) { |
| if (idx != i) { |
| if (buckets[i] > DEC_STEP) |
| buckets[i] -= DEC_STEP; |
| else |
| buckets[i] = 0; |
| } else { |
| step = buckets[i] >= CONSISTENT_THRES ? |
| INC_STEP_BIG : INC_STEP; |
| if (buckets[i] > U8_MAX - step) |
| buckets[i] = U8_MAX; |
| else |
| buckets[i] += step; |
| } |
| } |
| } |
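| |
| /* |
| * Example of the increase/decay behaviour with hypothetical counts: if bucket |
| * 3 currently holds 20 (>= CONSISTENT_THRES) and is hit again, it jumps by |
| * INC_STEP_BIG to 36, while a missed bucket holding 5 decays by DEC_STEP to 3 |
| * and one holding 1 drops to 0. Counts saturate at U8_MAX instead of |
| * wrapping. |
| */ |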
| |
| static inline int busy_to_bucket(u32 normalized_rt) |
| { |
| int bidx; |
| |
| bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); |
| bidx = min(bidx, NUM_BUSY_BUCKETS - 1); |
| |
| /* |
| * Combine the lowest two buckets. The lowest frequency falls into |
| * the 2nd bucket, so continuing to predict the lowest bucket is |
| * not useful. |
| */ |
| if (!bidx) |
| bidx++; |
| |
| return bidx; |
| } |
| |
| static inline u64 |
| scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) |
| { |
| return div64_u64(load * (u64)src_freq, (u64)dst_freq); |
| } |
| |
| #define HEAVY_TASK_SKIP 2 |
| #define HEAVY_TASK_SKIP_LIMIT 4 |
| /* |
| * get_pred_busy - calculate predicted demand for a task on runqueue |
| * |
| * @rq: runqueue of task p |
| * @p: task whose prediction is being updated |
| * @start: starting bucket. returned prediction should not be lower than |
| * this bucket. |
| * @runtime: runtime of the task. returned prediction should not be lower |
| * than this runtime. |
| * Note: @start can be derived from @runtime. It's passed in only to |
| * avoid duplicated calculation in some cases. |
| * |
| * A new predicted busy time is returned for task @p based on @runtime |
| * passed in. The function searches through buckets that represent busy |
| * time equal to or bigger than @runtime and attempts to find the bucket |
| * to use for prediction. Once found, it searches through historical busy |
| * time and returns the latest that falls into the bucket. If no such busy |
| * time exists, it returns the midpoint of that bucket. |
| */ |
| static u32 get_pred_busy(struct rq *rq, struct task_struct *p, |
| int start, u32 runtime) |
| { |
| int i; |
| u8 *buckets = p->ravg.busy_buckets; |
| u32 *hist = p->ravg.sum_history; |
| u32 dmin, dmax; |
| u64 cur_freq_runtime = 0; |
| int first = NUM_BUSY_BUCKETS, final, skip_to; |
| u32 ret = runtime; |
| |
| /* skip prediction for new tasks due to lack of history */ |
| if (unlikely(is_new_task(p))) |
| goto out; |
| |
| /* find minimal bucket index to pick */ |
| for (i = start; i < NUM_BUSY_BUCKETS; i++) { |
| if (buckets[i]) { |
| first = i; |
| break; |
| } |
| } |
| /* if no higher buckets are filled, predict runtime */ |
| if (first >= NUM_BUSY_BUCKETS) |
| goto out; |
| |
| /* compute the bucket for prediction */ |
| final = first; |
| if (first < HEAVY_TASK_SKIP_LIMIT) { |
| /* compute runtime at current CPU frequency */ |
| cur_freq_runtime = mult_frac(runtime, max_possible_efficiency, |
| rq->cluster->efficiency); |
| cur_freq_runtime = scale_load_to_freq(cur_freq_runtime, |
| max_possible_freq, rq->cluster->cur_freq); |
| /* |
| * if the task runs for the majority of the window, try to |
| * pick higher buckets. |
| */ |
| if (cur_freq_runtime >= sched_major_task_runtime) { |
| int next = NUM_BUSY_BUCKETS; |
| /* |
| * if there is a higher bucket that's consistently |
| * hit, don't jump beyond that. |
| */ |
| for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT && |
| i < NUM_BUSY_BUCKETS; i++) { |
| if (buckets[i] > CONSISTENT_THRES) { |
| next = i; |
| break; |
| } |
| } |
| skip_to = min(next, start + HEAVY_TASK_SKIP); |
| /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */ |
| skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to); |
| /* don't go below first non-empty bucket, if any */ |
| final = max(first, skip_to); |
| } |
| } |
| |
| /* determine demand range for the predicted bucket */ |
| if (final < 2) { |
| /* lowest two buckets are combined */ |
| dmin = 0; |
| final = 1; |
| } else { |
| dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); |
| } |
| dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); |
| |
| /* |
| * search through runtime history and return first runtime that falls |
| * into the range of predicted bucket. |
| */ |
| for (i = 0; i < sched_ravg_hist_size; i++) { |
| if (hist[i] >= dmin && hist[i] < dmax) { |
| ret = hist[i]; |
| break; |
| } |
| } |
| /* no historical runtime within the bucket found, use the bucket midpoint */ |
| if (ret < dmin) |
| ret = (dmin + dmax) / 2; |
| /* |
| * when updating in the middle of a window, runtime could be higher |
| * than all recorded history. Always predict at least @runtime. |
| */ |
| ret = max(runtime, ret); |
| out: |
| trace_sched_update_pred_demand(rq, p, runtime, |
| mult_frac((unsigned int)cur_freq_runtime, 100, |
| sched_ravg_window), ret); |
| return ret; |
| } |
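| |
| /* |
| * Worked example with a hypothetical history, assuming NUM_BUSY_BUCKETS is 10 |
| * and max_task_load() is 10 ms so each bucket spans 1 ms: a @runtime of |
| * 2.5 ms starts the search at bucket 2; if buckets 2 and 3 are empty but |
| * bucket 4 has a non-zero count, the predicted range becomes [4 ms, 5 ms) and |
| * the most recent history entry inside that range is returned, falling back |
| * to the 4.5 ms midpoint when no entry fits. The HEAVY_TASK_SKIP logic only |
| * kicks in when the task consumed roughly the majority of the current window |
| * at the present frequency. |
| */ |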
| |
| static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p) |
| { |
| if (p->ravg.pred_demand >= p->ravg.curr_window) |
| return p->ravg.pred_demand; |
| |
| return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window), |
| p->ravg.curr_window); |
| } |
| |
| /* |
| * The predictive demand of a task is calculated at the window roll-over. |
| * If the task's current window busy time exceeds the predicted |
| * demand, update it here to reflect the task's needs. |
| */ |
| void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) |
| { |
| u32 new, old; |
| |
| if (is_idle_task(p) || exiting_task(p)) |
| return; |
| |
| if (event != PUT_PREV_TASK && event != TASK_UPDATE && |
| (!SCHED_FREQ_ACCOUNT_WAIT_TIME || |
| (event != TASK_MIGRATE && |
| event != PICK_NEXT_TASK))) |
| return; |
| |
| /* |
| * TASK_UPDATE can be called on a sleeping task, when it is moved |
| * between related thread groups. |
| */ |
| if (event == TASK_UPDATE) { |
| if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) |
| return; |
| } |
| |
| new = calc_pred_demand(rq, p); |
| old = p->ravg.pred_demand; |
| |
| if (old >= new) |
| return; |
| |
| if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || |
| !p->dl.dl_throttled)) |
| p->sched_class->fixup_hmp_sched_stats(rq, p, |
| p->ravg.demand, |
| new); |
| |
| p->ravg.pred_demand = new; |
| } |
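| |
| /* |
| * Illustrative example of the rule above (values are made up): |
| * |
| *   p->ravg.pred_demand = 4000000 |
| *   p->ravg.curr_window = 5000000 -> calc_pred_demand() consults |
| *                                    get_pred_busy() and the prediction |
| *                                    is raised (to at least 5000000) |
| *   p->ravg.curr_window = 3000000 -> calc_pred_demand() returns the |
| *                                    existing 4000000; no update. |
| */ |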
| |
| /* |
| * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) |
| */ |
| static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, |
| int event, u64 wallclock, u64 irqtime) |
| { |
| int new_window, full_window = 0; |
| int p_is_curr_task = (p == rq->curr); |
| u64 mark_start = p->ravg.mark_start; |
| u64 window_start = rq->window_start; |
| u32 window_size = sched_ravg_window; |
| u64 delta; |
| u64 *curr_runnable_sum = &rq->curr_runnable_sum; |
| u64 *prev_runnable_sum = &rq->prev_runnable_sum; |
| u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; |
| u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; |
| int flip_counters = 0; |
| int prev_sum_reset = 0; |
| bool new_task; |
| struct related_thread_group *grp; |
| |
| new_window = mark_start < window_start; |
| if (new_window) { |
| full_window = (window_start - mark_start) >= window_size; |
| if (p->ravg.active_windows < USHRT_MAX) |
| p->ravg.active_windows++; |
| } |
| |
| new_task = is_new_task(p); |
| |
| grp = p->grp; |
| if (grp && sched_freq_aggregate) { |
| /* cpu_time protected by rq_lock */ |
| struct group_cpu_time *cpu_time = |
| _group_cpu_time(grp, cpu_of(rq)); |
| |
| curr_runnable_sum = &cpu_time->curr_runnable_sum; |
| prev_runnable_sum = &cpu_time->prev_runnable_sum; |
| |
| nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; |
| nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; |
| |
| if (cpu_time->window_start != rq->window_start) { |
| int nr_windows; |
| |
| delta = rq->window_start - cpu_time->window_start; |
| nr_windows = div64_u64(delta, window_size); |
| if (nr_windows > 1) |
| prev_sum_reset = 1; |
| |
| cpu_time->window_start = rq->window_start; |
| flip_counters = 1; |
| } |
| |
| if (p_is_curr_task && new_window) { |
| u64 curr_sum = rq->curr_runnable_sum; |
| u64 nt_curr_sum = rq->nt_curr_runnable_sum; |
| |
| if (full_window) |
| curr_sum = nt_curr_sum = 0; |
| |
| rq->prev_runnable_sum = curr_sum; |
| rq->nt_prev_runnable_sum = nt_curr_sum; |
| |
| rq->curr_runnable_sum = 0; |
| rq->nt_curr_runnable_sum = 0; |
| } |
| } else { |
| if (p_is_curr_task && new_window) { |
| flip_counters = 1; |
| if (full_window) |
| prev_sum_reset = 1; |
| } |
| } |
| |
| /* |
| * Handle per-task window rollover. We don't care about the idle |
| * task or exiting tasks. |
| */ |
| if (new_window && !is_idle_task(p) && !exiting_task(p)) { |
| u32 curr_window = 0; |
| |
| if (!full_window) |
| curr_window = p->ravg.curr_window; |
| |
| p->ravg.prev_window = curr_window; |
| p->ravg.curr_window = 0; |
| } |
| |
| if (flip_counters) { |
| u64 curr_sum = *curr_runnable_sum; |
| u64 nt_curr_sum = *nt_curr_runnable_sum; |
| |
| if (prev_sum_reset) |
| curr_sum = nt_curr_sum = 0; |
| |
| *prev_runnable_sum = curr_sum; |
| *nt_prev_runnable_sum = nt_curr_sum; |
| |
| *curr_runnable_sum = 0; |
| *nt_curr_runnable_sum = 0; |
| } |
| |
| if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { |
| /* |
| * account_busy_for_cpu_time() = 0, so no update to the |
| * task's current window needs to be made. This could be |
| * for example |
| * |
| * - a wakeup event on a task within the current |
| * window (!new_window below, no action required), |
| * - switching to a new task from idle (PICK_NEXT_TASK) |
| * in a new window where irqtime is 0 and we aren't |
| * waiting on IO |
| */ |
| |
| if (!new_window) |
| return; |
| |
| /* |
| * A new window has started. The RQ demand must be rolled |
| * over if p is the current task. |
| */ |
| if (p_is_curr_task) { |
| /* p is idle task */ |
| BUG_ON(p != rq->idle); |
| } |
| |
| return; |
| } |
| |
| if (!new_window) { |
| /* |
| * account_busy_for_cpu_time() = 1 so busy time needs |
| * to be accounted to the current window. No rollover |
| * since we didn't start a new window. An example of this is |
| * when a task starts execution and then sleeps within the |
| * same window. |
| */ |
| |
| if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) |
| delta = wallclock - mark_start; |
| else |
| delta = irqtime; |
| delta = scale_exec_time(delta, rq); |
| *curr_runnable_sum += delta; |
| if (new_task) |
| *nt_curr_runnable_sum += delta; |
| |
| if (!is_idle_task(p) && !exiting_task(p)) |
| p->ravg.curr_window += delta; |
| |
| return; |
| } |
| |
| if (!p_is_curr_task) { |
| /* |
| * account_busy_for_cpu_time() = 1 so busy time needs |
| * to be accounted to the current window. A new window |
| * has also started, but p is not the current task, so the |
| * window is not rolled over - just split up and account |
| * as necessary into curr and prev. The window is only |
| * rolled over when a new window is processed for the current |
| * task. |
| * |
| * Irqtime can't be accounted by a task that isn't the |
| * currently running task. |
| */ |
| |
| if (!full_window) { |
| /* |
| * A full window hasn't elapsed, account partial |
| * contribution to previous completed window. |
| */ |
| delta = scale_exec_time(window_start - mark_start, rq); |
| if (!exiting_task(p)) |
| p->ravg.prev_window += delta; |
| } else { |
| /* |
| * Since at least one full window has elapsed, |
| * the contribution to the previous window is the |
| * full window (window_size). |
| */ |
| delta = scale_exec_time(window_size, rq); |
| if (!exiting_task(p)) |
| p->ravg.prev_window = delta; |
| } |
| |
| *prev_runnable_sum += delta; |
| if (new_task) |
| *nt_prev_runnable_sum += delta; |
| |
| /* Account piece of busy time in the current window. */ |
| delta = scale_exec_time(wallclock - window_start, rq); |
| *curr_runnable_sum += delta; |
| if (new_task) |
| *nt_curr_runnable_sum += delta; |
| |
| if (!exiting_task(p)) |
| p->ravg.curr_window = delta; |
| |
| return; |
| } |
| |
| if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { |
| /* |
| * account_busy_for_cpu_time() = 1 so busy time needs |
| * to be accounted to the current window. A new window |
| * has started and p is the current task so rollover is |
| * needed. If any of the three conditions checked above is true, |
| * then this busy time can't be accounted as irqtime. |
| * |
| * Busy time for the idle task or exiting tasks need not |
| * be accounted. |
| * |
| * An example of this would be a task that starts execution |
| * and then sleeps once a new window has begun. |
| */ |
| |
| if (!full_window) { |
| /* |
| * A full window hasn't elapsed, account partial |
| * contribution to previous completed window. |
| */ |
| delta = scale_exec_time(window_start - mark_start, rq); |
| if (!is_idle_task(p) && !exiting_task(p)) |
| p->ravg.prev_window += delta; |
| } else { |
| /* |
| * Since at least one full window has elapsed, |
| * the contribution to the previous window is the |
| * full window (window_size). |
| */ |
| delta = scale_exec_time(window_size, rq); |
| if (!is_idle_task(p) && !exiting_task(p)) |
| p->ravg.prev_window = delta; |
| } |
| |
| /* |
| * Rollover is done here by overwriting the values in |
| * prev_runnable_sum and curr_runnable_sum. |
| */ |
| *prev_runnable_sum += delta; |
| if (new_task) |
| *nt_prev_runnable_sum += delta; |
| |
| /* Account piece of busy time in the current window. */ |
| delta = scale_exec_time(wallclock - window_start, rq); |
| *curr_runnable_sum += delta; |
| if (new_task) |
| *nt_curr_runnable_sum += delta; |
| |
| if (!is_idle_task(p) && !exiting_task(p)) |
| p->ravg.curr_window = delta; |
| |
| return; |
| } |
| |
| if (irqtime) { |
| /* |
| * account_busy_for_cpu_time() = 1 so busy time needs |
| * to be accounted to the current window. A new window |
| * has started and p is the current task so rollover is |
| * needed. The current task must be the idle task because |
| * irqtime is not accounted for any other task. |
| * |
| * Irqtime will be accounted each time we process IRQ activity |
| * after a period of idleness, so we know the IRQ busy time |
| * started at wallclock - irqtime. |
| */ |
| |
| BUG_ON(!is_idle_task(p)); |
| mark_start = wallclock - irqtime; |
| |
| /* |
| * Roll the window over. If the IRQ busy time was entirely within |
| * the current window then that is all that needs to be accounted. |
| */ |
| if (mark_start > window_start) { |
| *curr_runnable_sum = scale_exec_time(irqtime, rq); |
| return; |
| } |
| |
| /* |
| * The IRQ busy time spanned multiple windows. Process the |
| * busy time preceding the current window start first. |
| */ |
| delta = window_start - mark_start; |
| if (delta > window_size) |
| delta = window_size; |
| delta = scale_exec_time(delta, rq); |
| *prev_runnable_sum += delta; |
| |
| /* Process the remaining IRQ busy time in the current window. */ |
| delta = wallclock - window_start; |
| rq->curr_runnable_sum = scale_exec_time(delta, rq); |
| |
| return; |
| } |
| |
| BUG(); |
| } |
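| |
| /* |
| * Illustrative rollover example for the paths above (assumes a 20ms |
| * window, i.e. sched_ravg_window == 20000000, and no frequency or |
| * efficiency scaling, so scale_exec_time() is an identity): |
| * |
| *   mark_start   = 95000000   (5ms before the window boundary) |
| *   window_start = 100000000 |
| *   wallclock    = 103000000  (3ms into the new window) |
| * |
| *   full_window is false, so: |
| *     the previous window gets window_start - mark_start = 5000000 |
| *     the current window gets wallclock - window_start   = 3000000 |
| * |
| *   Had more than a full window elapsed (full_window true), the |
| *   previous-window contribution would instead be clamped to |
| *   window_size. |
| */ |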
| |
| static inline u32 predict_and_update_buckets(struct rq *rq, |
| struct task_struct *p, u32 runtime) { |
| |
| int bidx; |
| u32 pred_demand; |
| |
| bidx = busy_to_bucket(runtime); |
| pred_demand = get_pred_busy(rq, p, bidx, runtime); |
| bucket_increase(p->ravg.busy_buckets, bidx); |
| |
| return pred_demand; |
| } |
| |
| static void update_task_cpu_cycles(struct task_struct *p, int cpu) |
| { |
| if (use_cycle_counter) |
| p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); |
| } |
| |
| static void |
| update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, |
| u64 wallclock, u64 irqtime) |
| { |
| u64 cur_cycles; |
| int cpu = cpu_of(rq); |
| |
| lockdep_assert_held(&rq->lock); |
| |
| if (!use_cycle_counter) { |
| rq->cc.cycles = cpu_cur_freq(cpu); |
| rq->cc.time = 1; |
| return; |
| } |
| |
| cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu); |
| |
| /* |
| * If the current task is the idle task and irqtime == 0, the CPU |
| * was indeed idle and its cycle counter was probably not |
| * advancing. We still need an estimated CPU frequency for IO |
| * wait time accounting, so use the previously calculated |
| * frequency in that case. |
| */ |
| if (!is_idle_task(rq->curr) || irqtime) { |
| if (unlikely(cur_cycles < p->cpu_cycles)) |
| rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles); |
| else |
| rq->cc.cycles = cur_cycles - p->cpu_cycles; |
| rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC; |
| |
| if (event == IRQ_UPDATE && is_idle_task(p)) |
| /* |
| * Time between mark_start of idle task and IRQ handler |
| * entry time is CPU cycle counter stall period. |
| * Upon IRQ handler entry sched_account_irqstart() |
| * replenishes idle task's cpu cycle counter so |
| * rq->cc.cycles now represents increased cycles during |
| * IRQ handler rather than time between idle entry and |
| * IRQ exit. Thus use irqtime as time delta. |
| */ |
| rq->cc.time = irqtime; |
| else |
| rq->cc.time = wallclock - p->ravg.mark_start; |
| BUG_ON((s64)rq->cc.time < 0); |
| } |
| |
| p->cpu_cycles = cur_cycles; |
| |
| trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); |
| } |
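| |
| /* |
| * Illustrative frequency estimate derived from the code above (assumes |
| * the platform counter returns raw CPU cycles and that |
| * cpu_cycles_to_freq() is simply cycles / time): |
| * |
| *   cur_cycles - p->cpu_cycles = 1500000 cycles |
| *   wallclock - mark_start     = 1000000 ns (1ms) |
| * |
| *   rq->cc.cycles = 1500000 * NSEC_PER_MSEC = 1500000000000 |
| *   estimated frequency = 1500000000000 / 1000000 = 1500000 KHz (1.5 GHz) |
| */ |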
| |
| static int account_busy_for_task_demand(struct task_struct *p, int event) |
| { |
| /* |
| * No need to bother updating task demand for exiting tasks |
| * or the idle task. |
| */ |
| if (exiting_task(p) || is_idle_task(p)) |
| return 0; |
| |
| /* |
| * When a task is waking up it is completing a segment of non-busy |
| * time. Likewise, if wait time is not treated as busy time, then |
| * when a task begins to run or is migrated, it is not running and |
| * is completing a segment of non-busy time. |
| */ |
| if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && |
| (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) |
| return 0; |
| |
| return 1; |
| } |
| |
| /* |
| * Called when a new window is starting for a task, to record cpu usage over |
| * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 |
| * when, say, a real-time task runs without preemption for several windows at a |
| * stretch. |
| */ |
| static void update_history(struct rq *rq, struct task_struct *p, |
| u32 runtime, int samples, int event) |
| { |
| u32 *hist = &p->ravg.sum_history[0]; |
| int ridx, widx; |
| u32 max = 0, avg, demand, pred_demand; |
| u64 sum = 0; |
| |
| /* Ignore windows where task had no activity */ |
| if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) |
| goto done; |
| |
| /* Push new 'runtime' value onto stack */ |
| widx = sched_ravg_hist_size - 1; |
| ridx = widx - samples; |
| for (; ridx >= 0; --widx, --ridx) { |
| hist[widx] = hist[ridx]; |
| sum += hist[widx]; |
| if (hist[widx] > max) |
| max = hist[widx]; |
| } |
| |
| for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { |
| hist[widx] = runtime; |
| sum += hist[widx]; |
| if (hist[widx] > max) |
| max = hist[widx]; |
| } |
| |
| p->ravg.sum = 0; |
| |
| if (sched_window_stats_policy == WINDOW_STATS_RECENT) { |
| demand = runtime; |
| } else if (sched_window_stats_policy == WINDOW_STATS_MAX) { |
| demand = max; |
| } else { |
| avg = div64_u64(sum, sched_ravg_hist_size); |
| if (sched_window_stats_policy == WINDOW_STATS_AVG) |
| demand = avg; |
| else |
| demand = max(avg, runtime); |
| } |
| pred_demand = predict_and_update_buckets(rq, p, runtime); |
| |
| /* |
| * A throttled deadline sched class task gets dequeued without |
| * changing p->on_rq. Since the dequeue already decremented the hmp |
| * stats, avoid decrementing them here again. |
| */ |
| if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || |
| !p->dl.dl_throttled)) |
| p->sched_class->fixup_hmp_sched_stats(rq, p, demand, |
| pred_demand); |
| |
| p->ravg.demand = demand; |
| p->ravg.pred_demand = pred_demand; |
| |
| done: |
| trace_sched_update_history(rq, p, runtime, samples, event); |
| } |
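| |
| /* |
| * Illustrative example of the demand policies above (assumes |
| * sched_ravg_hist_size == 5 and the most recent window, runtime, is |
| * 3000000): |
| * |
| *   sum_history[] after the shift = { 3000000, 9000000, 2000000, |
| *                                     8000000, 6000000 } |
| *   max = 9000000, avg = 28000000 / 5 = 5600000 |
| * |
| *   WINDOW_STATS_RECENT -> demand = runtime            = 3000000 |
| *   WINDOW_STATS_MAX    -> demand = max                = 9000000 |
| *   WINDOW_STATS_AVG    -> demand = avg                = 5600000 |
| *   any other policy    -> demand = max(avg, runtime)  = 5600000 |
| */ |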
| |
| static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) |
| { |
| delta = scale_exec_time(delta, rq); |
| p->ravg.sum += delta; |
| if (unlikely(p->ravg.sum > sched_ravg_window)) |
| p->ravg.sum = sched_ravg_window; |
| } |
| |
| /* |
| * Account cpu demand of task and/or update task's cpu demand history |
| * |
| * ms = p->ravg.mark_start; |
| * wc = wallclock |
| * ws = rq->window_start |
| * |
| * Three possibilities: |
| * |
| * a) Task event is contained within one window. |
| * window_start < mark_start < wallclock |
| * |
| * ws ms wc |
| * | | | |
| * V V V |
| * |---------------| |
| * |
| * In this case, p->ravg.sum is updated *iff* event is appropriate |
| * (ex: event == PUT_PREV_TASK) |
| * |
| * b) Task event spans two windows. |
| * mark_start < window_start < wallclock |
| * |
| * ms ws wc |
| * | | | |
| * V V V |
| * -----|------------------- |
| * |
| * In this case, p->ravg.sum is updated with (ws - ms) *iff* event |
| * is appropriate, then a new window sample is recorded followed |
| * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. |
| * |
| * c) Task event spans more than two windows. |
| * |
| * ms ws_tmp ws wc |
| * | | | | |
| * V V V V |
| * ---|-------|-------|-------|-------|------ |
| * | | |
| * |<------ nr_full_windows ------>| |
| * |
| * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* |
| * event is appropriate, a window sample of p->ravg.sum is recorded, |
| * 'nr_full_windows' samples of window_size are also recorded *iff* |
| * event is appropriate, and finally p->ravg.sum is set to (wc - ws) |
| * *iff* event is appropriate. |
| * |
| * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() |
| * depends on it! |
| */ |
| static void update_task_demand(struct task_struct *p, struct rq *rq, |
| int event, u64 wallclock) |
| { |
| u64 mark_start = p->ravg.mark_start; |
| u64 delta, window_start = rq->window_start; |
| int new_window, nr_full_windows; |
| u32 window_size = sched_ravg_window; |
| |
| new_window = mark_start < window_start; |
| if (!account_busy_for_task_demand(p, event)) { |
| if (new_window) |
| /* |
| * If this time delta is not being accounted as |
| * busy time and a new window has started, only the |
| * previous window needs to be closed out with the |
| * pre-existing demand. Multiple windows may have |
| * elapsed, but since empty windows are dropped, |
| * it is not necessary to account for them. |
| */ |
| update_history(rq, p, p->ravg.sum, 1, event); |
| return; |
| } |
| |
| if (!new_window) { |
| /* |
| * The simple case - busy time contained within the existing |
| * window. |
| */ |
| add_to_task_demand(rq, p, wallclock - mark_start); |
| return; |
| } |
| |
| /* |
| * Busy time spans at least two windows. Temporarily rewind |
| * window_start to first window boundary after mark_start. |
| */ |
| delta = window_start - mark_start; |
| nr_full_windows = div64_u64(delta, window_size); |
| window_start -= (u64)nr_full_windows * (u64)window_size; |
| |
| /* Process (window_start - mark_start) first */ |
| add_to_task_demand(rq, p, window_start - mark_start); |
| |
| /* Push new sample(s) into task's demand history */ |
| update_history(rq, p, p->ravg.sum, 1, event); |
| if (nr_full_windows) |
| update_history(rq, p, scale_exec_time(window_size, rq), |
| nr_full_windows, event); |
| |
| /* |
| * Roll window_start back to current to process any remainder |
| * in current window. |
| */ |
| window_start += (u64)nr_full_windows * (u64)window_size; |
| |
| /* Process (wallclock - window_start) next */ |
| mark_start = window_start; |
| add_to_task_demand(rq, p, wallclock - mark_start); |
| } |
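| |
| /* |
| * Numeric example for case (c) above (assumes a 20ms window and |
| * scale_exec_time() as an identity): |
| * |
| *   ms = 15000000, ws = 90000000, wc = 95000000, window_size = 20000000 |
| * |
| *   delta           = ws - ms = 75000000 |
| *   nr_full_windows = 75000000 / 20000000 = 3 |
| *   ws_tmp          = ws - 3 * window_size = 30000000 |
| * |
| *   p->ravg.sum first accumulates (ws_tmp - ms) = 15000000 (on top of |
| *   anything already accumulated in that window) and is recorded as one |
| *   history sample, 3 further samples of window_size are recorded, and |
| *   finally p->ravg.sum restarts with (wc - ws) = 5000000. |
| */ |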
| |
| /* Reflect task activity on its demand and cpu's busy time statistics */ |
| void update_task_ravg(struct task_struct *p, struct rq *rq, int event, |
| u64 wallclock, u64 irqtime) |
| { |
| if (!rq->window_start || sched_disable_window_stats) |
| return; |
| |
| lockdep_assert_held(&rq->lock); |
| |
| update_window_start(rq, wallclock); |
| |
| if (!p->ravg.mark_start) { |
| update_task_cpu_cycles(p, cpu_of(rq)); |
| goto done; |
| } |
| |
| update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); |
| update_task_demand(p, rq, event, wallclock); |
| update_cpu_busy_time(p, rq, event, wallclock, irqtime); |
| update_task_pred_demand(rq, p, event); |
| done: |
| trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, |
| rq->cc.cycles, rq->cc.time, |
| _group_cpu_time(p->grp, cpu_of(rq))); |
| |
| p->ravg.mark_start = wallclock; |
| } |
| |
| void sched_account_irqtime(int cpu, struct task_struct *curr, |
| u64 delta, u64 wallclock) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| unsigned long flags, nr_windows; |
| u64 cur_jiffies_ts; |
| |
| raw_spin_lock_irqsave(&rq->lock, flags); |
| |
| /* |
| * cputime (wallclock) uses sched_clock so use the same here for |
| * consistency. |
| */ |
| delta += sched_clock() - wallclock; |
| cur_jiffies_ts = get_jiffies_64(); |
| |
| if (is_idle_task(curr)) |
| update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), |
| delta); |
| |
| nr_windows = cur_jiffies_ts - rq->irqload_ts; |
| |
| if (nr_windows) { |
| if (nr_windows < 10) { |
| /* Decay CPU's avg_irqload by 3/4 (applied once for the elapsed windows). */ |
| rq->avg_irqload *= (3 * nr_windows); |
| rq->avg_irqload = div64_u64(rq->avg_irqload, |
| 4 * nr_windows); |
| } else { |
| rq->avg_irqload = 0; |
| } |
| rq->avg_irqload += rq->cur_irqload; |
| rq->cur_irqload = 0; |
| } |
| |
| rq->cur_irqload += delta; |
| rq->irqload_ts = cur_jiffies_ts; |
| raw_spin_unlock_irqrestore(&rq->lock, flags); |
| } |
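| |
| /* |
| * Illustrative decay example for the code above (each "window" here is |
| * one jiffy; with HZ == 100 that is 10ms): |
| * |
| *   avg_irqload = 800000, cur_irqload = 200000, nr_windows = 2 |
| * |
| *   avg_irqload = 800000 * (3 * 2) / (4 * 2) = 600000   (a 3/4 decay) |
| *   avg_irqload += cur_irqload               -> 800000 |
| *   cur_irqload = 0, then accumulates the new delta |
| * |
| *   Ten or more elapsed windows simply reset avg_irqload to 0. |
| */ |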
| |
| void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock) |
| { |
| struct rq *rq = cpu_rq(cpu); |
| |
| if (!rq->window_start || sched_disable_window_stats) |
| return; |
| |
| if (is_idle_task(curr)) { |
| /* We're here without rq->lock held, with IRQs disabled */ |
| raw_spin_lock(&rq->lock); |
| update_task_cpu_cycles(curr, cpu); |
| raw_spin_unlock(&rq->lock); |
| } |
| } |
| |
| void reset_task_stats(struct task_struct *p) |
| { |
| u32 sum = 0; |
| |
| if (exiting_task(p)) |
| sum = EXITING_TASK_MARKER; |
| |
| memset(&p->ravg, 0, sizeof(struct ravg)); |
| /* Retain EXITING_TASK marker */ |
| p->ravg.sum_history[0] = sum; |
| } |
| |
| void mark_task_starting(struct task_struct *p) |
| { |
| u64 wallclock; |
| struct rq *rq = task_rq(p); |
| |
| if (!rq->window_start || sched_disable_window_stats) { |
| reset_task_stats(p); |
| return; |
| } |
| |
| wallclock = sched_ktime_clock(); |
| p->ravg.mark_start = p->last_wake_ts = wallclock; |
| p->last_cpu_selected_ts = wallclock; |
| p->last_switch_out_ts = 0; |
| update_task_cpu_cycles(p, cpu_of(rq)); |
| } |
| |
| void set_window_start(struct rq *rq) |
| { |
| int cpu = cpu_of(rq); |
| struct rq *sync_rq = cpu_rq(sync_cpu); |
| |
| if (rq->window_start) |
| return; |
| |
| if (cpu == sync_cpu) { |
| rq->window_start = sched_ktime_clock(); |
| } else { |
| raw_spin_unlock(&rq->lock); |
| double_rq_lock(rq, sync_rq); |
| rq->window_start = cpu_rq(sync_cpu)->window_start; |
| rq->curr_runnable_sum = rq->prev_runnable_sum = 0; |
| rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; |
| raw_spin_unlock(&sync_rq->lock); |
| } |
| |
| rq->curr->ravg.mark_start = rq->window_start; |
| } |
| |
| void migrate_sync_cpu(int cpu) |
| { |
| if (cpu == sync_cpu) |
| sync_cpu = smp_processor_id(); |
| } |
| |
| static void reset_all_task_stats(void) |
| { |
| struct task_struct *g, *p; |
| |
| read_lock(&tasklist_lock); |
| do_each_thread(g, p) { |
| reset_task_stats(p); |
| } while_each_thread(g, p); |
| read_unlock(&tasklist_lock); |
| } |
| |
| static void disable_window_stats(void) |
| { |
| unsigned long flags; |
| int i; |
| |
| local_irq_save(flags); |
| for_each_possible_cpu(i) |
| raw_spin_lock(&cpu_rq(i)->lock); |
| |
| sched_disable_window_stats = 1; |
| |
| for_each_possible_cpu(i) |
| raw_spin_unlock(&cpu_rq(i)->lock); |
| |
| local_irq_restore(flags); |
| } |
| |
| /* Called with all cpus' rq->lock held */ |
| static void enable_window_stats(void) |
| { |
| sched_disable_window_stats = 0; |
| } |
| |
| enum reset_reason_code { |
| WINDOW_CHANGE, |
| POLICY_CHANGE, |
| HIST_SIZE_CHANGE, |
| FREQ_AGGREGATE_CHANGE, |
| }; |
| |
| const char *sched_window_reset_reasons[] = { |
| "WINDOW_CHANGE", |
| "POLICY_CHANGE", |
| "HIST_SIZE_CHANGE", |
| "FREQ_AGGREGATE_CHANGE", |
| }; |
| |
| /* Called with IRQs enabled */ |
| void reset_all_window_stats(u64 window_start, unsigned int window_size) |
| { |
| int cpu; |
| unsigned long flags; |
| u64 start_ts = sched_ktime_clock(); |
| int reason = WINDOW_CHANGE; |
| unsigned int old = 0, new = 0; |
| struct related_thread_group *grp; |
| |
| disable_window_stats(); |
| |
| reset_all_task_stats(); |
| |
| local_irq_save(flags); |
| |
| read_lock(&related_thread_group_lock); |
| |
| for_each_possible_cpu(cpu) |
| raw_spin_lock(&cpu_rq(cpu)->lock); |
| |
| list_for_each_entry(grp, &related_thread_groups, list) { |
| int j; |
| |
| for_each_possible_cpu(j) { |
| struct group_cpu_time *cpu_time; |
| /* Protected by rq lock */ |
| cpu_time = _group_cpu_time(grp, j); |
| memset(cpu_time, 0, sizeof(struct group_cpu_time)); |
| if (window_start) |
| cpu_time->window_start = window_start; |
| } |
| } |
| |
| if (window_size) { |
| sched_ravg_window = window_size * TICK_NSEC; |
| set_hmp_defaults(); |
| } |
| |
| enable_window_stats(); |
| |
| for_each_possible_cpu(cpu) { |
| struct rq *rq = cpu_rq(cpu); |
| |
| if (window_start) |
| rq->window_start = window_start; |
| rq->curr_runnable_sum = rq->prev_runnable_sum = 0; |
| rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0; |
| reset_cpu_hmp_stats(cpu, 1); |
| } |
| |
| if (sched_window_stats_policy != sysctl_sched_window_stats_policy) { |
| reason = POLICY_CHANGE; |
| old = sched_window_stats_policy; |
| new = sysctl_sched_window_stats_policy; |
| sched_window_stats_policy = sysctl_sched_window_stats_policy; |
| } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) { |
| reason = HIST_SIZE_CHANGE; |
| old = sched_ravg_hist_size; |
| new = sysctl_sched_ravg_hist_size; |
| sched_ravg_hist_size = sysctl_sched_ravg_hist_size; |
| } else if (sched_freq_aggregate != |
| sysctl_sched_freq_aggregate) { |
| reason = FREQ_AGGREGATE_CHANGE; |
| old = sched_freq_aggregate; |
| new = sysctl_sched_freq_aggregate; |
| sched_freq_aggregate = sysctl_sched_freq_aggregate; |
| } |
| |
| for_each_possible_cpu(cpu) |
| raw_spin_unlock(&cpu_rq(cpu)->lock); |
| |
| read_unlock(&related_thread_group_lock); |
| |
| local_irq_restore(flags); |
| |
| trace_sched_reset_all_window_stats(window_start, window_size, |
| sched_ktime_clock() - start_ts, reason, old, new); |
| } |
| |
| static inline void |
| sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time); |
| |
| void sched_get_cpus_busy(struct sched_load *busy, |
| const struct cpumask *query_cpus) |
| { |
| unsigned long flags; |
| struct rq *rq; |
| const int cpus = cpumask_weight(query_cpus); |
| u64 load[cpus], group_load[cpus]; |
| u64 nload[cpus], ngload[cpus]; |
| u64 pload[cpus]; |
| unsigned int cur_freq[cpus], max_freq[cpus]; |
| int notifier_sent = 0; |
| int early_detection[cpus]; |
| int cpu, i = 0; |
| unsigned int window_size; |
| u64 max_prev_sum = 0; |
| int max_busy_cpu = cpumask_first(query_cpus); |
| struct related_thread_group *grp; |
| u64 total_group_load = 0, total_ngload = 0; |
| bool aggregate_load = false; |
| |
| if (unlikely(cpus == 0)) |
| return; |
| |
| /* |
| * This function could be called in timer context, and the |
| * current task may have been executing for a long time. Ensure |
| * that the window stats are current by doing an update. |
| */ |
| read_lock(&related_thread_group_lock); |
| |
| local_irq_save(flags); |
| for_each_cpu(cpu, query_cpus) |
| raw_spin_lock(&cpu_rq(cpu)->lock); |
| |
| window_size = sched_ravg_window; |
| |
| for_each_cpu(cpu, query_cpus) { |
| rq = cpu_rq(cpu); |
| |
| update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), |
| 0); |
| cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time); |
| |
| load[i] = rq->old_busy_time = rq->prev_runnable_sum; |
| nload[i] = rq->nt_prev_runnable_sum; |
| pload[i] = rq->hmp_stats.pred_demands_sum; |
| rq->old_estimated_time = pload[i]; |
| |
| if (load[i] > max_prev_sum) { |
| max_prev_sum = load[i]; |
| max_busy_cpu = cpu; |
| } |
| |
| /* |
| * sched_get_cpus_busy() is called for all CPUs in a |
| * frequency domain. So the notifier_sent flag per |
| * cluster works even when a frequency domain spans |
| * more than 1 cluster. |
| */ |
| if (rq->cluster->notifier_sent) { |
| notifier_sent = 1; |
| rq->cluster->notifier_sent = 0; |
| } |
| early_detection[i] = (rq->ed_task != NULL); |
| cur_freq[i] = cpu_cur_freq(cpu); |
| max_freq[i] = cpu_max_freq(cpu); |
| i++; |
| } |
| |
| for_each_related_thread_group(grp) { |
| for_each_cpu(cpu, query_cpus) { |
| /* Protected by rq_lock */ |
| struct group_cpu_time *cpu_time = |
| _group_cpu_time(grp, cpu); |
| sync_window_start(cpu_rq(cpu), cpu_time); |
| } |
| } |
| |
| group_load_in_freq_domain( |
| &cpu_rq(max_busy_cpu)->freq_domain_cpumask, |
| &total_group_load, &total_ngload); |
| aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold); |
| |
| i = 0; |
| for_each_cpu(cpu, query_cpus) { |
| group_load[i] = 0; |
| ngload[i] = 0; |
| |
| if (early_detection[i]) |
| goto skip_early; |
| |
| rq = cpu_rq(cpu); |
| if (aggregate_load) { |
| if (cpu == max_busy_cpu) { |
| group_load[i] = total_group_load; |
| ngload[i] = total_ngload; |
| } |
| } else { |
| _group_load_in_cpu(cpu, &group_load[i], &ngload[i]); |
| } |
| |
| load[i] += group_load[i]; |
| nload[i] += ngload[i]; |
| /* |
| * Scale load in reference to cluster max_possible_freq. |
| * |
| * Note that scale_load_to_cpu() scales load in reference to |
| * the cluster max_freq. |
| */ |
| load[i] = scale_load_to_cpu(load[i], cpu); |
| nload[i] = scale_load_to_cpu(nload[i], cpu); |
| pload[i] = scale_load_to_cpu(pload[i], cpu); |
| skip_early: |
| i++; |
| } |
| |
| for_each_cpu(cpu, query_cpus) |
| raw_spin_unlock(&(cpu_rq(cpu))->lock); |
| local_irq_restore(flags); |
| |
| read_unlock(&related_thread_group_lock); |
| |
| i = 0; |
| for_each_cpu(cpu, query_cpus) { |
| rq = cpu_rq(cpu); |
| |
| if (early_detection[i]) { |
| busy[i].prev_load = div64_u64(sched_ravg_window, |
| NSEC_PER_USEC); |
| busy[i].new_task_load = 0; |
| goto exit_early; |
| } |
| |
| /* |
| * When the load aggregation is controlled by |
| * sched_freq_aggregate_threshold, allow reporting loads |
| * greater than 100% at Fcur to ramp up the frequency |
| * faster. |
| */ |
| if (notifier_sent || (aggregate_load && |
| sched_freq_aggregate_threshold)) { |
| load[i] = scale_load_to_freq(load[i], max_freq[i], |
| cpu_max_possible_freq(cpu)); |
| nload[i] = scale_load_to_freq(nload[i], max_freq[i], |
| cpu_max_possible_freq(cpu)); |
| } else { |
| load[i] = scale_load_to_freq(load[i], max_freq[i], |
| cur_freq[i]); |
| nload[i] = scale_load_to_freq(nload[i], max_freq[i], |
| cur_freq[i]); |
| if (load[i] > window_size) |
| load[i] = window_size; |
| if (nload[i] > window_size) |
| nload[i] = window_size; |
| |
| load[i] = scale_load_to_freq(load[i], cur_freq[i], |
| cpu_max_possible_freq(cpu)); |
| nload[i] = scale_load_to_freq(nload[i], cur_freq[i], |
| cpu_max_possible_freq(cpu)); |
| } |
| pload[i] = scale_load_to_freq(pload[i], max_freq[i], |
| rq->cluster->max_possible_freq); |
| |
| busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC); |
| busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC); |
| busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC); |
| |
| exit_early: |
| trace_sched_get_busy(cpu, busy[i].prev_load, |
| busy[i].new_task_load, |
| busy[i].predicted_load, |
| early_detection[i]); |
| i++; |
| } |
| } |
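| |
| /* |
| * Illustrative scaling example for the non-aggregated path above |
| * (assumes scale_load_to_freq(l, fsrc, fdst) == l * fsrc / fdst, |
| * max_freq == 1500000 KHz, cur_freq == 750000 KHz, |
| * max_possible_freq == 2000000 KHz and a 20ms window): |
| * |
| *   load = 6000000 (6ms of busy time referenced to max_freq) |
| * |
| *   scale to Fcur:           6000000 * 1500000 / 750000  = 12000000 |
| *   clamp at window_size:    12000000 < 20000000, no clamp |
| *   scale to Fmax_possible:  12000000 * 750000 / 2000000 = 4500000 |
| * |
| *   prev_load reported = 4500000 / NSEC_PER_USEC = 4500 (us) |
| */ |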
| |
| void sched_set_io_is_busy(int val) |
| { |
| sched_io_is_busy = val; |
| } |
| |
| int sched_set_window(u64 window_start, unsigned int window_size) |
| { |
| u64 now, cur_jiffies, jiffy_ktime_ns; |
| s64 ws; |
| unsigned long flags; |
| |
| if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW) |
| return -EINVAL; |
| |
| mutex_lock(&policy_mutex); |
| |
| /* |
| * Get a consistent view of ktime, jiffies, and the time |
| * since the last jiffy (based on last_jiffies_update). |
| */ |
| local_irq_save(flags); |
| cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns); |
| local_irq_restore(flags); |
| |
| /* translate window_start from jiffies to nanoseconds */ |
| ws = (window_start - cur_jiffies); /* jiffy difference */ |
| ws *= TICK_NSEC; |
| ws += jiffy_ktime_ns; |
| |
| /* |
| * Roll back calculated window start so that it is in |
| * the past (window stats must have a current window). |
| */ |
| while (ws > now) |
| ws -= (window_size * TICK_NSEC); |
| |
| BUG_ON(sched_ktime_clock() < ws); |
| |
| reset_all_window_stats(ws, window_size); |
| |
| sched_update_freq_max_load(cpu_possible_mask); |
| |
| mutex_unlock(&policy_mutex); |
| |
| return 0; |
| } |
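| |
| /* |
| * Illustrative translation example for sched_set_window() above |
| * (assumes HZ == 100, i.e. TICK_NSEC == 10000000): |
| * |
| *   window_start (jiffies) = 1000, cur_jiffies = 1004, |
| *   jiffy_ktime_ns = 50000000000 |
| * |
| *   ws = (1000 - 1004) * 10000000 + 50000000000 = 49960000000 |
| * |
| *   ws already lies in the past, so the roll-back loop does not run and |
| *   49960000000 becomes the common window start. Had ws landed in the |
| *   future, window_size * TICK_NSEC would be subtracted until it did |
| *   not. |
| */ |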
| |
| void fixup_busy_time(struct task_struct *p, int new_cpu) |
| { |
| struct rq *src_rq = task_rq(p); |
| struct rq *dest_rq = cpu_rq(new_cpu); |
| u64 wallclock; |
| u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; |
| u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; |
| u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; |
| u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; |
| int migrate_type; |
| struct migration_sum_data d; |
| bool new_task; |
| struct related_thread_group *grp; |
| |
| if (!p->on_rq && p->state != TASK_WAKING) |
| return; |
| |
| if (exiting_task(p)) { |
| clear_ed_task(p, src_rq); |
| return; |
| } |
| |
| if (p->state == TASK_WAKING) |
| double_rq_lock(src_rq, dest_rq); |
| |
| if (sched_disable_window_stats) |
| goto done; |
| |
| wallclock = sched_ktime_clock(); |
| |
| update_task_ravg(task_rq(p)->curr, task_rq(p), |
| TASK_UPDATE, |
| wallclock, 0); |
| update_task_ravg(dest_rq->curr, dest_rq, |
| TASK_UPDATE, wallclock, 0); |
| |
| update_task_ravg(p, task_rq(p), TASK_MIGRATE, |
| wallclock, 0); |
| |
| update_task_cpu_cycles(p, new_cpu); |
| |
| new_task = is_new_task(p); |
| /* Protected by rq_lock */ |
| grp = p->grp; |
| if (grp && sched_freq_aggregate) { |
| struct group_cpu_time *cpu_time; |
| |
| migrate_type = GROUP_TO_GROUP; |
| /* Protected by rq_lock */ |
| cpu_time = _group_cpu_time(grp, cpu_of(src_rq)); |
| d.src_rq = NULL; |
| d.src_cpu_time = cpu_time; |
| src_curr_runnable_sum = &cpu_time->curr_runnable_sum; |
| src_prev_runnable_sum = &cpu_time->prev_runnable_sum; |
| src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; |
| src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; |
| |
| /* Protected by rq_lock */ |
| cpu_time = _group_cpu_time(grp, cpu_of(dest_rq)); |
| d.dst_rq = NULL; |
| d.dst_cpu_time = cpu_time; |
| dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; |
| dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; |
| dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; |
| dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; |
| sync_window_start(dest_rq, cpu_time); |
| } else { |
| migrate_type = RQ_TO_RQ; |
| d.src_rq = src_rq; |
| d.src_cpu_time = NULL; |
| d.dst_rq = dest_rq; |
| d.dst_cpu_time = NULL; |
| src_curr_runnable_sum = &src_rq->curr_runnable_sum; |
| src_prev_runnable_sum = &src_rq->prev_runnable_sum; |
| src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum; |
| src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum; |
| |
| dst_curr_runnable_sum = &dest_rq->curr_runnable_sum; |
| dst_prev_runnable_sum = &dest_rq->prev_runnable_sum; |
| dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum; |
| dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum; |
| } |
| |
| if (p->ravg.curr_window) { |
| *src_curr_runnable_sum -= p->ravg.curr_window; |
| *dst_curr_runnable_sum += p->ravg.curr_window; |
| if (new_task) { |
| *src_nt_curr_runnable_sum -= p->ravg.curr_window; |
| *dst_nt_curr_runnable_sum += p->ravg.curr_window; |
| } |
| } |
| |
| if (p->ravg.prev_window) { |
| *src_prev_runnable_sum -= p->ravg.prev_window; |
| *dst_prev_runnable_sum += p->ravg.prev_window; |
| if (new_task) { |
| *src_nt_prev_runnable_sum -= p->ravg.prev_window; |
| *dst_nt_prev_runnable_sum += p->ravg.prev_window; |
| } |
| } |
| |
| if (p == src_rq->ed_task) { |
| src_rq->ed_task = NULL; |
| if (!dest_rq->ed_task) |
| dest_rq->ed_task = p; |
| } |
| |
| trace_sched_migration_update_sum(p, migrate_type, &d); |
| BUG_ON((s64)*src_prev_runnable_sum < 0); |
| BUG_ON((s64)*src_curr_runnable_sum < 0); |
| BUG_ON((s64)*src_nt_prev_runnable_sum < 0); |
| BUG_ON((s64)*src_nt_curr_runnable_sum < 0); |
| |
| done: |
| if (p->state == TASK_WAKING) |
| double_rq_unlock(src_rq, dest_rq); |
| } |
| |
| #define sched_up_down_migrate_auto_update 1 |
| static void check_for_up_down_migrate_update(const struct cpumask *cpus) |
| { |
| int i = cpumask_first(cpus); |
| |
| if (!sched_up_down_migrate_auto_update) |
| return; |
| |
| if (cpu_max_possible_capacity(i) == max_possible_capacity) |
| return; |
| |
| if (cpu_max_possible_freq(i) == cpu_max_freq(i)) |
| up_down_migrate_scale_factor = 1024; |
| else |
| up_down_migrate_scale_factor = (1024 * |
| cpu_max_possible_freq(i)) / cpu_max_freq(i); |
| |
| update_up_down_migrate(); |
| } |
| |
| /* Return cluster which can offer required capacity for group */ |
| static struct sched_cluster * |
| best_cluster(struct related_thread_group *grp, u64 total_demand) |
| { |
| struct sched_cluster *cluster = NULL; |
| |
| for_each_sched_cluster(cluster) { |
| if (group_will_fit(cluster, grp, total_demand)) |
| return cluster; |
| } |
| |
| return NULL; |
| } |
| |
| static void _set_preferred_cluster(struct related_thread_group *grp) |
| { |
| struct task_struct *p; |
| u64 combined_demand = 0; |
| |
| if (!sysctl_sched_enable_colocation) { |
| grp->last_update = sched_ktime_clock(); |
| grp->preferred_cluster = NULL; |
| return; |
| } |
| |
| /* |
| * The wakeup of two or more related tasks could race with each other |
| * and result in multiple calls to _set_preferred_cluster being issued |
| * at the same time. Avoid the overhead of rechecking the preferred |
| * cluster in such cases. |
| */ |
| if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10) |
| return; |
| |
| list_for_each_entry(p, &grp->tasks, grp_list) |
| combined_demand += p->ravg.demand; |
| |
| grp->preferred_cluster = best_cluster(grp, combined_demand); |
| grp->last_update = sched_ktime_clock(); |
| trace_sched_set_preferred_cluster(grp, combined_demand); |
| } |
| |
| void set_preferred_cluster(struct related_thread_group *grp) |
| { |
| raw_spin_lock(&grp->lock); |
| _set_preferred_cluster(grp); |
| raw_spin_unlock(&grp->lock); |
| } |
| |
| #define ADD_TASK 0 |
| #define REM_TASK 1 |
| |
| static inline void free_group_cputime(struct related_thread_group *grp) |
| { |
| free_percpu(grp->cpu_time); |
| } |
| |
| static int alloc_group_cputime(struct related_thread_group *grp) |
| { |
| int i; |
| struct group_cpu_time *cpu_time; |
| int cpu = raw_smp_processor_id(); |
| struct rq *rq = cpu_rq(cpu); |
| u64 window_start = rq->window_start; |
| |
| grp->cpu_time = alloc_percpu(struct group_cpu_time); |
| if (!grp->cpu_time) |
| return -ENOMEM; |
| |
| for_each_possible_cpu(i) { |
| cpu_time = per_cpu_ptr(grp->cpu_time, i); |
| memset(cpu_time, 0, sizeof(struct group_cpu_time)); |
| cpu_time->window_start = window_start; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * A group's window_start may be behind. When moving it forward, flip the |
| * prev/curr counters. When moving forward by more than one window, the |
| * prev counter is set to 0. |
| */ |
| static inline void |
| sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time) |
| { |
| u64 delta; |
| int nr_windows; |
| u64 curr_sum = cpu_time->curr_runnable_sum; |
| u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum; |
| |
| delta = rq->window_start - cpu_time->window_start; |
| if (!delta) |
| return; |
| |
| nr_windows = div64_u64(delta, sched_ravg_window); |
| if (nr_windows > 1) |
| curr_sum = nt_curr_sum = 0; |
| |
| cpu_time->prev_runnable_sum = curr_sum; |
| cpu_time->curr_runnable_sum = 0; |
| |
| cpu_time->nt_prev_runnable_sum = nt_curr_sum; |
| cpu_time->nt_curr_runnable_sum = 0; |
| |
| cpu_time->window_start = rq->window_start; |
| } |
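| |
| /* |
| * Illustrative example for sync_window_start() (assumes a 20ms |
| * sched_ravg_window): |
| * |
| *   rq->window_start - cpu_time->window_start = 20000000 (one window) |
| *     -> the curr sums become the new prev sums, curr sums restart at 0 |
| * |
| *   rq->window_start - cpu_time->window_start = 60000000 (three windows) |
| *     -> nr_windows > 1, so the prev sums are reset to 0 as well |
| */ |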
| |
| /* |
| * Task's cpu usage is accounted in: |
| * rq->curr/prev_runnable_sum, when its ->grp is NULL |
| * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL |
| * |
| * Transfer task's cpu usage between those counters when transitioning between |
| * groups |
| */ |
| static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp, |
| struct task_struct *p, int event) |
| { |
| u64 wallclock; |
| struct group_cpu_time *cpu_time; |
| u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; |
| u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; |
| u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; |
| u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; |
| struct migration_sum_data d; |
| int migrate_type; |
| |
| if (!sched_freq_aggregate) |
| return; |
| |
| wallclock = sched_ktime_clock(); |
| |
| update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); |
| update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); |
| |
| /* cpu_time protected by related_thread_group_lock, grp->lock and rq_lock */ |
| cpu_time = _group_cpu_time(grp, cpu_of(rq)); |
| if (event == ADD_TASK) { |
| sync_window_start(rq, cpu_time); |
| migrate_type = RQ_TO_GROUP; |
| d.src_rq = rq; |
| d.src_cpu_time = NULL; |
| d.dst_rq = NULL; |
| d.dst_cpu_time = cpu_time; |
| src_curr_runnable_sum = &rq->curr_runnable_sum; |
| dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; |
| src_prev_runnable_sum = &rq->prev_runnable_sum; |
| dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; |
| |
| src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; |
| dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; |
| src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; |
| dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; |
| } else { |
| migrate_type = GROUP_TO_RQ; |
| d.src_rq = NULL; |
| d.src_cpu_time = cpu_time; |
| d.dst_rq = rq; |
| d.dst_cpu_time = NULL; |
| |
| /* |
| * In the REM_TASK case, cpu_time->window_start is already |
| * up to date because of the update_task_ravg() we called |
| * above on the moving task. Hence there is no need for |
| * sync_window_start(). |
| */ |
| src_curr_runnable_sum = &cpu_time->curr_runnable_sum; |
| dst_curr_runnable_sum = &rq->curr_runnable_sum; |
| src_prev_runnable_sum = &cpu_time->prev_runnable_sum; |
| dst_prev_runnable_sum = &rq->prev_runnable_sum; |
| |
| src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; |
| dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum; |
| src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; |
| dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum; |
| } |
| |
| *src_curr_runnable_sum -= p->ravg.curr_window; |
| *dst_curr_runnable_sum += p->ravg.curr_window; |
| |
| *src_prev_runnable_sum -= p->ravg.prev_window; |
| *dst_prev_runnable_sum += p->ravg.prev_window; |
| |
| if (is_new_task(p)) { |
| *src_nt_curr_runnable_sum -= p->ravg.curr_window; |
| *dst_nt_curr_runnable_sum += p->ravg.curr_window; |
| *src_nt_prev_runnable_sum -= p->ravg.prev_window; |
| *dst_nt_prev_runnable_sum += p->ravg.prev_window; |
| } |
| |
| trace_sched_migration_update_sum(p, migrate_type, &d); |
| |
| BUG_ON((s64)*src_curr_runnable_sum < 0); |
| BUG_ON((s64)*src_prev_runnable_sum < 0); |
| } |
| |
| static inline struct group_cpu_time * |
| task_group_cpu_time(struct task_struct *p, int cpu) |
| { |
| return _group_cpu_time(rcu_dereference(p->grp), cpu); |
| } |
| |
| static inline struct group_cpu_time * |
| _group_cpu_time(struct related_thread_group *grp, int cpu) |
| { |
| return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL; |
| } |
| |
| struct related_thread_group *alloc_related_thread_group(int group_id) |
| { |
| struct related_thread_group *grp; |
| |
| grp = kzalloc(sizeof(*grp), GFP_KERNEL); |
| if (!grp) |
| return ERR_PTR(-ENOMEM); |
| |
| if (alloc_group_cputime(grp)) { |
| kfree(grp); |
| return ERR_PTR(-ENOMEM); |
| } |
| |
| grp->id = group_id; |
| INIT_LIST_HEAD(&grp->tasks); |
| INIT_LIST_HEAD(&grp->list); |
| raw_spin_lock_init(&grp->lock); |
| |
| return grp; |
| } |
| |
| struct related_thread_group *lookup_related_thread_group(unsigned int group_id) |
| { |
| struct related_thread_group *grp; |
| |
| list_for_each_entry(grp, &related_thread_groups, list) { |
| if (grp->id == group_id) |
| return grp; |
| } |
| |
| return NULL; |
| } |
| |
| /* See comments before preferred_cluster() */ |
| static void free_related_thread_group(struct rcu_head *rcu) |
| { |
| struct related_thread_group *grp = container_of(rcu, struct |
| related_thread_group, rcu); |
| |
| free_group_cputime(grp); |
| kfree(grp); |
| } |
| |
| static void remove_task_from_group(struct task_struct *p) |
| { |
| struct related_thread_group *grp = p->grp; |
| struct rq *rq; |
| int empty_group = 1; |
| struct rq_flags rf; |
| |
| raw_spin_lock(&grp->lock); |
| |
| rq = __task_rq_lock(p, &rf); |
| transfer_busy_time(rq, p->grp, p, REM_TASK); |
| list_del_init(&p->grp_list); |
| rcu_assign_pointer(p->grp, NULL); |
| __task_rq_unlock(rq, &rf); |
| |
| if (!list_empty(&grp->tasks)) { |
| empty_group = 0; |
| _set_preferred_cluster(grp); |
| } |
| |
| raw_spin_unlock(&grp->lock); |
| |
| if (empty_group) { |
| list_del(&grp->list); |
| call_rcu(&grp->rcu, free_related_thread_group); |
| } |
| } |
| |
| static int |
| add_task_to_group(struct task_struct *p, struct related_thread_group *grp) |
| { |
| struct rq *rq; |
| struct rq_flags rf; |
| |
| raw_spin_lock(&grp->lock); |
| |
| /* |
| * Change p->grp under rq->lock. This prevents races with read-side |
| * references to p->grp in various hot paths. |
| */ |
| rq = __task_rq_lock(p, &rf); |
| transfer_busy_time(rq, grp, p, ADD_TASK); |
| list_add(&p->grp_list, &grp->tasks); |
| rcu_assign_pointer(p->grp, grp); |
| __task_rq_unlock(rq, &rf); |
| |
| _set_preferred_cluster(grp); |
| |
| raw_spin_unlock(&grp->lock); |
| |
| return 0; |
| } |
| |
| void add_new_task_to_grp(struct task_struct *new) |
| { |
| unsigned long flags; |
| struct related_thread_group *grp; |
| struct task_struct *parent; |
| |
| if (!sysctl_sched_enable_thread_grouping) |
| return; |
| |
| if (thread_group_leader(new)) |
| return; |
| |
| parent = new->group_leader; |
| |
| /* |
| * The parent's pi_lock is required here to protect against a |
| * race with the parent task being removed from the |
| * group. |
| */ |
| raw_spin_lock_irqsave(&parent->pi_lock, flags); |
| |
| /* protected by pi_lock. */ |
| grp = task_related_thread_group(parent); |
| if (!grp) { |
| raw_spin_unlock_irqrestore(&parent->pi_lock, flags); |
| return; |
| } |
| raw_spin_lock(&grp->lock); |
| |
| rcu_assign_pointer(new->grp, grp); |
| list_add(&new->grp_list, &grp->tasks); |
| |
| raw_spin_unlock(&grp->lock); |
| raw_spin_unlock_irqrestore(&parent->pi_lock, flags); |
| } |
| |
| int sched_set_group_id(struct task_struct *p, unsigned int group_id) |
| { |
| int rc = 0, destroy = 0; |
| unsigned long flags; |
| struct related_thread_group *grp = NULL, *new = NULL; |
| |
| redo: |
| raw_spin_lock_irqsave(&p->pi_lock, flags); |
| |
| if ((current != p && p->flags & PF_EXITING) || |
| (!p->grp && !group_id) || |
| (p->grp && p->grp->id == group_id)) |
| goto done; |
| |
| write_lock(&related_thread_group_lock); |
| |
| if (!group_id) { |
| remove_task_from_group(p); |
| write_unlock(&related_thread_group_lock); |
| goto done; |
| } |
| |
| if (p->grp && p->grp->id != group_id) |
| remove_task_from_group(p); |
| |
| grp = lookup_related_thread_group(group_id); |
| if (!grp && !new) { |
| /* New group */ |
| write_unlock(&related_thread_group_lock); |
| raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| new = alloc_related_thread_group(group_id); |
| if (IS_ERR(new)) |
| return -ENOMEM; |
| destroy = 1; |
| /* Rerun checks (like task exiting), since we dropped pi_lock */ |
| goto redo; |
| } else if (!grp && new) { |
| /* New group - use object allocated before */ |
| destroy = 0; |
| list_add(&new->list, &related_thread_groups); |
| grp = new; |
| } |
| |
| BUG_ON(!grp); |
| rc = add_task_to_group(p, grp); |
| write_unlock(&related_thread_group_lock); |
| done: |
| raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| |
| if (new && destroy) { |
| free_group_cputime(new); |
| kfree(new); |
| } |
| |
| return rc; |
| } |
| |
| unsigned int sched_get_group_id(struct task_struct *p) |
| { |
| unsigned int group_id; |
| struct related_thread_group *grp; |
| |
| rcu_read_lock(); |
| grp = task_related_thread_group(p); |
| group_id = grp ? grp->id : 0; |
| rcu_read_unlock(); |
| |
| return group_id; |
| } |
| |
| static void update_cpu_cluster_capacity(const cpumask_t *cpus) |
| { |
| int i; |
| struct sched_cluster *cluster; |
| struct cpumask cpumask; |
| |
| cpumask_copy(&cpumask, cpus); |
| pre_big_task_count_change(cpu_possible_mask); |
| |
| for_each_cpu(i, &cpumask) { |
| cluster = cpu_rq(i)->cluster; |
| cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); |
| |
| cluster->capacity = compute_capacity(cluster); |
| cluster->load_scale_factor = compute_load_scale_factor(cluster); |
| |
| /* 'cpus' can contain cpus from more than one cluster */ |
| check_for_up_down_migrate_update(&cluster->cpus); |
| } |
| |
| __update_min_max_capacity(); |
| |
| post_big_task_count_change(cpu_possible_mask); |
| } |
| |
| static DEFINE_SPINLOCK(cpu_freq_min_max_lock); |
| void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax) |
| { |
| struct cpumask cpumask; |
| struct sched_cluster *cluster; |
| int i, update_capacity = 0; |
| unsigned long flags; |
| |
| spin_lock_irqsave(&cpu_freq_min_max_lock, flags); |
| cpumask_copy(&cpumask, cpus); |
| for_each_cpu(i, &cpumask) { |
| cluster = cpu_rq(i)->cluster; |
| cpumask_andnot(&cpumask, &cpumask, &cluster->cpus); |
| |
| update_capacity += (cluster->max_mitigated_freq != fmax); |
| cluster->max_mitigated_freq = fmax; |
| } |
| spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags); |
| |
| if (update_capacity) |
| update_cpu_cluster_capacity(cpus); |
| } |
| |
| static int cpufreq_notifier_policy(struct notifier_block *nb, |
| unsigned long val, void *data) |
| { |
| struct cpufreq_policy *policy = (struct cpufreq_policy *)data; |
| struct sched_cluster *cluster = NULL; |
| struct cpumask policy_cluster = *policy->related_cpus; |
| unsigned int orig_max_freq = 0; |
| int i, j, update_capacity = 0; |
| |
| if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && |
| val != CPUFREQ_CREATE_POLICY) |
| return 0; |
| |
| if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { |
| update_min_max_capacity(); |
| return 0; |
| } |
| |
| max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); |
| if (min_max_freq == 1) |
| min_max_freq = UINT_MAX; |
| min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); |
| BUG_ON(!min_max_freq); |
| BUG_ON(!policy->max); |
| |
| for_each_cpu(i, &policy_cluster) { |
| cluster = cpu_rq(i)->cluster; |
| cpumask_andnot(&policy_cluster, &policy_cluster, |
| &cluster->cpus); |
| |
| orig_max_freq = cluster->max_freq; |
| cluster->min_freq = policy->min; |
| cluster->max_freq = policy->max; |
| cluster->cur_freq = policy->cur; |
| |
| if (!cluster->freq_init_done) { |
| mutex_lock(&cluster_lock); |
| for_each_cpu(j, &cluster->cpus) |
| cpumask_copy(&cpu_rq(j)->freq_domain_cpumask, |
| policy->related_cpus); |
| cluster->max_possible_freq = policy->cpuinfo.max_freq; |
| cluster->max_possible_capacity = |
| compute_max_possible_capacity(cluster); |
| cluster->freq_init_done = true; |
| |
| sort_clusters(); |
| update_all_clusters_stats(); |
| mutex_unlock(&cluster_lock); |
| continue; |
| } |
| |
| update_capacity += (orig_max_freq != cluster->max_freq); |
| } |
| |
| if (update_capacity) |
| update_cpu_cluster_capacity(policy->related_cpus); |
| |
| return 0; |
| } |
| |
| static int cpufreq_notifier_trans(struct notifier_block *nb, |
| unsigned long val, void *data) |
| { |
| struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; |
| unsigned int cpu = freq->cpu, new_freq = freq->new; |
| unsigned long flags; |
| struct sched_cluster *cluster; |
| struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask; |
| int i, j; |
| |
| if (val != CPUFREQ_POSTCHANGE) |
| return 0; |
| |
| BUG_ON(!new_freq); |
| |
| if (cpu_cur_freq(cpu) == new_freq) |
| return 0; |
| |
| for_each_cpu(i, &policy_cpus) { |
| cluster = cpu_rq(i)->cluster; |
| |
| for_each_cpu(j, &cluster->cpus) { |
| struct rq *rq = cpu_rq(j); |
| |
| raw_spin_lock_irqsave(&rq->lock, flags); |
| update_task_ravg(rq->curr, rq, TASK_UPDATE, |
| sched_ktime_clock(), 0); |
| raw_spin_unlock_irqrestore(&rq->lock, flags); |
| } |
| |
| cluster->cur_freq = new_freq; |
| cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); |
| } |
| |
| return 0; |
| } |
| |
| static int pwr_stats_ready_notifier(struct notifier_block *nb, |
| unsigned long cpu, void *data) |
| { |
| cpumask_t mask = CPU_MASK_NONE; |
| |
| cpumask_set_cpu(cpu, &mask); |
| sched_update_freq_max_load(&mask); |
| |
| mutex_lock(&cluster_lock); |
| sort_clusters(); |
| mutex_unlock(&cluster_lock); |
| |
| return 0; |
| } |
| |
| static struct notifier_block notifier_policy_block = { |
| .notifier_call = cpufreq_notifier_policy |
| }; |
| |
| static struct notifier_block notifier_trans_block = { |
| .notifier_call = cpufreq_notifier_trans |
| }; |
| |
| static struct notifier_block notifier_pwr_stats_ready = { |
| .notifier_call = pwr_stats_ready_notifier |
| }; |
| |
| int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb) |
| { |
| return -EINVAL; |
| } |
| |
| static int register_sched_callback(void) |
| { |
| int ret; |
| |
| ret = cpufreq_register_notifier(¬ifier_policy_block, |
| CPUFREQ_POLICY_NOTIFIER); |
| |
| if (!ret) |
| ret = cpufreq_register_notifier(¬ifier_trans_block, |
| CPUFREQ_TRANSITION_NOTIFIER); |
| |
| register_cpu_pwr_stats_ready_notifier(¬ifier_pwr_stats_ready); |
| |
| return 0; |
| } |
| |
| /* |
| * cpufreq callbacks can be registered at core_initcall or later time. |
| * Any registration done prior to that is "forgotten" by cpufreq. See |
| * initialization of variable init_cpufreq_transition_notifier_list_called |
| * for further information. |
| */ |
| core_initcall(register_sched_callback); |
| |
| int update_preferred_cluster(struct related_thread_group *grp, |
| struct task_struct *p, u32 old_load) |
| { |
| u32 new_load = task_load(p); |
| |
| if (!grp) |
| return 0; |
| |
| /* |
| * Update if the task's load has changed significantly or a complete |
| * window has passed since we last updated the preference. |
| */ |
| if (abs(new_load - old_load) > sched_ravg_window / 4 || |
| sched_ktime_clock() - grp->last_update > sched_ravg_window) |
| return 1; |
| |
| return 0; |
| } |
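| |
| /* |
| * Illustrative example (assumes a 20ms window, so the load threshold is |
| * 20000000 / 4 = 5000000): with old_load == 8000000 and |
| * new_load == 14000000 the load has changed by 6000000 > 5000000, so a |
| * preferred-cluster re-evaluation is requested; it is also requested |
| * once a full window has passed since grp->last_update. |
| */ |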
| |
| bool early_detection_notify(struct rq *rq, u64 wallclock) |
| { |
| struct task_struct *p; |
| int loop_max = 10; |
| |
| if (!sched_boost() || !rq->cfs.h_nr_running) |
| return 0; |
| |
| rq->ed_task = NULL; |
| list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { |
| if (!loop_max) |
| break; |
| |
| if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) { |
| rq->ed_task = p; |
| return 1; |
| } |
| |
| loop_max--; |
| } |
| |
| return 0; |
| } |
| |
| #ifdef CONFIG_CGROUP_SCHED |
| u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css, |
| struct cftype *cft) |
| { |
| struct task_group *tg = css_tg(css); |
| |
| return tg->upmigrate_discouraged; |
| } |
| |
| int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css, |
| struct cftype *cft, u64 upmigrate_discourage) |
| { |
| struct task_group *tg = css_tg(css); |
| int discourage = upmigrate_discourage > 0; |
| |
| if (tg->upmigrate_discouraged == discourage) |
| return 0; |
| |
| /* |
| * Revisit big-task classification for tasks of this cgroup. It would |
| * have been efficient to walk tasks of just this cgroup in running |
| * state, but we don't have an easy means to do that. Walk all tasks in |
| * running state on all cpus instead and re-visit their big task |
| * classification. |
| */ |
| get_online_cpus(); |
| pre_big_task_count_change(cpu_online_mask); |
| |
| tg->upmigrate_discouraged = discourage; |
| |
| post_big_task_count_change(cpu_online_mask); |
| put_online_cpus(); |
| |
| return 0; |
| } |
| #endif /* CONFIG_CGROUP_SCHED */ |