Merge branch 'thermal-soc' into next
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index ef473dc..bb9a0a5 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -582,3 +582,24 @@
This function serves as an arbitrator to set the state of a cooling
device. It sets the cooling device to the deepest cooling state if
possible.
+
+6. thermal_emergency_poweroff:
+
+On an event of critical trip temperature crossing. Thermal framework
+allows the system to shutdown gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of seconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If set to 0 emergency poweroff will not be supported. So a carefully
+profiled non-zero positive value is a must for emergerncy poweroff to be
+triggered.
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index f786ae4..4edc011 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -15,6 +15,23 @@
if THERMAL
+config THERMAL_EMERGENCY_POWEROFF_DELAY_MS
+ int "Emergency poweroff delay in milli-seconds"
+ depends on THERMAL
+ default 0
+ help
+ Thermal subsystem will issue a graceful shutdown when
+ critical temperatures are reached using orderly_poweroff(). In
+ case of failure of an orderly_poweroff(), the thermal emergency
+ poweroff kicks in after a delay has elapsed and shuts down the system.
+ This config is number of milliseconds to delay before emergency
+ poweroff kicks in. Similarly to the critical trip point,
+ the delay should be carefully profiled so as to give adequate
+ time for orderly_poweroff() to finish on regular execution.
+ If set to 0 emergency poweroff will not be supported.
+
+ In doubt, leave as 0.
+
config THERMAL_HWMON
bool
prompt "Expose thermal sensors as hwmon device"
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 4bf4ad5..ef59256 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -28,6 +28,8 @@
#include <trace/events/thermal.h>
+#define SCALE_ERROR_MITIGATION 100
+
static DEFINE_IDA(devfreq_ida);
/**
@@ -45,6 +47,12 @@ static DEFINE_IDA(devfreq_ida);
* @freq_table_size: Size of the @freq_table and @power_table
* @power_ops: Pointer to devfreq_cooling_power, used to generate the
* @power_table.
+ * @res_util: Resource utilization scaling factor for the power.
+ * It is multiplied by 100 to minimize the error. It is used
+ * for estimation of the power budget instead of using
+ * 'utilization' (which is 'busy_time / 'total_time').
+ * The 'res_util' range is from 100 to (power_table[state] * 100)
+ * for the corresponding 'state'.
*/
struct devfreq_cooling_device {
int id;
@@ -55,6 +63,8 @@ struct devfreq_cooling_device {
u32 *freq_table;
size_t freq_table_size;
struct devfreq_cooling_power *power_ops;
+ u32 res_util;
+ int capped_state;
};
/**
@@ -164,27 +174,12 @@ freq_get_state(struct devfreq_cooling_device *dfc, unsigned long freq)
return THERMAL_CSTATE_INVALID;
}
-/**
- * get_static_power() - calculate the static power
- * @dfc: Pointer to devfreq cooling device
- * @freq: Frequency in Hz
- *
- * Calculate the static power in milliwatts using the supplied
- * get_static_power(). The current voltage is calculated using the
- * OPP library. If no get_static_power() was supplied, assume the
- * static power is negligible.
- */
-static unsigned long
-get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
+static unsigned long get_voltage(struct devfreq *df, unsigned long freq)
{
- struct devfreq *df = dfc->devfreq;
struct device *dev = df->dev.parent;
unsigned long voltage;
struct dev_pm_opp *opp;
- if (!dfc->power_ops->get_static_power)
- return 0;
-
opp = dev_pm_opp_find_freq_exact(dev, freq, true);
if (PTR_ERR(opp) == -ERANGE)
opp = dev_pm_opp_find_freq_exact(dev, freq, false);
@@ -202,9 +197,35 @@ get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
dev_err_ratelimited(dev,
"Failed to get voltage for frequency %lu\n",
freq);
- return 0;
}
+ return voltage;
+}
+
+/**
+ * get_static_power() - calculate the static power
+ * @dfc: Pointer to devfreq cooling device
+ * @freq: Frequency in Hz
+ *
+ * Calculate the static power in milliwatts using the supplied
+ * get_static_power(). The current voltage is calculated using the
+ * OPP library. If no get_static_power() was supplied, assume the
+ * static power is negligible.
+ */
+static unsigned long
+get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
+{
+ struct devfreq *df = dfc->devfreq;
+ unsigned long voltage;
+
+ if (!dfc->power_ops->get_static_power)
+ return 0;
+
+ voltage = get_voltage(df, freq);
+
+ if (voltage == 0)
+ return 0;
+
return dfc->power_ops->get_static_power(df, voltage);
}
@@ -239,6 +260,16 @@ get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq,
return power;
}
+
+static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,
+ unsigned long freq,
+ unsigned long voltage)
+{
+ return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq,
+ voltage);
+}
+
+
static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,
struct thermal_zone_device *tz,
u32 *power)
@@ -248,27 +279,55 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
struct devfreq_dev_status *status = &df->last_status;
unsigned long state;
unsigned long freq = status->current_frequency;
- u32 dyn_power, static_power;
+ unsigned long voltage;
+ u32 dyn_power = 0;
+ u32 static_power = 0;
+ int res;
- /* Get dynamic power for state */
state = freq_get_state(dfc, freq);
- if (state == THERMAL_CSTATE_INVALID)
- return -EAGAIN;
+ if (state == THERMAL_CSTATE_INVALID) {
+ res = -EAGAIN;
+ goto fail;
+ }
- dyn_power = dfc->power_table[state];
+ if (dfc->power_ops->get_real_power) {
+ voltage = get_voltage(df, freq);
+ if (voltage == 0) {
+ res = -EINVAL;
+ goto fail;
+ }
- /* Scale dynamic power for utilization */
- dyn_power = (dyn_power * status->busy_time) / status->total_time;
+ res = dfc->power_ops->get_real_power(df, power, freq, voltage);
+ if (!res) {
+ state = dfc->capped_state;
+ dfc->res_util = dfc->power_table[state];
+ dfc->res_util *= SCALE_ERROR_MITIGATION;
- /* Get static power */
- static_power = get_static_power(dfc, freq);
+ if (*power > 1)
+ dfc->res_util /= *power;
+ } else {
+ goto fail;
+ }
+ } else {
+ dyn_power = dfc->power_table[state];
+
+ /* Scale dynamic power for utilization */
+ dyn_power *= status->busy_time;
+ dyn_power /= status->total_time;
+ /* Get static power */
+ static_power = get_static_power(dfc, freq);
+
+ *power = dyn_power + static_power;
+ }
trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power,
- static_power);
-
- *power = dyn_power + static_power;
+ static_power, *power);
return 0;
+fail:
+ /* It is safe to set max in this case */
+ dfc->res_util = SCALE_ERROR_MITIGATION;
+ return res;
}
static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
@@ -301,26 +360,34 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
unsigned long busy_time;
s32 dyn_power;
u32 static_power;
+ s32 est_power;
int i;
- static_power = get_static_power(dfc, freq);
+ if (dfc->power_ops->get_real_power) {
+ /* Scale for resource utilization */
+ est_power = power * dfc->res_util;
+ est_power /= SCALE_ERROR_MITIGATION;
+ } else {
+ static_power = get_static_power(dfc, freq);
- dyn_power = power - static_power;
- dyn_power = dyn_power > 0 ? dyn_power : 0;
+ dyn_power = power - static_power;
+ dyn_power = dyn_power > 0 ? dyn_power : 0;
- /* Scale dynamic power for utilization */
- busy_time = status->busy_time ?: 1;
- dyn_power = (dyn_power * status->total_time) / busy_time;
+ /* Scale dynamic power for utilization */
+ busy_time = status->busy_time ?: 1;
+ est_power = (dyn_power * status->total_time) / busy_time;
+ }
/*
* Find the first cooling state that is within the power
* budget for dynamic power.
*/
for (i = 0; i < dfc->freq_table_size - 1; i++)
- if (dyn_power >= dfc->power_table[i])
+ if (est_power >= dfc->power_table[i])
break;
*state = i;
+ dfc->capped_state = i;
trace_thermal_power_devfreq_limit(cdev, freq, *state, power);
return 0;
}
@@ -376,7 +443,7 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc)
}
for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) {
- unsigned long power_dyn, voltage;
+ unsigned long power, voltage;
struct dev_pm_opp *opp;
opp = dev_pm_opp_find_freq_floor(dev, &freq);
@@ -389,12 +456,15 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc)
dev_pm_opp_put(opp);
if (dfc->power_ops) {
- power_dyn = get_dynamic_power(dfc, freq, voltage);
+ if (dfc->power_ops->get_real_power)
+ power = get_total_power(dfc, freq, voltage);
+ else
+ power = get_dynamic_power(dfc, freq, voltage);
- dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n",
- freq / 1000000, voltage, power_dyn, power_dyn);
+ dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n",
+ freq / 1000000, voltage, power, power);
- power_table[i] = power_dyn;
+ power_table[i] = power;
}
freq_table[i] = freq;
diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c
index b2bbaa1..c27868b 100644
--- a/drivers/thermal/intel_soc_dts_thermal.c
+++ b/drivers/thermal/intel_soc_dts_thermal.c
@@ -73,8 +73,12 @@ static int __init intel_soc_thermal_init(void)
IRQF_TRIGGER_RISING | IRQF_ONESHOT,
"soc_dts", soc_dts);
if (err) {
- pr_err("request_threaded_irq ret %d\n", err);
- goto error_irq;
+ /*
+ * Do not just error out because the user space thermal
+ * daemon such as DPTF may use polling instead of being
+ * interrupt driven.
+ */
+ pr_warn("request_threaded_irq ret %d\n", err);
}
}
@@ -88,7 +92,6 @@ static int __init intel_soc_thermal_init(void)
error_trips:
if (soc_dts_thres_irq)
free_irq(soc_dts_thres_irq, soc_dts);
-error_irq:
intel_soc_dts_iosf_exit(soc_dts);
return err;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 11f0675..b21b9cc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -45,8 +45,10 @@ static LIST_HEAD(thermal_governor_list);
static DEFINE_MUTEX(thermal_list_lock);
static DEFINE_MUTEX(thermal_governor_lock);
+static DEFINE_MUTEX(poweroff_lock);
static atomic_t in_suspend;
+static bool power_off_triggered;
static struct thermal_governor *def_governor;
@@ -322,6 +324,54 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
def_governor->throttle(tz, trip);
}
+/**
+ * thermal_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void thermal_emergency_poweroff_func(struct work_struct *work)
+{
+ /*
+ * We have reached here after the emergency thermal shutdown
+ * Waiting period has expired. This means orderly_poweroff has
+ * not been able to shut off the system for some reason.
+ * Try to shut down the system immediately using kernel_power_off
+ * if populated
+ */
+ WARN(1, "Attempting kernel_power_off: Temperature too high\n");
+ kernel_power_off();
+
+ /*
+ * Worst of the worst case trigger emergency restart
+ */
+ WARN(1, "Attempting emergency_restart: Temperature too high\n");
+ emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work,
+ thermal_emergency_poweroff_func);
+
+/**
+ * thermal_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a known period of time. By default this is not scheduled.
+ */
+void thermal_emergency_poweroff(void)
+{
+ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
+ /*
+ * poweroff_delay_ms must be a carefully profiled positive value.
+ * Its a must for thermal_emergency_poweroff_work to be scheduled
+ */
+ if (poweroff_delay_ms <= 0)
+ return;
+ schedule_delayed_work(&thermal_emergency_poweroff_work,
+ msecs_to_jiffies(poweroff_delay_ms));
+}
+
static void handle_critical_trips(struct thermal_zone_device *tz,
int trip, enum thermal_trip_type trip_type)
{
@@ -342,7 +392,17 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
dev_emerg(&tz->device,
"critical temperature reached(%d C),shutting down\n",
tz->temperature / 1000);
- orderly_poweroff(true);
+ mutex_lock(&poweroff_lock);
+ if (!power_off_triggered) {
+ /*
+ * Queue a backup emergency shutdown in the event of
+ * orderly_poweroff failure
+ */
+ thermal_emergency_poweroff();
+ orderly_poweroff(true);
+ power_off_triggered = true;
+ }
+ mutex_unlock(&poweroff_lock);
}
}
@@ -1463,6 +1523,7 @@ static int __init thermal_init(void)
{
int result;
+ mutex_init(&poweroff_lock);
result = thermal_register_governors();
if (result)
goto error;
@@ -1497,6 +1558,7 @@ static int __init thermal_init(void)
ida_destroy(&thermal_cdev_ida);
mutex_destroy(&thermal_list_lock);
mutex_destroy(&thermal_governor_lock);
+ mutex_destroy(&poweroff_lock);
return result;
}
diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h
index c35d0c0..4635f95 100644
--- a/include/linux/devfreq_cooling.h
+++ b/include/linux/devfreq_cooling.h
@@ -34,6 +34,23 @@
* If get_dynamic_power() is NULL, then the
* dynamic power is calculated as
* @dyn_power_coeff * frequency * voltage^2
+ * @get_real_power: When this is set, the framework uses it to ask the
+ * device driver for the actual power.
+ * Some devices have more sophisticated methods
+ * (like power counters) to approximate the actual power
+ * that they use.
+ * This function provides more accurate data to the
+ * thermal governor. When the driver does not provide
+ * such function, framework just uses pre-calculated
+ * table and scale the power by 'utilization'
+ * (based on 'busy_time' and 'total_time' taken from
+ * devfreq 'last_status').
+ * The value returned by this function must be lower
+ * or equal than the maximum power value
+ * for the current state
+ * (which can be found in power_table[state]).
+ * When this interface is used, the power_table holds
+ * max total (static + dynamic) power value for each OPP.
*/
struct devfreq_cooling_power {
unsigned long (*get_static_power)(struct devfreq *devfreq,
@@ -41,6 +58,8 @@ struct devfreq_cooling_power {
unsigned long (*get_dynamic_power)(struct devfreq *devfreq,
unsigned long freq,
unsigned long voltage);
+ int (*get_real_power)(struct devfreq *df, u32 *power,
+ unsigned long freq, unsigned long voltage);
unsigned long dyn_power_coeff;
};
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 2b4a8ff..6cde5b3 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -151,9 +151,9 @@ TRACE_EVENT(thermal_power_cpu_limit,
TRACE_EVENT(thermal_power_devfreq_get_power,
TP_PROTO(struct thermal_cooling_device *cdev,
struct devfreq_dev_status *status, unsigned long freq,
- u32 dynamic_power, u32 static_power),
+ u32 dynamic_power, u32 static_power, u32 power),
- TP_ARGS(cdev, status, freq, dynamic_power, static_power),
+ TP_ARGS(cdev, status, freq, dynamic_power, static_power, power),
TP_STRUCT__entry(
__string(type, cdev->type )
@@ -161,6 +161,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power,
__field(u32, load )
__field(u32, dynamic_power )
__field(u32, static_power )
+ __field(u32, power)
),
TP_fast_assign(
@@ -169,11 +170,13 @@ TRACE_EVENT(thermal_power_devfreq_get_power,
__entry->load = (100 * status->busy_time) / status->total_time;
__entry->dynamic_power = dynamic_power;
__entry->static_power = static_power;
+ __entry->power = power;
),
- TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u",
+ TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u power=%u",
__get_str(type), __entry->freq,
- __entry->load, __entry->dynamic_power, __entry->static_power)
+ __entry->load, __entry->dynamic_power, __entry->static_power,
+ __entry->power)
);
TRACE_EVENT(thermal_power_devfreq_limit,