Merge branch 'thermal-soc' into next
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index ef473dc..bb9a0a5 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -582,3 +582,24 @@
 This function serves as an arbitrator to set the state of a cooling
 device. It sets the cooling device to the deepest cooling state if
 possible.
+
+6. thermal_emergency_poweroff:
+
+On an event of critical trip temperature crossing, the thermal framework
+allows the system to shut down gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of milliseconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If the delay is set to 0, emergency poweroff will not be supported. So a
+carefully profiled non-zero positive value is a must for emergency
+poweroff to be triggered.
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index f786ae4..4edc011 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -15,6 +15,23 @@
 
 if THERMAL
 
+config THERMAL_EMERGENCY_POWEROFF_DELAY_MS
+	int "Emergency poweroff delay in milli-seconds"
+	depends on THERMAL
+	default 0
+	help
+	  Thermal subsystem will issue a graceful shutdown when
+	  critical temperatures are reached using orderly_poweroff(). In
+	  case of failure of an orderly_poweroff(), the thermal emergency
+	  poweroff kicks in after a delay has elapsed and shuts down the system.
+	  This config is the number of milliseconds to delay before emergency
+	  poweroff kicks in. Similarly to the critical trip point,
+	  the delay should be carefully profiled so as to give adequate
+	  time for orderly_poweroff() to finish on regular execution.
+	  If set to 0 emergency poweroff will not be supported.
+
+	  If in doubt, leave as 0.
+
 config THERMAL_HWMON
 	bool
 	prompt "Expose thermal sensors as hwmon device"
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 4bf4ad5..ef59256 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -28,6 +28,8 @@
 
 #include <trace/events/thermal.h>
 
+#define SCALE_ERROR_MITIGATION 100
+
 static DEFINE_IDA(devfreq_ida);
 
 /**
@@ -45,6 +47,12 @@ static DEFINE_IDA(devfreq_ida);
  * @freq_table_size:	Size of the @freq_table and @power_table
  * @power_ops:	Pointer to devfreq_cooling_power, used to generate the
  *		@power_table.
+ * @res_util:	Resource utilization scaling factor for the power.
+ *		It is multiplied by 100 to minimize the error. It is used
+ *		for estimation of the power budget instead of using
+ *		'utilization' (which is 'busy_time' / 'total_time').
+ *		The 'res_util' range is from 100 to (power_table[state] * 100)
+ *		for the corresponding 'state'.
  */
 struct devfreq_cooling_device {
 	int id;
@@ -55,6 +63,8 @@ struct devfreq_cooling_device {
 	u32 *freq_table;
 	size_t freq_table_size;
 	struct devfreq_cooling_power *power_ops;
+	u32 res_util;
+	int capped_state;
 };
 
 /**
@@ -164,27 +174,12 @@ freq_get_state(struct devfreq_cooling_device *dfc, unsigned long freq)
 	return THERMAL_CSTATE_INVALID;
 }
 
-/**
- * get_static_power() - calculate the static power
- * @dfc:	Pointer to devfreq cooling device
- * @freq:	Frequency in Hz
- *
- * Calculate the static power in milliwatts using the supplied
- * get_static_power().  The current voltage is calculated using the
- * OPP library.  If no get_static_power() was supplied, assume the
- * static power is negligible.
- */
-static unsigned long
-get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
+static unsigned long get_voltage(struct devfreq *df, unsigned long freq)
 {
-	struct devfreq *df = dfc->devfreq;
 	struct device *dev = df->dev.parent;
 	unsigned long voltage;
 	struct dev_pm_opp *opp;
 
-	if (!dfc->power_ops->get_static_power)
-		return 0;
-
 	opp = dev_pm_opp_find_freq_exact(dev, freq, true);
 	if (PTR_ERR(opp) == -ERANGE)
 		opp = dev_pm_opp_find_freq_exact(dev, freq, false);
@@ -202,9 +197,35 @@ get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
 		dev_err_ratelimited(dev,
 				    "Failed to get voltage for frequency %lu\n",
 				    freq);
-		return 0;
 	}
 
+	return voltage;
+}
+
+/**
+ * get_static_power() - calculate the static power
+ * @dfc:	Pointer to devfreq cooling device
+ * @freq:	Frequency in Hz
+ *
+ * Calculate the static power in milliwatts using the supplied
+ * get_static_power().  The current voltage is calculated using the
+ * OPP library.  If no get_static_power() was supplied, assume the
+ * static power is negligible.
+ */
+static unsigned long
+get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq)
+{
+	struct devfreq *df = dfc->devfreq;
+	unsigned long voltage;
+
+	if (!dfc->power_ops->get_static_power)
+		return 0;
+
+	voltage = get_voltage(df, freq);
+
+	if (voltage == 0)
+		return 0;
+
 	return dfc->power_ops->get_static_power(df, voltage);
 }
 
@@ -239,6 +260,16 @@ get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq,
 	return power;
 }
 
+
+static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc,
+					    unsigned long freq,
+					    unsigned long voltage)
+{
+	return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq,
+							       voltage);
+}
+
+
 static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev,
 					       struct thermal_zone_device *tz,
 					       u32 *power)
@@ -248,27 +279,55 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 	struct devfreq_dev_status *status = &df->last_status;
 	unsigned long state;
 	unsigned long freq = status->current_frequency;
-	u32 dyn_power, static_power;
+	unsigned long voltage;
+	u32 dyn_power = 0;
+	u32 static_power = 0;
+	int res;
 
-	/* Get dynamic power for state */
 	state = freq_get_state(dfc, freq);
-	if (state == THERMAL_CSTATE_INVALID)
-		return -EAGAIN;
+	if (state == THERMAL_CSTATE_INVALID) {
+		res = -EAGAIN;
+		goto fail;
+	}
 
-	dyn_power = dfc->power_table[state];
+	if (dfc->power_ops->get_real_power) {
+		voltage = get_voltage(df, freq);
+		if (voltage == 0) {
+			res = -EINVAL;
+			goto fail;
+		}
 
-	/* Scale dynamic power for utilization */
-	dyn_power = (dyn_power * status->busy_time) / status->total_time;
+		res = dfc->power_ops->get_real_power(df, power, freq, voltage);
+		if (!res) {
+			state = dfc->capped_state;
+			dfc->res_util = dfc->power_table[state];
+			dfc->res_util *= SCALE_ERROR_MITIGATION;
 
-	/* Get static power */
-	static_power = get_static_power(dfc, freq);
+			if (*power > 1)
+				dfc->res_util /= *power;
+		} else {
+			goto fail;
+		}
+	} else {
+		dyn_power = dfc->power_table[state];
+
+		/* Scale dynamic power for utilization */
+		dyn_power *= status->busy_time;
+		dyn_power /= status->total_time;
+		/* Get static power */
+		static_power = get_static_power(dfc, freq);
+
+		*power = dyn_power + static_power;
+	}
 
 	trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power,
-					      static_power);
-
-	*power = dyn_power + static_power;
+					      static_power, *power);
 
 	return 0;
+fail:
+	/* It is safe to set max in this case */
+	dfc->res_util = SCALE_ERROR_MITIGATION;
+	return res;
 }
 
 static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
@@ -301,26 +360,34 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 	unsigned long busy_time;
 	s32 dyn_power;
 	u32 static_power;
+	s32 est_power;
 	int i;
 
-	static_power = get_static_power(dfc, freq);
+	if (dfc->power_ops->get_real_power) {
+		/* Scale for resource utilization */
+		est_power = power * dfc->res_util;
+		est_power /= SCALE_ERROR_MITIGATION;
+	} else {
+		static_power = get_static_power(dfc, freq);
 
-	dyn_power = power - static_power;
-	dyn_power = dyn_power > 0 ? dyn_power : 0;
+		dyn_power = power - static_power;
+		dyn_power = dyn_power > 0 ? dyn_power : 0;
 
-	/* Scale dynamic power for utilization */
-	busy_time = status->busy_time ?: 1;
-	dyn_power = (dyn_power * status->total_time) / busy_time;
+		/* Scale dynamic power for utilization */
+		busy_time = status->busy_time ?: 1;
+		est_power = (dyn_power * status->total_time) / busy_time;
+	}
 
 	/*
 	 * Find the first cooling state that is within the power
 	 * budget for dynamic power.
 	 */
 	for (i = 0; i < dfc->freq_table_size - 1; i++)
-		if (dyn_power >= dfc->power_table[i])
+		if (est_power >= dfc->power_table[i])
 			break;
 
 	*state = i;
+	dfc->capped_state = i;
 	trace_thermal_power_devfreq_limit(cdev, freq, *state, power);
 	return 0;
 }
@@ -376,7 +443,7 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc)
 	}
 
 	for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) {
-		unsigned long power_dyn, voltage;
+		unsigned long power, voltage;
 		struct dev_pm_opp *opp;
 
 		opp = dev_pm_opp_find_freq_floor(dev, &freq);
@@ -389,12 +456,15 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc)
 		dev_pm_opp_put(opp);
 
 		if (dfc->power_ops) {
-			power_dyn = get_dynamic_power(dfc, freq, voltage);
+			if (dfc->power_ops->get_real_power)
+				power = get_total_power(dfc, freq, voltage);
+			else
+				power = get_dynamic_power(dfc, freq, voltage);
 
-			dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n",
-				freq / 1000000, voltage, power_dyn, power_dyn);
+			dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n",
+				freq / 1000000, voltage, power, power);
 
-			power_table[i] = power_dyn;
+			power_table[i] = power;
 		}
 
 		freq_table[i] = freq;
diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c
index b2bbaa1..c27868b 100644
--- a/drivers/thermal/intel_soc_dts_thermal.c
+++ b/drivers/thermal/intel_soc_dts_thermal.c
@@ -73,8 +73,12 @@ static int __init intel_soc_thermal_init(void)
 					   IRQF_TRIGGER_RISING | IRQF_ONESHOT,
 					   "soc_dts", soc_dts);
 		if (err) {
-			pr_err("request_threaded_irq ret %d\n", err);
-			goto error_irq;
+			/*
+			 * Do not just error out because the user space thermal
+			 * daemon such as DPTF may use polling instead of being
+			 * interrupt driven.
+			 */
+			pr_warn("request_threaded_irq ret %d\n", err);
 		}
 	}
 
@@ -88,7 +92,6 @@ static int __init intel_soc_thermal_init(void)
 error_trips:
 	if (soc_dts_thres_irq)
 		free_irq(soc_dts_thres_irq, soc_dts);
-error_irq:
 	intel_soc_dts_iosf_exit(soc_dts);
 
 	return err;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 11f0675..b21b9cc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -45,8 +45,10 @@ static LIST_HEAD(thermal_governor_list);
 
 static DEFINE_MUTEX(thermal_list_lock);
 static DEFINE_MUTEX(thermal_governor_lock);
+static DEFINE_MUTEX(poweroff_lock);
 
 static atomic_t in_suspend;
+static bool power_off_triggered;
 
 static struct thermal_governor *def_governor;
 
@@ -322,6 +324,54 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 		       def_governor->throttle(tz, trip);
 }
 
+/**
+ * thermal_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void thermal_emergency_poweroff_func(struct work_struct *work)
+{
+	/*
+	 * We have reached here after the emergency thermal shutdown
+	 * waiting period has expired. This means orderly_poweroff has
+	 * not been able to shut off the system for some reason.
+	 * Try to shut down the system immediately using kernel_power_off
+	 * if populated
+	 */
+	WARN(1, "Attempting kernel_power_off: Temperature too high\n");
+	kernel_power_off();
+
+	/*
+	 * Worst of the worst case: trigger emergency restart
+	 */
+	WARN(1, "Attempting emergency_restart: Temperature too high\n");
+	emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work,
+			    thermal_emergency_poweroff_func);
+
+/**
+ * thermal_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a known period of time. With a delay of 0 or less nothing is scheduled.
+ */
+void thermal_emergency_poweroff(void)
+{
+	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
+	/*
+	 * poweroff_delay_ms must be a carefully profiled positive value.
+	 * It's a must for thermal_emergency_poweroff_work to be scheduled.
+	 */
+	if (poweroff_delay_ms <= 0)
+		return;
+	schedule_delayed_work(&thermal_emergency_poweroff_work,
+			      msecs_to_jiffies(poweroff_delay_ms));
+}
+
 static void handle_critical_trips(struct thermal_zone_device *tz,
 				  int trip, enum thermal_trip_type trip_type)
 {
@@ -342,7 +392,17 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
 		dev_emerg(&tz->device,
 			  "critical temperature reached(%d C),shutting down\n",
 			  tz->temperature / 1000);
-		orderly_poweroff(true);
+		mutex_lock(&poweroff_lock);
+		if (!power_off_triggered) {
+			/*
+			 * Queue a backup emergency shutdown in the event of
+			 * orderly_poweroff failure
+			 */
+			thermal_emergency_poweroff();
+			orderly_poweroff(true);
+			power_off_triggered = true;
+		}
+		mutex_unlock(&poweroff_lock);
 	}
 }
 
@@ -1463,6 +1523,7 @@ static int __init thermal_init(void)
 {
 	int result;
 
+	mutex_init(&poweroff_lock);
 	result = thermal_register_governors();
 	if (result)
 		goto error;
@@ -1497,6 +1558,7 @@ static int __init thermal_init(void)
 	ida_destroy(&thermal_cdev_ida);
 	mutex_destroy(&thermal_list_lock);
 	mutex_destroy(&thermal_governor_lock);
+	mutex_destroy(&poweroff_lock);
 	return result;
 }
 
diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h
index c35d0c0..4635f95 100644
--- a/include/linux/devfreq_cooling.h
+++ b/include/linux/devfreq_cooling.h
@@ -34,6 +34,23 @@
  *			If get_dynamic_power() is NULL, then the
  *			dynamic power is calculated as
  *			@dyn_power_coeff * frequency * voltage^2
+ * @get_real_power:	When this is set, the framework uses it to ask the
+ *			device driver for the actual power.
+ *			Some devices have more sophisticated methods
+ *			(like power counters) to approximate the actual power
+ *			that they use.
+ *			This function provides more accurate data to the
+ *			thermal governor. When the driver does not provide
+ *			such a function, the framework just uses a pre-calculated
+ *			table and scales the power by 'utilization'
+ *			(based on 'busy_time' and 'total_time' taken from
+ *			devfreq 'last_status').
+ *			The value returned by this function must be lower
+ *			than or equal to the maximum power value
+ *			for the current state
+ *			(which can be found in power_table[state]).
+ *			When this interface is used, the power_table holds
+ *			max total (static + dynamic) power value for each OPP.
  */
 struct devfreq_cooling_power {
 	unsigned long (*get_static_power)(struct devfreq *devfreq,
@@ -41,6 +58,8 @@ struct devfreq_cooling_power {
 	unsigned long (*get_dynamic_power)(struct devfreq *devfreq,
 					   unsigned long freq,
 					   unsigned long voltage);
+	int (*get_real_power)(struct devfreq *df, u32 *power,
+			      unsigned long freq, unsigned long voltage);
 	unsigned long dyn_power_coeff;
 };
 
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 2b4a8ff..6cde5b3 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -151,9 +151,9 @@ TRACE_EVENT(thermal_power_cpu_limit,
 TRACE_EVENT(thermal_power_devfreq_get_power,
 	TP_PROTO(struct thermal_cooling_device *cdev,
 		 struct devfreq_dev_status *status, unsigned long freq,
-		u32 dynamic_power, u32 static_power),
+		u32 dynamic_power, u32 static_power, u32 power),
 
-	TP_ARGS(cdev, status,  freq, dynamic_power, static_power),
+	TP_ARGS(cdev, status,  freq, dynamic_power, static_power, power),
 
 	TP_STRUCT__entry(
 		__string(type,         cdev->type    )
@@ -161,6 +161,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power,
 		__field(u32,           load          )
 		__field(u32,           dynamic_power )
 		__field(u32,           static_power  )
+		__field(u32,           power)
 	),
 
 	TP_fast_assign(
@@ -169,11 +170,13 @@ TRACE_EVENT(thermal_power_devfreq_get_power,
 		__entry->load = (100 * status->busy_time) / status->total_time;
 		__entry->dynamic_power = dynamic_power;
 		__entry->static_power = static_power;
+		__entry->power = power;
 	),
 
-	TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u",
+	TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u power=%u",
 		__get_str(type), __entry->freq,
-		__entry->load, __entry->dynamic_power, __entry->static_power)
+		__entry->load, __entry->dynamic_power, __entry->static_power,
+		__entry->power)
 );
 
 TRACE_EVENT(thermal_power_devfreq_limit,