platform/x86: Add support for Uncore frequency control

Some server users set limits on the uncore frequency using MSR 620H, while
running latency sensitive workloads. Here uncore frequency controls
RING/LLC(last-level cache) clocks.

But MSR control is not always possible from the user space, so this driver
provides a sysfs interface to set max and min frequency limits. This MSR
620H is a die scoped in multi-die system or package scoped in non multi-die
systems.

When this driver is loaded, a new directory is created under
 /sys/devices/system/cpu.

For example on a two package Skylake server:
$cd /sys/devices/system/cpu/intel_uncore_frequency

$ls
package_00_die_00 package_01_die_00

$ls package_00_die_00
max_freq_khz  min_freq_khz  initial_max_freq_khz
initial_min_freq_khz

$grep . *
    max_freq_khz:2400000
    min_freq_khz:1200000
    initial_max_freq_khz:2400000
    initial_min_freq_khz:1200000

Here, initial_max_freq_khz and initial_min_freq_khz are read only
attributes to show power up or initial values of max and min frequencies
respectively. Other attributes are read-write, so that users can modify.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index f7def0a..e60b051 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1337,6 +1337,17 @@
 	  To compile this driver as a module, choose M here: the module
 	  will be called pcengines-apuv2.
 
+config INTEL_UNCORE_FREQ_CONTROL
+	tristate "Intel Uncore frequency control driver"
+	depends on X86_64
+	help
+	  This driver allows control of uncore frequency limits on
+	  supported server platforms.
+	  Uncore frequency controls RING/LLC (last-level cache) clocks.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called intel-uncore-frequency.
+
 source "drivers/platform/x86/intel_speed_select_if/Kconfig"
 
 config SYSTEM76_ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 42d85a0..3747b1f 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -105,3 +105,4 @@
 obj-$(CONFIG_PCENGINES_APU2)	+= pcengines-apuv2.o
 obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
 obj-$(CONFIG_SYSTEM76_ACPI)	+= system76_acpi.o
+obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)	+= intel-uncore-frequency.o
diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
new file mode 100644
index 0000000..2b1a073
--- /dev/null
+++ b/drivers/platform/x86/intel-uncore-frequency.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Uncore Frequency Setting
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Provide interface to set MSR 620 at a granularity of per die. On CPU online,
+ * one control CPU is identified per die to read/write limit. This control CPU
+ * is changed, if the CPU state is changed to offline. When the last CPU is
+ * offline in a die then remove the sysfs object for that die.
+ * The majority of actual code is related to sysfs create and read/write
+ * attributes.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define MSR_UNCORE_RATIO_LIMIT			0x620
+#define UNCORE_FREQ_KHZ_MULTIPLIER		100000
+
+/**
+ * struct uncore_data -	Encapsulate all uncore data
+ * @stored_uncore_data:	Last user changed MSR 620 value, which will be restored
+ *			on system resume.
+ * @initial_min_freq_khz: Sampled minimum uncore frequency at driver init
+ * @initial_max_freq_khz: Sampled maximum uncore frequency at driver init
+ * @control_cpu:	Designated CPU for a die to read/write
+ * @valid:		Mark the data valid/invalid
+ *
+ * This structure is used to encapsulate all data related to uncore sysfs
+ * settings for a die/package.
+ */
+struct uncore_data {
+	struct kobject kobj;
+	u64 stored_uncore_data;
+	u32 initial_min_freq_khz;
+	u32 initial_max_freq_khz;
+	int control_cpu;
+	bool valid;
+};
+
+#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
+
+/* Max instances for uncore data, one for each die */
+static int uncore_max_entries __read_mostly;
+/* Storage for uncore data for all instances */
+static struct uncore_data *uncore_instances;
+/* Root of the all uncore sysfs kobjs */
+struct kobject uncore_root_kobj;
+/* Stores the CPU mask of the target CPUs to use during uncore read/write */
+static cpumask_t uncore_cpu_mask;
+/* CPU online callback register instance */
+static enum cpuhp_state uncore_hp_state __read_mostly;
+/* Mutex to control all mutual exclusions */
+static DEFINE_MUTEX(uncore_lock);
+
+struct uncore_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+	ssize_t (*store)(struct kobject *kobj,
+			 struct attribute *attr, const char *c, ssize_t count);
+};
+
+#define define_one_uncore_ro(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define define_one_uncore_rw(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define show_uncore_data(member_name)					\
+	static ssize_t show_##member_name(struct kobject *kobj,         \
+					  struct attribute *attr,	\
+					  char *buf)			\
+	{                                                               \
+		struct uncore_data *data = to_uncore_data(kobj);	\
+		return scnprintf(buf, PAGE_SIZE, "%u\n",		\
+				 data->member_name);			\
+	}								\
+	define_one_uncore_ro(member_name)
+
+show_uncore_data(initial_min_freq_khz);
+show_uncore_data(initial_max_freq_khz);
+
+/* Common function to read MSR 0x620 and read min/max */
+static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
+			     unsigned int *max)
+{
+	u64 cap;
+	int ret;
+
+	ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+	if (ret)
+		return ret;
+
+	*max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
+	*min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
+
+	return 0;
+}
+
+/* Common function to set min/max ratios to be used by sysfs callbacks */
+static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
+			      int set_max)
+{
+	int ret;
+	u64 cap;
+
+	mutex_lock(&uncore_lock);
+
+	input /= UNCORE_FREQ_KHZ_MULTIPLIER;
+	if (!input || input > 0x7F) {
+		ret = -EINVAL;
+		goto finish_write;
+	}
+
+	ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+	if (ret)
+		goto finish_write;
+
+	if (set_max) {
+		cap &= ~0x7F;
+		cap |= input;
+	} else  {
+		cap &= ~GENMASK(14, 8);
+		cap |= (input << 8);
+	}
+
+	ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
+	if (ret)
+		goto finish_write;
+
+	data->stored_uncore_data = cap;
+
+finish_write:
+	mutex_unlock(&uncore_lock);
+
+	return ret;
+}
+
+static ssize_t store_min_max_freq_khz(struct kobject *kobj,
+				      struct attribute *attr,
+				      const char *buf, ssize_t count,
+				      int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int input;
+
+	if (kstrtouint(buf, 10, &input))
+		return -EINVAL;
+
+	uncore_write_ratio(data, input, min_max);
+
+	return count;
+}
+
+static ssize_t show_min_max_freq_khz(struct kobject *kobj,
+				     struct attribute *attr,
+				     char *buf, int min_max)
+{
+	struct uncore_data *data = to_uncore_data(kobj);
+	unsigned int min, max;
+	int ret;
+
+	mutex_lock(&uncore_lock);
+	ret = uncore_read_ratio(data, &min, &max);
+	mutex_unlock(&uncore_lock);
+	if (ret)
+		return ret;
+
+	if (min_max)
+		return sprintf(buf, "%u\n", max);
+
+	return sprintf(buf, "%u\n", min);
+}
+
+#define store_uncore_min_max(name, min_max)				\
+	static ssize_t store_##name(struct kobject *kobj,		\
+				    struct attribute *attr,		\
+				    const char *buf, ssize_t count)	\
+	{                                                               \
+									\
+		return store_min_max_freq_khz(kobj, attr, buf, count,	\
+					      min_max);			\
+	}
+
+#define show_uncore_min_max(name, min_max)				\
+	static ssize_t show_##name(struct kobject *kobj,		\
+				   struct attribute *attr, char *buf)	\
+	{                                                               \
+									\
+		return show_min_max_freq_khz(kobj, attr, buf, min_max); \
+	}
+
+store_uncore_min_max(min_freq_khz, 0);
+store_uncore_min_max(max_freq_khz, 1);
+
+show_uncore_min_max(min_freq_khz, 0);
+show_uncore_min_max(max_freq_khz, 1);
+
+define_one_uncore_rw(min_freq_khz);
+define_one_uncore_rw(max_freq_khz);
+
+static struct attribute *uncore_attrs[] = {
+	&initial_min_freq_khz.attr,
+	&initial_max_freq_khz.attr,
+	&max_freq_khz.attr,
+	&min_freq_khz.attr,
+	NULL
+};
+
+static struct kobj_type uncore_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = uncore_attrs,
+};
+
+static struct kobj_type uncore_root_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
+/* Caller provides protection */
+static struct uncore_data *uncore_get_instance(unsigned int cpu)
+{
+	int id = topology_logical_die_id(cpu);
+
+	if (id >= 0 && id < uncore_max_entries)
+		return &uncore_instances[id];
+
+	return NULL;
+}
+
+static void uncore_add_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (!data) {
+		mutex_unlock(&uncore_lock);
+		return;
+	}
+
+	if (data->valid) {
+		/* control cpu changed */
+		data->control_cpu = cpu;
+	} else {
+		char str[64];
+		int ret;
+
+		memset(data, 0, sizeof(*data));
+		sprintf(str, "package_%02d_die_%02d",
+			topology_physical_package_id(cpu),
+			topology_die_id(cpu));
+
+		uncore_read_ratio(data, &data->initial_min_freq_khz,
+				  &data->initial_max_freq_khz);
+
+		ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
+					   &uncore_root_kobj, str);
+		if (!ret) {
+			data->control_cpu = cpu;
+			data->valid = true;
+		}
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+/* Last CPU in this die is offline, so remove sysfs entries */
+static void uncore_remove_die_entry(int cpu)
+{
+	struct uncore_data *data;
+
+	mutex_lock(&uncore_lock);
+	data = uncore_get_instance(cpu);
+	if (data) {
+		kobject_put(&data->kobj);
+		data->control_cpu = -1;
+		data->valid = false;
+	}
+	mutex_unlock(&uncore_lock);
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+	int target;
+
+	/* Check if there is an online cpu in the package for uncore MSR */
+	target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
+	if (target < nr_cpu_ids)
+		return 0;
+
+	/* Use this CPU on this die as a control CPU */
+	cpumask_set_cpu(cpu, &uncore_cpu_mask);
+	uncore_add_die_entry(cpu);
+
+	return 0;
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+	int target;
+
+	/* Check if existing cpu is used for uncore MSRs */
+	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+		return 0;
+
+	/* Find a new cpu to set uncore MSR */
+	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+
+	if (target < nr_cpu_ids) {
+		cpumask_set_cpu(target, &uncore_cpu_mask);
+		uncore_add_die_entry(target);
+	} else {
+		uncore_remove_die_entry(cpu);
+	}
+
+	return 0;
+}
+
+static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
+			    void *_unused)
+{
+	int cpu;
+
+	switch (mode) {
+	case PM_POST_HIBERNATION:
+	case PM_POST_RESTORE:
+	case PM_POST_SUSPEND:
+		for_each_cpu(cpu, &uncore_cpu_mask) {
+			struct uncore_data *data;
+			int ret;
+
+			data = uncore_get_instance(cpu);
+			if (!data || !data->valid || !data->stored_uncore_data)
+				continue;
+
+			ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
+					    data->stored_uncore_data);
+			if (ret)
+				return ret;
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static struct notifier_block uncore_pm_nb = {
+	.notifier_call = uncore_pm_notify,
+};
+
+#define ICPU(model)     { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
+
+static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
+	ICPU(INTEL_FAM6_BROADWELL_G),
+	ICPU(INTEL_FAM6_BROADWELL_X),
+	ICPU(INTEL_FAM6_BROADWELL_D),
+	ICPU(INTEL_FAM6_SKYLAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_X),
+	ICPU(INTEL_FAM6_ICELAKE_D),
+	{}
+};
+
+static int __init intel_uncore_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_uncore_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	uncore_max_entries = topology_max_packages() *
+					topology_max_die_per_package();
+	uncore_instances = kcalloc(uncore_max_entries,
+				   sizeof(*uncore_instances), GFP_KERNEL);
+	if (!uncore_instances)
+		return -ENOMEM;
+
+	ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
+				   &cpu_subsys.dev_root->kobj,
+				   "intel_uncore_frequency");
+	if (ret)
+		goto err_free;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"platform/x86/uncore-freq:online",
+				uncore_event_cpu_online,
+				uncore_event_cpu_offline);
+	if (ret < 0)
+		goto err_rem_kobj;
+
+	uncore_hp_state = ret;
+
+	ret = register_pm_notifier(&uncore_pm_nb);
+	if (ret)
+		goto err_rem_state;
+
+	return 0;
+
+err_rem_state:
+	cpuhp_remove_state(uncore_hp_state);
+err_rem_kobj:
+	kobject_put(&uncore_root_kobj);
+err_free:
+	kfree(uncore_instances);
+
+	return ret;
+}
+module_init(intel_uncore_init)
+
+static void __exit intel_uncore_exit(void)
+{
+	int i;
+
+	unregister_pm_notifier(&uncore_pm_nb);
+	cpuhp_remove_state(uncore_hp_state);
+	for (i = 0; i < uncore_max_entries; ++i) {
+		if (uncore_instances[i].valid)
+			kobject_put(&uncore_instances[i].kobj);
+	}
+	kobject_put(&uncore_root_kobj);
+	kfree(uncore_instances);
+}
+module_exit(intel_uncore_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");