Contributors: 3

Author                Tokens    Token Proportion    Commits    Commit Proportion
Fenghua Yu               646              96.56%          4               66.67%
Greg Kroah-Hartman        20               2.99%          1               16.67%
Benjamin Thiel             3               0.45%          1               16.67%
Total                    669                              6


// SPDX-License-Identifier: GPL-2.0
#include <linux/syscore_ops.h>
#include <linux/suspend.h>
#include <linux/cpu.h>

#include <asm/msr.h>
#include <asm/mwait.h>

#define UMWAIT_C02_ENABLE	0

#define UMWAIT_CTRL_VAL(max_time, c02_disable)				\
	(((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) |		\
	((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))

/*
 * Cache of the IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By
 * default, the umwait max time is 100000 TSC quanta and C0.2 is enabled.
 */
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
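
/*
 * Worked example of the default above (a sketch, assuming the msr-index.h
 * definitions where MSR_IA32_UMWAIT_CONTROL_C02_DISABLE is bit 0 and
 * MSR_IA32_UMWAIT_CONTROL_TIME_MASK clears bits [1:0]):
 *
 *	UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE)
 *		= (100000 & ~0x03U) | (0 & 0x01U)
 *		= 0x186A0
 *
 * i.e. a maximum wait of 100000 TSC quanta with C0.2 permitted.
 */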

/*
 * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
 * hardware or BIOS before kernel boot.
 */
static u32 orig_umwait_control_cached __ro_after_init;

/*
 * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
 * the sysfs write functions.
 */
static DEFINE_MUTEX(umwait_lock);

static void umwait_update_control_msr(void *unused)
{
	lockdep_assert_irqs_disabled();
	wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
}

/*
 * The CPU hotplug callback sets the control MSR to the global control
 * value.
 *
 * Disable interrupts so the read of umwait_control_cached and the WRMSR
 * are protected against a concurrent sysfs write. Otherwise the sysfs
 * write could update the cached value after it had been read on this CPU
 * and issue the IPI before the old value had been written. The IPI would
 * interrupt, write the new value and after return from IPI the previous
 * value would be written by this CPU.
 *
 * With interrupts disabled the upcoming CPU either sees the new control
 * value or the IPI is updating this CPU to the new control value after
 * interrupts have been reenabled.
 */
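/*
 * Illustrative interleaving (hypothetical timing, not taken from the code)
 * that the interrupt disabling below prevents:
 *
 *	onlining CPU				sysfs writer
 *	------------				------------
 *	read umwait_control_cached (old)
 *						WRITE_ONCE(new)
 *						on_each_cpu() sends IPI
 *	IPI handler writes new value to MSR
 *	wrmsr(old)  <-- stale value ends up in the MSR
 */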
static int umwait_cpu_online(unsigned int cpu)
{
	local_irq_disable();
	umwait_update_control_msr(NULL);
	local_irq_enable();
	return 0;
}

/*
 * The CPU hotplug callback sets the control MSR to the original control
 * value.
 */
static int umwait_cpu_offline(unsigned int cpu)
{
	/*
	 * This code is protected by the CPU hotplug already and
	 * orig_umwait_control_cached is never changed after it caches
	 * the original control MSR value in umwait_init(). So there
	 * is no race condition here.
	 */
	wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0);

	return 0;
}

/*
 * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
 * is the only active CPU at this time. The MSR is set up on the APs via the
 * CPU hotplug callback.
 *
 * This function is invoked on resume from suspend and hibernation. On
 * resume from suspend the restore should not be required, but we neither
 * trust the firmware nor does it matter if the same value is written
 * again.
 */
static void umwait_syscore_resume(void)
{
	umwait_update_control_msr(NULL);
}

static struct syscore_ops umwait_syscore_ops = {
	.resume	= umwait_syscore_resume,
};

/* sysfs interface */

/*
 * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
 * Otherwise, C0.2 is enabled.
 */
static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
{
	return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
}

static inline u32 umwait_ctrl_max_time(u32 ctrl)
{
	return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
}
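
/*
 * Example decode (illustrative, assuming bit 0 is the C0.2 disable bit):
 * for ctrl == 0x186A1, umwait_ctrl_c02_enabled() returns false (bit 0 set,
 * C0.2 disabled) and umwait_ctrl_max_time() returns 0x186A0, i.e. 100000
 * TSC quanta.
 */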

static inline void umwait_update_control(u32 maxtime, bool c02_enable)
{
	u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;

	if (!c02_enable)
		ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;

	WRITE_ONCE(umwait_control_cached, ctrl);
	/*
	 * Propagate to all CPUs. on_each_cpu() runs the update function
	 * with interrupts disabled on each CPU, including the local one,
	 * which satisfies the lockdep assertion in
	 * umwait_update_control_msr().
	 */
	on_each_cpu(umwait_update_control_msr, NULL, 1);
}

static ssize_t
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	u32 ctrl = READ_ONCE(umwait_control_cached);

	return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
}

static ssize_t enable_c02_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	bool c02_enable;
	u32 ctrl;
	int ret;

	ret = kstrtobool(buf, &c02_enable);
	if (ret)
		return ret;

	mutex_lock(&umwait_lock);

	ctrl = READ_ONCE(umwait_control_cached);
	if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
		umwait_update_control(ctrl, c02_enable);

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(enable_c02);

static ssize_t
max_time_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	u32 ctrl = READ_ONCE(umwait_control_cached);

	return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
}

static ssize_t max_time_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	u32 max_time, ctrl;
	int ret;

	ret = kstrtou32(buf, 0, &max_time);
	if (ret)
		return ret;

	/* bits[1:0] must be zero */
	if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
		return -EINVAL;

	mutex_lock(&umwait_lock);

	ctrl = READ_ONCE(umwait_control_cached);
	if (max_time != umwait_ctrl_max_time(ctrl))
		umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(max_time);

static struct attribute *umwait_attrs[] = {
	&dev_attr_enable_c02.attr,
	&dev_attr_max_time.attr,
	NULL
};

static struct attribute_group umwait_attr_group = {
	.attrs = umwait_attrs,
	.name = "umwait_control",
};
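
/*
 * Usage note: the group name above, combined with the CPU subsystem root
 * device used in umwait_init(), means the controls are expected to show up
 * as (paths assumed for illustration, not taken verbatim from this file):
 *
 *	/sys/devices/system/cpu/umwait_control/enable_c02
 *	/sys/devices/system/cpu/umwait_control/max_time
 *
 * e.g. "echo 0 > enable_c02" disables C0.2, and "echo 10000 > max_time"
 * sets the maximum wait to 10000 TSC quanta; values with bits [1:0] set
 * are rejected with -EINVAL by max_time_store().
 */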

static int __init umwait_init(void)
{
	struct device *dev;
	int ret;

	if (!boot_cpu_has(X86_FEATURE_WAITPKG))
		return -ENODEV;

	/*
	 * Cache the original control MSR value before the control MSR is
	 * changed. This is the only place where orig_umwait_control_cached
	 * is modified.
	 */
	rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
				umwait_cpu_online, umwait_cpu_offline);
	if (ret < 0) {
		/*
		 * On failure, the control MSR on all CPUs has the
		 * original control value.
		 */
		return ret;
	}

	register_syscore_ops(&umwait_syscore_ops);

	/*
	 * Add umwait control interface. Ignore failure, so at least the
	 * default values are set up in case the machine manages to boot.
	 */
	dev = bus_get_dev_root(&cpu_subsys);
	if (dev) {
		ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
		put_device(dev);
	}
	return ret;
}
device_initcall(umwait_init);