Web lists-archives.com

[PATCH] x86: handle MSR exception when setting energy perf bias




On very rare occasions, immediately after a suspend, one of our
SandyBridge CI boxes hits the exception below on CPU0 while trying to
reconfigure the energy bias register.  As far as I can tell, this is
unlikely to be a race in the kernel, since only one CPU is online,
preemption and IRQs are disabled, and it can only be reproduced on this
specific SNB-2600, on rare occasions.  It looks more like faulty
hardware to me.

Still, we can handle this exception more gracefully to silence the CI,
by using the safe versions of the rdmsrl/wrmsrl wrappers (rdmsrl_safe()
and wrmsrl_safe()).

The log on the faulty box is as follows:

[  280.969486] unchecked MSR access error: RDMSR from 0x1b0 at rIP: 0xffffffff81030a4f (init_intel_energy_perf.part.3+0xf/0xb0)
[  280.969488] Call Trace:
[  280.969492]  intel_bsp_resume+0x1a/0x20
[  280.969494]  bsp_resume+0x1d/0x20
[  280.969498]  syscore_resume+0x62/0x320
[  280.969501]  suspend_devices_and_enter+0xa09/0xcd0
[  280.969506]  pm_suspend+0x574/0xa00
[  280.969509]  state_store+0x82/0xf0
[  280.969514]  kobj_attr_store+0xf/0x20
[  280.969516]  sysfs_kf_write+0x45/0x60
[  280.969519]  kernfs_fop_write+0x124/0x1c0
[  280.969524]  __vfs_write+0x28/0x130
[  280.969526]  ? rcu_read_lock_sched_held+0x7a/0x90
[  280.969528]  ? rcu_sync_lockdep_assert+0x2f/0x60
[  280.969531]  ? __sb_start_write+0x10c/0x200
[  280.969535]  vfs_write+0xc8/0x1c0
[  280.969538]  SyS_write+0x49/0xb0
[  280.969542]  entry_SYSCALL_64_fastpath+0x1c/0xb1
[  280.969544] RIP: 0033:0x7fb6721ce8f0
[  280.969546] RSP: 002b:00007ffc3ac05da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  280.969549] RAX: ffffffffffffffda RBX: ffffffff81489d83 RCX: 00007fb6721ce8f0
[  280.969551] RDX: 0000000000000004 RSI: 00005634312da060 RDI: 0000000000000007
[  280.969552] RBP: ffffc90001147f88 R08: 00005634312d7de0 R09: 00007fb67269f700
[  280.969554] R10: 00007fb672497b58 R11: 0000000000000246 R12: 00005634312d7d00
[  280.969555] R13: 0000000000000001 R14: 0000000000000004 R15: 0000000000000004
[  280.969559]  ? __this_cpu_preempt_check+0x13/0x20
[  280.969566] unchecked MSR access error: WRMSR to 0x1b0 (tried to write 0x0000000000000006) at rIP: 0xffffffff81030a8e (init_intel_energy_perf.part.3+0x4e/0xb0)
[  280.969567] Call Trace:
[  280.969569]  intel_bsp_resume+0x1a/0x20
[  280.969571]  bsp_resume+0x1d/0x20
[  280.969573]  syscore_resume+0x62/0x320
[  280.969576]  suspend_devices_and_enter+0xa09/0xcd0
[  280.969580]  pm_suspend+0x574/0xa00
[  280.969584]  state_store+0x82/0xf0
[  280.969587]  kobj_attr_store+0xf/0x20
[  280.969589]  sysfs_kf_write+0x45/0x60
[  280.969592]  kernfs_fop_write+0x124/0x1c0
[  280.969595]  __vfs_write+0x28/0x130
[  280.969597]  ? rcu_read_lock_sched_held+0x7a/0x90
[  280.969599]  ? rcu_sync_lockdep_assert+0x2f/0x60
[  280.969601]  ? __sb_start_write+0x10c/0x200
[  280.969605]  vfs_write+0xc8/0x1c0
[  280.969608]  SyS_write+0x49/0xb0
[  280.969612]  entry_SYSCALL_64_fastpath+0x1c/0xb1
[  280.969613] RIP: 0033:0x7fb6721ce8f0
[  280.969614] RSP: 002b:00007ffc3ac05da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  280.969617] RAX: ffffffffffffffda RBX: ffffffff81489d83 RCX: 00007fb6721ce8f0
[  280.969619] RDX: 0000000000000004 RSI: 00005634312da060 RDI: 0000000000000007
[  280.969620] RBP: ffffc90001147f88 R08: 00005634312d7de0 R09: 00007fb67269f700
[  280.969621] R10: 00007fb672497b58 R11: 0000000000000246 R12: 00005634312d7d00
[  280.969623] R13: 0000000000000001 R14: 0000000000000004 R15: 0000000000000004
[  280.969626]  ? __this_cpu_preempt_check+0x13/0x20

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102365
Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxxxx>
---
 arch/x86/kernel/cpu/intel.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dfa90a3a5145..51082a9a3d8a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -465,14 +465,17 @@ static void init_intel_energy_perf(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_EPB))
 		return;
 
-	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-	if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+	if (rdmsrl_safe(MSR_IA32_ENERGY_PERF_BIAS, &epb) ||
+	    (epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
 		return;
 
+	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+	if (wrmsrl_safe(MSR_IA32_ENERGY_PERF_BIAS, epb)) {
+		pr_warn_once("ENERGY_PERF_BIAS: Failed to set to 'normal'\n");
+		return;
+	}
 	pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
 	pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
-	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
-	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
 }
 
 static void intel_bsp_resume(struct cpuinfo_x86 *c)
-- 
2.11.0