Fix machine check recovery
-----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) iQIcBAABAgAGBQJPtS5qAAoJEKurIx+X31iB+bAP/1FjUa2Nd53X89HFc6DoLktA 4AshM/JENhAfSbpTfGhg10ZOuwUa8sQ85Sf6yz1CsW6mEiJK/bDFrR1g2KrmejyL owgQvV6ukPABzfB27tWyXSVVBPmLkJviedLDautVpgPEPVuqauntmpe7fW51b5pf 92MxvYZ6AzgbDIjVaXP7e+kPomvgyM1C/UEvCgoyEcw81h5dchU9NSdXNBS67JS/ uOsMiMJyoNI46haYYbyFgMq3RmpYuxTLFj7qFDlUltyjP+vIvyLs38Ae/vkRMNfV sYXWRUQlRpvqg4MDIFVZx8FWTufzTm0BMS+Be7JkXKWdF3DAksq6FprOWIxfYi+d PMxwTFeSJzTINb9n9MiLt3TmuRy3mu37QWd28qJaJciNMkYWbclPqyJmjwsuAMKg hKSy2FvewIDHTAGOkwaVjS+L8O7j3TNRIAbweFA1d1K4rt6oSfwdn6GZrAb6MTvx oV0Fe1nAyY9mucyjknBTim3RYZ9qQ7H9SjL8JoaGihSzi988MgBun+iEQOQiwjNl YJNGIv0wb31qCKFtxU0A8rA0sFdhQRoLgigJfETb5a+2ghtG323oH6ZVWTnB8bp/ g1XOpus222/iJdL2AizMiJeUNV1IZ0SeLn2GoTFQIy9Uf11Zdgu5wLJumUva8EC6 JAACbpBlYufftIn278OH =tk2M -----END PGP SIGNATURE----- Merge tag 'linus-mce-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras Pull a machine check recovery fix from Tony Luck. I really don't like how the MCE code does some of the things it does, but this does seem to be an improvement. * tag 'linus-mce-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras: x86/mce: Only restart instruction after machine check recovery if it is safe
This commit is contained in:
commit
3d9944978e
1 changed files with 11 additions and 3 deletions
|
@@ -945,9 +945,10 @@ struct mce_info {
|
|||
atomic_t inuse;
|
||||
struct task_struct *t;
|
||||
__u64 paddr;
|
||||
int restartable;
|
||||
} mce_info[MCE_INFO_MAX];
|
||||
|
||||
static void mce_save_info(__u64 addr)
|
||||
static void mce_save_info(__u64 addr, int c)
|
||||
{
|
||||
struct mce_info *mi;
|
||||
|
||||
|
@@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr)
|
|||
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
|
||||
mi->t = current;
|
||||
mi->paddr = addr;
|
||||
mi->restartable = c;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||
if (worst == MCE_AR_SEVERITY) {
|
||||
/* schedule action before return to userland */
|
||||
mce_save_info(m.addr);
|
||||
mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
|
||||
set_thread_flag(TIF_MCE_NOTIFY);
|
||||
} else if (kill_it) {
|
||||
force_sig(SIGBUS, current);
|
||||
|
@@ -1179,7 +1181,13 @@ void mce_notify_process(void)
|
|||
|
||||
pr_err("Uncorrected hardware memory error in user-access at %llx",
|
||||
mi->paddr);
|
||||
if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
|
||||
/*
|
||||
* We must call memory_failure() here even if the current process is
|
||||
* doomed. We still need to mark the page as poisoned and alert any
|
||||
* other users of the page.
|
||||
*/
|
||||
if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
|
||||
mi->restartable == 0) {
|
||||
pr_err("Memory error not recovered");
|
||||
force_sig(SIGBUS, current);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue