$NetBSD: patch-XSA404,v 1.1 2022/06/24 13:47:37 bouyer Exp $ From: Andrew Cooper Subject: x86/spec-ctrl: Make VERW flushing runtime conditional Currently, VERW flushing to mitigate MDS is boot time conditional per domain type. However, to provide mitigations for DRPW (CVE-2022-21166), we need to conditionally use VERW based on the trustworthiness of the guest, and the devices passed through. Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags. Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW disposition at domain creation time, and context switch the SCF_verw bit. For now, VERW flushing is used and controlled exactly as before, but later patches will add per-domain cases too. No change in behaviour. This is part of XSA-404. Signed-off-by: Andrew Cooper Reviewed-by: Jan Beulich Reviewed-by: Roger Pau Monné diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc index eead69ada2c2..e8bdf30fa46c 100644 --- docs/misc/xen-command-line.pandoc.orig +++ docs/misc/xen-command-line.pandoc @@ -2058,9 +2058,8 @@ in place for guests to use. Use of a positive boolean value for either of these options is invalid. The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine -grained control over the alternative blocks used by Xen. These impact Xen's -ability to protect itself, and Xen's ability to virtualise support for guests -to use. +grained control over the primitives by Xen. These impact Xen's ability to +protect itself, and Xen's ability to virtualise support for guests to use. * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests respectively. diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 820cb0f90558..fe95b25a034e 100644 --- xen/arch/x86/domain.c.orig +++ xen/arch/x86/domain.c @@ -651,6 +651,8 @@ int arch_domain_create(struct domain *d, domain_cpu_policy_changed(d); + spec_ctrl_init_domain(d); + return 0; fail: @@ -1746,14 +1748,15 @@ static void __context_switch(void) void context_switch(struct vcpu *prev, struct vcpu *next) { unsigned int cpu = smp_processor_id(); + struct cpu_info *info = get_cpu_info(); const struct domain *prevd = prev->domain, *nextd = next->domain; unsigned int dirty_cpu = next->dirty_cpu; ASSERT(prev != next); ASSERT(local_irq_is_enabled()); - get_cpu_info()->use_pv_cr3 = false; - get_cpu_info()->xen_cr3 = 0; + info->use_pv_cr3 = false; + info->xen_cr3 = 0; if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN ) { @@ -1816,6 +1819,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next) *last_id = next_id; } } + + /* Update the top-of-stack block with the VERW disposition. */ + info->spec_ctrl_flags &= ~SCF_verw; + if ( nextd->arch.verw ) + info->spec_ctrl_flags |= SCF_verw; } sched_context_switched(prev, next); diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S index 27c8c5ca4943..62ed0d854df1 100644 --- xen/arch/x86/hvm/vmx/entry.S.orig +++ xen/arch/x86/hvm/vmx/entry.S @@ -81,6 +81,7 @@ UNLIKELY_END(realmode) /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ SPEC_CTRL_EXIT_TO_HVM /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + DO_SPEC_CTRL_COND_VERW mov VCPU_hvm_guest_cr2(%rbx),%rax diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c index 7447d4a8e5b5..38e1f1098210 100644 --- xen/arch/x86/spec_ctrl.c.orig +++ xen/arch/x86/spec_ctrl.c @@ -35,8 +35,8 @@ static bool __initdata opt_msr_sc_pv = true; static bool __initdata opt_msr_sc_hvm = true; static bool __initdata opt_rsb_pv = true; static bool __initdata opt_rsb_hvm = true; -static int8_t __initdata opt_md_clear_pv = -1; -static int8_t __initdata opt_md_clear_hvm = -1; +static int8_t __read_mostly opt_md_clear_pv = -1; +static int8_t __read_mostly opt_md_clear_hvm = -1; /* Cmdline controls for Xen's speculative settings. */ static enum ind_thunk { @@ -878,6 +878,13 @@ static __init void mds_calculations(uint64_t caps) } } +void spec_ctrl_init_domain(struct domain *d) +{ + bool pv = is_pv_domain(d); + + d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm; +} + void __init init_speculation_mitigations(void) { enum ind_thunk thunk = THUNK_DEFAULT; @@ -1078,21 +1085,20 @@ void __init init_speculation_mitigations(void) boot_cpu_has(X86_FEATURE_MD_CLEAR)); /* - * Enable MDS defences as applicable. The PV blocks need using all the - * time, and the Idle blocks need using if either PV or HVM defences are - * used. + * Enable MDS defences as applicable. The Idle blocks need using if + * either PV or HVM defences are used. * * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with - * equivelent semantics to avoid needing to perform both flushes on the - * HVM path. The HVM blocks don't need activating if our hypervisor told - * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves. + * equivalent semantics to avoid needing to perform both flushes on the + * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH. + * + * After calculating the appropriate idle setting, simplify + * opt_md_clear_hvm to mean just "should we VERW on the way into HVM + * guests", so spec_ctrl_init_domain() can calculate suitable settings. */ - if ( opt_md_clear_pv ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV); if ( opt_md_clear_pv || opt_md_clear_hvm ) setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); - if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM); + opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; /* * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h index a8222e978cd9..bcba926bda41 100644 --- xen/include/asm-x86/cpufeatures.h.orig +++ xen/include/asm-x86/cpufeatures.h @@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ -XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */ -XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */ +/* Bits 23,24 unused. */ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ /* Bug words follow the synthetic words. */ diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 309b56e2d6b7..71d1ca243b32 100644 --- xen/include/asm-x86/domain.h.orig +++ xen/include/asm-x86/domain.h @@ -295,6 +295,9 @@ struct arch_domain uint32_t pci_cf8; uint8_t cmos_idx; + /* Use VERW on return-to-guest for its flushing side effect. */ + bool verw; + union { struct pv_domain pv; struct hvm_domain hvm; diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h index b252bb863111..157a2c67d89c 100644 --- xen/include/asm-x86/spec_ctrl.h.orig +++ xen/include/asm-x86/spec_ctrl.h @@ -24,6 +24,7 @@ #define SCF_use_shadow (1 << 0) #define SCF_ist_wrmsr (1 << 1) #define SCF_ist_rsb (1 << 2) +#define SCF_verw (1 << 3) #ifndef __ASSEMBLY__ @@ -32,6 +33,7 @@ #include void init_speculation_mitigations(void); +void spec_ctrl_init_domain(struct domain *d); extern bool opt_ibpb; extern bool opt_ssbd; diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h index c60093b090b5..4a3777cc5227 100644 --- xen/include/asm-x86/spec_ctrl_asm.h.orig +++ xen/include/asm-x86/spec_ctrl_asm.h @@ -141,6 +141,19 @@ wrmsr .endm +.macro DO_SPEC_CTRL_COND_VERW +/* + * Requires %rsp=cpuinfo + * + * Issue a VERW for its flushing side effect, if indicated. This is a Spectre + * v1 gadget, but the IRET/VMEntry is serialising. + */ + testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) + jz .L\@_verw_skip + verw CPUINFO_verw_sel(%rsp) +.L\@_verw_skip: +.endm + .macro DO_SPEC_CTRL_ENTRY maybexen:req /* * Requires %rsp=regs (also cpuinfo if !maybexen) @@ -242,15 +255,12 @@ #define SPEC_CTRL_EXIT_TO_PV \ ALTERNATIVE "", \ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ - ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \ - X86_FEATURE_SC_VERW_PV + DO_SPEC_CTRL_COND_VERW /* Use when exiting to HVM guest context. */ #define SPEC_CTRL_EXIT_TO_HVM \ ALTERNATIVE "", \ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM; \ - ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \ - X86_FEATURE_SC_VERW_HVM /* * Use in IST interrupt/exception context. May interrupt Xen or PV context. From: Andrew Cooper Subject: x86/spec-ctrl: Enumeration for MMIO Stale Data controls The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP data movement primitives. FB_CLEAR indicates that the VERW instruction has re-gained it's Fill Buffer flushing side effect. This is only enumerated on parts where VERW had previously lost it's flushing side effect due to the MDS/TAA vulnerabilities being fixed in hardware. FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer clearing side effect of VERW can be turned off for performance reasons. This is part of XSA-404. Signed-off-by: Andrew Cooper Reviewed-by: Roger Pau Monné diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c index 38e1f1098210..fd36927ba1cb 100644 --- xen/arch/x86/spec_ctrl.c.orig +++ xen/arch/x86/spec_ctrl.c @@ -318,7 +318,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) printk("Speculative mitigation facilities:\n"); /* Hardware features which pertain to speculative mitigations. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", @@ -333,7 +333,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", - (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); + (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "", + (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", + (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", + (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", + (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", + (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : ""); /* Compiled-in support which pertains to mitigations. */ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h index ba9e90af210b..2a80660d849d 100644 --- xen/include/asm-x86/msr-index.h.orig +++ xen/include/asm-x86/msr-index.h @@ -55,6 +55,11 @@ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) #define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) +#define ARCH_CAPS_SBDR_SSDP_NO (_AC(1, ULL) << 13) +#define ARCH_CAPS_FBSDP_NO (_AC(1, ULL) << 14) +#define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15) +#define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17) +#define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) #define MSR_FLUSH_CMD 0x0000010b #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) From: Andrew Cooper Subject: x86/spec-ctrl: Add spec-ctrl=unpriv-mmio Per Xen's support statement, PCI passthrough should be to trusted domains because the overall system security depends on factors outside of Xen's control. As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR. However, users who have risk assessed their configuration may be happy with the risk of DoS, but unhappy with the risk of cross-domain data leakage. Such users should enable this option. On CPUs vulnerable to MDS, the existing mitigations are the best we can do to mitigate MMIO cross-domain data leakage. On CPUs fixed to MDS but vulnerable MMIO stale data leakage, this option: * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage using FB_CLEAR. * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the srb-lock, previously used to mitigate SRBDS. Both mitigations require microcode from IPU 2022.1, May 2022. This is part of XSA-404. Signed-off-by: Andrew Cooper Reviewed-by: Roger Pau Monné --- Backporting note: For Xen 4.7 and earlier with bool_t not aliasing bool, the ARCH_CAPS_FB_CLEAR hunk needs !! diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc index e8bdf30fa46c..022cb01da762 100644 --- docs/misc/xen-command-line.pandoc.orig +++ docs/misc/xen-command-line.pandoc @@ -2035,7 +2035,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). ### spec-ctrl (x86) > `= List of [ , xen=, {pv,hvm,msr-sc,rsb,md-clear}=, > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, -> l1d-flush,branch-harden,srb-lock}= ]` +> l1d-flush,branch-harden,srb-lock,unpriv-mmio}= ]` Controls for speculative execution sidechannel mitigations. By default, Xen will pick the most appropriate mitigations based on compiled in support, @@ -2114,8 +2114,16 @@ Xen will enable this mitigation. On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force or prevent Xen from protect the Special Register Buffer from leaking stale data. By default, Xen will enable this mitigation, except on parts where MDS -is fixed and TAA is fixed/mitigated (in which case, there is believed to be no -way for an attacker to obtain the stale data). +is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO +mappings (in which case, there is believed to be no way for an attacker to +obtain stale data). + +The `unpriv-mmio=` boolean indicates whether the system has (or will have) +less than fully privileged domains granted access to MMIO devices. By +default, this option is disabled. If enabled, Xen will use the `FB_CLEAR` +and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode +release to mitigate cross-domain leakage of data via the MMIO Stale Data +vulnerabilities. ### sync_console > `= ` diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c index fd36927ba1cb..d4ba9412067b 100644 --- xen/arch/x86/spec_ctrl.c.orig +++ xen/arch/x86/spec_ctrl.c @@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. static int8_t __initdata opt_srb_lock = -1; uint64_t __read_mostly default_xen_mcu_opt_ctrl; +static bool __initdata opt_unpriv_mmio; +static bool __read_mostly opt_fb_clear_mmio; static int __init parse_spec_ctrl(const char *s) { @@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s) opt_branch_harden = val; else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) + opt_unpriv_mmio = val; else rc = -EINVAL; @@ -367,7 +371,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", opt_ibpb ? " IBPB" : "", opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "", + opt_md_clear_pv || opt_md_clear_hvm || + opt_fb_clear_mmio ? " VERW" : "", opt_branch_harden ? " BRANCH_HARDEN" : ""); /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ @@ -887,7 +892,9 @@ void spec_ctrl_init_domain(struct domain *d) { bool pv = is_pv_domain(d); - d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm; + d->arch.verw = + (pv ? opt_md_clear_pv : opt_md_clear_hvm) || + (opt_fb_clear_mmio && is_iommu_enabled(d)); } void __init init_speculation_mitigations(void) @@ -1078,6 +1085,18 @@ void __init init_speculation_mitigations(void) mds_calculations(caps); /* + * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have + * reintroduced the VERW fill buffer flushing side effect because of a + * susceptibility to FBSDP. + * + * If unprivileged guests have (or will have) MMIO mappings, we can + * mitigate cross-domain leakage of fill buffer data by issuing VERW on + * the return-to-guest path. + */ + if ( opt_unpriv_mmio ) + opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR; + + /* * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. * This will only be a token effort for MLPDS/MFBDS when HT is enabled, * but it is somewhat better than nothing. @@ -1090,18 +1109,20 @@ void __init init_speculation_mitigations(void) boot_cpu_has(X86_FEATURE_MD_CLEAR)); /* - * Enable MDS defences as applicable. The Idle blocks need using if - * either PV or HVM defences are used. + * Enable MDS/MMIO defences as applicable. The Idle blocks need using if + * either the PV or HVM MDS defences are used, or if we may give MMIO + * access to untrusted guests. * * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with * equivalent semantics to avoid needing to perform both flushes on the - * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH. + * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for + * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) * * After calculating the appropriate idle setting, simplify * opt_md_clear_hvm to mean just "should we VERW on the way into HVM * guests", so spec_ctrl_init_domain() can calculate suitable settings. */ - if ( opt_md_clear_pv || opt_md_clear_hvm ) + if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; @@ -1170,12 +1191,18 @@ void __init init_speculation_mitigations(void) * On some SRBDS-affected hardware, it may be safe to relax srb-lock * by default. * - * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only way - * to access the Fill Buffer. If TSX isn't available (inc. SKU - * reasons on some models), or TSX is explicitly disabled, then there - * is no need for the extra overhead to protect RDRAND/RDSEED. + * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale + * RNG data becomes available to other contexts. To recover the data, + * an attacker needs to use: + * - SBDS (MDS or TAA to sample the cores fill buffer) + * - SBDR (Architecturally retrieve stale transaction buffer contents) + * - DRPW (Architecturally latch stale fill buffer data) + * + * On MDS_NO parts, and with TAA_NO or TSX unavailable/disabled, and + * there is no unprivileged MMIO access, the RNG data doesn't need + * protecting. */ - if ( opt_srb_lock == -1 && + if ( opt_srb_lock == -1 && !opt_unpriv_mmio && (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO && (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && opt_tsx == 0)) ) opt_srb_lock = 0;