diff options
author | bouyer <bouyer@pkgsrc.org> | 2019-11-13 13:36:11 +0000 |
---|---|---|
committer | bouyer <bouyer@pkgsrc.org> | 2019-11-13 13:36:11 +0000 |
commit | 828418984be2315141e8d854e413e9d0b133e14a (patch) | |
tree | 892445ac19c845338578c93e9d893e3c32226178 | |
parent | 038530648c7224c3690c222cb48aea7bed65807a (diff) | |
download | pkgsrc-828418984be2315141e8d854e413e9d0b133e14a.tar.gz |
Add patches for the relevant Xen security advisories up to XSA305 (everything
up to XSA297 is already fixed upstream).
Bump PKGREVISION
-rw-r--r-- | sysutils/xenkernel411/Makefile | 4 | ||||
-rw-r--r-- | sysutils/xenkernel411/distinfo | 6 | ||||
-rw-r--r-- | sysutils/xenkernel411/patches/patch-XSA298 | 89 | ||||
-rw-r--r-- | sysutils/xenkernel411/patches/patch-XSA302 | 537 | ||||
-rw-r--r-- | sysutils/xenkernel411/patches/patch-XSA304 | 481 | ||||
-rw-r--r-- | sysutils/xenkernel411/patches/patch-XSA305 | 482 |
6 files changed, 1596 insertions, 3 deletions
diff --git a/sysutils/xenkernel411/Makefile b/sysutils/xenkernel411/Makefile index 5753bffb774..4e890e3cbc7 100644 --- a/sysutils/xenkernel411/Makefile +++ b/sysutils/xenkernel411/Makefile @@ -1,7 +1,7 @@ -# $NetBSD: Makefile,v 1.8 2019/08/30 13:16:27 bouyer Exp $ +# $NetBSD: Makefile,v 1.9 2019/11/13 13:36:11 bouyer Exp $ VERSION= 4.11.2 -#PKGREVISION= 0 +PKGREVISION= 1 DISTNAME= xen-${VERSION} PKGNAME= xenkernel411-${VERSION} CATEGORIES= sysutils diff --git a/sysutils/xenkernel411/distinfo b/sysutils/xenkernel411/distinfo index ccf14678aaa..0354944c4b3 100644 --- a/sysutils/xenkernel411/distinfo +++ b/sysutils/xenkernel411/distinfo @@ -1,10 +1,14 @@ -$NetBSD: distinfo,v 1.5 2019/08/30 13:16:27 bouyer Exp $ +$NetBSD: distinfo,v 1.6 2019/11/13 13:36:11 bouyer Exp $ SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65 +SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6 +SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b +SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399 +SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b diff --git a/sysutils/xenkernel411/patches/patch-XSA298 b/sysutils/xenkernel411/patches/patch-XSA298 new file mode 100644 index 00000000000..10ff22f8e67 --- /dev/null +++ b/sysutils/xenkernel411/patches/patch-XSA298 @@ -0,0 +1,89 @@ +$NetBSD: patch-XSA298,v 1.1 2019/11/13 13:36:11 bouyer Exp $ + +From: Jan Beulich 
<jbeulich@suse.com> +Subject: x86/PV: check GDT/LDT limits during emulation + +Accesses beyond the LDT limit originating from emulation would trigger +the ASSERT() in pv_map_ldt_shadow_page(). On production builds such +accesses would cause an attempt to promote the touched page (offset from +the present LDT base address) to a segment descriptor one. If this +happens to succeed, guest user mode would be able to elevate its +privileges to that of the guest kernel. This is particularly easy when +there's no LDT at all, in which case the LDT base stored internally to +Xen is simply zero. + +Also adjust the ASSERT() that was triggering: It was off by one to +begin with, and for production builds we also better use +ASSERT_UNREACHABLE() instead with suitable recovery code afterwards. + +This is XSA-298. + +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> + +--- xen/arch/x86/pv/emul-gate-op.c.orig ++++ xen/arch/x86/pv/emul-gate-op.c +@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned + const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel); + + if ( (gate_sel < 4) || +- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || ++ /* ++ * We're interested in call gates only, which occupy a single ++ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit. ++ */ ++ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >= ++ (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents ++ : v->arch.pv_vcpu.gdt_ents)) || + __get_user(desc, pdesc) ) + return 0; + +@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned + if ( !is_pv_32bit_vcpu(v) ) + { + if ( (*ar & 0x1f00) != 0x0c00 || +- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || ++ /* Limit check done above already. 
*/ + __get_user(desc, pdesc + 1) || + (desc.b & 0x1f00) ) + return 0; +--- xen/arch/x86/pv/emulate.c.orig ++++ xen/arch/x86/pv/emulate.c +@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int + { + struct desc_struct desc; + +- if ( sel < 4) ++ if ( sel < 4 || ++ /* ++ * Don't apply the GDT limit here, as the selector may be a Xen ++ * provided one. __get_user() will fail (without taking further ++ * action) for ones falling in the gap between guest populated ++ * and Xen ones. ++ */ ++ ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) ) + desc.b = desc.a = 0; + else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) ) + return 0; +--- xen/arch/x86/pv/mm.c.orig ++++ xen/arch/x86/pv/mm.c +@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int + BUG_ON(unlikely(in_irq())); + + /* +- * Hardware limit checking should guarantee this property. NB. This is ++ * Prior limit checking should guarantee this property. NB. This is + * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the + * current vcpu, and vcpu_reset() will block until this vcpu has been + * descheduled before continuing. 
+ */ +- ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents); ++ if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return false; ++ } + + if ( is_pv_32bit_domain(currd) ) + linear = (uint32_t)linear; diff --git a/sysutils/xenkernel411/patches/patch-XSA302 b/sysutils/xenkernel411/patches/patch-XSA302 new file mode 100644 index 00000000000..e1c08b56a0c --- /dev/null +++ b/sysutils/xenkernel411/patches/patch-XSA302 @@ -0,0 +1,537 @@ +$NetBSD: patch-XSA302,v 1.1 2019/11/13 13:36:11 bouyer Exp $ + +From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Wed, 2 Oct 2019 13:36:59 +0200 +Subject: [PATCH 1/2] IOMMU: add missing HVM check +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fix an unguarded d->arch.hvm access in assign_device(). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> + +(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6) +Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> +--- + xen/drivers/passthrough/pci.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index f51cae7f4e..037aba7c94 100644 +--- xen/drivers/passthrough/pci.c.orig ++++ xen/drivers/passthrough/pci.c +@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + /* Prevent device assign if mem paging or mem sharing have been + * enabled for this domain */ + if ( unlikely(!need_iommu(d) && +- (d->arch.hvm_domain.mem_sharing_enabled || ++ ((is_hvm_domain(d) && ++ d->arch.hvm_domain.mem_sharing_enabled) || + vm_event_check_ring(d->vm_event_paging) || + p2m_get_hostp2m(d)->global_logdirty)) ) + return -EXDEV; +-- +2.11.0 + +From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 
2001 +From: Paul Durrant <paul.durrant@citrix.com> +Date: Mon, 14 Oct 2019 17:52:59 +0100 +Subject: [PATCH 2/2] passthrough: quarantine PCI devices + +When a PCI device is assigned to an untrusted domain, it is possible for +that domain to program the device to DMA to an arbitrary address. The +IOMMU is used to protect the host from malicious DMA by making sure that +the device addresses can only target memory assigned to the guest. However, +when the guest domain is torn down the device is assigned back to dom0, +thus allowing any in-flight DMA to potentially target critical host data. + +This patch introduces a 'quarantine' for PCI devices using dom_io. When +the toolstack makes a device assignable (by binding it to pciback), it +will now also assign it to DOMID_IO and the device will only be assigned +back to dom0 when the device is made unassignable again. Whilst device is +assignable it will only ever transfer between dom_io and guest domains. +dom_io is actually only used as a sentinel domain for quarantining purposes; +it is not configured with any IOMMU mappings. Assignment to dom_io simply +means that the device's initiator (requestor) identifier is not present in +the IOMMU's device table and thus any DMA transactions issued will be +terminated with a fault condition. + +In addition, a fix to assignment handling is made for VT-d. Failure +during the assignment step should not lead to a device still being +associated with its prior owner. Hand the device to DomIO temporarily, +until the assignment step has completed successfully. Remove the PI +hooks from the source domain then earlier as well. + +Failure of the recovery reassign_device_ownership() may not go silent: +There e.g. may still be left over RMRR mappings in the domain assignment +to which has failed, and hence we can't allow that domain to continue +executing. 
+ +NOTE: This patch also includes one printk() cleanup; the + "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(), + since similar printk()-s elsewhere also don't log such a tag. + +This is XSA-302. + +Signed-off-by: Paul Durrant <paul.durrant@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> +--- + tools/libxl/libxl_pci.c | 25 +++++++++++- + xen/arch/x86/mm.c | 2 + + xen/common/domctl.c | 14 ++++++- + xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++- + xen/drivers/passthrough/iommu.c | 9 +++++ + xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++------- + xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++--- + xen/include/xen/pci.h | 3 ++ + 8 files changed, 138 insertions(+), 24 deletions(-) + +diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c +index 4755a0c93c..81890a91ac 100644 +--- tools/libxl/libxl_pci.c.orig ++++ tools/libxl/libxl_pci.c +@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + libxl_device_pci *pcidev, + int rebind) + { ++ libxl_ctx *ctx = libxl__gc_owner(gc); + unsigned dom, bus, dev, func; + char *spath, *driver_path = NULL; + int rc; +@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + } + if ( rc ) { + LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func); +- return 0; ++ goto quarantine; + } + + /* Check to see if there's already a driver that we need to unbind from */ +@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc, + return ERROR_FAIL; + } + ++quarantine: ++ /* ++ * DOMID_IO is just a sentinel domain, without any actual mappings, ++ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being ++ * unnecessarily denied. 
++ */ ++ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev), ++ XEN_DOMCTL_DEV_RDM_RELAXED); ++ if ( rc < 0 ) { ++ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func); ++ return ERROR_FAIL; ++ } ++ + return 0; + } + +@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc, + libxl_device_pci *pcidev, + int rebind) + { ++ libxl_ctx *ctx = libxl__gc_owner(gc); + int rc; + char *driver_path; + ++ /* De-quarantine */ ++ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev)); ++ if ( rc < 0 ) { ++ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus, ++ pcidev->dev, pcidev->func); ++ return ERROR_FAIL; ++ } ++ + /* Unbind from pciback */ + if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) { + return ERROR_FAIL; +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index e6a4cb28f8..c1ab57f9a5 100644 +--- xen/arch/x86/mm.c.orig ++++ xen/arch/x86/mm.c +@@ -295,9 +295,11 @@ void __init arch_init_memory(void) + * Initialise our DOMID_IO domain. + * This domain owns I/O pages that are within the range of the page_info + * array. Mappings occur at the priv of the caller. ++ * Quarantined PCI devices will be associated with this domain. + */ + dom_io = domain_create(DOMID_IO, NULL); + BUG_ON(IS_ERR(dom_io)); ++ INIT_LIST_HEAD(&dom_io->arch.pdev_list); + + /* + * Initialise our COW domain. 
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c +index 9b7bc083ee..741d774cd1 100644 +--- xen/common/domctl.c.orig ++++ xen/common/domctl.c +@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + + switch ( op->cmd ) + { ++ case XEN_DOMCTL_assign_device: ++ case XEN_DOMCTL_deassign_device: ++ if ( op->domain == DOMID_IO ) ++ { ++ d = dom_io; ++ break; ++ } ++ else if ( op->domain == DOMID_INVALID ) ++ return -ESRCH; ++ /* fall through */ + case XEN_DOMCTL_test_assign_device: + if ( op->domain == DOMID_INVALID ) + { +@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + + if ( !domctl_lock_acquire() ) + { +- if ( d ) ++ if ( d && d != dom_io ) + rcu_unlock_domain(d); + return hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); +@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + domctl_lock_release(); + + domctl_out_unlock_domonly: +- if ( d ) ++ if ( d && d != dom_io ) + rcu_unlock_domain(d); + + if ( copyback && __copy_to_guest(u_domctl, op, 1) ) +diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c +index 12d2695b89..ec8baae717 100644 +--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig ++++ xen/drivers/passthrough/amd/pci_amd_iommu.c +@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device( + u8 bus = pdev->bus; + const struct domain_iommu *hd = dom_iommu(domain); + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return; ++ + BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || + !iommu->dev_table.buffer ); + +@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain, + int req_id; + u8 bus = pdev->bus; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return; ++ + BUG_ON ( iommu->dev_table.buffer == NULL ); + req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); + 
dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); +@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn, + ivrs_mappings[req_id].read_permission); + } + +- return reassign_device(hardware_domain, d, devfn, pdev); ++ return reassign_device(pdev->domain, d, devfn, pdev); + } + + static void deallocate_next_page_table(struct page_info *pg, int level) +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index 04b0be37d3..8027d96f1c 100644 +--- xen/drivers/passthrough/iommu.c.orig ++++ xen/drivers/passthrough/iommu.c +@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d) + { + const struct domain_iommu *hd = dom_iommu(d); + ++ if ( d == dom_io ) ++ return; ++ + d->need_iommu = 0; + hd->platform_ops->teardown(d); + tasklet_schedule(&iommu_pt_cleanup_tasklet); +@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d) + if ( need_iommu(d) > 0 ) + return 0; + ++ if ( d == dom_io ) ++ return 0; ++ + if ( !iommu_use_hap_pt(d) ) + { + int rc; +@@ -404,6 +410,9 @@ int __init iommu_setup(void) + printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); + if ( iommu_enabled ) + { ++ if ( iommu_domain_init(dom_io) ) ++ panic("Could not set up quarantine\n"); ++ + printk(" - Dom0 mode: %s\n", + iommu_passthrough ? "Passthrough" : + iommu_dom0_strict ? "Strict" : "Relaxed"); +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 037aba7c94..fb010a547b 100644 +--- xen/drivers/passthrough/pci.c.orig ++++ xen/drivers/passthrough/pci.c +@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev) + return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev)); + } + +-/* +- * If the device isn't owned by the hardware domain, it means it already +- * has been assigned to other domain, or it doesn't exist. 
+- */ + static int device_assigned(u16 seg, u8 bus, u8 devfn) + { + struct pci_dev *pdev; ++ int rc = 0; + + pcidevs_lock(); +- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); ++ ++ pdev = pci_get_pdev(seg, bus, devfn); ++ ++ if ( !pdev ) ++ rc = -ENODEV; ++ /* ++ * If the device exists and it is not owned by either the hardware ++ * domain or dom_io then it must be assigned to a guest, or be ++ * hidden (owned by dom_xen). ++ */ ++ else if ( pdev->domain != hardware_domain && ++ pdev->domain != dom_io ) ++ rc = -EBUSY; ++ + pcidevs_unlock(); + +- return pdev ? 0 : -EBUSY; ++ return rc; + } + + static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) +@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + /* Prevent device assign if mem paging or mem sharing have been + * enabled for this domain */ +- if ( unlikely(!need_iommu(d) && ++ if ( d != dom_io && ++ unlikely(!need_iommu(d) && + ((is_hvm_domain(d) && + d->arch.hvm_domain.mem_sharing_enabled) || + vm_event_check_ring(d->vm_event_paging) || +@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + return rc; + } + +- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn); ++ pdev = pci_get_pdev(seg, bus, devfn); ++ ++ rc = -ENODEV; + if ( !pdev ) +- { +- rc = pci_get_pdev(seg, bus, devfn) ? 
-EBUSY : -ENODEV; + goto done; +- } ++ ++ rc = 0; ++ if ( d == pdev->domain ) ++ goto done; ++ ++ rc = -EBUSY; ++ if ( pdev->domain != hardware_domain && ++ pdev->domain != dom_io ) ++ goto done; + + if ( pdev->msix ) + msixtbl_init(d); +@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + } + + done: ++ /* The device is assigned to dom_io so mark it as quarantined */ ++ if ( !rc && d == dom_io ) ++ pdev->quarantine = true; ++ + if ( !has_arch_pdevs(d) && need_iommu(d) ) + iommu_teardown(d); + pcidevs_unlock(); +@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + { + const struct domain_iommu *hd = dom_iommu(d); + struct pci_dev *pdev = NULL; ++ struct domain *target; + int ret = 0; + + if ( !iommu_enabled || !hd->platform_ops ) +@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + if ( !pdev ) + return -ENODEV; + ++ /* De-assignment from dom_io should de-quarantine the device */ ++ target = (pdev->quarantine && pdev->domain != dom_io) ? 
++ dom_io : hardware_domain; ++ + while ( pdev->phantom_stride ) + { + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) + break; +- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, ++ ret = hd->platform_ops->reassign_device(d, target, devfn, + pci_to_dev(pdev)); + if ( !ret ) + continue; +@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + } + + devfn = pdev->devfn; +- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn, ++ ret = hd->platform_ops->reassign_device(d, target, devfn, + pci_to_dev(pdev)); + if ( ret ) + { +@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) + return ret; + } + ++ if ( pdev->domain == hardware_domain ) ++ pdev->quarantine = false; ++ + pdev->fault.count = 0; + + if ( !has_arch_pdevs(d) && need_iommu(d) ) +@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl( + ret = hypercall_create_continuation(__HYPERVISOR_domctl, + "h", u_domctl); + else if ( ret ) +- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " ++ printk(XENLOG_G_ERR + "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + d->domain_id, ret); +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 4c719d4ee7..19f7d13013 100644 +--- xen/drivers/passthrough/vtd/iommu.c.orig ++++ xen/drivers/passthrough/vtd/iommu.c +@@ -1338,6 +1338,10 @@ int domain_context_mapping_one( + int agaw, rc, ret; + bool_t flush_dev_iotlb; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return 0; ++ + ASSERT(pcidevs_locked()); + spin_lock(&iommu->lock); + maddr = bus_to_context_maddr(iommu, bus); +@@ -1573,6 +1577,10 @@ int domain_context_unmap_one( + int iommu_domid, rc, ret; + bool_t flush_dev_iotlb; + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ return 0; ++ + ASSERT(pcidevs_locked()); + 
spin_lock(&iommu->lock); + +@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn, + goto out; + } + ++ /* dom_io is used as a sentinel for quarantined devices */ ++ if ( domain == dom_io ) ++ goto out; ++ + /* + * if no other devices under the same iommu owned by this domain, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp +@@ -2389,6 +2401,15 @@ static int reassign_device_ownership( + if ( ret ) + return ret; + ++ if ( devfn == pdev->devfn ) ++ { ++ list_move(&pdev->domain_list, &dom_io->arch.pdev_list); ++ pdev->domain = dom_io; ++ } ++ ++ if ( !has_arch_pdevs(source) ) ++ vmx_pi_hooks_deassign(source); ++ + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_assign(target); + +@@ -2407,15 +2428,13 @@ static int reassign_device_ownership( + pdev->domain = target; + } + +- if ( !has_arch_pdevs(source) ) +- vmx_pi_hooks_deassign(source); +- + return ret; + } + + static int intel_iommu_assign_device( + struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) + { ++ struct domain *s = pdev->domain; + struct acpi_rmrr_unit *rmrr; + int ret = 0, i; + u16 bdf, seg; +@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device( + } + } + +- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev); +- if ( ret ) ++ ret = reassign_device_ownership(s, d, devfn, pdev); ++ if ( ret || d == dom_io ) + return ret; + + /* Setup rmrr identity mapping */ +@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device( + ret = rmrr_identity_mapping(d, 1, rmrr, flag); + if ( ret ) + { +- reassign_device_ownership(d, hardware_domain, devfn, pdev); ++ int rc; ++ ++ rc = reassign_device_ownership(d, s, devfn, pdev); + printk(XENLOG_G_ERR VTDPREFIX + " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", + rmrr->base_address, rmrr->end_address, + d->domain_id, ret); ++ if ( rc ) ++ { ++ printk(XENLOG_ERR VTDPREFIX ++ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", ++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), 
d, rc); ++ domain_crash(d); ++ } + break; + } + } +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index 4cfa774615..066364bdef 100644 +--- xen/include/xen/pci.h.orig ++++ xen/include/xen/pci.h +@@ -88,6 +88,9 @@ struct pci_dev { + + nodeid_t node; /* NUMA node */ + ++ /* Device to be quarantined, don't automatically re-assign to dom0 */ ++ bool quarantine; ++ + enum pdev_type { + DEV_TYPE_PCI_UNKNOWN, + DEV_TYPE_PCIe_ENDPOINT, +-- +2.11.0 + diff --git a/sysutils/xenkernel411/patches/patch-XSA304 b/sysutils/xenkernel411/patches/patch-XSA304 new file mode 100644 index 00000000000..b905d46b748 --- /dev/null +++ b/sysutils/xenkernel411/patches/patch-XSA304 @@ -0,0 +1,481 @@ +$NetBSD: patch-XSA304,v 1.1 2019/11/13 13:36:11 bouyer Exp $ + +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs + +Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and +an EPT superpage gets shattered. The root cause is still under investigation, +but the end result is unusable in combination with CVE-2018-12207 protections. 
+ +This is part of XSA-304 / CVE-2018-12207 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index fb7edfaef9..d698b1d50a 100644 +--- xen/drivers/passthrough/vtd/extern.h.orig ++++ xen/drivers/passthrough/vtd/extern.h +@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu); + int __must_check me_wifi_quirk(struct domain *domain, + u8 bus, u8 devfn, int map); + void pci_vtd_quirk(const struct pci_dev *); ++void quirk_iommu_caps(struct iommu *iommu); ++ + bool_t platform_supports_intremap(void); + bool_t platform_supports_x2apic(void); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index f242e30caf..8712d3b4dc 100644 +--- xen/drivers/passthrough/vtd/iommu.c.orig ++++ xen/drivers/passthrough/vtd/iommu.c +@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) + if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) + return -ENODEV; + ++ quirk_iommu_caps(iommu); ++ + if ( cap_fault_reg_offset(iommu->cap) + + cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || + ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index d6db862678..b02688e316 100644 +--- xen/drivers/passthrough/vtd/quirks.c.orig ++++ xen/drivers/passthrough/vtd/quirks.c +@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev) + break; + } + } ++ ++void __init quirk_iommu_caps(struct iommu *iommu) ++{ ++ /* ++ * IOMMU Quirks: ++ * ++ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't ++ * implement superpages internally. ++ * ++ * There are issues changing the walk length under in-flight DMA, which ++ * has manifested as incompatibility between EPT/IOMMU sharing and the ++ * workaround for CVE-2018-12207 / XSA-304. 
Hide the superpages ++ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT ++ * and IOMMU pagetables. ++ * ++ * Detection of SandyBridge unfortunately has to be done by processor ++ * model because the client parts don't expose their IOMMUs as PCI devices ++ * we could match with a Device ID. ++ */ ++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ boot_cpu_data.x86 == 6 && ++ (boot_cpu_data.x86_model == 0x2a || ++ boot_cpu_data.x86_model == 0x2d) ) ++ iommu->cap &= ~(0xful << 34); ++} +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtx: Disable executable EPT superpages to work around + CVE-2018-12207 + +CVE-2018-12207 covers a set of errata on various Intel processors, whereby a +machine check exception can be generated in a corner case when an executable +mapping changes size or cacheability without TLB invalidation. HVM guest +kernels can trigger this to DoS the host. + +To mitigate, in affected hardware, all EPT superpages are marked NX. When an +instruction fetch violation is observed against the superpage, the superpage +is shattered to 4k and has execute permissions restored. This prevents the +guest kernel from being able to create the necessary preconditions in the iTLB +to exploit the vulnerability. + +This does come with a workload-dependent performance overhead, caused by +increased TLB pressure. Performance can be restored, if guest kernels are +trusted not to mount an attack, by specifying ept=exec-sp on the command line. + +This is part of XSA-304 / CVE-2018-12207 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: George Dunlap <george.dunlap@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index c63a07d29b..684671cb7b 100644 +--- docs/misc/xen-command-line.markdown.orig ++++ docs/misc/xen-command-line.markdown +@@ -828,7 +828,7 @@ effect the inverse meaning. + >> set as UC. 
+ + ### ept (Intel) +-> `= List of ( {no-}pml | {no-}ad )` ++> `= List of [ {no-}pml, {no-}ad, {no-}exec-sp ]` + + Controls EPT related features. + +@@ -851,6 +851,16 @@ Controls EPT related features. + + >> Have hardware keep accessed/dirty (A/D) bits updated. + ++* The `exec-sp` boolean controls whether EPT superpages with execute ++ permissions are permitted. In general this is good for performance. ++ ++ However, on processors vulnerable CVE-2018-12207, HVM guest kernels can ++ use executable superpages to crash the host. By default, executable ++ superpages are disabled on affected hardware. ++ ++ If HVM guest kernels are trusted not to mount a DoS against the system, ++ this option can enabled to regain performance. ++ + ### extra\_guest\_irqs + > `= [<domU number>][,<dom0 number>]` + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index f4a6a37149..1924434960 100644 +--- xen/arch/x86/hvm/hvm.c.orig ++++ xen/arch/x86/hvm/hvm.c +@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, + struct p2m_domain *p2m, *hostp2m; + int rc, fall_through = 0, paged = 0; + int sharing_enomem = 0; ++ unsigned int page_order = 0; + vm_event_request_t *req_ptr = NULL; + bool_t ap2m_active, sync = 0; + +@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, + hostp2m = p2m_get_hostp2m(currd); + mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma, + P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0), +- NULL); ++ &page_order); + + if ( ap2m_active ) + { +@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, + goto out; + } + +- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL); ++ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order); + } + else + p2m = hostp2m; +@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, + break; + } + ++ /* ++ * Workaround for XSA-304 / CVE-2018-12207. 
If we take an execution ++ * fault against a non-executable superpage, shatter it to regain ++ * execute permissions. ++ */ ++ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation ) ++ { ++ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K, ++ p2mt, p2ma); ++ ++ if ( res ) ++ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n", ++ gfn, res); ++ ++ rc = !res; ++ goto out_put_gfn; ++ } ++ + if ( violation ) + { + /* Should #VE be emulated for this fault? */ +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 493986e84a..8821a3b536 100644 +--- xen/arch/x86/hvm/vmx/vmcs.c.orig ++++ xen/arch/x86/hvm/vmx/vmcs.c +@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window); + + static bool_t __read_mostly opt_pml_enabled = 1; + static s8 __read_mostly opt_ept_ad = -1; ++int8_t __read_mostly opt_ept_exec_sp = -1; + + /* + * The 'ept' parameter controls functionalities that depend on, or impact the +@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s) + opt_pml_enabled = val; + else if ( !cmdline_strcmp(s, "ad") ) + opt_ept_ad = val; ++ else if ( !cmdline_strcmp(s, "exec-sp") ) ++ opt_ept_exec_sp = val; + else + rc = -EINVAL; + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 840dc2b44d..a568d62643 100644 +--- xen/arch/x86/hvm/vmx/vmx.c.orig ++++ xen/arch/x86/hvm/vmx/vmx.c +@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs) + static void __init lbr_tsx_fixup_check(void); + static void __init bdw_erratum_bdf14_fixup_check(void); + ++/* ++ * Calculate whether the CPU is vulnerable to Instruction Fetch page ++ * size-change MCEs. ++ */ ++static bool __init has_if_pschange_mc(void) ++{ ++ uint64_t caps = 0; ++ ++ /* ++ * If we are virtualised, there is nothing we can do. Our EPT tables are ++ * shadowed by our hypervisor, and not walked by hardware. 
++ */ ++ if ( cpu_has_hypervisor ) ++ return false; ++ ++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); ++ ++ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO ) ++ return false; ++ ++ /* ++ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at ++ * this time. ++ */ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return false; ++ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* ++ * Core processors since at least Nehalem are vulnerable. ++ */ ++ case 0x1f: /* Auburndale / Havendale */ ++ case 0x1e: /* Nehalem */ ++ case 0x1a: /* Nehalem EP */ ++ case 0x2e: /* Nehalem EX */ ++ case 0x25: /* Westmere */ ++ case 0x2c: /* Westmere EP */ ++ case 0x2f: /* Westmere EX */ ++ case 0x2a: /* SandyBridge */ ++ case 0x2d: /* SandyBridge EP/EX */ ++ case 0x3a: /* IvyBridge */ ++ case 0x3e: /* IvyBridge EP/EX */ ++ case 0x3c: /* Haswell */ ++ case 0x3f: /* Haswell EX/EP */ ++ case 0x45: /* Haswell D */ ++ case 0x46: /* Haswell H */ ++ case 0x3d: /* Broadwell */ ++ case 0x47: /* Broadwell H */ ++ case 0x4f: /* Broadwell EP/EX */ ++ case 0x56: /* Broadwell D */ ++ case 0x4e: /* Skylake M */ ++ case 0x5e: /* Skylake D */ ++ case 0x55: /* Skylake-X / Cascade Lake */ ++ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */ ++ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */ ++ return true; ++ ++ /* ++ * Atom processors are not vulnerable. 
++ */ ++ case 0x1c: /* Pineview */ ++ case 0x26: /* Lincroft */ ++ case 0x27: /* Penwell */ ++ case 0x35: /* Cloverview */ ++ case 0x36: /* Cedarview */ ++ case 0x37: /* Baytrail / Valleyview (Silvermont) */ ++ case 0x4d: /* Avaton / Rangely (Silvermont) */ ++ case 0x4c: /* Cherrytrail / Brasswell */ ++ case 0x4a: /* Merrifield */ ++ case 0x5a: /* Moorefield */ ++ case 0x5c: /* Goldmont */ ++ case 0x5d: /* SoFIA 3G Granite/ES2.1 */ ++ case 0x65: /* SoFIA LTE AOSP */ ++ case 0x5f: /* Denverton */ ++ case 0x6e: /* Cougar Mountain */ ++ case 0x75: /* Lightning Mountain */ ++ case 0x7a: /* Gemini Lake */ ++ case 0x86: /* Jacobsville */ ++ ++ /* ++ * Knights processors are not vulnerable. ++ */ ++ case 0x57: /* Knights Landing */ ++ case 0x85: /* Knights Mill */ ++ return false; ++ ++ default: ++ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n", ++ boot_cpu_data.x86_model); ++ return true; ++ } ++} ++ + const struct hvm_function_table * __init start_vmx(void) + { + set_in_cr4(X86_CR4_VMXE); +@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void) + */ + if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) ) + { ++ bool cpu_has_bug_pschange_mc = has_if_pschange_mc(); ++ ++ if ( opt_ept_exec_sp == -1 ) ++ { ++ /* Default to non-executable superpages on vulnerable hardware. 
*/ ++ opt_ept_exec_sp = !cpu_has_bug_pschange_mc; ++ ++ if ( cpu_has_bug_pschange_mc ) ++ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n"); ++ } ++ + vmx_function_table.hap_supported = 1; + vmx_function_table.altp2m_supported = 1; + +diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c +index ce46201d45..93e08f89a2 100644 +--- xen/arch/x86/mm/p2m-ept.c.orig ++++ xen/arch/x86/mm/p2m-ept.c +@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry, + break; + } + ++ /* ++ * Don't create executable superpages if we need to shatter them to ++ * protect against CVE-2018-12207. ++ */ ++ if ( !opt_ept_exec_sp && is_epte_superpage(entry) ) ++ entry->x = 0; + } + + #define GUEST_TABLE_MAP_FAILED 0 +diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h +index 89619e4afd..20eb7f6082 100644 +--- xen/include/asm-x86/hvm/vmx/vmx.h.orig ++++ xen/include/asm-x86/hvm/vmx/vmx.h +@@ -28,6 +28,8 @@ + #include <asm/hvm/trace.h> + #include <asm/hvm/vmx/vmcs.h> + ++extern int8_t opt_ept_exec_sp; ++ + typedef union { + struct { + u64 r : 1, /* bit 0 - Read permission */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index b8151d2d9f..89ae3e03f1 100644 +--- xen/include/asm-x86/msr-index.h.orig ++++ xen/include/asm-x86/msr-index.h +@@ -54,6 +54,7 @@ + #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3) + #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) ++#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/vtx: Allow runtime modification of the exec-sp setting + +See patch for details. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 684671cb7b..33ed1ffc40 100644 +--- docs/misc/xen-command-line.markdown.orig ++++ docs/misc/xen-command-line.markdown +@@ -861,6 +861,21 @@ Controls EPT related features. + If HVM guest kernels are trusted not to mount a DoS against the system, + this option can enabled to regain performance. + ++ This boolean may be modified at runtime using `xl set-parameters ++ ept=[no-]exec-sp` to switch between fast and secure. ++ ++ * When switching from secure to fast, preexisting HVM domains will run ++ at their current performance until they are rebooted; new domains will ++ run without any overhead. ++ ++ * When switching from fast to secure, all HVM domains will immediately ++ suffer a performance penalty. ++ ++ **Warning: No guarantee is made that this runtime option will be retained ++ indefinitely, or that it will retain this exact behaviour. 
It is ++ intended as an emergency option for people who first chose fast, then ++ change their minds to secure, and wish not to reboot.** ++ + ### extra\_guest\_irqs + > `= [<domU number>][,<dom0 number>]` + +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 8821a3b536..15376e25ba 100644 +--- xen/arch/x86/hvm/vmx/vmcs.c.orig ++++ xen/arch/x86/hvm/vmx/vmcs.c +@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s) + } + custom_param("ept", parse_ept_param); + ++static int parse_ept_param_runtime(const char *s) ++{ ++ int val; ++ ++ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported || ++ !(hvm_funcs.hap_capabilities & ++ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) ) ++ { ++ printk("VMX: EPT not available, or not in use - ignoring\n"); ++ return 0; ++ } ++ ++ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 ) ++ return -EINVAL; ++ ++ if ( val != opt_ept_exec_sp ) ++ { ++ struct domain *d; ++ ++ opt_ept_exec_sp = val; ++ ++ rcu_read_lock(&domlist_read_lock); ++ for_each_domain ( d ) ++ if ( paging_mode_hap(d) ) ++ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw); ++ rcu_read_unlock(&domlist_read_lock); ++ } ++ ++ printk("VMX: EPT executable superpages %sabled\n", ++ val ? "en" : "dis"); ++ ++ return 0; ++} ++custom_runtime_only_param("ept", parse_ept_param_runtime); ++ + /* Dynamic (run-time adjusted) execution control flags. */ + u32 vmx_pin_based_exec_control __read_mostly; + u32 vmx_cpu_based_exec_control __read_mostly; +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 2b62bc61dd..97c417fc3e 100644 +--- xen/arch/x86/mm/p2m.c.orig ++++ xen/arch/x86/mm/p2m.c +@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start, + return 0; + } + ++/* ++ * May be called with ot = nt = p2m_ram_rw for its side effect of ++ * recalculating all PTEs in the p2m. 
++ */ + void p2m_change_entry_type_global(struct domain *d, + p2m_type_t ot, p2m_type_t nt) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); + +- ASSERT(ot != nt); + ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt)); + + p2m_lock(p2m); + p2m->change_entry_type_global(p2m, ot, nt); +- p2m->global_logdirty = (nt == p2m_ram_logdirty); ++ /* Don't allow 'recalculate' operations to change the logdirty state. */ ++ if ( ot != nt ) ++ p2m->global_logdirty = (nt == p2m_ram_logdirty); + p2m_unlock(p2m); + } + diff --git a/sysutils/xenkernel411/patches/patch-XSA305 b/sysutils/xenkernel411/patches/patch-XSA305 new file mode 100644 index 00000000000..2ed167cbdf0 --- /dev/null +++ b/sysutils/xenkernel411/patches/patch-XSA305 @@ -0,0 +1,482 @@ +$NetBSD: patch-XSA305,v 1.1 2019/11/13 13:36:11 bouyer Exp $ + +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available + +To protect against the TSX Async Abort speculative vulnerability, Intel have +released new microcode for affected parts which introduce the MSR_TSX_CTRL +control, which allows TSX to be turned off. This will be architectural on +future parts. + +Introduce tsx= to provide a global on/off for TSX, including its enumeration +via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to +guests at the moment. + +VMs may have booted before microcode is loaded, or before hosts have rebooted, +and they still want to migrate freely. A VM which booted seeing TSX can +migrate safely to hosts with TSX disabled - TSX will start unconditionally +aborting, but still behave in a manner compatible with the ABI. + +The guest-visible behaviour is equivalent to late loading the microcode and +setting the RTM_DISABLE bit in the course of live patching. 
+ +This is part of XSA-305 / CVE-2019-11135 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index 684671cb7b..b86d26399a 100644 +--- docs/misc/xen-command-line.markdown.orig ++++ docs/misc/xen-command-line.markdown +@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter. + ### tsc (x86) + > `= unstable | skewed | stable:socket` + ++### tsx ++ = <bool> ++ ++ Applicability: x86 ++ Default: true ++ ++Controls for the use of Transactional Synchronization eXtensions. ++ ++On Intel parts released in Q3 2019 (with updated microcode), and future parts, ++a control has been introduced which allows TSX to be turned off. ++ ++On systems with the ability to turn TSX off, this boolean offers system wide ++control of whether TSX is enabled or disabled. ++ + ### ucode (x86) + > `= [<integer> | scan]` + +diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile +index da1e4827f4..4c82d9f710 100644 +--- xen/arch/x86/Makefile.orig ++++ xen/arch/x86/Makefile +@@ -65,6 +65,7 @@ obj-y += sysctl.o + obj-y += time.o + obj-y += trace.o + obj-y += traps.o ++obj-y += tsx.o + obj-y += usercopy.o + obj-y += x86_emulate.o + obj-$(CONFIG_TBOOT) += tboot.o +diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c +index 5e11970701..04aefa555d 100644 +--- xen/arch/x86/cpuid.c.orig ++++ xen/arch/x86/cpuid.c +@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d) + if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) ) + __set_bit(X86_FEATURE_ITSC, max_fs); + ++ /* ++ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable ++ * TSX and hide the feature bits. Migrating-in VMs may have been booted ++ * pre-mitigation when the TSX features were visible. ++ * ++ * This situation is compatible (albeit with a perf hit to any TSX code in ++ * the guest), so allow the feature bits to remain set.
++ */ ++ if ( cpu_has_tsx_ctrl ) ++ { ++ __set_bit(X86_FEATURE_HLE, max_fs); ++ __set_bit(X86_FEATURE_RTM, max_fs); ++ } ++ + /* Clamp the toolstacks choices to reality. */ + for ( i = 0; i < ARRAY_SIZE(fs); i++ ) + fs[i] &= max_fs[i]; +diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c +index ebc0665615..35d99a98a1 100644 +--- xen/arch/x86/msr.c.orig ++++ xen/arch/x86/msr.c +@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) + case MSR_FLUSH_CMD: + /* Write-only */ + case MSR_TSX_FORCE_ABORT: ++ case MSR_TSX_CTRL: + /* Not offered to guests. */ + goto gp_fault; + +@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) + case MSR_ARCH_CAPABILITIES: + /* Read-only */ + case MSR_TSX_FORCE_ABORT: ++ case MSR_TSX_CTRL: + /* Not offered to guests. */ + goto gp_fault; + +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 657160549f..dc13ad6c36 100644 +--- xen/arch/x86/setup.c.orig ++++ xen/arch/x86/setup.c +@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + early_microcode_init(); + ++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */ ++ + identify_cpu(&boot_cpu_data); + + set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT); +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index fd52a10cf9..bdc118d88b 100644 +--- xen/arch/x86/smpboot.c.orig ++++ xen/arch/x86/smpboot.c +@@ -376,6 +376,8 @@ void start_secondary(void *unused) + if ( boot_cpu_has(X86_FEATURE_IBRSB) ) + wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl); + ++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. 
*/ ++ + if ( xen_guest ) + hypervisor_ap_setup(); + +diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c +new file mode 100644 +index 0000000000..a8ec2ccc69 +--- /dev/null ++++ xen/arch/x86/tsx.c +@@ -0,0 +1,74 @@ ++#include <xen/init.h> ++#include <asm/msr.h> ++ ++/* ++ * Valid values: ++ * 1 => Explicit tsx=1 ++ * 0 => Explicit tsx=0 ++ * -1 => Default, implicit tsx=1 ++ * ++ * This is arranged such that the bottom bit encodes whether TSX is actually ++ * disabled, while identifying various explicit (>=0) and implicit (<0) ++ * conditions. ++ */ ++int8_t __read_mostly opt_tsx = -1; ++int8_t __read_mostly cpu_has_tsx_ctrl = -1; ++ ++static int __init parse_tsx(const char *s) ++{ ++ int rc = 0, val = parse_bool(s, NULL); ++ ++ if ( val >= 0 ) ++ opt_tsx = val; ++ else ++ rc = -EINVAL; ++ ++ return rc; ++} ++custom_param("tsx", parse_tsx); ++ ++void tsx_init(void) ++{ ++ /* ++ * This function is first called between microcode being loaded, and CPUID ++ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL ++ * is available. ++ */ ++ if ( unlikely(cpu_has_tsx_ctrl < 0) ) ++ { ++ uint64_t caps = 0; ++ ++ if ( boot_cpu_data.cpuid_level >= 7 && ++ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) ) ++ rdmsrl(MSR_ARCH_CAPABILITIES, caps); ++ ++ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL); ++ } ++ ++ if ( cpu_has_tsx_ctrl ) ++ { ++ uint64_t val; ++ ++ rdmsrl(MSR_TSX_CTRL, val); ++ ++ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR); ++ /* Check bottom bit only. Higher bits are various sentinels.
*/ ++ if ( !(opt_tsx & 1) ) ++ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_TSX_CTRL, val); ++ } ++ else if ( opt_tsx >= 0 ) ++ printk_once(XENLOG_WARNING ++ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n"); ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 89ae3e03f1..5ee7a37c12 100644 +--- xen/include/asm-x86/msr-index.h.orig ++++ xen/include/asm-x86/msr-index.h +@@ -55,6 +55,7 @@ + #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4) + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) + #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) ++#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +@@ -62,6 +63,10 @@ + #define MSR_TSX_FORCE_ABORT 0x0000010f + #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0) + ++#define MSR_TSX_CTRL 0x00000122 ++#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0) ++#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1) ++ + /* Intel MSRs. 
Some also available on other CPUs */ + #define MSR_IA32_PERFCTR0 0x000000c1 + #define MSR_IA32_A_PERFCTR0 0x000004c1 +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 20d1ecb332..66224f23b9 100644 +--- xen/include/asm-x86/processor.h.orig ++++ xen/include/asm-x86/processor.h +@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx( + return ebx; + } + ++static always_inline unsigned int cpuid_count_edx( ++ unsigned int leaf, unsigned int subleaf) ++{ ++ unsigned int edx, tmp; ++ ++ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx); ++ ++ return edx; ++} ++ + static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf, + struct cpuid_leaf *data) + { +@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model, + return fam; + } + ++extern int8_t opt_tsx, cpu_has_tsx_ctrl; ++void tsx_init(void); ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __ASM_X86_PROCESSOR_H */ +diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h +index 750f809968..be223a6950 100644 +--- xen/include/xen/lib.h.orig ++++ xen/include/xen/lib.h +@@ -116,6 +116,16 @@ extern int printk_ratelimit(void); + #define gprintk(lvl, fmt, args...) \ + printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args) + ++#define printk_once(fmt, args...) \ ++({ \ ++ static bool __read_mostly once_; \ ++ if ( unlikely(!once_) ) \ ++ { \ ++ once_ = true; \ ++ printk(fmt, ## args); \ ++ } \ ++}) ++ + #ifdef NDEBUG + + static inline void +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel + +See patch documentation and comments. 
+ +This is part of XSA-305 / CVE-2019-11135 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index b86d26399a..31635a473a 100644 +--- docs/misc/xen-command-line.markdown.orig ++++ docs/misc/xen-command-line.markdown +@@ -1841,7 +1841,7 @@ extreme care.** + An overall boolean value, `spec-ctrl=no`, can be specified to turn off all + mitigations, including pieces of infrastructure used to virtualise certain + mitigation features for guests. This also includes settings which `xpti`, +-`smt`, `pv-l1tf` control, unless the respective option(s) have been ++`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been + specified earlier on the command line. + + Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to +@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter. + = <bool> + + Applicability: x86 +- Default: true ++ Default: false on parts vulnerable to TAA, true otherwise + + Controls for the use of Transactional Synchronization eXtensions. + +@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off. + On systems with the ability to turn TSX off, this boolean offers system wide + control of whether TSX is enabled or disabled. + ++On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following ++logic applies: ++ ++ * An explicit `tsx=` choice is honoured, even if it is `true` and would ++ result in a vulnerable system. ++ ++ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be ++ mitigated by disabling TSX, as this is the lowest overhead option. ++ ++ * If the use of TSX is important, the more expensive TAA mitigations can be ++ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain ++ active by default. 
++ + ### ucode (x86) + > `= [<integer> | scan]` + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 2fe16b423d..ab196b156d 100644 +--- xen/arch/x86/spec_ctrl.c.orig ++++ xen/arch/x86/spec_ctrl.c +@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s) + if ( opt_pv_l1tf_domu < 0 ) + opt_pv_l1tf_domu = 0; + ++ if ( opt_tsx == -1 ) ++ opt_tsx = -3; ++ + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + printk("Speculative mitigation facilities:\n"); + + /* Hardware features which pertain to speculative mitigations. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "", + (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "", +@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", + (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "", + (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", +- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : ""); ++ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", ++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", ++ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : ""); + + /* Compiled-in support which pertains to mitigations. */ + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) +@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n", ++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n", + thunk == THUNK_NONE ? "N/A" : + thunk == THUNK_RETPOLINE ? "RETPOLINE" : + thunk == THUNK_LFENCE ? 
"LFENCE" : +@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) + (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", + !boot_cpu_has(X86_FEATURE_SSBD) ? "" : + (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", ++ !(caps & ARCH_CAPS_TSX_CTRL) ? "" : ++ (opt_tsx & 1) ? " TSX+" : " TSX-", + opt_ibpb ? " IBPB" : "", + opt_l1d_flush ? " L1D_FLUSH" : "", + opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : ""); +@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; + bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled; ++ bool cpu_has_bug_taa; + uint64_t caps = 0; + + if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) ) +@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void) + "enabled. Mitigations will not be fully effective. Please\n" + "choose an explicit smt=<bool> setting. See XSA-297.\n"); + ++ /* ++ * Vulnerability to TAA is a little complicated to quantify. ++ * ++ * In the pipeline, it is just another way to get speculative access to ++ * stale load port, store buffer or fill buffer data, and therefore can be ++ * considered a superset of MDS (on TSX-capable parts). On parts which ++ * predate MDS_NO, the existing VERW flushing will mitigate this ++ * sidechannel as well. ++ * ++ * On parts which contain MDS_NO, the lack of VERW flushing means that an ++ * attacker can still use TSX to target microarchitectural buffers to leak ++ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts ++ * which have MDS_NO but lack TAA_NO. ++ * ++ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the ++ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so ++ * we check both to spot TSX in a microcode/cmdline independent way. 
++ */ ++ cpu_has_bug_taa = ++ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) && ++ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO; ++ ++ /* ++ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs ++ * the MDS mitigation of disabling HT and using VERW flushing. ++ * ++ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until ++ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being ++ * advertised, and there isn't a MD_CLEAR_2 flag to use... ++ * ++ * If we're on affected hardware, able to do something about it (which ++ * implies that VERW now works), no explicit TSX choice and traditional ++ * MDS mitigations (no-SMT, VERW) not obviously in use (someone might ++ * plausibly value TSX higher than Hyperthreading...), disable TSX to ++ * mitigate TAA. ++ */ ++ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) && ++ ((hw_smt_enabled && opt_smt) || ++ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) ) ++ { ++ setup_clear_cpu_cap(X86_FEATURE_HLE); ++ setup_clear_cpu_cap(X86_FEATURE_RTM); ++ ++ opt_tsx = 0; ++ tsx_init(); ++ } ++ + print_details(thunk, caps); + + /* +diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c +index a8ec2ccc69..2d202a0d4e 100644 +--- xen/arch/x86/tsx.c.orig ++++ xen/arch/x86/tsx.c +@@ -5,7 +5,8 @@ + * Valid values: + * 1 => Explicit tsx=1 + * 0 => Explicit tsx=0 +- * -1 => Default, implicit tsx=1 ++ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA ++ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0) + * + * This is arranged such that the bottom bit encodes whether TSX is actually + * disabled, while identifying various explicit (>=0) and implicit (<0) +diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h +index 5ee7a37c12..1761a01f1f 100644 +--- xen/include/asm-x86/msr-index.h.orig ++++ xen/include/asm-x86/msr-index.h +@@ -56,6 +56,7 @@ + #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5) + #define
ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) + #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) ++#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) |