summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbouyer <bouyer@pkgsrc.org>2019-11-13 13:36:11 +0000
committerbouyer <bouyer@pkgsrc.org>2019-11-13 13:36:11 +0000
commit828418984be2315141e8d854e413e9d0b133e14a (patch)
tree892445ac19c845338578c93e9d893e3c32226178
parent038530648c7224c3690c222cb48aea7bed65807a (diff)
downloadpkgsrc-828418984be2315141e8d854e413e9d0b133e14a.tar.gz
Add patches for the relevant Xen security advisories up to XSA305 (everything
up to XSA297 is already fixed upstream). Bump PKGREVISION.
-rw-r--r--sysutils/xenkernel411/Makefile4
-rw-r--r--sysutils/xenkernel411/distinfo6
-rw-r--r--sysutils/xenkernel411/patches/patch-XSA29889
-rw-r--r--sysutils/xenkernel411/patches/patch-XSA302537
-rw-r--r--sysutils/xenkernel411/patches/patch-XSA304481
-rw-r--r--sysutils/xenkernel411/patches/patch-XSA305482
6 files changed, 1596 insertions, 3 deletions
diff --git a/sysutils/xenkernel411/Makefile b/sysutils/xenkernel411/Makefile
index 5753bffb774..4e890e3cbc7 100644
--- a/sysutils/xenkernel411/Makefile
+++ b/sysutils/xenkernel411/Makefile
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.8 2019/08/30 13:16:27 bouyer Exp $
+# $NetBSD: Makefile,v 1.9 2019/11/13 13:36:11 bouyer Exp $
VERSION= 4.11.2
-#PKGREVISION= 0
+PKGREVISION= 1
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel411-${VERSION}
CATEGORIES= sysutils
diff --git a/sysutils/xenkernel411/distinfo b/sysutils/xenkernel411/distinfo
index ccf14678aaa..0354944c4b3 100644
--- a/sysutils/xenkernel411/distinfo
+++ b/sysutils/xenkernel411/distinfo
@@ -1,10 +1,14 @@
-$NetBSD: distinfo,v 1.5 2019/08/30 13:16:27 bouyer Exp $
+$NetBSD: distinfo,v 1.6 2019/11/13 13:36:11 bouyer Exp $
SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6
+SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b
+SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399
+SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
diff --git a/sysutils/xenkernel411/patches/patch-XSA298 b/sysutils/xenkernel411/patches/patch-XSA298
new file mode 100644
index 00000000000..10ff22f8e67
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA298
@@ -0,0 +1,89 @@
+$NetBSD: patch-XSA298,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PV: check GDT/LDT limits during emulation
+
+Accesses beyond the LDT limit originating from emulation would trigger
+the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
+accesses would cause an attempt to promote the touched page (offset from
+the present LDT base address) to a segment descriptor one. If this
+happens to succeed, guest user mode would be able to elevate its
+privileges to that of the guest kernel. This is particularly easy when
+there's no LDT at all, in which case the LDT base stored internally to
+Xen is simply zero.
+
+Also adjust the ASSERT() that was triggering: It was off by one to
+begin with, and for production builds we also better use
+ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.
+
+This is XSA-298.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+--- xen/arch/x86/pv/emul-gate-op.c.orig
++++ xen/arch/x86/pv/emul-gate-op.c
+@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
+ const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel);
+
+ if ( (gate_sel < 4) ||
+- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
++ /*
++ * We're interested in call gates only, which occupy a single
++ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
++ */
++ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
++ (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
++ : v->arch.pv_vcpu.gdt_ents)) ||
+ __get_user(desc, pdesc) )
+ return 0;
+
+@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
+ if ( !is_pv_32bit_vcpu(v) )
+ {
+ if ( (*ar & 0x1f00) != 0x0c00 ||
+- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
++ /* Limit check done above already. */
+ __get_user(desc, pdesc + 1) ||
+ (desc.b & 0x1f00) )
+ return 0;
+--- xen/arch/x86/pv/emulate.c.orig
++++ xen/arch/x86/pv/emulate.c
+@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
+ {
+ struct desc_struct desc;
+
+- if ( sel < 4)
++ if ( sel < 4 ||
++ /*
++ * Don't apply the GDT limit here, as the selector may be a Xen
++ * provided one. __get_user() will fail (without taking further
++ * action) for ones falling in the gap between guest populated
++ * and Xen ones.
++ */
++ ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
+ desc.b = desc.a = 0;
+ else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
+ return 0;
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
+ BUG_ON(unlikely(in_irq()));
+
+ /*
+- * Hardware limit checking should guarantee this property. NB. This is
++ * Prior limit checking should guarantee this property. NB. This is
+ * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
+ * current vcpu, and vcpu_reset() will block until this vcpu has been
+ * descheduled before continuing.
+ */
+- ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
++ if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
++ {
++ ASSERT_UNREACHABLE();
++ return false;
++ }
+
+ if ( is_pv_32bit_domain(currd) )
+ linear = (uint32_t)linear;
diff --git a/sysutils/xenkernel411/patches/patch-XSA302 b/sysutils/xenkernel411/patches/patch-XSA302
new file mode 100644
index 00000000000..e1c08b56a0c
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA302
@@ -0,0 +1,537 @@
+$NetBSD: patch-XSA302,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 2 Oct 2019 13:36:59 +0200
+Subject: [PATCH 1/2] IOMMU: add missing HVM check
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fix an unguarded d->arch.hvm access in assign_device().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
+Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
+---
+ xen/drivers/passthrough/pci.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index f51cae7f4e..037aba7c94 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ /* Prevent device assign if mem paging or mem sharing have been
+ * enabled for this domain */
+ if ( unlikely(!need_iommu(d) &&
+- (d->arch.hvm_domain.mem_sharing_enabled ||
++ ((is_hvm_domain(d) &&
++ d->arch.hvm_domain.mem_sharing_enabled) ||
+ vm_event_check_ring(d->vm_event_paging) ||
+ p2m_get_hostp2m(d)->global_logdirty)) )
+ return -EXDEV;
+--
+2.11.0
+
+From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 2001
+From: Paul Durrant <paul.durrant@citrix.com>
+Date: Mon, 14 Oct 2019 17:52:59 +0100
+Subject: [PATCH 2/2] passthrough: quarantine PCI devices
+
+When a PCI device is assigned to an untrusted domain, it is possible for
+that domain to program the device to DMA to an arbitrary address. The
+IOMMU is used to protect the host from malicious DMA by making sure that
+the device addresses can only target memory assigned to the guest. However,
+when the guest domain is torn down the device is assigned back to dom0,
+thus allowing any in-flight DMA to potentially target critical host data.
+
+This patch introduces a 'quarantine' for PCI devices using dom_io. When
+the toolstack makes a device assignable (by binding it to pciback), it
+will now also assign it to DOMID_IO and the device will only be assigned
+back to dom0 when the device is made unassignable again. Whilst the device
+is assignable it will only ever transfer between dom_io and guest domains.
+dom_io is actually only used as a sentinel domain for quarantining purposes;
+it is not configured with any IOMMU mappings. Assignment to dom_io simply
+means that the device's initiator (requestor) identifier is not present in
+the IOMMU's device table and thus any DMA transactions issued will be
+terminated with a fault condition.
+
+In addition, a fix to assignment handling is made for VT-d. Failure
+during the assignment step should not lead to a device still being
+associated with its prior owner. Hand the device to DomIO temporarily,
+until the assignment step has completed successfully. Remove the PI
+hooks from the source domain then earlier as well.
+
+Failure of the recovery reassign_device_ownership() must not go silent:
+there may, e.g., still be leftover RMRR mappings in the domain to which
+assignment has failed, and hence we can't allow that domain to continue
+executing.
+
+NOTE: This patch also includes one printk() cleanup; the
+ "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
+ since similar printk()-s elsewhere also don't log such a tag.
+
+This is XSA-302.
+
+Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
+---
+ tools/libxl/libxl_pci.c | 25 +++++++++++-
+ xen/arch/x86/mm.c | 2 +
+ xen/common/domctl.c | 14 ++++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
+ xen/drivers/passthrough/iommu.c | 9 +++++
+ xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++-------
+ xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++---
+ xen/include/xen/pci.h | 3 ++
+ 8 files changed, 138 insertions(+), 24 deletions(-)
+
+diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
+index 4755a0c93c..81890a91ac 100644
+--- tools/libxl/libxl_pci.c.orig
++++ tools/libxl/libxl_pci.c
+@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ libxl_device_pci *pcidev,
+ int rebind)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
+ unsigned dom, bus, dev, func;
+ char *spath, *driver_path = NULL;
+ int rc;
+@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ }
+ if ( rc ) {
+ LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
+- return 0;
++ goto quarantine;
+ }
+
+ /* Check to see if there's already a driver that we need to unbind from */
+@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ return ERROR_FAIL;
+ }
+
++quarantine:
++ /*
++ * DOMID_IO is just a sentinel domain, without any actual mappings,
++ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
++ * unnecessarily denied.
++ */
++ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
++ XEN_DOMCTL_DEV_RDM_RELAXED);
++ if ( rc < 0 ) {
++ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
++ return ERROR_FAIL;
++ }
++
+ return 0;
+ }
+
+@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
+ libxl_device_pci *pcidev,
+ int rebind)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
+ int rc;
+ char *driver_path;
+
++ /* De-quarantine */
++ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
++ if ( rc < 0 ) {
++ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
++ pcidev->dev, pcidev->func);
++ return ERROR_FAIL;
++ }
++
+ /* Unbind from pciback */
+ if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
+ return ERROR_FAIL;
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e6a4cb28f8..c1ab57f9a5 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
+ * Initialise our DOMID_IO domain.
+ * This domain owns I/O pages that are within the range of the page_info
+ * array. Mappings occur at the priv of the caller.
++ * Quarantined PCI devices will be associated with this domain.
+ */
+ dom_io = domain_create(DOMID_IO, NULL);
+ BUG_ON(IS_ERR(dom_io));
++ INIT_LIST_HEAD(&dom_io->arch.pdev_list);
+
+ /*
+ * Initialise our COW domain.
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index 9b7bc083ee..741d774cd1 100644
+--- xen/common/domctl.c.orig
++++ xen/common/domctl.c
+@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+
+ switch ( op->cmd )
+ {
++ case XEN_DOMCTL_assign_device:
++ case XEN_DOMCTL_deassign_device:
++ if ( op->domain == DOMID_IO )
++ {
++ d = dom_io;
++ break;
++ }
++ else if ( op->domain == DOMID_INVALID )
++ return -ESRCH;
++ /* fall through */
+ case XEN_DOMCTL_test_assign_device:
+ if ( op->domain == DOMID_INVALID )
+ {
+@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+
+ if ( !domctl_lock_acquire() )
+ {
+- if ( d )
++ if ( d && d != dom_io )
+ rcu_unlock_domain(d);
+ return hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
+@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ domctl_lock_release();
+
+ domctl_out_unlock_domonly:
+- if ( d )
++ if ( d && d != dom_io )
+ rcu_unlock_domain(d);
+
+ if ( copyback && __copy_to_guest(u_domctl, op, 1) )
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index 12d2695b89..ec8baae717 100644
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
+ u8 bus = pdev->bus;
+ const struct domain_iommu *hd = dom_iommu(domain);
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return;
++
+ BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
+ !iommu->dev_table.buffer );
+
+@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
+ int req_id;
+ u8 bus = pdev->bus;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return;
++
+ BUG_ON ( iommu->dev_table.buffer == NULL );
+ req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
+ dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
+ ivrs_mappings[req_id].read_permission);
+ }
+
+- return reassign_device(hardware_domain, d, devfn, pdev);
++ return reassign_device(pdev->domain, d, devfn, pdev);
+ }
+
+ static void deallocate_next_page_table(struct page_info *pg, int level)
+diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
+index 04b0be37d3..8027d96f1c 100644
+--- xen/drivers/passthrough/iommu.c.orig
++++ xen/drivers/passthrough/iommu.c
+@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
+ {
+ const struct domain_iommu *hd = dom_iommu(d);
+
++ if ( d == dom_io )
++ return;
++
+ d->need_iommu = 0;
+ hd->platform_ops->teardown(d);
+ tasklet_schedule(&iommu_pt_cleanup_tasklet);
+@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
+ if ( need_iommu(d) > 0 )
+ return 0;
+
++ if ( d == dom_io )
++ return 0;
++
+ if ( !iommu_use_hap_pt(d) )
+ {
+ int rc;
+@@ -404,6 +410,9 @@ int __init iommu_setup(void)
+ printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
+ if ( iommu_enabled )
+ {
++ if ( iommu_domain_init(dom_io) )
++ panic("Could not set up quarantine\n");
++
+ printk(" - Dom0 mode: %s\n",
+ iommu_passthrough ? "Passthrough" :
+ iommu_dom0_strict ? "Strict" : "Relaxed");
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 037aba7c94..fb010a547b 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
+ return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
+ }
+
+-/*
+- * If the device isn't owned by the hardware domain, it means it already
+- * has been assigned to other domain, or it doesn't exist.
+- */
+ static int device_assigned(u16 seg, u8 bus, u8 devfn)
+ {
+ struct pci_dev *pdev;
++ int rc = 0;
+
+ pcidevs_lock();
+- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++
++ pdev = pci_get_pdev(seg, bus, devfn);
++
++ if ( !pdev )
++ rc = -ENODEV;
++ /*
++ * If the device exists and it is not owned by either the hardware
++ * domain or dom_io then it must be assigned to a guest, or be
++ * hidden (owned by dom_xen).
++ */
++ else if ( pdev->domain != hardware_domain &&
++ pdev->domain != dom_io )
++ rc = -EBUSY;
++
+ pcidevs_unlock();
+
+- return pdev ? 0 : -EBUSY;
++ return rc;
+ }
+
+ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+
+ /* Prevent device assign if mem paging or mem sharing have been
+ * enabled for this domain */
+- if ( unlikely(!need_iommu(d) &&
++ if ( d != dom_io &&
++ unlikely(!need_iommu(d) &&
+ ((is_hvm_domain(d) &&
+ d->arch.hvm_domain.mem_sharing_enabled) ||
+ vm_event_check_ring(d->vm_event_paging) ||
+@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ return rc;
+ }
+
+- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++ pdev = pci_get_pdev(seg, bus, devfn);
++
++ rc = -ENODEV;
+ if ( !pdev )
+- {
+- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
+ goto done;
+- }
++
++ rc = 0;
++ if ( d == pdev->domain )
++ goto done;
++
++ rc = -EBUSY;
++ if ( pdev->domain != hardware_domain &&
++ pdev->domain != dom_io )
++ goto done;
+
+ if ( pdev->msix )
+ msixtbl_init(d);
+@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ }
+
+ done:
++ /* The device is assigned to dom_io so mark it as quarantined */
++ if ( !rc && d == dom_io )
++ pdev->quarantine = true;
++
+ if ( !has_arch_pdevs(d) && need_iommu(d) )
+ iommu_teardown(d);
+ pcidevs_unlock();
+@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ {
+ const struct domain_iommu *hd = dom_iommu(d);
+ struct pci_dev *pdev = NULL;
++ struct domain *target;
+ int ret = 0;
+
+ if ( !iommu_enabled || !hd->platform_ops )
+@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ if ( !pdev )
+ return -ENODEV;
+
++ /* De-assignment from dom_io should de-quarantine the device */
++ target = (pdev->quarantine && pdev->domain != dom_io) ?
++ dom_io : hardware_domain;
++
+ while ( pdev->phantom_stride )
+ {
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ break;
+- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++ ret = hd->platform_ops->reassign_device(d, target, devfn,
+ pci_to_dev(pdev));
+ if ( !ret )
+ continue;
+@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ }
+
+ devfn = pdev->devfn;
+- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++ ret = hd->platform_ops->reassign_device(d, target, devfn,
+ pci_to_dev(pdev));
+ if ( ret )
+ {
+@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ return ret;
+ }
+
++ if ( pdev->domain == hardware_domain )
++ pdev->quarantine = false;
++
+ pdev->fault.count = 0;
+
+ if ( !has_arch_pdevs(d) && need_iommu(d) )
+@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl(
+ ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+ "h", u_domctl);
+ else if ( ret )
+- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
++ printk(XENLOG_G_ERR
+ "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ d->domain_id, ret);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 4c719d4ee7..19f7d13013 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
+ int agaw, rc, ret;
+ bool_t flush_dev_iotlb;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return 0;
++
+ ASSERT(pcidevs_locked());
+ spin_lock(&iommu->lock);
+ maddr = bus_to_context_maddr(iommu, bus);
+@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
+ int iommu_domid, rc, ret;
+ bool_t flush_dev_iotlb;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return 0;
++
+ ASSERT(pcidevs_locked());
+ spin_lock(&iommu->lock);
+
+@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ goto out;
+ }
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ goto out;
++
+ /*
+ * if no other devices under the same iommu owned by this domain,
+ * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
+@@ -2389,6 +2401,15 @@ static int reassign_device_ownership(
+ if ( ret )
+ return ret;
+
++ if ( devfn == pdev->devfn )
++ {
++ list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
++ pdev->domain = dom_io;
++ }
++
++ if ( !has_arch_pdevs(source) )
++ vmx_pi_hooks_deassign(source);
++
+ if ( !has_arch_pdevs(target) )
+ vmx_pi_hooks_assign(target);
+
+@@ -2407,15 +2428,13 @@ static int reassign_device_ownership(
+ pdev->domain = target;
+ }
+
+- if ( !has_arch_pdevs(source) )
+- vmx_pi_hooks_deassign(source);
+-
+ return ret;
+ }
+
+ static int intel_iommu_assign_device(
+ struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
+ {
++ struct domain *s = pdev->domain;
+ struct acpi_rmrr_unit *rmrr;
+ int ret = 0, i;
+ u16 bdf, seg;
+@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device(
+ }
+ }
+
+- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
+- if ( ret )
++ ret = reassign_device_ownership(s, d, devfn, pdev);
++ if ( ret || d == dom_io )
+ return ret;
+
+ /* Setup rmrr identity mapping */
+@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device(
+ ret = rmrr_identity_mapping(d, 1, rmrr, flag);
+ if ( ret )
+ {
+- reassign_device_ownership(d, hardware_domain, devfn, pdev);
++ int rc;
++
++ rc = reassign_device_ownership(d, s, devfn, pdev);
+ printk(XENLOG_G_ERR VTDPREFIX
+ " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
+ rmrr->base_address, rmrr->end_address,
+ d->domain_id, ret);
++ if ( rc )
++ {
++ printk(XENLOG_ERR VTDPREFIX
++ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
++ domain_crash(d);
++ }
+ break;
+ }
+ }
+diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
+index 4cfa774615..066364bdef 100644
+--- xen/include/xen/pci.h.orig
++++ xen/include/xen/pci.h
+@@ -88,6 +88,9 @@ struct pci_dev {
+
+ nodeid_t node; /* NUMA node */
+
++ /* Device to be quarantined, don't automatically re-assign to dom0 */
++ bool quarantine;
++
+ enum pdev_type {
+ DEV_TYPE_PCI_UNKNOWN,
+ DEV_TYPE_PCIe_ENDPOINT,
+--
+2.11.0
+
diff --git a/sysutils/xenkernel411/patches/patch-XSA304 b/sysutils/xenkernel411/patches/patch-XSA304
new file mode 100644
index 00000000000..b905d46b748
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA304
@@ -0,0 +1,481 @@
+$NetBSD: patch-XSA304,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs
+
+Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
+an EPT superpage gets shattered. The root cause is still under investigation,
+but the end result is unusable in combination with CVE-2018-12207 protections.
+
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index fb7edfaef9..d698b1d50a 100644
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
+ int __must_check me_wifi_quirk(struct domain *domain,
+ u8 bus, u8 devfn, int map);
+ void pci_vtd_quirk(const struct pci_dev *);
++void quirk_iommu_caps(struct iommu *iommu);
++
+ bool_t platform_supports_intremap(void);
+ bool_t platform_supports_x2apic(void);
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index f242e30caf..8712d3b4dc 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
+ return -ENODEV;
+
++ quirk_iommu_caps(iommu);
++
+ if ( cap_fault_reg_offset(iommu->cap) +
+ cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
+ ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index d6db862678..b02688e316 100644
+--- xen/drivers/passthrough/vtd/quirks.c.orig
++++ xen/drivers/passthrough/vtd/quirks.c
+@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
+ break;
+ }
+ }
++
++void __init quirk_iommu_caps(struct iommu *iommu)
++{
++ /*
++ * IOMMU Quirks:
++ *
++ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
++ * implement superpages internally.
++ *
++ * There are issues changing the walk length under in-flight DMA, which
++ * has manifested as incompatibility between EPT/IOMMU sharing and the
++ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages
++ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
++ * and IOMMU pagetables.
++ *
++ * Detection of SandyBridge unfortunately has to be done by processor
++ * model because the client parts don't expose their IOMMUs as PCI devices
++ * we could match with a Device ID.
++ */
++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
++ boot_cpu_data.x86 == 6 &&
++ (boot_cpu_data.x86_model == 0x2a ||
++ boot_cpu_data.x86_model == 0x2d) )
++ iommu->cap &= ~(0xful << 34);
++}
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtx: Disable executable EPT superpages to work around
+ CVE-2018-12207
+
+CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
+machine check exception can be generated in a corner case when an executable
+mapping changes size or cacheability without TLB invalidation. HVM guest
+kernels can trigger this to DoS the host.
+
+To mitigate, in affected hardware, all EPT superpages are marked NX. When an
+instruction fetch violation is observed against the superpage, the superpage
+is shattered to 4k and has execute permissions restored. This prevents the
+guest kernel from being able to create the necessary preconditions in the iTLB
+to exploit the vulnerability.
+
+This does come with a workload-dependent performance overhead, caused by
+increased TLB pressure. Performance can be restored, if guest kernels are
+trusted not to mount an attack, by specifying ept=exec-sp on the command line.
+
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index c63a07d29b..684671cb7b 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -828,7 +828,7 @@ effect the inverse meaning.
+ >> set as UC.
+
+ ### ept (Intel)
+-> `= List of ( {no-}pml | {no-}ad )`
++> `= List of [ {no-}pml, {no-}ad, {no-}exec-sp ]`
+
+ Controls EPT related features.
+
+@@ -851,6 +851,16 @@ Controls EPT related features.
+
+ >> Have hardware keep accessed/dirty (A/D) bits updated.
+
++* The `exec-sp` boolean controls whether EPT superpages with execute
++ permissions are permitted. In general this is good for performance.
++
+ However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
++ use executable superpages to crash the host. By default, executable
++ superpages are disabled on affected hardware.
++
++ If HVM guest kernels are trusted not to mount a DoS against the system,
+ this option can be enabled to regain performance.
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index f4a6a37149..1924434960 100644
+--- xen/arch/x86/hvm/hvm.c.orig
++++ xen/arch/x86/hvm/hvm.c
+@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ struct p2m_domain *p2m, *hostp2m;
+ int rc, fall_through = 0, paged = 0;
+ int sharing_enomem = 0;
++ unsigned int page_order = 0;
+ vm_event_request_t *req_ptr = NULL;
+ bool_t ap2m_active, sync = 0;
+
+@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ hostp2m = p2m_get_hostp2m(currd);
+ mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
+ P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
+- NULL);
++ &page_order);
+
+ if ( ap2m_active )
+ {
+@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ goto out;
+ }
+
+- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
++ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
+ }
+ else
+ p2m = hostp2m;
+@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ break;
+ }
+
++ /*
++ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution
++ * fault against a non-executable superpage, shatter it to regain
++ * execute permissions.
++ */
++ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
++ {
++ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
++ p2mt, p2ma);
++
++ if ( res )
++ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
++ gfn, res);
++
++ rc = !res;
++ goto out_put_gfn;
++ }
++
+ if ( violation )
+ {
+ /* Should #VE be emulated for this fault? */
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 493986e84a..8821a3b536 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
+
+ static bool_t __read_mostly opt_pml_enabled = 1;
+ static s8 __read_mostly opt_ept_ad = -1;
++int8_t __read_mostly opt_ept_exec_sp = -1;
+
+ /*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
+ opt_pml_enabled = val;
+ else if ( !cmdline_strcmp(s, "ad") )
+ opt_ept_ad = val;
++ else if ( !cmdline_strcmp(s, "exec-sp") )
++ opt_ept_exec_sp = val;
+ else
+ rc = -EINVAL;
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 840dc2b44d..a568d62643 100644
+--- xen/arch/x86/hvm/vmx/vmx.c.orig
++++ xen/arch/x86/hvm/vmx/vmx.c
+@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
+ static void __init lbr_tsx_fixup_check(void);
+ static void __init bdw_erratum_bdf14_fixup_check(void);
+
++/*
++ * Calculate whether the CPU is vulnerable to Instruction Fetch page
++ * size-change MCEs.
++ */
++static bool __init has_if_pschange_mc(void)
++{
++ uint64_t caps = 0;
++
++ /*
++ * If we are virtualised, there is nothing we can do. Our EPT tables are
++ * shadowed by our hypervisor, and not walked by hardware.
++ */
++ if ( cpu_has_hypervisor )
++ return false;
++
++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
++ return false;
++
++ /*
++ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
++ * this time.
++ */
++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
++ boot_cpu_data.x86 != 6 )
++ return false;
++
++ switch ( boot_cpu_data.x86_model )
++ {
++ /*
++ * Core processors since at least Nehalem are vulnerable.
++ */
++ case 0x1f: /* Auburndale / Havendale */
++ case 0x1e: /* Nehalem */
++ case 0x1a: /* Nehalem EP */
++ case 0x2e: /* Nehalem EX */
++ case 0x25: /* Westmere */
++ case 0x2c: /* Westmere EP */
++ case 0x2f: /* Westmere EX */
++ case 0x2a: /* SandyBridge */
++ case 0x2d: /* SandyBridge EP/EX */
++ case 0x3a: /* IvyBridge */
++ case 0x3e: /* IvyBridge EP/EX */
++ case 0x3c: /* Haswell */
++ case 0x3f: /* Haswell EX/EP */
++ case 0x45: /* Haswell D */
++ case 0x46: /* Haswell H */
++ case 0x3d: /* Broadwell */
++ case 0x47: /* Broadwell H */
++ case 0x4f: /* Broadwell EP/EX */
++ case 0x56: /* Broadwell D */
++ case 0x4e: /* Skylake M */
++ case 0x5e: /* Skylake D */
++ case 0x55: /* Skylake-X / Cascade Lake */
++ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
++ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
++ return true;
++
++ /*
++ * Atom processors are not vulnerable.
++ */
++ case 0x1c: /* Pineview */
++ case 0x26: /* Lincroft */
++ case 0x27: /* Penwell */
++ case 0x35: /* Cloverview */
++ case 0x36: /* Cedarview */
++ case 0x37: /* Baytrail / Valleyview (Silvermont) */
++ case 0x4d: /* Avoton / Rangeley (Silvermont) */
++ case 0x4c: /* Cherrytrail / Braswell */
++ case 0x4a: /* Merrifield */
++ case 0x5a: /* Moorefield */
++ case 0x5c: /* Goldmont */
++ case 0x5d: /* SoFIA 3G Granite/ES2.1 */
++ case 0x65: /* SoFIA LTE AOSP */
++ case 0x5f: /* Denverton */
++ case 0x6e: /* Cougar Mountain */
++ case 0x75: /* Lightning Mountain */
++ case 0x7a: /* Gemini Lake */
++ case 0x86: /* Jacobsville */
++
++ /*
++ * Knights processors are not vulnerable.
++ */
++ case 0x57: /* Knights Landing */
++ case 0x85: /* Knights Mill */
++ return false;
++
++ default:
++ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
++ boot_cpu_data.x86_model);
++ return true;
++ }
++}
++
+ const struct hvm_function_table * __init start_vmx(void)
+ {
+ set_in_cr4(X86_CR4_VMXE);
+@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void)
+ */
+ if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
+ {
++ bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
++
++ if ( opt_ept_exec_sp == -1 )
++ {
++ /* Default to non-executable superpages on vulnerable hardware. */
++ opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
++
++ if ( cpu_has_bug_pschange_mc )
++ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
++ }
++
+ vmx_function_table.hap_supported = 1;
+ vmx_function_table.altp2m_supported = 1;
+
+diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
+index ce46201d45..93e08f89a2 100644
+--- xen/arch/x86/mm/p2m-ept.c.orig
++++ xen/arch/x86/mm/p2m-ept.c
+@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+ break;
+ }
+
++ /*
++ * Don't create executable superpages if we need to shatter them to
++ * protect against CVE-2018-12207.
++ */
++ if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
++ entry->x = 0;
+ }
+
+ #define GUEST_TABLE_MAP_FAILED 0
+diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
+index 89619e4afd..20eb7f6082 100644
+--- xen/include/asm-x86/hvm/vmx/vmx.h.orig
++++ xen/include/asm-x86/hvm/vmx/vmx.h
+@@ -28,6 +28,8 @@
+ #include <asm/hvm/trace.h>
+ #include <asm/hvm/vmx/vmcs.h>
+
++extern int8_t opt_ept_exec_sp;
++
+ typedef union {
+ struct {
+ u64 r : 1, /* bit 0 - Read permission */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index b8151d2d9f..89ae3e03f1 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -54,6 +54,7 @@
+ #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
+ #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
++#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtx: Allow runtime modification of the exec-sp setting
+
+See patch for details.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..33ed1ffc40 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -861,6 +861,21 @@ Controls EPT related features.
+ If HVM guest kernels are trusted not to mount a DoS against the system,
+ this option can enabled to regain performance.
+
++ This boolean may be modified at runtime using `xl set-parameters
++ ept=[no-]exec-sp` to switch between fast and secure.
++
++ * When switching from secure to fast, preexisting HVM domains will run
++ at their current performance until they are rebooted; new domains will
++ run without any overhead.
++
++ * When switching from fast to secure, all HVM domains will immediately
++ suffer a performance penalty.
++
++ **Warning: No guarantee is made that this runtime option will be retained
++ indefinitely, or that it will retain this exact behaviour. It is
++ intended as an emergency option for people who first chose fast, then
++ change their minds to secure, and wish not to reboot.**
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 8821a3b536..15376e25ba 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
+ }
+ custom_param("ept", parse_ept_param);
+
++static int parse_ept_param_runtime(const char *s)
++{
++ int val;
++
++ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
++ !(hvm_funcs.hap_capabilities &
++ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
++ {
++ printk("VMX: EPT not available, or not in use - ignoring\n");
++ return 0;
++ }
++
++ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
++ return -EINVAL;
++
++ if ( val != opt_ept_exec_sp )
++ {
++ struct domain *d;
++
++ opt_ept_exec_sp = val;
++
++ rcu_read_lock(&domlist_read_lock);
++ for_each_domain ( d )
++ if ( paging_mode_hap(d) )
++ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
++ rcu_read_unlock(&domlist_read_lock);
++ }
++
++ printk("VMX: EPT executable superpages %sabled\n",
++ val ? "en" : "dis");
++
++ return 0;
++}
++custom_runtime_only_param("ept", parse_ept_param_runtime);
++
+ /* Dynamic (run-time adjusted) execution control flags. */
+ u32 vmx_pin_based_exec_control __read_mostly;
+ u32 vmx_cpu_based_exec_control __read_mostly;
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index 2b62bc61dd..97c417fc3e 100644
+--- xen/arch/x86/mm/p2m.c.orig
++++ xen/arch/x86/mm/p2m.c
+@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+ return 0;
+ }
+
++/*
++ * May be called with ot = nt = p2m_ram_rw for its side effect of
++ * recalculating all PTEs in the p2m.
++ */
+ void p2m_change_entry_type_global(struct domain *d,
+ p2m_type_t ot, p2m_type_t nt)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+- ASSERT(ot != nt);
+ ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
+
+ p2m_lock(p2m);
+ p2m->change_entry_type_global(p2m, ot, nt);
+- p2m->global_logdirty = (nt == p2m_ram_logdirty);
++ /* Don't allow 'recalculate' operations to change the logdirty state. */
++ if ( ot != nt )
++ p2m->global_logdirty = (nt == p2m_ram_logdirty);
+ p2m_unlock(p2m);
+ }
+
diff --git a/sysutils/xenkernel411/patches/patch-XSA305 b/sysutils/xenkernel411/patches/patch-XSA305
new file mode 100644
index 00000000000..2ed167cbdf0
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA305
@@ -0,0 +1,482 @@
+$NetBSD: patch-XSA305,v 1.1 2019/11/13 13:36:11 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available
+
+To protect against the TSX Async Abort speculative vulnerability, Intel have
+released new microcode for affected parts which introduce the MSR_TSX_CTRL
+control, which allows TSX to be turned off. This will be architectural on
+future parts.
+
+Introduce tsx= to provide a global on/off for TSX, including its enumeration
+via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to
+guests at the moment.
+
+VMs may have booted before microcode is loaded, or before hosts have rebooted,
+and they still want to migrate freely. A VM which booted seeing TSX can
+migrate safely to hosts with TSX disabled - TSX will start unconditionally
+aborting, but still behave in a manner compatible with the ABI.
+
+The guest-visible behaviour is equivalent to late loading the microcode and
+setting the RTM_DISABLE bit in the course of live patching.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..b86d26399a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter.
+ ### tsc (x86)
+ > `= unstable | skewed | stable:socket`
+
++### tsx
++ = <bool>
++
++ Applicability: x86
++ Default: true
++
++Controls for the use of Transactional Synchronization eXtensions.
++
++On Intel parts released in Q3 2019 (with updated microcode), and future parts,
++a control has been introduced which allows TSX to be turned off.
++
++On systems with the ability to turn TSX off, this boolean offers system wide
++control of whether TSX is enabled or disabled.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+
+diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
+index da1e4827f4..4c82d9f710 100644
+--- xen/arch/x86/Makefile.orig
++++ xen/arch/x86/Makefile
+@@ -65,6 +65,7 @@ obj-y += sysctl.o
+ obj-y += time.o
+ obj-y += trace.o
+ obj-y += traps.o
++obj-y += tsx.o
+ obj-y += usercopy.o
+ obj-y += x86_emulate.o
+ obj-$(CONFIG_TBOOT) += tboot.o
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index 5e11970701..04aefa555d 100644
+--- xen/arch/x86/cpuid.c.orig
++++ xen/arch/x86/cpuid.c
+@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d)
+ if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
+ __set_bit(X86_FEATURE_ITSC, max_fs);
+
++ /*
++ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
++ * TSX and hide the feature bits. Migrating-in VMs may have been booted
++ * pre-mitigation when the TSX features were visible.
++ *
++ * This situation is compatible (albeit with a perf hit to any TSX code in
++ * the guest), so allow the feature bits to remain set.
++ */
++ if ( cpu_has_tsx_ctrl )
++ {
++ __set_bit(X86_FEATURE_HLE, max_fs);
++ __set_bit(X86_FEATURE_RTM, max_fs);
++ }
++
+ /* Clamp the toolstacks choices to reality. */
+ for ( i = 0; i < ARRAY_SIZE(fs); i++ )
+ fs[i] &= max_fs[i];
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index ebc0665615..35d99a98a1 100644
+--- xen/arch/x86/msr.c.orig
++++ xen/arch/x86/msr.c
+@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ case MSR_FLUSH_CMD:
+ /* Write-only */
+ case MSR_TSX_FORCE_ABORT:
++ case MSR_TSX_CTRL:
+ /* Not offered to guests. */
+ goto gp_fault;
+
+@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+ case MSR_ARCH_CAPABILITIES:
+ /* Read-only */
+ case MSR_TSX_FORCE_ABORT:
++ case MSR_TSX_CTRL:
+ /* Not offered to guests. */
+ goto gp_fault;
+
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 657160549f..dc13ad6c36 100644
+--- xen/arch/x86/setup.c.orig
++++ xen/arch/x86/setup.c
+@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+
+ early_microcode_init();
+
++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
++
+ identify_cpu(&boot_cpu_data);
+
+ set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index fd52a10cf9..bdc118d88b 100644
+--- xen/arch/x86/smpboot.c.orig
++++ xen/arch/x86/smpboot.c
+@@ -376,6 +376,8 @@ void start_secondary(void *unused)
+ if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+
++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
++
+ if ( xen_guest )
+ hypervisor_ap_setup();
+
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+new file mode 100644
+index 0000000000..a8ec2ccc69
+--- /dev/null
++++ xen/arch/x86/tsx.c
+@@ -0,0 +1,74 @@
++#include <xen/init.h>
++#include <asm/msr.h>
++
++/*
++ * Valid values:
++ * 1 => Explicit tsx=1
++ * 0 => Explicit tsx=0
++ * -1 => Default, implicit tsx=1
++ *
++ * This is arranged such that the bottom bit encodes whether TSX is actually
++ * disabled, while identifying various explicit (>=0) and implicit (<0)
++ * conditions.
++ */
++int8_t __read_mostly opt_tsx = -1;
++int8_t __read_mostly cpu_has_tsx_ctrl = -1;
++
++static int __init parse_tsx(const char *s)
++{
++ int rc = 0, val = parse_bool(s, NULL);
++
++ if ( val >= 0 )
++ opt_tsx = val;
++ else
++ rc = -EINVAL;
++
++ return rc;
++}
++custom_param("tsx", parse_tsx);
++
++void tsx_init(void)
++{
++ /*
++ * This function is first called between microcode being loaded, and CPUID
++ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL
++ * is available.
++ */
++ if ( unlikely(cpu_has_tsx_ctrl < 0) )
++ {
++ uint64_t caps = 0;
++
++ if ( boot_cpu_data.cpuid_level >= 7 &&
++ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
++ }
++
++ if ( cpu_has_tsx_ctrl )
++ {
++ uint64_t val;
++
++ rdmsrl(MSR_TSX_CTRL, val);
++
++ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
++ /* Check bottom bit only. Higher bits are various sentinels. */
++ if ( !(opt_tsx & 1) )
++ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
++
++ wrmsrl(MSR_TSX_CTRL, val);
++ }
++ else if ( opt_tsx >= 0 )
++ printk_once(XENLOG_WARNING
++ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
++}
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 89ae3e03f1..5ee7a37c12 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -55,6 +55,7 @@
+ #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
++#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+@@ -62,6 +63,10 @@
+ #define MSR_TSX_FORCE_ABORT 0x0000010f
+ #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0)
+
++#define MSR_TSX_CTRL 0x00000122
++#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0)
++#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1)
++
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0 0x000000c1
+ #define MSR_IA32_A_PERFCTR0 0x000004c1
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 20d1ecb332..66224f23b9 100644
+--- xen/include/asm-x86/processor.h.orig
++++ xen/include/asm-x86/processor.h
+@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx(
+ return ebx;
+ }
+
++static always_inline unsigned int cpuid_count_edx(
++ unsigned int leaf, unsigned int subleaf)
++{
++ unsigned int edx, tmp;
++
++ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
++
++ return edx;
++}
++
+ static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
+ struct cpuid_leaf *data)
+ {
+@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
+ return fam;
+ }
+
++extern int8_t opt_tsx, cpu_has_tsx_ctrl;
++void tsx_init(void);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif /* __ASM_X86_PROCESSOR_H */
+diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
+index 750f809968..be223a6950 100644
+--- xen/include/xen/lib.h.orig
++++ xen/include/xen/lib.h
+@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
+ #define gprintk(lvl, fmt, args...) \
+ printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
+
++#define printk_once(fmt, args...) \
++({ \
++ static bool __read_mostly once_; \
++ if ( unlikely(!once_) ) \
++ { \
++ once_ = true; \
++ printk(fmt, ## args); \
++ } \
++})
++
+ #ifdef NDEBUG
+
+ static inline void
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel
+
+See patch documentation and comments.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index b86d26399a..31635a473a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1841,7 +1841,7 @@ extreme care.**
+ An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
+ mitigations, including pieces of infrastructure used to virtualise certain
+ mitigation features for guests. This also includes settings which `xpti`,
+-`smt`, `pv-l1tf` control, unless the respective option(s) have been
++`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
+ specified earlier on the command line.
+
+ Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
+@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter.
+ = <bool>
+
+ Applicability: x86
+- Default: true
++ Default: false on parts vulnerable to TAA, true otherwise
+
+ Controls for the use of Transactional Synchronization eXtensions.
+
+@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off.
+ On systems with the ability to turn TSX off, this boolean offers system wide
+ control of whether TSX is enabled or disabled.
+
++On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
++logic applies:
++
++ * An explicit `tsx=` choice is honoured, even if it is `true` and would
++ result in a vulnerable system.
++
++ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
++ mitigated by disabling TSX, as this is the lowest overhead option.
++
++ * If the use of TSX is important, the more expensive TAA mitigations can be
++ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
++ active by default.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 2fe16b423d..ab196b156d 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s)
+ if ( opt_pv_l1tf_domu < 0 )
+ opt_pv_l1tf_domu = 0;
+
++ if ( opt_tsx == -1 )
++ opt_tsx = -3;
++
+ disable_common:
+ opt_rsb_pv = false;
+ opt_rsb_hvm = false;
+@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ printk("Speculative mitigation facilities:\n");
+
+ /* Hardware features which pertain to speculative mitigations. */
+- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
+@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
+ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
+ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
+- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "");
++ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
++ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "");
+
+ /* Compiled-in support which pertains to mitigations. */
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ "\n");
+
+ /* Settings for Xen's protection, irrespective of guests. */
+- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
+ thunk == THUNK_NONE ? "N/A" :
+ thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+ thunk == THUNK_LFENCE ? "LFENCE" :
+@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
+ !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
+ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
++ !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
++ (opt_tsx & 1) ? " TSX+" : " TSX-",
+ opt_ibpb ? " IBPB" : "",
+ opt_l1d_flush ? " L1D_FLUSH" : "",
+ opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "");
+@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void)
+ {
+ enum ind_thunk thunk = THUNK_DEFAULT;
+ bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
++ bool cpu_has_bug_taa;
+ uint64_t caps = 0;
+
+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void)
+ "enabled. Mitigations will not be fully effective. Please\n"
+ "choose an explicit smt=<bool> setting. See XSA-297.\n");
+
++ /*
++ * Vulnerability to TAA is a little complicated to quantify.
++ *
++ * In the pipeline, it is just another way to get speculative access to
++ * stale load port, store buffer or fill buffer data, and therefore can be
++ * considered a superset of MDS (on TSX-capable parts). On parts which
++ * predate MDS_NO, the existing VERW flushing will mitigate this
++ * sidechannel as well.
++ *
++ * On parts which contain MDS_NO, the lack of VERW flushing means that an
++ * attacker can still use TSX to target microarchitectural buffers to leak
++ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts
++ * which have MDS_NO but lack TAA_NO.
++ *
++ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
++ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so
++ * we check both to spot TSX in a microcode/cmdline independent way.
++ */
++ cpu_has_bug_taa =
++ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
++ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
++
++ /*
++ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
++ * the MDS mitigation of disabling HT and using VERW flushing.
++ *
++ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
++ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
++ * advertised, and there isn't a MD_CLEAR_2 flag to use...
++ *
++ * If we're on affected hardware, able to do something about it (which
++ * implies that VERW now works), no explicit TSX choice and traditional
++ * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
++ * plausibly value TSX higher than Hyperthreading...), disable TSX to
++ * mitigate TAA.
++ */
++ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
++ ((hw_smt_enabled && opt_smt) ||
++ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
++ {
++ setup_clear_cpu_cap(X86_FEATURE_HLE);
++ setup_clear_cpu_cap(X86_FEATURE_RTM);
++
++ opt_tsx = 0;
++ tsx_init();
++ }
++
+ print_details(thunk, caps);
+
+ /*
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+index a8ec2ccc69..2d202a0d4e 100644
+--- xen/arch/x86/tsx.c.orig
++++ xen/arch/x86/tsx.c
+@@ -5,7 +5,8 @@
+ * Valid values:
+ * 1 => Explicit tsx=1
+ * 0 => Explicit tsx=0
+- * -1 => Default, implicit tsx=1
++ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
++ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
+ *
+ * This is arranged such that the bottom bit encodes whether TSX is actually
+ * disabled, while identifying various explicit (>=0) and implicit (<0)
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 5ee7a37c12..1761a01f1f 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -56,6 +56,7 @@
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
+ #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
++#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)