author     bsiegert <bsiegert@pkgsrc.org>  2019-11-16 22:10:06 +0000
committer  bsiegert <bsiegert@pkgsrc.org>  2019-11-16 22:10:06 +0000
commit     387a32675a6bf89df35eacce8df408e602f05aa9 (patch)
tree       4835057d968c59a27c6003ed5b5e6d63147920f6
parent     2a831eaa34ca5c2eadedbc11252818c68f581d0c (diff)
download   pkgsrc-387a32675a6bf89df35eacce8df408e602f05aa9.tar.gz
Pullup ticket #6086 - requested by bouyer
sysutils/xenkernel411: security fix

Revisions pulled up:
- sysutils/xenkernel411/Makefile                             1.9-1.10
- sysutils/xenkernel411/distinfo                             1.6-1.7
- sysutils/xenkernel411/patches/patch-XSA298                 1.1-1.2
- sysutils/xenkernel411/patches/patch-XSA299                 1.1
- sysutils/xenkernel411/patches/patch-XSA302                 1.1-1.2
- sysutils/xenkernel411/patches/patch-XSA304                 1.1-1.2
- sysutils/xenkernel411/patches/patch-XSA305                 1.1-1.2

---
Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Nov 13 13:36:11 UTC 2019

Modified Files:
        pkgsrc/sysutils/xenkernel411: Makefile distinfo
Added Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA298 patch-XSA302
            patch-XSA304 patch-XSA305

Log Message:
Add patches for relevant Xen security advisories up to XSA305 (everything
up to XSA297 is already fixed upstream).
Bump PKGREVISION

---
Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Nov 13 15:00:06 UTC 2019

Modified Files:
        pkgsrc/sysutils/xenkernel411: Makefile distinfo
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA298 patch-XSA302
            patch-XSA304 patch-XSA305
Added Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA299

Log Message:
Apply patch fixing XSA299.
Bump PKGREVISION
-rw-r--r--   sysutils/xenkernel411/Makefile               |    4
-rw-r--r--   sysutils/xenkernel411/distinfo               |    7
-rw-r--r--   sysutils/xenkernel411/patches/patch-XSA298   |   89
-rw-r--r--   sysutils/xenkernel411/patches/patch-XSA299   | 2413
-rw-r--r--   sysutils/xenkernel411/patches/patch-XSA302   |  537
-rw-r--r--   sysutils/xenkernel411/patches/patch-XSA304   |  481
-rw-r--r--   sysutils/xenkernel411/patches/patch-XSA305   |  482
7 files changed, 4010 insertions(+), 3 deletions(-)
diff --git a/sysutils/xenkernel411/Makefile b/sysutils/xenkernel411/Makefile
index 5753bffb774..e1426bd25f6 100644
--- a/sysutils/xenkernel411/Makefile
+++ b/sysutils/xenkernel411/Makefile
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.8 2019/08/30 13:16:27 bouyer Exp $
+# $NetBSD: Makefile,v 1.8.2.1 2019/11/16 22:10:06 bsiegert Exp $
VERSION= 4.11.2
-#PKGREVISION= 0
+PKGREVISION= 2
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel411-${VERSION}
CATEGORIES= sysutils
diff --git a/sysutils/xenkernel411/distinfo b/sysutils/xenkernel411/distinfo
index ccf14678aaa..2ebc521bc0f 100644
--- a/sysutils/xenkernel411/distinfo
+++ b/sysutils/xenkernel411/distinfo
@@ -1,10 +1,15 @@
-$NetBSD: distinfo,v 1.5 2019/08/30 13:16:27 bouyer Exp $
+$NetBSD: distinfo,v 1.5.2.1 2019/11/16 22:10:06 bsiegert Exp $
SHA1 (xen411/xen-4.11.2.tar.gz) = 82766db0eca7ce65962732af8a31bb5cce1eb7ce
RMD160 (xen411/xen-4.11.2.tar.gz) = 6dcb1ac3e72381474912607b30b59fa55d87d38b
SHA512 (xen411/xen-4.11.2.tar.gz) = 48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e
Size (xen411/xen-4.11.2.tar.gz) = 25164925 bytes
SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA298) = 63e0f96ce3b945b16b98b51b423bafec14cf2be6
+SHA1 (patch-XSA299) = beb7ba1a8f9e0adda161c0da725ff053e674067e
+SHA1 (patch-XSA302) = 12fbb7dfea27f53c70c8115487a2e30595549c2b
+SHA1 (patch-XSA304) = f2c22732227e11a3e77c630f0264a689eed53399
+SHA1 (patch-XSA305) = eb5e0096cbf501fcbd7a5c5f9d1f932b557636b6
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
diff --git a/sysutils/xenkernel411/patches/patch-XSA298 b/sysutils/xenkernel411/patches/patch-XSA298
new file mode 100644
index 00000000000..cb8ca6b856d
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA298
@@ -0,0 +1,89 @@
+$NetBSD: patch-XSA298,v 1.2.2.2 2019/11/16 22:10:07 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PV: check GDT/LDT limits during emulation
+
+Accesses beyond the LDT limit originating from emulation would trigger
+the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
+accesses would cause an attempt to promote the touched page (offset from
+the present LDT base address) to a segment descriptor one. If this
+happens to succeed, guest user mode would be able to elevate its
+privileges to that of the guest kernel. This is particularly easy when
+there's no LDT at all, in which case the LDT base stored internally to
+Xen is simply zero.
+
+Also adjust the ASSERT() that was triggering: It was off by one to
+begin with, and for production builds we also better use
+ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.
+
+This is XSA-298.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
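
As an illustration (not part of the pulled-up patch): the fix hinges on how a selector encodes which descriptor table it refers to and which slot it indexes. A minimal standalone sketch of that arithmetic, with ldt_ents/gdt_ents passed as plain parameters rather than read from the vcpu, and a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the limit check the patch introduces; the parameters stand in
 * for v->arch.pv_vcpu.ldt_ents / gdt_ents. */
static bool gate_sel_within_limit(uint16_t gate_sel, unsigned int ldt_ents,
                                  unsigned int gdt_ents, bool is_64bit)
{
    /* Bit 2 of a selector chooses LDT (1) vs GDT (0); bits 3..15 index it. */
    unsigned int limit = (gate_sel & 4) ? ldt_ents : gdt_ents;

    /* 64-bit call gates occupy two consecutive descriptor slots, so the
     * second slot must also lie below the table limit. */
    return (gate_sel >> 3) + (is_64bit ? 1 : 0) < limit;
}
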
+--- xen/arch/x86/pv/emul-gate-op.c.orig
++++ xen/arch/x86/pv/emul-gate-op.c
+@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
+ const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel);
+
+ if ( (gate_sel < 4) ||
+- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
++ /*
++ * We're interested in call gates only, which occupy a single
++ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
++ */
++ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
++ (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
++ : v->arch.pv_vcpu.gdt_ents)) ||
+ __get_user(desc, pdesc) )
+ return 0;
+
+@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
+ if ( !is_pv_32bit_vcpu(v) )
+ {
+ if ( (*ar & 0x1f00) != 0x0c00 ||
+- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
++ /* Limit check done above already. */
+ __get_user(desc, pdesc + 1) ||
+ (desc.b & 0x1f00) )
+ return 0;
+--- xen/arch/x86/pv/emulate.c.orig
++++ xen/arch/x86/pv/emulate.c
+@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
+ {
+ struct desc_struct desc;
+
+- if ( sel < 4)
++ if ( sel < 4 ||
++ /*
++ * Don't apply the GDT limit here, as the selector may be a Xen
++ * provided one. __get_user() will fail (without taking further
++ * action) for ones falling in the gap between guest populated
++ * and Xen ones.
++ */
++ ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
+ desc.b = desc.a = 0;
+ else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
+ return 0;
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
+ BUG_ON(unlikely(in_irq()));
+
+ /*
+- * Hardware limit checking should guarantee this property. NB. This is
++ * Prior limit checking should guarantee this property. NB. This is
+ * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
+ * current vcpu, and vcpu_reset() will block until this vcpu has been
+ * descheduled before continuing.
+ */
+- ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
++ if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
++ {
++ ASSERT_UNREACHABLE();
++ return false;
++ }
+
+ if ( is_pv_32bit_domain(currd) )
+ linear = (uint32_t)linear;
diff --git a/sysutils/xenkernel411/patches/patch-XSA299 b/sysutils/xenkernel411/patches/patch-XSA299
new file mode 100644
index 00000000000..ea92bb68aad
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA299
@@ -0,0 +1,2413 @@
+$NetBSD: patch-XSA299,v 1.1.2.2 2019/11/16 22:10:07 bsiegert Exp $
+
+From 852df269d247e177d5f2e9b8f3a4301a6fdd76bd Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry
+
+On detection of a potential L1TF issue, most validation code returns
+-ERESTART to allow the switch to shadow mode to happen and cause the
+original operation to be restarted.
+
+However, in the validation code, the return value -ERESTART has been
+repurposed to indicate 1) the function has partially completed
+something which needs to be undone, and 2) calling put_page_type()
+should cleanly undo it. This causes problems in several places.
+
+For L1 tables, on receiving an -ERESTART return from alloc_l1_table(),
+alloc_page_type() will set PGT_partial on the page. If for some
+reason the original operation never restarts, then on domain
+destruction, relinquish_memory() will call free_page_type() on the
+page.
+
+Unfortunately, alloc_ and free_l1_table() aren't set up to deal with
+PGT_partial. When returning a failure, alloc_l1_table() always
+de-validates whatever it's validated so far, and free_l1_table()
+always devalidates the whole page. This means that if
+relinquish_memory() calls free_page_type() on an L1 that didn't
+complete due to an L1TF, it will call put_page_from_l1e() on "page
+entries" that have never been validated.
+
+For L2+ tables, setting rc to ERESTART causes the rest of the
+alloc_lN_table() function to *think* that the entry in question will
+have PGT_partial set. This will cause it to set partial_pte = 1. If
+relinquish_memory() then calls free_page_type() on one of those pages,
+then free_lN_table() will call put_page_from_lNe() on the entry when
+it shouldn't.
+
+Rather than indicating -ERESTART, indicate -EINTR. This is the code
+to indicate that nothing has changed from when you started the call
+(which is effectively how alloc_l1_table() handles errors).
+
+mod_lN_entry() shouldn't have any of these types of problems, so leave
+potential changes there for a clean-up patch later.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
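
As an aside (illustration only, not part of the pulled-up patch): the key distinction drawn above is what each error code obliges the caller to do afterwards. A small sketch, using illustrative error values (Xen defines its own numbering) and a hypothetical helper name:

#include <stdbool.h>

/* Illustrative values only; Xen's errno numbering differs. */
#define SKETCH_EINTR    4
#define SKETCH_ERESTART 85

/* After a validation attempt, does the page carry partial state that
 * put_page_type() must later be able to unwind? */
static bool leaves_partial_state(int rc)
{
    if ( rc == -SKETCH_ERESTART )
        return true;   /* PGT_partial set; partial work must be undone */
    /* 0 means fully validated; -SKETCH_EINTR means nothing changed and the
     * operation can simply be retried; any other error means the function
     * has already torn down whatever it validated. */
    return false;
}
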
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e6a4cb28f8..8ced185b49 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1110,7 +1110,7 @@ get_page_from_l2e(
+ int rc;
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+- return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
++ return pv_l1tf_check_l2e(d, l2e) ? -EINTR : 1;
+
+ if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
+ {
+@@ -1142,7 +1142,7 @@ get_page_from_l3e(
+ int rc;
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+- return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
++ return pv_l1tf_check_l3e(d, l3e) ? -EINTR : 1;
+
+ if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
+ {
+@@ -1175,7 +1175,7 @@ get_page_from_l4e(
+ int rc;
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+- return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
++ return pv_l1tf_check_l4e(d, l4e) ? -EINTR : 1;
+
+ if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
+ {
+@@ -1404,7 +1404,7 @@ static int alloc_l1_table(struct page_info *page)
+ {
+ if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
+ {
+- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
++ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0;
+ if ( ret )
+ goto out;
+ }
+--
+2.23.0
+
+From 6bdddd7980eac0cc883945d823986f24682ca47a Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially
+ de-validated page
+
+When unpinning pagetables, if an operation is interrupted,
+relinquish_memory() re-sets PGT_pinned so that the un-pin will
+be picked up again when the hypercall restarts.
+
+This is appropriate when put_page_and_type_preemptible() returns
+-EINTR, which indicates that the page is back in its initial state
+(i.e., completely validated). However, for -ERESTART, this leads to a
+state where a page has both PGT_pinned and PGT_partial set.
+
+This happens to work at the moment, although it's not really a
+"canonical" state; but in subsequent patches, where we need to make a
+distinction in handling between PGT_validated and PGT_partial pages,
+this causes issues.
+
+Move to a "canonical" state by:
+- Only re-setting PGT_pinned on -EINTR
+- Re-dropping the refcount held by PGT_pinned on -ERESTART
+
+In the latter case, the PGT_partial bit will be cleared further down
+with the rest of the other PGT_partial pages.
+
+While here, clean up some trailing whitespace.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++---
+ 1 file changed, 28 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 29f892c04c..8fbecbb169 100644
+--- xen/arch/x86/domain.c.orig
++++ xen/arch/x86/domain.c
+@@ -112,7 +112,7 @@ static void play_dead(void)
+ * this case, heap corruption or #PF can occur (when heap debugging is
+ * enabled). For example, even printk() can involve tasklet scheduling,
+ * which touches per-cpu vars.
+- *
++ *
+ * Consider very carefully when adding code to *dead_idle. Most hypervisor
+ * subsystems are unsafe to call.
+ */
+@@ -1838,9 +1838,34 @@ static int relinquish_memory(
+ break;
+ case -ERESTART:
+ case -EINTR:
++ /*
++ * -EINTR means PGT_validated has been re-set; re-set
++ * PGT_pinned again so that it gets picked up next time
++ * around.
++ *
++ * -ERESTART, OTOH, means PGT_partial is set instead. Put
++ * it back on the list, but don't set PGT_pinned; the
++ * section below will finish off de-validation. But we do
++ * need to drop the general ref associated with
++ * PGT_pinned, since put_page_and_type_preemptible()
++ * didn't do it.
++ *
++ * NB we can do an ASSERT for PGT_validated, since we
++ * "own" the type ref; but theoretically, the PGT_partial
++ * could be cleared by someone else.
++ */
++ if ( ret == -EINTR )
++ {
++ ASSERT(page->u.inuse.type_info & PGT_validated);
++ set_bit(_PGT_pinned, &page->u.inuse.type_info);
++ }
++ else
++ put_page(page);
++
+ ret = -ERESTART;
++
++ /* Put the page back on the list and drop the ref we grabbed above */
+ page_list_add(page, list);
+- set_bit(_PGT_pinned, &page->u.inuse.type_info);
+ put_page(page);
+ goto out;
+ default:
+@@ -2062,7 +2087,7 @@ void vcpu_kick(struct vcpu *v)
+ * pending flag. These values may fluctuate (after all, we hold no
+ * locks) but the key insight is that each change will cause
+ * evtchn_upcall_pending to be polled.
+- *
++ *
+ * NB2. We save the running flag across the unblock to avoid a needless
+ * IPI for domains that we IPI'd to unblock.
+ */
+--
+2.23.0
+
+From 7c0a37005f52d10903ce22851b52ae9b6f4f0ee2 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into
+ individual flags
+
+At the moment, partial_pte is a tri-state that contains two distinct bits
+of information:
+
+1. If zero, the pte at index [nr_validated_ptes] is un-validated. If
+ non-zero, the pte was last seen with PGT_partial set.
+
+2. If positive, the pte at index [nr_validated_ptes] does not hold a
+ general reference count. If negative, it does.
+
+To make future patches more clear, separate out this functionality
+into two distinct, named bits: PTF_partial_set (for #1) and
+PTF_partial_general_ref (for #2).
+
+Additionally, a number of functions which need this information also
+take other flags to control behavior (such as `preemptible` and
+`defer`). These are hard to read in the caller (since you only see
+'true' or 'false'), and ugly when many are added together. In
+preparation for adding yet another flag in a future patch, collapse
+all of these into a single `flag` variable.
+
+NB that this does mean checking for what was previously the '-1'
+condition a bit more ugly in the put_page_from_lNe functions (since
+you have to check for both partial_set and general ref); but this
+clause will go away in a future patch.
+
+Also note that the original comment had an off-by-one error:
+partial_flags (like partial_pte before it) concerns
+plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].
+
+No functional change intended.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 164 +++++++++++++++++++++++----------------
+ xen/include/asm-x86/mm.h | 41 ++++++----
+ 2 files changed, 127 insertions(+), 78 deletions(-)
+
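
As an illustration (not part of the pulled-up patch): the old tri-state encoding maps onto the two new flags exactly as described above. A hypothetical translation helper, reusing the flag values the patch defines in mm.c:

#define PTF_partial_set         (1 << 0)
#define PTF_partial_general_ref (1 << 1)

/* Old encoding: 0  -> entry not validated;
 *               >0 -> last seen with PGT_partial set, no general ref held;
 *               <0 -> last seen with PGT_partial set, general ref held.   */
static unsigned int partial_pte_to_flags(int partial_pte)
{
    if ( partial_pte == 0 )
        return 0;
    if ( partial_pte > 0 )
        return PTF_partial_set;
    return PTF_partial_set | PTF_partial_general_ref;
}
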
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 8ced185b49..1c4f54e328 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -610,20 +610,34 @@ static int alloc_segdesc_page(struct page_info *page)
+ static int _get_page_type(struct page_info *page, unsigned long type,
+ bool preemptible);
+
++/*
++ * The following flags are used to specify behavior of various get and
++ * put commands. The first two are also stored in page->partial_flags
++ * to indicate the state of the page pointed to by
++ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
++ * more information.
++ */
++#define PTF_partial_set (1 << 0)
++#define PTF_partial_general_ref (1 << 1)
++#define PTF_preemptible (1 << 2)
++#define PTF_defer (1 << 3)
++
+ static int get_page_and_type_from_mfn(
+ mfn_t mfn, unsigned long type, struct domain *d,
+- int partial, int preemptible)
++ unsigned int flags)
+ {
+ struct page_info *page = mfn_to_page(mfn);
+ int rc;
++ bool preemptible = flags & PTF_preemptible,
++ partial_ref = flags & PTF_partial_general_ref;
+
+- if ( likely(partial >= 0) &&
++ if ( likely(!partial_ref) &&
+ unlikely(!get_page_from_mfn(mfn, d)) )
+ return -EINVAL;
+
+ rc = _get_page_type(page, type, preemptible);
+
+- if ( unlikely(rc) && partial >= 0 &&
++ if ( unlikely(rc) && !partial_ref &&
+ (!preemptible || page != current->arch.old_guest_table) )
+ put_page(page);
+
+@@ -1104,7 +1118,7 @@ get_page_from_l1e(
+ define_get_linear_pagetable(l2);
+ static int
+ get_page_from_l2e(
+- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
++ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
+ {
+ unsigned long mfn = l2e_get_pfn(l2e);
+ int rc;
+@@ -1119,8 +1133,9 @@ get_page_from_l2e(
+ return -EINVAL;
+ }
+
+- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
+- partial, false);
++ ASSERT(!(flags & PTF_preemptible));
++
++ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;
+
+@@ -1137,7 +1152,7 @@ get_page_from_l2e(
+ define_get_linear_pagetable(l3);
+ static int
+ get_page_from_l3e(
+- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
++ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
+ {
+ int rc;
+
+@@ -1152,7 +1167,7 @@ get_page_from_l3e(
+ }
+
+ rc = get_page_and_type_from_mfn(
+- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
++ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
+ if ( unlikely(rc == -EINVAL) &&
+ !is_pv_32bit_domain(d) &&
+ get_l3_linear_pagetable(l3e, pfn, d) )
+@@ -1170,7 +1185,7 @@ get_page_from_l3e(
+ define_get_linear_pagetable(l4);
+ static int
+ get_page_from_l4e(
+- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
++ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
+ {
+ int rc;
+
+@@ -1185,7 +1200,7 @@ get_page_from_l4e(
+ }
+
+ rc = get_page_and_type_from_mfn(
+- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
++ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
+ if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+ rc = 0;
+
+@@ -1275,7 +1290,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+- int partial, bool defer)
++ unsigned int flags)
+ {
+ int rc = 0;
+
+@@ -1295,12 +1310,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ struct page_info *pg = l2e_get_page(l2e);
+ struct page_info *ptpg = mfn_to_page(_mfn(pfn));
+
+- if ( unlikely(partial > 0) )
++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
++ PTF_partial_set )
+ {
+- ASSERT(!defer);
++ ASSERT(!(flags & PTF_defer));
+ rc = _put_page_type(pg, true, ptpg);
+ }
+- else if ( defer )
++ else if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = ptpg;
+ current->arch.old_guest_table = pg;
+@@ -1317,7 +1333,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ }
+
+ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+- int partial, bool defer)
++ unsigned int flags)
+ {
+ struct page_info *pg;
+ int rc;
+@@ -1340,13 +1356,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+
+ pg = l3e_get_page(l3e);
+
+- if ( unlikely(partial > 0) )
++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
++ PTF_partial_set )
+ {
+- ASSERT(!defer);
++ ASSERT(!(flags & PTF_defer));
+ return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ }
+
+- if ( defer )
++ if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
+@@ -1361,7 +1378,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ }
+
+ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+- int partial, bool defer)
++ unsigned int flags)
+ {
+ int rc = 1;
+
+@@ -1370,13 +1387,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ {
+ struct page_info *pg = l4e_get_page(l4e);
+
+- if ( unlikely(partial > 0) )
++ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
++ PTF_partial_set )
+ {
+- ASSERT(!defer);
++ ASSERT(!(flags & PTF_defer));
+ return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
+ }
+
+- if ( defer )
++ if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
+@@ -1483,12 +1501,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+ unsigned int i;
+- int rc = 0, partial = page->partial_pte;
++ int rc = 0;
++ unsigned int partial_flags = page->partial_flags;
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
+- i++, partial = 0 )
++ i++, partial_flags = 0 )
+ {
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+ {
+@@ -1498,18 +1517,19 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ }
+
+ if ( !is_guest_l2_slot(d, type, i) ||
+- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
++ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: 1;
++ /* Set 'set', retain 'general ref' */
++ page->partial_flags = partial_flags | PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ rc = -ERESTART;
+ }
+ else if ( rc < 0 && rc != -EINTR )
+@@ -1518,7 +1538,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+@@ -1542,7 +1562,8 @@ static int alloc_l3_table(struct page_info *page)
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l3_pgentry_t *pl3e;
+ unsigned int i;
+- int rc = 0, partial = page->partial_pte;
++ int rc = 0;
++ unsigned int partial_flags = page->partial_flags;
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+@@ -1557,7 +1578,7 @@ static int alloc_l3_table(struct page_info *page)
+ memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
+
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+- i++, partial = 0 )
++ i++, partial_flags = 0 )
+ {
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+ {
+@@ -1574,20 +1595,22 @@ static int alloc_l3_table(struct page_info *page)
+ else
+ rc = get_page_and_type_from_mfn(
+ l3e_get_mfn(pl3e[i]),
+- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
++ PGT_l2_page_table | PGT_pae_xen_l2, d,
++ partial_flags | PTF_preemptible);
+ }
+- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
++ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: 1;
++ /* Set 'set', leave 'general ref' set if this entry was set */
++ page->partial_flags = partial_flags | PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ rc = -ERESTART;
+ }
+ if ( rc < 0 )
+@@ -1604,7 +1627,7 @@ static int alloc_l3_table(struct page_info *page)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+@@ -1736,19 +1759,21 @@ static int alloc_l4_table(struct page_info *page)
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
+ unsigned int i;
+- int rc = 0, partial = page->partial_pte;
++ int rc = 0;
++ unsigned int partial_flags = page->partial_flags;
+
+ for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+- i++, partial = 0 )
++ i++, partial_flags = 0 )
+ {
+ if ( !is_guest_l4_slot(d, i) ||
+- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
++ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: 1;
++ /* Set 'set', leave 'general ref' set if this entry was set */
++ page->partial_flags = partial_flags | PTF_partial_set;
+ }
+ else if ( rc < 0 )
+ {
+@@ -1758,7 +1783,7 @@ static int alloc_l4_table(struct page_info *page)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ else
+@@ -1811,19 +1836,20 @@ static int free_l2_table(struct page_info *page)
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l2_pgentry_t *pl2e;
+- int rc = 0, partial = page->partial_pte;
+- unsigned int i = page->nr_validated_ptes - !partial;
++ int rc = 0;
++ unsigned int partial_flags = page->partial_flags,
++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
+
+ pl2e = map_domain_page(_mfn(pfn));
+
+ for ( ; ; )
+ {
+ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
+- rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
++ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
+ if ( rc < 0 )
+ break;
+
+- partial = 0;
++ partial_flags = 0;
+
+ if ( !i-- )
+ break;
+@@ -1845,12 +1871,14 @@ static int free_l2_table(struct page_info *page)
+ else if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: -1;
++ page->partial_flags = (partial_flags & PTF_partial_set) ?
++ partial_flags :
++ (PTF_partial_set | PTF_partial_general_ref);
+ }
+ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ rc = -ERESTART;
+ }
+
+@@ -1862,18 +1890,19 @@ static int free_l3_table(struct page_info *page)
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l3_pgentry_t *pl3e;
+- int rc = 0, partial = page->partial_pte;
+- unsigned int i = page->nr_validated_ptes - !partial;
++ int rc = 0;
++ unsigned int partial_flags = page->partial_flags,
++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+ for ( ; ; )
+ {
+- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
++ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
+ if ( rc < 0 )
+ break;
+
+- partial = 0;
++ partial_flags = 0;
+ if ( rc == 0 )
+ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+
+@@ -1892,12 +1921,14 @@ static int free_l3_table(struct page_info *page)
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: -1;
++ page->partial_flags = (partial_flags & PTF_partial_set) ?
++ partial_flags :
++ (PTF_partial_set | PTF_partial_general_ref);
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ rc = -ERESTART;
+ }
+ return rc > 0 ? 0 : rc;
+@@ -1908,26 +1939,29 @@ static int free_l4_table(struct page_info *page)
+ struct domain *d = page_get_owner(page);
+ unsigned long pfn = mfn_x(page_to_mfn(page));
+ l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
+- int rc = 0, partial = page->partial_pte;
+- unsigned int i = page->nr_validated_ptes - !partial;
++ int rc = 0;
++ unsigned partial_flags = page->partial_flags,
++ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
+
+ do {
+ if ( is_guest_l4_slot(d, i) )
+- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
++ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
+ if ( rc < 0 )
+ break;
+- partial = 0;
++ partial_flags = 0;
+ } while ( i-- );
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_pte = partial ?: -1;
++ page->partial_flags = (partial_flags & PTF_partial_set) ?
++ partial_flags :
++ (PTF_partial_set | PTF_partial_general_ref);
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+ page->nr_validated_ptes = i + 1;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ rc = -ERESTART;
+ }
+
+@@ -2203,7 +2237,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
+ return -EBUSY;
+ }
+
+- put_page_from_l2e(ol2e, pfn, 0, true);
++ put_page_from_l2e(ol2e, pfn, PTF_defer);
+
+ return rc;
+ }
+@@ -2271,7 +2305,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
+ if ( !create_pae_xen_mappings(d, pl3e) )
+ BUG();
+
+- put_page_from_l3e(ol3e, pfn, 0, 1);
++ put_page_from_l3e(ol3e, pfn, PTF_defer);
+ return rc;
+ }
+
+@@ -2334,7 +2368,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+ return -EFAULT;
+ }
+
+- put_page_from_l4e(ol4e, pfn, 0, 1);
++ put_page_from_l4e(ol4e, pfn, PTF_defer);
+ return rc;
+ }
+
+@@ -2598,7 +2632,7 @@ int free_page_type(struct page_info *page, unsigned long type,
+ if ( !(type & PGT_partial) )
+ {
+ page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ }
+
+ switch ( type & PGT_type_mask )
+@@ -2889,7 +2923,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ if ( !(x & PGT_partial) )
+ {
+ page->nr_validated_ptes = 0;
+- page->partial_pte = 0;
++ page->partial_flags = 0;
+ }
+ page->linear_pt_count = 0;
+ rc = alloc_page_type(page, type, preemptible);
+@@ -3064,7 +3098,7 @@ int new_guest_cr3(mfn_t mfn)
+ return 0;
+ }
+
+- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
++ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible);
+ switch ( rc )
+ {
+ case 0:
+@@ -3452,7 +3486,7 @@ long do_mmuext_op(
+ if ( op.arg1.mfn != 0 )
+ {
+ rc = get_page_and_type_from_mfn(
+- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
++ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible);
+
+ if ( unlikely(rc) )
+ {
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 1ea173c555..46cba52941 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -228,19 +228,34 @@ struct page_info
+ * setting the flag must not drop that reference, whereas the instance
+ * clearing it will have to.
+ *
+- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+- * been partially validated. This implies that the general reference
+- * to the page (acquired from get_page_from_lNe()) would be dropped
+- * (again due to the apparent failure) and hence must be re-acquired
+- * when resuming the validation, but must not be dropped when picking
+- * up the page for invalidation.
++ * If partial_flags & PTF_partial_set is set, then the page at
++ * at @nr_validated_ptes had PGT_partial set as a result of an
++ * operation on the current page. (That page may or may not
++ * still have PGT_partial set.)
+ *
+- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+- * been partially invalidated. This is basically the opposite case of
+- * above, i.e. the general reference to the page was not dropped in
+- * put_page_from_lNe() (due to the apparent failure), and hence it
+- * must be dropped when the put operation is resumed (and completes),
+- * but it must not be acquired if picking up the page for validation.
++ * If PTF_partial_general_ref is set, then the PTE at
++ * @nr_validated_ptes holds a general reference count for the
++ * page.
++ *
++ * This happens:
++ * - During de-validation, if de-validation of the page was
++ * interrupted
++ * - During validation, if an invalid entry is encountered and
++ * validation is preemptible
++ * - During validation, if PTF_partial_general_ref was set on
++ * this entry to begin with (perhaps because we're picking
++ * up from a partial de-validation).
++ *
++ * When resuming validation, if PTF_partial_general_ref is clear,
++ * then a general reference must be re-acquired; if it is set, no
++ * reference should be acquired.
++ *
++ * When resuming de-validation, if PTF_partial_general_ref is
++ * clear, no reference should be dropped; if it is set, a
++ * reference should be dropped.
++ *
++ * NB that PTF_partial_set and PTF_partial_general_ref are
++ * defined in mm.c, the only place where they are used.
+ *
+ * The 3rd field, @linear_pt_count, indicates
+ * - by a positive value, how many same-level page table entries a page
+@@ -251,7 +266,7 @@ struct page_info
+ struct {
+ u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+ u16 :16 - PAGETABLE_ORDER - 1 - 2;
+- s16 partial_pte:2;
++ u16 partial_flags:2;
+ s16 linear_pt_count;
+ };
+
+--
+2.23.0
+
+From 20b8a6702c6839bafd252789396b443d4b5c5474 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a
+ boolean
+
+This is in mainly in preparation for _put_page_type taking the
+partial_flags value in the future. It also makes it easier to read in
+the caller (since you see a flag name rather than `true` or `false`).
+
+No functional change intended.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 25 +++++++++++++------------
+ 1 file changed, 13 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 1c4f54e328..e2fba15d86 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1207,7 +1207,7 @@ get_page_from_l4e(
+ return rc;
+ }
+
+-static int _put_page_type(struct page_info *page, bool preemptible,
++static int _put_page_type(struct page_info *page, unsigned int flags,
+ struct page_info *ptpg);
+
+ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
+@@ -1314,7 +1314,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ PTF_partial_set )
+ {
+ ASSERT(!(flags & PTF_defer));
+- rc = _put_page_type(pg, true, ptpg);
++ rc = _put_page_type(pg, PTF_preemptible, ptpg);
+ }
+ else if ( flags & PTF_defer )
+ {
+@@ -1323,7 +1323,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ }
+ else
+ {
+- rc = _put_page_type(pg, true, ptpg);
++ rc = _put_page_type(pg, PTF_preemptible, ptpg);
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -1360,7 +1360,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ PTF_partial_set )
+ {
+ ASSERT(!(flags & PTF_defer));
+- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
++ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ }
+
+ if ( flags & PTF_defer )
+@@ -1370,7 +1370,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ return 0;
+ }
+
+- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+
+@@ -1391,7 +1391,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ PTF_partial_set )
+ {
+ ASSERT(!(flags & PTF_defer));
+- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
++ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ }
+
+ if ( flags & PTF_defer )
+@@ -1401,7 +1401,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ return 0;
+ }
+
+- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -2701,10 +2701,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type,
+ }
+
+
+-static int _put_page_type(struct page_info *page, bool preemptible,
++static int _put_page_type(struct page_info *page, unsigned int flags,
+ struct page_info *ptpg)
+ {
+ unsigned long nx, x, y = page->u.inuse.type_info;
++ bool preemptible = flags & PTF_preemptible;
+
+ ASSERT(current_locked_page_ne_check(page));
+
+@@ -2911,7 +2912,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ if ( unlikely(iommu_ret) )
+ {
+- _put_page_type(page, false, NULL);
++ _put_page_type(page, 0, NULL);
+ rc = iommu_ret;
+ goto out;
+ }
+@@ -2938,7 +2939,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ void put_page_type(struct page_info *page)
+ {
+- int rc = _put_page_type(page, false, NULL);
++ int rc = _put_page_type(page, 0, NULL);
+ ASSERT(rc == 0);
+ (void)rc;
+ }
+@@ -2955,7 +2956,7 @@ int get_page_type(struct page_info *page, unsigned long type)
+
+ int put_page_type_preemptible(struct page_info *page)
+ {
+- return _put_page_type(page, true, NULL);
++ return _put_page_type(page, PTF_preemptible, NULL);
+ }
+
+ int get_page_type_preemptible(struct page_info *page, unsigned long type)
+@@ -2972,7 +2973,7 @@ int put_old_guest_table(struct vcpu *v)
+ if ( !v->arch.old_guest_table )
+ return 0;
+
+- switch ( rc = _put_page_type(v->arch.old_guest_table, true,
++ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
+ v->arch.old_guest_ptpg) )
+ {
+ case -EINTR:
+--
+2.23.0
+
+From 7b3f9f9a797459902bebba962e31be5cbfe7b515 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional
+
+Make it easier to read by declaring the conditions in which we will
+retain the ref, rather than the conditions under which we release it.
+
+The only way (page == current->arch.old_guest_table) can be true is if
+preemptible is true; so remove this from the query itself, and add an
+ASSERT() to that effect on the opposite path.
+
+No functional change intended.
+
+NB that alloc_lN_table() mishandle the "linear pt failure" situation
+described in the comment; this will be addressed in a future patch.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 37 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e2fba15d86..eaf7b14245 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -637,8 +637,43 @@ static int get_page_and_type_from_mfn(
+
+ rc = _get_page_type(page, type, preemptible);
+
+- if ( unlikely(rc) && !partial_ref &&
+- (!preemptible || page != current->arch.old_guest_table) )
++ /*
++ * Retain the refcount if:
++ * - page is fully validated (rc == 0)
++ * - page is not validated (rc < 0) but:
++ * - We came in with a reference (partial_ref)
++ * - page is partially validated but there's been an error
++ * (page == current->arch.old_guest_table)
++ *
++ * The partial_ref-on-error clause is worth an explanation. There
++ * are two scenarios where partial_ref might be true coming in:
++ * - mfn has been partially demoted as type `type`; i.e. has
++ * PGT_partial set
++ * - mfn has been partially demoted as L(type+1) (i.e., a linear
++ * page; e.g. we're being called from get_page_from_l2e with
++ * type == PGT_l1_table, but the mfn is PGT_l2_table)
++ *
++ * If there's an error, in the first case, _get_page_type will
++ * either return -ERESTART, in which case we want to retain the
++ * ref (as the caller will consider it retained), or -EINVAL, in
++ * which case old_guest_table will be set; in both cases, we need
++ * to retain the ref.
++ *
++ * In the second case, if there's an error, _get_page_type() can
++ * *only* return -EINVAL, and *never* set old_guest_table. In
++ * that case we also want to retain the reference, to allow the
++ * page to continue to be torn down (i.e., PGT_partial cleared)
++ * safely.
++ *
++ * Also note that we shouldn't be able to leave with the reference
++ * count retained unless we succeeded, or the operation was
++ * preemptible.
++ */
++ if ( likely(!rc) || partial_ref )
++ /* nothing */;
++ else if ( page == current->arch.old_guest_table )
++ ASSERT(preemptible);
++ else
+ put_page(page);
+
+ return rc;
+--
+2.23.0
+
+From d28893777be56ef51562ed32502377974f738fd3 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when
+ preempting
+
+In order to allow recursive pagetable promotions and demotions to be
+interrupted, Xen must keep track of the state of the sub-pages
+promoted or demoted. This is stored in two elements in the page
+struct: nr_entries_validated and partial_flags.
+
+The rule is that entries [0, nr_entries_validated) should always be
+validated and hold a general reference count. If partial_flags is
+zero, then [nr_entries_validated] is not validated and no reference
+count is held. If PTF_partial_set is set, then [nr_entries_validated]
+is partially validated.
+
+At the moment, a distinction is made between promotion and demotion
+with regard to whether the entry itself "holds" a general reference
+count: when entry promotion is interrupted (i.e., returns -ERESTART),
+the entry is not considered to hold a reference; when entry demotion
+is interrupted, the entry is still considered to hold a general
+reference.
+
+PTF_partial_general_ref is used to distinguish between these cases.
+If clear, it's a partial promotion => no general reference count held
+by the entry; if set, it's partial demotion, so a general reference
+count held. Because promotions and demotions can be interleaved, this
+value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
+to be able to properly handle reference counts.
+
+Unfortunately, when alloc_l[23]_table check hypercall_preempt_check()
+and return -ERESTART, they set nr_entries_validated, but don't clear
+partial_flags.
+
+If we were picking up from a previously-interrupted promotion, that
+means that PTF_partial_set would be set even though
+[nr_entries_validated] was not partially validated. This means that
+if the page in this state were de-validated, put_page_type() would
+erroneously be called on that entry.
+
+Perhaps worse, if we were racing with a de-validation, then we might
+leave both PTF_partial_set and PTF_partial_general_ref; and when
+de-validation picked up again, both the type and the general ref would
+be erroneously dropped from [nr_entries_validated].
+
+In a sense, the real issue here is code duplication. Rather than
+duplicate the interruption code, set rc to -EINTR and fall through to
+the code which already handles that case correctly.
+
+Given the logic at this point, it should be impossible for
+partial_flags to be non-zero; add an ASSERT() to catch any changes.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 18 ++++--------------
+ 1 file changed, 4 insertions(+), 14 deletions(-)
+
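
As an illustration (not part of the pulled-up patch): the validation invariant described above, restated in code. The struct and helper are hypothetical, while the field and flag names follow the rest of this series:

#include <stdbool.h>

#define PTF_partial_set         (1 << 0)
#define PTF_partial_general_ref (1 << 1)

/* Hypothetical mirror of the relevant page_info fields. */
struct pt_progress {
    unsigned int nr_validated_ptes;   /* entries [0, n) are fully validated */
    unsigned int partial_flags;       /* state of entry [n]                 */
};

/* Does entry i currently hold a general reference, per the rule above? */
static bool entry_holds_general_ref(const struct pt_progress *p, unsigned int i)
{
    if ( i < p->nr_validated_ptes )
        return true;                           /* validated => holds a ref */
    if ( i == p->nr_validated_ptes )
        return p->partial_flags & PTF_partial_general_ref;
    return false;                              /* not reached yet          */
}
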
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index eaf7b14245..053465cb7c 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1545,13 +1545,8 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ i++, partial_flags = 0 )
+ {
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+- {
+- page->nr_validated_ptes = i;
+- rc = -ERESTART;
+- break;
+- }
+-
+- if ( !is_guest_l2_slot(d, type, i) ||
++ rc = -EINTR;
++ else if ( !is_guest_l2_slot(d, type, i) ||
+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
+ continue;
+
+@@ -1616,13 +1611,8 @@ static int alloc_l3_table(struct page_info *page)
+ i++, partial_flags = 0 )
+ {
+ if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
+- {
+- page->nr_validated_ptes = i;
+- rc = -ERESTART;
+- break;
+- }
+-
+- if ( is_pv_32bit_domain(d) && (i == 3) )
++ rc = -EINTR;
++ else if ( is_pv_32bit_domain(d) && (i == 3) )
+ {
+ if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
+ (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+--
+2.23.0
+
+From f608a53c25806a7a4318cbe225bc5f5bbf154d69 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial
+
+In order to allow recursive pagetable promotions and demotions to be
+interrupted, Xen must keep track of the state of the sub-pages
+promoted or demoted. This is stored in two elements in the page struct:
+nr_entries_validated and partial_flags.
+
+The rule is that entries [0, nr_entries_validated) should always be
+validated and hold a general reference count. If partial_flags is
+zero, then [nr_entries_validated] is not validated and no reference
+count is held. If PTF_partial_set is set, then [nr_entries_validated]
+is partially validated.
+
+At the moment, a distinction is made between promotion and demotion
+with regard to whether the entry itself "holds" a general reference
+count: when entry promotion is interrupted (i.e., returns -ERESTART),
+the entry is not considered to hold a reference; when entry demotion
+is interrupted, the entry is still considered to hold a general
+reference.
+
+PTF_partial_general_ref is used to distinguish between these cases.
+If clear, it's a partial promotion => no general reference count held
+by the entry; if set, it's partial demotion, so a general reference
+count held. Because promotions and demotions can be interleaved, this
+value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
+to be able to properly handle reference counts.
+
+Unfortunately, because a refcount is not held, it is possible to
+engineer a situation where PTF_partial_set is set but the page in
+question has been assigned to another domain. A sketch is provided in
+the appendix.
+
+Fix this by having the parent page table entry hold a general
+reference count whenever PTF_partial_set is set. (For clarity of
+change, keep two separate flags. These will be collapsed in a
+subsequent changeset.)
+
+This has two basic implications. On the put_page_from_lNe() side,
+this means that the (partial_set && !partial_ref) case can never happen,
+and no longer needs to be special-cased.
+
+Secondly, because both flags are set together, there's no need to carry over
+existing bits from partial_pte.
+
+(NB there is still another issue with calling _put_page_type() on a
+page which had PGT_partial set; that will be handled in a subsequent
+patch.)
+
+On the get_page_and_type_from_mfn() side, we need to distinguish
+between callers which hold a reference on partial (i.e.,
+alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and
+so on): pass a flag if the type should be retained on interruption.
+
+NB that since l1 promotion can't be preempted, get_page_from_l2e
+can't return -ERESTART.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+-----
+* Appendix: Engineering PTF_partial_set while a page belongs to a
+ foreign domain
+
+Suppose A is a page which can be promoted to an l3, and B is a page
+which can be promoted to an l2, and A[x] points to B. B has
+PGC_allocated set but no other general references.
+
+V1: PIN_L3 A.
+ A is validated, B is validated.
+ A.type_count = 1 | PGT_validated | PGT_pinned
+ B.type_count = 1 | PGT_validated
+ B.count = 2 | PGC_allocated (A[x] holds a general ref)
+
+V1: UNPIN A.
+ A begins de-validation.
+ Arrange to be interrupted when i < x
+ V1->old_guest_table = A
+ V1->old_guest_table_ref_held = false
+ A.type_count = 1 | PGT_partial
+ A.nr_validated_entries = i < x
+ B.type_count = 0
+ B.count = 1 | PGC_allocated
+
+V2: MOD_L4_ENTRY to point some l4e to A.
+ Picks up re-validation of A.
+ Arrange to be interrupted halfway through B's validation
+ B.type_count = 1 | PGT_partial
+ B.count = 2 | PGC_allocated (PGT_partial holds a general ref)
+ A.type_count = 1 | PGT_partial
+ A.nr_validated_entries = x
+ A.partial_pte = PTF_partial_set
+
+V3: MOD_L3_ENTRY to point some other l3e (not in A) to B.
+ Validates B.
+ B.type_count = 1 | PGT_validated
+ B.count = 2 | PGC_allocated ("other l3e" holds a general ref)
+
+V3: MOD_L3_ENTRY to clear l3e pointing to B.
+ Devalidates B.
+ B.type_count = 0
+ B.count = 1 | PGC_allocated
+
+V3: decrease_reservation(B)
+ Clears PGC_allocated
+ B.count = 0 => B is freed
+
+B gets assigned to a different domain
+
+V1: Restarts UNPIN of A
+ put_old_guest_table(A)
+ ...
+ free_l3_table(A)
+
+Now since A.partial_flags has PTF_partial_set, free_l3_table() will
+call put_page_from_l3e() on A[x], which points to B, while B is owned
+by another domain.
+
+If A[x] held a general refcount for B on partial validation, as it does
+for partial de-validation, then B would still have a reference count of
+1 after PGC_allocated was freed; so B wouldn't be freed until after
+put_page_from_l3e() had happened on A[x].
+---
+ xen/arch/x86/mm.c | 84 +++++++++++++++++++++++-----------------
+ xen/include/asm-x86/mm.h | 15 ++++---
+ 2 files changed, 58 insertions(+), 41 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 053465cb7c..68a9e74002 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -617,10 +617,11 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
+ * more information.
+ */
+-#define PTF_partial_set (1 << 0)
+-#define PTF_partial_general_ref (1 << 1)
+-#define PTF_preemptible (1 << 2)
+-#define PTF_defer (1 << 3)
++#define PTF_partial_set (1 << 0)
++#define PTF_partial_general_ref (1 << 1)
++#define PTF_preemptible (1 << 2)
++#define PTF_defer (1 << 3)
++#define PTF_retain_ref_on_restart (1 << 4)
+
+ static int get_page_and_type_from_mfn(
+ mfn_t mfn, unsigned long type, struct domain *d,
+@@ -629,7 +630,11 @@ static int get_page_and_type_from_mfn(
+ struct page_info *page = mfn_to_page(mfn);
+ int rc;
+ bool preemptible = flags & PTF_preemptible,
+- partial_ref = flags & PTF_partial_general_ref;
++ partial_ref = flags & PTF_partial_general_ref,
++ partial_set = flags & PTF_partial_set,
++ retain_ref = flags & PTF_retain_ref_on_restart;
++
++ ASSERT(partial_ref == partial_set);
+
+ if ( likely(!partial_ref) &&
+ unlikely(!get_page_from_mfn(mfn, d)) )
+@@ -642,13 +647,15 @@ static int get_page_and_type_from_mfn(
+ * - page is fully validated (rc == 0)
+ * - page is not validated (rc < 0) but:
+ * - We came in with a reference (partial_ref)
++ * - page is partially validated (rc == -ERESTART), and the
++ * caller has asked the ref to be retained in that case
+ * - page is partially validated but there's been an error
+ * (page == current->arch.old_guest_table)
+ *
+ * The partial_ref-on-error clause is worth an explanation. There
+ * are two scenarios where partial_ref might be true coming in:
+- * - mfn has been partially demoted as type `type`; i.e. has
+- * PGT_partial set
++ * - mfn has been partially promoted / demoted as type `type`;
++ * i.e. has PGT_partial set
+ * - mfn has been partially demoted as L(type+1) (i.e., a linear
+ * page; e.g. we're being called from get_page_from_l2e with
+ * type == PGT_l1_table, but the mfn is PGT_l2_table)
+@@ -671,7 +678,8 @@ static int get_page_and_type_from_mfn(
+ */
+ if ( likely(!rc) || partial_ref )
+ /* nothing */;
+- else if ( page == current->arch.old_guest_table )
++ else if ( page == current->arch.old_guest_table ||
++ (retain_ref && rc == -ERESTART) )
+ ASSERT(preemptible);
+ else
+ put_page(page);
+@@ -1348,8 +1356,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
+ {
+- ASSERT(!(flags & PTF_defer));
+- rc = _put_page_type(pg, PTF_preemptible, ptpg);
++ /* partial_set should always imply partial_ref */
++ BUG();
+ }
+ else if ( flags & PTF_defer )
+ {
+@@ -1394,8 +1402,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
+ {
+- ASSERT(!(flags & PTF_defer));
+- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ /* partial_set should always imply partial_ref */
++ BUG();
+ }
+
+ if ( flags & PTF_defer )
+@@ -1425,8 +1433,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
+ {
+- ASSERT(!(flags & PTF_defer));
+- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ /* partial_set should always imply partial_ref */
++ BUG();
+ }
+
+ if ( flags & PTF_defer )
+@@ -1550,13 +1558,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
+ continue;
+
+- if ( rc == -ERESTART )
+- {
+- page->nr_validated_ptes = i;
+- /* Set 'set', retain 'general ref' */
+- page->partial_flags = partial_flags | PTF_partial_set;
+- }
+- else if ( rc == -EINTR && i )
++ /*
++ * It shouldn't be possible for get_page_from_l2e to return
++ * -ERESTART, since we never call this with PTF_preemptible.
++ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable
++ * entry.)
++ *
++ * NB that while on a "clean" promotion, we can never get
++ * PGT_partial. It is possible to arrange for an l2e to
++ * contain a partially-devalidated l2; but in that case, both
++ * of the following functions will fail anyway (the first
++ * because the page in question is not an l1; the second
++ * because the page is not fully validated).
++ */
++ ASSERT(rc != -ERESTART);
++
++ if ( rc == -EINTR && i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_flags = 0;
+@@ -1565,6 +1582,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ else if ( rc < 0 && rc != -EINTR )
+ {
+ gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
++ ASSERT(current->arch.old_guest_table == NULL);
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+@@ -1621,16 +1639,17 @@ static int alloc_l3_table(struct page_info *page)
+ rc = get_page_and_type_from_mfn(
+ l3e_get_mfn(pl3e[i]),
+ PGT_l2_page_table | PGT_pae_xen_l2, d,
+- partial_flags | PTF_preemptible);
++ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart);
+ }
+- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
++ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d,
++ partial_flags | PTF_retain_ref_on_restart)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+- page->partial_flags = partial_flags | PTF_partial_set;
++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+ }
+ else if ( rc == -EINTR && i )
+ {
+@@ -1791,14 +1810,15 @@ static int alloc_l4_table(struct page_info *page)
+ i++, partial_flags = 0 )
+ {
+ if ( !is_guest_l4_slot(d, i) ||
+- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
++ (rc = get_page_from_l4e(pl4e[i], pfn, d,
++ partial_flags | PTF_retain_ref_on_restart)) > 0 )
+ continue;
+
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+- page->partial_flags = partial_flags | PTF_partial_set;
++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+ }
+ else if ( rc < 0 )
+ {
+@@ -1896,9 +1916,7 @@ static int free_l2_table(struct page_info *page)
+ else if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = (partial_flags & PTF_partial_set) ?
+- partial_flags :
+- (PTF_partial_set | PTF_partial_general_ref);
++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+ }
+ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
+ {
+@@ -1946,9 +1964,7 @@ static int free_l3_table(struct page_info *page)
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = (partial_flags & PTF_partial_set) ?
+- partial_flags :
+- (PTF_partial_set | PTF_partial_general_ref);
++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+@@ -1979,9 +1995,7 @@ static int free_l4_table(struct page_info *page)
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = (partial_flags & PTF_partial_set) ?
+- partial_flags :
+- (PTF_partial_set | PTF_partial_general_ref);
++ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 46cba52941..dc9cb869dd 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -238,22 +238,25 @@ struct page_info
+ * page.
+ *
+ * This happens:
+- * - During de-validation, if de-validation of the page was
++ * - During validation or de-validation, if the operation was
+ * interrupted
+ * - During validation, if an invalid entry is encountered and
+ * validation is preemptible
+ * - During validation, if PTF_partial_general_ref was set on
+- * this entry to begin with (perhaps because we're picking
+- * up from a partial de-validation).
++ * this entry to begin with (perhaps because it picked up a
++ * previous operation)
+ *
+- * When resuming validation, if PTF_partial_general_ref is clear,
+- * then a general reference must be re-acquired; if it is set, no
+- * reference should be acquired.
++ * When resuming validation, if PTF_partial_general_ref is
++ * clear, then a general reference must be re-acquired; if it
++ * is set, no reference should be acquired.
+ *
+ * When resuming de-validation, if PTF_partial_general_ref is
+ * clear, no reference should be dropped; if it is set, a
+ * reference should be dropped.
+ *
++ * NB at the moment, PTF_partial_set should be set if and only if
++ * PTF_partial_general_ref is set.
++ *
+ * NB that PTF_partial_set and PTF_partial_general_ref are
+ * defined in mm.c, the only place where they are used.
+ *
+--
+2.23.0
+
+From 6811df7fb7a1d4bb5a75fec9cf41519b5c86c605 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and
+ PTF_partial_general_ref into one
+
+...now that they are equivalent. No functional change intended.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 50 +++++++++++-----------------------------
+ xen/include/asm-x86/mm.h | 29 +++++++++++------------
+ 2 files changed, 26 insertions(+), 53 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 68a9e74002..4970b19aff 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -612,13 +612,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ /*
+ * The following flags are used to specify behavior of various get and
+- * put commands. The first two are also stored in page->partial_flags
+- * to indicate the state of the page pointed to by
++ * put commands. The first is also stored in page->partial_flags to
++ * indicate the state of the page pointed to by
+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
+ * more information.
+ */
+ #define PTF_partial_set (1 << 0)
+-#define PTF_partial_general_ref (1 << 1)
+ #define PTF_preemptible (1 << 2)
+ #define PTF_defer (1 << 3)
+ #define PTF_retain_ref_on_restart (1 << 4)
+@@ -630,13 +629,10 @@ static int get_page_and_type_from_mfn(
+ struct page_info *page = mfn_to_page(mfn);
+ int rc;
+ bool preemptible = flags & PTF_preemptible,
+- partial_ref = flags & PTF_partial_general_ref,
+ partial_set = flags & PTF_partial_set,
+ retain_ref = flags & PTF_retain_ref_on_restart;
+
+- ASSERT(partial_ref == partial_set);
+-
+- if ( likely(!partial_ref) &&
++ if ( likely(!partial_set) &&
+ unlikely(!get_page_from_mfn(mfn, d)) )
+ return -EINVAL;
+
+@@ -646,14 +642,14 @@ static int get_page_and_type_from_mfn(
+ * Retain the refcount if:
+ * - page is fully validated (rc == 0)
+ * - page is not validated (rc < 0) but:
+- * - We came in with a reference (partial_ref)
++ * - We came in with a reference (partial_set)
+ * - page is partially validated (rc == -ERESTART), and the
+ * caller has asked the ref to be retained in that case
+ * - page is partially validated but there's been an error
+ * (page == current->arch.old_guest_table)
+ *
+- * The partial_ref-on-error clause is worth an explanation. There
+- * are two scenarios where partial_ref might be true coming in:
++ * The partial_set-on-error clause is worth an explanation. There
++ * are two scenarios where partial_set might be true coming in:
+ * - mfn has been partially promoted / demoted as type `type`;
+ * i.e. has PGT_partial set
+ * - mfn has been partially demoted as L(type+1) (i.e., a linear
+@@ -676,7 +672,7 @@ static int get_page_and_type_from_mfn(
+ * count retained unless we succeeded, or the operation was
+ * preemptible.
+ */
+- if ( likely(!rc) || partial_ref )
++ if ( likely(!rc) || partial_set )
+ /* nothing */;
+ else if ( page == current->arch.old_guest_table ||
+ (retain_ref && rc == -ERESTART) )
+@@ -1353,13 +1349,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ struct page_info *pg = l2e_get_page(l2e);
+ struct page_info *ptpg = mfn_to_page(_mfn(pfn));
+
+- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+- PTF_partial_set )
+- {
+- /* partial_set should always imply partial_ref */
+- BUG();
+- }
+- else if ( flags & PTF_defer )
++ if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = ptpg;
+ current->arch.old_guest_table = pg;
+@@ -1399,13 +1389,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+
+ pg = l3e_get_page(l3e);
+
+- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+- PTF_partial_set )
+- {
+- /* partial_set should always imply partial_ref */
+- BUG();
+- }
+-
+ if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+@@ -1430,13 +1413,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ {
+ struct page_info *pg = l4e_get_page(l4e);
+
+- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+- PTF_partial_set )
+- {
+- /* partial_set should always imply partial_ref */
+- BUG();
+- }
+-
+ if ( flags & PTF_defer )
+ {
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+@@ -1649,7 +1625,7 @@ static int alloc_l3_table(struct page_info *page)
+ {
+ page->nr_validated_ptes = i;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
++ page->partial_flags = PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i )
+ {
+@@ -1818,7 +1794,7 @@ static int alloc_l4_table(struct page_info *page)
+ {
+ page->nr_validated_ptes = i;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
++ page->partial_flags = PTF_partial_set;
+ }
+ else if ( rc < 0 )
+ {
+@@ -1916,7 +1892,7 @@ static int free_l2_table(struct page_info *page)
+ else if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
++ page->partial_flags = PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
+ {
+@@ -1964,7 +1940,7 @@ static int free_l3_table(struct page_info *page)
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
++ page->partial_flags = PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+ {
+@@ -1995,7 +1971,7 @@ static int free_l4_table(struct page_info *page)
+ if ( rc == -ERESTART )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
++ page->partial_flags = PTF_partial_set;
+ }
+ else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+ {
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index dc9cb869dd..c6ba9e4d73 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -233,7 +233,7 @@ struct page_info
+ * operation on the current page. (That page may or may not
+ * still have PGT_partial set.)
+ *
+- * If PTF_partial_general_ref is set, then the PTE at
++ * Additionally, if PTF_partial_set is set, then the PTE at
+ * @nr_validated_ptef holds a general reference count for the
+ * page.
+ *
+@@ -242,23 +242,20 @@ struct page_info
+ * interrupted
+ * - During validation, if an invalid entry is encountered and
+ * validation is preemptible
+- * - During validation, if PTF_partial_general_ref was set on
+- * this entry to begin with (perhaps because it picked up a
++ * - During validation, if PTF_partial_set was set on this
++ * entry to begin with (perhaps because it picked up a
+ * previous operation)
+ *
+- * When resuming validation, if PTF_partial_general_ref is
+- * clear, then a general reference must be re-acquired; if it
+- * is set, no reference should be acquired.
++ * When resuming validation, if PTF_partial_set is clear, then
++ * a general reference must be re-acquired; if it is set, no
++ * reference should be acquired.
+ *
+- * When resuming de-validation, if PTF_partial_general_ref is
+- * clear, no reference should be dropped; if it is set, a
+- * reference should be dropped.
++ * When resuming de-validation, if PTF_partial_set is clear,
++ * no reference should be dropped; if it is set, a reference
++ * should be dropped.
+ *
+- * NB at the moment, PTF_partial_set should be set if and only if
+- * PTF_partial_general_ref is set.
+- *
+- * NB that PTF_partial_set and PTF_partial_general_ref are
+- * defined in mm.c, the only place where they are used.
++ * NB that PTF_partial_set is defined in mm.c, the only place
++ * where it is used.
+ *
+ * The 3rd field, @linear_pt_count, indicates
+ * - by a positive value, how many same-level page table entries a page
+@@ -268,8 +265,8 @@ struct page_info
+ */
+ struct {
+ u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+- u16 :16 - PAGETABLE_ORDER - 1 - 2;
+- u16 partial_flags:2;
++ u16 :16 - PAGETABLE_ORDER - 1 - 1;
++ u16 partial_flags:1;
+ s16 linear_pt_count;
+ };
+
+--
+2.23.0
+
+From a6098b8920b02149220641cb13358e9012b5fc4d Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion
+ failures
+
+In order to allow recursive pagetable promotions and demotions to be
+interrupted, Xen must keep track of the state of the sub-pages
+promoted or demoted. This is stored in two elements in the page
+struct: nr_entries_validated and partial_flags.
+
+The rule is that entries [0, nr_entries_validated) should always be
+validated and hold a general reference count. If partial_flags is
+zero, then [nr_entries_validated] is not validated and no reference
+count is held. If PTF_partial_set is set, then [nr_entries_validated]
+is partially validated, and a general reference count is held.
+
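+The rule can be illustrated with a small self-contained model (an
+illustrative sketch only; the struct and helper below are simplified
+stand-ins for the real page_info fields, not actual Xen code):
+
+    #include <stdbool.h>
+
+    #define PTF_partial_set 1u
+
+    struct pt_page {
+        unsigned int nr_validated_ptes; /* entries [0, n) fully validated */
+        unsigned int partial_flags;     /* state of entry [n] */
+    };
+
+    /* Does entry i currently hold a general reference count? */
+    static bool entry_holds_ref(const struct pt_page *pg, unsigned int i)
+    {
+        if ( i < pg->nr_validated_ptes )
+            return true;                        /* fully validated */
+        if ( i == pg->nr_validated_ptes )
+            return pg->partial_flags & PTF_partial_set; /* partial */
+        return false;                           /* not validated */
+    }
+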
+Unfortunately, in cases where an entry began with PTF_partial_set set,
+and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is
+erroneously dropped. (This scenario can be engineered mainly by the
+use of interleaving of promoting and demoting a page which has "linear
+pagetable" entries; see the appendix for a sketch.) This means that
+we will "leak" a general reference count on the page in question,
+preventing the page from being freed.
+
+Fix this by setting page->partial_flags to the partial_flags local
+variable.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+-----
+Appendix
+
+Suppose A and B can both be promoted to L2 pages, and A[x] points to B.
+
+V1: PIN_L2 B.
+ B.type_count = 1 | PGT_validated
+ B.count = 2 | PGC_allocated
+
+V1: MOD_L3_ENTRY pointing something to A.
+ In the process of validating A[x], grab an extra type / ref on B:
+ B.type_count = 2 | PGT_validated
+ B.count = 3 | PGC_allocated
+ A.type_count = 1 | PGT_validated
+ A.count = 2 | PGC_allocated
+
+V1: UNPIN B.
+ B.type_count = 1 | PGT_validated
+ B.count = 2 | PGC_allocated
+
+V1: MOD_L3_ENTRY removing the reference to A.
+ De-validate A, down to A[x], which points to B.
+ Drop the final type on B. Arrange to be interrupted.
+ B.type_count = 1 | PGT_partial
+ B.count = 2 | PGC_allocated
+ A.type_count = 1 | PGT_partial
+ A.nr_validated_entries = x
+ A.partial_pte = -1
+
+V2: MOD_L3_ENTRY adds a reference to A.
+
+At this point, get_page_from_l2e(A[x]) tries
+get_page_and_type_from_mfn(), which fails because it's the wrong type;
+and get_l2_linear_pagetable() also fails, because B isn't validated as
+an l2 anymore.
+---
+ xen/arch/x86/mm.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 4970b19aff..cfb7538403 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1562,7 +1562,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = 0;
++ page->partial_flags = partial_flags;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+@@ -1647,7 +1647,7 @@ static int alloc_l3_table(struct page_info *page)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = 0;
++ page->partial_flags = partial_flags;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+@@ -1804,7 +1804,7 @@ static int alloc_l4_table(struct page_info *page)
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+- page->partial_flags = 0;
++ page->partial_flags = partial_flags;
+ if ( rc == -EINTR )
+ rc = -ERESTART;
+ else
+--
+2.23.0
+
+From eabd77b59f4006128501d6e15f9e620dfb349420 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:49 +0100
+Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error
+
+If an invalid entry is discovered when validating a page-table tree,
+the entire tree which has so far been validated must be de-validated.
+Since this may take a long time, alloc_l[2-4]_table() set current
+vcpu's old_guest_table immediately; put_old_guest_table() will make
+sure that put_page_type() will be called to finish off the
+de-validation before any other MMU operations can happen on the vcpu.
+
+The invariant for partial pages should be:
+
+* Entries [0, nr_validated_ptes) should be completely validated;
+ put_page_type() will de-validate these.
+
+* If [nr_validated_ptes] is partially validated, partial_flags should
+  have PTF_partial_set set. put_page_type() will be called on this page to
+ finish off devalidation, and the appropriate refcount adjustments
+ will be done.
+
+alloc_l[2-3]_table() indicates partial validation to its callers by
+setting current->old_guest_table.
+
+Unfortunately, this is mishandled.
+
+Take the case where validating lNe[x] returns an error.
+
+First, alloc_l3_table() doesn't check old_guest_table at all; as a
+result, partial_flags is not set when it should be. nr_validated_ptes
+is set to x; and since PTF_partial_set is clear, de-validation resumes at
+nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not
+have put_page_type() called on it when de-validating the rest of the
+l3: it will be stuck in the PGT_partial state until the domain is
+destroyed, or until it is re-used as an l2. (Any other page type will
+fail.)
+
+Worse, alloc_l4_table(), rather than setting PTF_partial_set as it
+should, sets nr_validated_ptes to x+1. When de-validating, since
+partial is 0, this will correctly resume calling put_page_type() at [x];
+but if put_page_type() is never called and get_page_type() is called
+instead, validation will pick up at [x+1],
+neglecting to validate [x]. If the rest of the validation succeeds,
+the l4 will be validated even though [x] is invalid.
+
+Fix this in both cases by setting PTF_partial_set if old_guest_table
+is set.
+
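+In outline, the error path in alloc_l[34]_table() then becomes (a condensed
+sketch of the hunks below, with plNe/lNe_get_page standing in for the
+level-specific array and accessor):
+
+    /* Validation of entry [i] failed, i > 0 */
+    page->nr_validated_ptes = i;
+    page->partial_flags = partial_flags;
+    if ( current->arch.old_guest_table )
+    {
+        /* Entry [i] still owns a general ref; mark it so that
+         * put_page_type() will finish cleaning it up. */
+        if ( current->arch.old_guest_table == lNe_get_page(plNe[i]) )
+            page->partial_flags = PTF_partial_set;
+        else
+            ASSERT_UNREACHABLE();
+    }
+    current->arch.old_guest_ptpg = NULL;
+    current->arch.old_guest_table = page;
+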
+While here, add some safety catches:
+- old_guest_table must point to the page contained in
+ [nr_validated_ptes].
+- alloc_l1_table() shouldn't set old_guest_table
+
+If we experience one of these situations in production builds, it's
+safer to avoid calling put_page_type for the pages in question. If
+they have PGT_partial set, they will be cleaned up on domain
+destruction; if not, we have no idea whether a type count is safe to
+drop. Retaining an extra type ref that should have been dropped may
+trigger a BUG() on the free_domain_page() path, but dropping a type
+count that shouldn't be dropped may cause a privilege escalation.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 54 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index cfb7538403..aa03cb8b40 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1561,6 +1561,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ ASSERT(current->arch.old_guest_table == NULL);
+ if ( i )
+ {
++ /*
++ * alloc_l1_table() doesn't set old_guest_table; it does
++ * its own tear-down immediately on failure. If it
++ * did we'd need to check it and set partial_flags as we
++ * do in alloc_l[34]_table().
++ *
++ * Note on the use of ASSERT: if it's non-null and
++ * hasn't been cleaned up yet, it should have
++ * PGT_partial set; and so the type will be cleaned up
++ * on domain destruction. Unfortunately, we would
++ * leak the general ref held by old_guest_table; but
++ * leaking a page is less bad than a host crash.
++ */
++ ASSERT(current->arch.old_guest_table == NULL);
+ page->nr_validated_ptes = i;
+ page->partial_flags = partial_flags;
+ current->arch.old_guest_ptpg = NULL;
+@@ -1588,6 +1602,7 @@ static int alloc_l3_table(struct page_info *page)
+ unsigned int i;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;
++ l3_pgentry_t l3e = l3e_empty();
+
+ pl3e = map_domain_page(_mfn(pfn));
+
+@@ -1634,7 +1649,11 @@ static int alloc_l3_table(struct page_info *page)
+ rc = -ERESTART;
+ }
+ if ( rc < 0 )
++ {
++ /* XSA-299 Backport: Copy l3e for checking */
++ l3e = pl3e[i];
+ break;
++ }
+
+ pl3e[i] = adjust_guest_l3e(pl3e[i], d);
+ }
+@@ -1648,6 +1667,24 @@ static int alloc_l3_table(struct page_info *page)
+ {
+ page->nr_validated_ptes = i;
+ page->partial_flags = partial_flags;
++ if ( current->arch.old_guest_table )
++ {
++ /*
++ * We've experienced a validation failure. If
++ * old_guest_table is set, "transfer" the general
++ * reference count to pl3e[nr_validated_ptes] by
++ * setting PTF_partial_set.
++ *
++ * As a precaution, check that old_guest_table is the
++ * page pointed to by pl3e[nr_validated_ptes]. If
++ * not, it's safer to leak a type ref on production
++ * builds.
++ */
++ if ( current->arch.old_guest_table == l3e_get_page(l3e) )
++ page->partial_flags = PTF_partial_set;
++ else
++ ASSERT_UNREACHABLE();
++ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+@@ -1810,7 +1847,23 @@ static int alloc_l4_table(struct page_info *page)
+ else
+ {
+ if ( current->arch.old_guest_table )
+- page->nr_validated_ptes++;
++ {
++ /*
++ * We've experienced a validation failure. If
++ * old_guest_table is set, "transfer" the general
++ * reference count to pl3e[nr_validated_ptes] by
++ * setting PTF_partial_set.
++ *
++ * As a precaution, check that old_guest_table is the
++ * page pointed to by pl4e[nr_validated_ptes]. If
++ * not, it's safer to leak a type ref on production
++ * builds.
++ */
++ if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
++ page->partial_flags = PTF_partial_set;
++ else
++ ASSERT_UNREACHABLE();
++ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
+ }
+--
+2.23.0
+
+From f0086e3ac65c8bcabb84c1c29ab00b0c8a187555 Mon Sep 17 00:00:00 2001
+From: George Dunlap <george.dunlap@citrix.com>
+Date: Thu, 10 Oct 2019 17:57:50 +0100
+Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to
+ begin with
+
+Validation and de-validation of pagetable trees may take arbitrarily
+large amounts of time, and so must be preemptible. This is indicated
+by setting the PGT_partial bit in the type_info, and setting
+nr_validated_entries and partial_flags appropriately. Specifically,
+if the entry at [nr_validated_entries] is partially validated,
+partial_flags should have the PTF_partial_set bit set, and the entry
+should hold a general reference count. During de-validation,
+put_page_type() is called on partially validated entries.
+
+Unfortunately, there are a number of issues with the current algorithm.
+
+First, doing a "normal" put_page_type() is not safe when no type ref
+is held: there is nothing to stop another vcpu from coming along and
+picking up validation again: at which point the put_page_type may drop
+the only page ref on an in-use page. Some examples are listed in the
+appendix.
+
+The core issue is that put_page_type() is being called both to clean
+up PGT_partial, and to drop a type count; and has no way of knowing
+which is which; and so if in between, PGT_partial is cleared,
+put_page_type() will drop the type ref erroneously.
+
+What is needed is to distinguish between two states:
+- Dropping a type ref which is held
+- Cleaning up a page which has been partially de/validated
+
+Fix this by telling put_page_type() which of the two activities you
+intend.
+
+When cleaning up a partial de/validation, take no action unless you
+find a page partially validated.
+
+If put_page_type() is called without PTF_partial_set, and finds the
+page in a PGT_partial state anyway, then there's certainly been a
+misaccounting somewhere, and carrying on would almost certainly cause
+a security issue, so crash the host instead.
+
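+In outline, _put_page_type() therefore starts with a check of this shape
+(condensed from the hunk below):
+
+    if ( !(flags & PTF_partial_set) )
+        /* Caller claims a full type ref: the page must be validated
+         * (or its owner dying) and must not be PGT_partial. */
+        BUG_ON((x & PGT_partial) ||
+               !((x & PGT_validated) || page_get_owner(page)->is_dying));
+    else if ( !(x & PGT_partial) )
+        return 0;   /* partial clean-up requested, nothing left to do */
+    else
+        BUG_ON((x & PGT_count_mask) != 1); /* partial page: one ref left */
+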
+In put_page_from_lNe, pass partial_flags on to _put_page_type().
+
+old_guest_table may be set either with a fully validated page (when
+using the "deferred put" pattern), or with a partially validated page
+(when a normal "de-validation" is interrupted, or when a validation
+fails part-way through due to invalid entries). Add a flag,
+old_guest_table_partial, to indicate which of these it is, and use
+that to pass the appropriate flag to _put_page_type().
+
+While here, delete stray trailing whitespace.
+
+This is part of XSA-299.
+
+Reported-by: George Dunlap <george.dunlap@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+-----
+Appendix:
+
+Suppose page A, when interpreted as an l3 pagetable, contains all
+valid entries; and suppose A[x] points to page B, which when
+interpreted as an l2 pagetable, contains all valid entries.
+
+P1: PIN_L3_TABLE
+ A -> PGT_l3_table | 1 | valid
+ B -> PGT_l2_table | 1 | valid
+
+P1: UNPIN_TABLE
+ > Arrange to interrupt after B has been de-validated
+ B:
+ type_info -> PGT_l2_table | 0
+ A:
+ type_info -> PGT_l3_table | 1 | partial
+ nr_validated_entries -> (less than x)
+
+P2: mod_l4_entry to point to A
+ > Arrange for this to be interrupted while B is being validated
+ B:
+ type_info -> PGT_l2_table | 1 | partial
+ (nr_validated_entries &c set as appropriate)
+ A:
+ type_info -> PGT_l3_table | 1 | partial
+ nr_validated_entries -> x
+ partial_pte = 1
+
+P3: mod_l3_entry some other unrelated l3 to point to B:
+ B:
+ type_info -> PGT_l2_table | 1
+
+P1: Restart UNPIN_TABLE
+
+At this point, since A.nr_validated_entries == x and A.partial_pte !=
+0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
+its type count to 0 while it's still being pointed to by some other l3.
+
+A similar issue arises with old_guest_table. Consider the following
+scenario:
+
+Suppose A is a page which, when interpreted as an l2, has valid entries
+until entry x, which is invalid.
+
+V1: PIN_L2_TABLE(A)
+ <Validate until we try to validate [x], get -EINVAL>
+ A -> PGT_l2_table | 1 | PGT_partial
+ V1 -> old_guest_table = A
+ <delayed>
+
+V2: PIN_L2_TABLE(A)
+ <Pick up where V1 left off, try to re-validate [x], get -EINVAL>
+ A -> PGT_l2_table | 1 | PGT_partial
+ V2 -> old_guest_table = A
+ <restart>
+ put_old_guest_table()
+ _put_page_type(A)
+ A -> PGT_l2_table | 0
+
+V1: <restart>
+ put_old_guest_table()
+ _put_page_type(A) # UNDERFLOW
+
+Indeed, it is possible to arrange for the old_guest_table of every vcpu
+a guest has to point to the same page.
+---
+ xen/arch/x86/domain.c | 6 +++
+ xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++-----
+ xen/include/asm-x86/domain.h | 4 +-
+ 3 files changed, 95 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 8fbecbb169..c880568dd4 100644
+--- xen/arch/x86/domain.c.orig
++++ xen/arch/x86/domain.c
+@@ -1074,9 +1074,15 @@ int arch_set_info_guest(
+ rc = -ERESTART;
+ /* Fallthrough */
+ case -ERESTART:
++ /*
++ * NB that we're putting the kernel-mode table
++ * here, which we've already successfully
++ * validated above; hence partial = false;
++ */
+ v->arch.old_guest_ptpg = NULL;
+ v->arch.old_guest_table =
+ pagetable_get_page(v->arch.guest_table);
++ v->arch.old_guest_table_partial = false;
+ v->arch.guest_table = pagetable_null();
+ break;
+ default:
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index aa03cb8b40..c701c7ef14 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1353,10 +1353,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
+ {
+ current->arch.old_guest_ptpg = ptpg;
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ }
+ else
+ {
+- rc = _put_page_type(pg, PTF_preemptible, ptpg);
++ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -1379,6 +1380,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ unsigned long mfn = l3e_get_pfn(l3e);
+ int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
++ ASSERT(!(flags & PTF_partial_set));
+ ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+ do {
+ put_data_page(mfn_to_page(_mfn(mfn)), writeable);
+@@ -1391,12 +1393,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+
+ if ( flags & PTF_defer )
+ {
++ ASSERT(!(flags & PTF_partial_set));
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ return 0;
+ }
+
+- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+
+@@ -1415,12 +1419,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+
+ if ( flags & PTF_defer )
+ {
++ ASSERT(!(flags & PTF_partial_set));
+ current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+ current->arch.old_guest_table = pg;
++ current->arch.old_guest_table_partial = false;
+ return 0;
+ }
+
+- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
++ rc = _put_page_type(pg, flags | PTF_preemptible,
++ mfn_to_page(_mfn(pfn)));
+ if ( likely(!rc) )
+ put_page(pg);
+ }
+@@ -1525,6 +1532,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+
+ pl2e = map_domain_page(_mfn(pfn));
+
++ /*
++ * NB that alloc_l2_table will never set partial_pte on an l2; but
++ * free_l2_table might if a linear_pagetable entry is interrupted
++ * partway through de-validation. In that circumstance,
++ * get_page_from_l2e() will always return -EINVAL; and we must
++ * retain the type ref by doing the normal partial_flags tracking.
++ */
++
+ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
+ i++, partial_flags = 0 )
+ {
+@@ -1579,6 +1594,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
+ page->partial_flags = partial_flags;
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ }
+ if ( rc < 0 )
+@@ -1681,12 +1697,16 @@ static int alloc_l3_table(struct page_info *page)
+ * builds.
+ */
+ if ( current->arch.old_guest_table == l3e_get_page(l3e) )
++ {
++ ASSERT(current->arch.old_guest_table_partial);
+ page->partial_flags = PTF_partial_set;
++ }
+ else
+ ASSERT_UNREACHABLE();
+ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ while ( i-- > 0 )
+ pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+@@ -1860,12 +1880,16 @@ static int alloc_l4_table(struct page_info *page)
+ * builds.
+ */
+ if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
++ {
++ ASSERT(current->arch.old_guest_table_partial);
+ page->partial_flags = PTF_partial_set;
++ }
+ else
+ ASSERT_UNREACHABLE();
+ }
+ current->arch.old_guest_ptpg = NULL;
+ current->arch.old_guest_table = page;
++ current->arch.old_guest_table_partial = true;
+ }
+ }
+ }
+@@ -2782,6 +2806,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
+ x = y;
+ nx = x - 1;
+
++ /*
++ * Is this expected to do a full reference drop, or only
++ * cleanup partial validation / devalidation?
++ *
++ * If the former, the caller must hold a "full" type ref;
++ * which means the page must be validated. If the page is
++ * *not* fully validated, continuing would almost certainly
++ * open up a security hole. An exception to this is during
++ * domain destruction, where PGT_validated can be dropped
++ * without dropping a type ref.
++ *
++ * If the latter, do nothing unless type PGT_partial is set.
++ * If it is set, the type count must be 1.
++ */
++ if ( !(flags & PTF_partial_set) )
++ BUG_ON((x & PGT_partial) ||
++ !((x & PGT_validated) || page_get_owner(page)->is_dying));
++ else if ( !(x & PGT_partial) )
++ return 0;
++ else
++ BUG_ON((x & PGT_count_mask) != 1);
++
+ ASSERT((x & PGT_count_mask) != 0);
+
+ switch ( nx & (PGT_locked | PGT_count_mask) )
+@@ -3041,17 +3087,34 @@ int put_old_guest_table(struct vcpu *v)
+ if ( !v->arch.old_guest_table )
+ return 0;
+
+- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
+- v->arch.old_guest_ptpg) )
++ rc = _put_page_type(v->arch.old_guest_table,
++ PTF_preemptible |
++ ( v->arch.old_guest_table_partial ?
++ PTF_partial_set : 0 ),
++ v->arch.old_guest_ptpg);
++
++ if ( rc == -ERESTART || rc == -EINTR )
+ {
+- case -EINTR:
+- case -ERESTART:
++ v->arch.old_guest_table_partial = (rc == -ERESTART);
+ return -ERESTART;
+- case 0:
+- put_page(v->arch.old_guest_table);
+ }
+
++ /*
++ * It shouldn't be possible for _put_page_type() to return
++ * anything else at the moment; but if it does happen in
++ * production, leaking the type ref is probably the best thing to
++ * do. Either way, drop the general ref held by old_guest_table.
++ */
++ ASSERT(rc == 0);
++
++ put_page(v->arch.old_guest_table);
+ v->arch.old_guest_table = NULL;
++ v->arch.old_guest_ptpg = NULL;
++ /*
++ * Safest default if someone sets old_guest_table without
++ * explicitly setting old_guest_table_partial.
++ */
++ v->arch.old_guest_table_partial = true;
+
+ return rc;
+ }
+@@ -3201,11 +3264,11 @@ int new_guest_cr3(mfn_t mfn)
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+- rc = -ERESTART;
+- /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
++ rc = -ERESTART;
+ break;
+ default:
+ BUG_ON(rc);
+@@ -3479,6 +3542,7 @@ long do_mmuext_op(
+ {
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ curr->arch.old_guest_table_partial = false;
+ }
+ }
+ }
+@@ -3513,6 +3577,11 @@ long do_mmuext_op(
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ /*
++ * EINTR means we still hold the type ref; ERESTART
++ * means PGT_partial holds the type ref
++ */
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
+ rc = 0;
+ break;
+ default:
+@@ -3581,11 +3650,15 @@ long do_mmuext_op(
+ switch ( rc = put_page_and_type_preemptible(page) )
+ {
+ case -EINTR:
+- rc = -ERESTART;
+- /* fallthrough */
+ case -ERESTART:
+ curr->arch.old_guest_ptpg = NULL;
+ curr->arch.old_guest_table = page;
++ /*
++ * EINTR means we still hold the type ref;
++ * ERESTART means PGT_partial holds the ref
++ */
++ curr->arch.old_guest_table_partial = (rc == -ERESTART);
++ rc = -ERESTART;
+ break;
+ default:
+ BUG_ON(rc);
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index 1ac5a96c08..360c38bd83 100644
+--- xen/include/asm-x86/domain.h.orig
++++ xen/include/asm-x86/domain.h
+@@ -309,7 +309,7 @@ struct arch_domain
+
+ struct paging_domain paging;
+ struct p2m_domain *p2m;
+- /* To enforce lock ordering in the pod code wrt the
++ /* To enforce lock ordering in the pod code wrt the
+ * page_alloc lock */
+ int page_alloc_unlock_level;
+
+@@ -542,6 +542,8 @@ struct arch_vcpu
+ struct page_info *old_guest_table; /* partially destructed pagetable */
+ struct page_info *old_guest_ptpg; /* containing page table of the */
+ /* former, if any */
++ bool old_guest_table_partial; /* Are we dropping a type ref, or just
++ * finishing up a partial de-validation? */
+ /* guest_table holds a ref to the page, and also a type-count unless
+ * shadow refcounts are in use */
+ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
+--
+2.23.0
+
diff --git a/sysutils/xenkernel411/patches/patch-XSA302 b/sysutils/xenkernel411/patches/patch-XSA302
new file mode 100644
index 00000000000..36929eb5a1f
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA302
@@ -0,0 +1,537 @@
+$NetBSD: patch-XSA302,v 1.2.2.2 2019/11/16 22:10:07 bsiegert Exp $
+
+From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 2 Oct 2019 13:36:59 +0200
+Subject: [PATCH 1/2] IOMMU: add missing HVM check
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fix an unguarded d->arch.hvm access in assign_device().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
+Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
+---
+ xen/drivers/passthrough/pci.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index f51cae7f4e..037aba7c94 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ /* Prevent device assign if mem paging or mem sharing have been
+ * enabled for this domain */
+ if ( unlikely(!need_iommu(d) &&
+- (d->arch.hvm_domain.mem_sharing_enabled ||
++ ((is_hvm_domain(d) &&
++ d->arch.hvm_domain.mem_sharing_enabled) ||
+ vm_event_check_ring(d->vm_event_paging) ||
+ p2m_get_hostp2m(d)->global_logdirty)) )
+ return -EXDEV;
+--
+2.11.0
+
+From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 2001
+From: Paul Durrant <paul.durrant@citrix.com>
+Date: Mon, 14 Oct 2019 17:52:59 +0100
+Subject: [PATCH 2/2] passthrough: quarantine PCI devices
+
+When a PCI device is assigned to an untrusted domain, it is possible for
+that domain to program the device to DMA to an arbitrary address. The
+IOMMU is used to protect the host from malicious DMA by making sure that
+the device addresses can only target memory assigned to the guest. However,
+when the guest domain is torn down the device is assigned back to dom0,
+thus allowing any in-flight DMA to potentially target critical host data.
+
+This patch introduces a 'quarantine' for PCI devices using dom_io. When
+the toolstack makes a device assignable (by binding it to pciback), it
+will now also assign it to DOMID_IO and the device will only be assigned
+back to dom0 when the device is made unassignable again. Whilst device is
+assignable it will only ever transfer between dom_io and guest domains.
+dom_io is actually only used as a sentinel domain for quarantining purposes;
+it is not configured with any IOMMU mappings. Assignment to dom_io simply
+means that the device's initiator (requestor) identifier is not present in
+the IOMMU's device table and thus any DMA transactions issued will be
+terminated with a fault condition.
+
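+In toolstack terms the flow is roughly as follows (condensed from the libxl
+hunks below; error handling elided):
+
+    /* "make assignable": bind to pciback, then quarantine via dom_io */
+    xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
+                     XEN_DOMCTL_DEV_RDM_RELAXED);
+
+    /* "make unassignable": de-quarantine first, then unbind from pciback */
+    xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
+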
+In addition, a fix to assignment handling is made for VT-d. Failure
+during the assignment step should not lead to a device still being
+associated with its prior owner. Hand the device to DomIO temporarily,
+until the assignment step has completed successfully. Remove the PI
+hooks from the source domain then earlier as well.
+
+Failure of the recovery reassign_device_ownership() must not go unnoticed:
+there may, for example, still be leftover RMRR mappings in the domain whose
+assignment has failed, and hence we can't allow that domain to continue
+executing.
+
+NOTE: This patch also includes one printk() cleanup; the
+ "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
+ since similar printk()-s elsewhere also don't log such a tag.
+
+This is XSA-302.
+
+Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
+---
+ tools/libxl/libxl_pci.c | 25 +++++++++++-
+ xen/arch/x86/mm.c | 2 +
+ xen/common/domctl.c | 14 ++++++-
+ xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
+ xen/drivers/passthrough/iommu.c | 9 +++++
+ xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++-------
+ xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++---
+ xen/include/xen/pci.h | 3 ++
+ 8 files changed, 138 insertions(+), 24 deletions(-)
+
+diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
+index 4755a0c93c..81890a91ac 100644
+--- tools/libxl/libxl_pci.c.orig
++++ tools/libxl/libxl_pci.c
+@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ libxl_device_pci *pcidev,
+ int rebind)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
+ unsigned dom, bus, dev, func;
+ char *spath, *driver_path = NULL;
+ int rc;
+@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ }
+ if ( rc ) {
+ LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
+- return 0;
++ goto quarantine;
+ }
+
+ /* Check to see if there's already a driver that we need to unbind from */
+@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
+ return ERROR_FAIL;
+ }
+
++quarantine:
++ /*
++ * DOMID_IO is just a sentinel domain, without any actual mappings,
++ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
++ * unnecessarily denied.
++ */
++ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
++ XEN_DOMCTL_DEV_RDM_RELAXED);
++ if ( rc < 0 ) {
++ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
++ return ERROR_FAIL;
++ }
++
+ return 0;
+ }
+
+@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
+ libxl_device_pci *pcidev,
+ int rebind)
+ {
++ libxl_ctx *ctx = libxl__gc_owner(gc);
+ int rc;
+ char *driver_path;
+
++ /* De-quarantine */
++ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
++ if ( rc < 0 ) {
++ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
++ pcidev->dev, pcidev->func);
++ return ERROR_FAIL;
++ }
++
+ /* Unbind from pciback */
+ if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
+ return ERROR_FAIL;
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e6a4cb28f8..c1ab57f9a5 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
+ * Initialise our DOMID_IO domain.
+ * This domain owns I/O pages that are within the range of the page_info
+ * array. Mappings occur at the priv of the caller.
++ * Quarantined PCI devices will be associated with this domain.
+ */
+ dom_io = domain_create(DOMID_IO, NULL);
+ BUG_ON(IS_ERR(dom_io));
++ INIT_LIST_HEAD(&dom_io->arch.pdev_list);
+
+ /*
+ * Initialise our COW domain.
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index 9b7bc083ee..741d774cd1 100644
+--- xen/common/domctl.c.orig
++++ xen/common/domctl.c
+@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+
+ switch ( op->cmd )
+ {
++ case XEN_DOMCTL_assign_device:
++ case XEN_DOMCTL_deassign_device:
++ if ( op->domain == DOMID_IO )
++ {
++ d = dom_io;
++ break;
++ }
++ else if ( op->domain == DOMID_INVALID )
++ return -ESRCH;
++ /* fall through */
+ case XEN_DOMCTL_test_assign_device:
+ if ( op->domain == DOMID_INVALID )
+ {
+@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+
+ if ( !domctl_lock_acquire() )
+ {
+- if ( d )
++ if ( d && d != dom_io )
+ rcu_unlock_domain(d);
+ return hypercall_create_continuation(
+ __HYPERVISOR_domctl, "h", u_domctl);
+@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ domctl_lock_release();
+
+ domctl_out_unlock_domonly:
+- if ( d )
++ if ( d && d != dom_io )
+ rcu_unlock_domain(d);
+
+ if ( copyback && __copy_to_guest(u_domctl, op, 1) )
+diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
+index 12d2695b89..ec8baae717 100644
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
+ u8 bus = pdev->bus;
+ const struct domain_iommu *hd = dom_iommu(domain);
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return;
++
+ BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
+ !iommu->dev_table.buffer );
+
+@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
+ int req_id;
+ u8 bus = pdev->bus;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return;
++
+ BUG_ON ( iommu->dev_table.buffer == NULL );
+ req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
+ dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
+ ivrs_mappings[req_id].read_permission);
+ }
+
+- return reassign_device(hardware_domain, d, devfn, pdev);
++ return reassign_device(pdev->domain, d, devfn, pdev);
+ }
+
+ static void deallocate_next_page_table(struct page_info *pg, int level)
+diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
+index 04b0be37d3..8027d96f1c 100644
+--- xen/drivers/passthrough/iommu.c.orig
++++ xen/drivers/passthrough/iommu.c
+@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
+ {
+ const struct domain_iommu *hd = dom_iommu(d);
+
++ if ( d == dom_io )
++ return;
++
+ d->need_iommu = 0;
+ hd->platform_ops->teardown(d);
+ tasklet_schedule(&iommu_pt_cleanup_tasklet);
+@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
+ if ( need_iommu(d) > 0 )
+ return 0;
+
++ if ( d == dom_io )
++ return 0;
++
+ if ( !iommu_use_hap_pt(d) )
+ {
+ int rc;
+@@ -404,6 +410,9 @@ int __init iommu_setup(void)
+ printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
+ if ( iommu_enabled )
+ {
++ if ( iommu_domain_init(dom_io) )
++ panic("Could not set up quarantine\n");
++
+ printk(" - Dom0 mode: %s\n",
+ iommu_passthrough ? "Passthrough" :
+ iommu_dom0_strict ? "Strict" : "Relaxed");
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 037aba7c94..fb010a547b 100644
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
+ return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
+ }
+
+-/*
+- * If the device isn't owned by the hardware domain, it means it already
+- * has been assigned to other domain, or it doesn't exist.
+- */
+ static int device_assigned(u16 seg, u8 bus, u8 devfn)
+ {
+ struct pci_dev *pdev;
++ int rc = 0;
+
+ pcidevs_lock();
+- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++
++ pdev = pci_get_pdev(seg, bus, devfn);
++
++ if ( !pdev )
++ rc = -ENODEV;
++ /*
++ * If the device exists and it is not owned by either the hardware
++ * domain or dom_io then it must be assigned to a guest, or be
++ * hidden (owned by dom_xen).
++ */
++ else if ( pdev->domain != hardware_domain &&
++ pdev->domain != dom_io )
++ rc = -EBUSY;
++
+ pcidevs_unlock();
+
+- return pdev ? 0 : -EBUSY;
++ return rc;
+ }
+
+ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+
+ /* Prevent device assign if mem paging or mem sharing have been
+ * enabled for this domain */
+- if ( unlikely(!need_iommu(d) &&
++ if ( d != dom_io &&
++ unlikely(!need_iommu(d) &&
+ ((is_hvm_domain(d) &&
+ d->arch.hvm_domain.mem_sharing_enabled) ||
+ vm_event_check_ring(d->vm_event_paging) ||
+@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ return rc;
+ }
+
+- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
++ pdev = pci_get_pdev(seg, bus, devfn);
++
++ rc = -ENODEV;
+ if ( !pdev )
+- {
+- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
+ goto done;
+- }
++
++ rc = 0;
++ if ( d == pdev->domain )
++ goto done;
++
++ rc = -EBUSY;
++ if ( pdev->domain != hardware_domain &&
++ pdev->domain != dom_io )
++ goto done;
+
+ if ( pdev->msix )
+ msixtbl_init(d);
+@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ }
+
+ done:
++ /* The device is assigned to dom_io so mark it as quarantined */
++ if ( !rc && d == dom_io )
++ pdev->quarantine = true;
++
+ if ( !has_arch_pdevs(d) && need_iommu(d) )
+ iommu_teardown(d);
+ pcidevs_unlock();
+@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ {
+ const struct domain_iommu *hd = dom_iommu(d);
+ struct pci_dev *pdev = NULL;
++ struct domain *target;
+ int ret = 0;
+
+ if ( !iommu_enabled || !hd->platform_ops )
+@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ if ( !pdev )
+ return -ENODEV;
+
++ /* De-assignment from dom_io should de-quarantine the device */
++ target = (pdev->quarantine && pdev->domain != dom_io) ?
++ dom_io : hardware_domain;
++
+ while ( pdev->phantom_stride )
+ {
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ break;
+- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++ ret = hd->platform_ops->reassign_device(d, target, devfn,
+ pci_to_dev(pdev));
+ if ( !ret )
+ continue;
+@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ }
+
+ devfn = pdev->devfn;
+- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
++ ret = hd->platform_ops->reassign_device(d, target, devfn,
+ pci_to_dev(pdev));
+ if ( ret )
+ {
+@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
+ return ret;
+ }
+
++ if ( pdev->domain == hardware_domain )
++ pdev->quarantine = false;
++
+ pdev->fault.count = 0;
+
+ if ( !has_arch_pdevs(d) && need_iommu(d) )
+@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl(
+ ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+ "h", u_domctl);
+ else if ( ret )
+- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
++ printk(XENLOG_G_ERR
+ "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ d->domain_id, ret);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 4c719d4ee7..19f7d13013 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
+ int agaw, rc, ret;
+ bool_t flush_dev_iotlb;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return 0;
++
+ ASSERT(pcidevs_locked());
+ spin_lock(&iommu->lock);
+ maddr = bus_to_context_maddr(iommu, bus);
+@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
+ int iommu_domid, rc, ret;
+ bool_t flush_dev_iotlb;
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ return 0;
++
+ ASSERT(pcidevs_locked());
+ spin_lock(&iommu->lock);
+
+@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
+ goto out;
+ }
+
++ /* dom_io is used as a sentinel for quarantined devices */
++ if ( domain == dom_io )
++ goto out;
++
+ /*
+ * if no other devices under the same iommu owned by this domain,
+ * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
+@@ -2389,6 +2401,15 @@ static int reassign_device_ownership(
+ if ( ret )
+ return ret;
+
++ if ( devfn == pdev->devfn )
++ {
++ list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
++ pdev->domain = dom_io;
++ }
++
++ if ( !has_arch_pdevs(source) )
++ vmx_pi_hooks_deassign(source);
++
+ if ( !has_arch_pdevs(target) )
+ vmx_pi_hooks_assign(target);
+
+@@ -2407,15 +2428,13 @@ static int reassign_device_ownership(
+ pdev->domain = target;
+ }
+
+- if ( !has_arch_pdevs(source) )
+- vmx_pi_hooks_deassign(source);
+-
+ return ret;
+ }
+
+ static int intel_iommu_assign_device(
+ struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
+ {
++ struct domain *s = pdev->domain;
+ struct acpi_rmrr_unit *rmrr;
+ int ret = 0, i;
+ u16 bdf, seg;
+@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device(
+ }
+ }
+
+- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
+- if ( ret )
++ ret = reassign_device_ownership(s, d, devfn, pdev);
++ if ( ret || d == dom_io )
+ return ret;
+
+ /* Setup rmrr identity mapping */
+@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device(
+ ret = rmrr_identity_mapping(d, 1, rmrr, flag);
+ if ( ret )
+ {
+- reassign_device_ownership(d, hardware_domain, devfn, pdev);
++ int rc;
++
++ rc = reassign_device_ownership(d, s, devfn, pdev);
+ printk(XENLOG_G_ERR VTDPREFIX
+ " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
+ rmrr->base_address, rmrr->end_address,
+ d->domain_id, ret);
++ if ( rc )
++ {
++ printk(XENLOG_ERR VTDPREFIX
++ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
++ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
++ domain_crash(d);
++ }
+ break;
+ }
+ }
+diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
+index 4cfa774615..066364bdef 100644
+--- xen/include/xen/pci.h.orig
++++ xen/include/xen/pci.h
+@@ -88,6 +88,9 @@ struct pci_dev {
+
+ nodeid_t node; /* NUMA node */
+
++ /* Device to be quarantined, don't automatically re-assign to dom0 */
++ bool quarantine;
++
+ enum pdev_type {
+ DEV_TYPE_PCI_UNKNOWN,
+ DEV_TYPE_PCIe_ENDPOINT,
+--
+2.11.0
+
diff --git a/sysutils/xenkernel411/patches/patch-XSA304 b/sysutils/xenkernel411/patches/patch-XSA304
new file mode 100644
index 00000000000..83b52f7bd31
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA304
@@ -0,0 +1,481 @@
+$NetBSD: patch-XSA304,v 1.2.2.2 2019/11/16 22:10:07 bsiegert Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs
+
+Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
+an EPT superpage gets shattered. The root cause is still under investigation,
+but the end result is unusable in combination with CVE-2018-12207 protections.
+
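+Concretely, the quirk below masks the superpage (SLLPS) bits out of the
+capability register reported by IOMMUs on the affected processor models, so
+that Xen will not share EPT and IOMMU pagetables there (condensed from the
+quirks.c hunk; the reading of cap[37:34] as the SLLPS field follows the
+VT-d specification):
+
+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+         boot_cpu_data.x86 == 6 &&
+         (boot_cpu_data.x86_model == 0x2a ||    /* SandyBridge client */
+          boot_cpu_data.x86_model == 0x2d) )    /* SandyBridge server */
+        iommu->cap &= ~(0xful << 34);           /* hide superpage support */
+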
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index fb7edfaef9..d698b1d50a 100644
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
+ int __must_check me_wifi_quirk(struct domain *domain,
+ u8 bus, u8 devfn, int map);
+ void pci_vtd_quirk(const struct pci_dev *);
++void quirk_iommu_caps(struct iommu *iommu);
++
+ bool_t platform_supports_intremap(void);
+ bool_t platform_supports_x2apic(void);
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index f242e30caf..8712d3b4dc 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
+ return -ENODEV;
+
++ quirk_iommu_caps(iommu);
++
+ if ( cap_fault_reg_offset(iommu->cap) +
+ cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
+ ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
+diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
+index d6db862678..b02688e316 100644
+--- xen/drivers/passthrough/vtd/quirks.c.orig
++++ xen/drivers/passthrough/vtd/quirks.c
+@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
+ break;
+ }
+ }
++
++void __init quirk_iommu_caps(struct iommu *iommu)
++{
++ /*
++ * IOMMU Quirks:
++ *
++ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
++ * implement superpages internally.
++ *
++ * There are issues changing the walk length under in-flight DMA, which
++ * has manifested as incompatibility between EPT/IOMMU sharing and the
++ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages
++ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
++ * and IOMMU pagetables.
++ *
++ * Detection of SandyBridge unfortunately has to be done by processor
++ * model because the client parts don't expose their IOMMUs as PCI devices
++ * we could match with a Device ID.
++ */
++ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
++ boot_cpu_data.x86 == 6 &&
++ (boot_cpu_data.x86_model == 0x2a ||
++ boot_cpu_data.x86_model == 0x2d) )
++ iommu->cap &= ~(0xful << 34);
++}
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtx: Disable executable EPT superpages to work around
+ CVE-2018-12207
+
+CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
+machine check exception can be generated in a corner case when an executable
+mapping changes size or cacheability without TLB invalidation. HVM guest
+kernels can trigger this to DoS the host.
+
+To mitigate, in affected hardware, all EPT superpages are marked NX. When an
+instruction fetch violation is observed against the superpage, the superpage
+is shattered to 4k and has execute permissions restored. This prevents the
+guest kernel from being able to create the necessary preconditions in the iTLB
+to exploit the vulnerability.
+
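+In outline, the shatter path added to hvm_hap_nested_page_fault() looks like
+this (condensed from the hunk below; error reporting elided):
+
+    /* Execute fault against a non-executable superpage: shatter it to 4k
+     * mappings, which are created with execute permissions restored. */
+    if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
+    {
+        int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
+                                p2mt, p2ma);
+
+        rc = !res;          /* success: let the guest retry the fetch */
+        goto out_put_gfn;
+    }
+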
+This does come with a workload-dependent performance overhead, caused by
+increased TLB pressure. Performance can be restored, if guest kernels are
+trusted not to mount an attack, by specifying ept=exec-sp on the command line.
+
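+For example, on a host where all HVM kernels are trusted, superpage
+performance can be restored by appending the option to the hypervisor
+command line; on NetBSD this would be a boot.cfg entry along these lines
+(illustrative only, adjust paths and memory to the local setup):
+
+    menu=Xen:load /netbsd console=pc;multiboot /xen dom0_mem=1024M ept=exec-sp
+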
+This is part of XSA-304 / CVE-2018-12207
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index c63a07d29b..684671cb7b 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -828,7 +828,7 @@ effect the inverse meaning.
+ >> set as UC.
+
+ ### ept (Intel)
+-> `= List of ( {no-}pml | {no-}ad )`
++> `= List of [ {no-}pml, {no-}ad, {no-}exec-sp ]`
+
+ Controls EPT related features.
+
+@@ -851,6 +851,16 @@ Controls EPT related features.
+
+ >> Have hardware keep accessed/dirty (A/D) bits updated.
+
++* The `exec-sp` boolean controls whether EPT superpages with execute
++ permissions are permitted. In general this is good for performance.
++
++  However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
++ use executable superpages to crash the host. By default, executable
++ superpages are disabled on affected hardware.
++
++ If HVM guest kernels are trusted not to mount a DoS against the system,
++  this option can be enabled to regain performance.
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index f4a6a37149..1924434960 100644
+--- xen/arch/x86/hvm/hvm.c.orig
++++ xen/arch/x86/hvm/hvm.c
+@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ struct p2m_domain *p2m, *hostp2m;
+ int rc, fall_through = 0, paged = 0;
+ int sharing_enomem = 0;
++ unsigned int page_order = 0;
+ vm_event_request_t *req_ptr = NULL;
+ bool_t ap2m_active, sync = 0;
+
+@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ hostp2m = p2m_get_hostp2m(currd);
+ mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
+ P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
+- NULL);
++ &page_order);
+
+ if ( ap2m_active )
+ {
+@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ goto out;
+ }
+
+- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
++ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
+ }
+ else
+ p2m = hostp2m;
+@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
+ break;
+ }
+
++ /*
++ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution
++ * fault against a non-executable superpage, shatter it to regain
++ * execute permissions.
++ */
++ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
++ {
++ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
++ p2mt, p2ma);
++
++ if ( res )
++ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
++ gfn, res);
++
++ rc = !res;
++ goto out_put_gfn;
++ }
++
+ if ( violation )
+ {
+ /* Should #VE be emulated for this fault? */
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 493986e84a..8821a3b536 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
+
+ static bool_t __read_mostly opt_pml_enabled = 1;
+ static s8 __read_mostly opt_ept_ad = -1;
++int8_t __read_mostly opt_ept_exec_sp = -1;
+
+ /*
+ * The 'ept' parameter controls functionalities that depend on, or impact the
+@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
+ opt_pml_enabled = val;
+ else if ( !cmdline_strcmp(s, "ad") )
+ opt_ept_ad = val;
++ else if ( !cmdline_strcmp(s, "exec-sp") )
++ opt_ept_exec_sp = val;
+ else
+ rc = -EINVAL;
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 840dc2b44d..a568d62643 100644
+--- xen/arch/x86/hvm/vmx/vmx.c.orig
++++ xen/arch/x86/hvm/vmx/vmx.c
+@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
+ static void __init lbr_tsx_fixup_check(void);
+ static void __init bdw_erratum_bdf14_fixup_check(void);
+
++/*
++ * Calculate whether the CPU is vulnerable to Instruction Fetch page
++ * size-change MCEs.
++ */
++static bool __init has_if_pschange_mc(void)
++{
++ uint64_t caps = 0;
++
++ /*
++ * If we are virtualised, there is nothing we can do. Our EPT tables are
++ * shadowed by our hypervisor, and not walked by hardware.
++ */
++ if ( cpu_has_hypervisor )
++ return false;
++
++ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
++ return false;
++
++ /*
++ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
++ * this time.
++ */
++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
++ boot_cpu_data.x86 != 6 )
++ return false;
++
++ switch ( boot_cpu_data.x86_model )
++ {
++ /*
++ * Core processors since at least Nehalem are vulnerable.
++ */
++ case 0x1f: /* Auburndale / Havendale */
++ case 0x1e: /* Nehalem */
++ case 0x1a: /* Nehalem EP */
++ case 0x2e: /* Nehalem EX */
++ case 0x25: /* Westmere */
++ case 0x2c: /* Westmere EP */
++ case 0x2f: /* Westmere EX */
++ case 0x2a: /* SandyBridge */
++ case 0x2d: /* SandyBridge EP/EX */
++ case 0x3a: /* IvyBridge */
++ case 0x3e: /* IvyBridge EP/EX */
++ case 0x3c: /* Haswell */
++ case 0x3f: /* Haswell EX/EP */
++ case 0x45: /* Haswell D */
++ case 0x46: /* Haswell H */
++ case 0x3d: /* Broadwell */
++ case 0x47: /* Broadwell H */
++ case 0x4f: /* Broadwell EP/EX */
++ case 0x56: /* Broadwell D */
++ case 0x4e: /* Skylake M */
++ case 0x5e: /* Skylake D */
++ case 0x55: /* Skylake-X / Cascade Lake */
++ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
++ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
++ return true;
++
++ /*
++ * Atom processors are not vulnerable.
++ */
++ case 0x1c: /* Pineview */
++ case 0x26: /* Lincroft */
++ case 0x27: /* Penwell */
++ case 0x35: /* Cloverview */
++ case 0x36: /* Cedarview */
++ case 0x37: /* Baytrail / Valleyview (Silvermont) */
++ case 0x4d: /* Avaton / Rangely (Silvermont) */
++ case 0x4c: /* Cherrytrail / Brasswell */
++ case 0x4a: /* Merrifield */
++ case 0x5a: /* Moorefield */
++ case 0x5c: /* Goldmont */
++ case 0x5d: /* SoFIA 3G Granite/ES2.1 */
++ case 0x65: /* SoFIA LTE AOSP */
++ case 0x5f: /* Denverton */
++ case 0x6e: /* Cougar Mountain */
++ case 0x75: /* Lightning Mountain */
++ case 0x7a: /* Gemini Lake */
++ case 0x86: /* Jacobsville */
++
++ /*
++ * Knights processors are not vulnerable.
++ */
++ case 0x57: /* Knights Landing */
++ case 0x85: /* Knights Mill */
++ return false;
++
++ default:
++ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
++ boot_cpu_data.x86_model);
++ return true;
++ }
++}
++
+ const struct hvm_function_table * __init start_vmx(void)
+ {
+ set_in_cr4(X86_CR4_VMXE);
+@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void)
+ */
+ if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
+ {
++ bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
++
++ if ( opt_ept_exec_sp == -1 )
++ {
++ /* Default to non-executable superpages on vulnerable hardware. */
++ opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
++
++ if ( cpu_has_bug_pschange_mc )
++ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
++ }
++
+ vmx_function_table.hap_supported = 1;
+ vmx_function_table.altp2m_supported = 1;
+
+diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
+index ce46201d45..93e08f89a2 100644
+--- xen/arch/x86/mm/p2m-ept.c.orig
++++ xen/arch/x86/mm/p2m-ept.c
+@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
+ break;
+ }
+
++ /*
++ * Don't create executable superpages if we need to shatter them to
++ * protect against CVE-2018-12207.
++ */
++ if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
++ entry->x = 0;
+ }
+
+ #define GUEST_TABLE_MAP_FAILED 0
+diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
+index 89619e4afd..20eb7f6082 100644
+--- xen/include/asm-x86/hvm/vmx/vmx.h.orig
++++ xen/include/asm-x86/hvm/vmx/vmx.h
+@@ -28,6 +28,8 @@
+ #include <asm/hvm/trace.h>
+ #include <asm/hvm/vmx/vmcs.h>
+
++extern int8_t opt_ept_exec_sp;
++
+ typedef union {
+ struct {
+ u64 r : 1, /* bit 0 - Read permission */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index b8151d2d9f..89ae3e03f1 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -54,6 +54,7 @@
+ #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
+ #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
++#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/vtx: Allow runtime modification of the exec-sp setting
+
+See patch for details.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..33ed1ffc40 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -861,6 +861,21 @@ Controls EPT related features.
+ If HVM guest kernels are trusted not to mount a DoS against the system,
+   this option can be enabled to regain performance.
+
++ This boolean may be modified at runtime using `xl set-parameters
++ ept=[no-]exec-sp` to switch between fast and secure.
++
++ * When switching from secure to fast, preexisting HVM domains will run
++ at their current performance until they are rebooted; new domains will
++ run without any overhead.
++
++ * When switching from fast to secure, all HVM domains will immediately
++ suffer a performance penalty.
++
++ **Warning: No guarantee is made that this runtime option will be retained
++ indefinitely, or that it will retain this exact behaviour. It is
++ intended as an emergency option for people who first chose fast, then
++ change their minds to secure, and wish not to reboot.**
++
+ ### extra\_guest\_irqs
+ > `= [<domU number>][,<dom0 number>]`
+
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 8821a3b536..15376e25ba 100644
+--- xen/arch/x86/hvm/vmx/vmcs.c.orig
++++ xen/arch/x86/hvm/vmx/vmcs.c
+@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
+ }
+ custom_param("ept", parse_ept_param);
+
++static int parse_ept_param_runtime(const char *s)
++{
++ int val;
++
++ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
++ !(hvm_funcs.hap_capabilities &
++ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
++ {
++ printk("VMX: EPT not available, or not in use - ignoring\n");
++ return 0;
++ }
++
++ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
++ return -EINVAL;
++
++ if ( val != opt_ept_exec_sp )
++ {
++ struct domain *d;
++
++ opt_ept_exec_sp = val;
++
++ rcu_read_lock(&domlist_read_lock);
++ for_each_domain ( d )
++ if ( paging_mode_hap(d) )
++ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
++ rcu_read_unlock(&domlist_read_lock);
++ }
++
++ printk("VMX: EPT executable superpages %sabled\n",
++ val ? "en" : "dis");
++
++ return 0;
++}
++custom_runtime_only_param("ept", parse_ept_param_runtime);
++
+ /* Dynamic (run-time adjusted) execution control flags. */
+ u32 vmx_pin_based_exec_control __read_mostly;
+ u32 vmx_cpu_based_exec_control __read_mostly;
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index 2b62bc61dd..97c417fc3e 100644
+--- xen/arch/x86/mm/p2m.c.orig
++++ xen/arch/x86/mm/p2m.c
+@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+ return 0;
+ }
+
++/*
++ * May be called with ot = nt = p2m_ram_rw for its side effect of
++ * recalculating all PTEs in the p2m.
++ */
+ void p2m_change_entry_type_global(struct domain *d,
+ p2m_type_t ot, p2m_type_t nt)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+- ASSERT(ot != nt);
+ ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
+
+ p2m_lock(p2m);
+ p2m->change_entry_type_global(p2m, ot, nt);
+- p2m->global_logdirty = (nt == p2m_ram_logdirty);
++ /* Don't allow 'recalculate' operations to change the logdirty state. */
++ if ( ot != nt )
++ p2m->global_logdirty = (nt == p2m_ram_logdirty);
+ p2m_unlock(p2m);
+ }
+
diff --git a/sysutils/xenkernel411/patches/patch-XSA305 b/sysutils/xenkernel411/patches/patch-XSA305
new file mode 100644
index 00000000000..5827a342213
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA305
@@ -0,0 +1,482 @@
+$NetBSD: patch-XSA305,v 1.2.2.2 2019/11/16 22:10:07 bsiegert Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available
+
+To protect against the TSX Async Abort speculative vulnerability, Intel have
+released new microcode for affected parts which introduce the MSR_TSX_CTRL
+control, which allows TSX to be turned off. This will be architectural on
+future parts.
+
+Introduce tsx= to provide a global on/off for TSX, including its enumeration
+via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to
+guests at the moment.
+
+VMs may have booted before microcode is loaded, or before hosts have rebooted,
+and they still want to migrate freely. A VM which booted seeing TSX can
+migrate safely to hosts with TSX disabled - TSX will start unconditionally
+aborting, but still behave in a manner compatible with the ABI.
+
+The guest-visible behaviour is equivalent to late loading the microcode and
+setting the RTM_DISABLE bit in the course of live patching.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
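The ABI-compatibility argument above works because correctly written RTM code must always carry a non-transactional fallback path (hardware may abort a transaction at any time for any reason), so forcing every transaction to abort only means the fallback path is always taken. A minimal sketch with the standard compiler intrinsics (illustrative only, not part of the patch; needs -mrtm and RTM-capable hardware):

#include <immintrin.h>
#include <pthread.h>

static pthread_mutex_t fallback_lock = PTHREAD_MUTEX_INITIALIZER;
static long counter;

void increment(void)
{
    if ( _xbegin() == _XBEGIN_STARTED )
    {
        counter++;             /* transactional path */
        _xend();
    }
    else
    {
        /* Mandatory fallback: once RTM_DISABLE is set, every transaction
           aborts and execution lands here, so behaviour stays correct. */
        pthread_mutex_lock(&fallback_lock);
        counter++;
        pthread_mutex_unlock(&fallback_lock);
    }
}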
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 684671cb7b..b86d26399a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter.
+ ### tsc (x86)
+ > `= unstable | skewed | stable:socket`
+
++### tsx
++ = <bool>
++
++ Applicability: x86
++ Default: true
++
++Controls for the use of Transactional Synchronization eXtensions.
++
++On Intel parts released in Q3 2019 (with updated microcode), and future parts,
++a control has been introduced which allows TSX to be turned off.
++
++On systems with the ability to turn TSX off, this boolean offers system wide
++control of whether TSX is enabled or disabled.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+
+diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
+index da1e4827f4..4c82d9f710 100644
+--- xen/arch/x86/Makefile.orig
++++ xen/arch/x86/Makefile
+@@ -65,6 +65,7 @@ obj-y += sysctl.o
+ obj-y += time.o
+ obj-y += trace.o
+ obj-y += traps.o
++obj-y += tsx.o
+ obj-y += usercopy.o
+ obj-y += x86_emulate.o
+ obj-$(CONFIG_TBOOT) += tboot.o
+diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
+index 5e11970701..04aefa555d 100644
+--- xen/arch/x86/cpuid.c.orig
++++ xen/arch/x86/cpuid.c
+@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d)
+ if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
+ __set_bit(X86_FEATURE_ITSC, max_fs);
+
++ /*
++ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
++ * TSX and hide the feature bits. Migrating-in VMs may have been booted
++     * pre-mitigation when the TSX features were visible.
++ *
++ * This situation is compatible (albeit with a perf hit to any TSX code in
++ * the guest), so allow the feature bits to remain set.
++ */
++ if ( cpu_has_tsx_ctrl )
++ {
++ __set_bit(X86_FEATURE_HLE, max_fs);
++ __set_bit(X86_FEATURE_RTM, max_fs);
++ }
++
+ /* Clamp the toolstacks choices to reality. */
+ for ( i = 0; i < ARRAY_SIZE(fs); i++ )
+ fs[i] &= max_fs[i];
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index ebc0665615..35d99a98a1 100644
+--- xen/arch/x86/msr.c.orig
++++ xen/arch/x86/msr.c
+@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ case MSR_FLUSH_CMD:
+ /* Write-only */
+ case MSR_TSX_FORCE_ABORT:
++ case MSR_TSX_CTRL:
+ /* Not offered to guests. */
+ goto gp_fault;
+
+@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+ case MSR_ARCH_CAPABILITIES:
+ /* Read-only */
+ case MSR_TSX_FORCE_ABORT:
++ case MSR_TSX_CTRL:
+ /* Not offered to guests. */
+ goto gp_fault;
+
+diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
+index 657160549f..dc13ad6c36 100644
+--- xen/arch/x86/setup.c.orig
++++ xen/arch/x86/setup.c
+@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
+
+ early_microcode_init();
+
++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
++
+ identify_cpu(&boot_cpu_data);
+
+ set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index fd52a10cf9..bdc118d88b 100644
+--- xen/arch/x86/smpboot.c.orig
++++ xen/arch/x86/smpboot.c
+@@ -376,6 +376,8 @@ void start_secondary(void *unused)
+ if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+ wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
+
++ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
++
+ if ( xen_guest )
+ hypervisor_ap_setup();
+
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+new file mode 100644
+index 0000000000..a8ec2ccc69
+--- /dev/null
++++ xen/arch/x86/tsx.c
+@@ -0,0 +1,74 @@
++#include <xen/init.h>
++#include <asm/msr.h>
++
++/*
++ * Valid values:
++ * 1 => Explicit tsx=1
++ * 0 => Explicit tsx=0
++ * -1 => Default, implicit tsx=1
++ *
++ * This is arranged such that the bottom bit encodes whether TSX is actually
++ * disabled, while identifying various explicit (>=0) and implicit (<0)
++ * conditions.
++ */
++int8_t __read_mostly opt_tsx = -1;
++int8_t __read_mostly cpu_has_tsx_ctrl = -1;
++
++static int __init parse_tsx(const char *s)
++{
++ int rc = 0, val = parse_bool(s, NULL);
++
++ if ( val >= 0 )
++ opt_tsx = val;
++ else
++ rc = -EINVAL;
++
++ return rc;
++}
++custom_param("tsx", parse_tsx);
++
++void tsx_init(void)
++{
++ /*
++ * This function is first called between microcode being loaded, and CPUID
++ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL
++ * is available.
++ */
++ if ( unlikely(cpu_has_tsx_ctrl < 0) )
++ {
++ uint64_t caps = 0;
++
++ if ( boot_cpu_data.cpuid_level >= 7 &&
++ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
++ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
++
++ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
++ }
++
++ if ( cpu_has_tsx_ctrl )
++ {
++ uint64_t val;
++
++ rdmsrl(MSR_TSX_CTRL, val);
++
++ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
++        /* Check bottom bit only. Higher bits are various sentinels. */
++ if ( !(opt_tsx & 1) )
++ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
++
++ wrmsrl(MSR_TSX_CTRL, val);
++ }
++ else if ( opt_tsx >= 0 )
++ printk_once(XENLOG_WARNING
++ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
++}
++
++/*
++ * Local variables:
++ * mode: C
++ * c-file-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
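A quick standalone check of the bottom-bit encoding described in the opt_tsx comment earlier in this file (values taken from that comment; this is just an illustration, not patch code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 1 and 0 are explicit choices, -1 is the implicit default. */
    int8_t vals[] = { 1, 0, -1 };

    for ( unsigned int i = 0; i < sizeof(vals); i++ )
        printf("opt_tsx=%2d -> TSX %sabled\n", vals[i],
               (vals[i] & 1) ? "en" : "dis");
    return 0;
}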
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 89ae3e03f1..5ee7a37c12 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -55,6 +55,7 @@
+ #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
++#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+@@ -62,6 +63,10 @@
+ #define MSR_TSX_FORCE_ABORT 0x0000010f
+ #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0)
+
++#define MSR_TSX_CTRL 0x00000122
++#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0)
++#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1)
++
+ /* Intel MSRs. Some also available on other CPUs */
+ #define MSR_IA32_PERFCTR0 0x000000c1
+ #define MSR_IA32_A_PERFCTR0 0x000004c1
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 20d1ecb332..66224f23b9 100644
+--- xen/include/asm-x86/processor.h.orig
++++ xen/include/asm-x86/processor.h
+@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx(
+ return ebx;
+ }
+
++static always_inline unsigned int cpuid_count_edx(
++ unsigned int leaf, unsigned int subleaf)
++{
++ unsigned int edx, tmp;
++
++ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
++
++ return edx;
++}
++
+ static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
+ struct cpuid_leaf *data)
+ {
+@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
+ return fam;
+ }
+
++extern int8_t opt_tsx, cpu_has_tsx_ctrl;
++void tsx_init(void);
++
+ #endif /* !__ASSEMBLY__ */
+
+ #endif /* __ASM_X86_PROCESSOR_H */
+diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
+index 750f809968..be223a6950 100644
+--- xen/include/xen/lib.h.orig
++++ xen/include/xen/lib.h
+@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
+ #define gprintk(lvl, fmt, args...) \
+ printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
+
++#define printk_once(fmt, args...) \
++({ \
++ static bool __read_mostly once_; \
++ if ( unlikely(!once_) ) \
++ { \
++ once_ = true; \
++ printk(fmt, ## args); \
++ } \
++})
++
+ #ifdef NDEBUG
+
+ static inline void
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel
+
+See patch documentation and comments.
+
+This is part of XSA-305 / CVE-2019-11135
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index b86d26399a..31635a473a 100644
+--- docs/misc/xen-command-line.markdown.orig
++++ docs/misc/xen-command-line.markdown
+@@ -1841,7 +1841,7 @@ extreme care.**
+ An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
+ mitigations, including pieces of infrastructure used to virtualise certain
+ mitigation features for guests. This also includes settings which `xpti`,
+-`smt`, `pv-l1tf` control, unless the respective option(s) have been
++`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
+ specified earlier on the command line.
+
+ Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
+@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter.
+ = <bool>
+
+ Applicability: x86
+- Default: true
++ Default: false on parts vulnerable to TAA, true otherwise
+
+ Controls for the use of Transactional Synchronization eXtensions.
+
+@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off.
+ On systems with the ability to turn TSX off, this boolean offers system wide
+ control of whether TSX is enabled or disabled.
+
++On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
++logic applies:
++
++ * An explicit `tsx=` choice is honoured, even if it is `true` and would
++ result in a vulnerable system.
++
++ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
++ mitigated by disabling TSX, as this is the lowest overhead option.
++
++ * If the use of TSX is important, the more expensive TAA mitigations can be
++ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
++ active by default.
++
+ ### ucode (x86)
+ > `= [<integer> | scan]`
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 2fe16b423d..ab196b156d 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s)
+ if ( opt_pv_l1tf_domu < 0 )
+ opt_pv_l1tf_domu = 0;
+
++ if ( opt_tsx == -1 )
++ opt_tsx = -3;
++
+ disable_common:
+ opt_rsb_pv = false;
+ opt_rsb_hvm = false;
+@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ printk("Speculative mitigation facilities:\n");
+
+ /* Hardware features which pertain to speculative mitigations. */
+- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
+@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
+ (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
+ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
+- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "");
++ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
++ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "");
+
+ /* Compiled-in support which pertains to mitigations. */
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ "\n");
+
+ /* Settings for Xen's protection, irrespective of guests. */
+- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
++ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
+ thunk == THUNK_NONE ? "N/A" :
+ thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+ thunk == THUNK_LFENCE ? "LFENCE" :
+@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
+ !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
+ (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
++ !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
++ (opt_tsx & 1) ? " TSX+" : " TSX-",
+ opt_ibpb ? " IBPB" : "",
+ opt_l1d_flush ? " L1D_FLUSH" : "",
+ opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "");
+@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void)
+ {
+ enum ind_thunk thunk = THUNK_DEFAULT;
+ bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
++ bool cpu_has_bug_taa;
+ uint64_t caps = 0;
+
+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
+@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void)
+ "enabled. Mitigations will not be fully effective. Please\n"
+ "choose an explicit smt=<bool> setting. See XSA-297.\n");
+
++ /*
++ * Vulnerability to TAA is a little complicated to quantify.
++ *
++ * In the pipeline, it is just another way to get speculative access to
++ * stale load port, store buffer or fill buffer data, and therefore can be
++ * considered a superset of MDS (on TSX-capable parts). On parts which
++ * predate MDS_NO, the existing VERW flushing will mitigate this
++ * sidechannel as well.
++ *
++ * On parts which contain MDS_NO, the lack of VERW flushing means that an
++ * attacker can still use TSX to target microarchitectural buffers to leak
++ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts
++ * which have MDS_NO but lack TAA_NO.
++ *
++ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
++ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so
++ * we check both to spot TSX in a microcode/cmdline independent way.
++ */
++ cpu_has_bug_taa =
++ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
++ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
++
++ /*
++ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
++ * the MDS mitigation of disabling HT and using VERW flushing.
++ *
++ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
++ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
++ * advertised, and there isn't a MD_CLEAR_2 flag to use...
++ *
++ * If we're on affected hardware, able to do something about it (which
++ * implies that VERW now works), no explicit TSX choice and traditional
++     * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
++ * plausibly value TSX higher than Hyperthreading...), disable TSX to
++ * mitigate TAA.
++ */
++ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
++ ((hw_smt_enabled && opt_smt) ||
++ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
++ {
++ setup_clear_cpu_cap(X86_FEATURE_HLE);
++ setup_clear_cpu_cap(X86_FEATURE_RTM);
++
++ opt_tsx = 0;
++ tsx_init();
++ }
++
+ print_details(thunk, caps);
+
+ /*
+diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
+index a8ec2ccc69..2d202a0d4e 100644
+--- xen/arch/x86/tsx.c.orig
++++ xen/arch/x86/tsx.c
+@@ -5,7 +5,8 @@
+ * Valid values:
+ * 1 => Explicit tsx=1
+ * 0 => Explicit tsx=0
+- * -1 => Default, implicit tsx=1
++ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
++ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
+ *
+ * This is arranged such that the bottom bit encodes whether TSX is actually
+ * disabled, while identifying various explicit (>=0) and implicit (<0)
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 5ee7a37c12..1761a01f1f 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -56,6 +56,7 @@
+ #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
+ #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
++#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)