author     bsiegert <bsiegert@pkgsrc.org>  2020-10-22 16:29:04 +0000
committer  bsiegert <bsiegert@pkgsrc.org>  2020-10-22 16:29:04 +0000
commit     ce1e23587abc3575c99b482e0bb07ab558654745 (patch)
tree       1f923f2846b731c4a75f0a45e06ad4f6ff021106
parent     cee66215fe3f245187c2d7628f537ae6d217f191 (diff)
download   pkgsrc-ce1e23587abc3575c99b482e0bb07ab558654745.tar.gz
Pullup ticket #6355 - requested by bouyer
sysutils/xenkernel411: security fix
sysutils/xenkernel413: security fix

Revisions pulled up:
- sysutils/xenkernel411/Makefile                 1.17
- sysutils/xenkernel411/distinfo                 1.15
- sysutils/xenkernel411/patches/patch-XSA286     1.1
- sysutils/xenkernel411/patches/patch-XSA345     1.1
- sysutils/xenkernel411/patches/patch-XSA346     1.1
- sysutils/xenkernel411/patches/patch-XSA347     1.1
- sysutils/xenkernel413/Makefile                 1.6
- sysutils/xenkernel413/distinfo                 1.4
- sysutils/xenkernel413/patches/patch-XSA286     1.1
- sysutils/xenkernel413/patches/patch-XSA345     1.1
- sysutils/xenkernel413/patches/patch-XSA346     1.1
- sysutils/xenkernel413/patches/patch-XSA347     1.1

---
Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Oct 21 09:03:05 UTC 2020

Modified Files:
        pkgsrc/sysutils/xenkernel411: Makefile distinfo
Added Files:
        pkgsrc/sysutils/xenkernel411/patches: patch-XSA286 patch-XSA345 patch-XSA346 patch-XSA347

Log Message:
Add upstream security patches for XSA286, XSA345, XSA346, XSA347.
Bump PKGREVISION.

---
Module Name:    pkgsrc
Committed By:   bouyer
Date:           Wed Oct 21 09:04:10 UTC 2020

Modified Files:
        pkgsrc/sysutils/xenkernel413: Makefile distinfo
Added Files:
        pkgsrc/sysutils/xenkernel413/patches: patch-XSA286 patch-XSA345 patch-XSA346 patch-XSA347

Log Message:
Add upstream security patches for XSA286, XSA345, XSA346, XSA347.
Bump PKGREVISION.

-rw-r--r--  sysutils/xenkernel411/Makefile                    4
-rw-r--r--  sysutils/xenkernel411/distinfo                    6
-rw-r--r--  sysutils/xenkernel411/patches/patch-XSA286      778
-rw-r--r--  sysutils/xenkernel411/patches/patch-XSA345      413
-rw-r--r--  sysutils/xenkernel411/patches/patch-XSA346      261
-rw-r--r--  sysutils/xenkernel411/patches/patch-XSA347      134
-rw-r--r--  sysutils/xenkernel413/Makefile                    4
-rw-r--r--  sysutils/xenkernel413/distinfo                    6
-rw-r--r--  sysutils/xenkernel413/patches/patch-XSA286      716
-rw-r--r--  sysutils/xenkernel413/patches/patch-XSA345      413
-rw-r--r--  sysutils/xenkernel413/patches/patch-XSA346      256
-rw-r--r--  sysutils/xenkernel413/patches/patch-XSA347      282
12 files changed, 3267 insertions, 6 deletions
diff --git a/sysutils/xenkernel411/Makefile b/sysutils/xenkernel411/Makefile
index f0bd54bd6b6..7bab3a61f2f 100644
--- a/sysutils/xenkernel411/Makefile
+++ b/sysutils/xenkernel411/Makefile
@@ -1,8 +1,8 @@
-# $NetBSD: Makefile,v 1.15.2.1 2020/10/04 20:44:32 bsiegert Exp $
+# $NetBSD: Makefile,v 1.15.2.2 2020/10/22 16:29:04 bsiegert Exp $
VERSION= 4.11.4
#keep >= 1 if we have security patches
-PKGREVISION= 2
+PKGREVISION= 3
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel411-${VERSION}
CATEGORIES= sysutils
diff --git a/sysutils/xenkernel411/distinfo b/sysutils/xenkernel411/distinfo
index d10a8907e3c..ab6205cee4c 100644
--- a/sysutils/xenkernel411/distinfo
+++ b/sysutils/xenkernel411/distinfo
@@ -1,10 +1,11 @@
-$NetBSD: distinfo,v 1.13.2.1 2020/10/04 20:44:32 bsiegert Exp $
+$NetBSD: distinfo,v 1.13.2.2 2020/10/22 16:29:04 bsiegert Exp $
SHA1 (xen411/xen-4.11.4.tar.gz) = 6c8cdf441621c14dc5345196b48df6982c060c4f
RMD160 (xen411/xen-4.11.4.tar.gz) = 49819fcd1de3985d4dea370be962548c862f2933
SHA512 (xen411/xen-4.11.4.tar.gz) = 8383f0b369fa08c8ecfdd68f902a2aaad140146a183131c50c020fe04c2f1e829c219b9bd9923fa8f1c180e1e7c6e73d0d68b7015fc39fd3b7f59e55c680cedb
Size (xen411/xen-4.11.4.tar.gz) = 25184564 bytes
SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA286) = c7c5cc192be821721919cc035515ddf55d2c0658
SHA1 (patch-XSA317) = 3a3e7bf8f115bebaf56001afcf68c2bd501c00a5
SHA1 (patch-XSA319) = 4954bdc849666e1c735c3281256e4850c0594ee8
SHA1 (patch-XSA320) = 38d84a2ded4ccacee455ba64eb3b369e5661fbfd
@@ -19,6 +20,9 @@ SHA1 (patch-XSA340) = 23888acfe25fc82ff085fa9acfbb36c156a15bc3
SHA1 (patch-XSA342) = a61c4e28a8c8219b88e3bab534a109b2b29e2cc3
SHA1 (patch-XSA343) = 239822636b474ebb62aa455cfdbd9853c4fb342f
SHA1 (patch-XSA344) = cf7184ac9263b418305c6a7fbae7b163b233b4bc
+SHA1 (patch-XSA345) = 14ab754703af1045b2d049de1c6ba1c5baca5d81
+SHA1 (patch-XSA346) = c1962c037c5ab62c2f7e9a558c4565331c981be0
+SHA1 (patch-XSA347) = f3f98a794584d5d4321b95c2b1b9c88821fa567e
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
diff --git a/sysutils/xenkernel411/patches/patch-XSA286 b/sysutils/xenkernel411/patches/patch-XSA286
new file mode 100644
index 00000000000..cb314e17b8f
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA286
@@ -0,0 +1,778 @@
+$NetBSD: patch-XSA286,v 1.1.2.2 2020/10/22 16:29:04 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV
+
+The flag is really only meant for those, both HVM and 32-bit PV tell
+kernel from user mode based on CPL/RPL. Remove the all-question-marks
+comment and let's be on the safe side here and also suppress clearing
+for 32-bit PV (this isn't a fast path after all).
+
+Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
+sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
+assumes the new behavior.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: George Dunlap <george.dunlap@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 35857dbe86..1d0ac81c5b 100644
+--- xen/arch/x86/domain.c.orig
++++ xen/arch/x86/domain.c
+@@ -804,9 +804,15 @@ int arch_set_info_guest(
+
+ v->fpu_initialised = !!(flags & VGCF_I387_VALID);
+
+- v->arch.flags &= ~TF_kernel_mode;
+- if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
+- v->arch.flags |= TF_kernel_mode;
++ v->arch.flags |= TF_kernel_mode;
++ if ( unlikely(!(flags & VGCF_in_kernel)) &&
++ /*
++ * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
++ * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
++ * shadow_one_bit_disable() for why that is.
++ */
++ !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
++ v->arch.flags &= ~TF_kernel_mode;
+
+ v->arch.vgc_flags = flags;
+
+diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
+index 8ab343d16e..a2ebb4943f 100644
+--- xen/arch/x86/mm/shadow/multi.c.orig
++++ xen/arch/x86/mm/shadow/multi.c
+@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ INVALID_MFN, v->arch.paging.shadow.gl3e);
+ #else /* 32 or 64 */
+ const struct domain *d = v->domain;
+- mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
++ mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
+ ? pagetable_get_mfn(v->arch.guest_table)
+ : pagetable_get_mfn(v->arch.guest_table_user));
+ void *root_map = map_domain_page(root_mfn);
+@@ -4018,7 +4018,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+ v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+ #if GUEST_PAGING_LEVELS == 4
+- if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
++ if ( !(v->arch.flags & TF_kernel_mode) )
+ gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+ else
+ #endif
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()
+
+The L3 one at least is going to be re-used by a subsequent patch, and
+splitting the L4 one then as well seems only natural.
+
+This is part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index 3bd157967a..e73daa55e4 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
+
+ l2_pgentry_t *compat_idle_pg_table_l2;
+
+-void *do_page_walk(struct vcpu *v, unsigned long addr)
++static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ {
+- unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
+- l4_pgentry_t l4e, *l4t;
+- l3_pgentry_t l3e, *l3t;
+- l2_pgentry_t l2e, *l2t;
+- l1_pgentry_t l1e, *l1t;
++ unsigned long mfn = pagetable_get_pfn(root);
++ l4_pgentry_t *l4t, l4e;
+
+- if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
+- return NULL;
++ if ( !is_canonical_address(addr) )
++ return l4e_empty();
+
+ l4t = map_domain_page(_mfn(mfn));
+ l4e = l4t[l4_table_offset(addr)];
+ unmap_domain_page(l4t);
++
++ return l4e;
++}
++
++static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
++{
++ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
++ l3_pgentry_t *l3t, l3e;
++
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+- return NULL;
++ return l3e_empty();
+
+ l3t = map_l3t_from_l4e(l4e);
+ l3e = l3t[l3_table_offset(addr)];
+ unmap_domain_page(l3t);
++
++ return l3e;
++}
++
++void *do_page_walk(struct vcpu *v, unsigned long addr)
++{
++ l3_pgentry_t l3e;
++ l2_pgentry_t l2e, *l2t;
++ l1_pgentry_t l1e, *l1t;
++ unsigned long mfn;
++
++ if ( !is_pv_vcpu(v) )
++ return NULL;
++
++ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
+ mfn = l3e_get_pfn(l3e);
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ return NULL;
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: check page types in do_page_walk()
+
+For page table entries read to be guaranteed valid, transiently locking
+the pages and validating their types is necessary. Note that guest use
+of linear page tables is intentionally not taken into account here, as
+ordinary data (guest stacks) can't possibly live inside page tables.
+
+This is part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index e73daa55e4..1ca9547d68 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
+
+ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ {
+- unsigned long mfn = pagetable_get_pfn(root);
+- l4_pgentry_t *l4t, l4e;
++ mfn_t mfn = pagetable_get_mfn(root);
++ /* current's root page table can't disappear under our feet. */
++ bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
++ struct page_info *pg;
++ l4_pgentry_t l4e = l4e_empty();
+
+ if ( !is_canonical_address(addr) )
+ return l4e_empty();
+
+- l4t = map_domain_page(_mfn(mfn));
+- l4e = l4t[l4_table_offset(addr)];
+- unmap_domain_page(l4t);
++ pg = mfn_to_page(mfn);
++ if ( need_lock && !page_lock(pg) )
++ return l4e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
++ {
++ l4_pgentry_t *l4t = map_domain_page(mfn);
++
++ l4e = l4t[l4_table_offset(addr)];
++ unmap_domain_page(l4t);
++ }
++
++ if ( need_lock )
++ page_unlock(pg);
+
+ return l4e;
+ }
+@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ {
+ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+- l3_pgentry_t *l3t, l3e;
++ mfn_t mfn = l4e_get_mfn(l4e);
++ struct page_info *pg;
++ l3_pgentry_t l3e = l3e_empty();
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+ return l3e_empty();
+
+- l3t = map_l3t_from_l4e(l4e);
+- l3e = l3t[l3_table_offset(addr)];
+- unmap_domain_page(l3t);
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l3e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
++ {
++ l3_pgentry_t *l3t = map_domain_page(mfn);
++
++ l3e = l3t[l3_table_offset(addr)];
++ unmap_domain_page(l3t);
++ }
++
++ page_unlock(pg);
+
+ return l3e;
+ }
+@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+- l2_pgentry_t l2e, *l2t;
+- l1_pgentry_t l1e, *l1t;
+- unsigned long mfn;
++ l2_pgentry_t l2e = l2e_empty();
++ l1_pgentry_t l1e = l1e_empty();
++ mfn_t mfn;
++ struct page_info *pg;
+
+ if ( !is_pv_vcpu(v) )
+ return NULL;
+
+ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
+- mfn = l3e_get_pfn(l3e);
+- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ mfn = l3e_get_mfn(l3e);
++ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+ if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
+ {
+- mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
++ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
+ goto ret;
+ }
+
+- l2t = map_domain_page(_mfn(mfn));
+- l2e = l2t[l2_table_offset(addr)];
+- unmap_domain_page(l2t);
+- mfn = l2e_get_pfn(l2e);
+- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return NULL;
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
++ {
++ const l2_pgentry_t *l2t = map_domain_page(mfn);
++
++ l2e = l2t[l2_table_offset(addr)];
++ unmap_domain_page(l2t);
++ }
++
++ page_unlock(pg);
++
++ mfn = l2e_get_mfn(l2e);
++ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+ if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
+ {
+- mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
++ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
+ goto ret;
+ }
+
+- l1t = map_domain_page(_mfn(mfn));
+- l1e = l1t[l1_table_offset(addr)];
+- unmap_domain_page(l1t);
+- mfn = l1e_get_pfn(l1e);
+- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return NULL;
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
++ {
++ const l1_pgentry_t *l1t = map_domain_page(mfn);
++
++ l1e = l1t[l1_table_offset(addr)];
++ unmap_domain_page(l1t);
++ }
++
++ page_unlock(pg);
++
++ mfn = l1e_get_mfn(l1e);
++ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+
+ ret:
+- return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
++ return map_domain_page(mfn) + (addr & ~PAGE_MASK);
+ }
+
+ /*
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Replace the linear L2 table access by an actual page walk.
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
+index 80bf280fb2..ee08c13881 100644
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
+ if ( unlikely(!__addr_ok(linear)) )
+ return NULL;
+
+- /* Find this l1e and its enclosing l1mfn in the linear map. */
+- if ( __copy_from_user(&l2e,
+- &__linear_l2_table[l2_linear_offset(linear)],
+- sizeof(l2_pgentry_t)) )
++ if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
++ {
++ ASSERT_UNREACHABLE();
+ return NULL;
++ }
++
++ /* Find this l1e and its enclosing l1mfn. */
++ l2e = page_walk_get_l2e(current->arch.guest_table, linear);
+
+ /* Check flags that it will be safe to read the l1e. */
+ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index 1ca9547d68..dfa33ba894 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ return l3e;
+ }
+
++l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
++{
++ l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
++ mfn_t mfn = l3e_get_mfn(l3e);
++ struct page_info *pg;
++ l2_pgentry_t l2e = l2e_empty();
++
++ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
++ (l3e_get_flags(l3e) & _PAGE_PSE) )
++ return l2e_empty();
++
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l2e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
++ {
++ l2_pgentry_t *l2t = map_domain_page(mfn);
++
++ l2e = l2t[l2_table_offset(addr)];
++ unmap_domain_page(l2t);
++ }
++
++ page_unlock(pg);
++
++ return l2e;
++}
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 7825691d06..afafe87fe7 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -585,7 +585,9 @@ void audit_domains(void);
+ void make_cr3(struct vcpu *v, mfn_t mfn);
+ void update_cr3(struct vcpu *v);
+ int vcpu_destroy_pagetables(struct vcpu *);
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr);
++l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+
+ int __sync_local_execstate(void);
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()
+
+First of all drop guest_get_eff_l1e() entirely - there's no actual user
+of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
+its only call site.
+
+Then replace the linear L1 table access by an actual page walk.
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
+index ee08c13881..c70785d0cf 100644
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
+ }
+
+ /*
+- * Read the guest's l1e that maps this address, from the kernel-mode
+- * page tables.
+- */
+-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
+-{
+- struct vcpu *curr = current;
+- const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
+- l1_pgentry_t l1e;
+-
+- if ( user_mode )
+- toggle_guest_pt(curr);
+-
+- l1e = guest_get_eff_l1e(linear);
+-
+- if ( user_mode )
+- toggle_guest_pt(curr);
+-
+- return l1e;
+-}
+-
+-/*
+ * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
+ * into Xen's virtual range. Returns true if the mapping changed, false
+ * otherwise.
+diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
+index 976209ba4c..cc4ee1affb 100644
+--- xen/arch/x86/pv/mm.h.orig
++++ xen/arch/x86/pv/mm.h
+@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
+
+ int new_guest_cr3(mfn_t mfn);
+
+-/* Read a PV guest's l1e that maps this linear address. */
+-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
++/*
++ * Read the guest's l1e that maps this address, from the kernel-mode
++ * page tables.
++ */
++static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
+ {
+- l1_pgentry_t l1e;
++ l1_pgentry_t l1e = l1e_empty();
+
+ ASSERT(!paging_mode_translate(current->domain));
+ ASSERT(!paging_mode_external(current->domain));
+
+- if ( unlikely(!__addr_ok(linear)) ||
+- __copy_from_user(&l1e,
+- &__linear_l1_table[l1_linear_offset(linear)],
+- sizeof(l1_pgentry_t)) )
+- l1e = l1e_empty();
++ if ( likely(__addr_ok(linear)) )
++ l1e = page_walk_get_l1e(current->arch.guest_table, linear);
+
+ return l1e;
+ }
+diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
+index a3c0c2dd19..c9ee5156f8 100644
+--- xen/arch/x86/pv/ro-page-fault.c.orig
++++ xen/arch/x86/pv/ro-page-fault.c
+@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+ bool mmio_ro;
+
+ /* Attempt to read the PTE that maps the VA being accessed. */
+- pte = guest_get_eff_l1e(addr);
++ pte = guest_get_eff_kern_l1e(addr);
+
+ /* We are only looking for read-only mappings */
+ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index dfa33ba894..cca7ea6e9d 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+ return l2e;
+ }
+
++/*
++ * For now no "set_accessed" parameter, as all callers want it set to true.
++ * For now also no "set_dirty" parameter, as all callers deal with r/o
++ * mappings, and we don't want to set the dirty bit there (conflicts with
++ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
++ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
++ */
++l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
++{
++ l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
++ mfn_t mfn = l2e_get_mfn(l2e);
++ struct page_info *pg;
++ l1_pgentry_t l1e = l1e_empty();
++
++ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
++ (l2e_get_flags(l2e) & _PAGE_PSE) )
++ return l1e_empty();
++
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l1e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
++ {
++ l1_pgentry_t *l1t = map_domain_page(mfn);
++
++ l1e = l1t[l1_table_offset(addr)];
++
++ if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
++ _PAGE_PRESENT )
++ {
++ l1_pgentry_t ol1e = l1e;
++
++ l1e_add_flags(l1e, _PAGE_ACCESSED);
++ /*
++ * Best effort only; with the lock held the page shouldn't
++ * change anyway, except for the dirty bit to perhaps become set.
++ */
++ while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
++ l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
++ l1e_get_intpte(ol1e) &&
++ !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
++ {
++ l1e_add_flags(ol1e, _PAGE_DIRTY);
++ l1e_add_flags(l1e, _PAGE_DIRTY);
++ }
++ }
++
++ unmap_domain_page(l1t);
++ }
++
++ page_unlock(pg);
++
++ return l1e;
++}
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index afafe87fe7..423313ae3a 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
+
+ void *do_page_walk(struct vcpu *v, unsigned long addr);
+ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
++l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
+
+ int __sync_local_execstate(void);
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using top level linear page tables in
+ {,un}map_domain_page()
+
+Move the page table recursion two levels down. This entails avoiding
+to free the recursive mapping prematurely in free_perdomain_mappings().
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
+index 0c24530ed9..d89fa27f8e 100644
+--- xen/arch/x86/domain_page.c.orig
++++ xen/arch/x86/domain_page.c
+@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
+ #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
+ #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
+ #define MAPCACHE_L1ENT(idx) \
+- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
++ ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
++ ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
+
+ void *map_domain_page(mfn_t mfn)
+ {
+@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
+ {
+ struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
+ unsigned int bitmap_pages;
++ int rc;
+
+ ASSERT(is_pv_domain(d));
+
+@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
+ return 0;
+ #endif
+
++ BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
+ BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
+- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
++ 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
++ (1U << L2_PAGETABLE_SHIFT) >
+ MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
+ bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
+ dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
+@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
+
+ spin_lock_init(&dcache->lock);
+
+- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+- 2 * bitmap_pages + 1,
+- NIL(l1_pgentry_t *), NULL);
++ rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
++ 2 * bitmap_pages + 1,
++ NIL(l1_pgentry_t *), NULL);
++ if ( !rc )
++ {
++ /*
++ * Install mapping of our L2 table into its own last slot, for easy
++ * access to the L1 entries via MAPCACHE_L1ENT().
++ */
++ l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
++ l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
++ l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
++
++ l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
++ unmap_domain_page(l2t);
++ unmap_domain_page(l3t);
++ }
++
++ return rc;
+ }
+
+ int mapcache_vcpu_init(struct vcpu *v)
+@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
+ else
+ {
+ ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
+- pl1e = &__linear_l1_table[l1_linear_offset(va)];
++ pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
+ }
+
+ return l1e_get_mfn(*pl1e);
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 626768a950..8f975a747d 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -6038,6 +6038,10 @@ void free_perdomain_mappings(struct domain *d)
+ {
+ struct page_info *l1pg = l2e_get_page(l2tab[j]);
+
++ /* mapcache_domain_init() installs a recursive entry. */
++ if ( l1pg == l2pg )
++ continue;
++
+ if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
+ {
+ l1_pgentry_t *l1tab = __map_domain_page(l1pg);
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: restrict use of linear page tables to shadow mode code
+
+Other code does not require them to be set up anymore, so restrict when
+to populate the respective L4 slot and reduce visibility of the
+accessors.
+
+While with the removal of all uses the vulnerability is actually fixed,
+removing the creation of the linear mapping adds an extra layer of
+protection. Similarly reducing visibility of the accessors mostly
+eliminates the risk of undue re-introduction of uses of the linear
+mappings.
+
+This is (not strictly) part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 8f975a747d..10175764e8 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1755,9 +1755,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
+ l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
+ idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
+
+- /* Slot 258: Self linear mappings. */
++ /* Slot 258: Self linear mappings (shadow pt only). */
+ ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
+ l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
++ !shadow_mode_external(d) ? l4e_empty() :
+ l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
+
+ /* Slot 259: Shadow linear mappings (if applicable) .*/
+diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
+index c7fa18925b..1933a6a2a2 100644
+--- xen/arch/x86/mm/shadow/private.h.orig
++++ xen/arch/x86/mm/shadow/private.h
+@@ -137,6 +137,15 @@ enum {
+ # define GUEST_PTE_SIZE 4
+ #endif
+
++/* Where to find each level of the linear mapping */
++#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
++#define __linear_l2_table \
++ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
++#define __linear_l3_table \
++ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
++#define __linear_l4_table \
++ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
++
+ /******************************************************************************
+ * Auditing routines
+ */
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index cca7ea6e9d..d7551e594a 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -833,9 +833,6 @@ void __init paging_init(void)
+
+ machine_to_phys_mapping_valid = 1;
+
+- /* Set up linear page table mapping. */
+- l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
+- l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
+ return;
+
+ nomem:
+diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
+index 9ef9d03ca7..4670ab99f6 100644
+--- xen/include/asm-x86/config.h.orig
++++ xen/include/asm-x86/config.h
+@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
+ */
+ #define PCI_MCFG_VIRT_START (PML4_ADDR(257))
+ #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
+-/* Slot 258: linear page table (guest table). */
++/* Slot 258: linear page table (monitor table, HVM only). */
+ #define LINEAR_PT_VIRT_START (PML4_ADDR(258))
+ #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+ /* Slot 259: linear page table (shadow table). */
+diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
+index c1e92937c0..e72c277b9f 100644
+--- xen/include/asm-x86/page.h.orig
++++ xen/include/asm-x86/page.h
+@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
+ #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
+ #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
+
+-#endif /* !defined(__ASSEMBLY__) */
+-
+-/* Where to find each level of the linear mapping */
+-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+-#define __linear_l2_table \
+- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+-#define __linear_l3_table \
+- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+-#define __linear_l4_table \
+- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+-
+-
+-#ifndef __ASSEMBLY__
+ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
+ extern l2_pgentry_t *compat_idle_pg_table_l2;
+ extern unsigned int m2p_compat_vstart;
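
The patch-XSA286 hunks above replace reads through the linear page tables with walks that lock each page-table page and verify its type before reading an entry. The standalone C program below is only a sketch of that lock, type-check, read, unlock pattern; the struct, the simplified page_lock(), and the main() driver are invented stand-ins, not Xen's real types or API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_ENTRIES 512

/* Simplified stand-ins for Xen's page type bits and per-page lock. */
enum pt_type { PT_L1, PT_L2, PT_L3, PT_L4, PT_DATA };

struct pt_page {
    enum pt_type type;               /* models the PGT_type_mask bits    */
    bool locked;                     /* models page_lock()/page_unlock() */
    uint64_t entries[PT_ENTRIES];
};

static bool page_lock(struct pt_page *pg)
{
    if (pg->locked)                  /* the real lock is a per-page atomic */
        return false;
    pg->locked = true;
    return true;
}

static void page_unlock(struct pt_page *pg)
{
    pg->locked = false;
}

/* Read entry 'idx' only while the page is locked and its type matches;
 * anything suspicious is reported as an empty (zero) entry, the same
 * shape as the page_walk_get_l*e() helpers in the patch above. */
static uint64_t walk_get_entry(struct pt_page *pg, enum pt_type expect,
                               unsigned int idx)
{
    uint64_t e = 0;

    if (!page_lock(pg))
        return 0;
    if (pg->type == expect)
        e = pg->entries[idx % PT_ENTRIES];
    page_unlock(pg);
    return e;
}

int main(void)
{
    struct pt_page l2 = { .type = PT_L2 };

    l2.entries[3] = 0xabcd;
    printf("as L2: %#llx\n",
           (unsigned long long)walk_get_entry(&l2, PT_L2, 3));
    printf("as L1: %#llx\n",         /* wrong type reads back as empty */
           (unsigned long long)walk_get_entry(&l2, PT_L1, 3));
    return 0;
}

Falling back to an empty entry whenever the lock or the type check fails is what lets the callers in the hunks above treat an untrusted table exactly like a non-present one.
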
diff --git a/sysutils/xenkernel411/patches/patch-XSA345 b/sysutils/xenkernel411/patches/patch-XSA345
new file mode 100644
index 00000000000..5ddb19b82f5
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA345
@@ -0,0 +1,413 @@
+$NetBSD: patch-XSA345,v 1.1.2.2 2020/10/22 16:29:04 bsiegert Exp $
+
+From edbe70427e17743351f1b739ea1536acd757ae6c Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Sat, 11 Jan 2020 21:57:41 +0000
+Subject: [PATCH 1/3] x86/mm: Refactor map_pages_to_xen to have only a single
+ exit path
+
+We will soon need to perform clean-ups before returning.
+
+No functional change.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 17 +++++++++++------
+ 1 file changed, 11 insertions(+), 6 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 626768a950..79a3fac3cc 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -5194,6 +5194,7 @@ int map_pages_to_xen(
+ l2_pgentry_t *pl2e, ol2e;
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
++ int rc = -ENOMEM;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5214,7 +5215,8 @@ int map_pages_to_xen(
+ l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
+
+ if ( !pl3e )
+- return -ENOMEM;
++ goto out;
++
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5302,7 +5304,7 @@ int map_pages_to_xen(
+
+ pl2e = alloc_xen_pagetable();
+ if ( pl2e == NULL )
+- return -ENOMEM;
++ goto out;
+
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ l2e_write(pl2e + i,
+@@ -5331,7 +5333,7 @@ int map_pages_to_xen(
+
+ pl2e = virt_to_xen_l2e(virt);
+ if ( !pl2e )
+- return -ENOMEM;
++ goto out;
+
+ if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) &
+ ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
+@@ -5374,7 +5376,7 @@ int map_pages_to_xen(
+ {
+ pl1e = virt_to_xen_l1e(virt);
+ if ( pl1e == NULL )
+- return -ENOMEM;
++ goto out;
+ }
+ else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
+ {
+@@ -5401,7 +5403,7 @@ int map_pages_to_xen(
+
+ pl1e = alloc_xen_pagetable();
+ if ( pl1e == NULL )
+- return -ENOMEM;
++ goto out;
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ l1e_write(&pl1e[i],
+@@ -5545,7 +5547,10 @@ int map_pages_to_xen(
+
+ #undef flush_flags
+
+- return 0;
++ rc = 0;
++
++ out:
++ return rc;
+ }
+
+ int populate_pt_range(unsigned long virt, unsigned long nr_mfns)
+--
+2.25.1
+
+From 7101786be91dce650b6e79f1374c580c731bb348 Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Sat, 11 Jan 2020 21:57:42 +0000
+Subject: [PATCH 2/3] x86/mm: Refactor modify_xen_mappings to have one exit
+ path
+
+We will soon need to perform clean-ups before returning.
+
+No functional change.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 79a3fac3cc..8ed3ecacbe 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -5577,6 +5577,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ l1_pgentry_t *pl1e;
+ unsigned int i;
+ unsigned long v = s;
++ int rc = -ENOMEM;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5618,7 +5619,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ /* PAGE1GB: shatter the superpage and fall through. */
+ pl2e = alloc_xen_pagetable();
+ if ( !pl2e )
+- return -ENOMEM;
++ goto out;
++
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ l2e_write(pl2e + i,
+ l2e_from_pfn(l3e_get_pfn(*pl3e) +
+@@ -5673,7 +5675,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ /* PSE: shatter the superpage and try again. */
+ pl1e = alloc_xen_pagetable();
+ if ( !pl1e )
+- return -ENOMEM;
++ goto out;
++
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ l1e_write(&pl1e[i],
+ l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
+@@ -5802,7 +5805,10 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ flush_area(NULL, FLUSH_TLB_GLOBAL);
+
+ #undef FLAGS_MASK
+- return 0;
++ rc = 0;
++
++ out:
++ return rc;
+ }
+
+ #undef flush_area
+--
+2.25.1
+
+From e7bbc4a0b5af76a82f0dcf4afcbf1509b020eb73 Mon Sep 17 00:00:00 2001
+From: Hongyan Xia <hongyxia@amazon.com>
+Date: Sat, 11 Jan 2020 21:57:43 +0000
+Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates
+
+map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB
+superpages if possible, to maximize TLB efficiency. This means both
+replacing superpage entries with smaller entries, and replacing
+smaller entries with superpages.
+
+Unfortunately, while some potential races are handled correctly,
+others are not. These include:
+
+1. When one processor modifies a sub-superpage mapping while another
+processor replaces the entire range with a superpage.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and
+B.
+
+* A walks the pagetables, get a pointer to L2.
+* B replaces L3[N] with a 1GiB mapping.
+* B Frees L2
+* A writes L2[M] #
+
+This is race exacerbated by the fact that virt_to_xen_l[21]e doesn't
+handle higher-level superpages properly: If you call virt_xen_to_l2e
+on a virtual address within an L3 superpage, you'll either hit a BUG()
+(most likely), or get a pointer into the middle of a data page; same
+with virt_xen_to_l1 on a virtual address within either an L3 or L2
+superpage.
+
+So take the following example:
+
+* A reads pl3e and discovers it to point to an L2.
+* B replaces L3[N] with a 1GiB mapping
+* A calls virt_to_xen_l2e() and hits the BUG_ON() #
+
+2. When two processors simultaneously try to replace a sub-superpage
+mapping with a superpage mapping.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and B,
+both trying to replace L3[N] with a superpage.
+
+* A walks the pagetables, get a pointer to pl3e, and takes a copy ol3e pointing to L2.
+* B walks the pagetables, gets a pointre to pl3e, and takes a copy ol3e pointing to L2.
+* A writes the new value into L3[N]
+* B writes the new value into L3[N]
+* A recursively frees all the L1's under L2, then frees L2
+* B recursively double-frees all the L1's under L2, then double-frees L2 #
+
+Fix this by grabbing a lock for the entirety of the mapping update
+operation.
+
+Rather than grabbing map_pgdir_lock for the entire operation, however,
+repurpose the PGT_locked bit from L3's page->type_info as a lock.
+This means that rather than locking the entire address space, we
+"only" lock a single 512GiB chunk of hypervisor address space at a
+time.
+
+There was a proposal for a lock-and-reverify approach, where we walk
+the pagetables to the point where we decide what to do; then grab the
+map_pgdir_lock, re-verify the information we collected without the
+lock, and finally make the change (starting over again if anything had
+changed). Without being able to guarantee that the L2 table wasn't
+freed, however, that means every read would need to be considered
+potentially unsafe. Thinking carefully about that is probably
+something that wants to be done on public, not under time pressure.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 89 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 8ed3ecacbe..4ff24de73d 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -2153,6 +2153,50 @@ void page_unlock(struct page_info *page)
+ current_locked_page_set(NULL);
+ }
+
++/*
++ * L3 table locks:
++ *
++ * Used for serialization in map_pages_to_xen() and modify_xen_mappings().
++ *
++ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to
++ * reuse the PGT_locked flag. This lock is taken only when we move down to L3
++ * tables and below, since L4 (and above, for 5-level paging) is still globally
++ * protected by map_pgdir_lock.
++ *
++ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock().
++ * This has two implications:
++ * - We cannot reuse reuse current_locked_page_* for debugging
++ * - To avoid the chance of deadlock, even for different pages, we
++ * must never grab page_lock() after grabbing l3t_lock(). This
++ * includes any page_lock()-based locks, such as
++ * mem_sharing_page_lock().
++ *
++ * Also note that we grab the map_pgdir_lock while holding the
++ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
++ * reverse order.
++ */
++static void l3t_lock(struct page_info *page)
++{
++ unsigned long x, nx;
++
++ do {
++ while ( (x = page->u.inuse.type_info) & PGT_locked )
++ cpu_relax();
++ nx = x | PGT_locked;
++ } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
++}
++
++static void l3t_unlock(struct page_info *page)
++{
++ unsigned long x, nx, y = page->u.inuse.type_info;
++
++ do {
++ x = y;
++ BUG_ON(!(x & PGT_locked));
++ nx = x & ~PGT_locked;
++ } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
++}
++
+ /*
+ * PTE flags that a guest may change without re-validating the PTE.
+ * All other bits affect translation, caching, or Xen's safety.
+@@ -5184,6 +5228,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+ flush_area_local((const void *)v, f) : \
+ flush_area_all((const void *)v, f))
+
++#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
++
++#define L3T_LOCK(page) \
++ do { \
++ if ( locking ) \
++ l3t_lock(page); \
++ } while ( false )
++
++#define L3T_UNLOCK(page) \
++ do { \
++ if ( locking && (page) != ZERO_BLOCK_PTR ) \
++ { \
++ l3t_unlock(page); \
++ (page) = ZERO_BLOCK_PTR; \
++ } \
++ } while ( false )
++
+ int map_pages_to_xen(
+ unsigned long virt,
+ mfn_t mfn,
+@@ -5195,6 +5256,7 @@ int map_pages_to_xen(
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5210,13 +5272,20 @@ int map_pages_to_xen(
+ } \
+ } while (0)
+
++ L3T_INIT(current_l3page);
++
+ while ( nr_mfns != 0 )
+ {
+- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
++ l3_pgentry_t *pl3e, ol3e;
+
++ L3T_UNLOCK(current_l3page);
++
++ pl3e = virt_to_xen_l3e(virt);
+ if ( !pl3e )
+ goto out;
+
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5550,6 +5619,7 @@ int map_pages_to_xen(
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
+@@ -5578,6 +5648,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ unsigned int i;
+ unsigned long v = s;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5586,11 +5657,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ ASSERT(IS_ALIGNED(s, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(e, PAGE_SIZE));
+
++ L3T_INIT(current_l3page);
++
+ while ( v < e )
+ {
+- l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
++ l3_pgentry_t *pl3e;
++
++ L3T_UNLOCK(current_l3page);
+
+- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
++ pl3e = virt_to_xen_l3e(v);
++ if ( !pl3e )
++ goto out;
++
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
++
++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+ {
+ /* Confirm the caller isn't trying to create new mappings. */
+ ASSERT(!(nf & _PAGE_PRESENT));
+@@ -5808,9 +5890,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
++#undef L3T_LOCK
++#undef L3T_UNLOCK
++
+ #undef flush_area
+
+ int destroy_xen_mappings(unsigned long s, unsigned long e)
+--
+2.25.1
+
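
patch-XSA345 above serializes hypervisor mapping updates by reusing the PGT_locked bit in an L3 table page's type_info word as a lock, taken and released with compare-and-swap. The self-contained C program below sketches only that lock/unlock shape; the struct, the PGT_locked value, the busy-wait and the main() driver are simplified assumptions, not Xen's actual definitions.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define PGT_locked 1UL               /* assumed flag value, not Xen's */

struct page_info {
    _Atomic unsigned long type_info;
};

static void l3t_lock(struct page_info *page)
{
    unsigned long x, nx;

    do {
        /* Wait for the lock bit to clear, then try to claim it. */
        while ((x = atomic_load(&page->type_info)) & PGT_locked)
            ;                        /* the real code calls cpu_relax() */
        nx = x | PGT_locked;
    } while (!atomic_compare_exchange_strong(&page->type_info, &x, nx));
}

static void l3t_unlock(struct page_info *page)
{
    unsigned long x = atomic_load(&page->type_info), nx;

    do {
        assert(x & PGT_locked);      /* models the BUG_ON() in the patch */
        nx = x & ~PGT_locked;
    } while (!atomic_compare_exchange_strong(&page->type_info, &x, nx));
}

int main(void)
{
    struct page_info pg = { .type_info = 0x40 };

    l3t_lock(&pg);
    printf("locked:   %#lx\n", atomic_load(&pg.type_info));
    l3t_unlock(&pg);
    printf("unlocked: %#lx\n", atomic_load(&pg.type_info));
    return 0;
}

Locking only the L3 page, rather than holding map_pgdir_lock for the whole operation, limits contention to a single 512GiB slice of the hypervisor address space, as the patch description notes.
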
diff --git a/sysutils/xenkernel411/patches/patch-XSA346 b/sysutils/xenkernel411/patches/patch-XSA346
new file mode 100644
index 00000000000..c4f755d7c79
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA346
@@ -0,0 +1,261 @@
+$NetBSD: patch-XSA346,v 1.1.2.2 2020/10/22 16:29:04 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page
+
+Deferring flushes to a single, wide range one - as is done when
+handling XENMAPSPACE_gmfn_range - is okay only as long as
+pages don't get freed ahead of the eventual flush. While the only
+function setting the flag (xenmem_add_to_physmap()) suggests by its name
+that it's only mapping new entries, in reality the way
+xenmem_add_to_physmap_one() works means an unmap would happen not only
+for the page being moved (but not freed) but, if the destination GFN is
+populated, also for the page being displaced from that GFN. Collapsing
+the two flushes for this GFN into just one (end even more so deferring
+it to a batched invocation) is not correct.
+
+This is part of XSA-346.
+
+Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
+--- xen/common/memory.c.orig
++++ xen/common/memory.c
+@@ -298,7 +298,10 @@ int guest_remove_page(struct domain *d,
+ p2m_type_t p2mt;
+ #endif
+ mfn_t mfn;
++#ifdef CONFIG_HAS_PASSTHROUGH
++ bool *dont_flush_p, dont_flush;
+ int rc;
++#endif
+
+ #ifdef CONFIG_X86
+ mfn = get_gfn_query(d, gmfn, &p2mt);
+@@ -376,8 +379,22 @@ int guest_remove_page(struct domain *d,
+ return -ENXIO;
+ }
+
++#ifdef CONFIG_HAS_PASSTHROUGH
++ /*
++ * Since we're likely to free the page below, we need to suspend
++ * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes.
++ */
++ dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
++ dont_flush = *dont_flush_p;
++ *dont_flush_p = false;
++#endif
++
+ rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
+
++#ifdef CONFIG_HAS_PASSTHROUGH
++ *dont_flush_p = dont_flush;
++#endif
++
+ /*
+ * With the lack of an IOMMU on some platforms, domains with DMA-capable
+ * device must retrieve the same pfn when the hypercall populate_physmap
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU: hold page ref until after deferred TLB flush
+
+When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB
+flush for the "from" GFN range requires that the page remains allocated
+to the guest until the TLB flush has actually occurred. Otherwise a
+parallel hypercall to remove the page would only flush the TLB for the
+GFN it has been moved to, but not the one is was mapped at originally.
+
+This is part of XSA-346.
+
+Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
+Reported-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
+--- xen/arch/arm/mm.c.orig
++++ xen/arch/arm/mm.c
+@@ -1222,7 +1222,7 @@ void share_xen_page_with_guest(struct pa
+ int xenmem_add_to_physmap_one(
+ struct domain *d,
+ unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx,
+ gfn_t gfn)
+ {
+@@ -1294,10 +1294,6 @@ int xenmem_add_to_physmap_one(
+ break;
+ }
+ case XENMAPSPACE_dev_mmio:
+- /* extra should be 0. Reserved for future use. */
+- if ( extra.res0 )
+- return -EOPNOTSUPP;
+-
+ rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx));
+ return rc;
+
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -4634,7 +4634,7 @@ static int handle_iomem_range(unsigned l
+ int xenmem_add_to_physmap_one(
+ struct domain *d,
+ unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx,
+ gfn_t gpfn)
+ {
+@@ -4721,9 +4721,20 @@ int xenmem_add_to_physmap_one(
+ rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
+
+ put_both:
+- /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
++ /*
++ * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top.
++ * We also may need to transfer ownership of the page reference to our
++ * caller.
++ */
+ if ( space == XENMAPSPACE_gmfn )
++ {
+ put_gfn(d, gfn);
++ if ( !rc && extra.ppage )
++ {
++ *extra.ppage = page;
++ page = NULL;
++ }
++ }
+
+ if ( page )
+ put_page(page);
+--- xen/common/memory.c.orig
++++ xen/common/memory.c
+@@ -811,11 +811,10 @@ int xenmem_add_to_physmap(struct domain
+ {
+ unsigned int done = 0;
+ long rc = 0;
+- union xen_add_to_physmap_batch_extra extra;
++ union add_to_physmap_extra extra = {};
++ struct page_info *pages[16];
+
+- if ( xatp->space != XENMAPSPACE_gmfn_foreign )
+- extra.res0 = 0;
+- else
++ if ( xatp->space == XENMAPSPACE_gmfn_foreign )
+ extra.foreign_domid = DOMID_INVALID;
+
+ if ( xatp->space != XENMAPSPACE_gmfn_range )
+@@ -831,7 +830,10 @@ int xenmem_add_to_physmap(struct domain
+
+ #ifdef CONFIG_HAS_PASSTHROUGH
+ if ( need_iommu(d) )
++ {
+ this_cpu(iommu_dont_flush_iotlb) = 1;
++ extra.ppage = &pages[0];
++ }
+ #endif
+
+ while ( xatp->size > done )
+@@ -844,8 +846,12 @@ int xenmem_add_to_physmap(struct domain
+ xatp->idx++;
+ xatp->gpfn++;
+
++ if ( extra.ppage )
++ ++extra.ppage;
++
+ /* Check for continuation if it's not the last iteration. */
+- if ( xatp->size > ++done && hypercall_preempt_check() )
++ if ( (++done > ARRAY_SIZE(pages) && extra.ppage) ||
++ (xatp->size > done && hypercall_preempt_check()) )
+ {
+ rc = start + done;
+ break;
+@@ -856,6 +862,7 @@ int xenmem_add_to_physmap(struct domain
+ if ( need_iommu(d) )
+ {
+ int ret;
++ unsigned int i;
+
+ this_cpu(iommu_dont_flush_iotlb) = 0;
+
+@@ -863,6 +870,15 @@ int xenmem_add_to_physmap(struct domain
+ if ( unlikely(ret) && rc >= 0 )
+ rc = ret;
+
++ /*
++ * Now that the IOMMU TLB flush was done for the original GFN, drop
++ * the page references. The 2nd flush below is fine to make later, as
++ * whoever removes the page again from its new GFN will have to do
++ * another flush anyway.
++ */
++ for ( i = 0; i < done; ++i )
++ put_page(pages[i]);
++
+ ret = iommu_iotlb_flush(d, xatp->gpfn - done, done);
+ if ( unlikely(ret) && rc >= 0 )
+ rc = ret;
+@@ -876,6 +892,8 @@ static int xenmem_add_to_physmap_batch(s
+ struct xen_add_to_physmap_batch *xatpb,
+ unsigned int extent)
+ {
++ union add_to_physmap_extra extra = {};
++
+ if ( xatpb->size < extent )
+ return -EILSEQ;
+
+@@ -884,6 +902,19 @@ static int xenmem_add_to_physmap_batch(s
+ !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
+ return -EFAULT;
+
++ switch ( xatpb->space )
++ {
++ case XENMAPSPACE_dev_mmio:
++ /* res0 is reserved for future use. */
++ if ( xatpb->u.res0 )
++ return -EOPNOTSUPP;
++ break;
++
++ case XENMAPSPACE_gmfn_foreign:
++ extra.foreign_domid = xatpb->u.foreign_domid;
++ break;
++ }
++
+ while ( xatpb->size > extent )
+ {
+ xen_ulong_t idx;
+@@ -896,8 +927,7 @@ static int xenmem_add_to_physmap_batch(s
+ extent, 1)) )
+ return -EFAULT;
+
+- rc = xenmem_add_to_physmap_one(d, xatpb->space,
+- xatpb->u,
++ rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
+ idx, _gfn(gpfn));
+
+ if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
+--- xen/include/xen/mm.h.orig
++++ xen/include/xen/mm.h
+@@ -577,8 +577,22 @@ void scrub_one_page(struct page_info *);
+ &(d)->xenpage_list : &(d)->page_list)
+ #endif
+
++union add_to_physmap_extra {
++ /*
++ * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs
++ * to be kept until after the flush, so the page can't get removed from
++ * the domain (and re-used for another purpose) beforehand. By passing
++ * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants
++ * to have ownership of such a reference transferred in the success case.
++ */
++ struct page_info **ppage;
++
++ /* XENMAPSPACE_gmfn_foreign */
++ domid_t foreign_domid;
++};
++
+ int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx, gfn_t gfn);
+
+ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
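
The first half of patch-XSA346 above saves, clears and then restores the per-CPU iommu_dont_flush_iotlb flag around guest_physmap_remove_page(), so a page that may be freed right away gets its IOMMU TLB flush immediately instead of being deferred to a later batch. The toy C program below sketches just that save/clear/restore discipline; the thread-local flag and the print-based stand-ins for the flush and the unmap are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Models this_cpu(iommu_dont_flush_iotlb): one flag per "CPU" (thread). */
static _Thread_local bool iommu_dont_flush_iotlb;

static void iotlb_flush(unsigned long gfn)
{
    printf("IOTLB flushed for gfn %#lx\n", gfn);
}

/* Models guest_physmap_remove_page(): honours the batching flag. */
static void physmap_remove_page(unsigned long gfn)
{
    if (!iommu_dont_flush_iotlb)
        iotlb_flush(gfn);            /* must precede freeing the page */
    printf("page at gfn %#lx unmapped and freed\n", gfn);
}

/* Models guest_remove_page(): suspend the flag so the flush cannot be
 * deferred past the point where the page may be freed. */
static void remove_page(unsigned long gfn)
{
    bool dont_flush = iommu_dont_flush_iotlb;

    iommu_dont_flush_iotlb = false;
    physmap_remove_page(gfn);
    iommu_dont_flush_iotlb = dont_flush;
}

int main(void)
{
    iommu_dont_flush_iotlb = true;   /* caller is batching IOTLB flushes */
    remove_page(0x1234);
    return 0;
}
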
diff --git a/sysutils/xenkernel411/patches/patch-XSA347 b/sysutils/xenkernel411/patches/patch-XSA347
new file mode 100644
index 00000000000..d1c1fe198f4
--- /dev/null
+++ b/sysutils/xenkernel411/patches/patch-XSA347
@@ -0,0 +1,134 @@
+$NetBSD: patch-XSA347,v 1.1.2.2 2020/10/22 16:29:04 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: update live PTEs atomically
+
+Updating a live PTE word by word allows the IOMMU to see a partially
+updated entry. Construct the new entry fully in a local variable and
+then write the new entry by a single insn.
+
+This is part of XSA-347.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -41,7 +41,7 @@ static void clear_iommu_pte_present(unsi
+
+ table = map_domain_page(_mfn(l1_mfn));
+ pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1);
+- *pte = 0;
++ write_atomic(pte, 0);
+ unmap_domain_page(table);
+ }
+
+@@ -49,7 +49,7 @@ static bool_t set_iommu_pde_present(u32
+ unsigned int next_level,
+ bool_t iw, bool_t ir)
+ {
+- uint64_t addr_lo, addr_hi, maddr_next;
++ uint64_t addr_lo, addr_hi, maddr_next, full;
+ u32 entry;
+ bool need_flush = false, old_present;
+
+@@ -106,7 +106,7 @@ static bool_t set_iommu_pde_present(u32
+ if ( next_level == IOMMU_PAGING_MODE_LEVEL_0 )
+ set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+ IOMMU_PTE_FC_MASK, IOMMU_PTE_FC_SHIFT, &entry);
+- pde[1] = entry;
++ full = (uint64_t)entry << 32;
+
+ /* mark next level as 'present' */
+ set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
+@@ -118,7 +118,9 @@ static bool_t set_iommu_pde_present(u32
+ set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+ IOMMU_PDE_PRESENT_MASK,
+ IOMMU_PDE_PRESENT_SHIFT, &entry);
+- pde[0] = entry;
++ full |= entry;
++
++ write_atomic((uint64_t *)pde, full);
+
+ return need_flush;
+ }
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: ensure suitable ordering of DTE modifications
+
+DMA and interrupt translation should be enabled only after other
+applicable DTE fields have been written. Similarly when disabling
+translation or when moving a device between domains, translation should
+first be disabled, before other entry fields get modified. Note however
+that the "moving" aspect doesn't apply to the interrupt remapping side,
+as domain specifics are maintained in the IRTEs here, not the DTE. We
+also never disable interrupt remapping once it got enabled for a device
+(the respective argument passed is always the immutable iommu_intremap).
+
+This is part of XSA-347.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -147,7 +147,22 @@ void amd_iommu_set_root_page_table(
+ u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid)
+ {
+ u64 addr_hi, addr_lo;
+- u32 entry;
++ u32 entry, dte0 = dte[0];
++
++ if ( valid ||
++ get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_VALID_MASK,
++ IOMMU_DEV_TABLE_VALID_SHIFT) )
++ {
++ set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, dte0,
++ IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
++ IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &dte0);
++ set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, dte0,
++ IOMMU_DEV_TABLE_VALID_MASK,
++ IOMMU_DEV_TABLE_VALID_SHIFT, &dte0);
++ dte[0] = dte0;
++ smp_wmb();
++ }
++
+ set_field_in_reg_u32(domain_id, 0,
+ IOMMU_DEV_TABLE_DOMAIN_ID_MASK,
+ IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry);
+@@ -166,8 +181,9 @@ void amd_iommu_set_root_page_table(
+ IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK,
+ IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry);
+ dte[1] = entry;
++ smp_wmb();
+
+- set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
++ set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, dte0,
+ IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+ IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry);
+ set_field_in_reg_u32(paging_mode, entry,
+@@ -180,7 +196,7 @@ void amd_iommu_set_root_page_table(
+ IOMMU_CONTROL_DISABLED, entry,
+ IOMMU_DEV_TABLE_VALID_MASK,
+ IOMMU_DEV_TABLE_VALID_SHIFT, &entry);
+- dte[0] = entry;
++ write_atomic(&dte[0], entry);
+ }
+
+ void iommu_dte_set_iotlb(u32 *dte, u8 i)
+@@ -212,6 +228,7 @@ void __init amd_iommu_set_intremap_table
+ IOMMU_DEV_TABLE_INT_CONTROL_MASK,
+ IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry);
+ dte[5] = entry;
++ smp_wmb();
+
+ set_field_in_reg_u32((u32)addr_lo >> 6, 0,
+ IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK,
+@@ -229,7 +246,7 @@ void __init amd_iommu_set_intremap_table
+ IOMMU_CONTROL_DISABLED, entry,
+ IOMMU_DEV_TABLE_INT_VALID_MASK,
+ IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry);
+- dte[4] = entry;
++ write_atomic(&dte[4], entry);
+ }
+
+ void __init iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev)
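
patch-XSA347 above stops updating live 64-bit IOMMU table entries one 32-bit half at a time: the new value is assembled in a local variable and published with a single atomic store, with smp_wmb() added where a multi-word device-table entry must be written in a safe order. The short C sketch below models only the single-store part; write_atomic64() and the field values are illustrative assumptions, not the AMD IOMMU entry layout.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Models write_atomic(): one store, so a reader never sees half an entry. */
static void write_atomic64(_Atomic uint64_t *pte, uint64_t val)
{
    atomic_store(pte, val);
}

/* Build the whole entry off to the side, then publish it in one go,
 * instead of writing pde[1] and pde[0] as two separate 32-bit stores. */
static void set_pde_present(_Atomic uint64_t *pde, uint32_t hi, uint32_t lo)
{
    uint64_t full = ((uint64_t)hi << 32) | lo;

    write_atomic64(pde, full);
}

int main(void)
{
    _Atomic uint64_t pde = 0;

    set_pde_present(&pde, 0x00000012, 0xabcd0003);
    printf("pde = %#llx\n", (unsigned long long)atomic_load(&pde));
    return 0;
}

For the wider device-table entries, which span several 32-bit words, the patch additionally inserts smp_wmb() barriers and flips the valid/enable bits last, so the IOMMU only ever consumes a fully written entry.
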
diff --git a/sysutils/xenkernel413/Makefile b/sysutils/xenkernel413/Makefile
index f4c2130ddc7..03fee1adcc9 100644
--- a/sysutils/xenkernel413/Makefile
+++ b/sysutils/xenkernel413/Makefile
@@ -1,7 +1,7 @@
-# $NetBSD: Makefile,v 1.4.2.1 2020/10/04 20:44:32 bsiegert Exp $
+# $NetBSD: Makefile,v 1.4.2.2 2020/10/22 16:29:05 bsiegert Exp $
VERSION= 4.13.1
-PKGREVISION= 2
+PKGREVISION= 3
DISTNAME= xen-${VERSION}
PKGNAME= xenkernel413-${VERSION}
CATEGORIES= sysutils
diff --git a/sysutils/xenkernel413/distinfo b/sysutils/xenkernel413/distinfo
index 2f181f4e4aa..c5dc7995153 100644
--- a/sysutils/xenkernel413/distinfo
+++ b/sysutils/xenkernel413/distinfo
@@ -1,10 +1,11 @@
-$NetBSD: distinfo,v 1.2.2.1 2020/10/04 20:44:32 bsiegert Exp $
+$NetBSD: distinfo,v 1.2.2.2 2020/10/22 16:29:05 bsiegert Exp $
SHA1 (xen413/xen-4.13.1.tar.gz) = 194a314171120dad0b3c5433104c92343ec884ba
RMD160 (xen413/xen-4.13.1.tar.gz) = 29cfb90b9da0ede99c1228b8e5964a99547c205d
SHA512 (xen413/xen-4.13.1.tar.gz) = b56d20704155d98d803496cba83eb928e0f986a750831cd5600fc88d0ae772fe1456571654375054043d2da8daca255cc98385ebf08b1b1a75ecf7f4b7a0ee90
Size (xen413/xen-4.13.1.tar.gz) = 39024612 bytes
SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA286) = 93bf307619151cd4389db95557d8af0e1b29ea0a
SHA1 (patch-XSA317) = 3a3e7bf8f115bebaf56001afcf68c2bd501c00a5
SHA1 (patch-XSA319) = 4954bdc849666e1c735c3281256e4850c0594ee8
SHA1 (patch-XSA320) = db978d49298660fb750dc6b50c2a1ddd099c8fa0
@@ -20,6 +21,9 @@ SHA1 (patch-XSA340) = 23888acfe25fc82ff085fa9acfbb36c156a15bc3
SHA1 (patch-XSA342) = a61c4e28a8c8219b88e3bab534a109b2b29e2cc3
SHA1 (patch-XSA343) = f4656c110229fdc63b57b8af76fc6e60386ef3cd
SHA1 (patch-XSA344) = 616fb56027ee289bb3b7b061e2f9f6f6d81e358b
+SHA1 (patch-XSA345) = 5fd7f8c04c6fd81c3ba49c01063075325f2b4779
+SHA1 (patch-XSA346) = 834b2c3b89aa2569a61ee990000592bc0a044999
+SHA1 (patch-XSA347) = ffb4a0fa152196e26c34f733eeff62c02b79cf49
SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
SHA1 (patch-xen_arch_x86_Rules.mk) = 0bedfc53a128a87b6a249ae04fbdf6a053bfb70b
diff --git a/sysutils/xenkernel413/patches/patch-XSA286 b/sysutils/xenkernel413/patches/patch-XSA286
new file mode 100644
index 00000000000..e9f867c49fc
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA286
@@ -0,0 +1,716 @@
+$NetBSD: patch-XSA286,v 1.1.2.2 2020/10/22 16:29:05 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()
+
+The L3 one at least is going to be re-used by a subsequent patch, and
+splitting the L4 one then as well seems only natural.
+
+This is part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index db4f035d8d..b1582b56fb 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
+
+ l2_pgentry_t *compat_idle_pg_table_l2;
+
+-void *do_page_walk(struct vcpu *v, unsigned long addr)
++static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ {
+- unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
+- l4_pgentry_t l4e, *l4t;
+- l3_pgentry_t l3e, *l3t;
+- l2_pgentry_t l2e, *l2t;
+- l1_pgentry_t l1e, *l1t;
++ unsigned long mfn = pagetable_get_pfn(root);
++ l4_pgentry_t *l4t, l4e;
+
+- if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
+- return NULL;
++ if ( !is_canonical_address(addr) )
++ return l4e_empty();
+
+ l4t = map_domain_page(_mfn(mfn));
+ l4e = l4t[l4_table_offset(addr)];
+ unmap_domain_page(l4t);
++
++ return l4e;
++}
++
++static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
++{
++ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
++ l3_pgentry_t *l3t, l3e;
++
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+- return NULL;
++ return l3e_empty();
+
+ l3t = map_l3t_from_l4e(l4e);
+ l3e = l3t[l3_table_offset(addr)];
+ unmap_domain_page(l3t);
++
++ return l3e;
++}
++
++void *do_page_walk(struct vcpu *v, unsigned long addr)
++{
++ l3_pgentry_t l3e;
++ l2_pgentry_t l2e, *l2t;
++ l1_pgentry_t l1e, *l1t;
++ unsigned long mfn;
++
++ if ( !is_pv_vcpu(v) )
++ return NULL;
++
++ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
+ mfn = l3e_get_pfn(l3e);
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ return NULL;
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: check page types in do_page_walk()
+
+For page table entries read to be guaranteed valid, transiently locking
+the pages and validating their types is necessary. Note that guest use
+of linear page tables is intentionally not taken into account here, as
+ordinary data (guest stacks) can't possibly live inside page tables.
+
+This is part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
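A minimal sketch of the per-level pattern the patch introduces, with toy stand-ins for page_lock(), the PGT_* type machinery and map_domain_page() (none of these are the real Xen interfaces): take the page lock, read the entry only if the page still has the expected page-table type, otherwise hand back an empty entry.

    #include <stdbool.h>
    #include <stdint.h>

    /* Toy stand-ins for Xen's page_info and page-type constants. */
    struct page_info { unsigned long type_info; };
    #define PGT_type_mask      0xful
    #define PGT_l3_page_table  0x3ul

    static uint64_t table_storage[512];               /* pretend mapped table */
    static bool page_lock(struct page_info *pg)   { (void)pg; return true; }
    static void page_unlock(struct page_info *pg) { (void)pg; }
    static uint64_t *map_table(struct page_info *pg) { (void)pg; return table_storage; }
    static void unmap_table(uint64_t *t)          { (void)t; }

    /* Read one L3 entry only while holding the page lock, and only if the
     * page still carries the expected page-table type. */
    static uint64_t walk_read_l3e(struct page_info *pg, unsigned int idx)
    {
        uint64_t e = 0;                               /* l3e_empty() */

        if ( !page_lock(pg) )                         /* page under teardown */
            return 0;

        if ( (pg->type_info & PGT_type_mask) == PGT_l3_page_table )
        {
            uint64_t *t = map_table(pg);

            e = t[idx];
            unmap_table(t);
        }

        page_unlock(pg);
        return e;
    }
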
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index b1582b56fb..7d439639b7 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;
+
+ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ {
+- unsigned long mfn = pagetable_get_pfn(root);
+- l4_pgentry_t *l4t, l4e;
++ mfn_t mfn = pagetable_get_mfn(root);
++ /* current's root page table can't disappear under our feet. */
++ bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
++ struct page_info *pg;
++ l4_pgentry_t l4e = l4e_empty();
+
+ if ( !is_canonical_address(addr) )
+ return l4e_empty();
+
+- l4t = map_domain_page(_mfn(mfn));
+- l4e = l4t[l4_table_offset(addr)];
+- unmap_domain_page(l4t);
++ pg = mfn_to_page(mfn);
++ if ( need_lock && !page_lock(pg) )
++ return l4e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
++ {
++ l4_pgentry_t *l4t = map_domain_page(mfn);
++
++ l4e = l4t[l4_table_offset(addr)];
++ unmap_domain_page(l4t);
++ }
++
++ if ( need_lock )
++ page_unlock(pg);
+
+ return l4e;
+ }
+@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
+ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ {
+ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+- l3_pgentry_t *l3t, l3e;
++ mfn_t mfn = l4e_get_mfn(l4e);
++ struct page_info *pg;
++ l3_pgentry_t l3e = l3e_empty();
+
+ if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+ return l3e_empty();
+
+- l3t = map_l3t_from_l4e(l4e);
+- l3e = l3t[l3_table_offset(addr)];
+- unmap_domain_page(l3t);
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l3e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
++ {
++ l3_pgentry_t *l3t = map_domain_page(mfn);
++
++ l3e = l3t[l3_table_offset(addr)];
++ unmap_domain_page(l3t);
++ }
++
++ page_unlock(pg);
+
+ return l3e;
+ }
+@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+- l2_pgentry_t l2e, *l2t;
+- l1_pgentry_t l1e, *l1t;
+- unsigned long mfn;
++ l2_pgentry_t l2e = l2e_empty();
++ l1_pgentry_t l1e = l1e_empty();
++ mfn_t mfn;
++ struct page_info *pg;
+
+ if ( !is_pv_vcpu(v) )
+ return NULL;
+
+ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
+- mfn = l3e_get_pfn(l3e);
+- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ mfn = l3e_get_mfn(l3e);
++ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+ if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
+ {
+- mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
++ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
+ goto ret;
+ }
+
+- l2t = map_domain_page(_mfn(mfn));
+- l2e = l2t[l2_table_offset(addr)];
+- unmap_domain_page(l2t);
+- mfn = l2e_get_pfn(l2e);
+- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return NULL;
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
++ {
++ const l2_pgentry_t *l2t = map_domain_page(mfn);
++
++ l2e = l2t[l2_table_offset(addr)];
++ unmap_domain_page(l2t);
++ }
++
++ page_unlock(pg);
++
++ mfn = l2e_get_mfn(l2e);
++ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+ if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
+ {
+- mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
++ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
+ goto ret;
+ }
+
+- l1t = map_domain_page(_mfn(mfn));
+- l1e = l1t[l1_table_offset(addr)];
+- unmap_domain_page(l1t);
+- mfn = l1e_get_pfn(l1e);
+- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return NULL;
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
++ {
++ const l1_pgentry_t *l1t = map_domain_page(mfn);
++
++ l1e = l1t[l1_table_offset(addr)];
++ unmap_domain_page(l1t);
++ }
++
++ page_unlock(pg);
++
++ mfn = l1e_get_mfn(l1e);
++ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
+ return NULL;
+
+ ret:
+- return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
++ return map_domain_page(mfn) + (addr & ~PAGE_MASK);
+ }
+
+ /*
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Replace the linear L2 table access by an actual page walk.
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
+index 2b0dadc8da..acebf9e957 100644
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
+ if ( unlikely(!__addr_ok(linear)) )
+ return NULL;
+
+- /* Find this l1e and its enclosing l1mfn in the linear map. */
+- if ( __copy_from_user(&l2e,
+- &__linear_l2_table[l2_linear_offset(linear)],
+- sizeof(l2_pgentry_t)) )
++ if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
++ {
++ ASSERT_UNREACHABLE();
+ return NULL;
++ }
++
++ /* Find this l1e and its enclosing l1mfn. */
++ l2e = page_walk_get_l2e(current->arch.guest_table, linear);
+
+ /* Check flags that it will be safe to read the l1e. */
+ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index 7d439639b7..670aa3f892 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+ return l3e;
+ }
+
++l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
++{
++ l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
++ mfn_t mfn = l3e_get_mfn(l3e);
++ struct page_info *pg;
++ l2_pgentry_t l2e = l2e_empty();
++
++ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
++ (l3e_get_flags(l3e) & _PAGE_PSE) )
++ return l2e_empty();
++
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l2e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
++ {
++ l2_pgentry_t *l2t = map_domain_page(mfn);
++
++ l2e = l2t[l2_table_offset(addr)];
++ unmap_domain_page(l2t);
++ }
++
++ page_unlock(pg);
++
++ return l2e;
++}
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 320c6cd196..cd3e7ec501 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -577,7 +577,9 @@ void audit_domains(void);
+ void make_cr3(struct vcpu *v, mfn_t mfn);
+ void update_cr3(struct vcpu *v);
+ int vcpu_destroy_pagetables(struct vcpu *);
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr);
++l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+
+ int __sync_local_execstate(void);
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()
+
+First of all drop guest_get_eff_l1e() entirely - there's no actual user
+of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
+its only call site.
+
+Then replace the linear L1 table access by an actual page walk.
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
+index acebf9e957..7624447246 100644
+--- xen/arch/x86/pv/mm.c.orig
++++ xen/arch/x86/pv/mm.c
+@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
+ }
+
+ /*
+- * Read the guest's l1e that maps this address, from the kernel-mode
+- * page tables.
+- */
+-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
+-{
+- struct vcpu *curr = current;
+- const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
+- l1_pgentry_t l1e;
+-
+- if ( user_mode )
+- toggle_guest_pt(curr);
+-
+- l1e = guest_get_eff_l1e(linear);
+-
+- if ( user_mode )
+- toggle_guest_pt(curr);
+-
+- return l1e;
+-}
+-
+-/*
+ * Map a guest's LDT page (covering the byte at @offset from start of the LDT)
+ * into Xen's virtual range. Returns true if the mapping changed, false
+ * otherwise.
+diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
+index a1bd473b29..43d33a1fd1 100644
+--- xen/arch/x86/pv/mm.h.orig
++++ xen/arch/x86/pv/mm.h
+@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
+
+ int new_guest_cr3(mfn_t mfn);
+
+-/* Read a PV guest's l1e that maps this linear address. */
+-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
++/*
++ * Read the guest's l1e that maps this address, from the kernel-mode
++ * page tables.
++ */
++static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
+ {
+- l1_pgentry_t l1e;
++ l1_pgentry_t l1e = l1e_empty();
+
+ ASSERT(!paging_mode_translate(current->domain));
+ ASSERT(!paging_mode_external(current->domain));
+
+- if ( unlikely(!__addr_ok(linear)) ||
+- __copy_from_user(&l1e,
+- &__linear_l1_table[l1_linear_offset(linear)],
+- sizeof(l1_pgentry_t)) )
+- l1e = l1e_empty();
++ if ( likely(__addr_ok(linear)) )
++ l1e = page_walk_get_l1e(current->arch.guest_table, linear);
+
+ return l1e;
+ }
+diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
+index a920fb5e15..2bf4497a16 100644
+--- xen/arch/x86/pv/ro-page-fault.c.orig
++++ xen/arch/x86/pv/ro-page-fault.c
+@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+ bool mmio_ro;
+
+ /* Attempt to read the PTE that maps the VA being accessed. */
+- pte = guest_get_eff_l1e(addr);
++ pte = guest_get_eff_kern_l1e(addr);
+
+ /* We are only looking for read-only mappings */
+ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index 670aa3f892..c5686e0d25 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+ return l2e;
+ }
+
++/*
++ * For now no "set_accessed" parameter, as all callers want it set to true.
++ * For now also no "set_dirty" parameter, as all callers deal with r/o
++ * mappings, and we don't want to set the dirty bit there (conflicts with
++ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
++ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
++ */
++l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
++{
++ l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
++ mfn_t mfn = l2e_get_mfn(l2e);
++ struct page_info *pg;
++ l1_pgentry_t l1e = l1e_empty();
++
++ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
++ (l2e_get_flags(l2e) & _PAGE_PSE) )
++ return l1e_empty();
++
++ pg = mfn_to_page(mfn);
++ if ( !page_lock(pg) )
++ return l1e_empty();
++
++ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
++ {
++ l1_pgentry_t *l1t = map_domain_page(mfn);
++
++ l1e = l1t[l1_table_offset(addr)];
++
++ if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
++ _PAGE_PRESENT )
++ {
++ l1_pgentry_t ol1e = l1e;
++
++ l1e_add_flags(l1e, _PAGE_ACCESSED);
++ /*
++ * Best effort only; with the lock held the page shouldn't
++ * change anyway, except for the dirty bit to perhaps become set.
++ */
++ while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
++ l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
++ l1e_get_intpte(ol1e) &&
++ !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
++ {
++ l1e_add_flags(ol1e, _PAGE_DIRTY);
++ l1e_add_flags(l1e, _PAGE_DIRTY);
++ }
++ }
++
++ unmap_domain_page(l1t);
++ }
++
++ page_unlock(pg);
++
++ return l1e;
++}
++
+ void *do_page_walk(struct vcpu *v, unsigned long addr)
+ {
+ l3_pgentry_t l3e;
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index cd3e7ec501..865db999c1 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -580,6 +580,7 @@ int vcpu_destroy_pagetables(struct vcpu *);
+
+ void *do_page_walk(struct vcpu *v, unsigned long addr);
+ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
++l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);
+
+ int __sync_local_execstate(void);
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: avoid using top level linear page tables in
+ {,un}map_domain_page()
+
+Move the page table recursion two levels down. This entails not freeing
+the recursive mapping prematurely in free_perdomain_mappings().
+
+This is part of XSA-286.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
+index 4a07cfb18e..660bd06aaf 100644
+--- xen/arch/x86/domain_page.c.orig
++++ xen/arch/x86/domain_page.c
+@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
+ #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
+ #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
+ #define MAPCACHE_L1ENT(idx) \
+- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
++ ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
++ ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]
+
+ void *map_domain_page(mfn_t mfn)
+ {
+@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
+ {
+ struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ unsigned int bitmap_pages;
++ int rc;
+
+ ASSERT(is_pv_domain(d));
+
+@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
+ return 0;
+ #endif
+
++ BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
+ BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
+- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
++ 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
++ (1U << L2_PAGETABLE_SHIFT) >
+ MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
+ bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
+ dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
+@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)
+
+ spin_lock_init(&dcache->lock);
+
+- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+- 2 * bitmap_pages + 1,
+- NIL(l1_pgentry_t *), NULL);
++ rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
++ 2 * bitmap_pages + 1,
++ NIL(l1_pgentry_t *), NULL);
++ if ( !rc )
++ {
++ /*
++ * Install mapping of our L2 table into its own last slot, for easy
++ * access to the L1 entries via MAPCACHE_L1ENT().
++ */
++ l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
++ l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
++ l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
++
++ l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
++ unmap_domain_page(l2t);
++ unmap_domain_page(l3t);
++ }
++
++ return rc;
+ }
+
+ int mapcache_vcpu_init(struct vcpu *v)
+@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
+ else
+ {
+ ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
+- pl1e = &__linear_l1_table[l1_linear_offset(va)];
++ pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
+ }
+
+ return l1e_get_mfn(*pl1e);
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 30dffb68e8..279664a83e 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -6031,6 +6031,10 @@ void free_perdomain_mappings(struct domain *d)
+ {
+ struct page_info *l1pg = l2e_get_page(l2tab[j]);
+
++ /* mapcache_domain_init() installs a recursive entry. */
++ if ( l1pg == l2pg )
++ continue;
++
+ if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
+ {
+ l1_pgentry_t *l1tab = __map_domain_page(l1pg);
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/mm: restrict use of linear page tables to shadow mode code
+
+Other code does not require them to be set up anymore, so restrict when
+to populate the respective L4 slot and reduce visibility of the
+accessors.
+
+While with the removal of all uses the vulnerability is actually fixed,
+removing the creation of the linear mapping adds an extra layer of
+protection. Similarly reducing visibility of the accessors mostly
+eliminates the risk of undue re-introduction of uses of the linear
+mappings.
+
+This is (not strictly) part of XSA-286.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 279664a83e..fa0f813d29 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1757,9 +1757,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
+ l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
+ idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
+
+- /* Slot 258: Self linear mappings. */
++ /* Slot 258: Self linear mappings (shadow pt only). */
+ ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
+ l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
++ !shadow_mode_external(d) ? l4e_empty() :
+ l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
+
+ /* Slot 259: Shadow linear mappings (if applicable) .*/
+diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
+index 3217777921..b214087194 100644
+--- xen/arch/x86/mm/shadow/private.h.orig
++++ xen/arch/x86/mm/shadow/private.h
+@@ -135,6 +135,15 @@ enum {
+ # define GUEST_PTE_SIZE 4
+ #endif
+
++/* Where to find each level of the linear mapping */
++#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
++#define __linear_l2_table \
++ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
++#define __linear_l3_table \
++ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
++#define __linear_l4_table \
++ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
++
+ /******************************************************************************
+ * Auditing routines
+ */
+diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
+index c5686e0d25..dcb20d1d9d 100644
+--- xen/arch/x86/x86_64/mm.c.orig
++++ xen/arch/x86/x86_64/mm.c
+@@ -833,9 +833,6 @@ void __init paging_init(void)
+
+ machine_to_phys_mapping_valid = 1;
+
+- /* Set up linear page table mapping. */
+- l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
+- l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
+ return;
+
+ nomem:
+diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
+index 8d79a71398..9d587a076a 100644
+--- xen/include/asm-x86/config.h.orig
++++ xen/include/asm-x86/config.h
+@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
+ */
+ #define PCI_MCFG_VIRT_START (PML4_ADDR(257))
+ #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
+-/* Slot 258: linear page table (guest table). */
++/* Slot 258: linear page table (monitor table, HVM only). */
+ #define LINEAR_PT_VIRT_START (PML4_ADDR(258))
+ #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+ /* Slot 259: linear page table (shadow table). */
+diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
+index c1e92937c0..e72c277b9f 100644
+--- xen/include/asm-x86/page.h.orig
++++ xen/include/asm-x86/page.h
+@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
+ #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
+ #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))
+
+-#endif /* !defined(__ASSEMBLY__) */
+-
+-/* Where to find each level of the linear mapping */
+-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+-#define __linear_l2_table \
+- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+-#define __linear_l3_table \
+- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+-#define __linear_l4_table \
+- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+-
+-
+-#ifndef __ASSEMBLY__
+ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
+ extern l2_pgentry_t *compat_idle_pg_table_l2;
+ extern unsigned int m2p_compat_vstart;
diff --git a/sysutils/xenkernel413/patches/patch-XSA345 b/sysutils/xenkernel413/patches/patch-XSA345
new file mode 100644
index 00000000000..761602a93ad
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA345
@@ -0,0 +1,413 @@
+$NetBSD: patch-XSA345,v 1.1.2.2 2020/10/22 16:29:05 bsiegert Exp $
+
+From b3e0d4e37b7902533a463812374947d4d6d2e463 Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Sat, 11 Jan 2020 21:57:41 +0000
+Subject: [PATCH 1/3] x86/mm: Refactor map_pages_to_xen to have only a single
+ exit path
+
+We will soon need to perform clean-ups before returning.
+
+No functional change.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 17 +++++++++++------
+ 1 file changed, 11 insertions(+), 6 deletions(-)
+
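The change itself is the usual single-exit-path idiom; a generic sketch of the shape of the refactoring (not the Xen function) -- every early return becomes a jump to one label, so clean-up added by the next patch has a single place to live:

    #include <errno.h>
    #include <stdlib.h>

    /* Generic illustration: two allocations, one exit path. */
    static int setup(void **a, void **b)
    {
        int rc = -ENOMEM;

        *a = malloc(64);
        if ( !*a )
            goto out;            /* was: return -ENOMEM; */

        *b = malloc(64);
        if ( !*b )
            goto out;

        rc = 0;                  /* success also leaves through "out" */

     out:
        if ( rc && *a )          /* clean-up hangs off the single exit */
        {
            free(*a);
            *a = NULL;
        }
        return rc;
    }
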
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 30dffb68e8..133a393875 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -5187,6 +5187,7 @@ int map_pages_to_xen(
+ l2_pgentry_t *pl2e, ol2e;
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
++ int rc = -ENOMEM;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5207,7 +5208,8 @@ int map_pages_to_xen(
+ l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
+
+ if ( !pl3e )
+- return -ENOMEM;
++ goto out;
++
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5295,7 +5297,7 @@ int map_pages_to_xen(
+
+ pl2e = alloc_xen_pagetable();
+ if ( pl2e == NULL )
+- return -ENOMEM;
++ goto out;
+
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ l2e_write(pl2e + i,
+@@ -5324,7 +5326,7 @@ int map_pages_to_xen(
+
+ pl2e = virt_to_xen_l2e(virt);
+ if ( !pl2e )
+- return -ENOMEM;
++ goto out;
+
+ if ( ((((virt >> PAGE_SHIFT) | mfn_x(mfn)) &
+ ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
+@@ -5367,7 +5369,7 @@ int map_pages_to_xen(
+ {
+ pl1e = virt_to_xen_l1e(virt);
+ if ( pl1e == NULL )
+- return -ENOMEM;
++ goto out;
+ }
+ else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
+ {
+@@ -5394,7 +5396,7 @@ int map_pages_to_xen(
+
+ pl1e = alloc_xen_pagetable();
+ if ( pl1e == NULL )
+- return -ENOMEM;
++ goto out;
+
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ l1e_write(&pl1e[i],
+@@ -5538,7 +5540,10 @@ int map_pages_to_xen(
+
+ #undef flush_flags
+
+- return 0;
++ rc = 0;
++
++ out:
++ return rc;
+ }
+
+ int populate_pt_range(unsigned long virt, unsigned long nr_mfns)
+--
+2.25.1
+
+From 9f6f35b833d295acaaa2d8ff8cf309bf688cfd50 Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Sat, 11 Jan 2020 21:57:42 +0000
+Subject: [PATCH 2/3] x86/mm: Refactor modify_xen_mappings to have one exit
+ path
+
+We will soon need to perform clean-ups before returning.
+
+No functional change.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 133a393875..af726d3274 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -5570,6 +5570,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ l1_pgentry_t *pl1e;
+ unsigned int i;
+ unsigned long v = s;
++ int rc = -ENOMEM;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5611,7 +5612,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ /* PAGE1GB: shatter the superpage and fall through. */
+ pl2e = alloc_xen_pagetable();
+ if ( !pl2e )
+- return -ENOMEM;
++ goto out;
++
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ l2e_write(pl2e + i,
+ l2e_from_pfn(l3e_get_pfn(*pl3e) +
+@@ -5666,7 +5668,8 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ /* PSE: shatter the superpage and try again. */
+ pl1e = alloc_xen_pagetable();
+ if ( !pl1e )
+- return -ENOMEM;
++ goto out;
++
+ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+ l1e_write(&pl1e[i],
+ l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
+@@ -5795,7 +5798,10 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ flush_area(NULL, FLUSH_TLB_GLOBAL);
+
+ #undef FLAGS_MASK
+- return 0;
++ rc = 0;
++
++ out:
++ return rc;
+ }
+
+ #undef flush_area
+--
+2.25.1
+
+From 0ff9a8453dc47cd47eee9659d5916afb5094e871 Mon Sep 17 00:00:00 2001
+From: Hongyan Xia <hongyxia@amazon.com>
+Date: Sat, 11 Jan 2020 21:57:43 +0000
+Subject: [PATCH 3/3] x86/mm: Prevent some races in hypervisor mapping updates
+
+map_pages_to_xen will attempt to coalesce mappings into 2MiB and 1GiB
+superpages if possible, to maximize TLB efficiency. This means both
+replacing superpage entries with smaller entries, and replacing
+smaller entries with superpages.
+
+Unfortunately, while some potential races are handled correctly,
+others are not. These include:
+
+1. When one processor modifies a sub-superpage mapping while another
+processor replaces the entire range with a superpage.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and
+B.
+
+* A walks the pagetables, gets a pointer to L2.
+* B replaces L3[N] with a 1GiB mapping.
+* B frees L2
+* A writes L2[M] #
+
+This race is exacerbated by the fact that virt_to_xen_l[21]e doesn't
+handle higher-level superpages properly: if you call virt_to_xen_l2e
+on a virtual address within an L3 superpage, you'll either hit a BUG()
+(most likely) or get a pointer into the middle of a data page; same
+with virt_to_xen_l1e on a virtual address within either an L3 or L2
+superpage.
+
+So take the following example:
+
+* A reads pl3e and discovers it to point to an L2.
+* B replaces L3[N] with a 1GiB mapping
+* A calls virt_to_xen_l2e() and hits the BUG_ON() #
+
+2. When two processors simultaneously try to replace a sub-superpage
+mapping with a superpage mapping.
+
+Take the following example:
+
+Suppose L3[N] points to L2. And suppose we have two processors, A and B,
+both trying to replace L3[N] with a superpage.
+
+* A walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2.
+* B walks the pagetables, gets a pointer to pl3e, and takes a copy ol3e pointing to L2.
+* A writes the new value into L3[N]
+* B writes the new value into L3[N]
+* A recursively frees all the L1's under L2, then frees L2
+* B recursively double-frees all the L1's under L2, then double-frees L2 #
+
+Fix this by grabbing a lock for the entirety of the mapping update
+operation.
+
+Rather than grabbing map_pgdir_lock for the entire operation, however,
+repurpose the PGT_locked bit from L3's page->type_info as a lock.
+This means that rather than locking the entire address space, we
+"only" lock a single 512GiB chunk of hypervisor address space at a
+time.
+
+There was a proposal for a lock-and-reverify approach, where we walk
+the pagetables to the point where we decide what to do; then grab the
+map_pgdir_lock, re-verify the information we collected without the
+lock, and finally make the change (starting over again if anything had
+changed). Without being able to guarantee that the L2 table wasn't
+freed, however, that means every read would need to be considered
+potentially unsafe. Thinking carefully about that is probably
+something that wants to be done in public, not under time pressure.
+
+This is part of XSA-345.
+
+Reported-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
+Signed-off-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+---
+ xen/arch/x86/mm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 89 insertions(+), 3 deletions(-)
+
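The lock built from PGT_locked boils down to spinning on one bit of type_info with compare-and-swap. A stand-alone approximation, using C11 atomics in place of Xen's cmpxchg()/cpu_relax() (the bit position and types are illustrative only):

    #include <assert.h>
    #include <stdatomic.h>

    #define PGT_locked (1ul << 10)          /* illustrative bit position only */

    struct page_info { _Atomic unsigned long type_info; };

    static void l3t_lock(struct page_info *pg)
    {
        unsigned long x, nx;

        do {
            do {                            /* wait for the bit to clear ... */
                x = atomic_load(&pg->type_info);
            } while ( x & PGT_locked );
            nx = x | PGT_locked;            /* ... then try to claim it */
        } while ( !atomic_compare_exchange_weak(&pg->type_info, &x, nx) );
    }

    static void l3t_unlock(struct page_info *pg)
    {
        unsigned long old = atomic_fetch_and(&pg->type_info, ~PGT_locked);

        assert(old & PGT_locked);           /* BUG_ON() in the real code */
    }
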
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index af726d3274..d6a0761f43 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -2167,6 +2167,50 @@ void page_unlock(struct page_info *page)
+ current_locked_page_set(NULL);
+ }
+
++/*
++ * L3 table locks:
++ *
++ * Used for serialization in map_pages_to_xen() and modify_xen_mappings().
++ *
++ * For Xen PT pages, the page->u.inuse.type_info is unused and it is safe to
++ * reuse the PGT_locked flag. This lock is taken only when we move down to L3
++ * tables and below, since L4 (and above, for 5-level paging) is still globally
++ * protected by map_pgdir_lock.
++ *
++ * PV MMU update hypercalls call map_pages_to_xen while holding a page's page_lock().
++ * This has two implications:
++ * - We cannot reuse current_locked_page_* for debugging
++ * - To avoid the chance of deadlock, even for different pages, we
++ * must never grab page_lock() after grabbing l3t_lock(). This
++ * includes any page_lock()-based locks, such as
++ * mem_sharing_page_lock().
++ *
++ * Also note that we grab the map_pgdir_lock while holding the
++ * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
++ * reverse order.
++ */
++static void l3t_lock(struct page_info *page)
++{
++ unsigned long x, nx;
++
++ do {
++ while ( (x = page->u.inuse.type_info) & PGT_locked )
++ cpu_relax();
++ nx = x | PGT_locked;
++ } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
++}
++
++static void l3t_unlock(struct page_info *page)
++{
++ unsigned long x, nx, y = page->u.inuse.type_info;
++
++ do {
++ x = y;
++ BUG_ON(!(x & PGT_locked));
++ nx = x & ~PGT_locked;
++ } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
++}
++
+ #ifdef CONFIG_PV
+ /*
+ * PTE flags that a guest may change without re-validating the PTE.
+@@ -5177,6 +5221,23 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+ flush_area_local((const void *)v, f) : \
+ flush_area_all((const void *)v, f))
+
++#define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
++
++#define L3T_LOCK(page) \
++ do { \
++ if ( locking ) \
++ l3t_lock(page); \
++ } while ( false )
++
++#define L3T_UNLOCK(page) \
++ do { \
++ if ( locking && (page) != ZERO_BLOCK_PTR ) \
++ { \
++ l3t_unlock(page); \
++ (page) = ZERO_BLOCK_PTR; \
++ } \
++ } while ( false )
++
+ int map_pages_to_xen(
+ unsigned long virt,
+ mfn_t mfn,
+@@ -5188,6 +5249,7 @@ int map_pages_to_xen(
+ l1_pgentry_t *pl1e, ol1e;
+ unsigned int i;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ #define flush_flags(oldf) do { \
+ unsigned int o_ = (oldf); \
+@@ -5203,13 +5265,20 @@ int map_pages_to_xen(
+ } \
+ } while (0)
+
++ L3T_INIT(current_l3page);
++
+ while ( nr_mfns != 0 )
+ {
+- l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
++ l3_pgentry_t *pl3e, ol3e;
+
++ L3T_UNLOCK(current_l3page);
++
++ pl3e = virt_to_xen_l3e(virt);
+ if ( !pl3e )
+ goto out;
+
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
+ ol3e = *pl3e;
+
+ if ( cpu_has_page1gb &&
+@@ -5543,6 +5612,7 @@ int map_pages_to_xen(
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
+@@ -5571,6 +5641,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ unsigned int i;
+ unsigned long v = s;
+ int rc = -ENOMEM;
++ struct page_info *current_l3page;
+
+ /* Set of valid PTE bits which may be altered. */
+ #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
+@@ -5579,11 +5650,22 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ ASSERT(IS_ALIGNED(s, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(e, PAGE_SIZE));
+
++ L3T_INIT(current_l3page);
++
+ while ( v < e )
+ {
+- l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
++ l3_pgentry_t *pl3e;
++
++ L3T_UNLOCK(current_l3page);
+
+- if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
++ pl3e = virt_to_xen_l3e(v);
++ if ( !pl3e )
++ goto out;
++
++ current_l3page = virt_to_page(pl3e);
++ L3T_LOCK(current_l3page);
++
++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+ {
+ /* Confirm the caller isn't trying to create new mappings. */
+ ASSERT(!(nf & _PAGE_PRESENT));
+@@ -5801,9 +5883,13 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+ rc = 0;
+
+ out:
++ L3T_UNLOCK(current_l3page);
+ return rc;
+ }
+
++#undef L3T_LOCK
++#undef L3T_UNLOCK
++
+ #undef flush_area
+
+ int destroy_xen_mappings(unsigned long s, unsigned long e)
+--
+2.25.1
+
diff --git a/sysutils/xenkernel413/patches/patch-XSA346 b/sysutils/xenkernel413/patches/patch-XSA346
new file mode 100644
index 00000000000..25cf466d578
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA346
@@ -0,0 +1,256 @@
+$NetBSD: patch-XSA346,v 1.1.2.2 2020/10/22 16:29:05 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU: suppress "iommu_dont_flush_iotlb" when about to free a page
+
+Deferring flushes to a single, wide range one - as is done when
+handling XENMAPSPACE_gmfn_range - is okay only as long as
+pages don't get freed ahead of the eventual flush. While the only
+function setting the flag (xenmem_add_to_physmap()) suggests by its name
+that it's only mapping new entries, in reality the way
+xenmem_add_to_physmap_one() works means an unmap would happen not only
+for the page being moved (but not freed) but, if the destination GFN is
+populated, also for the page being displaced from that GFN. Collapsing
+the two flushes for this GFN into just one (and even more so deferring
+it to a batched invocation) is not correct.
+
+This is part of XSA-346.
+
+Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
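The core of the fix is a plain save/clear/restore of the per-CPU flag around the operation that may free the page. Schematically -- the thread-local variable and callback below are stand-ins for this_cpu(iommu_dont_flush_iotlb) and guest_physmap_remove_page(), not the real interfaces:

    #include <stdbool.h>

    /* Stand-in for the per-CPU iommu_dont_flush_iotlb flag. */
    static _Thread_local bool dont_flush_iotlb;

    static int remove_page_flushed(int (*do_remove)(void))
    {
        bool saved = dont_flush_iotlb;
        int rc;

        dont_flush_iotlb = false;   /* the page may be freed: flush now,  */
        rc = do_remove();           /* don't let the flush be deferred    */
        dont_flush_iotlb = saved;   /* restore the caller's batching mode */

        return rc;
    }
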
+--- xen/common/memory.c.orig
++++ xen/common/memory.c
+@@ -292,6 +292,7 @@ int guest_remove_page(struct domain *d,
+ p2m_type_t p2mt;
+ #endif
+ mfn_t mfn;
++ bool *dont_flush_p, dont_flush;
+ int rc;
+
+ #ifdef CONFIG_X86
+@@ -378,8 +379,18 @@ int guest_remove_page(struct domain *d,
+ return -ENXIO;
+ }
+
++ /*
++ * Since we're likely to free the page below, we need to suspend
++ * xenmem_add_to_physmap()'s suppressing of IOMMU TLB flushes.
++ */
++ dont_flush_p = &this_cpu(iommu_dont_flush_iotlb);
++ dont_flush = *dont_flush_p;
++ *dont_flush_p = false;
++
+ rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0);
+
++ *dont_flush_p = dont_flush;
++
+ /*
+ * With the lack of an IOMMU on some platforms, domains with DMA-capable
+ * device must retrieve the same pfn when the hypercall populate_physmap
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU: hold page ref until after deferred TLB flush
+
+When moving around a page via XENMAPSPACE_gmfn_range, deferring the TLB
+flush for the "from" GFN range requires that the page remains allocated
+to the guest until the TLB flush has actually occurred. Otherwise a
+parallel hypercall to remove the page would only flush the TLB for the
+GFN it has been moved to, but not the one is was mapped at originally.
+
+This is part of XSA-346.
+
+Fixes: cf95b2a9fd5a ("iommu: Introduce per cpu flag (iommu_dont_flush_iotlb) to avoid unnecessary iotlb... ")
+Reported-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+
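Conceptually the fix reorders dropping the page references against the deferred flush: references taken while remapping are parked, the batched IOMMU flush for the old GFNs runs, and only then are the references released. A toy version with stubbed helpers (put_page() and the ranged flush here are placeholders, not the real signatures):

    #include <stdio.h>

    struct page_info { int refcount; };

    static void put_page(struct page_info *pg) { pg->refcount--; }
    static void iommu_flush_range(unsigned long dfn, unsigned int n)
    {
        printf("flush %u entries from dfn %lu\n", n, dfn);
    }

    /* Release the parked references only after the flush has completed,
     * so no page can be freed while the IOMMU may still reach it. */
    static void remap_batch_done(struct page_info **pages, unsigned int done,
                                 unsigned long first_dfn)
    {
        unsigned int i;

        iommu_flush_range(first_dfn, done);

        for ( i = 0; i < done; ++i )
            put_page(pages[i]);
    }
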
+--- xen/arch/arm/mm.c.orig
++++ xen/arch/arm/mm.c
+@@ -1407,7 +1407,7 @@ void share_xen_page_with_guest(struct pa
+ int xenmem_add_to_physmap_one(
+ struct domain *d,
+ unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx,
+ gfn_t gfn)
+ {
+@@ -1480,10 +1480,6 @@ int xenmem_add_to_physmap_one(
+ break;
+ }
+ case XENMAPSPACE_dev_mmio:
+- /* extra should be 0. Reserved for future use. */
+- if ( extra.res0 )
+- return -EOPNOTSUPP;
+-
+ rc = map_dev_mmio_region(d, gfn, 1, _mfn(idx));
+ return rc;
+
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -4617,7 +4617,7 @@ static int handle_iomem_range(unsigned l
+ int xenmem_add_to_physmap_one(
+ struct domain *d,
+ unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx,
+ gfn_t gpfn)
+ {
+@@ -4701,9 +4701,20 @@ int xenmem_add_to_physmap_one(
+ rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
+
+ put_both:
+- /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
++ /*
++ * In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top.
++ * We also may need to transfer ownership of the page reference to our
++ * caller.
++ */
+ if ( space == XENMAPSPACE_gmfn )
++ {
+ put_gfn(d, gfn);
++ if ( !rc && extra.ppage )
++ {
++ *extra.ppage = page;
++ page = NULL;
++ }
++ }
+
+ if ( page )
+ put_page(page);
+--- xen/common/memory.c.orig
++++ xen/common/memory.c
+@@ -814,13 +814,12 @@ int xenmem_add_to_physmap(struct domain
+ {
+ unsigned int done = 0;
+ long rc = 0;
+- union xen_add_to_physmap_batch_extra extra;
++ union add_to_physmap_extra extra = {};
++ struct page_info *pages[16];
+
+ ASSERT(paging_mode_translate(d));
+
+- if ( xatp->space != XENMAPSPACE_gmfn_foreign )
+- extra.res0 = 0;
+- else
++ if ( xatp->space == XENMAPSPACE_gmfn_foreign )
+ extra.foreign_domid = DOMID_INVALID;
+
+ if ( xatp->space != XENMAPSPACE_gmfn_range )
+@@ -835,7 +834,10 @@ int xenmem_add_to_physmap(struct domain
+ xatp->size -= start;
+
+ if ( is_iommu_enabled(d) )
++ {
+ this_cpu(iommu_dont_flush_iotlb) = 1;
++ extra.ppage = &pages[0];
++ }
+
+ while ( xatp->size > done )
+ {
+@@ -847,8 +849,12 @@ int xenmem_add_to_physmap(struct domain
+ xatp->idx++;
+ xatp->gpfn++;
+
++ if ( extra.ppage )
++ ++extra.ppage;
++
+ /* Check for continuation if it's not the last iteration. */
+- if ( xatp->size > ++done && hypercall_preempt_check() )
++ if ( (++done > ARRAY_SIZE(pages) && extra.ppage) ||
++ (xatp->size > done && hypercall_preempt_check()) )
+ {
+ rc = start + done;
+ break;
+@@ -858,6 +864,7 @@ int xenmem_add_to_physmap(struct domain
+ if ( is_iommu_enabled(d) )
+ {
+ int ret;
++ unsigned int i;
+
+ this_cpu(iommu_dont_flush_iotlb) = 0;
+
+@@ -866,6 +873,15 @@ int xenmem_add_to_physmap(struct domain
+ if ( unlikely(ret) && rc >= 0 )
+ rc = ret;
+
++ /*
++ * Now that the IOMMU TLB flush was done for the original GFN, drop
++ * the page references. The 2nd flush below is fine to make later, as
++ * whoever removes the page again from its new GFN will have to do
++ * another flush anyway.
++ */
++ for ( i = 0; i < done; ++i )
++ put_page(pages[i]);
++
+ ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done,
+ IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified);
+ if ( unlikely(ret) && rc >= 0 )
+@@ -879,6 +895,8 @@ static int xenmem_add_to_physmap_batch(s
+ struct xen_add_to_physmap_batch *xatpb,
+ unsigned int extent)
+ {
++ union add_to_physmap_extra extra = {};
++
+ if ( unlikely(xatpb->size < extent) )
+ return -EILSEQ;
+
+@@ -890,6 +908,19 @@ static int xenmem_add_to_physmap_batch(s
+ !guest_handle_subrange_okay(xatpb->errs, extent, xatpb->size - 1) )
+ return -EFAULT;
+
++ switch ( xatpb->space )
++ {
++ case XENMAPSPACE_dev_mmio:
++ /* res0 is reserved for future use. */
++ if ( xatpb->u.res0 )
++ return -EOPNOTSUPP;
++ break;
++
++ case XENMAPSPACE_gmfn_foreign:
++ extra.foreign_domid = xatpb->u.foreign_domid;
++ break;
++ }
++
+ while ( xatpb->size > extent )
+ {
+ xen_ulong_t idx;
+@@ -902,8 +933,7 @@ static int xenmem_add_to_physmap_batch(s
+ extent, 1)) )
+ return -EFAULT;
+
+- rc = xenmem_add_to_physmap_one(d, xatpb->space,
+- xatpb->u,
++ rc = xenmem_add_to_physmap_one(d, xatpb->space, extra,
+ idx, _gfn(gpfn));
+
+ if ( unlikely(__copy_to_guest_offset(xatpb->errs, extent, &rc, 1)) )
+--- xen/include/xen/mm.h.orig
++++ xen/include/xen/mm.h
+@@ -588,8 +588,22 @@ void scrub_one_page(struct page_info *);
+ &(d)->xenpage_list : &(d)->page_list)
+ #endif
+
++union add_to_physmap_extra {
++ /*
++ * XENMAPSPACE_gmfn: When deferring TLB flushes, a page reference needs
++ * to be kept until after the flush, so the page can't get removed from
++ * the domain (and re-used for another purpose) beforehand. By passing
++ * non-NULL, the caller of xenmem_add_to_physmap_one() indicates it wants
++ * to have ownership of such a reference transferred in the success case.
++ */
++ struct page_info **ppage;
++
++ /* XENMAPSPACE_gmfn_foreign */
++ domid_t foreign_domid;
++};
++
+ int xenmem_add_to_physmap_one(struct domain *d, unsigned int space,
+- union xen_add_to_physmap_batch_extra extra,
++ union add_to_physmap_extra extra,
+ unsigned long idx, gfn_t gfn);
+
+ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp,
diff --git a/sysutils/xenkernel413/patches/patch-XSA347 b/sysutils/xenkernel413/patches/patch-XSA347
new file mode 100644
index 00000000000..dba51f23dea
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA347
@@ -0,0 +1,282 @@
+$NetBSD: patch-XSA347,v 1.1.2.2 2020/10/22 16:29:05 bsiegert Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: convert amd_iommu_pte from struct to union
+
+This is to add a "raw" counterpart to the bitfield equivalent. Take the
+opportunity and
+ - convert fields to bool / unsigned int,
+ - drop the naming of the reserved field,
+ - shorten the names of the ignored ones.
+
+This is part of XSA-347.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
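The conversion pairs the existing bitfield view with a raw 64-bit view of the same entry, so a whole PTE can be read or written in one access. In miniature -- the field widths below are abbreviated, not the authentic AMD layout:

    #include <stdbool.h>
    #include <stdint.h>

    union pte {
        uint64_t raw;                   /* whole-entry view, one load/store */
        struct {
            bool         pr:1;          /* present                 */
            unsigned int next_level:3;
            uint64_t     mfn:40;
            bool         iw:1, ir:1;    /* write / read permission */
        };
    };

Code can then build a union pte in a local variable and publish it through a single store to .raw, which is what the next part of the patch relies on.
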
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -38,7 +38,7 @@ static unsigned int pfn_to_pde_idx(unsig
+ static unsigned int clear_iommu_pte_present(unsigned long l1_mfn,
+ unsigned long dfn)
+ {
+- struct amd_iommu_pte *table, *pte;
++ union amd_iommu_pte *table, *pte;
+ unsigned int flush_flags;
+
+ table = map_domain_page(_mfn(l1_mfn));
+@@ -52,7 +52,7 @@ static unsigned int clear_iommu_pte_pres
+ return flush_flags;
+ }
+
+-static unsigned int set_iommu_pde_present(struct amd_iommu_pte *pte,
++static unsigned int set_iommu_pde_present(union amd_iommu_pte *pte,
+ unsigned long next_mfn,
+ unsigned int next_level, bool iw,
+ bool ir)
+@@ -87,7 +87,7 @@ static unsigned int set_iommu_pte_presen
+ int pde_level,
+ bool iw, bool ir)
+ {
+- struct amd_iommu_pte *table, *pde;
++ union amd_iommu_pte *table, *pde;
+ unsigned int flush_flags;
+
+ table = map_domain_page(_mfn(pt_mfn));
+@@ -178,7 +178,7 @@ void iommu_dte_set_guest_cr3(struct amd_
+ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn,
+ unsigned long pt_mfn[], bool map)
+ {
+- struct amd_iommu_pte *pde, *next_table_vaddr;
++ union amd_iommu_pte *pde, *next_table_vaddr;
+ unsigned long next_table_mfn;
+ unsigned int level;
+ struct page_info *table;
+@@ -458,7 +458,7 @@ int __init amd_iommu_quarantine_init(str
+ unsigned long end_gfn =
+ 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
+ unsigned int level = amd_iommu_get_paging_mode(end_gfn);
+- struct amd_iommu_pte *table;
++ union amd_iommu_pte *table;
+
+ if ( hd->arch.root_table )
+ {
+@@ -489,7 +489,7 @@ int __init amd_iommu_quarantine_init(str
+
+ for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
+ {
+- struct amd_iommu_pte *pde = &table[i];
++ union amd_iommu_pte *pde = &table[i];
+
+ /*
+ * PDEs are essentially a subset of PTEs, so this function
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -390,7 +390,7 @@ static void deallocate_next_page_table(s
+
+ static void deallocate_page_table(struct page_info *pg)
+ {
+- struct amd_iommu_pte *table_vaddr;
++ union amd_iommu_pte *table_vaddr;
+ unsigned int index, level = PFN_ORDER(pg);
+
+ PFN_ORDER(pg) = 0;
+@@ -405,7 +405,7 @@ static void deallocate_page_table(struct
+
+ for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
+ {
+- struct amd_iommu_pte *pde = &table_vaddr[index];
++ union amd_iommu_pte *pde = &table_vaddr[index];
+
+ if ( pde->mfn && pde->next_level && pde->pr )
+ {
+@@ -557,7 +557,7 @@ static void amd_dump_p2m_table_level(str
+ paddr_t gpa, int indent)
+ {
+ paddr_t address;
+- struct amd_iommu_pte *table_vaddr;
++ const union amd_iommu_pte *table_vaddr;
+ int index;
+
+ if ( level < 1 )
+@@ -573,7 +573,7 @@ static void amd_dump_p2m_table_level(str
+
+ for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
+ {
+- struct amd_iommu_pte *pde = &table_vaddr[index];
++ const union amd_iommu_pte *pde = &table_vaddr[index];
+
+ if ( !(index % 2) )
+ process_pending_softirqs();
+--- xen/include/asm-x86/hvm/svm/amd-iommu-defs.h.orig
++++ xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+@@ -465,20 +465,23 @@ union amd_iommu_x2apic_control {
+ #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4)
+ #define IOMMU_PAGE_TABLE_ALIGNMENT 4096
+
+-struct amd_iommu_pte {
+- uint64_t pr:1;
+- uint64_t ignored0:4;
+- uint64_t a:1;
+- uint64_t d:1;
+- uint64_t ignored1:2;
+- uint64_t next_level:3;
+- uint64_t mfn:40;
+- uint64_t reserved:7;
+- uint64_t u:1;
+- uint64_t fc:1;
+- uint64_t ir:1;
+- uint64_t iw:1;
+- uint64_t ignored2:1;
++union amd_iommu_pte {
++ uint64_t raw;
++ struct {
++ bool pr:1;
++ unsigned int ign0:4;
++ bool a:1;
++ bool d:1;
++ unsigned int ign1:2;
++ unsigned int next_level:3;
++ uint64_t mfn:40;
++ unsigned int :7;
++ bool u:1;
++ bool fc:1;
++ bool ir:1;
++ bool iw:1;
++ unsigned int ign2:1;
++ };
+ };
+
+ /* Paging modes */
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: update live PTEs atomically
+
+Updating a live PTE bitfield by bitfield risks the compiler re-ordering
+the individual updates as well as splitting individual updates into
+multiple memory writes. Construct the new entry fully in a local
+variable, do the check to determine the flushing needs on the thus
+established new entry, and then write the new entry by a single insn.
+
+Similarly using memset() to clear a PTE is unsafe, as the order of
+writes the function does is, at least in principle, undefined.
+
+This is part of XSA-347.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
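A condensed, stand-alone version of the new update flow: build the entry locally, derive the flush requirement from the complete new value, then publish it with one 64-bit store. GCC's __atomic builtins stand in here for Xen's read_atomic()/write_atomic(), and the union layout is again only illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    union pte {
        uint64_t raw;
        struct { bool pr:1; unsigned int lvl:3; uint64_t mfn:40; bool iw:1, ir:1; };
    };

    #define FLUSHF_added    1u          /* stand-ins for IOMMU_FLUSHF_* */
    #define FLUSHF_modified 2u

    static unsigned int pte_set(union pte *p, uint64_t mfn, bool iw, bool ir,
                                unsigned int lvl)
    {
        union pte new = { .raw = 0 }, old;
        unsigned int flush = FLUSHF_added;

        new.mfn = mfn;
        new.iw  = iw;
        new.ir  = ir;
        new.lvl = lvl;
        new.pr  = true;

        old.raw = __atomic_load_n(&p->raw, __ATOMIC_RELAXED);
        if ( old.pr && old.raw != new.raw )
            flush |= FLUSHF_modified;

        /* One store: the IOMMU can never observe a half-written entry. */
        __atomic_store_n(&p->raw, new.raw, __ATOMIC_RELAXED);

        return flush;
    }

Clearing an entry likewise becomes a single store of zero to .raw rather than a memset().
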
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -45,7 +45,7 @@ static unsigned int clear_iommu_pte_pres
+ pte = &table[pfn_to_pde_idx(dfn, 1)];
+
+ flush_flags = pte->pr ? IOMMU_FLUSHF_modified : 0;
+- memset(pte, 0, sizeof(*pte));
++ write_atomic(&pte->raw, 0);
+
+ unmap_domain_page(table);
+
+@@ -57,26 +57,30 @@ static unsigned int set_iommu_pde_presen
+ unsigned int next_level, bool iw,
+ bool ir)
+ {
++ union amd_iommu_pte new = {}, old;
+ unsigned int flush_flags = IOMMU_FLUSHF_added;
+
+- if ( pte->pr &&
+- (pte->mfn != next_mfn ||
+- pte->iw != iw ||
+- pte->ir != ir ||
+- pte->next_level != next_level) )
+- flush_flags |= IOMMU_FLUSHF_modified;
+-
+ /*
+ * FC bit should be enabled in PTE, this helps to solve potential
+ * issues with ATS devices
+ */
+- pte->fc = !next_level;
++ new.fc = !next_level;
++
++ new.mfn = next_mfn;
++ new.iw = iw;
++ new.ir = ir;
++ new.next_level = next_level;
++ new.pr = true;
++
++ old.raw = read_atomic(&pte->raw);
++ old.ign0 = 0;
++ old.ign1 = 0;
++ old.ign2 = 0;
++
++ if ( old.pr && old.raw != new.raw )
++ flush_flags |= IOMMU_FLUSHF_modified;
+
+- pte->mfn = next_mfn;
+- pte->iw = iw;
+- pte->ir = ir;
+- pte->next_level = next_level;
+- pte->pr = 1;
++ write_atomic(&pte->raw, new.raw);
+
+ return flush_flags;
+ }
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: ensure suitable ordering of DTE modifications
+
+DMA and interrupt translation should be enabled only after other
+applicable DTE fields have been written. Similarly when disabling
+translation or when moving a device between domains, translation should
+first be disabled, before other entry fields get modified. Note however
+that the "moving" aspect doesn't apply to the interrupt remapping side,
+as domain specifics are maintained in the IRTEs here, not the DTE. We
+also never disable interrupt remapping once it got enabled for a device
+(the respective argument passed is always the immutable iommu_intremap).
+
+This is part of XSA-347.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -107,11 +107,18 @@ void amd_iommu_set_root_page_table(struc
+ uint64_t root_ptr, uint16_t domain_id,
+ uint8_t paging_mode, bool valid)
+ {
++ if ( valid || dte->v )
++ {
++ dte->tv = false;
++ dte->v = true;
++ smp_wmb();
++ }
+ dte->domain_id = domain_id;
+ dte->pt_root = paddr_to_pfn(root_ptr);
+ dte->iw = true;
+ dte->ir = true;
+ dte->paging_mode = paging_mode;
++ smp_wmb();
+ dte->tv = true;
+ dte->v = valid;
+ }
+@@ -134,6 +141,7 @@ void amd_iommu_set_intremap_table(
+ }
+
+ dte->ig = false; /* unmapped interrupts result in i/o page faults */
++ smp_wmb();
+ dte->iv = valid;
+ }
+
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -120,7 +120,10 @@ static void amd_iommu_setup_domain_devic
+ /* Undo what amd_iommu_disable_domain_device() may have done. */
+ ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+ if ( dte->it_root )
++ {
+ dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
++ smp_wmb();
++ }
+ dte->iv = iommu_intremap;
+ dte->ex = ivrs_dev->dte_allow_exclusion;
+ dte->sys_mgt = MASK_EXTR(ivrs_dev->device_flags, ACPI_IVHD_SYSTEM_MGMT);