Apply the relevant Xen Security Advisories 385 through 402, and 404 (403 is
still not released). Bump PKGREVISION.
author: bouyer <bouyer@pkgsrc.org> 2022-06-24 13:47:37 +0000
committer: bouyer <bouyer@pkgsrc.org> 2022-06-24 13:47:37 +0000
commit: d6fc92c6e901bd545b6781a01bc993ffc8f7de00 (patch)
tree: 37844950d1050816f4f88c0c700fffc5b2b22d92 /sysutils
parent: fd67f46ab80259714e4918f216a10835fdd61b8e (diff)
download: pkgsrc-d6fc92c6e901bd545b6781a01bc993ffc8f7de00.tar.gz
12 files changed, 5412 insertions, 2 deletions
diff --git a/sysutils/xenkernel413/Makefile b/sysutils/xenkernel413/Makefile
index 33a2fe9626f..36bbd3f7538 100644
--- a/sysutils/xenkernel413/Makefile
+++ b/sysutils/xenkernel413/Makefile
@@ -1,8 +1,9 @@
-# $NetBSD: Makefile,v 1.15 2021/09/21 13:20:47 bouyer Exp $
+# $NetBSD: Makefile,v 1.16 2022/06/24 13:47:37 bouyer Exp $
 
 VERSION=	4.13.4
 DISTNAME=	xen-${VERSION}
 PKGNAME=	xenkernel413-${VERSION}
+PKGREVISION=	1
 CATEGORIES=	sysutils
 MASTER_SITES=	https://downloads.xenproject.org/release/xen/${VERSION}/
 DIST_SUBDIR=	xen413
diff --git a/sysutils/xenkernel413/distinfo b/sysutils/xenkernel413/distinfo
index 9f86c8bf79c..5e576571045 100644
--- a/sysutils/xenkernel413/distinfo
+++ b/sysutils/xenkernel413/distinfo
@@ -1,9 +1,19 @@
-$NetBSD: distinfo,v 1.13 2021/10/26 11:20:25 nia Exp $
+$NetBSD: distinfo,v 1.14 2022/06/24 13:47:37 bouyer Exp $
 
 BLAKE2s (xen413/xen-4.13.4.tar.gz) = b88ad78f8716c98253a8d3aae7622c1e3214efbc80c008518ae0104ef0eed661
 SHA512 (xen413/xen-4.13.4.tar.gz) = 1f6d67e0270b10be45b6444322ced791b44df09a3a51e0fe690f5ad76cd80d35115efc93056e99f73b4e550178e0e780c9ee827ced04b09caf12fdf34d9a9b71
 Size (xen413/xen-4.13.4.tar.gz) = 39055744 bytes
 SHA1 (patch-Config.mk) = 9372a09efd05c9fbdbc06f8121e411fcb7c7ba65
+SHA1 (patch-XSA385) = 5245aeb10dcfb9c97792f024942718b03c451cf5
+SHA1 (patch-XSA388) = c9d33d0770ee634aefa33805c17ccebea2879643
+SHA1 (patch-XSA389) = 04f6ec483f5fe1d8a47ce689a0a883871bda5214
+SHA1 (patch-XSA397) = 4aebc96ec37dc74e67d86d90abdf86b2516d0120
+SHA1 (patch-XSA398) = 9185899eef317ebbff8a0f1aa611c49a5e1c87e1
+SHA1 (patch-XSA399) = c9ab4473654810ca2701dfc38c26e91a0d7f2eb5
+SHA1 (patch-XSA400) = 90c8fcc1dd06e1a5c7667bc1a69145602ac692e9
+SHA1 (patch-XSA401) = 404d6899a161407618e2ab37e18d8f9e7ec61b1d
+SHA1 (patch-XSA402) = 92e585f077e15a3c67ba68044086ce8e0fc5379a
+SHA1 (patch-XSA404) = d562d5379673d0d23d0496438e00e0f131e7ac73
 SHA1 (patch-fixpvh) = eec14f19d0adc6d96035d6c711270bb375304660
 SHA1 (patch-xen_Makefile) = 465388d80de414ca3bb84faefa0f52d817e423a6
 SHA1 (patch-xen_Rules.mk) = c743dc63f51fc280d529a7d9e08650292c171dac
diff --git a/sysutils/xenkernel413/patches/patch-XSA385 b/sysutils/xenkernel413/patches/patch-XSA385
new file mode 100644
index 00000000000..15dee78ca54
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA385
@@ -0,0 +1,80 @@
+$NetBSD: patch-XSA385,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Julien Grall <jgrall@amazon.com>
+Subject: xen/page_alloc: Harden assign_pages()
+
+domain_tot_pages() and d->max_pages are 32-bit values. While the order
+should always be quite small, it would still be possible to overflow
+if domain_tot_pages() is near to (2^32 - 1).
+
+As this code may be called by a guest via XENMEM_increase_reservation
+and XENMEM_populate_physmap, we want to make sure the guest is not going
+to be able to allocate more than it is allowed.
+
+Rework the allocation check to avoid any possible overflow. While the
+check domain_tot_pages() < d->max_pages should technically not be
+necessary, it is probably best to have it to catch any possible
+inconsistencies in the future.
+
+This is CVE-2021-28706 / XSA-385.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/common/grant_table.c.orig
++++ xen/common/grant_table.c
+@@ -2286,7 +2286,8 @@ gnttab_transfer(
+          * pages when it is dying.
+          */
+         if ( unlikely(e->is_dying) ||
+-             unlikely(e->tot_pages >= e->max_pages) )
++             unlikely(e->tot_pages >= e->max_pages) ||
++             unlikely(!(e->tot_pages + 1)) )
+         {
+             spin_unlock(&e->page_alloc_lock);
+ 
+@@ -2295,8 +2296,8 @@ gnttab_transfer(
+                          e->domain_id);
+             else
+                 gdprintk(XENLOG_INFO,
+-                         "Transferee d%d has no headroom (tot %u, max %u)\n",
+-                         e->domain_id, e->tot_pages, e->max_pages);
++                         "Transferee %pd has no headroom (tot %u, max %u)\n",
++                         e, e->tot_pages, e->max_pages);
+ 
+             gop.status = GNTST_general_error;
+             goto unlock_and_copyback;
+--- xen/common/page_alloc.c.orig
++++ xen/common/page_alloc.c
+@@ -2276,16 +2276,25 @@ int assign_pages(
+ 
+     if ( !(memflags & MEMF_no_refcount) )
+     {
+-        if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
++        unsigned int nr = 1u << order;
++
++        if ( unlikely(d->tot_pages > d->max_pages) )
++        {
++            gprintk(XENLOG_INFO, "Inconsistent allocation for %pd: %u > %u\n",
++                    d, d->tot_pages, d->max_pages);
++            rc = -EPERM;
++            goto out;
++        }
++
++        if ( unlikely(nr > d->max_pages - d->tot_pages) )
+         {
+-            gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
+-                    "%u > %u\n", d->domain_id,
+-                    d->tot_pages + (1 << order), d->max_pages);
++            gprintk(XENLOG_INFO, "Over-allocation for %pd: %Lu > %u\n",
++                    d, d->tot_pages + 0ull + nr, d->max_pages);
+             rc = -E2BIG;
+             goto out;
+         }
+ 
+-        if ( unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) )
++        if ( unlikely(domain_adjust_tot_pages(d, nr) == nr) )
+             get_knownalive_domain(d);
+     }
+ 
diff --git a/sysutils/xenkernel413/patches/patch-XSA388 b/sysutils/xenkernel413/patches/patch-XSA388
new file mode 100644
index 00000000000..591e6fcf24f
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA388
@@ -0,0 +1,212 @@
+$NetBSD: patch-XSA388,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PoD: deal with misaligned GFNs
+
+Users of XENMEM_decrease_reservation and XENMEM_populate_physmap aren't
+required to pass in order-aligned GFN values. (While I consider this
+bogus, I don't think we can fix this there, as that might break existing
+code, e.g Linux'es swiotlb, which - while affecting PV only - until
+recently had been enforcing only page alignment on the original
+allocation.) Only non-PoD code paths (guest_physmap_{add,remove}_page(),
+p2m_set_entry()) look to be dealing with this properly (in part by being
+implemented inefficiently, handling every 4k page separately).
+
+Introduce wrappers taking care of splitting the incoming request into
+aligned chunks, without putting much effort in trying to determine the
+largest possible chunk at every iteration.
+
+Also "handle" p2m_set_entry() failure for non-order-0 requests by
+crashing the domain in one more place. Alongside putting a log message
+there, also add one to the other similar path.
+
+Note regarding locking: This is left in the actual worker functions on
+the assumption that callers aren't guaranteed atomicity wrt acting on
+multiple pages at a time. For mis-aligned GFNs gfn_lock() wouldn't have
+locked the correct GFN range anyway, if it didn't simply resolve to
+p2m_lock(), and for well-behaved callers there continues to be only a
+single iteration, i.e. behavior is unchanged for them. (FTAOD pulling
+out just pod_lock() into p2m_pod_decrease_reservation() would result in
+a lock order violation.)
+
+This is CVE-2021-28704 and CVE-2021-28707 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/arch/x86/mm/p2m-pod.c.orig
++++ xen/arch/x86/mm/p2m-pod.c
+@@ -495,7 +495,7 @@ p2m_pod_zero_check_superpage(struct p2m_
+ 
+ 
+ /*
+- * This function is needed for two reasons:
++ * This pair of functions is needed for two reasons:
+  * + To properly handle clearing of PoD entries
+  * + To "steal back" memory being freed for the PoD cache, rather than
+  *   releasing it.
+@@ -503,8 +503,8 @@ p2m_pod_zero_check_superpage(struct p2m_
+  * Once both of these functions have been completed, we can return and
+  * allow decrease_reservation() to handle everything else.
+  */
+-unsigned long
+-p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++static unsigned long
++decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
+ {
+     unsigned long ret = 0, i, n;
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -551,8 +551,10 @@ p2m_pod_decrease_reservation(struct doma
+          * All PoD: Mark the whole region invalid and tell caller
+          * we're done.
+          */
+-        if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
+-                           p2m->default_access) )
++        int rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
++                               p2m->default_access);
++
++        if ( rc )
+         {
+             /*
+              * If this fails, we can't tell how much of the range was changed.
+@@ -560,7 +562,12 @@ p2m_pod_decrease_reservation(struct doma
+              * impossible.
+              */
+             if ( order != 0 )
++            {
++                printk(XENLOG_G_ERR
++                       "%pd: marking GFN %#lx (order %u) as non-PoD failed: %d\n",
++                       d, gfn_x(gfn), order, rc);
+                 domain_crash(d);
++            }
+             goto out_unlock;
+         }
+         ret = 1UL << order;
+@@ -667,6 +674,22 @@ out_unlock:
+     return ret;
+ }
+ 
++unsigned long
++p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
++{
++    unsigned long left = 1UL << order, ret = 0;
++    unsigned int chunk_order = find_first_set_bit(gfn_x(gfn) | left);
++
++    do {
++        ret += decrease_reservation(d, gfn, chunk_order);
++
++        left -= 1UL << chunk_order;
++        gfn = gfn_add(gfn, 1UL << chunk_order);
++    } while ( left );
++
++    return ret;
++}
++
+ void p2m_pod_dump_data(struct domain *d)
+ {
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -1266,19 +1289,15 @@ remap_and_retry:
+     return true;
+ }
+ 
+-
+-int
+-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
+-                                      unsigned int order)
++static int
++mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
++                        unsigned int order)
+ {
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+     gfn_t gfn = _gfn(gfn_l);
+     unsigned long i, n, pod_count = 0;
+     int rc = 0;
+ 
+-    if ( !paging_mode_translate(d) )
+-        return -EINVAL;
+-
+     gfn_lock(p2m, gfn, order);
+ 
+     P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
+@@ -1316,12 +1335,44 @@ guest_physmap_mark_populate_on_demand(st
+         BUG_ON(p2m->pod.entry_count < 0);
+         pod_unlock(p2m);
+     }
++    else if ( order )
++    {
++        /*
++         * If this failed, we can't tell how much of the range was changed.
++         * Best to crash the domain.
++         */
++        printk(XENLOG_G_ERR
++               "%pd: marking GFN %#lx (order %u) as PoD failed: %d\n",
++               d, gfn_l, order, rc);
++        domain_crash(d);
++    }
+ 
+ out:
+     gfn_unlock(p2m, gfn, order);
+ 
+     return rc;
+ }
++
++int
++guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
++                                      unsigned int order)
++{
++    unsigned long left = 1UL << order;
++    unsigned int chunk_order = find_first_set_bit(gfn | left);
++    int rc;
++
++    if ( !paging_mode_translate(d) )
++        return -EINVAL;
++
++    do {
++        rc = mark_populate_on_demand(d, gfn, chunk_order);
++
++        left -= 1UL << chunk_order;
++        gfn += 1UL << chunk_order;
++    } while ( !rc && left );
++
++    return rc;
++}
+ 
+ void p2m_pod_init(struct p2m_domain *p2m)
+ {
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/PoD: handle intermediate page orders in p2m_pod_cache_add()
+
+p2m_pod_decrease_reservation() may pass pages to the function which
+aren't 4k, 2M, or 1G. Handle all intermediate orders as well, to avoid
+hitting the BUG() at the switch() statement's "default" case.
+
+This is CVE-2021-28708 / part of XSA-388.
+
+Fixes: 3c352011c0d3 ("x86/PoD: shorten certain operations on higher order ranges")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/arch/x86/mm/p2m-pod.c.orig
++++ xen/arch/x86/mm/p2m-pod.c
+@@ -111,15 +111,13 @@ p2m_pod_cache_add(struct p2m_domain *p2m
+     /* Then add to the appropriate populate-on-demand list. */
+     switch ( order )
+     {
+-    case PAGE_ORDER_1G:
+-        for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
++    case PAGE_ORDER_2M ... PAGE_ORDER_1G:
++        for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_2M )
+             page_list_add_tail(page + i, &p2m->pod.super);
+         break;
+-    case PAGE_ORDER_2M:
+-        page_list_add_tail(page, &p2m->pod.super);
+-        break;
+-    case PAGE_ORDER_4K:
+-        page_list_add_tail(page, &p2m->pod.single);
++    case PAGE_ORDER_4K ... PAGE_ORDER_2M - 1:
++        for ( i = 0; i < (1UL << order); i += 1UL << PAGE_ORDER_4K )
++            page_list_add_tail(page + i, &p2m->pod.single);
+         break;
+     default:
+         BUG();
diff --git a/sysutils/xenkernel413/patches/patch-XSA389 b/sysutils/xenkernel413/patches/patch-XSA389
new file mode 100644
index 00000000000..26882e36f71
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA389
@@ -0,0 +1,182 @@
+$NetBSD: patch-XSA389,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/P2M: deal with partial success of p2m_set_entry()
+
+M2P and PoD stats need to remain in sync with P2M; if an update succeeds
+only partially, respective adjustments need to be made. If updates get
+made before the call, they may also need undoing upon complete failure
+(i.e. including the single-page case).
+
+Log-dirty state would better also be kept in sync.
+
+Note that the change to set_typed_p2m_entry() may not be strictly
+necessary (due to the order restriction enforced near the top of the
+function), but is being kept here to be on the safe side.
+
+This is CVE-2021-28705 and CVE-2021-28709 / XSA-389.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/arch/x86/mm/p2m.c.orig
++++ xen/arch/x86/mm/p2m.c
+@@ -781,6 +781,7 @@ p2m_remove_page(struct p2m_domain *p2m,
+     gfn_t gfn = _gfn(gfn_l);
+     p2m_type_t t;
+     p2m_access_t a;
++    int rc;
+ 
+     /* IOMMU for PV guests is handled in get_page_type() and put_page(). */
+     if ( !paging_mode_translate(p2m->domain) )
+@@ -812,8 +813,27 @@ p2m_remove_page(struct p2m_domain *p2m,
+                 set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
+         }
+     }
+-    return p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
+-                         p2m->default_access);
++    rc = p2m_set_entry(p2m, gfn, INVALID_MFN, page_order, p2m_invalid,
++                       p2m->default_access);
++    if ( likely(!rc) || !mfn_valid(_mfn(mfn)) )
++        return rc;
++
++    /*
++     * The operation may have partially succeeded. For the failed part we need
++     * to undo the M2P update and, out of precaution, mark the pages dirty
++     * again.
++     */
++    for ( i = 0; i < (1UL << page_order); ++i )
++    {
++        p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, NULL, NULL);
++        if ( !p2m_is_hole(t) && !p2m_is_special(t) && !p2m_is_shared(t) )
++        {
++            set_gpfn_from_mfn(mfn + i, gfn_l + i);
++            paging_mark_pfn_dirty(p2m->domain, _pfn(gfn_l + i));
++        }
++    }
++
++    return rc;
+ }
+ 
+ int
+@@ -1002,13 +1022,8 @@ guest_physmap_add_entry(struct domain *d
+ 
+     /* Now, actually do the two-way mapping */
+     rc = p2m_set_entry(p2m, gfn, mfn, page_order, t, p2m->default_access);
+-    if ( rc == 0 )
++    if ( likely(!rc) )
+     {
+-        pod_lock(p2m);
+-        p2m->pod.entry_count -= pod_count;
+-        BUG_ON(p2m->pod.entry_count < 0);
+-        pod_unlock(p2m);
+-
+         if ( !p2m_is_grant(t) )
+         {
+             for ( i = 0; i < (1UL << page_order); i++ )
+@@ -1016,6 +1031,42 @@ guest_physmap_add_entry(struct domain *d
+                                   gfn_x(gfn_add(gfn, i)));
+         }
+     }
++    else
++    {
++        /*
++         * The operation may have partially succeeded. For the successful part
++         * we need to update M2P and dirty state, while for the failed part we
++         * may need to adjust PoD stats as well as undo the earlier M2P update.
++         */
++        for ( i = 0; i < (1UL << page_order); ++i )
++        {
++            omfn = p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, NULL, NULL);
++            if ( p2m_is_pod(ot) )
++            {
++                BUG_ON(!pod_count);
++                --pod_count;
++            }
++            else if ( mfn_eq(omfn, mfn_add(mfn, i)) && ot == t &&
++                      a == p2m->default_access && !p2m_is_grant(t) )
++            {
++                set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++                paging_mark_pfn_dirty(d, _pfn(gfn_x(gfn) + i));
++            }
++            else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
++            {
++                ASSERT(mfn_valid(omfn));
++                set_gpfn_from_mfn(mfn_x(omfn), gfn_x(gfn) + i);
++            }
++        }
++    }
++
++    if ( pod_count )
++    {
++        pod_lock(p2m);
++        p2m->pod.entry_count -= pod_count;
++        BUG_ON(p2m->pod.entry_count < 0);
++        pod_unlock(p2m);
++    }
+ 
+  out:
+     p2m_unlock(p2m);
+@@ -1307,6 +1358,49 @@ static int set_typed_p2m_entry(struct do
+             return 0;
+         }
+     }
++
++    P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
++    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
++    if ( unlikely(rc) )
++    {
++        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
++                 gfn_l, order, rc, mfn_x(mfn));
++
++        /*
++         * The operation may have partially succeeded. For the successful part
++         * we need to update PoD stats, M2P, and dirty state.
++         */
++        if ( order != PAGE_ORDER_4K )
++        {
++            unsigned long i;
++
++            for ( i = 0; i < (1UL << order); ++i )
++            {
++                p2m_type_t t;
++                mfn_t cmfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0,
++                                            NULL, NULL);
++
++                if ( !mfn_eq(cmfn, mfn_add(mfn, i)) || t != gfn_p2mt ||
++                     a != access )
++                    continue;
++
++                if ( p2m_is_ram(ot) )
++                {
++                    ASSERT(mfn_valid(mfn_add(omfn, i)));
++                    set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
++                }
++#ifdef CONFIG_HVM
++                else if ( p2m_is_pod(ot) )
++                {
++                    pod_lock(p2m);
++                    BUG_ON(!p2m->pod.entry_count);
++                    --p2m->pod.entry_count;
++                    pod_unlock(p2m);
++                }
++#endif
++            }
++        }
++    }
+     else if ( p2m_is_ram(ot) )
+     {
+         unsigned long i;
+@@ -1317,12 +1411,6 @@ static int set_typed_p2m_entry(struct do
+             set_gpfn_from_mfn(mfn_x(omfn) + i, INVALID_M2P_ENTRY);
+         }
+     }
+-
+-    P2M_DEBUG("set %d %lx %lx\n", gfn_p2mt, gfn_l, mfn_x(mfn));
+-    rc = p2m_set_entry(p2m, gfn, mfn, order, gfn_p2mt, access);
+-    if ( rc )
+-        gdprintk(XENLOG_ERR, "p2m_set_entry: %#lx:%u -> %d (0x%"PRI_mfn")\n",
+-                 gfn_l, order, rc, mfn_x(mfn));
+ #ifdef CONFIG_HVM
+     else if ( p2m_is_pod(ot) )
+     {
diff --git a/sysutils/xenkernel413/patches/patch-XSA397 b/sysutils/xenkernel413/patches/patch-XSA397
new file mode 100644
index 00000000000..52f8e01e650
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA397
@@ -0,0 +1,100 @@
+$NetBSD: patch-XSA397,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Roger Pau Monne <roger.pau@citrix.com>
+Subject: x86/hap: do not switch on log dirty for VRAM tracking
+
+XEN_DMOP_track_dirty_vram possibly calls into paging_log_dirty_enable
+when using HAP mode, and it can interact badly with other ongoing
+paging domctls, as XEN_DMOP_track_dirty_vram is not holding the domctl
+lock.
+
+This was detected as a result of the following assert triggering when
+doing repeated migrations of a HAP HVM domain with a stubdom:
+
+Assertion 'd->arch.paging.log_dirty.allocs == 0' failed at paging.c:198
+----[ Xen-4.17-unstable  x86_64  debug=y  Not tainted ]----
+CPU:    34
+RIP:    e008:[<ffff82d040314b3b>] arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x6
+RFLAGS: 0000000000010206   CONTEXT: hypervisor (d0v23)
+[...]
+Xen call trace:
+   [<ffff82d040314b3b>] R arch/x86/mm/paging.c#paging_free_log_dirty_bitmap+0x606/0x63a
+   [<ffff82d040279f96>] S xsm/flask/hooks.c#domain_has_perm+0x5a/0x67
+   [<ffff82d04031577f>] F paging_domctl+0x251/0xd41
+   [<ffff82d04031640c>] F paging_domctl_continuation+0x19d/0x202
+   [<ffff82d0403202fa>] F pv_hypercall+0x150/0x2a7
+   [<ffff82d0403a729d>] F lstar_enter+0x12d/0x140
+
+Such assert triggered because the stubdom used
+XEN_DMOP_track_dirty_vram while dom0 was in the middle of executing
+XEN_DOMCTL_SHADOW_OP_OFF, and so log dirty become enabled while
+retiring the old structures, thus leading to new entries being
+populated in already clear slots.
+
+Fix this by not enabling log dirty for VRAM tracking, similar to what
+is done when using shadow instead of HAP. Call
+p2m_enable_hardware_log_dirty when enabling VRAM tracking in order to
+get some hardware assistance if available. As a side effect the memory
+pressure on the p2m pool should go down if only VRAM tracking is
+enabled, as the dirty bitmap is no longer allocated.
+
+Note that paging_log_dirty_range (used to get the dirty bitmap for
+VRAM tracking) doesn't use the log dirty bitmap, and instead relies on
+checking whether each gfn on the range has been switched from
+p2m_ram_logdirty to p2m_ram_rw in order to account for dirty pages.
+
+This is CVE-2022-26356 / XSA-397.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+--- xen/include/asm-x86/paging.h.orig
++++ xen/include/asm-x86/paging.h
+@@ -160,9 +160,6 @@ void paging_log_dirty_range(struct domai
+                             unsigned long nr,
+                             uint8_t *dirty_bitmap);
+ 
+-/* enable log dirty */
+-int paging_log_dirty_enable(struct domain *d, bool log_global);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d, const struct log_dirty_ops *ops);
+ 
+--- xen/arch/x86/mm/hap/hap.c.orig
++++ xen/arch/x86/mm/hap/hap.c
+@@ -69,13 +69,6 @@ int hap_track_dirty_vram(struct domain *
+     {
+         int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+ 
+-        if ( !paging_mode_log_dirty(d) )
+-        {
+-            rc = paging_log_dirty_enable(d, false);
+-            if ( rc )
+-                goto out;
+-        }
+-
+         rc = -ENOMEM;
+         dirty_bitmap = vzalloc(size);
+         if ( !dirty_bitmap )
+@@ -107,6 +100,10 @@ int hap_track_dirty_vram(struct domain *
+ 
+             paging_unlock(d);
+ 
++            domain_pause(d);
++            p2m_enable_hardware_log_dirty(d);
++            domain_unpause(d);
++
+             if ( oend > ostart )
+                 p2m_change_type_range(d, ostart, oend,
+                                       p2m_ram_logdirty, p2m_ram_rw);
+--- xen/arch/x86/mm/paging.c.orig
++++ xen/arch/x86/mm/paging.c
+@@ -209,7 +209,7 @@ static int paging_free_log_dirty_bitmap(
+     return rc;
+ }
+ 
+-int paging_log_dirty_enable(struct domain *d, bool log_global)
++static int paging_log_dirty_enable(struct domain *d, bool log_global)
+ {
+     int ret;
+ 
diff --git a/sysutils/xenkernel413/patches/patch-XSA398 b/sysutils/xenkernel413/patches/patch-XSA398
new file mode 100644
index 00000000000..92aa098920f
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA398
@@ -0,0 +1,58 @@
+$NetBSD: patch-XSA398,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From 7b9814b250a5a28277bd0866d341a5cfc0f4c1ac Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 7 Mar 2022 16:35:52 +0000
+Subject: x86/spec-ctrl: Cease using thunk=lfence on AMD
+
+AMD have updated their Spectre v2 guidance, and lfence/jmp is no longer
+considered safe.  AMD are recommending using retpoline everywhere.
+
+Update the default heuristics to never select THUNK_LFENCE.
+
+This is part of XSA-398 / CVE-2021-26401.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 8d03080d2a339840d3a59e0932a94f804e45110d)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index cf9dea62dbfd..eead69ada2c2 100644
+--- docs/misc/xen-command-line.pandoc.orig
++++ docs/misc/xen-command-line.pandoc
+@@ -2077,9 +2077,9 @@ to use.
+ 
+ If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to
+ select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
+-locations.  The default thunk is `retpoline` (generally preferred for Intel
+-hardware), with the alternatives being `jmp` (a `jmp *%reg` gadget, minimal
+-overhead), and `lfence` (an `lfence; jmp *%reg` gadget, preferred for AMD).
++locations.  The default thunk is `retpoline` (generally preferred), with the
++alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and
++`lfence` (an `lfence; jmp *%reg` gadget).
+ 
+ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
+ `ibrs=` option can be used to force or prevent Xen using the feature itself.
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 1cfd02d7d7cf..7447d4a8e5b5 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -908,16 +908,10 @@ void __init init_speculation_mitigations(void)
+         if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) )
+         {
+             /*
+-             * AMD's recommended mitigation is to set lfence as being dispatch
+-             * serialising, and to use IND_THUNK_LFENCE.
+-             */
+-            if ( cpu_has_lfence_dispatch )
+-                thunk = THUNK_LFENCE;
+-            /*
+-             * On Intel hardware, we'd like to use retpoline in preference to
++             * On all hardware, we'd like to use retpoline in preference to
+              * IBRS, but only if it is safe on this hardware.
+              */
+-            else if ( retpoline_safe(caps) )
++            if ( retpoline_safe(caps) )
+                 thunk = THUNK_RETPOLINE;
+             else if ( boot_cpu_has(X86_FEATURE_IBRSB) )
+                 ibrs = true;
diff --git a/sysutils/xenkernel413/patches/patch-XSA399 b/sysutils/xenkernel413/patches/patch-XSA399
new file mode 100644
index 00000000000..b42b1fa219c
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA399
@@ -0,0 +1,47 @@
+$NetBSD: patch-XSA399,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: correct ordering of operations in cleanup_domid_map()
+
+The function may be called without any locks held (leaving aside the
+domctl one, which we surely don't want to depend on here), so needs to
+play safe wrt other accesses to domid_map[] and domid_bitmap[]. This is
+to avoid context_set_domain_id()'s writing of domid_map[] to be reset to
+zero right away in the case of it racing the freeing of a DID.
+
+For the interaction with context_set_domain_id() and ->domid_map[] reads
+see the code comment.
+
+{check_,}cleanup_domid_map() are called with pcidevs_lock held or during
+domain cleanup only (and pcidevs_lock is also held around
+context_set_domain_id()), i.e. racing calls with the same (dom, iommu)
+tuple cannot occur.
+
+domain_iommu_domid(), besides its use by cleanup_domid_map(), has its
+result used only to control flushing, and hence a stale result would
+only lead to a stray extra flush.
+
+This is CVE-2022-26357 / XSA-399.
+
+Fixes: b9c20c78789f ("VT-d: per-iommu domain-id")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -152,8 +152,14 @@ static void cleanup_domid_map(struct dom
+ 
+     if ( iommu_domid >= 0 )
+     {
++        /*
++         * Update domid_map[] /before/ domid_bitmap[] to avoid a race with
++         * context_set_domain_id(), setting the slot to DOMID_INVALID for
++         * ->domid_map[] reads to produce a suitable value while the bit is
++         * still set.
++         */
++        iommu->domid_map[iommu_domid] = DOMID_INVALID;
+         clear_bit(iommu_domid, iommu->domid_bitmap);
+-        iommu->domid_map[iommu_domid] = 0;
+     }
+ }
+ 
diff --git a/sysutils/xenkernel413/patches/patch-XSA400 b/sysutils/xenkernel413/patches/patch-XSA400
new file mode 100644
index 00000000000..3c2e80252b5
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA400
@@ -0,0 +1,3149 @@
+$NetBSD: patch-XSA400,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: split domid map cleanup check into a function
+
+This logic will want invoking from elsewhere.
+
+No functional change intended.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -153,6 +153,51 @@ static void cleanup_domid_map(struct dom
+     }
+ }
+ 
++static bool any_pdev_behind_iommu(const struct domain *d,
++                                  const struct pci_dev *exclude,
++                                  const struct vtd_iommu *iommu)
++{
++    const struct pci_dev *pdev;
++
++    for_each_pdev ( d, pdev )
++    {
++        const struct acpi_drhd_unit *drhd;
++
++        if ( pdev == exclude )
++            continue;
++
++        drhd = acpi_find_matched_drhd_unit(pdev);
++        if ( drhd && drhd->iommu == iommu )
++            return true;
++    }
++
++    return false;
++}
++
++/*
++ * If no other devices under the same iommu owned by this domain,
++ * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap.
++ */
++static void check_cleanup_domid_map(struct domain *d,
++                                    const struct pci_dev *exclude,
++                                    struct vtd_iommu *iommu)
++{
++    bool found = any_pdev_behind_iommu(d, exclude, iommu);
++
++    /*
++     * Hidden devices are associated with DomXEN but usable by the hardware
++     * domain. Hence they need considering here as well.
++     */
++    if ( !found && is_hardware_domain(d) )
++        found = any_pdev_behind_iommu(dom_xen, exclude, iommu);
++
++    if ( !found )
++    {
++        clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap);
++        cleanup_domid_map(d, iommu);
++    }
++}
++
+ static int iommus_incoherent;
+ 
+ static void sync_cache(const void *addr, unsigned int size)
+@@ -1685,7 +1730,6 @@ static int domain_context_unmap(struct d
+     struct vtd_iommu *iommu;
+     int ret = 0;
+     u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
+-    int found = 0;
+ 
+     drhd = acpi_find_matched_drhd_unit(pdev);
+     if ( !drhd )
+@@ -1769,28 +1813,8 @@ static int domain_context_unmap(struct d
+     if ( ret )
+         goto out;
+ 
+-    /*
+-     * if no other devices under the same iommu owned by this domain,
+-     * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
+-     */
+-    for_each_pdev ( domain, pdev )
+-    {
+-        if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
+-            continue;
+-
+-        drhd = acpi_find_matched_drhd_unit(pdev);
+-        if ( drhd && drhd->iommu == iommu )
+-        {
+-            found = 1;
+-            break;
+-        }
+-    }
+-
+-    if ( found == 0 )
+-    {
+-        clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
+-        cleanup_domid_map(domain, iommu);
+-    }
++    if ( !ret )
++        check_cleanup_domid_map(domain, pdev, iommu);
+ 
+ out:
+     return ret;
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: fix (de)assign ordering when RMRRs are in use
+
+In the event that the RMRR mappings are essential for device operation,
+they should be established before updating the device's context entry,
+while they should be torn down only after the device's context entry was
+successfully updated.
+
+Also adjust a related log message.
+
+This is CVE-2022-26358 / part of XSA-400.
+
+Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -2392,6 +2392,10 @@ static int reassign_device_ownership(
+ {
+     int ret;
+ 
++    ret = domain_context_unmap(source, devfn, pdev);
++    if ( ret )
++        return ret;
++
+     /*
+      * Devices assigned to untrusted domains (here assumed to be any domU)
+      * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
+@@ -2428,10 +2432,6 @@ static int reassign_device_ownership(
+             }
+     }
+ 
+-    ret = domain_context_unmap(source, devfn, pdev);
+-    if ( ret )
+-        return ret;
+-
+     if ( devfn == pdev->devfn && pdev->domain != dom_io )
+     {
+         list_move(&pdev->domain_list, &dom_io->pdev_list);
+@@ -2508,9 +2508,8 @@ static int intel_iommu_assign_device(
+         }
+     }
+ 
+-    ret = reassign_device_ownership(s, d, devfn, pdev);
+-    if ( ret || d == dom_io )
+-        return ret;
++    if ( d == dom_io )
++        return reassign_device_ownership(s, d, devfn, pdev);
+ 
+     /* Setup rmrr identity mapping */
+     for_each_rmrr_device( rmrr, bdf, i )
+@@ -2523,20 +2522,37 @@ static int intel_iommu_assign_device(
+                                          rmrr->end_address, flag);
+             if ( ret )
+             {
+-                int rc;
+-
+-                rc = reassign_device_ownership(d, s, devfn, pdev);
+                 printk(XENLOG_G_ERR VTDPREFIX
+-                       " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
+-                       rmrr->base_address, rmrr->end_address,
+-                       d->domain_id, ret);
+-                if ( rc )
+-                {
+-                    printk(XENLOG_ERR VTDPREFIX
+-                           " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
+-                           seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
+-                    domain_crash(d);
+-                }
++                       "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n",
++                       d, rmrr->base_address, rmrr->end_address, ret);
++                break;
++            }
++        }
++    }
++
++    if ( !ret )
++        ret = reassign_device_ownership(s, d, devfn, pdev);
++
++    /* See reassign_device_ownership() for the hwdom aspect. */
++    if ( !ret || is_hardware_domain(d) )
++        return ret;
++
++    for_each_rmrr_device( rmrr, bdf, i )
++    {
++        if ( rmrr->segment == seg &&
++             PCI_BUS(bdf) == bus &&
++             PCI_DEVFN2(bdf) == devfn )
++        {
++            int rc = iommu_identity_mapping(d, p2m_access_x,
++                                            rmrr->base_address,
++                                            rmrr->end_address, 0);
++
++            if ( rc && rc != -ENOENT )
++            {
++                printk(XENLOG_ERR VTDPREFIX
++                       "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n",
++                       d, rmrr->base_address, rmrr->end_address, rc);
++                domain_crash(d);
+                 break;
+             }
+         }
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: fix add/remove ordering when RMRRs are in use
+
+In the event that the RMRR mappings are essential for device operation,
+they should be established before updating the device's context entry,
+while they should be torn down only after the device's context entry was
+successfully cleared.
+
+Also switch to %pd in related log messages.
+
+Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR")
+Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1993,14 +1993,6 @@ static int intel_iommu_add_device(u8 dev
+     if ( !pdev->domain )
+         return -EINVAL;
+ 
+-    ret = domain_context_mapping(pdev->domain, devfn, pdev);
+-    if ( ret )
+-    {
+-        dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
+-                pdev->domain->domain_id);
+-        return ret;
+-    }
+-
+     for_each_rmrr_device ( rmrr, bdf, i )
+     {
+         if ( rmrr->segment == pdev->seg &&
+@@ -2017,12 +2009,17 @@ static int intel_iommu_add_device(u8 dev
+                                          rmrr->base_address, rmrr->end_address,
+                                          0);
+             if ( ret )
+-                dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
+-                        pdev->domain->domain_id);
++                dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n",
++                        pdev->domain);
+         }
+     }
+ 
+-    return 0;
++    ret = domain_context_mapping(pdev->domain, devfn, pdev);
++    if ( ret )
++        dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n",
++                pdev->domain);
++
++    return ret;
+ }
+ 
+ static int intel_iommu_enable_device(struct pci_dev *pdev)
+@@ -2044,11 +2041,15 @@ static int intel_iommu_remove_device(u8
+ {
+     struct acpi_rmrr_unit *rmrr;
+     u16 bdf;
+-    int i;
++    int ret, i;
+ 
+     if ( !pdev->domain )
+         return -EINVAL;
+ 
++    ret = domain_context_unmap(pdev->domain, devfn, pdev);
++    if ( ret )
++        return ret;
++
+     for_each_rmrr_device ( rmrr, bdf, i )
+     {
+         if ( rmrr->segment != pdev->seg ||
+@@ -2064,7 +2065,7 @@ static int intel_iommu_remove_device(u8
+                                rmrr->end_address, 0);
+     }
+ 
+-    return domain_context_unmap(pdev->domain, devfn, pdev);
++    return 0;
+ }
+ 
+ static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: drop ownership checking from domain_context_mapping_one()
+
+Despite putting in quite a bit of effort it was not possible to
+establish why exactly this code exists (beyond possibly sanity
+checking). Instead of a subsequent change further complicating this
+logic, simply get rid of it.
+
+Take the opportunity and move the respective unmap_vtd_domain_page() out
+of the locked region.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -114,28 +114,6 @@ static int context_set_domain_id(struct
+     return 0;
+ }
+ 
+-static int context_get_domain_id(struct context_entry *context,
+-                                 struct vtd_iommu *iommu)
+-{
+-    unsigned long dom_index, nr_dom;
+-    int domid = -1;
+-
+-    if (iommu && context)
+-    {
+-        nr_dom = cap_ndoms(iommu->cap);
+-
+-        dom_index = context_domain_id(*context);
+-
+-        if ( dom_index < nr_dom && iommu->domid_map )
+-            domid = iommu->domid_map[dom_index];
+-        else
+-            dprintk(XENLOG_DEBUG VTDPREFIX,
+-                    "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
+-                    dom_index, nr_dom);
+-    }
+-    return domid;
+-}
+-
+ static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
+ {
+     int iommu_domid = domain_iommu_domid(domain, iommu);
+@@ -1392,49 +1370,9 @@ int domain_context_mapping_one(
+ 
+     if ( context_present(*context) )
+     {
+-        int res = 0;
+-
+-        /* Try to get domain ownership from device structure.  If that's
+-         * not available, try to read it from the context itself. */
+-        if ( pdev )
+-        {
+-            if ( pdev->domain != domain )
+-            {
+-                printk(XENLOG_G_INFO VTDPREFIX
+-                       "d%d: %04x:%02x:%02x.%u owned by d%d!",
+-                       domain->domain_id,
+-                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+-                       pdev->domain ? pdev->domain->domain_id : -1);
+-                res = -EINVAL;
+-            }
+-        }
+-        else
+-        {
+-            int cdomain;
+-            cdomain = context_get_domain_id(context, iommu);
+-            
+-            if ( cdomain < 0 )
+-            {
+-                printk(XENLOG_G_WARNING VTDPREFIX
+-                       "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n",
+-                       domain->domain_id,
+-                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+-                res = -EINVAL;
+-            }
+-            else if ( cdomain != domain->domain_id )
+-            {
+-                printk(XENLOG_G_INFO VTDPREFIX
+-                       "d%d: %04x:%02x:%02x.%u already mapped to d%d!",
+-                       domain->domain_id,
+-                       seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+-                       cdomain);
+-                res = -EINVAL;
+-            }
+-        }
+-
+-        unmap_vtd_domain_page(context_entries);
+         spin_unlock(&iommu->lock);
+-        return res;
++        unmap_vtd_domain_page(context_entries);
++        return 0;
+     }
+ 
+     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: re-assign devices directly
+
+Devices with RMRRs, due to it being unspecified how/when the specified
+memory regions may get accessed, may not be left disconnected from their
+respective mappings (as long as it's not certain that the device has
+been fully quiesced). Hence rather than unmapping the old context and
+then mapping the new one, re-assignment needs to be done in a single
+step.
+
+This is CVE-2022-26359 / part of XSA-400.
+
+Reported-by: Roger Pau Monné <roger.pau@citrix.com>
+
+Similarly quarantining scratch-page mode relies on page tables to be
+continuously wired up.
+
+To avoid complicating things more than necessary, treat all devices
+mostly equally, i.e. regardless of their association with any RMRRs. The
+main difference is when it comes to updating context entries, which need
+to be atomic when there are RMRRs. Yet atomicity can only be achieved
+with CMPXCHG16B, availability of which we can't take for granted.
+
+The seemingly complicated choice of non-negative return values for
+domain_context_mapping_one() is to limit code churn: This way callers
+passing NULL for pdev don't need fiddling with.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr);
+ void *map_vtd_domain_page(u64 maddr);
+ void unmap_vtd_domain_page(void *va);
+ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+-                               u8 bus, u8 devfn, const struct pci_dev *);
++                               uint8_t bus, uint8_t devfn,
++                               const struct pci_dev *pdev, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+                              u8 bus, u8 devfn);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+@@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void);
+ void platform_quirks_init(void);
+ void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
+ void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
+-int __must_check me_wifi_quirk(struct domain *domain,
+-                               u8 bus, u8 devfn, int map);
++int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
++                               uint8_t devfn, unsigned int mode);
+ void pci_vtd_quirk(const struct pci_dev *);
+ void quirk_iommu_caps(struct vtd_iommu *iommu);
+ 
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -110,6 +110,7 @@ static int context_set_domain_id(struct
+     }
+ 
+     set_bit(i, iommu->domid_bitmap);
++    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+     return 0;
+ }
+@@ -1350,15 +1351,27 @@ static void __hwdom_init intel_iommu_hwd
+     }
+ }
+ 
++/*
++ * This function returns
++ * - a negative errno value upon error,
++ * - zero upon success when previously the entry was non-present, or this isn't
++ *   the "main" request for a device (pdev == NULL), or for no-op quarantining
++ *   assignments,
++ * - positive (one) upon success when previously the entry was present and this
++ *   is the "main" request for a device (pdev != NULL).
++ */
+ int domain_context_mapping_one(
+     struct domain *domain,
+     struct vtd_iommu *iommu,
+-    u8 bus, u8 devfn, const struct pci_dev *pdev)
++    uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
++    unsigned int mode)
+ {
+     struct domain_iommu *hd = dom_iommu(domain);
+-    struct context_entry *context, *context_entries;
++    struct context_entry *context, *context_entries, lctxt;
++    __uint128_t old;
+     u64 maddr, pgd_maddr;
+-    u16 seg = iommu->drhd->segment;
++    uint16_t seg = iommu->drhd->segment, prev_did = 0;
++    struct domain *prev_dom = NULL;
+     int agaw, rc, ret;
+     bool_t flush_dev_iotlb;
+ 
+@@ -1367,17 +1380,32 @@ int domain_context_mapping_one(
+     maddr = bus_to_context_maddr(iommu, bus);
+     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
+     context = &context_entries[devfn];
++    old = (lctxt = *context).full;
+ 
+-    if ( context_present(*context) )
++    if ( context_present(lctxt) )
+     {
+-        spin_unlock(&iommu->lock);
+-        unmap_vtd_domain_page(context_entries);
+-        return 0;
++        domid_t domid;
++
++        prev_did = context_domain_id(lctxt);
++        domid = iommu->domid_map[prev_did];
++        if ( domid < DOMID_FIRST_RESERVED )
++            prev_dom = rcu_lock_domain_by_id(domid);
++        else if ( domid == DOMID_IO )
++            prev_dom = rcu_lock_domain(dom_io);
++        if ( !prev_dom )
++        {
++            spin_unlock(&iommu->lock);
++            unmap_vtd_domain_page(context_entries);
++            dprintk(XENLOG_DEBUG VTDPREFIX,
++                    "no domain for did %u (nr_dom %u)\n",
++                    prev_did, cap_ndoms(iommu->cap));
++            return -ESRCH;
++        }
+     }
+ 
+     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+     {
+-        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
++        context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU);
+         agaw = level_to_agaw(iommu->nr_pt_levels);
+     }
+     else
+@@ -1394,6 +1422,8 @@ int domain_context_mapping_one(
+                 spin_unlock(&hd->arch.mapping_lock);
+                 spin_unlock(&iommu->lock);
+                 unmap_vtd_domain_page(context_entries);
++                if ( prev_dom )
++                    rcu_unlock_domain(prev_dom);
+                 return -ENOMEM;
+             }
+         }
+@@ -1411,33 +1441,102 @@ int domain_context_mapping_one(
+                 goto nomem;
+         }
+ 
+-        context_set_address_root(*context, pgd_maddr);
++        context_set_address_root(lctxt, pgd_maddr);
+         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
+-            context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
++            context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
+         else
+-            context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
++            context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL);
+ 
+         spin_unlock(&hd->arch.mapping_lock);
+     }
+ 
+-    if ( context_set_domain_id(context, domain, iommu) )
++    rc = context_set_domain_id(&lctxt, domain, iommu);
++    if ( rc )
+     {
++    unlock:
+         spin_unlock(&iommu->lock);
+         unmap_vtd_domain_page(context_entries);
+-        return -EFAULT;
++        if ( prev_dom )
++            rcu_unlock_domain(prev_dom);
++        return rc;
++    }
++
++    if ( !prev_dom )
++    {
++        context_set_address_width(lctxt, agaw);
++        context_set_fault_enable(lctxt);
++        context_set_present(lctxt);
++    }
++    else if ( prev_dom == domain )
++    {
++        ASSERT(lctxt.full == context->full);
++        rc = !!pdev;
++        goto unlock;
++    }
++    else
++    {
++        ASSERT(context_address_width(lctxt) == agaw);
++        ASSERT(!context_fault_disable(lctxt));
++    }
++
++    if ( cpu_has_cx16 )
++    {
++        __uint128_t res = cmpxchg16b(context, &old, &lctxt.full);
++
++        /*
++         * Hardware does not update the context entry behind our backs,
++         * so the return value should match "old".
++         */
+        if ( res != old )
+        {
+            if ( pdev )
+                check_cleanup_domid_map(domain, pdev, iommu);
+            printk(XENLOG_ERR
+                   "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
+                   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                   (uint64_t)(res >> 64), (uint64_t)res,
+                   (uint64_t)(old >> 64), (uint64_t)old);
+            rc = -EILSEQ;
+            goto unlock;
+        }
++    }
++    else if ( !prev_dom || !(mode & MAP_WITH_RMRR) )
++    {
++        context_clear_present(*context);
++        iommu_sync_cache(context, sizeof(*context));
++
++        write_atomic(&context->hi, lctxt.hi);
++        /* No barrier should be needed between these two. */
++        write_atomic(&context->lo, lctxt.lo);
++    }
++    else /* Best effort, updating DID last. */
++    {
++         /*
++          * By non-atomically updating the context entry's DID field last,
++          * during a short window in time TLB entries with the old domain ID
++          * but the new page tables may be inserted.  This could affect I/O
++          * of other devices using this same (old) domain ID.  Such updating
++          * therefore is not a problem if this was the only device associated
++          * with the old domain ID.  Diverting I/O of any of a dying domain's
++          * devices to the quarantine page tables is intended anyway.
++          */
++        if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) )
++            printk(XENLOG_WARNING VTDPREFIX
++                   " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
++                   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom);
++
++        write_atomic(&context->lo, lctxt.lo);
++        /* No barrier should be needed between these two. */
++        write_atomic(&context->hi, lctxt.hi);
+     }
+ 
+-    context_set_address_width(*context, agaw);
+-    context_set_fault_enable(*context);
+-    context_set_present(*context);
+     iommu_sync_cache(context, sizeof(struct context_entry));
+     spin_unlock(&iommu->lock);
+ 
+-    /* Context entry was previously non-present (with domid 0). */
+-    rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
+-                                    DMA_CCMD_MASK_NOBIT, 1);
++    rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn),
++                                    DMA_CCMD_MASK_NOBIT, !prev_dom);
+     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
+-    ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
++    ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb);
+ 
+     /*
+      * The current logic for returns:
+@@ -1458,12 +1557,21 @@ int domain_context_mapping_one(
+     unmap_vtd_domain_page(context_entries);
+ 
+     if ( !seg && !rc )
+-        rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
++        rc = me_wifi_quirk(domain, bus, devfn, mode);
+ 
+     if ( rc )
+-        domain_context_unmap_one(domain, iommu, bus, devfn);
++    {
++        if ( !prev_dom )
++            domain_context_unmap_one(domain, iommu, bus, devfn);
++        else if ( prev_dom != domain ) /* Avoid infinite recursion. */
++            domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
++                                       mode & MAP_WITH_RMRR);
++    }
+ 
+-    return rc;
++    if ( prev_dom )
++        rcu_unlock_domain(prev_dom);
++
++    return rc ?: pdev && prev_dom;
+ }
+ 
+ static int domain_context_unmap(struct domain *d, uint8_t devfn,
+@@ -1473,8 +1581,11 @@ static int domain_context_mapping(struct
+                                   struct pci_dev *pdev)
+ {
+     struct acpi_drhd_unit *drhd;
++    const struct acpi_rmrr_unit *rmrr;
+     int ret = 0;
+-    u8 seg = pdev->seg, bus = pdev->bus, secbus;
++    unsigned int i, mode = 0;
++    uint16_t seg = pdev->seg, bdf;
++    uint8_t bus = pdev->bus, secbus;
+ 
+     drhd = acpi_find_matched_drhd_unit(pdev);
+     if ( !drhd )
+@@ -1493,8 +1604,29 @@ static int domain_context_mapping(struct
+ 
+     ASSERT(pcidevs_locked());
+ 
++    for_each_rmrr_device( rmrr, bdf, i )
++    {
++        if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf )
++            continue;
++
++        mode |= MAP_WITH_RMRR;
++        break;
++    }
++
++    if ( domain != pdev->domain )
++    {
++        if ( pdev->domain->is_dying )
++            mode |= MAP_OWNER_DYING;
++        else if ( drhd &&
++                  !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) &&
++                  !pdev->phantom_stride )
++            mode |= MAP_SINGLE_DEVICE;
++    }
++
+     switch ( pdev->type )
+     {
++        bool prev_present;
++
+     case DEV_TYPE_PCI_HOST_BRIDGE:
+         if ( iommu_debug )
+             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
+@@ -1515,7 +1647,9 @@ static int domain_context_mapping(struct
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev);
++                                         pdev, mode);
++        if ( ret > 0 )
++            ret = 0;
+         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+             enable_ats_device(pdev, &drhd->iommu->ats_devices);
+ 
+@@ -1528,9 +1662,10 @@ static int domain_context_mapping(struct
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+ 
+         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev);
+-        if ( ret )
++                                         pdev, mode);
++        if ( ret < 0 )
+             break;
++        prev_present = ret;
+ 
+         if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 )
+         {
+@@ -1538,6 +1673,15 @@ static int domain_context_mapping(struct
+                 break;
+             ret = -ENXIO;
+         }
++        /*
++         * Strictly speaking if the device is the only one behind this bridge
++         * and the only one with this (secbus,0,0) tuple, it could be allowed
++         * to be re-assigned regardless of RMRR presence.  But let's deal with
++         * that case only if it is actually found in the wild.
++         */
++        else if ( prev_present && (mode & MAP_WITH_RMRR) &&
++                  domain != pdev->domain )
++            ret = -EOPNOTSUPP;
+ 
+         /*
+          * Mapping a bridge should, if anything, pass the struct pci_dev of
+@@ -1546,7 +1690,7 @@ static int domain_context_mapping(struct
+          */
+         if ( ret >= 0 )
+             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                             NULL);
++                                             NULL, mode);
+ 
+         /*
+          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1561,10 +1705,15 @@ static int domain_context_mapping(struct
+         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+              (secbus != pdev->bus || pdev->devfn != 0) )
+             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+-                                             NULL);
++                                             NULL, mode);
+ 
+         if ( ret )
+-            domain_context_unmap(domain, devfn, pdev);
++        {
++            if ( !prev_present )
++                domain_context_unmap(domain, devfn, pdev);
++            else if ( pdev->domain != domain ) /* Avoid infinite recursion. */
++                domain_context_mapping(pdev->domain, devfn, pdev);
++        }
+ 
+         break;
+ 
+@@ -2331,9 +2480,8 @@ static int reassign_device_ownership(
+ {
+     int ret;
+ 
+-    ret = domain_context_unmap(source, devfn, pdev);
+-    if ( ret )
+-        return ret;
++    if ( !has_arch_pdevs(target) )
++        vmx_pi_hooks_assign(target);
+ 
+     /*
+      * Devices assigned to untrusted domains (here assumed to be any domU)
+@@ -2343,6 +2491,31 @@ static int reassign_device_ownership(
+     if ( (target != hardware_domain) && !iommu_intremap )
+         untrusted_msi = true;
+ 
++    ret = domain_context_mapping(target, devfn, pdev);
++    if ( ret )
++    {
++        if ( !has_arch_pdevs(target) )
++            vmx_pi_hooks_deassign(target);
++        return ret;
++    }
++
++    if ( pdev->devfn == devfn )
++    {
++        const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
++
++        if ( drhd )
++            check_cleanup_domid_map(source, pdev, drhd->iommu);
++    }
++
++    if ( devfn == pdev->devfn && pdev->domain != target )
++    {
++        list_move(&pdev->domain_list, &target->pdev_list);
++        pdev->domain = target;
++    }
++
++    if ( !has_arch_pdevs(source) )
++        vmx_pi_hooks_deassign(source);
++
+     /*
+      * If the device belongs to the hardware domain, and it has RMRR, don't
+      * remove it from the hardware domain, because BIOS may use RMRR at
+@@ -2371,34 +2544,7 @@ static int reassign_device_ownership(
+             }
+     }
+ 
+-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
+-    {
+-        list_move(&pdev->domain_list, &dom_io->pdev_list);
+-        pdev->domain = dom_io;
+-    }
+-
+-    if ( !has_arch_pdevs(source) )
+-        vmx_pi_hooks_deassign(source);
+-
+-    if ( !has_arch_pdevs(target) )
+-        vmx_pi_hooks_assign(target);
+-
+-    ret = domain_context_mapping(target, devfn, pdev);
+-    if ( ret )
+-    {
+-        if ( !has_arch_pdevs(target) )
+-            vmx_pi_hooks_deassign(target);
+-
+-        return ret;
+-    }
+-
+-    if ( devfn == pdev->devfn && pdev->domain != target )
+-    {
+-        list_move(&pdev->domain_list, &target->pdev_list);
+-        pdev->domain = target;
+-    }
+-
+-    return ret;
++    return 0;
+ }
+ 
+ static int intel_iommu_assign_device(
+--- xen/drivers/passthrough/vtd/iommu.h.orig
++++ xen/drivers/passthrough/vtd/iommu.h
+@@ -202,8 +202,12 @@ struct root_entry {
+     do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
+ 
+ struct context_entry {
+-    u64 lo;
+-    u64 hi;
++    union {
++        struct {
++            uint64_t lo, hi;
++        };
++        __uint128_t full;
++    };
+ };
+ #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+ #define context_present(c) ((c).lo & 1)
+--- xen/drivers/passthrough/vtd/quirks.c.orig
++++ xen/drivers/passthrough/vtd/quirks.c
+@@ -343,7 +343,8 @@ void __init platform_quirks_init(void)
+  */
+ 
+ static int __must_check map_me_phantom_function(struct domain *domain,
+-                                                u32 dev, int map)
++                                                unsigned int dev,
++                                                unsigned int mode)
+ {
+     struct acpi_drhd_unit *drhd;
+     struct pci_dev *pdev;
+@@ -354,9 +355,9 @@ static int __must_check map_me_phantom_f
+     drhd = acpi_find_matched_drhd_unit(pdev);
+ 
+     /* map or unmap ME phantom function */
+-    if ( map )
++    if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
+         rc = domain_context_mapping_one(domain, drhd->iommu, 0,
+-                                        PCI_DEVFN(dev, 7), NULL);
++                                        PCI_DEVFN(dev, 7), NULL, mode);
+     else
+         rc = domain_context_unmap_one(domain, drhd->iommu, 0,
+                                       PCI_DEVFN(dev, 7));
+@@ -364,7 +365,8 @@ static int __must_check map_me_phantom_f
+     return rc;
+ }
+ 
+-int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
++int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
++                  unsigned int mode)
+ {
+     u32 id;
+     int rc = 0;
+@@ -388,7 +390,7 @@ int me_wifi_quirk(struct domain *domain,
+             case 0x423b8086:
+             case 0x423c8086:
+             case 0x423d8086:
+-                rc = map_me_phantom_function(domain, 3, map);
++                rc = map_me_phantom_function(domain, 3, mode);
+                 break;
+             default:
+                 break;
+@@ -414,7 +416,7 @@ int me_wifi_quirk(struct domain *domain,
+             case 0x42388086:        /* Puma Peak */
+             case 0x422b8086:
+             case 0x422c8086:
+-                rc = map_me_phantom_function(domain, 22, map);
++                rc = map_me_phantom_function(domain, 22, mode);
+                 break;
+             default:
+                 break;
+--- xen/drivers/passthrough/vtd/vtd.h.orig
++++ xen/drivers/passthrough/vtd/vtd.h
+@@ -22,8 +22,14 @@
+ 
+ #include <xen/iommu.h>
+ 
+-#define MAP_ME_PHANTOM_FUNC      1
+-#define UNMAP_ME_PHANTOM_FUNC    0
++/*
++ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode"
++ * parameters.
++ */
++#define MAP_WITH_RMRR         (1u << 0)
++#define MAP_OWNER_DYING       (1u << 1)
++#define MAP_SINGLE_DEVICE     (1u << 2)
++#define UNMAP_ME_PHANTOM_FUNC (1u << 3)
+ 
+ /* Allow for both IOAPIC and IOSAPIC. */
+ #define IO_xAPIC_route_entry IO_APIC_route_entry
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: re-assign devices directly
+
+Devices with unity map ranges, due to it being unspecified how/when
+these memory ranges may get accessed, may not be left disconnected from
+their unity mappings (as long as it's not certain that the device has
+been fully quiesced). Hence rather than tearing down the old root page
+table pointer and then establishing the new one, re-assignment needs to
+be done in a single step.
+
+This is CVE-2022-26360 / part of XSA-400.
+
+Reported-by: Roger Pau Monné <roger.pau@citrix.com>
+
+Similarly, quarantining scratch-page mode relies on page tables being
+continuously wired up.
+
+To avoid complicating things more than necessary, treat all devices
+mostly equally, i.e. regardless of their association with any unity map
+ranges.  The main difference is when it comes to updating DTEs, which need
+to be atomic when there are unity mappings. Yet atomicity can only be
+achieved with CMPXCHG16B, availability of which we can't take for granted.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig
++++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -79,9 +79,13 @@ void amd_iommu_set_intremap_table(struct
+                                   const void *ptr,
+                                   const struct amd_iommu *iommu,
+                                   bool valid);
+-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+-				   uint64_t root_ptr, uint16_t domain_id,
+-				   uint8_t paging_mode, bool valid);
++#define SET_ROOT_VALID          (1u << 0)
++#define SET_ROOT_WITH_UNITY_MAP (1u << 1)
++int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
++                                               uint64_t root_ptr,
++                                               uint16_t domain_id,
++                                               uint8_t paging_mode,
++                                               unsigned int flags);
+ void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
+                                 const struct ivrs_mappings *ivrs_dev);
+ void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id,
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -103,10 +103,69 @@ static unsigned int set_iommu_pte_presen
+     return flush_flags;
+ }
+ 
+-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+-                                   uint64_t root_ptr, uint16_t domain_id,
+-                                   uint8_t paging_mode, bool valid)
++/*
++ * This function returns
++ * - -errno for errors,
++ * - 0 for a successful update, atomic when necessary
++ * - 1 for a successful but non-atomic update, which may need to be warned
++ *   about by the caller.
++ */
++int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
++                                  uint64_t root_ptr, uint16_t domain_id,
++                                  uint8_t paging_mode, unsigned int flags)
+ {
++    bool valid = flags & SET_ROOT_VALID;
++
++    if ( dte->v && dte->tv &&
++         (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) )
++    {
++        union {
++            struct amd_iommu_dte dte;
++            uint64_t raw64[4];
++            __uint128_t raw128[2];
++        } ldte = { .dte = *dte };
++        __uint128_t old = ldte.raw128[0];
++        int ret = 0;
++
++        ldte.dte.domain_id = domain_id;
++        ldte.dte.pt_root = paddr_to_pfn(root_ptr);
++        ldte.dte.iw = true;
++        ldte.dte.ir = true;
++        ldte.dte.paging_mode = paging_mode;
++        ldte.dte.v = valid;
++
++        if ( cpu_has_cx16 )
++        {
++            __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]);
++
++            /*
++             * Hardware does not update the DTE behind our backs, so the
++             * return value should match "old".
++             */
++            if ( res != old )
++            {
++                printk(XENLOG_ERR
++                       "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n",
++                       domain_id,
++                       (uint64_t)(res >> 64), (uint64_t)res,
++                       (uint64_t)(old >> 64), (uint64_t)old);
++                ret = -EILSEQ;
++            }
++        }
++        else /* Best effort, updating domain_id last. */
++        {
++            uint64_t *ptr = (void *)dte;
++
++            write_atomic(ptr + 0, ldte.raw64[0]);
++            /* No barrier should be needed between these two. */
++            write_atomic(ptr + 1, ldte.raw64[1]);
++
++            ret = 1;
++        }
++
++        return ret;
++    }
++
+     if ( valid || dte->v )
+     {
+         dte->tv = false;
+@@ -121,6 +180,8 @@ void amd_iommu_set_root_page_table(struc
+     smp_wmb();
+     dte->tv = true;
+     dte->v = valid;
++
++    return 0;
+ }
+ 
+ void amd_iommu_set_intremap_table(
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -85,40 +85,81 @@ int get_dma_requestor_id(uint16_t seg, u
+     return req_id;
+ }
+ 
+-static void amd_iommu_setup_domain_device(
++static int __must_check allocate_domain_resources(struct domain_iommu *hd)
++{
++    int rc;
++
++    spin_lock(&hd->arch.mapping_lock);
++    rc = amd_iommu_alloc_root(hd);
++    spin_unlock(&hd->arch.mapping_lock);
++
++    return rc;
++}
++
++static bool any_pdev_behind_iommu(const struct domain *d,
++                                  const struct pci_dev *exclude,
++                                  const struct amd_iommu *iommu)
++{
++    const struct pci_dev *pdev;
++
++    for_each_pdev ( d, pdev )
++    {
++        if ( pdev == exclude )
++            continue;
++
++        if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
++            return true;
++    }
++
++    return false;
++}
++
++static int __must_check amd_iommu_setup_domain_device(
+     struct domain *domain, struct amd_iommu *iommu,
+     uint8_t devfn, struct pci_dev *pdev)
+ {
+     struct amd_iommu_dte *table, *dte;
+     unsigned long flags;
+-    int req_id, valid = 1;
++    unsigned int req_id, sr_flags;
++    int rc;
+     u8 bus = pdev->bus;
+-    const struct domain_iommu *hd = dom_iommu(domain);
++    struct domain_iommu *hd = dom_iommu(domain);
++    const struct ivrs_mappings *ivrs_dev;
+ 
+-    BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
+-            !iommu->dev_table.buffer );
++    BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
+ 
+-    if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
+-        valid = 0;
++    rc = allocate_domain_resources(hd);
++    if ( rc )
++        return rc;
++
++    req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf);
++    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
++    sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain)
++                ? 0 : SET_ROOT_VALID)
++               | (ivrs_dev->unity_map ? SET_ROOT_WITH_UNITY_MAP : 0);
+ 
+     /* get device-table entry */
+     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
+     table = iommu->dev_table.buffer;
+     dte = &table[req_id];
++    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+ 
+     spin_lock_irqsave(&iommu->lock, flags);
+ 
+     if ( !dte->v || !dte->tv )
+     {
+-        const struct ivrs_mappings *ivrs_dev;
+-
+         /* bind DTE to domain page-tables */
+-        amd_iommu_set_root_page_table(
+-            dte, page_to_maddr(hd->arch.root_table), domain->domain_id,
+-            hd->arch.paging_mode, valid);
++        rc = amd_iommu_set_root_page_table(
++                 dte, page_to_maddr(hd->arch.root_table),
++                 domain->domain_id, hd->arch.paging_mode, sr_flags);
++        if ( rc )
++        {
++            ASSERT(rc < 0);
++            spin_unlock_irqrestore(&iommu->lock, flags);
++            return rc;
++        }
+ 
+         /* Undo what amd_iommu_disable_domain_device() may have done. */
+-        ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+         if ( dte->it_root )
+         {
+             dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
+@@ -133,17 +174,74 @@ static void amd_iommu_setup_domain_devic
+             dte->i = ats_enabled;
+ 
+         amd_iommu_flush_device(iommu, req_id);
++    }
++    else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) )
++    {
++        /*
++         * Strictly speaking if the device is the only one with this requestor
++         * ID, it could be allowed to be re-assigned regardless of unity map
++         * presence.  But let's deal with that case only if it is actually
++         * found in the wild.
++         */
++        if ( req_id != PCI_BDF2(bus, devfn) &&
++             (sr_flags & SET_ROOT_WITH_UNITY_MAP) )
++            rc = -EOPNOTSUPP;
++        else
++            rc = amd_iommu_set_root_page_table(
++                     dte, page_to_maddr(hd->arch.root_table),
++                     domain->domain_id, hd->arch.paging_mode, sr_flags);
++        if ( rc < 0 )
++        {
++            spin_unlock_irqrestore(&iommu->lock, flags);
++            return rc;
++        }
++        if ( rc &&
++             domain != pdev->domain &&
++             /*
++              * By non-atomically updating the DTE's domain ID field last,
++              * during a short window in time TLB entries with the old domain
++              * ID but the new page tables may have been inserted.  This could
++              * affect I/O of other devices using this same (old) domain ID.
++              * Such updating therefore is not a problem if this was the only
++              * device associated with the old domain ID.  Diverting I/O of any
++              * of a dying domain's devices to the quarantine page tables is
++              * intended anyway.
++              */
++             !pdev->domain->is_dying &&
++             (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
++              pdev->phantom_stride) )
++            printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
++                   pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
++                   pdev->domain);
++
++        /*
++         * Check remaining settings are still in place from an earlier call
++         * here. They're all independent of the domain, so should not have
++         * changed.
++         */
++        if ( dte->it_root )
++            ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED);
++        ASSERT(dte->iv == iommu_intremap);
++        ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion);
++        ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags,
++                                         ACPI_IVHD_SYSTEM_MGMT));
+ 
+-        AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+-                        "root table = %#"PRIx64", "
+-                        "domain = %d, paging mode = %d\n",
+-                        req_id, pdev->type,
+-                        page_to_maddr(hd->arch.root_table),
+-                        domain->domain_id, hd->arch.paging_mode);
++        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
++             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
++            ASSERT(dte->i == ats_enabled);
++
++        amd_iommu_flush_device(iommu, req_id);
+     }
+ 
+     spin_unlock_irqrestore(&iommu->lock, flags);
+ 
++    AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
++                    "root table = %#"PRIx64", "
++                    "domain = %d, paging mode = %d\n",
++                    req_id, pdev->type,
++                    page_to_maddr(hd->arch.root_table),
++                    domain->domain_id, hd->arch.paging_mode);
++
+     ASSERT(pcidevs_locked());
+ 
+     if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
+@@ -154,6 +252,8 @@ static void amd_iommu_setup_domain_devic
+ 
+         amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0);
+     }
++
++    return 0;
+ }
+ 
+ int __init acpi_ivrs_init(void)
+@@ -223,17 +323,6 @@ int amd_iommu_alloc_root(struct domain_i
+     return 0;
+ }
+ 
+-static int __must_check allocate_domain_resources(struct domain_iommu *hd)
+-{
+-    int rc;
+-
+-    spin_lock(&hd->arch.mapping_lock);
+-    rc = amd_iommu_alloc_root(hd);
+-    spin_unlock(&hd->arch.mapping_lock);
+-
+-    return rc;
+-}
+-
+ int __read_mostly amd_iommu_min_paging_mode = 1;
+ 
+ static int amd_iommu_domain_init(struct domain *d)
+@@ -333,7 +422,6 @@ static int reassign_device(struct domain
+ {
+     struct amd_iommu *iommu;
+     int bdf, rc;
+-    struct domain_iommu *t = dom_iommu(target);
+     const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+ 
+     bdf = PCI_BDF2(pdev->bus, pdev->devfn);
+@@ -347,7 +435,15 @@ static int reassign_device(struct domain
+         return -ENODEV;
+     }
+ 
+-    amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
++    rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
++    if ( rc )
++        return rc;
++
++    if ( devfn == pdev->devfn && pdev->domain != target )
++    {
++        list_move(&pdev->domain_list, &target->pdev_list);
++        pdev->domain = target;
++    }
+ 
+     /*
+      * If the device belongs to the hardware domain, and it has a unity mapping,
+@@ -363,27 +459,10 @@ static int reassign_device(struct domain
+             return rc;
+     }
+ 
+-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
+-    {
+-        list_move(&pdev->domain_list, &dom_io->pdev_list);
+-        pdev->domain = dom_io;
+-    }
+-
+-    rc = allocate_domain_resources(t);
+-    if ( rc )
+-        return rc;
+-
+-    amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
+     AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n",
+                     pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                     source->domain_id, target->domain_id);
+ 
+-    if ( devfn == pdev->devfn && pdev->domain != target )
+-    {
+-        list_move(&pdev->domain_list, &target->pdev_list);
+-        pdev->domain = target;
+-    }
+-
+     return 0;
+ }
+ 
+@@ -547,8 +626,7 @@ static int amd_iommu_add_device(u8 devfn
+         spin_unlock_irqrestore(&iommu->lock, flags);
+     }
+ 
+-    amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+-    return 0;
++    return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+ }
+ 
+ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: prepare for per-device quarantine page tables (part I)
+
+Arrange for domain ID and page table root to be passed around, the latter in
+particular to domain_pgd_maddr() such that taking it from the per-domain
+fields can be overridden.
+
+No functional change intended.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -86,9 +86,10 @@ void *map_vtd_domain_page(u64 maddr);
+ void unmap_vtd_domain_page(void *va);
+ int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
+                                uint8_t bus, uint8_t devfn,
+-                               const struct pci_dev *pdev, unsigned int mode);
++                               const struct pci_dev *pdev, domid_t domid,
++                               paddr_t pgd_maddr, unsigned int mode);
+ int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
+-                             u8 bus, u8 devfn);
++                             uint8_t bus, uint8_t devfn, domid_t domid);
+ int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
+ 
+ unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
+@@ -107,7 +108,8 @@ void platform_quirks_init(void);
+ void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
+ void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
+ int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
+-                               uint8_t devfn, unsigned int mode);
++                               uint8_t devfn, domid_t domid, paddr_t pgd_maddr,
++                               unsigned int mode);
+ void pci_vtd_quirk(const struct pci_dev *);
+ void quirk_iommu_caps(struct vtd_iommu *iommu);
+ 
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -1364,12 +1364,12 @@ int domain_context_mapping_one(
+     struct domain *domain,
+     struct vtd_iommu *iommu,
+     uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
+-    unsigned int mode)
++    domid_t domid, paddr_t pgd_maddr, unsigned int mode)
+ {
+     struct domain_iommu *hd = dom_iommu(domain);
+     struct context_entry *context, *context_entries, lctxt;
+     __uint128_t old;
+-    u64 maddr, pgd_maddr;
++    uint64_t maddr;
+     uint16_t seg = iommu->drhd->segment, prev_did = 0;
+     struct domain *prev_dom = NULL;
+     int agaw, rc, ret;
+@@ -1410,10 +1410,12 @@ int domain_context_mapping_one(
+     }
+     else
+     {
++        paddr_t root = pgd_maddr;
++
+         spin_lock(&hd->arch.mapping_lock);
+ 
+         /* Ensure we have pagetables allocated down to leaf PTE. */
+-        if ( hd->arch.pgd_maddr == 0 )
++        if ( !root )
+         {
+             addr_to_dma_page_maddr(domain, 0, 1);
+             if ( hd->arch.pgd_maddr == 0 )
+@@ -1426,22 +1428,24 @@ int domain_context_mapping_one(
+                     rcu_unlock_domain(prev_dom);
+                 return -ENOMEM;
+             }
++
++            root = hd->arch.pgd_maddr;
+         }
+ 
+         /* Skip top levels of page tables for 2- and 3-level DRHDs. */
+-        pgd_maddr = hd->arch.pgd_maddr;
+         for ( agaw = level_to_agaw(4);
+               agaw != level_to_agaw(iommu->nr_pt_levels);
+               agaw-- )
+         {
+-            struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
+-            pgd_maddr = dma_pte_addr(*p);
++            struct dma_pte *p = map_vtd_domain_page(root);
++
++            root = dma_pte_addr(*p);
+             unmap_vtd_domain_page(p);
+-            if ( pgd_maddr == 0 )
++            if ( !root )
+                 goto nomem;
+         }
+ 
+-        context_set_address_root(lctxt, pgd_maddr);
++        context_set_address_root(lctxt, root);
+         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
+             context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
+         else
+@@ -1557,15 +1561,21 @@ int domain_context_mapping_one(
+     unmap_vtd_domain_page(context_entries);
+ 
+     if ( !seg && !rc )
+-        rc = me_wifi_quirk(domain, bus, devfn, mode);
++        rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode);
+ 
+     if ( rc )
+     {
+         if ( !prev_dom )
+-            domain_context_unmap_one(domain, iommu, bus, devfn);
++            domain_context_unmap_one(domain, iommu, bus, devfn,
++                                     domain->domain_id);
+         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
++        {
++            hd = dom_iommu(prev_dom);
+             domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
++                                       domain->domain_id,
++                                       hd->arch.pgd_maddr,
+                                        mode & MAP_WITH_RMRR);
++        }
+     }
+ 
+     if ( prev_dom )
+@@ -1582,6 +1592,7 @@ static int domain_context_mapping(struct
+ {
+     struct acpi_drhd_unit *drhd;
+     const struct acpi_rmrr_unit *rmrr;
++    paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
+     int ret = 0;
+     unsigned int i, mode = 0;
+     uint16_t seg = pdev->seg, bdf;
+@@ -1647,7 +1658,8 @@ static int domain_context_mapping(struct
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev, mode);
++                                         pdev, domain->domain_id, pgd_maddr,
++                                         mode);
+         if ( ret > 0 )
+             ret = 0;
+         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+@@ -1662,7 +1674,8 @@ static int domain_context_mapping(struct
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+ 
+         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev, mode);
++                                         pdev, domain->domain_id, pgd_maddr,
++                                         mode);
+         if ( ret < 0 )
+             break;
+         prev_present = ret;
+@@ -1690,7 +1703,8 @@ static int domain_context_mapping(struct
+          */
+         if ( ret >= 0 )
+             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                             NULL, mode);
++                                             NULL, domain->domain_id, pgd_maddr,
++                                             mode);
+ 
+         /*
+          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1705,7 +1719,8 @@ static int domain_context_mapping(struct
+         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+              (secbus != pdev->bus || pdev->devfn != 0) )
+             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+-                                             NULL, mode);
++                                             NULL, domain->domain_id, pgd_maddr,
++                                             mode);
+ 
+         if ( ret )
+         {
+@@ -1734,7 +1749,7 @@ static int domain_context_mapping(struct
+ int domain_context_unmap_one(
+     struct domain *domain,
+     struct vtd_iommu *iommu,
+-    u8 bus, u8 devfn)
++    uint8_t bus, uint8_t devfn, domid_t domid)
+ {
+     struct context_entry *context, *context_entries;
+     u64 maddr;
+@@ -1792,7 +1807,7 @@ int domain_context_unmap_one(
+     unmap_vtd_domain_page(context_entries);
+ 
+     if ( !iommu->drhd->segment && !rc )
+-        rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
++        rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC);
+ 
+     if ( rc && !is_hardware_domain(domain) && domain != dom_io )
+     {
+@@ -1844,7 +1859,8 @@ static int domain_context_unmap(struct d
+             printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
++        ret = domain_context_unmap_one(domain, iommu, bus, devfn,
++                                       domain->domain_id);
+         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+             disable_ats_device(pdev);
+ 
+@@ -1854,7 +1870,8 @@ static int domain_context_unmap(struct d
+         if ( iommu_debug )
+             printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
++        ret = domain_context_unmap_one(domain, iommu, bus, devfn,
++                                       domain->domain_id);
+         if ( ret )
+             break;
+ 
+@@ -1880,12 +1897,15 @@ static int domain_context_unmap(struct d
+         /* PCIe to PCI/PCIx bridge */
+         if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+         {
+-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
++            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++                                           domain->domain_id);
+             if ( !ret )
+-                ret = domain_context_unmap_one(domain, iommu, secbus, 0);
++                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
++                                               domain->domain_id);
+         }
+         else /* Legacy PCI bridge */
+-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
++            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++                                           domain->domain_id);
+ 
+         break;
+ 
+--- xen/drivers/passthrough/vtd/quirks.c.orig
++++ xen/drivers/passthrough/vtd/quirks.c
+@@ -344,6 +344,8 @@ void __init platform_quirks_init(void)
+ 
+ static int __must_check map_me_phantom_function(struct domain *domain,
+                                                 unsigned int dev,
++                                                domid_t domid,
++                                                paddr_t pgd_maddr,
+                                                 unsigned int mode)
+ {
+     struct acpi_drhd_unit *drhd;
+@@ -357,16 +359,17 @@ static int __must_check map_me_phantom_f
+     /* map or unmap ME phantom function */
+     if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
+         rc = domain_context_mapping_one(domain, drhd->iommu, 0,
+-                                        PCI_DEVFN(dev, 7), NULL, mode);
++                                        PCI_DEVFN(dev, 7), NULL,
++                                        domid, pgd_maddr, mode);
+     else
+         rc = domain_context_unmap_one(domain, drhd->iommu, 0,
+-                                      PCI_DEVFN(dev, 7));
++                                      PCI_DEVFN(dev, 7), domid);
+ 
+     return rc;
+ }
+ 
+ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+-                  unsigned int mode)
++                  domid_t domid, paddr_t pgd_maddr, unsigned int mode)
+ {
+     u32 id;
+     int rc = 0;
+@@ -390,7 +393,7 @@ int me_wifi_quirk(struct domain *domain,
+             case 0x423b8086:
+             case 0x423c8086:
+             case 0x423d8086:
+-                rc = map_me_phantom_function(domain, 3, mode);
++                rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode);
+                 break;
+             default:
+                 break;
+@@ -416,7 +419,7 @@ int me_wifi_quirk(struct domain *domain,
+             case 0x42388086:        /* Puma Peak */
+             case 0x422b8086:
+             case 0x422c8086:
+-                rc = map_me_phantom_function(domain, 22, mode);
++                rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode);
+                 break;
+             default:
+                 break;
+From: Jan Beulich <jbeulich@suse.com>
+Subject: VT-d: prepare for per-device quarantine page tables (part II)
+
+Replace the passing of struct domain * by domid_t in preparation for
+per-device quarantine page tables also requiring per-device pseudo
+domain IDs, which aren't going to be associated with any struct domain
+instances.
+
+No functional change intended (except for slightly adjusted log message
+text).
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -52,8 +52,8 @@ static struct tasklet vtd_fault_tasklet;
+ static int setup_hwdom_device(u8 devfn, struct pci_dev *);
+ static void setup_hwdom_rmrr(struct domain *d);
+ 
+-static int domain_iommu_domid(struct domain *d,
+-                              struct vtd_iommu *iommu)
++static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu,
++                         bool warn)
+ {
+     unsigned long nr_dom, i;
+ 
+@@ -61,16 +61,16 @@ static int domain_iommu_domid(struct dom
+     i = find_first_bit(iommu->domid_bitmap, nr_dom);
+     while ( i < nr_dom )
+     {
+-        if ( iommu->domid_map[i] == d->domain_id )
++        if ( iommu->domid_map[i] == domid )
+             return i;
+ 
+         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
+     }
+ 
+-    if ( !d->is_dying )
++    if ( warn )
+         dprintk(XENLOG_ERR VTDPREFIX,
+-                "Cannot get valid iommu %u domid: %pd\n",
+-                iommu->index, d);
++                "No valid iommu %u domid for Dom%d\n",
++                iommu->index, domid);
+ 
+     return -1;
+ }
+@@ -78,8 +78,7 @@ static int domain_iommu_domid(struct dom
+ #define DID_FIELD_WIDTH 16
+ #define DID_HIGH_OFFSET 8
+ static int context_set_domain_id(struct context_entry *context,
+-                                 struct domain *d,
+-                                 struct vtd_iommu *iommu)
++                                 domid_t domid, struct vtd_iommu *iommu)
+ {
+     unsigned long nr_dom, i;
+     int found = 0;
+@@ -90,7 +89,7 @@ static int context_set_domain_id(struct
+     i = find_first_bit(iommu->domid_bitmap, nr_dom);
+     while ( i < nr_dom )
+     {
+-        if ( iommu->domid_map[i] == d->domain_id )
++        if ( iommu->domid_map[i] == domid )
+         {
+             found = 1;
+             break;
+@@ -106,7 +105,7 @@ static int context_set_domain_id(struct
+             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
+             return -EFAULT;
+         }
+-        iommu->domid_map[i] = d->domain_id;
++        iommu->domid_map[i] = domid;
+     }
+ 
+     set_bit(i, iommu->domid_bitmap);
+@@ -115,9 +114,9 @@ static int context_set_domain_id(struct
+     return 0;
+ }
+ 
+-static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
++static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu)
+ {
+-    int iommu_domid = domain_iommu_domid(domain, iommu);
++    int iommu_domid = get_iommu_did(domid, iommu, false);
+ 
+     if ( iommu_domid >= 0 )
+     {
+@@ -173,7 +172,7 @@ static void check_cleanup_domid_map(stru
+     if ( !found )
+     {
+         clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap);
+-        cleanup_domid_map(d, iommu);
++        cleanup_domid_map(d->domain_id, iommu);
+     }
+ }
+ 
+@@ -630,7 +629,7 @@ static int __must_check iommu_flush_iotl
+             continue;
+ 
+         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
+-        iommu_domid= domain_iommu_domid(d, iommu);
++        iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying);
+         if ( iommu_domid == -1 )
+             continue;
+ 
+@@ -1454,7 +1453,7 @@ int domain_context_mapping_one(
+         spin_unlock(&hd->arch.mapping_lock);
+     }
+ 
+-    rc = context_set_domain_id(&lctxt, domain, iommu);
++    rc = context_set_domain_id(&lctxt, domid, iommu);
+     if ( rc )
+     {
+     unlock:
+@@ -1774,7 +1773,7 @@ int domain_context_unmap_one(
+     context_clear_entry(*context);
+     iommu_sync_cache(context, sizeof(struct context_entry));
+ 
+-    iommu_domid= domain_iommu_domid(domain, iommu);
++    iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying);
+     if ( iommu_domid == -1 )
+     {
+         spin_unlock(&iommu->lock);
+@@ -1948,7 +1947,7 @@ static void iommu_domain_teardown(struct
+     spin_unlock(&hd->arch.mapping_lock);
+ 
+     for_each_drhd_unit ( drhd )
+-        cleanup_domid_map(d, drhd->iommu);
++        cleanup_domid_map(d->domain_id, drhd->iommu);
+ }
+ 
+ static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU/x86: maintain a per-device pseudo domain ID
+
+In order to subsequently enable per-device quarantine page tables, we'll
+need domain-ID-like identifiers to be inserted in the respective device
+(AMD) or context (Intel) table entries alongside the per-device page
+table root addresses.
+
+Make use of "real" domain IDs occupying only half of the value range
+coverable by domid_t.
+
+Note that in VT-d's iommu_alloc() I didn't want to introduce new memory
+leaks in case of error, but existing ones don't get plugged - that'll be
+the subject of a later change.
+
+The VT-d changes are slightly asymmetric, but this way we can avoid
+assigning pseudo domain IDs to devices which would never be mapped while
+still avoiding to add a new parameter to domain_context_unmap().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/include/asm-x86/iommu.h.orig
++++ xen/include/asm-x86/iommu.h
+@@ -130,6 +130,10 @@ int pi_update_irte(const struct pi_desc
+         iommu_vcall(ops, sync_cache, addr, size);       \
+ })
+ 
++unsigned long *iommu_init_domid(void);
++domid_t iommu_alloc_domid(unsigned long *map);
++void iommu_free_domid(domid_t domid, unsigned long *map);
++
+ #endif /* !__ARCH_X86_IOMMU_H__ */
+ /*
+  * Local variables:
+--- xen/include/asm-x86/pci.h.orig
++++ xen/include/asm-x86/pci.h
+@@ -15,6 +15,12 @@
+ 
+ struct arch_pci_dev {
+     vmask_t used_vectors;
++    /*
++     * These fields are (de)initialized under pcidevs-lock. Other uses of
++     * them don't race (de)initialization and hence don't strictly need any
++     * locking.
++     */
++    domid_t pseudo_domid;
+ };
+ 
+ int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+--- xen/include/asm-x86/amd-iommu.h.orig
++++ xen/include/asm-x86/amd-iommu.h
+@@ -94,6 +94,7 @@ struct amd_iommu {
+     struct ring_buffer cmd_buffer;
+     struct ring_buffer event_log;
+     struct ring_buffer ppr_log;
++    unsigned long *domid_map;
+ 
+     int exclusion_enable;
+     int exclusion_allow_all;
+--- xen/drivers/passthrough/amd/iommu_detect.c.orig
++++ xen/drivers/passthrough/amd/iommu_detect.c
+@@ -183,6 +183,11 @@ int __init amd_iommu_detect_one_acpi(
+     if ( rt )
+         goto out;
+ 
++    iommu->domid_map = iommu_init_domid();
++    rt = -ENOMEM;
++    if ( !iommu->domid_map )
++        goto out;
++
+     rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
+     if ( rt )
+         printk(XENLOG_ERR
+@@ -194,7 +199,10 @@ int __init amd_iommu_detect_one_acpi(
+ 
+  out:
+     if ( rt )
++    {
++        xfree(iommu->domid_map);
+         xfree(iommu);
++    }
+ 
+     return rt;
+ }
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -563,6 +563,8 @@ static int amd_iommu_add_device(u8 devfn
+     struct amd_iommu *iommu;
+     u16 bdf;
+     struct ivrs_mappings *ivrs_mappings;
++    bool fresh_domid = false;
++    int ret;
+ 
+     if ( !pdev->domain )
+         return -EINVAL;
+@@ -626,7 +628,22 @@ static int amd_iommu_add_device(u8 devfn
+         spin_unlock_irqrestore(&iommu->lock, flags);
+     }
+ 
+-    return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
++    if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID )
++    {
++        pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map);
++        if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++            return -ENOSPC;
++        fresh_domid = true;
++    }
++
++    ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
++    if ( ret && fresh_domid )
++    {
++        iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
++        pdev->arch.pseudo_domid = DOMID_INVALID;
++    }
++
++    return ret;
+ }
+ 
+ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+@@ -651,6 +668,9 @@ static int amd_iommu_remove_device(u8 de
+ 
+     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+ 
++    iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
++    pdev->arch.pseudo_domid = DOMID_INVALID;
++
+     ivrs_mappings = get_ivrs_mappings(pdev->seg);
+     bdf = PCI_BDF2(pdev->bus, devfn);
+     if ( amd_iommu_perdev_intremap &&
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -338,6 +338,7 @@ static struct pci_dev *alloc_pdev(struct
+     *((u8*) &pdev->bus) = bus;
+     *((u8*) &pdev->devfn) = devfn;
+     pdev->domain = NULL;
++    pdev->arch.pseudo_domid = DOMID_INVALID;
+     INIT_LIST_HEAD(&pdev->msi_list);
+ 
+     pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+@@ -1353,9 +1354,13 @@ static int _dump_pci_devices(struct pci_
+ 
+     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
+     {
+-        printk("%04x:%02x:%02x.%u - %pd - node %-3d - MSIs < ",
+-               pseg->nr, pdev->bus,
+-               PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain,
++        printk("%04x:%02x:%02x.%u - ", pseg->nr, pdev->bus,
++               PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
++        if ( pdev->domain == dom_io )
++            printk("DomIO:%x", pdev->arch.pseudo_domid);
++        else
++            printk("%pd", pdev->domain);
++        printk(" - node %-3d - MSIs < ",
+                (pdev->node != NUMA_NO_NODE) ? pdev->node : -1);
+         list_for_each_entry ( msi, &pdev->msi_list, list )
+                printk("%d ", msi->irq);
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -22,6 +22,7 @@
+ #include <xen/sched.h>
+ #include <xen/xmalloc.h>
+ #include <xen/domain_page.h>
++#include <xen/err.h>
+ #include <xen/iocap.h>
+ #include <xen/iommu.h>
+ #include <xen/numa.h>
+@@ -1192,7 +1193,7 @@ int __init iommu_alloc(struct acpi_drhd_
+ {
+     struct vtd_iommu *iommu;
+     unsigned long sagaw, nr_dom;
+-    int agaw;
++    int agaw, rc;
+ 
+     if ( nr_iommus >= MAX_IOMMUS )
+     {
+@@ -1285,7 +1286,16 @@ int __init iommu_alloc(struct acpi_drhd_
+     if ( !iommu->domid_map )
+         return -ENOMEM;
+ 
++    iommu->pseudo_domid_map = iommu_init_domid();
++    rc = -ENOMEM;
++    if ( !iommu->pseudo_domid_map )
++        goto free;
++
+     return 0;
++
++ free:
++    iommu_free(drhd);
++    return rc;
+ }
+ 
+ void __init iommu_free(struct acpi_drhd_unit *drhd)
+@@ -1308,6 +1318,7 @@ void __init iommu_free(struct acpi_drhd_
+ 
+     xfree(iommu->domid_bitmap);
+     xfree(iommu->domid_map);
++    xfree(iommu->pseudo_domid_map);
+ 
+     if ( iommu->msi.irq >= 0 )
+         destroy_irq(iommu->msi.irq);
+@@ -1583,8 +1594,8 @@ int domain_context_mapping_one(
+     return rc ?: pdev && prev_dom;
+ }
+ 
+-static int domain_context_unmap(struct domain *d, uint8_t devfn,
+-                                struct pci_dev *pdev);
++static const struct acpi_drhd_unit *domain_context_unmap(
++    struct domain *d, uint8_t devfn, struct pci_dev *pdev);
+ 
+ static int domain_context_mapping(struct domain *domain, u8 devfn,
+                                   struct pci_dev *pdev)
+@@ -1592,6 +1603,7 @@ static int domain_context_mapping(struct
+     struct acpi_drhd_unit *drhd;
+     const struct acpi_rmrr_unit *rmrr;
+     paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
++    domid_t orig_domid = pdev->arch.pseudo_domid;
+     int ret = 0;
+     unsigned int i, mode = 0;
+     uint16_t seg = pdev->seg, bdf;
+@@ -1652,6 +1664,14 @@ static int domain_context_mapping(struct
+         break;
+ 
+     case DEV_TYPE_PCIe_ENDPOINT:
++        if ( iommu_quarantine && orig_domid == DOMID_INVALID )
++        {
++            pdev->arch.pseudo_domid =
++                iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
++            if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++                return -ENOSPC;
++        }
++
+         if ( iommu_debug )
+             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus,
+@@ -1667,6 +1687,14 @@ static int domain_context_mapping(struct
+         break;
+ 
+     case DEV_TYPE_PCI:
++        if ( iommu_quarantine && orig_domid == DOMID_INVALID )
++        {
++            pdev->arch.pseudo_domid =
++                iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
++            if ( pdev->arch.pseudo_domid == DOMID_INVALID )
++                return -ENOSPC;
++        }
++
+         if ( iommu_debug )
+             printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus,
+@@ -1742,6 +1770,13 @@ static int domain_context_mapping(struct
+     if ( !ret && devfn == pdev->devfn )
+         pci_vtd_quirk(pdev);
+ 
++    if ( ret && drhd && orig_domid == DOMID_INVALID )
++    {
++        iommu_free_domid(pdev->arch.pseudo_domid,
++                         drhd->iommu->pseudo_domid_map);
++        pdev->arch.pseudo_domid = DOMID_INVALID;
++    }
++
+     return ret;
+ }
+ 
+@@ -1824,8 +1859,10 @@ int domain_context_unmap_one(
+     return rc;
+ }
+ 
+-static int domain_context_unmap(struct domain *domain, u8 devfn,
+-                                struct pci_dev *pdev)
++static const struct acpi_drhd_unit *domain_context_unmap(
++    struct domain *domain,
++    uint8_t devfn,
++    struct pci_dev *pdev)
+ {
+     struct acpi_drhd_unit *drhd;
+     struct vtd_iommu *iommu;
+@@ -1834,7 +1871,7 @@ static int domain_context_unmap(struct d
+ 
+     drhd = acpi_find_matched_drhd_unit(pdev);
+     if ( !drhd )
+-        return -ENODEV;
++        return ERR_PTR(-ENODEV);
+     iommu = drhd->iommu;
+ 
+     switch ( pdev->type )
+@@ -1845,7 +1882,7 @@ static int domain_context_unmap(struct d
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+         if ( !is_hardware_domain(domain) )
+-            return -EPERM;
++            return ERR_PTR(-EPERM);
+         goto out;
+ 
+     case DEV_TYPE_PCIe_BRIDGE:
+@@ -1923,7 +1960,7 @@ static int domain_context_unmap(struct d
+         check_cleanup_domid_map(domain, pdev, iommu);
+ 
+ out:
+-    return ret;
++    return ret ? ERR_PTR(ret) : drhd;
+ }
+ 
+ static void iommu_domain_teardown(struct domain *d)
+@@ -2145,16 +2182,17 @@ static int intel_iommu_enable_device(str
+ 
+ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
+ {
++    const struct acpi_drhd_unit *drhd;
+     struct acpi_rmrr_unit *rmrr;
+     u16 bdf;
+-    int ret, i;
++    unsigned int i;
+ 
+     if ( !pdev->domain )
+         return -EINVAL;
+ 
+-    ret = domain_context_unmap(pdev->domain, devfn, pdev);
+-    if ( ret )
+-        return ret;
++    drhd = domain_context_unmap(pdev->domain, devfn, pdev);
++    if ( IS_ERR(drhd) )
++        return PTR_ERR(drhd);
+ 
+     for_each_rmrr_device ( rmrr, bdf, i )
+     {
+@@ -2171,6 +2209,13 @@ static int intel_iommu_remove_device(u8
+                                rmrr->end_address, 0);
+     }
+ 
++    if ( drhd )
++    {
++        iommu_free_domid(pdev->arch.pseudo_domid,
++                         drhd->iommu->pseudo_domid_map);
++        pdev->arch.pseudo_domid = DOMID_INVALID;
++    }
++
+     return 0;
+ }
+ 
+--- xen/drivers/passthrough/vtd/iommu.h.orig
++++ xen/drivers/passthrough/vtd/iommu.h
+@@ -535,6 +535,7 @@ struct vtd_iommu {
+     } flush;
+ 
+     struct list_head ats_devices;
++    unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */
+     unsigned long *domid_bitmap;  /* domain id bitmap */
+     u16 *domid_map;               /* domain id mapping array */
+     uint32_t version;
+--- xen/drivers/passthrough/x86/iommu.c.orig
++++ xen/drivers/passthrough/x86/iommu.c
+@@ -346,6 +346,53 @@ void __hwdom_init arch_iommu_hwdom_init(
+         return;
+ }
+ 
++unsigned long *__init iommu_init_domid(void)
++{
++    if ( !iommu_quarantine )
++        return ZERO_BLOCK_PTR;
++
++    BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX);
++
++    return xzalloc_array(unsigned long,
++                         BITS_TO_LONGS(UINT16_MAX - DOMID_MASK));
++}
++
++domid_t iommu_alloc_domid(unsigned long *map)
++{
++    /*
++     * This is used uniformly across all IOMMUs, such that on typical
++     * systems we wouldn't re-use the same ID very quickly (perhaps never).
++     */
++    static unsigned int start;
++    unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start);
++
++    ASSERT(pcidevs_locked());
++
++    if ( idx >= UINT16_MAX - DOMID_MASK )
++        idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);
++    if ( idx >= UINT16_MAX - DOMID_MASK )
++        return DOMID_INVALID;
++
++    __set_bit(idx, map);
++
++    start = idx + 1;
++
++    return idx | (DOMID_MASK + 1);
++}
++
++void iommu_free_domid(domid_t domid, unsigned long *map)
++{
++    ASSERT(pcidevs_locked());
++
++    if ( domid == DOMID_INVALID )
++        return;
++
++    ASSERT(domid > DOMID_MASK);
++
++    if ( !__test_and_clear_bit(domid & DOMID_MASK, map) )
++        BUG();
++}
++
+ /*
+  * Local variables:
+  * mode: C
+--- xen/include/public/xen.h.orig
++++ xen/include/public/xen.h
+@@ -614,6 +614,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
+ /* Idle domain. */
+ #define DOMID_IDLE           xen_mk_uint(0x7FFF)
+ 
++/* Mask for valid domain id values */
++#define DOMID_MASK           xen_mk_uint(0x7FFF)
++
+ #ifndef __ASSEMBLY__
+ 
+ typedef uint16_t domid_t;
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU/x86: drop TLB flushes from quarantine_init() hooks
+
+The page tables just created aren't hooked up yet anywhere, so there's
+nothing that could be present in any TLB, and hence nothing to flush.
+Dropping this flush is, at least on the VT-d side, a prereq to per-
+device domain ID use when quarantining devices, as dom_io isn't going
+to be assigned a DID anymore: The warning in get_iommu_did() would
+trigger.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -595,8 +595,6 @@ int __init amd_iommu_quarantine_init(str
+  out:
+     spin_unlock(&hd->arch.mapping_lock);
+ 
+-    amd_iommu_flush_all_pages(d);
+-
+     /* Pages leaked in failure case */
+     return level ? -ENOMEM : 0;
+ }
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -2894,7 +2894,6 @@ static int __init intel_iommu_quarantine
+     struct dma_pte *parent;
+     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+     unsigned int level = agaw_to_level(agaw);
+-    int rc;
+ 
+     if ( hd->arch.pgd_maddr )
+     {
+@@ -2941,10 +2940,8 @@ static int __init intel_iommu_quarantine
+  out:
+     spin_unlock(&hd->arch.mapping_lock);
+ 
+-    rc = iommu_flush_iotlb_all(d);
+-
+     /* Pages leaked in failure case */
+-    return level ? -ENOMEM : rc;
++    return level ? -ENOMEM : 0;
+ }
+ 
+ const struct iommu_ops __initconstrel intel_iommu_ops = {
+From: Jan Beulich <jbeulich@suse.com>
+Subject: AMD/IOMMU: abstract maximum number of page table levels
+
+We will want to use the constant elsewhere.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+
+--- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig
++++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -193,7 +193,7 @@ static inline int amd_iommu_get_paging_m
+     while ( max_frames > PTE_PER_TABLE_SIZE )
+     {
+         max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT;
+-        if ( ++level > 6 )
++        if ( ++level > IOMMU_MAX_PT_LEVELS )
+             return -ENOMEM;
+     }
+ 
+--- xen/include/asm-x86/hvm/svm/amd-iommu-defs.h.orig
++++ xen/include/asm-x86/hvm/svm/amd-iommu-defs.h
+@@ -110,6 +110,7 @@ struct amd_iommu_dte {
+     bool tv:1;
+     unsigned int :5;
+     unsigned int had:2;
++#define IOMMU_MAX_PT_LEVELS 6
+     unsigned int paging_mode:3;
+     uint64_t pt_root:40;
+     bool ppr:1;
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -260,7 +260,7 @@ static int iommu_pde_from_dfn(struct dom
+     table = hd->arch.root_table;
+     level = hd->arch.paging_mode;
+ 
+-    BUG_ON( table == NULL || level < 1 || level > 6 );
++    BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS );
+ 
+     /*
+      * A frame number past what the current page tables can represent can't
+From: Jan Beulich <jbeulich@suse.com>
+Subject: IOMMU/x86: use per-device page tables for quarantining
+
+Devices with RMRRs / unity mapped regions, due to it being unspecified
+how/when these memory regions may be accessed, may not be left
+disconnected from the mappings of these regions (as long as it's not
+certain that the device has been fully quiesced). Hence even the page
+tables used when quarantining such devices need to have mappings of
+those regions. This implies installing page tables in the first place
+even when not in scratch-page quarantining mode.
+
+This is CVE-2022-26361 / part of XSA-400.
+
+While for the purpose here it would be sufficient to have devices with
+RMRRs / unity mapped regions use per-device page tables, extend this to
+all devices (in scratch-page quarantining mode). This allows the leaf
+pages to be mapped r/w, thus covering also memory writes (rather than
+just reads) issued by non-quiescent devices.
+
+Set up quarantine page tables as late as possible, yet early enough to
+not encounter failure during de-assign. This means setup generally
+happens in assign_device(), while (for now) the one in deassign_device()
+is there mainly to be on the safe side.
+
+In VT-d's DID allocation function don't require the IOMMU lock to be
+held anymore: All involved code paths hold pcidevs_lock, so this way we
+avoid the need to acquire the IOMMU lock around the new call to
+context_set_domain_id().
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- xen/arch/x86/mm/p2m.c.orig
++++ xen/arch/x86/mm/p2m.c
+@@ -1453,7 +1453,7 @@ int set_identity_p2m_entry(struct domain
+     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+     int ret;
+ 
+-    if ( !paging_mode_translate(p2m->domain) )
++    if ( !paging_mode_translate(d) )
+     {
+         if ( !is_iommu_enabled(d) )
+             return 0;
+--- xen/include/asm-x86/pci.h.orig
++++ xen/include/asm-x86/pci.h
+@@ -1,6 +1,8 @@
+ #ifndef __X86_PCI_H__
+ #define __X86_PCI_H__
+ 
++#include <xen/mm.h>
++
+ #define CF8_BDF(cf8)     (  ((cf8) & 0x00ffff00) >> 8)
+ #define CF8_ADDR_LO(cf8) (   (cf8) & 0x000000fc)
+ #define CF8_ADDR_HI(cf8) (  ((cf8) & 0x0f000000) >> 16)
+@@ -20,7 +22,18 @@ struct arch_pci_dev {
+      * them don't race (de)initialization and hence don't strictly need any
+      * locking.
+      */
++    union {
++        /* Subset of struct arch_iommu's fields, to be used in dom_io. */
++        struct {
++            uint64_t pgd_maddr;
++        } vtd;
++        struct {
++            struct page_info *root_table;
++        } amd;
++    };
+     domid_t pseudo_domid;
++    mfn_t leaf_mfn;
++    struct page_list_head pgtables_list;
+ };
+ 
+ int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
+--- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig
++++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+@@ -54,7 +54,8 @@ int amd_iommu_init_late(void);
+ int amd_iommu_update_ivrs_mapping_acpi(void);
+ int iov_adjust_irq_affinities(void);
+ 
+-int amd_iommu_quarantine_init(struct domain *d);
++int amd_iommu_quarantine_init(struct pci_dev *pdev);
++void amd_iommu_quarantine_teardown(struct pci_dev *pdev);
+ 
+ /* mapping functions */
+ int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
+--- xen/drivers/passthrough/amd/iommu_map.c.orig
++++ xen/drivers/passthrough/amd/iommu_map.c
+@@ -539,64 +539,137 @@ int amd_iommu_reserve_domain_unity_unmap
+     return rc;
+ }
+ 
+-int __init amd_iommu_quarantine_init(struct domain *d)
++static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
++                    struct page_info *pgs[IOMMU_MAX_PT_LEVELS],
++                    struct pci_dev *pdev)
+ {
+-    struct domain_iommu *hd = dom_iommu(d);
++    unsigned int i;
++    int rc = 0;
++
++    for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i )
++    {
++        union amd_iommu_pte *pte = &this[i], *next;
++
++        if ( !pte->pr )
++        {
++            if ( !pgs[level] )
++            {
++                /*
++                 * The pgtable allocator is fine for the leaf page, as well as
++                 * page table pages, and the resulting allocations are always
++                 * zeroed.
++                 */
++                pgs[level] = alloc_amd_iommu_pgtable();
++                if ( !pgs[level] )
++                {
++                    rc = -ENOMEM;
++                    break;
++                }
++
++                page_list_add(pgs[level], &pdev->arch.pgtables_list);
++
++                if ( level )
++                {
++                    next = __map_domain_page(pgs[level]);
++                    rc = fill_qpt(next, level - 1, pgs, pdev);
++                    unmap_domain_page(next);
++                }
++            }
++
++            /*
++             * PDEs are essentially a subset of PTEs, so this function
++             * is fine to use even at the leaf.
++             */
++            set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level,
++                                  true, true);
++        }
++        else if ( level && pte->next_level )
++        {
++            page_list_add(mfn_to_page(_mfn(pte->mfn)),
++                          &pdev->arch.pgtables_list);
++            next = map_domain_page(_mfn(pte->mfn));
++            rc = fill_qpt(next, level - 1, pgs, pdev);
++            unmap_domain_page(next);
++        }
++    }
++
++    return rc;
++}
++
++int amd_iommu_quarantine_init(struct pci_dev *pdev)
++{
++    struct domain_iommu *hd = dom_iommu(dom_io);
+     unsigned long end_gfn =
+         1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
+     unsigned int level = amd_iommu_get_paging_mode(end_gfn);
+-    union amd_iommu_pte *table;
++    unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
++    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
++    int rc;
+ 
+-    if ( hd->arch.root_table )
++    ASSERT(pcidevs_locked());
++    ASSERT(!hd->arch.root_table);
++
++    ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
++
++    if ( pdev->arch.amd.root_table )
+     {
+-        ASSERT_UNREACHABLE();
++        clear_domain_page(pdev->arch.leaf_mfn);
+         return 0;
+     }
+ 
+-    spin_lock(&hd->arch.mapping_lock);
+-
+-    hd->arch.root_table = alloc_amd_iommu_pgtable();
+-    if ( !hd->arch.root_table )
+-        goto out;
+-
+-    table = __map_domain_page(hd->arch.root_table);
+-    while ( level )
++    pdev->arch.amd.root_table = alloc_amd_iommu_pgtable();
++    if ( !pdev->arch.amd.root_table )
++        return -ENOMEM;
++
++    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
++    hd->arch.root_table = pdev->arch.amd.root_table;
++
++    rc = amd_iommu_reserve_domain_unity_map(dom_io,
++                                            ivrs_mappings[req_id].unity_map,
++                                            0);
++
++    iommu_identity_map_teardown(dom_io);
++    hd->arch.root_table = NULL;
++
++    if ( rc )
++        printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n",
++               pdev->seg, pdev->bus,
++               PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
++    else
+     {
+-        struct page_info *pg;
+-        unsigned int i;
++        union amd_iommu_pte *root;
++        struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
+ 
+-        /*
+-         * The pgtable allocator is fine for the leaf page, as well as
+-         * page table pages, and the resulting allocations are always
+-         * zeroed.
+-         */
+-        pg = alloc_amd_iommu_pgtable();
+-        if ( !pg )
+-            break;
++        spin_lock(&hd->arch.mapping_lock);
+ 
+-        for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
+-        {
+-            union amd_iommu_pte *pde = &table[i];
++        root = __map_domain_page(pdev->arch.amd.root_table);
++        rc = fill_qpt(root, level - 1, pgs, pdev);
++        unmap_domain_page(root);
+ 
+-            /*
+-             * PDEs are essentially a subset of PTEs, so this function
+-             * is fine to use even at the leaf.
+-             */
+-            set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
+-                                  false, true);
+-        }
++        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
+ 
+-        unmap_domain_page(table);
+-        table = __map_domain_page(pg);
+-        level--;
++        spin_unlock(&hd->arch.mapping_lock);
+     }
+-    unmap_domain_page(table);
+ 
+- out:
+-    spin_unlock(&hd->arch.mapping_lock);
++    if ( rc )
++        amd_iommu_quarantine_teardown(pdev);
++
++    return rc;
++}
++
++void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
++{
++    struct page_info *pg;
++
++    ASSERT(pcidevs_locked());
++
++    if ( !pdev->arch.amd.root_table )
++        return;
++
++    while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
++        free_amd_iommu_pgtable(pg);
+ 
+-    /* Pages leaked in failure case */
+-    return level ? -ENOMEM : 0;
++    pdev->arch.amd.root_table = NULL;
+ }
+ 
+ /*
+--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
++++ xen/drivers/passthrough/amd/pci_amd_iommu.c
+@@ -125,6 +125,8 @@ static int __must_check amd_iommu_setup_
+     u8 bus = pdev->bus;
+     struct domain_iommu *hd = dom_iommu(domain);
+     const struct ivrs_mappings *ivrs_dev;
++    const struct page_info *root_pg;
++    domid_t domid;
+ 
+     BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
+ 
+@@ -144,14 +146,25 @@ static int __must_check amd_iommu_setup_
+     dte = &table[req_id];
+     ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+ 
++    if ( domain != dom_io )
++    {
++        root_pg = hd->arch.root_table;
++        domid = domain->domain_id;
++    }
++    else
++    {
++        root_pg = pdev->arch.amd.root_table;
++        domid = pdev->arch.pseudo_domid;
++    }
++
+     spin_lock_irqsave(&iommu->lock, flags);
+ 
+     if ( !dte->v || !dte->tv )
+     {
+         /* bind DTE to domain page-tables */
+         rc = amd_iommu_set_root_page_table(
+-                 dte, page_to_maddr(hd->arch.root_table),
+-                 domain->domain_id, hd->arch.paging_mode, sr_flags);
++                 dte, page_to_maddr(root_pg), domid,
++                 hd->arch.paging_mode, sr_flags);
+         if ( rc )
+         {
+             ASSERT(rc < 0);
+@@ -175,7 +188,7 @@ static int __must_check amd_iommu_setup_
+ 
+         amd_iommu_flush_device(iommu, req_id);
+     }
+-    else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) )
++    else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
+     {
+         /*
+          * Strictly speaking if the device is the only one with this requestor
+@@ -188,8 +201,8 @@ static int __must_check amd_iommu_setup_
+             rc = -EOPNOTSUPP;
+         else
+             rc = amd_iommu_set_root_page_table(
+-                     dte, page_to_maddr(hd->arch.root_table),
+-                     domain->domain_id, hd->arch.paging_mode, sr_flags);
++                     dte, page_to_maddr(root_pg), domid,
++                     hd->arch.paging_mode, sr_flags);
+         if ( rc < 0 )
+         {
+             spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -208,6 +221,7 @@ static int __must_check amd_iommu_setup_
+               * intended anyway.
+               */
+              !pdev->domain->is_dying &&
++             pdev->domain != dom_io &&
+              (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
+               pdev->phantom_stride) )
+             printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
+@@ -238,9 +252,8 @@ static int __must_check amd_iommu_setup_
+     AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+                     "root table = %#"PRIx64", "
+                     "domain = %d, paging mode = %d\n",
+-                    req_id, pdev->type,
+-                    page_to_maddr(hd->arch.root_table),
+-                    domain->domain_id, hd->arch.paging_mode);
++                    req_id, pdev->type, page_to_maddr(root_pg),
++                    domid, hd->arch.paging_mode);
+ 
+     ASSERT(pcidevs_locked());
+ 
+@@ -313,7 +326,7 @@ static int iov_enable_xt(void)
+ 
+ int amd_iommu_alloc_root(struct domain_iommu *hd)
+ {
+-    if ( unlikely(!hd->arch.root_table) )
++    if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) )
+     {
+         hd->arch.root_table = alloc_amd_iommu_pgtable();
+         if ( !hd->arch.root_table )
+@@ -404,7 +417,7 @@ static void amd_iommu_disable_domain_dev
+ 
+         AMD_IOMMU_DEBUG("Disable: device id = %#x, "
+                         "domain = %d, paging mode = %d\n",
+-                        req_id,  domain->domain_id,
++                        req_id, dte->domain_id,
+                         dom_iommu(domain)->arch.paging_mode);
+     }
+     spin_unlock_irqrestore(&iommu->lock, flags);
+@@ -668,6 +681,8 @@ static int amd_iommu_remove_device(u8 de
+ 
+     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
+ 
++    amd_iommu_quarantine_teardown(pdev);
++
+     iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
+     pdev->arch.pseudo_domid = DOMID_INVALID;
+ 
+--- xen/drivers/passthrough/iommu.c.orig
++++ xen/drivers/passthrough/iommu.c
+@@ -450,21 +450,21 @@ int iommu_iotlb_flush_all(struct domain
+     return rc;
+ }
+ 
+-static int __init iommu_quarantine_init(void)
++int iommu_quarantine_dev_init(device_t *dev)
+ {
+     const struct domain_iommu *hd = dom_iommu(dom_io);
+-    int rc;
+ 
+-    dom_io->options |= XEN_DOMCTL_CDF_iommu;
++    if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
++        return 0;
+ 
+-    rc = iommu_domain_init(dom_io, 0);
+-    if ( rc )
+-        return rc;
++    return iommu_call(hd->platform_ops, quarantine_init, dev);
++}
+ 
+-    if ( !hd->platform_ops->quarantine_init )
+-        return 0;
++static int __init iommu_quarantine_init(void)
++{
++    dom_io->options |= XEN_DOMCTL_CDF_iommu;
+ 
+-    return hd->platform_ops->quarantine_init(dom_io);
++    return iommu_domain_init(dom_io, 0);
+ }
+ 
+ int __init iommu_setup(void)
+--- xen/drivers/passthrough/pci.c.orig
++++ xen/drivers/passthrough/pci.c
+@@ -929,9 +929,16 @@ static int deassign_device(struct domain
+         return -ENODEV;
+ 
+     /* De-assignment from dom_io should de-quarantine the device */
+-    target = ((pdev->quarantine || iommu_quarantine) &&
+-              pdev->domain != dom_io) ?
+-        dom_io : hardware_domain;
++    if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
++    {
++        ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
++        if ( ret )
++           return ret;
++
++        target = dom_io;
++    }
++    else
++        target = hardware_domain;
+ 
+     while ( pdev->phantom_stride )
+     {
+@@ -1547,6 +1554,13 @@ static int assign_device(struct domain *
+         msixtbl_init(d);
+     }
+ 
++    if ( pdev->domain != dom_io )
++    {
++        rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
++        if ( rc )
++            goto done;
++    }
++
+     pdev->fault.count = 0;
+ 
+     if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -43,6 +43,12 @@
+ #include "vtd.h"
+ #include "../ats.h"
+ 
++#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
++                                             : (pdev)->arch.pseudo_domid)
++#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
++                                 ? dom_iommu(d)->arch.pgd_maddr \
++                                 : (pdev)->arch.vtd.pgd_maddr)
++
+ /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
+ bool __read_mostly untrusted_msi;
+ 
+@@ -78,13 +84,18 @@ static int get_iommu_did(domid_t domid,
+ 
+ #define DID_FIELD_WIDTH 16
+ #define DID_HIGH_OFFSET 8
++
++/*
++ * This function may have "context" passed as NULL, to merely obtain a DID
++ * for "domid".
++ */
+ static int context_set_domain_id(struct context_entry *context,
+                                  domid_t domid, struct vtd_iommu *iommu)
+ {
+     unsigned long nr_dom, i;
+     int found = 0;
+ 
+-    ASSERT(spin_is_locked(&iommu->lock));
++    ASSERT(pcidevs_locked());
+ 
+     nr_dom = cap_ndoms(iommu->cap);
+     i = find_first_bit(iommu->domid_bitmap, nr_dom);
+@@ -110,8 +121,13 @@ static int context_set_domain_id(struct
+     }
+ 
+     set_bit(i, iommu->domid_bitmap);
+-    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+-    context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
++
++    if ( context )
++    {
++        context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
++        context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
++    }
++
+     return 0;
+ }
+ 
+@@ -161,8 +177,12 @@ static void check_cleanup_domid_map(stru
+                                     const struct pci_dev *exclude,
+                                     struct vtd_iommu *iommu)
+ {
+-    bool found = any_pdev_behind_iommu(d, exclude, iommu);
++    bool found;
++
++    if ( d == dom_io )
++        return;
+ 
++    found = any_pdev_behind_iommu(d, exclude, iommu);
+     /*
+      * Hidden devices are associated with DomXEN but usable by the hardware
+      * domain. Hence they need considering here as well.
+@@ -1400,7 +1420,7 @@ int domain_context_mapping_one(
+         domid = iommu->domid_map[prev_did];
+         if ( domid < DOMID_FIRST_RESERVED )
+             prev_dom = rcu_lock_domain_by_id(domid);
+-        else if ( domid == DOMID_IO )
++        else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
+             prev_dom = rcu_lock_domain(dom_io);
+         if ( !prev_dom )
+         {
+@@ -1577,15 +1597,12 @@ int domain_context_mapping_one(
+     {
+         if ( !prev_dom )
+             domain_context_unmap_one(domain, iommu, bus, devfn,
+-                                     domain->domain_id);
++                                     DEVICE_DOMID(domain, pdev));
+         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+-        {
+-            hd = dom_iommu(prev_dom);
+             domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+-                                       domain->domain_id,
+-                                       hd->arch.pgd_maddr,
++                                       DEVICE_DOMID(prev_dom, pdev),
++                                       DEVICE_PGTABLE(prev_dom, pdev),
+                                        mode & MAP_WITH_RMRR);
+-        }
+     }
+ 
+     if ( prev_dom )
+@@ -1602,7 +1619,7 @@ static int domain_context_mapping(struct
+ {
+     struct acpi_drhd_unit *drhd;
+     const struct acpi_rmrr_unit *rmrr;
+-    paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
++    paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
+     domid_t orig_domid = pdev->arch.pseudo_domid;
+     int ret = 0;
+     unsigned int i, mode = 0;
+@@ -1635,7 +1652,7 @@ static int domain_context_mapping(struct
+         break;
+     }
+ 
+-    if ( domain != pdev->domain )
++    if ( domain != pdev->domain && pdev->domain != dom_io )
+     {
+         if ( pdev->domain->is_dying )
+             mode |= MAP_OWNER_DYING;
+@@ -1676,8 +1693,8 @@ static int domain_context_mapping(struct
+             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+-        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev, domain->domain_id, pgd_maddr,
++        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
++                                         DEVICE_DOMID(domain, pdev), pgd_maddr,
+                                          mode);
+         if ( ret > 0 )
+             ret = 0;
+@@ -1701,8 +1718,8 @@ static int domain_context_mapping(struct
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+ 
+         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                         pdev, domain->domain_id, pgd_maddr,
+-                                         mode);
++                                         pdev, DEVICE_DOMID(domain, pdev),
++                                         pgd_maddr, mode);
+         if ( ret < 0 )
+             break;
+         prev_present = ret;
+@@ -1730,8 +1747,8 @@ static int domain_context_mapping(struct
+          */
+         if ( ret >= 0 )
+             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
+-                                             NULL, domain->domain_id, pgd_maddr,
+-                                             mode);
++                                             NULL, DEVICE_DOMID(domain, pdev),
++                                             pgd_maddr, mode);
+ 
+         /*
+          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
+@@ -1746,8 +1763,8 @@ static int domain_context_mapping(struct
+         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
+              (secbus != pdev->bus || pdev->devfn != 0) )
+             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
+-                                             NULL, domain->domain_id, pgd_maddr,
+-                                             mode);
++                                             NULL, DEVICE_DOMID(domain, pdev),
++                                             pgd_maddr, mode);
+ 
+         if ( ret )
+         {
+@@ -1896,7 +1913,7 @@ static const struct acpi_drhd_unit *doma
+                    domain->domain_id, seg, bus,
+                    PCI_SLOT(devfn), PCI_FUNC(devfn));
+         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+-                                       domain->domain_id);
++                                       DEVICE_DOMID(domain, pdev));
+         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
+             disable_ats_device(pdev);
+ 
+@@ -1907,7 +1924,7 @@ static const struct acpi_drhd_unit *doma
+             printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
+                    domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+-                                       domain->domain_id);
++                                       DEVICE_DOMID(domain, pdev));
+         if ( ret )
+             break;
+ 
+@@ -1930,18 +1947,12 @@ static const struct acpi_drhd_unit *doma
+             break;
+         }
+ 
++        ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
++                                       DEVICE_DOMID(domain, pdev));
+         /* PCIe to PCI/PCIx bridge */
+-        if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+-        {
+-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+-                                           domain->domain_id);
+-            if ( !ret )
+-                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+-                                               domain->domain_id);
+-        }
+-        else /* Legacy PCI bridge */
+-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+-                                           domain->domain_id);
++        if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
++            ret = domain_context_unmap_one(domain, iommu, secbus, 0,
++                                           DEVICE_DOMID(domain, pdev));
+ 
+         break;
+ 
+@@ -1987,6 +1998,25 @@ static void iommu_domain_teardown(struct
+         cleanup_domid_map(d->domain_id, drhd->iommu);
+ }
+ 
++static void quarantine_teardown(struct pci_dev *pdev,
++                                const struct acpi_drhd_unit *drhd)
++{
++    struct page_info *pg;
++
++    ASSERT(pcidevs_locked());
++
++    if ( !pdev->arch.vtd.pgd_maddr )
++        return;
++
++    while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
++        free_domheap_page(pg);
++
++    pdev->arch.vtd.pgd_maddr = 0;
++
++    if ( drhd )
++        cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
++}
++
+ static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
+                                              mfn_t mfn, unsigned int flags,
+                                              unsigned int *flush_flags)
+@@ -2209,6 +2239,8 @@ static int intel_iommu_remove_device(u8
+                                rmrr->end_address, 0);
+     }
+ 
++    quarantine_teardown(pdev, drhd);
++
+     if ( drhd )
+     {
+         iommu_free_domid(pdev->arch.pseudo_domid,
+@@ -2888,60 +2920,139 @@ static void vtd_dump_p2m_table(struct do
+     vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
+ }
+ 
+-static int __init intel_iommu_quarantine_init(struct domain *d)
++static int fill_qpt(struct dma_pte *this, unsigned int level,
++                    paddr_t maddrs[6], struct pci_dev *pdev)
+ {
+-    struct domain_iommu *hd = dom_iommu(d);
+-    struct dma_pte *parent;
++    struct domain_iommu *hd = dom_iommu(dom_io);
++    unsigned int i;
++    int rc = 0;
++
++    for ( i = 0; !rc && i < PTE_NUM; ++i )
++    {
++        struct dma_pte *pte = &this[i], *next;
++
++        if ( !dma_pte_present(*pte) )
++        {
++            if ( !maddrs[level] )
++            {
++                /*
++                 * The pgtable allocator is fine for the leaf page, as well as
++                 * page table pages, and the resulting allocations are always
++                 * zeroed.
++                 */
++                maddrs[level] = alloc_pgtable_maddr(1, hd->node);
++                if ( !maddrs[level] )
++                {
++                    rc = -ENOMEM;
++                    break;
++                }
++
++                page_list_add(maddr_to_page(maddrs[level]),
++                              &pdev->arch.pgtables_list);
++
++                if ( level )
++                {
++                    next = map_vtd_domain_page(maddrs[level]);
++                    rc = fill_qpt(next, level - 1, maddrs, pdev);
++                    unmap_vtd_domain_page(next);
++                }
++            }
++
++            dma_set_pte_addr(*pte, maddrs[level]);
++            dma_set_pte_readable(*pte);
++            dma_set_pte_writable(*pte);
++        }
++        else if ( level && !dma_pte_superpage(*pte) )
++        {
++            page_list_add(maddr_to_page(dma_pte_addr(*pte)),
++                          &pdev->arch.pgtables_list);
++            next = map_vtd_domain_page(dma_pte_addr(*pte));
++            rc = fill_qpt(next, level - 1, maddrs, pdev);
++            unmap_vtd_domain_page(next);
++        }
++    }
++
++    return rc;
++}
++
++static int intel_iommu_quarantine_init(struct pci_dev *pdev)
++{
++    struct domain_iommu *hd = dom_iommu(dom_io);
++    paddr_t maddr;
+     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+     unsigned int level = agaw_to_level(agaw);
++    const struct acpi_drhd_unit *drhd;
++    const struct acpi_rmrr_unit *rmrr;
++    unsigned int i, bdf;
++    bool rmrr_found = false;
++    int rc;
+ 
+-    if ( hd->arch.pgd_maddr )
++    ASSERT(pcidevs_locked());
++    ASSERT(!hd->arch.pgd_maddr);
++
++    if ( pdev->arch.vtd.pgd_maddr )
+     {
+-        ASSERT_UNREACHABLE();
++        clear_domain_page(pdev->arch.leaf_mfn);
+         return 0;
+     }
+ 
+-    spin_lock(&hd->arch.mapping_lock);
++    drhd = acpi_find_matched_drhd_unit(pdev);
++    if ( !drhd )
++        return -ENODEV;
+ 
+-    hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node);
+-    if ( !hd->arch.pgd_maddr )
+-        goto out;
++    maddr = alloc_pgtable_maddr(1, hd->node);
++    if ( !maddr )
++        return -ENOMEM;
+ 
+-    parent = map_vtd_domain_page(hd->arch.pgd_maddr);
+-    while ( level )
+-    {
+-        uint64_t maddr;
+-        unsigned int offset;
++    rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
+ 
+-        /*
+-         * The pgtable allocator is fine for the leaf page, as well as
+-         * page table pages, and the resulting allocations are always
+-         * zeroed.
+-         */
+-        maddr = alloc_pgtable_maddr(1, hd->node);
+-        if ( !maddr )
++    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
++    hd->arch.pgd_maddr = maddr;
++
++    for_each_rmrr_device ( rmrr, bdf, i )
++    {
++        if ( rc )
+             break;
+ 
+-        for ( offset = 0; offset < PTE_NUM; offset++ )
++        if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf )
+         {
+-            struct dma_pte *pte = &parent[offset];
++            rmrr_found = true;
+ 
+-            dma_set_pte_addr(*pte, maddr);
+-            dma_set_pte_readable(*pte);
++            rc = iommu_identity_mapping(dom_io, p2m_access_rw,
++                                        rmrr->base_address, rmrr->end_address,
++                                        0);
++            if ( rc )
++                printk(XENLOG_ERR VTDPREFIX
++                       "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n",
++                       pdev->seg, pdev->bus,
++                       PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+         }
+-        iommu_sync_cache(parent, PAGE_SIZE);
++    }
+ 
+-        unmap_vtd_domain_page(parent);
+-        parent = map_vtd_domain_page(maddr);
+-        level--;
++    iommu_identity_map_teardown(dom_io);
++    hd->arch.pgd_maddr = 0;
++    pdev->arch.vtd.pgd_maddr = maddr;
++
++    if ( !rc )
++    {
++        struct dma_pte *root;
++        paddr_t maddrs[6] = {};
++
++        spin_lock(&hd->arch.mapping_lock);
++
++        root = map_vtd_domain_page(maddr);
++        rc = fill_qpt(root, level - 1, maddrs, pdev);
++        unmap_vtd_domain_page(root);
++
++        pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]);
++
++        spin_unlock(&hd->arch.mapping_lock);
+     }
+-    unmap_vtd_domain_page(parent);
+ 
+- out:
+-    spin_unlock(&hd->arch.mapping_lock);
++    if ( rc )
++        quarantine_teardown(pdev, drhd);
+ 
+-    /* Pages leaked in failure case */
+-    return level ? -ENOMEM : 0;
++    return rc;
+ }
+ 
+ const struct iommu_ops __initconstrel intel_iommu_ops = {
+--- xen/drivers/passthrough/vtd/iommu.h.orig
++++ xen/drivers/passthrough/vtd/iommu.h
+@@ -509,7 +509,7 @@ struct vtd_iommu {
+     u32 nr_pt_levels;
+     u64	cap;
+     u64	ecap;
+-    spinlock_t lock; /* protect context, domain ids */
++    spinlock_t lock; /* protect context */
+     spinlock_t register_lock; /* protect iommu register handling */
+     u64 root_maddr; /* root entry machine address */
+     nodeid_t node;
+--- xen/include/xen/iommu.h.orig
++++ xen/include/xen/iommu.h
+@@ -211,7 +211,7 @@ typedef int iommu_grdm_t(xen_pfn_t start
+ struct iommu_ops {
+     int (*init)(struct domain *d);
+     void (*hwdom_init)(struct domain *d);
+-    int (*quarantine_init)(struct domain *d);
++    int (*quarantine_init)(device_t *dev);
+     int (*add_device)(u8 devfn, device_t *dev);
+     int (*enable_device)(device_t *dev);
+     int (*remove_device)(u8 devfn, device_t *dev);
+@@ -331,6 +331,7 @@ int __must_check iommu_suspend(void);
+ void iommu_resume(void);
+ void iommu_crash_shutdown(void);
+ int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
++int iommu_quarantine_dev_init(device_t *dev);
+ 
+ void iommu_share_p2m_table(struct domain *d);
+ 
diff --git a/sysutils/xenkernel413/patches/patch-XSA401 b/sysutils/xenkernel413/patches/patch-XSA401
new file mode 100644
index 00000000000..d75f6ae17f2
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA401
@@ -0,0 +1,343 @@
+$NetBSD: patch-XSA401,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Clean up _get_page_type()
+
+Various fixes for clarity, ahead of making complicated changes.
+
+ * Split the overflow check out of the if/else chain for type handling, as
+   it's somewhat unrelated.
+ * Comment the main if/else chain to explain what is going on.  Adjust one
+   ASSERT() and state the bit layout for validate-locked and partial states.
+ * Correct the comment about TLB flushing, as it's backwards.  The problem
+   case is when writeable mappings are retained to a page becoming read-only,
+   as it allows the guest to bypass Xen's safety checks for updates.
+ * Reduce the scope of 'y'.  It is an artefact of the cmpxchg loop and not
+   valid for use by subsequent logic.  Switch to using ACCESS_ONCE() to treat
+   all reads as explicitly volatile.  The only thing preventing the validated
+   wait-loop being infinite is the compiler barrier hidden in cpu_relax().
+ * Replace one page_get_owner(page) with the already-calculated 'd' that is in
+   scope.
+
+No functional change.
+
+This is part of XSA-401 / CVE-2022-26362.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index ad89bfb45fff..96738b027827 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -2978,16 +2978,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
+ static int _get_page_type(struct page_info *page, unsigned long type,
+                           bool preemptible)
+ {
+-    unsigned long nx, x, y = page->u.inuse.type_info;
++    unsigned long nx, x;
+     int rc = 0;
+ 
+     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
+     ASSERT(!in_irq());
+ 
+-    for ( ; ; )
++    for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
+     {
+         x  = y;
+         nx = x + 1;
++
+         if ( unlikely((nx & PGT_count_mask) == 0) )
+         {
+             gdprintk(XENLOG_WARNING,
+@@ -2995,8 +2996,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+                      mfn_x(page_to_mfn(page)));
+             return -EINVAL;
+         }
+-        else if ( unlikely((x & PGT_count_mask) == 0) )
++
++        if ( unlikely((x & PGT_count_mask) == 0) )
+         {
++            /*
++             * Typeref 0 -> 1.
++             *
++             * Type changes are permitted when the typeref is 0.  If the type
++             * actually changes, the page needs re-validating.
++             */
+             struct domain *d = page_get_owner(page);
+ 
+             if ( d && shadow_mode_enabled(d) )
+@@ -3007,8 +3015,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+             {
+                 /*
+                  * On type change we check to flush stale TLB entries. It is
+-                 * vital that no other CPUs are left with mappings of a frame
+-                 * which is about to become writeable to the guest.
++                 * vital that no other CPUs are left with writeable mappings
++                 * to a frame which is intending to become pgtable/segdesc.
+                  */
+                 cpumask_t *mask = this_cpu(scratch_cpumask);
+ 
+@@ -3020,7 +3028,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ 
+                 if ( unlikely(!cpumask_empty(mask)) &&
+                      /* Shadow mode: track only writable pages. */
+-                     (!shadow_mode_enabled(page_get_owner(page)) ||
++                     (!shadow_mode_enabled(d) ||
+                       ((nx & PGT_type_mask) == PGT_writable_page)) )
+                 {
+                     perfc_incr(need_flush_tlb_flush);
+@@ -3041,7 +3049,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+         }
+         else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
+         {
+-            /* Don't log failure if it could be a recursive-mapping attempt. */
++            /*
++             * else, we're trying to take a new reference, of the wrong type.
++             *
++             * This (being able to prohibit use of the wrong type) is what the
++             * typeref system exists for, but skip printing the failure if it
++             * looks like a recursive mapping, as subsequent logic might
++             * ultimately permit the attempt.
++             */
+             if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
+                  (type == PGT_l1_page_table) )
+                 return -EINVAL;
+@@ -3060,18 +3075,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+         }
+         else if ( unlikely(!(x & PGT_validated)) )
+         {
++            /*
++             * else, the count is non-zero, and we're grabbing the right type;
++             * but the page hasn't been validated yet.
++             *
++             * The page is in one of two states (depending on PGT_partial),
++             * and should have exactly one reference.
++             */
++            ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
++
+             if ( !(x & PGT_partial) )
+             {
+-                /* Someone else is updating validation of this page. Wait... */
++                /*
++                 * The page has been left in the "validate locked" state
++                 * (i.e. PGT_[type] | 1) which means that a concurrent caller
++                 * of _get_page_type() is in the middle of validation.
++                 *
++                 * Spin waiting for the concurrent user to complete (partial
++                 * or fully validated), then restart our attempt to acquire a
++                 * type reference.
++                 */
+                 do {
+                     if ( preemptible && hypercall_preempt_check() )
+                         return -EINTR;
+                     cpu_relax();
+-                } while ( (y = page->u.inuse.type_info) == x );
++                } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
+                 continue;
+             }
+-            /* Type ref count was left at 1 when PGT_partial got set. */
+-            ASSERT((x & PGT_count_mask) == 1);
++
++            /*
++             * The page has been left in the "partial" state
++             * (i.e., PGT_[type] | PGT_partial | 1).
++             *
++             * Rather than bumping the type count, we need to try to grab the
++             * validation lock; if we succeed, we need to validate the page,
++             * then drop the general ref associated with the PGT_partial bit.
++             *
++             * We grab the validation lock by setting nx to (PGT_[type] | 1)
++             * (i.e., non-zero type count, neither PGT_validated nor
++             * PGT_partial set).
++             */
+             nx = x & ~PGT_partial;
+         }
+ 
+@@ -3116,6 +3159,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+     }
+ 
+  out:
++    /*
++     * Did we drop the PGT_partial bit when acquiring the typeref?  If so,
++     * drop the general reference that went along with it.
++     *
++     * N.B. validate_page() may have re-set PGT_partial, not reflected in
++     * nx, but will have taken an extra ref when doing so.
++     */
+     if ( (x & PGT_partial) && !(nx & PGT_partial) )
+         put_page(page);
+ 
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
+
+_get_page_type() suffers from a race condition where it incorrectly assumes
+that because 'x' was read and a subsequent cmpxchg() succeeds, the type
+cannot have changed in-between.  Consider:
+
+CPU A:
+  1. Creates an L2e referencing pg
+     `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
+  2.     Issues flush_tlb_mask()
+CPU B:
+  3. Creates a writeable mapping of pg
+     `-> _get_page_type(pg, PGT_writable_page), count increases to 1
+  4. Writes into new mapping, creating a TLB entry for pg
+  5. Removes the writeable mapping of pg
+     `-> _put_page_type(pg), count goes back down to 0
+CPU A:
+  7.     Issues cmpxchg(), setting count 1, type PGT_l1_page_table
+
+CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
+suitably protected (i.e. read-only).  The TLB flush in step 2 must be deferred
+until after the guest is prohibited from creating new writeable mappings,
+which is after step 7.
+
+Defer all safety actions until after the cmpxchg() has successfully taken the
+intended typeref, because that is what prevents concurrent users from using
+the old type.
+
+Also remove the early validation for writeable and shared pages.  This removes
+race conditions where one half of a parallel mapping attempt can return
+successfully before:
+ * The IOMMU pagetables are in sync with the new page type
+ * Writeable mappings to shared pages have been torn down
+
+This is part of XSA-401 / CVE-2022-26362.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 96738b027827..ee91c7fe5f69 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -3005,46 +3005,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+              * Type changes are permitted when the typeref is 0.  If the type
+              * actually changes, the page needs re-validating.
+              */
+-            struct domain *d = page_get_owner(page);
+-
+-            if ( d && shadow_mode_enabled(d) )
+-               shadow_prepare_page_type_change(d, page, type);
+ 
+             ASSERT(!(x & PGT_pae_xen_l2));
+             if ( (x & PGT_type_mask) != type )
+             {
+-                /*
+-                 * On type change we check to flush stale TLB entries. It is
+-                 * vital that no other CPUs are left with writeable mappings
+-                 * to a frame which is intending to become pgtable/segdesc.
+-                 */
+-                cpumask_t *mask = this_cpu(scratch_cpumask);
+-
+-                BUG_ON(in_irq());
+-                cpumask_copy(mask, d->dirty_cpumask);
+-
+-                /* Don't flush if the timestamp is old enough */
+-                tlbflush_filter(mask, page->tlbflush_timestamp);
+-
+-                if ( unlikely(!cpumask_empty(mask)) &&
+-                     /* Shadow mode: track only writable pages. */
+-                     (!shadow_mode_enabled(d) ||
+-                      ((nx & PGT_type_mask) == PGT_writable_page)) )
+-                {
+-                    perfc_incr(need_flush_tlb_flush);
+-                    flush_tlb_mask(mask);
+-                }
+-
+-                /* We lose existing type and validity. */
+                 nx &= ~(PGT_type_mask | PGT_validated);
+                 nx |= type;
+-
+-                /*
+-                 * No special validation needed for writable pages.
+-                 * Page tables and GDT/LDT need to be scanned for validity.
+-                 */
+-                if ( type == PGT_writable_page || type == PGT_shared_page )
+-                    nx |= PGT_validated;
+             }
+         }
+         else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
+@@ -3125,6 +3091,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+             return -EINTR;
+     }
+ 
++    /*
++     * One typeref has been taken and is now globally visible.
++     *
++     * The page is either in the "validate locked" state (PGT_[type] | 1) or
++     * fully validated (PGT_[type] | PGT_validated | >0).
++     */
++
++    if ( unlikely((x & PGT_count_mask) == 0) )
++    {
++        struct domain *d = page_get_owner(page);
++
++        if ( d && shadow_mode_enabled(d) )
++            shadow_prepare_page_type_change(d, page, type);
++
++        if ( (x & PGT_type_mask) != type )
++        {
++            /*
++             * On type change we check to flush stale TLB entries. It is
++             * vital that no other CPUs are left with writeable mappings
++             * to a frame which is intending to become pgtable/segdesc.
++             */
++            cpumask_t *mask = this_cpu(scratch_cpumask);
++
++            BUG_ON(in_irq());
++            cpumask_copy(mask, d->dirty_cpumask);
++
++            /* Don't flush if the timestamp is old enough */
++            tlbflush_filter(mask, page->tlbflush_timestamp);
++
++            if ( unlikely(!cpumask_empty(mask)) &&
++                 /* Shadow mode: track only writable pages. */
++                 (!shadow_mode_enabled(d) ||
++                  ((nx & PGT_type_mask) == PGT_writable_page)) )
++            {
++                perfc_incr(need_flush_tlb_flush);
++                flush_tlb_mask(mask);
++            }
++        }
++    }
++
+     if ( unlikely((x & PGT_type_mask) != type) )
+     {
+         /* Special pages should not be accessible from devices. */
+@@ -3149,13 +3155,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ 
+     if ( unlikely(!(nx & PGT_validated)) )
+     {
+-        if ( !(x & PGT_partial) )
++        /*
++         * No special validation needed for writable or shared pages.  Page
++         * tables and GDT/LDT need to have their contents audited.
++         *
++         * per validate_page(), non-atomic updates are fine here.
++         */
++        if ( type == PGT_writable_page || type == PGT_shared_page )
++            page->u.inuse.type_info |= PGT_validated;
++        else
+         {
+-            page->nr_validated_ptes = 0;
+-            page->partial_flags = 0;
+-            page->linear_pt_count = 0;
++            if ( !(x & PGT_partial) )
++            {
++                page->nr_validated_ptes = 0;
++                page->partial_flags = 0;
++                page->linear_pt_count = 0;
++            }
++
++            rc = alloc_page_type(page, type, preemptible);
+         }
+-        rc = alloc_page_type(page, type, preemptible);
+     }
+ 
+  out:
diff --git a/sysutils/xenkernel413/patches/patch-XSA402 b/sysutils/xenkernel413/patches/patch-XSA402
new file mode 100644
index 00000000000..2bbc66a89bc
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA402
@@ -0,0 +1,743 @@
+$NetBSD: patch-XSA402,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/page: Introduce _PAGE_* constants for memory types
+
+... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
+constants.  These are going to be needed by forthcoming logic.
+
+No functional change.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
+index c1e92937c073..7269ae89b880 100644
+--- xen/include/asm-x86/page.h.orig
++++ xen/include/asm-x86/page.h
+@@ -320,6 +320,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
+ 
+ #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+ 
++/* Memory types, encoded under Xen's choice of MSR_PAT. */
++#define _PAGE_WB         (                                0)
++#define _PAGE_WT         (                        _PAGE_PWT)
++#define _PAGE_UCM        (            _PAGE_PCD            )
++#define _PAGE_UC         (            _PAGE_PCD | _PAGE_PWT)
++#define _PAGE_WC         (_PAGE_PAT                        )
++#define _PAGE_WP         (_PAGE_PAT |             _PAGE_PWT)
++
+ /*
+  * Debug option: Ensure that granted mappings are not implicitly unmapped.
+  * WARNING: This will need to be disabled to run OSes that use the spare PTE
+@@ -338,8 +346,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
+ #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
+ #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
+                                    _PAGE_DIRTY | _PAGE_RW)
+-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
+-#define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
++#define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_UC)
+ 
+ #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
+ 
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86: Don't change the cacheability of the directmap
+
+Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
+in response to guest mapping requests") attempted to keep the cacheability
+consistent between different mappings of the same page.
+
+The reason wasn't described in the changelog, but it is understood to be in
+regards to a concern over machine check exceptions, owing to errata when using
+mixed cacheabilities.  It did this primarily by updating Xen's mapping of the
+page in the direct map when the guest mapped a page with reduced cacheability.
+
+Unfortunately, the logic didn't actually prevent mixed cacheability from
+occurring:
+ * A guest could map a page normally, and then map the same page with
+   different cacheability; nothing prevented this.
+ * The cacheability of the directmap was always latest-takes-precedence in
+   terms of guest requests.
+ * Grant-mapped frames with lesser cacheability didn't adjust the page's
+   cacheattr settings.
+ * The map_domain_page() function still unconditionally created WB mappings,
+   irrespective of the page's cacheattr settings.
+
+Additionally, update_xen_mappings() had a bug where the alias calculation was
+wrong for mfn's which were .init content, which should have been treated as
+fully guest pages, not Xen pages.
+
+Worse yet, the logic introduced a vulnerability whereby necessary
+pagetable/segdesc adjustments made by Xen in the validation logic could become
+non-coherent between the cache and main memory.  The CPU could subsequently
+operate on the stale value in the cache, rather than the safe value in main
+memory.
+
+The directmap contains primarily mappings of RAM.  PAT/MTRR conflict
+resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
+cacheability resolves to being coherent.  The special case is WC mappings,
+which are non-coherent against MTRR=WB regions (except for fully-coherent
+CPUs).
+
+Xen must not have any WC cacheability in the directmap, to prevent Xen's
+actions from creating non-coherency.  (Guest actions creating non-coherency is
+dealt with in subsequent patches.)  As all memory types for MTRR=WB ranges
+inter-operate coherently, leave Xen's directmap mappings as WB.
+
+Only PV guests with access to devices can use reduced-cacheability mappings to
+begin with, and they're trusted not to mount DoSs against the system anyway.
+
+Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
+Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
+count.  Retain the check in get_page_from_l1e() for special_pages() because a
+guest has no business using reduced cacheability on these.
+
+This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
+
+This is CVE-2022-26363, part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index ee91c7fe5f69..859646b670a8 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -786,24 +786,6 @@ bool is_iomem_page(mfn_t mfn)
+     return (page_get_owner(page) == dom_io);
+ }
+ 
+-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
+-{
+-    int err = 0;
+-    bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
+-         mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
+-    unsigned long xen_va =
+-        XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
+-
+-    if ( unlikely(alias) && cacheattr )
+-        err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
+-    if ( !err )
+-        err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
+-                     PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
+-    if ( unlikely(alias) && !cacheattr && !err )
+-        err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
+-    return err;
+-}
+-
+ #ifndef NDEBUG
+ struct mmio_emul_range_ctxt {
+     const struct domain *d;
+@@ -1008,47 +990,14 @@ get_page_from_l1e(
+         goto could_not_pin;
+     }
+ 
+-    if ( pte_flags_to_cacheattr(l1f) !=
+-         ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
++    if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_xen_heap_page(page) )
+     {
+-        unsigned long x, nx, y = page->count_info;
+-        unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
+-        int err;
+-
+-        if ( is_xen_heap_page(page) )
+-        {
+-            if ( write )
+-                put_page_type(page);
+-            put_page(page);
+-            gdprintk(XENLOG_WARNING,
+-                     "Attempt to change cache attributes of Xen heap page\n");
+-            return -EACCES;
+-        }
+-
+-        do {
+-            x  = y;
+-            nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
+-        } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
+-
+-        err = update_xen_mappings(mfn, cacheattr);
+-        if ( unlikely(err) )
+-        {
+-            cacheattr = y & PGC_cacheattr_mask;
+-            do {
+-                x  = y;
+-                nx = (x & ~PGC_cacheattr_mask) | cacheattr;
+-            } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
+-
+-            if ( write )
+-                put_page_type(page);
+-            put_page(page);
+-
+-            gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
+-                     " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
+-                     mfn, get_gpfn_from_mfn(mfn),
+-                     l1e_get_intpte(l1e), l1e_owner->domain_id);
+-            return err;
+-        }
++        if ( write )
++            put_page_type(page);
++        put_page(page);
++        gdprintk(XENLOG_WARNING,
++                 "Attempt to change cache attributes of Xen heap page\n");
++        return -EACCES;
+     }
+ 
+     return 0;
+@@ -2541,25 +2490,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+  */
+ static int cleanup_page_mappings(struct page_info *page)
+ {
+-    unsigned int cacheattr =
+-        (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
+     int rc = 0;
+     unsigned long mfn = mfn_x(page_to_mfn(page));
+ 
+     /*
+-     * If we've modified xen mappings as a result of guest cache
+-     * attributes, restore them to the "normal" state.
+-     */
+-    if ( unlikely(cacheattr) )
+-    {
+-        page->count_info &= ~PGC_cacheattr_mask;
+-
+-        BUG_ON(is_xen_heap_page(page));
+-
+-        rc = update_xen_mappings(mfn, 0);
+-    }
+-
+-    /*
+      * If this may be in a PV domain's IOMMU, remove it.
+      *
+      * NB that writable xenheap pages have their type set and cleared by
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 320c6cd19669..db09849f73f8 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -64,22 +64,19 @@
+  /* Set when is using a page as a page table */
+ #define _PGC_page_table   PG_shift(3)
+ #define PGC_page_table    PG_mask(1, 3)
+- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
+-#define PGC_cacheattr_base PG_shift(6)
+-#define PGC_cacheattr_mask PG_mask(7, 6)
+  /* Page is broken? */
+-#define _PGC_broken       PG_shift(7)
+-#define PGC_broken        PG_mask(1, 7)
++#define _PGC_broken       PG_shift(4)
++#define PGC_broken        PG_mask(1, 4)
+  /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
+-#define PGC_state         PG_mask(3, 9)
+-#define PGC_state_inuse   PG_mask(0, 9)
+-#define PGC_state_offlining PG_mask(1, 9)
+-#define PGC_state_offlined PG_mask(2, 9)
+-#define PGC_state_free    PG_mask(3, 9)
++#define PGC_state           PG_mask(3, 6)
++#define PGC_state_inuse     PG_mask(0, 6)
++#define PGC_state_offlining PG_mask(1, 6)
++#define PGC_state_offlined  PG_mask(2, 6)
++#define PGC_state_free      PG_mask(3, 6)
+ #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+ 
+  /* Count of references to this frame. */
+-#define PGC_count_width   PG_shift(9)
++#define PGC_count_width   PG_shift(6)
+ #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
+ 
+ /*
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86: Split cache_flush() out of cache_writeback()
+
+Subsequent changes will want a fully flushing version.
+
+Use the new helper rather than opencoding it in flush_area_local().  This
+resolves an outstanding issue where the conditional sfence is on the wrong
+side of the clflushopt loop.  clflushopt is ordered with respect to older
+stores, not to younger stores.
+
+Rename gnttab_cache_flush()'s helper to avoid colliding in name.
+grant_table.c can see the prototype from cache.h so the build fails
+otherwise.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+Xen 4.16 and earlier:
+ * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
+   syncing" to split cache_writeback() out of the IOMMU logic, but without the
+   associated hooks changes.
+
+diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
+index 03f92c23dcaf..8568491c7ea9 100644
+--- xen/arch/x86/flushtlb.c.orig
++++ xen/arch/x86/flushtlb.c
+@@ -224,7 +224,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+     if ( flags & FLUSH_CACHE )
+     {
+         const struct cpuinfo_x86 *c = &current_cpu_data;
+-        unsigned long i, sz = 0;
++        unsigned long sz = 0;
+ 
+         if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
+             sz = 1UL << (order + PAGE_SHIFT);
+@@ -234,13 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+              c->x86_clflush_size && c->x86_cache_size && sz &&
+              ((sz >> 10) < c->x86_cache_size) )
+         {
+-            alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+-            for ( i = 0; i < sz; i += c->x86_clflush_size )
+-                alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
+-                                  " clflush %0",
+-                                  "data16 clflush %0",      /* clflushopt */
+-                                  X86_FEATURE_CLFLUSHOPT,
+-                                  "m" (((const char *)va)[i]));
++            cache_flush(va, sz);
+             flags &= ~FLUSH_CACHE;
+         }
+         else
+@@ -254,3 +248,77 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ 
+     return flags;
+ }
++
++void cache_flush(const void *addr, unsigned int size)
++{
++    /*
++     * This function may be called before current_cpu_data is established.
++     * Hence a fallback is needed to prevent the loop below becoming infinite.
++     */
++    unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
++    const void *end = addr + size;
++
++    addr -= (unsigned long)addr & (clflush_size - 1);
++    for ( ; addr < end; addr += clflush_size )
++    {
++        /*
++         * Note regarding the "ds" prefix use: it's faster to do a clflush
++         * + prefix than a clflush + nop, and hence the prefix is added instead
++         * of letting the alternative framework fill the gap by appending nops.
++         */
++        alternative_io("ds; clflush %[p]",
++                       "data16 clflush %[p]", /* clflushopt */
++                       X86_FEATURE_CLFLUSHOPT,
++                       /* no outputs */,
++                       [p] "m" (*(const char *)(addr)));
++    }
++
++    alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
++}
++
++void cache_writeback(const void *addr, unsigned int size)
++{
++    unsigned int clflush_size;
++    const void *end = addr + size;
++
++    /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
++    if ( !boot_cpu_has(X86_FEATURE_CLWB) )
++        return cache_flush(addr, size);
++
++    /*
++     * This function may be called before current_cpu_data is established.
++     * Hence a fallback is needed to prevent the loop below becoming infinite.
++     */
++    clflush_size = current_cpu_data.x86_clflush_size ?: 16;
++    addr -= (unsigned long)addr & (clflush_size - 1);
++    for ( ; addr < end; addr += clflush_size )
++    {
++/*
++ * The arguments to a macro must not include preprocessor directives. Doing so
++ * results in undefined behavior, so we have to create some defines here in
++ * order to avoid it.
++ */
++#if defined(HAVE_AS_CLWB)
++# define CLWB_ENCODING "clwb %[p]"
++#elif defined(HAVE_AS_XSAVEOPT)
++# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
++#else
++# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
++#endif
++
++#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
++#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
++# define INPUT BASE_INPUT
++#else
++# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
++#endif
++
++        asm volatile (CLWB_ENCODING :: INPUT(addr));
++
++#undef INPUT
++#undef BASE_INPUT
++#undef CLWB_ENCODING
++    }
++
++    asm volatile ("sfence" ::: "memory");
++}
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index cbb2ce17c001..709509e0fc9e 100644
+--- xen/common/grant_table.c.orig
++++ xen/common/grant_table.c
+@@ -3407,7 +3407,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
+     return 0;
+ }
+ 
+-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
++static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+ {
+     struct domain *d, *owner;
+     struct page_info *page;
+@@ -3501,7 +3501,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
+             return -EFAULT;
+         for ( ; ; )
+         {
+-            int ret = cache_flush(&op, cur_ref);
++            int ret = _cache_flush(&op, cur_ref);
+ 
+             if ( ret < 0 )
+                 return ret;
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index fbe951b2fad0..3defe9677f06 100644
+--- xen/drivers/passthrough/vtd/extern.h.orig
++++ xen/drivers/passthrough/vtd/extern.h
+@@ -77,7 +77,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
+                                           struct pci_dev *pdev,
+                                           u16 did, u16 size, u64 addr);
+ 
+-unsigned int get_cache_line_size(void);
+ void flush_all_cache(void);
+ 
+ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index f051a55764b9..2bf5f02c08de 100644
+--- xen/drivers/passthrough/vtd/iommu.c.orig
++++ xen/drivers/passthrough/vtd/iommu.c
+@@ -31,6 +31,7 @@
+ #include <xen/pci.h>
+ #include <xen/pci_regs.h>
+ #include <xen/keyhandler.h>
++#include <asm/cache.h>
+ #include <asm/msi.h>
+ #include <asm/nops.h>
+ #include <asm/irq.h>
+@@ -201,53 +202,10 @@ static int iommus_incoherent;
+ 
+ static void sync_cache(const void *addr, unsigned int size)
+ {
+-    static unsigned long clflush_size = 0;
+-    const void *end = addr + size;
+-
+     if ( !iommus_incoherent )
+         return;
+ 
+-    if ( clflush_size == 0 )
+-        clflush_size = get_cache_line_size();
+-
+-    addr -= (unsigned long)addr & (clflush_size - 1);
+-    for ( ; addr < end; addr += clflush_size )
+-/*
+- * The arguments to a macro must not include preprocessor directives. Doing so
+- * results in undefined behavior, so we have to create some defines here in
+- * order to avoid it.
+- */
+-#if defined(HAVE_AS_CLWB)
+-# define CLWB_ENCODING "clwb %[p]"
+-#elif defined(HAVE_AS_XSAVEOPT)
+-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+-#else
+-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+-#endif
+-
+-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+-# define INPUT BASE_INPUT
+-#else
+-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+-#endif
+-        /*
+-         * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
+-         * + prefix than a clflush + nop, and hence the prefix is added instead
+-         * of letting the alternative framework fill the gap by appending nops.
+-         */
+-        alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
+-                         "data16 clflush %[p]", /* clflushopt */
+-                         X86_FEATURE_CLFLUSHOPT,
+-                         CLWB_ENCODING,
+-                         X86_FEATURE_CLWB, /* no outputs */,
+-                         INPUT(addr));
+-#undef INPUT
+-#undef BASE_INPUT
+-#undef CLWB_ENCODING
+-
+-    alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
+-                      "sfence", X86_FEATURE_CLWB);
++    cache_writeback(addr, size);
+ }
+ 
+ /* Allocate page table, return its machine address */
+diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
+index 229938f3a812..2a18b76e800d 100644
+--- xen/drivers/passthrough/vtd/x86/vtd.c.orig
++++ xen/drivers/passthrough/vtd/x86/vtd.c
+@@ -46,11 +46,6 @@ void unmap_vtd_domain_page(void *va)
+     unmap_domain_page(va);
+ }
+ 
+-unsigned int get_cache_line_size(void)
+-{
+-    return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+-}
+-
+ void flush_all_cache()
+ {
+     wbinvd();
+diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
+index 1f7173d8c72c..e4770efb22b9 100644
+--- xen/include/asm-x86/cache.h.orig
++++ xen/include/asm-x86/cache.h
+@@ -11,4 +11,11 @@
+ 
+ #define __read_mostly __section(".data.read_mostly")
+ 
++#ifndef __ASSEMBLY__
++
++void cache_flush(const void *addr, unsigned int size);
++void cache_writeback(const void *addr, unsigned int size);
++
++#endif
++
+ #endif
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/amd: Work around CLFLUSH ordering on older parts
+
+On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+including reads and writes to the address, and LFENCE/SFENCE instructions.
+
+This creates a multitude of problematic corner cases, laid out in the manual.
+Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
+index b77fa1929733..aa1b9d0dda6b 100644
+--- xen/arch/x86/cpu/amd.c.orig
++++ xen/arch/x86/cpu/amd.c
+@@ -639,6 +639,14 @@ static void init_amd(struct cpuinfo_x86 *c)
+ 	if (!cpu_has_lfence_dispatch)
+ 		__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+ 
++	/*
++	 * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
++	 * everything, including reads and writes to address, and
++	 * LFENCE/SFENCE instructions.
++	 */
++	if (!cpu_has_clflushopt)
++		setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
++
+ 	switch(c->x86)
+ 	{
+ 	case 0xf ... 0x11:
+diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
+index 8568491c7ea9..6f3f5ab1a3c4 100644
+--- xen/arch/x86/flushtlb.c.orig
++++ xen/arch/x86/flushtlb.c
+@@ -249,6 +249,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+     return flags;
+ }
+ 
++/*
++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
++ * including reads and writes to address, and LFENCE/SFENCE instructions.
++ *
++ * This function only works safely after alternatives have run.  Luckily, at
++ * the time of writing, we don't flush the caches that early.
++ */
+ void cache_flush(const void *addr, unsigned int size)
+ {
+     /*
+@@ -258,6 +265,8 @@ void cache_flush(const void *addr, unsigned int size)
+     unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+     const void *end = addr + size;
+ 
++    alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
++
+     addr -= (unsigned long)addr & (clflush_size - 1);
+     for ( ; addr < end; addr += clflush_size )
+     {
+@@ -273,7 +282,9 @@ void cache_flush(const void *addr, unsigned int size)
+                        [p] "m" (*(const char *)(addr)));
+     }
+ 
+-    alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
++    alternative_2("",
++                  "sfence", X86_FEATURE_CLFLUSHOPT,
++                  "mfence", X86_BUG_CLFLUSH_MFENCE);
+ }
+ 
+ void cache_writeback(const void *addr, unsigned int size)
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index b9d3cac97538..a8222e978cd9 100644
+--- xen/include/asm-x86/cpufeatures.h.orig
++++ xen/include/asm-x86/cpufeatures.h
+@@ -44,6 +44,7 @@ XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
+ #define X86_BUG(x) ((FSCAPINTS + X86_NR_SYNTH) * 32 + (x))
+ 
+ #define X86_BUG_FPU_PTRS          X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
++#define X86_BUG_CLFLUSH_MFENCE    X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
+ 
+ /* Total number of capability words, inc synth and bug words. */
+ #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Track and flush non-coherent mappings of RAM
+
+There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
+devices that make non-coherent writes.  The Linux sound subsystem makes
+extensive use of this technique.
+
+For such usecases, the guest's DMA buffer is mapped and consistently used as
+WC, and Xen doesn't interact with the buffer.
+
+However, a mischievous guest can use WC mappings to deliberately create
+non-coherency between the cache and RAM, and use this to trick Xen into
+validating a pagetable which isn't actually safe.
+
+Allocate a new PGT_non_coherent to track the non-coherency of mappings.  Set
+it whenever a non-coherent writeable mapping is created.  If the page is used
+as anything other than PGT_writable_page, force a cache flush before
+validation.  Also force a cache flush before the page is returned to the heap.
+
+This is CVE-2022-26364, part of XSA-402.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 859646b670a8..f5eeddce5867 100644
+--- xen/arch/x86/mm.c.orig
++++ xen/arch/x86/mm.c
+@@ -1000,6 +1000,15 @@ get_page_from_l1e(
+         return -EACCES;
+     }
+ 
++    /*
++     * Track writeable non-coherent mappings to RAM pages, to trigger a cache
++     * flush later if the target is used as anything but a PGT_writeable page.
++     * We care about all writeable mappings, including foreign mappings.
++     */
++    if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
++         (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
++        set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
++
+     return 0;
+ 
+  could_not_pin:
+@@ -2532,6 +2541,19 @@ static int cleanup_page_mappings(struct page_info *page)
+         }
+     }
+ 
++    /*
++     * Flush the cache if there were previously non-coherent writeable
++     * mappings of this page.  This forces the page to be coherent before it
++     * is freed back to the heap.
++     */
++    if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
++    {
++        void *addr = __map_domain_page(page);
++
++        cache_flush(addr, PAGE_SIZE);
++        unmap_domain_page(addr);
++    }
++
+     return rc;
+ }
+ 
+@@ -3090,6 +3112,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+     if ( unlikely(!(nx & PGT_validated)) )
+     {
+         /*
++         * Flush the cache if there were previously non-coherent mappings of
++         * this page, and we're trying to use it as anything other than a
++         * writeable page.  This forces the page to be coherent before we
++         * validate its contents for safety.
++         */
++        if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
++        {
++            void *addr = __map_domain_page(page);
++
++            cache_flush(addr, PAGE_SIZE);
++            unmap_domain_page(addr);
++
++            page->u.inuse.type_info &= ~PGT_non_coherent;
++        }
++
++        /*
+          * No special validation needed for writable or shared pages.  Page
+          * tables and GDT/LDT need to have their contents audited.
+          *
+diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
+index 0325618c9883..81c72e61ed55 100644
+--- xen/arch/x86/pv/grant_table.c.orig
++++ xen/arch/x86/pv/grant_table.c
+@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
+ 
+     ol1e = *pl1e;
+     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
++    {
++        /*
++         * We always create mappings in this path.  However, our caller,
++         * map_grant_ref(), only passes potentially non-zero cache_flags for
++         * MMIO frames, so this path doesn't create non-coherent mappings of
++         * RAM frames and there's no need to calculate PGT_non_coherent.
++         */
++        ASSERT(!cache_flags || is_iomem_page(frame));
++
+         rc = GNTST_okay;
++    }
+ 
+  out_unlock:
+     page_unlock(page);
+@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
+                  l1e_get_flags(ol1e), addr, grant_pte_flags);
+ 
+     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
++    {
++        /*
++         * Generally, replace_grant_pv_mapping() is used to destroy mappings
++         * (nl1e = l1e_empty()), but it can be a present mapping on the
++         * GNTTABOP_unmap_and_replace path.
++         *
++         * In such cases, the PTE is fully transplanted from its old location
++         * via steal_linear_addr(), so we need not perform PGT_non_coherent
++         * checking here.
++         */
+         rc = GNTST_okay;
++    }
+ 
+  out_unlock:
+     page_unlock(page);
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index db09849f73f8..82d0fd6104a2 100644
+--- xen/include/asm-x86/mm.h.orig
++++ xen/include/asm-x86/mm.h
+@@ -48,8 +48,12 @@
+ #define _PGT_partial      PG_shift(8)
+ #define PGT_partial       PG_mask(1, 8)
+ 
++/* Has this page been mapped writeable with a non-coherent memory type? */
++#define _PGT_non_coherent PG_shift(9)
++#define PGT_non_coherent  PG_mask(1, 9)
++
+  /* Count of uses of this frame as its current type. */
+-#define PGT_count_width   PG_shift(8)
++#define PGT_count_width   PG_shift(9)
+ #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
+ 
+ /* Are the 'type mask' bits identical? */
diff --git a/sysutils/xenkernel413/patches/patch-XSA404 b/sysutils/xenkernel413/patches/patch-XSA404
new file mode 100644
index 00000000000..d48743b1ab3
--- /dev/null
+++ b/sysutils/xenkernel413/patches/patch-XSA404
@@ -0,0 +1,485 @@
+$NetBSD: patch-XSA404,v 1.1 2022/06/24 13:47:37 bouyer Exp $
+
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/spec-ctrl: Make VERW flushing runtime conditional
+
+Currently, VERW flushing to mitigate MDS is boot time conditional per domain
+type.  However, to provide mitigations for DRPW (CVE-2022-21166), we need to
+conditionally use VERW based on the trustworthiness of the guest, and the
+devices passed through.
+
+Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest
+path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags.
+
+Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW
+disposition at domain creation time, and context switch the SCF_verw bit.
+
+For now, VERW flushing is used and controlled exactly as before, but later
+patches will add per-domain cases too.
+
+No change in behaviour.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index eead69ada2c2..e8bdf30fa46c 100644
+--- docs/misc/xen-command-line.pandoc.orig
++++ docs/misc/xen-command-line.pandoc
+@@ -2058,9 +2058,8 @@ in place for guests to use.
+ Use of a positive boolean value for either of these options is invalid.
+ 
+ The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine
+-grained control over the alternative blocks used by Xen.  These impact Xen's
+-ability to protect itself, and Xen's ability to virtualise support for guests
+-to use.
++grained control over the primitives used by Xen.  These impact Xen's ability to
++protect itself, and Xen's ability to virtualise support for guests to use.
+ 
+ * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
+   respectively.
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 820cb0f90558..fe95b25a034e 100644
+--- xen/arch/x86/domain.c.orig
++++ xen/arch/x86/domain.c
+@@ -651,6 +651,8 @@ int arch_domain_create(struct domain *d,
+ 
+     domain_cpu_policy_changed(d);
+ 
++    spec_ctrl_init_domain(d);
++
+     return 0;
+ 
+  fail:
+@@ -1746,14 +1748,15 @@ static void __context_switch(void)
+ void context_switch(struct vcpu *prev, struct vcpu *next)
+ {
+     unsigned int cpu = smp_processor_id();
++    struct cpu_info *info = get_cpu_info();
+     const struct domain *prevd = prev->domain, *nextd = next->domain;
+     unsigned int dirty_cpu = next->dirty_cpu;
+ 
+     ASSERT(prev != next);
+     ASSERT(local_irq_is_enabled());
+ 
+-    get_cpu_info()->use_pv_cr3 = false;
+-    get_cpu_info()->xen_cr3 = 0;
++    info->use_pv_cr3 = false;
++    info->xen_cr3 = 0;
+ 
+     if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN )
+     {
+@@ -1816,6 +1819,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
+                 *last_id = next_id;
+             }
+         }
++
++        /* Update the top-of-stack block with the VERW disposition. */
++        info->spec_ctrl_flags &= ~SCF_verw;
++        if ( nextd->arch.verw )
++            info->spec_ctrl_flags |= SCF_verw;
+     }
+ 
+     sched_context_switched(prev, next);
+diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
+index 27c8c5ca4943..62ed0d854df1 100644
+--- xen/arch/x86/hvm/vmx/entry.S.orig
++++ xen/arch/x86/hvm/vmx/entry.S
+@@ -81,6 +81,7 @@ UNLIKELY_END(realmode)
+ 
+         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+         SPEC_CTRL_EXIT_TO_HVM   /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
++        DO_SPEC_CTRL_COND_VERW
+ 
+         mov  VCPU_hvm_guest_cr2(%rbx),%rax
+ 
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 7447d4a8e5b5..38e1f1098210 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -35,8 +35,8 @@ static bool __initdata opt_msr_sc_pv = true;
+ static bool __initdata opt_msr_sc_hvm = true;
+ static bool __initdata opt_rsb_pv = true;
+ static bool __initdata opt_rsb_hvm = true;
+-static int8_t __initdata opt_md_clear_pv = -1;
+-static int8_t __initdata opt_md_clear_hvm = -1;
++static int8_t __read_mostly opt_md_clear_pv = -1;
++static int8_t __read_mostly opt_md_clear_hvm = -1;
+ 
+ /* Cmdline controls for Xen's speculative settings. */
+ static enum ind_thunk {
+@@ -878,6 +878,13 @@ static __init void mds_calculations(uint64_t caps)
+     }
+ }
+ 
++void spec_ctrl_init_domain(struct domain *d)
++{
++    bool pv = is_pv_domain(d);
++
++    d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
++}
++
+ void __init init_speculation_mitigations(void)
+ {
+     enum ind_thunk thunk = THUNK_DEFAULT;
+@@ -1078,21 +1085,20 @@ void __init init_speculation_mitigations(void)
+                             boot_cpu_has(X86_FEATURE_MD_CLEAR));
+ 
+     /*
+-     * Enable MDS defences as applicable.  The PV blocks need using all the
+-     * time, and the Idle blocks need using if either PV or HVM defences are
+-     * used.
++     * Enable MDS defences as applicable.  The Idle blocks need using if
++     * either PV or HVM defences are used.
+      *
+      * HVM is more complicated.  The MD_CLEAR microcode extends L1D_FLUSH with
+-     * equivelent semantics to avoid needing to perform both flushes on the
+-     * HVM path.  The HVM blocks don't need activating if our hypervisor told
+-     * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves.
++     * equivalent semantics to avoid needing to perform both flushes on the
++     * HVM path.  Therefore, we don't need VERW in addition to L1D_FLUSH.
++     *
++     * After calculating the appropriate idle setting, simplify
++     * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
++     * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+      */
+-    if ( opt_md_clear_pv )
+-        setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV);
+     if ( opt_md_clear_pv || opt_md_clear_hvm )
+         setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+-    if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush )
+-        setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM);
++    opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
+ 
+     /*
+      * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index a8222e978cd9..bcba926bda41 100644
+--- xen/include/asm-x86/cpufeatures.h.orig
++++ xen/include/asm-x86/cpufeatures.h
+@@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM,        X86_SYNTH(19)) /* RSB overwrite needed for HVM
+ XEN_CPUFEATURE(XEN_SELFSNOOP,     X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
+ XEN_CPUFEATURE(SC_MSR_IDLE,       X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
+ XEN_CPUFEATURE(XEN_LBR,           X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
+-XEN_CPUFEATURE(SC_VERW_PV,        X86_SYNTH(23)) /* VERW used by Xen for PV */
+-XEN_CPUFEATURE(SC_VERW_HVM,       X86_SYNTH(24)) /* VERW used by Xen for HVM */
++/* Bits 23,24 unused. */
+ XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
+ 
+ /* Bug words follow the synthetic words. */
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index 309b56e2d6b7..71d1ca243b32 100644
+--- xen/include/asm-x86/domain.h.orig
++++ xen/include/asm-x86/domain.h
+@@ -295,6 +295,9 @@ struct arch_domain
+     uint32_t pci_cf8;
+     uint8_t cmos_idx;
+ 
++    /* Use VERW on return-to-guest for its flushing side effect. */
++    bool verw;
++
+     union {
+         struct pv_domain pv;
+         struct hvm_domain hvm;
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index b252bb863111..157a2c67d89c 100644
+--- xen/include/asm-x86/spec_ctrl.h.orig
++++ xen/include/asm-x86/spec_ctrl.h
+@@ -24,6 +24,7 @@
+ #define SCF_use_shadow (1 << 0)
+ #define SCF_ist_wrmsr  (1 << 1)
+ #define SCF_ist_rsb    (1 << 2)
++#define SCF_verw       (1 << 3)
+ 
+ #ifndef __ASSEMBLY__
+ 
+@@ -32,6 +33,7 @@
+ #include <asm/msr-index.h>
+ 
+ void init_speculation_mitigations(void);
++void spec_ctrl_init_domain(struct domain *d);
+ 
+ extern bool opt_ibpb;
+ extern bool opt_ssbd;
+diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
+index c60093b090b5..4a3777cc5227 100644
+--- xen/include/asm-x86/spec_ctrl_asm.h.orig
++++ xen/include/asm-x86/spec_ctrl_asm.h
+@@ -141,6 +141,19 @@
+     wrmsr
+ .endm
+ 
++.macro DO_SPEC_CTRL_COND_VERW
++/*
++ * Requires %rsp=cpuinfo
++ *
++ * Issue a VERW for its flushing side effect, if indicated.  This is a Spectre
++ * v1 gadget, but the IRET/VMEntry is serialising.
++ */
++    testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp)
++    jz .L\@_verw_skip
++    verw CPUINFO_verw_sel(%rsp)
++.L\@_verw_skip:
++.endm
++
+ .macro DO_SPEC_CTRL_ENTRY maybexen:req
+ /*
+  * Requires %rsp=regs (also cpuinfo if !maybexen)
+@@ -242,15 +255,12 @@
+ #define SPEC_CTRL_EXIT_TO_PV                                            \
+     ALTERNATIVE "",                                                     \
+         DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV;              \
+-    ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)),           \
+-        X86_FEATURE_SC_VERW_PV
++    DO_SPEC_CTRL_COND_VERW
+ 
+ /* Use when exiting to HVM guest context. */
+ #define SPEC_CTRL_EXIT_TO_HVM                                           \
+     ALTERNATIVE "",                                                     \
+         DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_HVM;             \
+-    ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)),           \
+-        X86_FEATURE_SC_VERW_HVM
+ 
+ /*
+  * Use in IST interrupt/exception context.  May interrupt Xen or PV context.
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/spec-ctrl: Enumeration for MMIO Stale Data controls
+
+The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP
+data movement primitives.
+
+FB_CLEAR indicates that the VERW instruction has re-gained its Fill Buffer
+flushing side effect.  This is only enumerated on parts where VERW had
+previously lost its flushing side effect due to the MDS/TAA vulnerabilities
+being fixed in hardware.
+
+FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer
+clearing side effect of VERW can be turned off for performance reasons.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 38e1f1098210..fd36927ba1cb 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -318,7 +318,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+     printk("Speculative mitigation facilities:\n");
+ 
+     /* Hardware features which pertain to speculative mitigations. */
+-    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
++    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
+            (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP"     : "",
+            (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
+@@ -333,7 +333,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+            (caps & ARCH_CAPS_SSB_NO)                ? " SSB_NO"    : "",
+            (caps & ARCH_CAPS_MDS_NO)                ? " MDS_NO"    : "",
+            (caps & ARCH_CAPS_TSX_CTRL)              ? " TSX_CTRL"  : "",
+-           (caps & ARCH_CAPS_TAA_NO)                ? " TAA_NO"    : "");
++           (caps & ARCH_CAPS_TAA_NO)                ? " TAA_NO"    : "",
++           (caps & ARCH_CAPS_SBDR_SSDP_NO)          ? " SBDR_SSDP_NO" : "",
++           (caps & ARCH_CAPS_FBSDP_NO)              ? " FBSDP_NO"  : "",
++           (caps & ARCH_CAPS_PSDP_NO)               ? " PSDP_NO"   : "",
++           (caps & ARCH_CAPS_FB_CLEAR)              ? " FB_CLEAR"  : "",
++           (caps & ARCH_CAPS_FB_CLEAR_CTRL)         ? " FB_CLEAR_CTRL" : "");
+ 
+     /* Compiled-in support which pertains to mitigations. */
+     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index ba9e90af210b..2a80660d849d 100644
+--- xen/include/asm-x86/msr-index.h.orig
++++ xen/include/asm-x86/msr-index.h
+@@ -55,6 +55,11 @@
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO	(_AC(1, ULL) << 6)
+ #define ARCH_CAPS_TSX_CTRL		(_AC(1, ULL) << 7)
+ #define ARCH_CAPS_TAA_NO		(_AC(1, ULL) << 8)
++#define ARCH_CAPS_SBDR_SSDP_NO		(_AC(1, ULL) << 13)
++#define ARCH_CAPS_FBSDP_NO		(_AC(1, ULL) << 14)
++#define ARCH_CAPS_PSDP_NO		(_AC(1, ULL) << 15)
++#define ARCH_CAPS_FB_CLEAR		(_AC(1, ULL) << 17)
++#define ARCH_CAPS_FB_CLEAR_CTRL		(_AC(1, ULL) << 18)
+ 
+ #define MSR_FLUSH_CMD			0x0000010b
+ #define FLUSH_CMD_L1D			(_AC(1, ULL) << 0)
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/spec-ctrl: Add spec-ctrl=unpriv-mmio
+
+Per Xen's support statement, PCI passthrough should be to trusted domains
+because the overall system security depends on factors outside of Xen's
+control.
+
+As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR.
+
+However, users who have risk assessed their configuration may be happy with
+the risk of DoS, but unhappy with the risk of cross-domain data leakage.  Such
+users should enable this option.
+
+On CPUs vulnerable to MDS, the existing mitigations are the best we can do to
+mitigate MMIO cross-domain data leakage.
+
+On CPUs fixed to MDS but vulnerable to MMIO stale data leakage, this option:
+
+ * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage
+   using FB_CLEAR.
+ * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the
+   srb-lock, previously used to mitigate SRBDS.
+
+Both mitigations require microcode from IPU 2022.1, May 2022.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+---
+Backporting note: For Xen 4.7 and earlier with bool_t not aliasing bool, the
+ARCH_CAPS_FB_CLEAR hunk needs !!
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index e8bdf30fa46c..022cb01da762 100644
+--- docs/misc/xen-command-line.pandoc.orig
++++ docs/misc/xen-command-line.pandoc
+@@ -2035,7 +2035,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ ### spec-ctrl (x86)
+ > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
+ >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
+->              l1d-flush,branch-harden,srb-lock}=<bool> ]`
++>              l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
+ 
+ Controls for speculative execution sidechannel mitigations.  By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2114,8 +2114,16 @@ Xen will enable this mitigation.
+ On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force
+ or prevent Xen from protect the Special Register Buffer from leaking stale
+ data. By default, Xen will enable this mitigation, except on parts where MDS
+-is fixed and TAA is fixed/mitigated (in which case, there is believed to be no
+-way for an attacker to obtain the stale data).
++is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO
++mappings (in which case, there is believed to be no way for an attacker to
++obtain stale data).
++
++The `unpriv-mmio=` boolean indicates whether the system has (or will have)
++less than fully privileged domains granted access to MMIO devices.  By
++default, this option is disabled.  If enabled, Xen will use the `FB_CLEAR`
++and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode
++release to mitigate cross-domain leakage of data via the MMIO Stale Data
++vulnerabilities.
+ 
+ ### sync_console
+ > `= <boolean>`
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index fd36927ba1cb..d4ba9412067b 100644
+--- xen/arch/x86/spec_ctrl.c.orig
++++ xen/arch/x86/spec_ctrl.c
+@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination.
+ 
+ static int8_t __initdata opt_srb_lock = -1;
+ uint64_t __read_mostly default_xen_mcu_opt_ctrl;
++static bool __initdata opt_unpriv_mmio;
++static bool __read_mostly opt_fb_clear_mmio;
+ 
+ static int __init parse_spec_ctrl(const char *s)
+ {
+@@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s)
+             opt_branch_harden = val;
+         else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
+             opt_srb_lock = val;
++        else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
++            opt_unpriv_mmio = val;
+         else
+             rc = -EINVAL;
+ 
+@@ -367,7 +371,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+            opt_srb_lock                              ? " SRB_LOCK+" : " SRB_LOCK-",
+            opt_ibpb                                  ? " IBPB"  : "",
+            opt_l1d_flush                             ? " L1D_FLUSH" : "",
+-           opt_md_clear_pv || opt_md_clear_hvm       ? " VERW"  : "",
++           opt_md_clear_pv || opt_md_clear_hvm ||
++           opt_fb_clear_mmio                         ? " VERW"  : "",
+            opt_branch_harden                         ? " BRANCH_HARDEN" : "");
+ 
+     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+@@ -887,7 +892,9 @@ void spec_ctrl_init_domain(struct domain *d)
+ {
+     bool pv = is_pv_domain(d);
+ 
+-    d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
++    d->arch.verw =
++        (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
++        (opt_fb_clear_mmio && is_iommu_enabled(d));
+ }
+ 
+ void __init init_speculation_mitigations(void)
+@@ -1078,6 +1085,18 @@ void __init init_speculation_mitigations(void)
+     mds_calculations(caps);
+ 
+     /*
++     * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
++     * reintroduced the VERW fill buffer flushing side effect because of a
++     * susceptibility to FBSDP.
++     *
++     * If unprivileged guests have (or will have) MMIO mappings, we can
++     * mitigate cross-domain leakage of fill buffer data by issuing VERW on
++     * the return-to-guest path.
++     */
++    if ( opt_unpriv_mmio )
++        opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR;
++
++    /*
+      * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
+      * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
+      * but it is somewhat better than nothing.
+@@ -1090,18 +1109,20 @@ void __init init_speculation_mitigations(void)
+                             boot_cpu_has(X86_FEATURE_MD_CLEAR));
+ 
+     /*
+-     * Enable MDS defences as applicable.  The Idle blocks need using if
+-     * either PV or HVM defences are used.
++     * Enable MDS/MMIO defences as applicable.  The Idle blocks need using if
++     * either the PV or HVM MDS defences are used, or if we may give MMIO
++     * access to untrusted guests.
+      *
+      * HVM is more complicated.  The MD_CLEAR microcode extends L1D_FLUSH with
+      * equivalent semantics to avoid needing to perform both flushes on the
+-     * HVM path.  Therefore, we don't need VERW in addition to L1D_FLUSH.
++     * HVM path.  Therefore, we don't need VERW in addition to L1D_FLUSH (for
++     * MDS mitigations.  L1D_FLUSH is not safe for MMIO mitigations.)
+      *
+      * After calculating the appropriate idle setting, simplify
+      * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
+      * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+      */
+-    if ( opt_md_clear_pv || opt_md_clear_hvm )
++    if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
+         setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+     opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
+ 
+@@ -1170,12 +1191,18 @@ void __init init_speculation_mitigations(void)
+          * On some SRBDS-affected hardware, it may be safe to relax srb-lock
+          * by default.
+          *
+-         * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only way
+-         * to access the Fill Buffer.  If TSX isn't available (inc. SKU
+-         * reasons on some models), or TSX is explicitly disabled, then there
+-         * is no need for the extra overhead to protect RDRAND/RDSEED.
++         * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale
++         * RNG data becomes available to other contexts.  To recover the data,
++         * an attacker needs to use:
++         *  - SBDS (MDS or TAA to sample the core's fill buffer)
++         *  - SBDR (Architecturally retrieve stale transaction buffer contents)
++         *  - DRPW (Architecturally latch stale fill buffer data)
++         *
++         * On MDS_NO parts, and with TAA_NO or TSX unavailable/disabled, and
++         * there is no unprivileged MMIO access, the RNG data doesn't need
++         * protecting.
+          */
+-        if ( opt_srb_lock == -1 &&
++        if ( opt_srb_lock == -1 && !opt_unpriv_mmio &&
+              (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO &&
+              (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && opt_tsx == 0)) )
+             opt_srb_lock = 0;
author	bouyer <bouyer@pkgsrc.org>	2022-06-24 13:47:37 +0000
committer	bouyer <bouyer@pkgsrc.org>	2022-06-24 13:47:37 +0000
commit	d6fc92c6e901bd545b6781a01bc993ffc8f7de00 (patch)
tree	37844950d1050816f4f88c0c700fffc5b2b22d92 /sysutils
parent	fd67f46ab80259714e4918f216a10835fdd61b8e (diff)
download	pkgsrc-d6fc92c6e901bd545b6781a01bc993ffc8f7de00.tar.gz