$NetBSD: patch-XSA402,v 1.1 2022/06/24 13:47:37 bouyer Exp $

From: Andrew Cooper
Subject: x86/page: Introduce _PAGE_* constants for memory types

... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
constants.  These are going to be needed by forthcoming logic.

No functional change.

This is part of XSA-402.

Signed-off-by: Andrew Cooper
Reviewed-by: Jan Beulich

diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c073..7269ae89b880 100644
--- xen/include/asm-x86/page.h.orig
+++ xen/include/asm-x86/page.h
@@ -320,6 +320,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
 
 #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
 
+/* Memory types, encoded under Xen's choice of MSR_PAT. */
+#define _PAGE_WB         (                                0)
+#define _PAGE_WT         (                        _PAGE_PWT)
+#define _PAGE_UCM        (            _PAGE_PCD            )
+#define _PAGE_UC         (            _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_WC         (_PAGE_PAT                        )
+#define _PAGE_WP         (_PAGE_PAT |             _PAGE_PWT)
+
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
  * WARNING: This will need to be disabled to run OSes that use the spare PTE
@@ -338,8 +346,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
 
 #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
 #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
                                    _PAGE_DIRTY | _PAGE_RW)
-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
-#define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
+#define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_UC)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */

From: Andrew Cooper
Subject: x86: Don't change the cacheability of the directmap

Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
in response to guest mapping requests") attempted to keep the cacheability
consistent between different mappings of the same page.

The reason wasn't described in the changelog, but it is understood to be in
regards to a concern over machine check exceptions, owing to errata when using
mixed cacheabilities.  It did this primarily by updating Xen's mapping of the
page in the direct map when the guest mapped a page with reduced cacheability.

Unfortunately, the logic didn't actually prevent mixed cacheability from
occurring:
 * A guest could map a page normally, and then map the same page with
   different cacheability; nothing prevented this.
 * The cacheability of the directmap was always latest-takes-precedence in
   terms of guest requests.
 * Grant-mapped frames with lesser cacheability didn't adjust the page's
   cacheattr settings.
 * The map_domain_page() function still unconditionally created WB mappings,
   irrespective of the page's cacheattr settings.

Additionally, update_xen_mappings() had a bug where the alias calculation was
wrong for mfn's which were .init content, which should have been treated as
fully guest pages, not Xen pages.

Worse yet, the logic introduced a vulnerability whereby necessary
pagetable/segdesc adjustments made by Xen in the validation logic could become
non-coherent between the cache and main memory.  The CPU could subsequently
operate on the stale value in the cache, rather than the safe value in main
memory.

The directmap contains primarily mappings of RAM.  PAT/MTRR conflict
resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
cacheability resolves to being coherent.
The special case is WC mappings, which are non-coherent against MTRR=WB
regions (except for fully-coherent CPUs).

Xen must not have any WC cacheability in the directmap, to prevent Xen's
actions from creating non-coherency.  (Guest actions creating non-coherency
is dealt with in subsequent patches.)  As all memory types for MTRR=WB
ranges inter-operate coherently, leave Xen's directmap mappings as WB.

Only PV guests with access to devices can use reduced-cacheability mappings
to begin with, and they're trusted not to mount DoSs against the system
anyway.

Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
Shift the later PGC_* constants up, to gain 3 extra bits in the main
reference count.  Retain the check in get_page_from_l1e() for
special_pages() because a guest has no business using reduced cacheability
on these.

This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0

This is CVE-2022-26363, part of XSA-402.

Signed-off-by: Andrew Cooper
Reviewed-by: George Dunlap

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ee91c7fe5f69..859646b670a8 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -786,24 +786,6 @@ bool is_iomem_page(mfn_t mfn)
     return (page_get_owner(page) == dom_io);
 }
 
-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
-{
-    int err = 0;
-    bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
-         mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
-    unsigned long xen_va =
-        XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
-
-    if ( unlikely(alias) && cacheattr )
-        err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
-    if ( !err )
-        err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
-                     PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
-    if ( unlikely(alias) && !cacheattr && !err )
-        err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
-
-    return err;
-}
-
 #ifndef NDEBUG
 struct mmio_emul_range_ctxt {
     const struct domain *d;
@@ -1008,47 +990,14 @@ get_page_from_l1e(
         goto could_not_pin;
     }
 
-    if ( pte_flags_to_cacheattr(l1f) !=
-         ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
+    if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_xen_heap_page(page) )
     {
-        unsigned long x, nx, y = page->count_info;
-        unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
-        int err;
-
-        if ( is_xen_heap_page(page) )
-        {
-            if ( write )
-                put_page_type(page);
-            put_page(page);
-            gdprintk(XENLOG_WARNING,
-                     "Attempt to change cache attributes of Xen heap page\n");
-            return -EACCES;
-        }
-
-        do {
-            x  = y;
-            nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
-        } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
-        err = update_xen_mappings(mfn, cacheattr);
-        if ( unlikely(err) )
-        {
-            cacheattr = y & PGC_cacheattr_mask;
-            do {
-                x  = y;
-                nx = (x & ~PGC_cacheattr_mask) | cacheattr;
-            } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
-            if ( write )
-                put_page_type(page);
-            put_page(page);
-
-            gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
-                     " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
-                     mfn, get_gpfn_from_mfn(mfn),
-                     l1e_get_intpte(l1e), l1e_owner->domain_id);
-            return err;
-        }
+        if ( write )
+            put_page_type(page);
+        put_page(page);
+        gdprintk(XENLOG_WARNING,
+                 "Attempt to change cache attributes of Xen heap page\n");
+        return -EACCES;
     }
 
     return 0;
@@ -2541,25 +2490,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
  */
 static int cleanup_page_mappings(struct page_info *page)
 {
-    unsigned int cacheattr =
-        (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
     int rc = 0;
     unsigned long mfn = mfn_x(page_to_mfn(page));
 
     /*
-     * If we've modified xen mappings as a result of guest cache
-     * attributes, restore them to the "normal" state.
-     */
-    if ( unlikely(cacheattr) )
-    {
-        page->count_info &= ~PGC_cacheattr_mask;
-
-        BUG_ON(is_xen_heap_page(page));
-
-        rc = update_xen_mappings(mfn, 0);
-    }
-
-    /*
      * If this may be in a PV domain's IOMMU, remove it.
      *
      * NB that writable xenheap pages have their type set and cleared by
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 320c6cd19669..db09849f73f8 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -64,22 +64,19 @@
  /* Set when is using a page as a page table */
 #define _PGC_page_table   PG_shift(3)
 #define PGC_page_table    PG_mask(1, 3)
- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
-#define PGC_cacheattr_base PG_shift(6)
-#define PGC_cacheattr_mask PG_mask(7, 6)
 /* Page is broken? */
-#define _PGC_broken       PG_shift(7)
-#define PGC_broken        PG_mask(1, 7)
+#define _PGC_broken       PG_shift(4)
+#define PGC_broken        PG_mask(1, 4)
 /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
-#define PGC_state         PG_mask(3, 9)
-#define PGC_state_inuse   PG_mask(0, 9)
-#define PGC_state_offlining PG_mask(1, 9)
-#define PGC_state_offlined PG_mask(2, 9)
-#define PGC_state_free    PG_mask(3, 9)
+#define PGC_state           PG_mask(3, 6)
+#define PGC_state_inuse     PG_mask(0, 6)
+#define PGC_state_offlining PG_mask(1, 6)
+#define PGC_state_offlined  PG_mask(2, 6)
+#define PGC_state_free      PG_mask(3, 6)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
 
 /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(9)
+#define PGC_count_width   PG_shift(6)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)

From: Andrew Cooper
Subject: x86: Split cache_flush() out of cache_writeback()

Subsequent changes will want a fully flushing version.

Use the new helper rather than opencoding it in flush_area_local().  This
resolves an outstanding issue where the conditional sfence is on the wrong
side of the clflushopt loop.  clflushopt is ordered with respect to older
stores, not to younger stores.

Rename gnttab_cache_flush()'s helper to avoid colliding in name.
grant_table.c can see the prototype from cache.h so the build fails
otherwise.

This is part of XSA-402.

Signed-off-by: Andrew Cooper
Reviewed-by: Jan Beulich

Xen 4.16 and earlier:
 * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
   syncing" to split cache_writeback() out of the IOMMU logic, but without
   the associated hooks changes.

diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 03f92c23dcaf..8568491c7ea9 100644
--- xen/arch/x86/flushtlb.c.orig
+++ xen/arch/x86/flushtlb.c
@@ -224,7 +224,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
     if ( flags & FLUSH_CACHE )
     {
         const struct cpuinfo_x86 *c = &current_cpu_data;
-        unsigned long i, sz = 0;
+        unsigned long sz = 0;
 
         if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
             sz = 1UL << (order + PAGE_SHIFT);
@@ -234,13 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
              c->x86_clflush_size && c->x86_cache_size && sz &&
              ((sz >> 10) < c->x86_cache_size) )
         {
-            alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-            for ( i = 0; i < sz; i += c->x86_clflush_size )
-                alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
-                                  " clflush %0",
-                                  "data16 clflush %0", /* clflushopt */
-                                  X86_FEATURE_CLFLUSHOPT,
-                                  "m" (((const char *)va)[i]));
+            cache_flush(va, sz);
             flags &= ~FLUSH_CACHE;
         }
         else
@@ -254,3 +248,77 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
 
     return flags;
 }
+
+void cache_flush(const void *addr, unsigned int size)
+{
+    /*
+     * This function may be called before current_cpu_data is established.
+     * Hence a fallback is needed to prevent the loop below becoming infinite.
+     */
+    unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+    const void *end = addr + size;
+
+    addr -= (unsigned long)addr & (clflush_size - 1);
+    for ( ; addr < end; addr += clflush_size )
+    {
+        /*
+         * Note regarding the "ds" prefix use: it's faster to do a clflush
+         * + prefix than a clflush + nop, and hence the prefix is added instead
+         * of letting the alternative framework fill the gap by appending nops.
+         */
+        alternative_io("ds; clflush %[p]",
+                       "data16 clflush %[p]", /* clflushopt */
+                       X86_FEATURE_CLFLUSHOPT,
+                       /* no outputs */,
+                       [p] "m" (*(const char *)(addr)));
+    }
+
+    alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+}
+
+void cache_writeback(const void *addr, unsigned int size)
+{
+    unsigned int clflush_size;
+    const void *end = addr + size;
+
+    /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
+    if ( !boot_cpu_has(X86_FEATURE_CLWB) )
+        return cache_flush(addr, size);
+
+    /*
+     * This function may be called before current_cpu_data is established.
+     * Hence a fallback is needed to prevent the loop below becoming infinite.
+     */
+    clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+    addr -= (unsigned long)addr & (clflush_size - 1);
+    for ( ; addr < end; addr += clflush_size )
+    {
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+
+        asm volatile (CLWB_ENCODING :: INPUT(addr));
+
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+    }
+
+    asm volatile ("sfence" ::: "memory");
+}
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index cbb2ce17c001..709509e0fc9e 100644
--- xen/common/grant_table.c.orig
+++ xen/common/grant_table.c
@@ -3407,7 +3407,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
     return 0;
 }
 
-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
 {
     struct domain *d, *owner;
     struct page_info *page;
@@ -3501,7 +3501,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
             return -EFAULT;
         for ( ; ; )
         {
-            int ret = cache_flush(&op, cur_ref);
+            int ret = _cache_flush(&op, cur_ref);
 
             if ( ret < 0 )
                 return ret;
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index fbe951b2fad0..3defe9677f06 100644
--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -77,7 +77,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
                                           struct pci_dev *pdev,
                                           u16 did, u16 size, u64 addr);
 
-unsigned int get_cache_line_size(void);
 void flush_all_cache(void);
 
 uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index f051a55764b9..2bf5f02c08de 100644
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
 #include <xen/pci.h>
 #include <xen/pci_regs.h>
 #include <xen/keyhandler.h>
+#include <asm/cache.h>
 #include <asm/msi.h>
 #include <asm/nops.h>
 #include <asm/irq.h>
@@ -201,53 +202,10 @@ static int iommus_incoherent;
 
 static void sync_cache(const void *addr, unsigned int size)
 {
-    static unsigned long clflush_size = 0;
-    const void *end = addr + size;
-
     if ( !iommus_incoherent )
         return;
 
-    if ( clflush_size == 0 )
-        clflush_size = get_cache_line_size();
-
-    addr -= (unsigned long)addr & (clflush_size - 1);
-    for ( ; addr < end; addr += clflush_size )
-/*
- * The arguments to a macro must not include preprocessor directives. Doing so
- * results in undefined behavior, so we have to create some defines here in
- * order to avoid it.
- */
-#if defined(HAVE_AS_CLWB)
-# define CLWB_ENCODING "clwb %[p]"
-#elif defined(HAVE_AS_XSAVEOPT)
-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-#else
-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-#endif
-
-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-# define INPUT BASE_INPUT
-#else
-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-#endif
-        /*
-         * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
-         * + prefix than a clflush + nop, and hence the prefix is added instead
-         * of letting the alternative framework fill the gap by appending nops.
-         */
-        alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
-                         "data16 clflush %[p]", /* clflushopt */
-                         X86_FEATURE_CLFLUSHOPT,
-                         CLWB_ENCODING,
-                         X86_FEATURE_CLWB, /* no outputs */,
-                         INPUT(addr));
-#undef INPUT
-#undef BASE_INPUT
-#undef CLWB_ENCODING
-
-    alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
-                      "sfence", X86_FEATURE_CLWB);
+    cache_writeback(addr, size);
 }
 
 /* Allocate page table, return its machine address */
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 229938f3a812..2a18b76e800d 100644
--- xen/drivers/passthrough/vtd/x86/vtd.c.orig
+++ xen/drivers/passthrough/vtd/x86/vtd.c
@@ -46,11 +46,6 @@ void unmap_vtd_domain_page(void *va)
     unmap_domain_page(va);
 }
 
-unsigned int get_cache_line_size(void)
-{
-    return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
-}
-
 void flush_all_cache()
 {
     wbinvd();
diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
index 1f7173d8c72c..e4770efb22b9 100644
--- xen/include/asm-x86/cache.h.orig
+++ xen/include/asm-x86/cache.h
@@ -11,4 +11,11 @@
 
 #define __read_mostly __section(".data.read_mostly")
 
+#ifndef __ASSEMBLY__
+
+void cache_flush(const void *addr, unsigned int size);
+void cache_writeback(const void *addr, unsigned int size);
+
+#endif
+
 #endif

From: Andrew Cooper
Subject: x86/amd: Work around CLFLUSH ordering on older parts

On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
including reads and writes to the address, and LFENCE/SFENCE instructions.

This creates a multitude of problematic corner cases, laid out in the manual.
Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.

This is part of XSA-402.

Signed-off-by: Andrew Cooper
Reviewed-by: Jan Beulich

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index b77fa1929733..aa1b9d0dda6b 100644
--- xen/arch/x86/cpu/amd.c.orig
+++ xen/arch/x86/cpu/amd.c
@@ -639,6 +639,14 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has_lfence_dispatch)
 		__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
 
+	/*
+	 * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
+	 * everything, including reads and writes to address, and
+	 * LFENCE/SFENCE instructions.
+	 */
+	if (!cpu_has_clflushopt)
+		setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
+
 	switch(c->x86)
 	{
 	case 0xf ... 0x11:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 8568491c7ea9..6f3f5ab1a3c4 100644
--- xen/arch/x86/flushtlb.c.orig
+++ xen/arch/x86/flushtlb.c
@@ -249,6 +249,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
     return flags;
 }
 
+/*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+ * including reads and writes to address, and LFENCE/SFENCE instructions.
+ *
+ * This function only works safely after alternatives have run.  Luckily, at
+ * the time of writing, we don't flush the caches that early.
+ */
 void cache_flush(const void *addr, unsigned int size)
 {
     /*
@@ -258,6 +265,8 @@ void cache_flush(const void *addr, unsigned int size)
     unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
     const void *end = addr + size;
 
+    alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
+
     addr -= (unsigned long)addr & (clflush_size - 1);
     for ( ; addr < end; addr += clflush_size )
     {
@@ -273,7 +282,9 @@ void cache_flush(const void *addr, unsigned int size)
                        [p] "m" (*(const char *)(addr)));
     }
 
-    alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+    alternative_2("",
+                  "sfence", X86_FEATURE_CLFLUSHOPT,
+                  "mfence", X86_BUG_CLFLUSH_MFENCE);
 }
 
 void cache_writeback(const void *addr, unsigned int size)
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index b9d3cac97538..a8222e978cd9 100644
--- xen/include/asm-x86/cpufeatures.h.orig
+++ xen/include/asm-x86/cpufeatures.h
@@ -44,6 +44,7 @@ XEN_CPUFEATURE(SC_VERW_IDLE,      X86_SYNTH(25)) /* VERW used by Xen for idle */
 #define X86_BUG(x) ((FSCAPINTS + X86_NR_SYNTH) * 32 + (x))
 
 #define X86_BUG_FPU_PTRS          X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
+#define X86_BUG_CLFLUSH_MFENCE    X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
 
 /* Total number of capability words, inc synth and bug words. */
 #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */

From: Andrew Cooper
Subject: x86/pv: Track and flush non-coherent mappings of RAM

There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
devices that make non-coherent writes.  The Linux sound subsystem makes
extensive use of this technique.

For such usecases, the guest's DMA buffer is mapped and consistently used as
WC, and Xen doesn't interact with the buffer.

However, a mischievous guest can use WC mappings to deliberately create
non-coherency between the cache and RAM, and use this to trick Xen into
validating a pagetable which isn't actually safe.

Allocate a new PGT_non_coherent to track the non-coherency of mappings.  Set
it whenever a non-coherent writeable mapping is created.  If the page is used
as anything other than PGT_writable_page, force a cache flush before
validation.  Also force a cache flush before the page is returned to the
heap.

This is CVE-2022-26364, part of XSA-402.

Reported-by: Jann Horn
Signed-off-by: Andrew Cooper
Reviewed-by: George Dunlap
Reviewed-by: Jan Beulich

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 859646b670a8..f5eeddce5867 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -1000,6 +1000,15 @@ get_page_from_l1e(
         return -EACCES;
     }
 
+    /*
+     * Track writeable non-coherent mappings to RAM pages, to trigger a cache
+     * flush later if the target is used as anything but a PGT_writable page.
+     * We care about all writeable mappings, including foreign mappings.
+     */
+    if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
+         (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
+        set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
+
     return 0;
 
  could_not_pin:
@@ -2532,6 +2541,19 @@ static int cleanup_page_mappings(struct page_info *page)
         }
     }
 
+    /*
+     * Flush the cache if there were previously non-coherent writeable
+     * mappings of this page.  This forces the page to be coherent before it
+     * is freed back to the heap.
+     */
+    if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
+    {
+        void *addr = __map_domain_page(page);
+
+        cache_flush(addr, PAGE_SIZE);
+        unmap_domain_page(addr);
+    }
+
     return rc;
 }
 
@@ -3090,6 +3112,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
     if ( unlikely(!(nx & PGT_validated)) )
     {
         /*
+         * Flush the cache if there were previously non-coherent mappings of
+         * this page, and we're trying to use it as anything other than a
+         * writeable page.  This forces the page to be coherent before we
+         * validate its contents for safety.
+         */
+        if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
+        {
+            void *addr = __map_domain_page(page);
+
+            cache_flush(addr, PAGE_SIZE);
+            unmap_domain_page(addr);
+
+            page->u.inuse.type_info &= ~PGT_non_coherent;
+        }
+
+        /*
          * No special validation needed for writable or shared pages.  Page
          * tables and GDT/LDT need to have their contents audited.
          *
diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
index 0325618c9883..81c72e61ed55 100644
--- xen/arch/x86/pv/grant_table.c.orig
+++ xen/arch/x86/pv/grant_table.c
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
 
     ol1e = *pl1e;
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+    {
+        /*
+         * We always create mappings in this path.  However, our caller,
+         * map_grant_ref(), only passes potentially non-zero cache_flags for
+         * MMIO frames, so this path doesn't create non-coherent mappings of
+         * RAM frames and there's no need to calculate PGT_non_coherent.
+         */
+        ASSERT(!cache_flags || is_iomem_page(frame));
+
         rc = GNTST_okay;
+    }
 
  out_unlock:
     page_unlock(page);
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
                  l1e_get_flags(ol1e), addr, grant_pte_flags);
 
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+    {
+        /*
+         * Generally, replace_grant_pv_mapping() is used to destroy mappings
+         * (nl1e = l1e_empty()), but it can be a present mapping on the
+         * GNTTABOP_unmap_and_replace path.
+         *
+         * In such cases, the PTE is fully transplanted from its old location
+         * via steal_linear_addr(), so we need not perform PGT_non_coherent
+         * checking here.
+         */
         rc = GNTST_okay;
+    }
 
  out_unlock:
     page_unlock(page);
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index db09849f73f8..82d0fd6104a2 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -48,8 +48,12 @@
 #define _PGT_partial      PG_shift(8)
 #define PGT_partial       PG_mask(1, 8)
 
+/* Has this page been mapped writeable with a non-coherent memory type? */
+#define _PGT_non_coherent PG_shift(9)
+#define PGT_non_coherent  PG_mask(1, 9)
+
 /* Count of uses of this frame as its current type. */
-#define PGT_count_width   PG_shift(8)
+#define PGT_count_width   PG_shift(9)
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)