$NetBSD: patch-XSA402,v 1.1 2022/06/24 13:47:37 bouyer Exp $
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/page: Introduce _PAGE_* constants for memory types
... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
constants. These are going to be needed by forthcoming logic.
No functional change.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
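As a quick editorial illustration (not part of the patch itself): a small-page PTE selects PAT entry (PAT<<2 | PCD<<1 | PWT), and the constants added below name the memory types that Xen's choice of MSR_PAT places at indexes 0-5. A minimal standalone sketch, using the architectural 4K-PTE bit positions:

#include <stdio.h>

/* x86 PTE attribute bits (architectural positions for 4K mappings). */
#define _PAGE_PWT 0x008
#define _PAGE_PCD 0x010
#define _PAGE_PAT 0x080

/* The PAT index selected by a small-page PTE is PAT:PCD:PWT. */
static unsigned int pat_index(unsigned long flags)
{
    return (!!(flags & _PAGE_PAT) << 2) |
           (!!(flags & _PAGE_PCD) << 1) |
            !!(flags & _PAGE_PWT);
}

int main(void)
{
    /* Under the layout assumed by the new constants: 0=WB, 1=WT, 2=UC-,
     * 3=UC, 4=WC, 5=WP.  Indexes 6 and 7 are not named by this patch. */
    printf("WB=%u WT=%u UC-=%u UC=%u WC=%u WP=%u\n",
           pat_index(0),
           pat_index(_PAGE_PWT),
           pat_index(_PAGE_PCD),
           pat_index(_PAGE_PCD | _PAGE_PWT),
           pat_index(_PAGE_PAT),
           pat_index(_PAGE_PAT | _PAGE_PWT));
    return 0;
}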
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c073..7269ae89b880 100644
--- xen/include/asm-x86/page.h.orig
+++ xen/include/asm-x86/page.h
@@ -320,6 +320,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+/* Memory types, encoded under Xen's choice of MSR_PAT. */
+#define _PAGE_WB ( 0)
+#define _PAGE_WT ( _PAGE_PWT)
+#define _PAGE_UCM ( _PAGE_PCD )
+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_WC (_PAGE_PAT )
+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
+
/*
* Debug option: Ensure that granted mappings are not implicitly unmapped.
* WARNING: This will need to be disabled to run OSes that use the spare PTE
@@ -338,8 +346,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
#define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
#define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
_PAGE_DIRTY | _PAGE_RW)
-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
#define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Don't change the cacheability of the directmap
Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
in response to guest mapping requests") attempted to keep the cacheability
consistent between different mappings of the same page.
The reason wasn't described in the changelog, but it is understood to be in
regard to a concern over machine check exceptions, owing to errata when using
mixed cacheabilities. It did this primarily by updating Xen's mapping of the
page in the direct map when the guest mapped a page with reduced cacheability.
Unfortunately, the logic didn't actually prevent mixed cacheability from
occurring:
* A guest could map a page normally, and then map the same page with
different cacheability; nothing prevented this.
* The cacheability of the directmap was always latest-takes-precedence in
terms of guest requests.
* Grant-mapped frames with lesser cacheability didn't adjust the page's
cacheattr settings.
* The map_domain_page() function still unconditionally created WB mappings,
irrespective of the page's cacheattr settings.
Additionally, update_xen_mappings() had a bug where the alias calculation was
wrong for mfns which were .init content, which should have been treated as
fully guest pages, not Xen pages.
Worse yet, the logic introduced a vulnerability whereby necessary
pagetable/segdesc adjustments made by Xen in the validation logic could become
non-coherent between the cache and main memory. The CPU could subsequently
operate on the stale value in the cache, rather than the safe value in main
memory.
The directmap contains primarily mappings of RAM. PAT/MTRR conflict
resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
cacheability resolves to being coherent. The special case is WC mappings,
which are non-coherent against MTRR=WB regions (except for fully-coherent
CPUs).
Xen must not have any WC cacheability in the directmap, to prevent Xen's
actions from creating non-coherency. (Guest actions creating non-coherency are
dealt with in subsequent patches.) As all memory types for MTRR=WB ranges
inter-operate coherently, leave Xen's directmap mappings as WB.
Only PV guests with access to devices can use reduced-cacheability mappings to
begin with, and they're trusted not to mount DoSs against the system anyway.
Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
count. Retain the check in get_page_from_l1e() for special_pages() because a
guest has no business using reduced cacheability on these.
This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
This is CVE-2022-26363, part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index ee91c7fe5f69..859646b670a8 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -786,24 +786,6 @@ bool is_iomem_page(mfn_t mfn)
return (page_get_owner(page) == dom_io);
}
-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
-{
- int err = 0;
- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
- unsigned long xen_va =
- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
-
- if ( unlikely(alias) && cacheattr )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
- if ( !err )
- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
- if ( unlikely(alias) && !cacheattr && !err )
- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
- return err;
-}
-
#ifndef NDEBUG
struct mmio_emul_range_ctxt {
const struct domain *d;
@@ -1008,47 +990,14 @@ get_page_from_l1e(
goto could_not_pin;
}
- if ( pte_flags_to_cacheattr(l1f) !=
- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_xen_heap_page(page) )
{
- unsigned long x, nx, y = page->count_info;
- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
- int err;
-
- if ( is_xen_heap_page(page) )
- {
- if ( write )
- put_page_type(page);
- put_page(page);
- gdprintk(XENLOG_WARNING,
- "Attempt to change cache attributes of Xen heap page\n");
- return -EACCES;
- }
-
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- err = update_xen_mappings(mfn, cacheattr);
- if ( unlikely(err) )
- {
- cacheattr = y & PGC_cacheattr_mask;
- do {
- x = y;
- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
-
- if ( write )
- put_page_type(page);
- put_page(page);
-
- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
- mfn, get_gpfn_from_mfn(mfn),
- l1e_get_intpte(l1e), l1e_owner->domain_id);
- return err;
- }
+ if ( write )
+ put_page_type(page);
+ put_page(page);
+ gdprintk(XENLOG_WARNING,
+ "Attempt to change cache attributes of Xen heap page\n");
+ return -EACCES;
}
return 0;
@@ -2541,25 +2490,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
*/
static int cleanup_page_mappings(struct page_info *page)
{
- unsigned int cacheattr =
- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
int rc = 0;
unsigned long mfn = mfn_x(page_to_mfn(page));
/*
- * If we've modified xen mappings as a result of guest cache
- * attributes, restore them to the "normal" state.
- */
- if ( unlikely(cacheattr) )
- {
- page->count_info &= ~PGC_cacheattr_mask;
-
- BUG_ON(is_xen_heap_page(page));
-
- rc = update_xen_mappings(mfn, 0);
- }
-
- /*
* If this may be in a PV domain's IOMMU, remove it.
*
* NB that writable xenheap pages have their type set and cleared by
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 320c6cd19669..db09849f73f8 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -64,22 +64,19 @@
/* Set when is using a page as a page table */
#define _PGC_page_table PG_shift(3)
#define PGC_page_table PG_mask(1, 3)
- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
-#define PGC_cacheattr_base PG_shift(6)
-#define PGC_cacheattr_mask PG_mask(7, 6)
/* Page is broken? */
-#define _PGC_broken PG_shift(7)
-#define PGC_broken PG_mask(1, 7)
+#define _PGC_broken PG_shift(4)
+#define PGC_broken PG_mask(1, 4)
/* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
-#define PGC_state PG_mask(3, 9)
-#define PGC_state_inuse PG_mask(0, 9)
-#define PGC_state_offlining PG_mask(1, 9)
-#define PGC_state_offlined PG_mask(2, 9)
-#define PGC_state_free PG_mask(3, 9)
+#define PGC_state PG_mask(3, 6)
+#define PGC_state_inuse PG_mask(0, 6)
+#define PGC_state_offlining PG_mask(1, 6)
+#define PGC_state_offlined PG_mask(2, 6)
+#define PGC_state_free PG_mask(3, 6)
#define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
/* Count of references to this frame. */
-#define PGC_count_width PG_shift(9)
+#define PGC_count_width PG_shift(6)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)
/*
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86: Split cache_flush() out of cache_writeback()
Subsequent changes will want a fully flushing version.
Use the new helper rather than opencoding it in flush_area_local(). This
resolves an outstanding issue where the conditional sfence is on the wrong
side of the clflushopt loop. clflushopt is ordered with respect to older
stores, not to younger stores.
Rename gnttab_cache_flush()'s helper to avoid colliding in name.
grant_table.c can see the prototype from cache.h so the build fails
otherwise.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Xen 4.16 and earlier:
* Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
syncing" to split cache_writeback() out of the IOMMU logic, but without the
associated hooks changes.
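To make the ordering point concrete, here is a minimal standalone sketch of the corrected fence placement (editorial illustration only, not part of the patch; it assumes CLFLUSHOPT is available and omits the alternatives machinery that the real cache_flush() uses):

#include <stddef.h>
#include <stdint.h>

/* Flush [addr, addr + size) line by line.  CLFLUSHOPT is ordered only with
 * respect to older stores, so the SFENCE must follow the loop for younger
 * code to be able to rely on the flushes having completed. */
static void toy_cache_flush(const void *addr, size_t size, size_t line)
{
    const char *p = (const char *)((uintptr_t)addr & ~(uintptr_t)(line - 1));
    const char *end = (const char *)addr + size;

    for ( ; p < end; p += line )
        asm volatile ("clflushopt %0" :: "m" (*p));

    asm volatile ("sfence" ::: "memory");
}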
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 03f92c23dcaf..8568491c7ea9 100644
--- xen/arch/x86/flushtlb.c.orig
+++ xen/arch/x86/flushtlb.c
@@ -224,7 +224,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
if ( flags & FLUSH_CACHE )
{
const struct cpuinfo_x86 *c = &current_cpu_data;
- unsigned long i, sz = 0;
+ unsigned long sz = 0;
if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
sz = 1UL << (order + PAGE_SHIFT);
@@ -234,13 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
c->x86_clflush_size && c->x86_cache_size && sz &&
((sz >> 10) < c->x86_cache_size) )
{
- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
- for ( i = 0; i < sz; i += c->x86_clflush_size )
- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
- " clflush %0",
- "data16 clflush %0", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- "m" (((const char *)va)[i]));
+ cache_flush(va, sz);
flags &= ~FLUSH_CACHE;
}
else
@@ -254,3 +248,77 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}
+
+void cache_flush(const void *addr, unsigned int size)
+{
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ const void *end = addr + size;
+
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+ /*
+ * Note regarding the "ds" prefix use: it's faster to do a clflush
+ * + prefix than a clflush + nop, and hence the prefix is added instead
+ * of letting the alternative framework fill the gap by appending nops.
+ */
+ alternative_io("ds; clflush %[p]",
+ "data16 clflush %[p]", /* clflushopt */
+ X86_FEATURE_CLFLUSHOPT,
+ /* no outputs */,
+ [p] "m" (*(const char *)(addr)));
+ }
+
+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+}
+
+void cache_writeback(const void *addr, unsigned int size)
+{
+ unsigned int clflush_size;
+ const void *end = addr + size;
+
+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
+ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
+ return cache_flush(addr, size);
+
+ /*
+ * This function may be called before current_cpu_data is established.
+ * Hence a fallback is needed to prevent the loop below becoming infinite.
+ */
+ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+
+ asm volatile (CLWB_ENCODING :: INPUT(addr));
+
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+ }
+
+ asm volatile ("sfence" ::: "memory");
+}
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index cbb2ce17c001..709509e0fc9e 100644
--- xen/common/grant_table.c.orig
+++ xen/common/grant_table.c
@@ -3407,7 +3407,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
return 0;
}
-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
{
struct domain *d, *owner;
struct page_info *page;
@@ -3501,7 +3501,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
return -EFAULT;
for ( ; ; )
{
- int ret = cache_flush(&op, cur_ref);
+ int ret = _cache_flush(&op, cur_ref);
if ( ret < 0 )
return ret;
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index fbe951b2fad0..3defe9677f06 100644
--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -77,7 +77,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
struct pci_dev *pdev,
u16 did, u16 size, u64 addr);
-unsigned int get_cache_line_size(void);
void flush_all_cache(void);
uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index f051a55764b9..2bf5f02c08de 100644
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
#include <xen/pci.h>
#include <xen/pci_regs.h>
#include <xen/keyhandler.h>
+#include <asm/cache.h>
#include <asm/msi.h>
#include <asm/nops.h>
#include <asm/irq.h>
@@ -201,53 +202,10 @@ static int iommus_incoherent;
static void sync_cache(const void *addr, unsigned int size)
{
- static unsigned long clflush_size = 0;
- const void *end = addr + size;
-
if ( !iommus_incoherent )
return;
- if ( clflush_size == 0 )
- clflush_size = get_cache_line_size();
-
- addr -= (unsigned long)addr & (clflush_size - 1);
- for ( ; addr < end; addr += clflush_size )
-/*
- * The arguments to a macro must not include preprocessor directives. Doing so
- * results in undefined behavior, so we have to create some defines here in
- * order to avoid it.
- */
-#if defined(HAVE_AS_CLWB)
-# define CLWB_ENCODING "clwb %[p]"
-#elif defined(HAVE_AS_XSAVEOPT)
-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-#else
-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-#endif
-
-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-# define INPUT BASE_INPUT
-#else
-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-#endif
- /*
- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
- * + prefix than a clflush + nop, and hence the prefix is added instead
- * of letting the alternative framework fill the gap by appending nops.
- */
- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
- "data16 clflush %[p]", /* clflushopt */
- X86_FEATURE_CLFLUSHOPT,
- CLWB_ENCODING,
- X86_FEATURE_CLWB, /* no outputs */,
- INPUT(addr));
-#undef INPUT
-#undef BASE_INPUT
-#undef CLWB_ENCODING
-
- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
- "sfence", X86_FEATURE_CLWB);
+ cache_writeback(addr, size);
}
/* Allocate page table, return its machine address */
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 229938f3a812..2a18b76e800d 100644
--- xen/drivers/passthrough/vtd/x86/vtd.c.orig
+++ xen/drivers/passthrough/vtd/x86/vtd.c
@@ -46,11 +46,6 @@ void unmap_vtd_domain_page(void *va)
unmap_domain_page(va);
}
-unsigned int get_cache_line_size(void)
-{
- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
-}
-
void flush_all_cache()
{
wbinvd();
diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
index 1f7173d8c72c..e4770efb22b9 100644
--- xen/include/asm-x86/cache.h.orig
+++ xen/include/asm-x86/cache.h
@@ -11,4 +11,11 @@
#define __read_mostly __section(".data.read_mostly")
+#ifndef __ASSEMBLY__
+
+void cache_flush(const void *addr, unsigned int size);
+void cache_writeback(const void *addr, unsigned int size);
+
+#endif
+
#endif
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/amd: Work around CLFLUSH ordering on older parts
On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
including reads and writes to the address, and LFENCE/SFENCE instructions.
This creates a multitude of problematic corner cases, laid out in the manual.
Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
This is part of XSA-402.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
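A minimal standalone sketch of the resulting fence placement on the affected parts (editorial illustration only, not part of the patch; the real cache_flush() selects the fences at runtime via alternatives keyed on X86_BUG_CLFLUSH_MFENCE):

#include <stddef.h>
#include <stdint.h>

/* On pre-CLFLUSHOPT AMD parts, CLFLUSH is not ordered against loads, stores
 * or LFENCE/SFENCE, so the flush loop is bracketed by MFENCE on both sides. */
static void toy_cache_flush_amd(const void *addr, size_t size, size_t line)
{
    const char *p = (const char *)((uintptr_t)addr & ~(uintptr_t)(line - 1));
    const char *end = (const char *)addr + size;

    asm volatile ("mfence" ::: "memory");

    for ( ; p < end; p += line )
        asm volatile ("clflush %0" :: "m" (*p));

    asm volatile ("mfence" ::: "memory");
}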
diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index b77fa1929733..aa1b9d0dda6b 100644
--- xen/arch/x86/cpu/amd.c.orig
+++ xen/arch/x86/cpu/amd.c
@@ -639,6 +639,14 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has_lfence_dispatch)
__set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+ /*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
+ * everything, including reads and writes to address, and
+ * LFENCE/SFENCE instructions.
+ */
+ if (!cpu_has_clflushopt)
+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
+
switch(c->x86)
{
case 0xf ... 0x11:
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 8568491c7ea9..6f3f5ab1a3c4 100644
--- xen/arch/x86/flushtlb.c.orig
+++ xen/arch/x86/flushtlb.c
@@ -249,6 +249,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
return flags;
}
+/*
+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+ * including reads and writes to address, and LFENCE/SFENCE instructions.
+ *
+ * This function only works safely after alternatives have run. Luckily, at
+ * the time of writing, we don't flush the caches that early.
+ */
void cache_flush(const void *addr, unsigned int size)
{
/*
@@ -258,6 +265,8 @@ void cache_flush(const void *addr, unsigned int size)
unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
const void *end = addr + size;
+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
+
addr -= (unsigned long)addr & (clflush_size - 1);
for ( ; addr < end; addr += clflush_size )
{
@@ -273,7 +282,9 @@ void cache_flush(const void *addr, unsigned int size)
[p] "m" (*(const char *)(addr)));
}
- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+ alternative_2("",
+ "sfence", X86_FEATURE_CLFLUSHOPT,
+ "mfence", X86_BUG_CLFLUSH_MFENCE);
}
void cache_writeback(const void *addr, unsigned int size)
diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
index b9d3cac97538..a8222e978cd9 100644
--- xen/include/asm-x86/cpufeatures.h.orig
+++ xen/include/asm-x86/cpufeatures.h
@@ -44,6 +44,7 @@ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
#define X86_BUG(x) ((FSCAPINTS + X86_NR_SYNTH) * 32 + (x))
#define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
/* Total number of capability words, inc synth and bug words. */
#define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Track and flush non-coherent mappings of RAM
There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
devices that make non-coherent writes. The Linux sound subsystem makes
extensive use of this technique.
For such use cases, the guest's DMA buffer is mapped and consistently used as
WC, and Xen doesn't interact with the buffer.
However, a mischievous guest can use WC mappings to deliberately create
non-coherency between the cache and RAM, and use this to trick Xen into
validating a pagetable which isn't actually safe.
Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
it whenever a non-coherent writeable mapping is created. If the page is used
as anything other than PGT_writable_page, force a cache flush before
validation. Also force a cache flush before the page is returned to the heap.
This is CVE-2022-26364, part of XSA-402.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
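A minimal sketch of the classification added to get_page_from_l1e() below (editorial illustration only, not part of the patch; the constant values are assumed to match the patched headers, and the real check is additionally skipped on CPUs with XEN_SELFSNOOP, which are fully coherent):

#include <stdbool.h>

/* PTE attribute bits and the memory-type encoding introduced earlier in
 * this series (values assumed to match the patched page.h). */
#define _PAGE_RW          0x002
#define _PAGE_PWT         0x008
#define _PAGE_PCD         0x010
#define _PAGE_PAT         0x080
#define PAGE_CACHE_ATTRS  (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
#define _PAGE_WC          (_PAGE_PAT)

/* A mapping warrants PGT_non_coherent tracking when it is writeable and its
 * memory type is WC, i.e. non-coherent against Xen's WB directmap. */
static bool needs_non_coherent_tracking(unsigned long l1f)
{
    return (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW);
}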
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 859646b670a8..f5eeddce5867 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -1000,6 +1000,15 @@ get_page_from_l1e(
return -EACCES;
}
+ /*
+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
+ * flush later if the target is used as anything but a PGT_writeable page.
+ * We care about all writeable mappings, including foreign mappings.
+ */
+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
+
return 0;
could_not_pin:
@@ -2532,6 +2541,19 @@ static int cleanup_page_mappings(struct page_info *page)
}
}
+ /*
+ * Flush the cache if there were previously non-coherent writeable
+ * mappings of this page. This forces the page to be coherent before it
+ * is freed back to the heap.
+ */
+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+ }
+
return rc;
}
@@ -3090,6 +3112,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
if ( unlikely(!(nx & PGT_validated)) )
{
/*
+ * Flush the cache if there were previously non-coherent mappings of
+ * this page, and we're trying to use it as anything other than a
+ * writeable page. This forces the page to be coherent before we
+ * validate its contents for safety.
+ */
+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
+ {
+ void *addr = __map_domain_page(page);
+
+ cache_flush(addr, PAGE_SIZE);
+ unmap_domain_page(addr);
+
+ page->u.inuse.type_info &= ~PGT_non_coherent;
+ }
+
+ /*
* No special validation needed for writable or shared pages. Page
* tables and GDT/LDT need to have their contents audited.
*
diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
index 0325618c9883..81c72e61ed55 100644
--- xen/arch/x86/pv/grant_table.c.orig
+++ xen/arch/x86/pv/grant_table.c
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
ol1e = *pl1e;
if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * We always create mappings in this path. However, our caller,
+ * map_grant_ref(), only passes potentially non-zero cache_flags for
+ * MMIO frames, so this path doesn't create non-coherent mappings of
+ * RAM frames and there's no need to calculate PGT_non_coherent.
+ */
+ ASSERT(!cache_flags || is_iomem_page(frame));
+
rc = GNTST_okay;
+ }
out_unlock:
page_unlock(page);
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
l1e_get_flags(ol1e), addr, grant_pte_flags);
if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+ {
+ /*
+ * Generally, replace_grant_pv_mapping() is used to destroy mappings
+ * (n1le = l1e_empty()), but it can be a present mapping on the
+ * GNTABOP_unmap_and_replace path.
+ *
+ * In such cases, the PTE is fully transplanted from its old location
+ * via steal_linear_addr(), so we need not perform PGT_non_coherent
+ * checking here.
+ */
rc = GNTST_okay;
+ }
out_unlock:
page_unlock(page);
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index db09849f73f8..82d0fd6104a2 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -48,8 +48,12 @@
#define _PGT_partial PG_shift(8)
#define PGT_partial PG_mask(1, 8)
+/* Has this page been mapped writeable with a non-coherent memory type? */
+#define _PGT_non_coherent PG_shift(9)
+#define PGT_non_coherent PG_mask(1, 9)
+
/* Count of uses of this frame as its current type. */
-#define PGT_count_width PG_shift(8)
+#define PGT_count_width PG_shift(9)
#define PGT_count_mask ((1UL<<PGT_count_width)-1)
/* Are the 'type mask' bits identical? */