$NetBSD: patch-XSA400,v 1.1 2022/06/24 13:47:37 bouyer Exp $ From: Jan Beulich Subject: VT-d: split domid map cleanup check into a function This logic will want invoking from elsewhere. No functional change intended. Signed-off-by: Jan Beulich Reviewed-by: Roger Pau Monné Reviewed-by: Kevin Tian --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -153,6 +153,51 @@ static void cleanup_domid_map(struct dom } } +static bool any_pdev_behind_iommu(const struct domain *d, + const struct pci_dev *exclude, + const struct vtd_iommu *iommu) +{ + const struct pci_dev *pdev; + + for_each_pdev ( d, pdev ) + { + const struct acpi_drhd_unit *drhd; + + if ( pdev == exclude ) + continue; + + drhd = acpi_find_matched_drhd_unit(pdev); + if ( drhd && drhd->iommu == iommu ) + return true; + } + + return false; +} + +/* + * If no other devices under the same iommu owned by this domain, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap. + */ +static void check_cleanup_domid_map(struct domain *d, + const struct pci_dev *exclude, + struct vtd_iommu *iommu) +{ + bool found = any_pdev_behind_iommu(d, exclude, iommu); + + /* + * Hidden devices are associated with DomXEN but usable by the hardware + * domain. Hence they need considering here as well. + */ + if ( !found && is_hardware_domain(d) ) + found = any_pdev_behind_iommu(dom_xen, exclude, iommu); + + if ( !found ) + { + clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap); + cleanup_domid_map(d, iommu); + } +} + static int iommus_incoherent; static void sync_cache(const void *addr, unsigned int size) @@ -1685,7 +1730,6 @@ static int domain_context_unmap(struct d struct vtd_iommu *iommu; int ret = 0; u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus; - int found = 0; drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) @@ -1769,28 +1813,8 @@ static int domain_context_unmap(struct d if ( ret ) goto out; - /* - * if no other devices under the same iommu owned by this domain, - * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp - */ - for_each_pdev ( domain, pdev ) - { - if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) - continue; - - drhd = acpi_find_matched_drhd_unit(pdev); - if ( drhd && drhd->iommu == iommu ) - { - found = 1; - break; - } - } - - if ( found == 0 ) - { - clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap); - cleanup_domid_map(domain, iommu); - } + if ( !ret ) + check_cleanup_domid_map(domain, pdev, iommu); out: return ret; From: Jan Beulich Subject: VT-d: fix (de)assign ordering when RMRRs are in use In the event that the RMRR mappings are essential for device operation, they should be established before updating the device's context entry, while they should be torn down only after the device's context entry was successfully updated. Also adjust a related log message. This is CVE-2022-26358 / part of XSA-400. Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") Signed-off-by: Jan Beulich Reviewed-by: Roger Pau Monné Reviewed-by: Paul Durrant Reviewed-by: Kevin Tian --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -2392,6 +2392,10 @@ static int reassign_device_ownership( { int ret; + ret = domain_context_unmap(source, devfn, pdev); + if ( ret ) + return ret; + /* * Devices assigned to untrusted domains (here assumed to be any domU) * can attempt to send arbitrary LAPIC/MSI messages. 
We are unprotected @@ -2428,10 +2432,6 @@ static int reassign_device_ownership( } } - ret = domain_context_unmap(source, devfn, pdev); - if ( ret ) - return ret; - if ( devfn == pdev->devfn && pdev->domain != dom_io ) { list_move(&pdev->domain_list, &dom_io->pdev_list); @@ -2508,9 +2508,8 @@ static int intel_iommu_assign_device( } } - ret = reassign_device_ownership(s, d, devfn, pdev); - if ( ret || d == dom_io ) - return ret; + if ( d == dom_io ) + return reassign_device_ownership(s, d, devfn, pdev); /* Setup rmrr identity mapping */ for_each_rmrr_device( rmrr, bdf, i ) @@ -2523,20 +2522,37 @@ static int intel_iommu_assign_device( rmrr->end_address, flag); if ( ret ) { - int rc; - - rc = reassign_device_ownership(d, s, devfn, pdev); printk(XENLOG_G_ERR VTDPREFIX - " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n", - rmrr->base_address, rmrr->end_address, - d->domain_id, ret); - if ( rc ) - { - printk(XENLOG_ERR VTDPREFIX - " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n", - seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc); - domain_crash(d); - } + "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n", + d, rmrr->base_address, rmrr->end_address, ret); + break; + } + } + } + + if ( !ret ) + ret = reassign_device_ownership(s, d, devfn, pdev); + + /* See reassign_device_ownership() for the hwdom aspect. */ + if ( !ret || is_hardware_domain(d) ) + return ret; + + for_each_rmrr_device( rmrr, bdf, i ) + { + if ( rmrr->segment == seg && + PCI_BUS(bdf) == bus && + PCI_DEVFN2(bdf) == devfn ) + { + int rc = iommu_identity_mapping(d, p2m_access_x, + rmrr->base_address, + rmrr->end_address, 0); + + if ( rc && rc != -ENOENT ) + { + printk(XENLOG_ERR VTDPREFIX + "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n", + d, rmrr->base_address, rmrr->end_address, rc); + domain_crash(d); break; } } From: Jan Beulich Subject: VT-d: fix add/remove ordering when RMRRs are in use In the event that the RMRR mappings are essential for device operation, they should be established before updating the device's context entry, while they should be torn down only after the device's context entry was successfully cleared. Also switch to %pd in related log messages. 
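Editor's note: the ordering requirement described by the two RMRR commits above can be summarised in a short, self-contained C sketch. The helper names (map_rmrr_identity_ranges() and so on) are hypothetical stand-ins, not the real Xen functions; the point is only the bracketing that the patches establish — reserved-region (RMRR) mappings are set up before the device's context entry is written, and torn down only after the entry has been cleared.

#include <stdio.h>

/* Hypothetical stand-ins for the real Xen helpers touched by the patches. */
static int map_rmrr_identity_ranges(int dom)
{ printf("map RMRRs for d%d\n", dom); return 0; }
static void unmap_rmrr_identity_ranges(int dom)
{ printf("unmap RMRRs for d%d\n", dom); }
static int write_context_entry(int dom)
{ printf("context entry -> d%d\n", dom); return 0; }
static int clear_context_entry(int dom)
{ printf("context entry cleared (was d%d)\n", dom); return 0; }

/* Assign: establish RMRR mappings first, only then update the context entry. */
static int assign_device(int dom)
{
    int rc = map_rmrr_identity_ranges(dom);

    if ( !rc && (rc = write_context_entry(dom)) )
        unmap_rmrr_identity_ranges(dom); /* roll back on failure */

    return rc;
}

/* Remove: clear the context entry first, only then drop the RMRR mappings. */
static int remove_device(int dom)
{
    int rc = clear_context_entry(dom);

    if ( !rc )
        unmap_rmrr_identity_ranges(dom);

    return rc;
}

int main(void)
{
    return assign_device(1) || remove_device(1);
}

Seen this way, the pre-patch code performed the two steps in the opposite order on both the assign and the remove path, which is what the reordering above corrects.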
Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR") Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling") Signed-off-by: Jan Beulich Reviewed-by: Roger Pau Monné Reviewed-by: Kevin Tian --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -1993,14 +1993,6 @@ static int intel_iommu_add_device(u8 dev if ( !pdev->domain ) return -EINVAL; - ret = domain_context_mapping(pdev->domain, devfn, pdev); - if ( ret ) - { - dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n", - pdev->domain->domain_id); - return ret; - } - for_each_rmrr_device ( rmrr, bdf, i ) { if ( rmrr->segment == pdev->seg && @@ -2017,12 +2009,17 @@ static int intel_iommu_add_device(u8 dev rmrr->base_address, rmrr->end_address, 0); if ( ret ) - dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n", - pdev->domain->domain_id); + dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n", + pdev->domain); } } - return 0; + ret = domain_context_mapping(pdev->domain, devfn, pdev); + if ( ret ) + dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n", + pdev->domain); + + return ret; } static int intel_iommu_enable_device(struct pci_dev *pdev) @@ -2044,11 +2041,15 @@ static int intel_iommu_remove_device(u8 { struct acpi_rmrr_unit *rmrr; u16 bdf; - int i; + int ret, i; if ( !pdev->domain ) return -EINVAL; + ret = domain_context_unmap(pdev->domain, devfn, pdev); + if ( ret ) + return ret; + for_each_rmrr_device ( rmrr, bdf, i ) { if ( rmrr->segment != pdev->seg || @@ -2064,7 +2065,7 @@ static int intel_iommu_remove_device(u8 rmrr->end_address, 0); } - return domain_context_unmap(pdev->domain, devfn, pdev); + return 0; } static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev) From: Jan Beulich Subject: VT-d: drop ownership checking from domain_context_mapping_one() Despite putting in quite a bit of effort it was not possible to establish why exactly this code exists (beyond possibly sanity checking). Instead of a subsequent change further complicating this logic, simply get rid of it. Take the opportunity and move the respective unmap_vtd_domain_page() out of the locked region. Signed-off-by: Jan Beulich Reviewed-by: Roger Pau Monné Reviewed-by: Paul Durrant Reviewed-by: Kevin Tian --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -114,28 +114,6 @@ static int context_set_domain_id(struct return 0; } -static int context_get_domain_id(struct context_entry *context, - struct vtd_iommu *iommu) -{ - unsigned long dom_index, nr_dom; - int domid = -1; - - if (iommu && context) - { - nr_dom = cap_ndoms(iommu->cap); - - dom_index = context_domain_id(*context); - - if ( dom_index < nr_dom && iommu->domid_map ) - domid = iommu->domid_map[dom_index]; - else - dprintk(XENLOG_DEBUG VTDPREFIX, - "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n", - dom_index, nr_dom); - } - return domid; -} - static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) { int iommu_domid = domain_iommu_domid(domain, iommu); @@ -1392,49 +1370,9 @@ int domain_context_mapping_one( if ( context_present(*context) ) { - int res = 0; - - /* Try to get domain ownership from device structure. If that's - * not available, try to read it from the context itself. */ - if ( pdev ) - { - if ( pdev->domain != domain ) - { - printk(XENLOG_G_INFO VTDPREFIX - "d%d: %04x:%02x:%02x.%u owned by d%d!", - domain->domain_id, - seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), - pdev->domain ? 
pdev->domain->domain_id : -1); - res = -EINVAL; - } - } - else - { - int cdomain; - cdomain = context_get_domain_id(context, iommu); - - if ( cdomain < 0 ) - { - printk(XENLOG_G_WARNING VTDPREFIX - "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n", - domain->domain_id, - seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - res = -EINVAL; - } - else if ( cdomain != domain->domain_id ) - { - printk(XENLOG_G_INFO VTDPREFIX - "d%d: %04x:%02x:%02x.%u already mapped to d%d!", - domain->domain_id, - seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), - cdomain); - res = -EINVAL; - } - } - - unmap_vtd_domain_page(context_entries); spin_unlock(&iommu->lock); - return res; + unmap_vtd_domain_page(context_entries); + return 0; } if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) From: Jan Beulich Subject: VT-d: re-assign devices directly Devices with RMRRs, due to it being unspecified how/when the specified memory regions may get accessed, may not be left disconnected from their respective mappings (as long as it's not certain that the device has been fully quiesced). Hence rather than unmapping the old context and then mapping the new one, re-assignment needs to be done in a single step. This is CVE-2022-26359 / part of XSA-400. Reported-by: Roger Pau Monné Similarly quarantining scratch-page mode relies on page tables to be continuously wired up. To avoid complicating things more than necessary, treat all devices mostly equally, i.e. regardless of their association with any RMRRs. The main difference is when it comes to updating context entries, which need to be atomic when there are RMRRs. Yet atomicity can only be achieved with CMPXCHG16B, availability of which we can't take for given. The seemingly complicated choice of non-negative return values for domain_context_mapping_one() is to limit code churn: This way callers passing NULL for pdev don't need fiddling with. 
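Editor's note: the atomicity argument in this commit message (use CMPXCHG16B when available, otherwise fall back to a carefully ordered pair of 64-bit writes) can be illustrated with a minimal, compiler-only sketch. This is not the Xen code: the structure layout, the names, and the use of GCC/Clang __atomic builtins are assumptions made for the example, and the real patch has two different non-CMPXCHG16B orderings depending on whether RMRRs are involved — only the "domain ID written last" one is shown. The actual implementation uses Xen's cmpxchg16b() and write_atomic() on the real VT-d context entry.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A two-qword descriptor, roughly shaped like a VT-d context entry. */
struct entry {
    union {
        struct { uint64_t lo, hi; }; /* assume the domain ID lives in hi */
        unsigned __int128 full;
    };
};

/*
 * Replace *e (expected to currently hold "old") with "new_val".  With a
 * 16-byte compare-and-exchange the update is atomic; without it, the best
 * that can be done is to write the half carrying the domain ID last, so
 * that a transiently mixed entry pairs the old ID with the new page tables.
 */
static bool update_entry(struct entry *e, unsigned __int128 old,
                         struct entry new_val, bool have_cx16)
{
    if ( have_cx16 )
        return __atomic_compare_exchange_n(&e->full, &old, new_val.full,
                                           false, __ATOMIC_SEQ_CST,
                                           __ATOMIC_SEQ_CST);

    __atomic_store_n(&e->lo, new_val.lo, __ATOMIC_RELAXED);
    /* As in the patch, no barrier should be needed between these two. */
    __atomic_store_n(&e->hi, new_val.hi, __ATOMIC_RELAXED);

    return true;
}

int main(void)
{
    struct entry e = { .lo = 1, .hi = 2 };
    struct entry repl = { .lo = 0x10, .hi = 0x20 };

    printf("atomic path ok: %d, entry now %#lx_%#lx\n",
           update_entry(&e, e.full, repl, true),
           (unsigned long)e.hi, (unsigned long)e.lo);
    return 0;
}

The fallback branch is why the commit message distinguishes MAP_OWNER_DYING and MAP_SINGLE_DEVICE cases: a torn update is harmless only if no other active device shares the old domain ID.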
Signed-off-by: Jan Beulich Reviewed-by: Kevin Tian Reviewed-by: Roger Pau Monné --- xen/drivers/passthrough/vtd/extern.h.orig +++ xen/drivers/passthrough/vtd/extern.h @@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr); void *map_vtd_domain_page(u64 maddr); void unmap_vtd_domain_page(void *va); int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn, const struct pci_dev *); + uint8_t bus, uint8_t devfn, + const struct pci_dev *pdev, unsigned int mode); int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, u8 bus, u8 devfn); int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); @@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void); void platform_quirks_init(void); void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); -int __must_check me_wifi_quirk(struct domain *domain, - u8 bus, u8 devfn, int map); +int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, + uint8_t devfn, unsigned int mode); void pci_vtd_quirk(const struct pci_dev *); void quirk_iommu_caps(struct vtd_iommu *iommu); --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -110,6 +110,7 @@ static int context_set_domain_id(struct } set_bit(i, iommu->domid_bitmap); + context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; return 0; } @@ -1350,15 +1351,27 @@ static void __hwdom_init intel_iommu_hwd } } +/* + * This function returns + * - a negative errno value upon error, + * - zero upon success when previously the entry was non-present, or this isn't + * the "main" request for a device (pdev == NULL), or for no-op quarantining + * assignments, + * - positive (one) upon success when previously the entry was present and this + * is the "main" request for a device (pdev != NULL). 
+ */ int domain_context_mapping_one( struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn, const struct pci_dev *pdev) + uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, + unsigned int mode) { struct domain_iommu *hd = dom_iommu(domain); - struct context_entry *context, *context_entries; + struct context_entry *context, *context_entries, lctxt; + __uint128_t old; u64 maddr, pgd_maddr; - u16 seg = iommu->drhd->segment; + uint16_t seg = iommu->drhd->segment, prev_did = 0; + struct domain *prev_dom = NULL; int agaw, rc, ret; bool_t flush_dev_iotlb; @@ -1367,17 +1380,32 @@ int domain_context_mapping_one( maddr = bus_to_context_maddr(iommu, bus); context_entries = (struct context_entry *)map_vtd_domain_page(maddr); context = &context_entries[devfn]; + old = (lctxt = *context).full; - if ( context_present(*context) ) + if ( context_present(lctxt) ) { - spin_unlock(&iommu->lock); - unmap_vtd_domain_page(context_entries); - return 0; + domid_t domid; + + prev_did = context_domain_id(lctxt); + domid = iommu->domid_map[prev_did]; + if ( domid < DOMID_FIRST_RESERVED ) + prev_dom = rcu_lock_domain_by_id(domid); + else if ( domid == DOMID_IO ) + prev_dom = rcu_lock_domain(dom_io); + if ( !prev_dom ) + { + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + dprintk(XENLOG_DEBUG VTDPREFIX, + "no domain for did %u (nr_dom %u)\n", + prev_did, cap_ndoms(iommu->cap)); + return -ESRCH; + } } if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) { - context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); + context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU); agaw = level_to_agaw(iommu->nr_pt_levels); } else @@ -1394,6 +1422,8 @@ int domain_context_mapping_one( spin_unlock(&hd->arch.mapping_lock); spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); + if ( prev_dom ) + rcu_unlock_domain(prev_dom); return -ENOMEM; } } @@ -1411,33 +1441,102 @@ int domain_context_mapping_one( goto nomem; } - context_set_address_root(*context, pgd_maddr); + context_set_address_root(lctxt, pgd_maddr); if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) - context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB); + context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); else - context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); + context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL); spin_unlock(&hd->arch.mapping_lock); } - if ( context_set_domain_id(context, domain, iommu) ) + rc = context_set_domain_id(&lctxt, domain, iommu); + if ( rc ) { + unlock: spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); - return -EFAULT; + if ( prev_dom ) + rcu_unlock_domain(prev_dom); + return rc; + } + + if ( !prev_dom ) + { + context_set_address_width(lctxt, agaw); + context_set_fault_enable(lctxt); + context_set_present(lctxt); + } + else if ( prev_dom == domain ) + { + ASSERT(lctxt.full == context->full); + rc = !!pdev; + goto unlock; + } + else + { + ASSERT(context_address_width(lctxt) == agaw); + ASSERT(!context_fault_disable(lctxt)); + } + + if ( cpu_has_cx16 ) + { + __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); + + /* + * Hardware does not update the context entry behind our backs, + * so the return value should match "old". 
+ */ + if ( res != old ) + { + if ( pdev ) + check_cleanup_domid_map(domain, pdev, iommu); + printk(XENLOG_ERR + "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", + pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + rc = -EILSEQ; + goto unlock; + } + } + else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) + { + context_clear_present(*context); + iommu_sync_cache(context, sizeof(*context)); + + write_atomic(&context->hi, lctxt.hi); + /* No barrier should be needed between these two. */ + write_atomic(&context->lo, lctxt.lo); + } + else /* Best effort, updating DID last. */ + { + /* + * By non-atomically updating the context entry's DID field last, + * during a short window in time TLB entries with the old domain ID + * but the new page tables may be inserted. This could affect I/O + * of other devices using this same (old) domain ID. Such updating + * therefore is not a problem if this was the only device associated + * with the old domain ID. Diverting I/O of any of a dying domain's + * devices to the quarantine page tables is intended anyway. + */ + if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) + printk(XENLOG_WARNING VTDPREFIX + " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom); + + write_atomic(&context->lo, lctxt.lo); + /* No barrier should be needed between these two. */ + write_atomic(&context->hi, lctxt.hi); } - context_set_address_width(*context, agaw); - context_set_fault_enable(*context); - context_set_present(*context); iommu_sync_cache(context, sizeof(struct context_entry)); spin_unlock(&iommu->lock); - /* Context entry was previously non-present (with domid 0). */ - rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn), - DMA_CCMD_MASK_NOBIT, 1); + rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn), + DMA_CCMD_MASK_NOBIT, !prev_dom); flush_dev_iotlb = !!find_ats_dev_drhd(iommu); - ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb); + ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb); /* * The current logic for returns: @@ -1458,12 +1557,21 @@ int domain_context_mapping_one( unmap_vtd_domain_page(context_entries); if ( !seg && !rc ) - rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC); + rc = me_wifi_quirk(domain, bus, devfn, mode); if ( rc ) - domain_context_unmap_one(domain, iommu, bus, devfn); + { + if ( !prev_dom ) + domain_context_unmap_one(domain, iommu, bus, devfn); + else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, + mode & MAP_WITH_RMRR); + } - return rc; + if ( prev_dom ) + rcu_unlock_domain(prev_dom); + + return rc ?: pdev && prev_dom; } static int domain_context_unmap(struct domain *d, uint8_t devfn, @@ -1473,8 +1581,11 @@ static int domain_context_mapping(struct struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; int ret = 0; - u8 seg = pdev->seg, bus = pdev->bus, secbus; + unsigned int i, mode = 0; + uint16_t seg = pdev->seg, bdf; + uint8_t bus = pdev->bus, secbus; drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) @@ -1493,8 +1604,29 @@ static int domain_context_mapping(struct ASSERT(pcidevs_locked()); + for_each_rmrr_device( rmrr, bdf, i ) + { + if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf ) + continue; + + mode |= MAP_WITH_RMRR; + break; + } + + if ( domain != pdev->domain ) + { + if ( pdev->domain->is_dying ) + mode |= MAP_OWNER_DYING; + else if ( drhd && + !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && + !pdev->phantom_stride ) + mode |= MAP_SINGLE_DEVICE; + } + switch ( pdev->type ) { + bool prev_present; + case DEV_TYPE_PCI_HOST_BRIDGE: if ( iommu_debug ) printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n", @@ -1515,7 +1647,9 @@ static int domain_context_mapping(struct domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev); + pdev, mode); + if ( ret > 0 ) + ret = 0; if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) enable_ats_device(pdev, &drhd->iommu->ats_devices); @@ -1528,9 +1662,10 @@ static int domain_context_mapping(struct PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev); - if ( ret ) + pdev, mode); + if ( ret < 0 ) break; + prev_present = ret; if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 ) { @@ -1538,6 +1673,15 @@ static int domain_context_mapping(struct break; ret = -ENXIO; } + /* + * Strictly speaking if the device is the only one behind this bridge + * and the only one with this (secbus,0,0) tuple, it could be allowed + * to be re-assigned regardless of RMRR presence. But let's deal with + * that case only if it is actually found in the wild. + */ + else if ( prev_present && (mode & MAP_WITH_RMRR) && + domain != pdev->domain ) + ret = -EOPNOTSUPP; /* * Mapping a bridge should, if anything, pass the struct pci_dev of @@ -1546,7 +1690,7 @@ static int domain_context_mapping(struct */ if ( ret >= 0 ) ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - NULL); + NULL, mode); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different @@ -1561,10 +1705,15 @@ static int domain_context_mapping(struct if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, - NULL); + NULL, mode); if ( ret ) - domain_context_unmap(domain, devfn, pdev); + { + if ( !prev_present ) + domain_context_unmap(domain, devfn, pdev); + else if ( pdev->domain != domain ) /* Avoid infinite recursion. 
*/ + domain_context_mapping(pdev->domain, devfn, pdev); + } break; @@ -2331,9 +2480,8 @@ static int reassign_device_ownership( { int ret; - ret = domain_context_unmap(source, devfn, pdev); - if ( ret ) - return ret; + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_assign(target); /* * Devices assigned to untrusted domains (here assumed to be any domU) @@ -2343,6 +2491,31 @@ static int reassign_device_ownership( if ( (target != hardware_domain) && !iommu_intremap ) untrusted_msi = true; + ret = domain_context_mapping(target, devfn, pdev); + if ( ret ) + { + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_deassign(target); + return ret; + } + + if ( pdev->devfn == devfn ) + { + const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); + + if ( drhd ) + check_cleanup_domid_map(source, pdev, drhd->iommu); + } + + if ( devfn == pdev->devfn && pdev->domain != target ) + { + list_move(&pdev->domain_list, &target->pdev_list); + pdev->domain = target; + } + + if ( !has_arch_pdevs(source) ) + vmx_pi_hooks_deassign(source); + /* * If the device belongs to the hardware domain, and it has RMRR, don't * remove it from the hardware domain, because BIOS may use RMRR at @@ -2371,34 +2544,7 @@ static int reassign_device_ownership( } } - if ( devfn == pdev->devfn && pdev->domain != dom_io ) - { - list_move(&pdev->domain_list, &dom_io->pdev_list); - pdev->domain = dom_io; - } - - if ( !has_arch_pdevs(source) ) - vmx_pi_hooks_deassign(source); - - if ( !has_arch_pdevs(target) ) - vmx_pi_hooks_assign(target); - - ret = domain_context_mapping(target, devfn, pdev); - if ( ret ) - { - if ( !has_arch_pdevs(target) ) - vmx_pi_hooks_deassign(target); - - return ret; - } - - if ( devfn == pdev->devfn && pdev->domain != target ) - { - list_move(&pdev->domain_list, &target->pdev_list); - pdev->domain = target; - } - - return ret; + return 0; } static int intel_iommu_assign_device( --- xen/drivers/passthrough/vtd/iommu.h.orig +++ xen/drivers/passthrough/vtd/iommu.h @@ -202,8 +202,12 @@ struct root_entry { do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) struct context_entry { - u64 lo; - u64 hi; + union { + struct { + uint64_t lo, hi; + }; + __uint128_t full; + }; }; #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) #define context_present(c) ((c).lo & 1) --- xen/drivers/passthrough/vtd/quirks.c.orig +++ xen/drivers/passthrough/vtd/quirks.c @@ -343,7 +343,8 @@ void __init platform_quirks_init(void) */ static int __must_check map_me_phantom_function(struct domain *domain, - u32 dev, int map) + unsigned int dev, + unsigned int mode) { struct acpi_drhd_unit *drhd; struct pci_dev *pdev; @@ -354,9 +355,9 @@ static int __must_check map_me_phantom_f drhd = acpi_find_matched_drhd_unit(pdev); /* map or unmap ME phantom function */ - if ( map ) + if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) rc = domain_context_mapping_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7), NULL); + PCI_DEVFN(dev, 7), NULL, mode); else rc = domain_context_unmap_one(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7)); @@ -364,7 +365,8 @@ static int __must_check map_me_phantom_f return rc; } -int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) +int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + unsigned int mode) { u32 id; int rc = 0; @@ -388,7 +390,7 @@ int me_wifi_quirk(struct domain *domain, case 0x423b8086: case 0x423c8086: case 0x423d8086: - rc = map_me_phantom_function(domain, 3, map); + rc = map_me_phantom_function(domain, 3, mode); break; default: break; @@ -414,7 +416,7 @@ int me_wifi_quirk(struct 
domain *domain, case 0x42388086: /* Puma Peak */ case 0x422b8086: case 0x422c8086: - rc = map_me_phantom_function(domain, 22, map); + rc = map_me_phantom_function(domain, 22, mode); break; default: break; --- xen/drivers/passthrough/vtd/vtd.h.orig +++ xen/drivers/passthrough/vtd/vtd.h @@ -22,8 +22,14 @@ #include -#define MAP_ME_PHANTOM_FUNC 1 -#define UNMAP_ME_PHANTOM_FUNC 0 +/* + * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode" + * parameters. + */ +#define MAP_WITH_RMRR (1u << 0) +#define MAP_OWNER_DYING (1u << 1) +#define MAP_SINGLE_DEVICE (1u << 2) +#define UNMAP_ME_PHANTOM_FUNC (1u << 3) /* Allow for both IOAPIC and IOSAPIC. */ #define IO_xAPIC_route_entry IO_APIC_route_entry From: Jan Beulich Subject: AMD/IOMMU: re-assign devices directly Devices with unity map ranges, due to it being unspecified how/when these memory ranges may get accessed, may not be left disconnected from their unity mappings (as long as it's not certain that the device has been fully quiesced). Hence rather than tearing down the old root page table pointer and then establishing the new one, re-assignment needs to be done in a single step. This is CVE-2022-26360 / part of XSA-400. Reported-by: Roger Pau Monné Similarly quarantining scratch-page mode relies on page tables to be continuously wired up. To avoid complicating things more than necessary, treat all devices mostly equally, i.e. regardless of their association with any unity map ranges. The main difference is when it comes to updating DTEs, which need to be atomic when there are unity mappings. Yet atomicity can only be achieved with CMPXCHG16B, availability of which we can't take for given. Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Roger Pau Monné --- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig +++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -79,9 +79,13 @@ void amd_iommu_set_intremap_table(struct const void *ptr, const struct amd_iommu *iommu, bool valid); -void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, - uint64_t root_ptr, uint16_t domain_id, - uint8_t paging_mode, bool valid); +#define SET_ROOT_VALID (1u << 0) +#define SET_ROOT_WITH_UNITY_MAP (1u << 1) +int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, + uint64_t root_ptr, + uint16_t domain_id, + uint8_t paging_mode, + unsigned int flags); void iommu_dte_add_device_entry(struct amd_iommu_dte *dte, const struct ivrs_mappings *ivrs_dev); void iommu_dte_set_guest_cr3(struct amd_iommu_dte *dte, uint16_t dom_id, --- xen/drivers/passthrough/amd/iommu_map.c.orig +++ xen/drivers/passthrough/amd/iommu_map.c @@ -103,10 +103,69 @@ static unsigned int set_iommu_pte_presen return flush_flags; } -void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, - uint64_t root_ptr, uint16_t domain_id, - uint8_t paging_mode, bool valid) +/* + * This function returns + * - -errno for errors, + * - 0 for a successful update, atomic when necessary + * - 1 for a successful but non-atomic update, which may need to be warned + * about by the caller. 
+ */ +int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte, + uint64_t root_ptr, uint16_t domain_id, + uint8_t paging_mode, unsigned int flags) { + bool valid = flags & SET_ROOT_VALID; + + if ( dte->v && dte->tv && + (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) ) + { + union { + struct amd_iommu_dte dte; + uint64_t raw64[4]; + __uint128_t raw128[2]; + } ldte = { .dte = *dte }; + __uint128_t old = ldte.raw128[0]; + int ret = 0; + + ldte.dte.domain_id = domain_id; + ldte.dte.pt_root = paddr_to_pfn(root_ptr); + ldte.dte.iw = true; + ldte.dte.ir = true; + ldte.dte.paging_mode = paging_mode; + ldte.dte.v = valid; + + if ( cpu_has_cx16 ) + { + __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]); + + /* + * Hardware does not update the DTE behind our backs, so the + * return value should match "old". + */ + if ( res != old ) + { + printk(XENLOG_ERR + "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n", + domain_id, + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + ret = -EILSEQ; + } + } + else /* Best effort, updating domain_id last. */ + { + uint64_t *ptr = (void *)dte; + + write_atomic(ptr + 0, ldte.raw64[0]); + /* No barrier should be needed between these two. */ + write_atomic(ptr + 1, ldte.raw64[1]); + + ret = 1; + } + + return ret; + } + if ( valid || dte->v ) { dte->tv = false; @@ -121,6 +180,8 @@ void amd_iommu_set_root_page_table(struc smp_wmb(); dte->tv = true; dte->v = valid; + + return 0; } void amd_iommu_set_intremap_table( --- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig +++ xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -85,40 +85,81 @@ int get_dma_requestor_id(uint16_t seg, u return req_id; } -static void amd_iommu_setup_domain_device( +static int __must_check allocate_domain_resources(struct domain_iommu *hd) +{ + int rc; + + spin_lock(&hd->arch.mapping_lock); + rc = amd_iommu_alloc_root(hd); + spin_unlock(&hd->arch.mapping_lock); + + return rc; +} + +static bool any_pdev_behind_iommu(const struct domain *d, + const struct pci_dev *exclude, + const struct amd_iommu *iommu) +{ + const struct pci_dev *pdev; + + for_each_pdev ( d, pdev ) + { + if ( pdev == exclude ) + continue; + + if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu ) + return true; + } + + return false; +} + +static int __must_check amd_iommu_setup_domain_device( struct domain *domain, struct amd_iommu *iommu, uint8_t devfn, struct pci_dev *pdev) { struct amd_iommu_dte *table, *dte; unsigned long flags; - int req_id, valid = 1; + unsigned int req_id, sr_flags; + int rc; u8 bus = pdev->bus; - const struct domain_iommu *hd = dom_iommu(domain); + struct domain_iommu *hd = dom_iommu(domain); + const struct ivrs_mappings *ivrs_dev; - BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode || - !iommu->dev_table.buffer ); + BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer); - if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) - valid = 0; + rc = allocate_domain_resources(hd); + if ( rc ) + return rc; + + req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf); + ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain) + ? 0 : SET_ROOT_VALID) + | (ivrs_dev->unity_map ? 
SET_ROOT_WITH_UNITY_MAP : 0); /* get device-table entry */ req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); table = iommu->dev_table.buffer; dte = &table[req_id]; + ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; spin_lock_irqsave(&iommu->lock, flags); if ( !dte->v || !dte->tv ) { - const struct ivrs_mappings *ivrs_dev; - /* bind DTE to domain page-tables */ - amd_iommu_set_root_page_table( - dte, page_to_maddr(hd->arch.root_table), domain->domain_id, - hd->arch.paging_mode, valid); + rc = amd_iommu_set_root_page_table( + dte, page_to_maddr(hd->arch.root_table), + domain->domain_id, hd->arch.paging_mode, sr_flags); + if ( rc ) + { + ASSERT(rc < 0); + spin_unlock_irqrestore(&iommu->lock, flags); + return rc; + } /* Undo what amd_iommu_disable_domain_device() may have done. */ - ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; if ( dte->it_root ) { dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED; @@ -133,17 +174,74 @@ static void amd_iommu_setup_domain_devic dte->i = ats_enabled; amd_iommu_flush_device(iommu, req_id); + } + else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) ) + { + /* + * Strictly speaking if the device is the only one with this requestor + * ID, it could be allowed to be re-assigned regardless of unity map + * presence. But let's deal with that case only if it is actually + * found in the wild. + */ + if ( req_id != PCI_BDF2(bus, devfn) && + (sr_flags & SET_ROOT_WITH_UNITY_MAP) ) + rc = -EOPNOTSUPP; + else + rc = amd_iommu_set_root_page_table( + dte, page_to_maddr(hd->arch.root_table), + domain->domain_id, hd->arch.paging_mode, sr_flags); + if ( rc < 0 ) + { + spin_unlock_irqrestore(&iommu->lock, flags); + return rc; + } + if ( rc && + domain != pdev->domain && + /* + * By non-atomically updating the DTE's domain ID field last, + * during a short window in time TLB entries with the old domain + * ID but the new page tables may have been inserted. This could + * affect I/O of other devices using this same (old) domain ID. + * Such updating therefore is not a problem if this was the only + * device associated with the old domain ID. Diverting I/O of any + * of a dying domain's devices to the quarantine page tables is + * intended anyway. + */ + !pdev->domain->is_dying && + (any_pdev_behind_iommu(pdev->domain, pdev, iommu) || + pdev->phantom_stride) ) + printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", + pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + pdev->domain); + + /* + * Check remaining settings are still in place from an earlier call + * here. They're all independent of the domain, so should not have + * changed. 
+ */ + if ( dte->it_root ) + ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED); + ASSERT(dte->iv == iommu_intremap); + ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion); + ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags, + ACPI_IVHD_SYSTEM_MGMT)); - AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " - "root table = %#"PRIx64", " - "domain = %d, paging mode = %d\n", - req_id, pdev->type, - page_to_maddr(hd->arch.root_table), - domain->domain_id, hd->arch.paging_mode); + if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && + iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) + ASSERT(dte->i == ats_enabled); + + amd_iommu_flush_device(iommu, req_id); } spin_unlock_irqrestore(&iommu->lock, flags); + AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " + "root table = %#"PRIx64", " + "domain = %d, paging mode = %d\n", + req_id, pdev->type, + page_to_maddr(hd->arch.root_table), + domain->domain_id, hd->arch.paging_mode); + ASSERT(pcidevs_locked()); if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && @@ -154,6 +252,8 @@ static void amd_iommu_setup_domain_devic amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0); } + + return 0; } int __init acpi_ivrs_init(void) @@ -223,17 +323,6 @@ int amd_iommu_alloc_root(struct domain_i return 0; } -static int __must_check allocate_domain_resources(struct domain_iommu *hd) -{ - int rc; - - spin_lock(&hd->arch.mapping_lock); - rc = amd_iommu_alloc_root(hd); - spin_unlock(&hd->arch.mapping_lock); - - return rc; -} - int __read_mostly amd_iommu_min_paging_mode = 1; static int amd_iommu_domain_init(struct domain *d) @@ -333,7 +422,6 @@ static int reassign_device(struct domain { struct amd_iommu *iommu; int bdf, rc; - struct domain_iommu *t = dom_iommu(target); const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); bdf = PCI_BDF2(pdev->bus, pdev->devfn); @@ -347,7 +435,15 @@ static int reassign_device(struct domain return -ENODEV; } - amd_iommu_disable_domain_device(source, iommu, devfn, pdev); + rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev); + if ( rc ) + return rc; + + if ( devfn == pdev->devfn && pdev->domain != target ) + { + list_move(&pdev->domain_list, &target->pdev_list); + pdev->domain = target; + } /* * If the device belongs to the hardware domain, and it has a unity mapping, @@ -363,27 +459,10 @@ static int reassign_device(struct domain return rc; } - if ( devfn == pdev->devfn && pdev->domain != dom_io ) - { - list_move(&pdev->domain_list, &dom_io->pdev_list); - pdev->domain = dom_io; - } - - rc = allocate_domain_resources(t); - if ( rc ) - return rc; - - amd_iommu_setup_domain_device(target, iommu, devfn, pdev); AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), source->domain_id, target->domain_id); - if ( devfn == pdev->devfn && pdev->domain != target ) - { - list_move(&pdev->domain_list, &target->pdev_list); - pdev->domain = target; - } - return 0; } @@ -547,8 +626,7 @@ static int amd_iommu_add_device(u8 devfn spin_unlock_irqrestore(&iommu->lock, flags); } - amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); - return 0; + return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); } static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) From: Jan Beulich Subject: VT-d: prepare for per-device quarantine page tables (part I) Arrange for domain ID and page table root to be passed around, the latter in particular to domain_pgd_maddr() such that 
taking it from the per-domain fields can be overridden. No functional change intended. Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Roger Pau Monné Reviewed-by: Kevin Tian --- xen/drivers/passthrough/vtd/extern.h.orig +++ xen/drivers/passthrough/vtd/extern.h @@ -86,9 +86,10 @@ void *map_vtd_domain_page(u64 maddr); void unmap_vtd_domain_page(void *va); int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn, - const struct pci_dev *pdev, unsigned int mode); + const struct pci_dev *pdev, domid_t domid, + paddr_t pgd_maddr, unsigned int mode); int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn); + uint8_t bus, uint8_t devfn, domid_t domid); int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); @@ -107,7 +108,8 @@ void platform_quirks_init(void); void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, - uint8_t devfn, unsigned int mode); + uint8_t devfn, domid_t domid, paddr_t pgd_maddr, + unsigned int mode); void pci_vtd_quirk(const struct pci_dev *); void quirk_iommu_caps(struct vtd_iommu *iommu); --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -1364,12 +1364,12 @@ int domain_context_mapping_one( struct domain *domain, struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, - unsigned int mode) + domid_t domid, paddr_t pgd_maddr, unsigned int mode) { struct domain_iommu *hd = dom_iommu(domain); struct context_entry *context, *context_entries, lctxt; __uint128_t old; - u64 maddr, pgd_maddr; + uint64_t maddr; uint16_t seg = iommu->drhd->segment, prev_did = 0; struct domain *prev_dom = NULL; int agaw, rc, ret; @@ -1410,10 +1410,12 @@ int domain_context_mapping_one( } else { + paddr_t root = pgd_maddr; + spin_lock(&hd->arch.mapping_lock); /* Ensure we have pagetables allocated down to leaf PTE. */ - if ( hd->arch.pgd_maddr == 0 ) + if ( !root ) { addr_to_dma_page_maddr(domain, 0, 1); if ( hd->arch.pgd_maddr == 0 ) @@ -1426,22 +1428,24 @@ int domain_context_mapping_one( rcu_unlock_domain(prev_dom); return -ENOMEM; } + + root = hd->arch.pgd_maddr; } /* Skip top levels of page tables for 2- and 3-level DRHDs. */ - pgd_maddr = hd->arch.pgd_maddr; for ( agaw = level_to_agaw(4); agaw != level_to_agaw(iommu->nr_pt_levels); agaw-- ) { - struct dma_pte *p = map_vtd_domain_page(pgd_maddr); - pgd_maddr = dma_pte_addr(*p); + struct dma_pte *p = map_vtd_domain_page(root); + + root = dma_pte_addr(*p); unmap_vtd_domain_page(p); - if ( pgd_maddr == 0 ) + if ( !root ) goto nomem; } - context_set_address_root(lctxt, pgd_maddr); + context_set_address_root(lctxt, root); if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); else @@ -1557,15 +1561,21 @@ int domain_context_mapping_one( unmap_vtd_domain_page(context_entries); if ( !seg && !rc ) - rc = me_wifi_quirk(domain, bus, devfn, mode); + rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); if ( rc ) { if ( !prev_dom ) - domain_context_unmap_one(domain, iommu, bus, devfn); + domain_context_unmap_one(domain, iommu, bus, devfn, + domain->domain_id); else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ + { + hd = dom_iommu(prev_dom); domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, + domain->domain_id, + hd->arch.pgd_maddr, mode & MAP_WITH_RMRR); + } } if ( prev_dom ) @@ -1582,6 +1592,7 @@ static int domain_context_mapping(struct { struct acpi_drhd_unit *drhd; const struct acpi_rmrr_unit *rmrr; + paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; int ret = 0; unsigned int i, mode = 0; uint16_t seg = pdev->seg, bdf; @@ -1647,7 +1658,8 @@ static int domain_context_mapping(struct domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev, mode); + pdev, domain->domain_id, pgd_maddr, + mode); if ( ret > 0 ) ret = 0; if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) @@ -1662,7 +1674,8 @@ static int domain_context_mapping(struct PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev, mode); + pdev, domain->domain_id, pgd_maddr, + mode); if ( ret < 0 ) break; prev_present = ret; @@ -1690,7 +1703,8 @@ static int domain_context_mapping(struct */ if ( ret >= 0 ) ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - NULL, mode); + NULL, domain->domain_id, pgd_maddr, + mode); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different @@ -1705,7 +1719,8 @@ static int domain_context_mapping(struct if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, - NULL, mode); + NULL, domain->domain_id, pgd_maddr, + mode); if ( ret ) { @@ -1734,7 +1749,7 @@ static int domain_context_mapping(struct int domain_context_unmap_one( struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn) + uint8_t bus, uint8_t devfn, domid_t domid) { struct context_entry *context, *context_entries; u64 maddr; @@ -1792,7 +1807,7 @@ int domain_context_unmap_one( unmap_vtd_domain_page(context_entries); if ( !iommu->drhd->segment && !rc ) - rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC); + rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC); if ( rc && !is_hardware_domain(domain) && domain != dom_io ) { @@ -1844,7 +1859,8 @@ static int domain_context_unmap(struct d printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_unmap_one(domain, iommu, bus, devfn); + ret = domain_context_unmap_one(domain, iommu, bus, devfn, + domain->domain_id); if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) disable_ats_device(pdev); @@ -1854,7 +1870,8 @@ static int domain_context_unmap(struct d if ( iommu_debug ) printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_unmap_one(domain, iommu, bus, devfn); + ret = domain_context_unmap_one(domain, iommu, bus, devfn, + domain->domain_id); if ( ret ) break; @@ -1880,12 +1897,15 @@ static int domain_context_unmap(struct d /* PCIe to PCI/PCIx bridge */ if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) { - ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); + ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, + domain->domain_id); if ( !ret ) - ret = domain_context_unmap_one(domain, iommu, secbus, 0); + ret = domain_context_unmap_one(domain, iommu, secbus, 0, + domain->domain_id); } else /* Legacy PCI bridge */ - ret = 
domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); + ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, + domain->domain_id); break; --- xen/drivers/passthrough/vtd/quirks.c.orig +++ xen/drivers/passthrough/vtd/quirks.c @@ -344,6 +344,8 @@ void __init platform_quirks_init(void) static int __must_check map_me_phantom_function(struct domain *domain, unsigned int dev, + domid_t domid, + paddr_t pgd_maddr, unsigned int mode) { struct acpi_drhd_unit *drhd; @@ -357,16 +359,17 @@ static int __must_check map_me_phantom_f /* map or unmap ME phantom function */ if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) rc = domain_context_mapping_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7), NULL, mode); + PCI_DEVFN(dev, 7), NULL, + domid, pgd_maddr, mode); else rc = domain_context_unmap_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7)); + PCI_DEVFN(dev, 7), domid); return rc; } int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, - unsigned int mode) + domid_t domid, paddr_t pgd_maddr, unsigned int mode) { u32 id; int rc = 0; @@ -390,7 +393,7 @@ int me_wifi_quirk(struct domain *domain, case 0x423b8086: case 0x423c8086: case 0x423d8086: - rc = map_me_phantom_function(domain, 3, mode); + rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode); break; default: break; @@ -416,7 +419,7 @@ int me_wifi_quirk(struct domain *domain, case 0x42388086: /* Puma Peak */ case 0x422b8086: case 0x422c8086: - rc = map_me_phantom_function(domain, 22, mode); + rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode); break; default: break; From: Jan Beulich Subject: VT-d: prepare for per-device quarantine page tables (part II) Replace the passing of struct domain * by domid_t in preparation of per-device quarantine page tables also requiring per-device pseudo domain IDs, which aren't going to be associated with any struct domain instances. No functional change intended (except for slightly adjusted log message text). 
Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Kevin Tian Reviewed-by: Roger Pau Monné --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -52,8 +52,8 @@ static struct tasklet vtd_fault_tasklet; static int setup_hwdom_device(u8 devfn, struct pci_dev *); static void setup_hwdom_rmrr(struct domain *d); -static int domain_iommu_domid(struct domain *d, - struct vtd_iommu *iommu) +static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu, + bool warn) { unsigned long nr_dom, i; @@ -61,16 +61,16 @@ static int domain_iommu_domid(struct dom i = find_first_bit(iommu->domid_bitmap, nr_dom); while ( i < nr_dom ) { - if ( iommu->domid_map[i] == d->domain_id ) + if ( iommu->domid_map[i] == domid ) return i; i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); } - if ( !d->is_dying ) + if ( warn ) dprintk(XENLOG_ERR VTDPREFIX, - "Cannot get valid iommu %u domid: %pd\n", - iommu->index, d); + "No valid iommu %u domid for Dom%d\n", + iommu->index, domid); return -1; } @@ -78,8 +78,7 @@ static int domain_iommu_domid(struct dom #define DID_FIELD_WIDTH 16 #define DID_HIGH_OFFSET 8 static int context_set_domain_id(struct context_entry *context, - struct domain *d, - struct vtd_iommu *iommu) + domid_t domid, struct vtd_iommu *iommu) { unsigned long nr_dom, i; int found = 0; @@ -90,7 +89,7 @@ static int context_set_domain_id(struct i = find_first_bit(iommu->domid_bitmap, nr_dom); while ( i < nr_dom ) { - if ( iommu->domid_map[i] == d->domain_id ) + if ( iommu->domid_map[i] == domid ) { found = 1; break; @@ -106,7 +105,7 @@ static int context_set_domain_id(struct dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n"); return -EFAULT; } - iommu->domid_map[i] = d->domain_id; + iommu->domid_map[i] = domid; } set_bit(i, iommu->domid_bitmap); @@ -115,9 +114,9 @@ static int context_set_domain_id(struct return 0; } -static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu) +static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu) { - int iommu_domid = domain_iommu_domid(domain, iommu); + int iommu_domid = get_iommu_did(domid, iommu, false); if ( iommu_domid >= 0 ) { @@ -173,7 +172,7 @@ static void check_cleanup_domid_map(stru if ( !found ) { clear_bit(iommu->index, &dom_iommu(d)->arch.iommu_bitmap); - cleanup_domid_map(d, iommu); + cleanup_domid_map(d->domain_id, iommu); } } @@ -630,7 +629,7 @@ static int __must_check iommu_flush_iotl continue; flush_dev_iotlb = !!find_ats_dev_drhd(iommu); - iommu_domid= domain_iommu_domid(d, iommu); + iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying); if ( iommu_domid == -1 ) continue; @@ -1454,7 +1453,7 @@ int domain_context_mapping_one( spin_unlock(&hd->arch.mapping_lock); } - rc = context_set_domain_id(&lctxt, domain, iommu); + rc = context_set_domain_id(&lctxt, domid, iommu); if ( rc ) { unlock: @@ -1774,7 +1773,7 @@ int domain_context_unmap_one( context_clear_entry(*context); iommu_sync_cache(context, sizeof(struct context_entry)); - iommu_domid= domain_iommu_domid(domain, iommu); + iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying); if ( iommu_domid == -1 ) { spin_unlock(&iommu->lock); @@ -1948,7 +1947,7 @@ static void iommu_domain_teardown(struct spin_unlock(&hd->arch.mapping_lock); for_each_drhd_unit ( drhd ) - cleanup_domid_map(d, drhd->iommu); + cleanup_domid_map(d->domain_id, drhd->iommu); } static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, From: Jan Beulich Subject: IOMMU/x86: maintain a per-device pseudo domain ID In 
order to subsequently enable per-device quarantine page tables, we'll need domain-ID-like identifiers to be inserted in the respective device (AMD) or context (Intel) table entries alongside the per-device page table root addresses. Make use of "real" domain IDs occupying only half of the value range coverable by domid_t. Note that in VT-d's iommu_alloc() I didn't want to introduce new memory leaks in case of error, but existing ones don't get plugged - that'll be the subject of a later change. The VT-d changes are slightly asymmetric, but this way we can avoid assigning pseudo domain IDs to devices which would never be mapped while still avoiding to add a new parameter to domain_context_unmap(). Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Kevin Tian Reviewed-by: Roger Pau Monné --- xen/include/asm-x86/iommu.h.orig +++ xen/include/asm-x86/iommu.h @@ -130,6 +130,10 @@ int pi_update_irte(const struct pi_desc iommu_vcall(ops, sync_cache, addr, size); \ }) +unsigned long *iommu_init_domid(void); +domid_t iommu_alloc_domid(unsigned long *map); +void iommu_free_domid(domid_t domid, unsigned long *map); + #endif /* !__ARCH_X86_IOMMU_H__ */ /* * Local variables: --- xen/include/asm-x86/pci.h.orig +++ xen/include/asm-x86/pci.h @@ -15,6 +15,12 @@ struct arch_pci_dev { vmask_t used_vectors; + /* + * These fields are (de)initialized under pcidevs-lock. Other uses of + * them don't race (de)initialization and hence don't strictly need any + * locking. + */ + domid_t pseudo_domid; }; int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, --- xen/include/asm-x86/amd-iommu.h.orig +++ xen/include/asm-x86/amd-iommu.h @@ -94,6 +94,7 @@ struct amd_iommu { struct ring_buffer cmd_buffer; struct ring_buffer event_log; struct ring_buffer ppr_log; + unsigned long *domid_map; int exclusion_enable; int exclusion_allow_all; --- xen/drivers/passthrough/amd/iommu_detect.c.orig +++ xen/drivers/passthrough/amd/iommu_detect.c @@ -183,6 +183,11 @@ int __init amd_iommu_detect_one_acpi( if ( rt ) goto out; + iommu->domid_map = iommu_init_domid(); + rt = -ENOMEM; + if ( !iommu->domid_map ) + goto out; + rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func)); if ( rt ) printk(XENLOG_ERR @@ -194,7 +199,10 @@ int __init amd_iommu_detect_one_acpi( out: if ( rt ) + { + xfree(iommu->domid_map); xfree(iommu); + } return rt; } --- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig +++ xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -563,6 +563,8 @@ static int amd_iommu_add_device(u8 devfn struct amd_iommu *iommu; u16 bdf; struct ivrs_mappings *ivrs_mappings; + bool fresh_domid = false; + int ret; if ( !pdev->domain ) return -EINVAL; @@ -626,7 +628,22 @@ static int amd_iommu_add_device(u8 devfn spin_unlock_irqrestore(&iommu->lock, flags); } - return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); + if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID ) + { + pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map); + if ( pdev->arch.pseudo_domid == DOMID_INVALID ) + return -ENOSPC; + fresh_domid = true; + } + + ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); + if ( ret && fresh_domid ) + { + iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); + pdev->arch.pseudo_domid = DOMID_INVALID; + } + + return ret; } static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) @@ -651,6 +668,9 @@ static int amd_iommu_remove_device(u8 de amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); + 
iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); + pdev->arch.pseudo_domid = DOMID_INVALID; + ivrs_mappings = get_ivrs_mappings(pdev->seg); bdf = PCI_BDF2(pdev->bus, devfn); if ( amd_iommu_perdev_intremap && --- xen/drivers/passthrough/pci.c.orig +++ xen/drivers/passthrough/pci.c @@ -338,6 +338,7 @@ static struct pci_dev *alloc_pdev(struct *((u8*) &pdev->bus) = bus; *((u8*) &pdev->devfn) = devfn; pdev->domain = NULL; + pdev->arch.pseudo_domid = DOMID_INVALID; INIT_LIST_HEAD(&pdev->msi_list); pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), @@ -1353,9 +1354,13 @@ static int _dump_pci_devices(struct pci_ list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) { - printk("%04x:%02x:%02x.%u - %pd - node %-3d - MSIs < ", - pseg->nr, pdev->bus, - PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain, + printk("%04x:%02x:%02x.%u - ", pseg->nr, pdev->bus, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + if ( pdev->domain == dom_io ) + printk("DomIO:%x", pdev->arch.pseudo_domid); + else + printk("%pd", pdev->domain); + printk(" - node %-3d - MSIs < ", (pdev->node != NUMA_NO_NODE) ? pdev->node : -1); list_for_each_entry ( msi, &pdev->msi_list, list ) printk("%d ", msi->irq); --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1192,7 +1193,7 @@ int __init iommu_alloc(struct acpi_drhd_ { struct vtd_iommu *iommu; unsigned long sagaw, nr_dom; - int agaw; + int agaw, rc; if ( nr_iommus >= MAX_IOMMUS ) { @@ -1285,7 +1286,16 @@ int __init iommu_alloc(struct acpi_drhd_ if ( !iommu->domid_map ) return -ENOMEM; + iommu->pseudo_domid_map = iommu_init_domid(); + rc = -ENOMEM; + if ( !iommu->pseudo_domid_map ) + goto free; + return 0; + + free: + iommu_free(drhd); + return rc; } void __init iommu_free(struct acpi_drhd_unit *drhd) @@ -1308,6 +1318,7 @@ void __init iommu_free(struct acpi_drhd_ xfree(iommu->domid_bitmap); xfree(iommu->domid_map); + xfree(iommu->pseudo_domid_map); if ( iommu->msi.irq >= 0 ) destroy_irq(iommu->msi.irq); @@ -1583,8 +1594,8 @@ int domain_context_mapping_one( return rc ?: pdev && prev_dom; } -static int domain_context_unmap(struct domain *d, uint8_t devfn, - struct pci_dev *pdev); +static const struct acpi_drhd_unit *domain_context_unmap( + struct domain *d, uint8_t devfn, struct pci_dev *pdev); static int domain_context_mapping(struct domain *domain, u8 devfn, struct pci_dev *pdev) @@ -1592,6 +1603,7 @@ static int domain_context_mapping(struct struct acpi_drhd_unit *drhd; const struct acpi_rmrr_unit *rmrr; paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; + domid_t orig_domid = pdev->arch.pseudo_domid; int ret = 0; unsigned int i, mode = 0; uint16_t seg = pdev->seg, bdf; @@ -1652,6 +1664,14 @@ static int domain_context_mapping(struct break; case DEV_TYPE_PCIe_ENDPOINT: + if ( iommu_quarantine && orig_domid == DOMID_INVALID ) + { + pdev->arch.pseudo_domid = + iommu_alloc_domid(drhd->iommu->pseudo_domid_map); + if ( pdev->arch.pseudo_domid == DOMID_INVALID ) + return -ENOSPC; + } + if ( iommu_debug ) printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, @@ -1667,6 +1687,14 @@ static int domain_context_mapping(struct break; case DEV_TYPE_PCI: + if ( iommu_quarantine && orig_domid == DOMID_INVALID ) + { + pdev->arch.pseudo_domid = + iommu_alloc_domid(drhd->iommu->pseudo_domid_map); + if ( pdev->arch.pseudo_domid == DOMID_INVALID ) + return -ENOSPC; + } + if ( iommu_debug ) 
printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, @@ -1742,6 +1770,13 @@ static int domain_context_mapping(struct if ( !ret && devfn == pdev->devfn ) pci_vtd_quirk(pdev); + if ( ret && drhd && orig_domid == DOMID_INVALID ) + { + iommu_free_domid(pdev->arch.pseudo_domid, + drhd->iommu->pseudo_domid_map); + pdev->arch.pseudo_domid = DOMID_INVALID; + } + return ret; } @@ -1824,8 +1859,10 @@ int domain_context_unmap_one( return rc; } -static int domain_context_unmap(struct domain *domain, u8 devfn, - struct pci_dev *pdev) +static const struct acpi_drhd_unit *domain_context_unmap( + struct domain *domain, + uint8_t devfn, + struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; struct vtd_iommu *iommu; @@ -1834,7 +1871,7 @@ static int domain_context_unmap(struct d drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) - return -ENODEV; + return ERR_PTR(-ENODEV); iommu = drhd->iommu; switch ( pdev->type ) @@ -1845,7 +1882,7 @@ static int domain_context_unmap(struct d domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); if ( !is_hardware_domain(domain) ) - return -EPERM; + return ERR_PTR(-EPERM); goto out; case DEV_TYPE_PCIe_BRIDGE: @@ -1923,7 +1960,7 @@ static int domain_context_unmap(struct d check_cleanup_domid_map(domain, pdev, iommu); out: - return ret; + return ret ? ERR_PTR(ret) : drhd; } static void iommu_domain_teardown(struct domain *d) @@ -2145,16 +2182,17 @@ static int intel_iommu_enable_device(str static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) { + const struct acpi_drhd_unit *drhd; struct acpi_rmrr_unit *rmrr; u16 bdf; - int ret, i; + unsigned int i; if ( !pdev->domain ) return -EINVAL; - ret = domain_context_unmap(pdev->domain, devfn, pdev); - if ( ret ) - return ret; + drhd = domain_context_unmap(pdev->domain, devfn, pdev); + if ( IS_ERR(drhd) ) + return PTR_ERR(drhd); for_each_rmrr_device ( rmrr, bdf, i ) { @@ -2171,6 +2209,13 @@ static int intel_iommu_remove_device(u8 rmrr->end_address, 0); } + if ( drhd ) + { + iommu_free_domid(pdev->arch.pseudo_domid, + drhd->iommu->pseudo_domid_map); + pdev->arch.pseudo_domid = DOMID_INVALID; + } + return 0; } --- xen/drivers/passthrough/vtd/iommu.h.orig +++ xen/drivers/passthrough/vtd/iommu.h @@ -535,6 +535,7 @@ struct vtd_iommu { } flush; struct list_head ats_devices; + unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */ unsigned long *domid_bitmap; /* domain id bitmap */ u16 *domid_map; /* domain id mapping array */ uint32_t version; --- xen/drivers/passthrough/x86/iommu.c.orig +++ xen/drivers/passthrough/x86/iommu.c @@ -346,6 +346,53 @@ void __hwdom_init arch_iommu_hwdom_init( return; } +unsigned long *__init iommu_init_domid(void) +{ + if ( !iommu_quarantine ) + return ZERO_BLOCK_PTR; + + BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX); + + return xzalloc_array(unsigned long, + BITS_TO_LONGS(UINT16_MAX - DOMID_MASK)); +} + +domid_t iommu_alloc_domid(unsigned long *map) +{ + /* + * This is used uniformly across all IOMMUs, such that on typical + * systems we wouldn't re-use the same ID very quickly (perhaps never). 
+ */ + static unsigned int start; + unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start); + + ASSERT(pcidevs_locked()); + + if ( idx >= UINT16_MAX - DOMID_MASK ) + idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK); + if ( idx >= UINT16_MAX - DOMID_MASK ) + return DOMID_INVALID; + + __set_bit(idx, map); + + start = idx + 1; + + return idx | (DOMID_MASK + 1); +} + +void iommu_free_domid(domid_t domid, unsigned long *map) +{ + ASSERT(pcidevs_locked()); + + if ( domid == DOMID_INVALID ) + return; + + ASSERT(domid > DOMID_MASK); + + if ( !__test_and_clear_bit(domid & DOMID_MASK, map) ) + BUG(); +} + /* * Local variables: * mode: C --- xen/include/public/xen.h.orig +++ xen/include/public/xen.h @@ -614,6 +614,9 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); /* Idle domain. */ #define DOMID_IDLE xen_mk_uint(0x7FFF) +/* Mask for valid domain id values */ +#define DOMID_MASK xen_mk_uint(0x7FFF) + #ifndef __ASSEMBLY__ typedef uint16_t domid_t; From: Jan Beulich Subject: IOMMU/x86: drop TLB flushes from quarantine_init() hooks The page tables just created aren't hooked up yet anywhere, so there's nothing that could be present in any TLB, and hence nothing to flush. Dropping this flush is, at least on the VT-d side, a prereq to per- device domain ID use when quarantining devices, as dom_io isn't going to be assigned a DID anymore: The warning in get_iommu_did() would trigger. Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Roger Pau Monné Reviewed-by: Kevin Tian --- xen/drivers/passthrough/amd/iommu_map.c.orig +++ xen/drivers/passthrough/amd/iommu_map.c @@ -595,8 +595,6 @@ int __init amd_iommu_quarantine_init(str out: spin_unlock(&hd->arch.mapping_lock); - amd_iommu_flush_all_pages(d); - /* Pages leaked in failure case */ return level ? -ENOMEM : 0; } --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -2894,7 +2894,6 @@ static int __init intel_iommu_quarantine struct dma_pte *parent; unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); unsigned int level = agaw_to_level(agaw); - int rc; if ( hd->arch.pgd_maddr ) { @@ -2941,10 +2940,8 @@ static int __init intel_iommu_quarantine out: spin_unlock(&hd->arch.mapping_lock); - rc = iommu_flush_iotlb_all(d); - /* Pages leaked in failure case */ - return level ? -ENOMEM : rc; + return level ? -ENOMEM : 0; } const struct iommu_ops __initconstrel intel_iommu_ops = { From: Jan Beulich Subject: AMD/IOMMU: abstract maximum number of page table levels We will want to use the constant elsewhere. 
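Referring back to the pseudo domain ID allocation introduced a couple of patches above: the new IDs are deliberately placed in the upper half of the 16-bit domid_t space, so they can never collide with a real (or reserved) domain ID, all of which fit within DOMID_MASK. A minimal standalone sketch of that encoding follows; it assumes the DOMID_MASK value defined above, and the helper names are local to the sketch rather than Xen's:

    #include <assert.h>
    #include <stdint.h>

    #define SKETCH_DOMID_MASK 0x7fffu          /* mirrors DOMID_MASK above */

    /* Encode a bitmap slot as a pseudo domain ID, as iommu_alloc_domid() does. */
    static uint16_t sketch_pseudo_domid(unsigned int slot)
    {
        assert(slot <= SKETCH_DOMID_MASK);
        return slot | (SKETCH_DOMID_MASK + 1); /* lands in 0x8000 and up */
    }

    /* Recover the bitmap slot on free, as iommu_free_domid() does. */
    static unsigned int sketch_pseudo_slot(uint16_t domid)
    {
        assert(domid > SKETCH_DOMID_MASK);     /* never a real domain ID */
        return domid & SKETCH_DOMID_MASK;
    }

    int main(void)
    {
        assert(sketch_pseudo_domid(0) == 0x8000);
        assert(sketch_pseudo_slot(sketch_pseudo_domid(42)) == 42);
        return 0;
    }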
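As a minimal illustration of the computation this constant bounds, here is a standalone sketch of the paging-mode loop, assuming 512 page-table entries per level (9 address bits per level, as on the AMD IOMMU); the identifiers are local to the sketch, not Xen's:

    #include <stdio.h>

    #define SKETCH_MAX_PT_LEVELS  6            /* mirrors IOMMU_MAX_PT_LEVELS */
    #define SKETCH_PTES_PER_TABLE 512ull       /* assumed: 9 bits per level */

    /* Smallest number of page-table levels covering max_frames frame numbers. */
    static int sketch_paging_mode(unsigned long long max_frames)
    {
        int level = 1;

        while ( max_frames > SKETCH_PTES_PER_TABLE )
        {
            /* Round up, then consume another 9 bits of frame number. */
            max_frames = (max_frames + SKETCH_PTES_PER_TABLE - 1) /
                         SKETCH_PTES_PER_TABLE;
            if ( ++level > SKETCH_MAX_PT_LEVELS )
                return -1;                     /* address width not representable */
        }

        return level;
    }

    int main(void)
    {
        /* 48-bit address width => 2^36 4k frames => 4 levels. */
        printf("%d\n", sketch_paging_mode(1ull << 36));
        return 0;
    }

A 48-bit address width therefore resolves to 4 levels, comfortably below the 6-level ceiling that the 3-bit paging_mode field of the DTE (see the amd-iommu-defs.h hunk below) can express.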
Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant --- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig +++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -193,7 +193,7 @@ static inline int amd_iommu_get_paging_m while ( max_frames > PTE_PER_TABLE_SIZE ) { max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT; - if ( ++level > 6 ) + if ( ++level > IOMMU_MAX_PT_LEVELS ) return -ENOMEM; } --- xen/include/asm-x86/hvm/svm/amd-iommu-defs.h.orig +++ xen/include/asm-x86/hvm/svm/amd-iommu-defs.h @@ -110,6 +110,7 @@ struct amd_iommu_dte { bool tv:1; unsigned int :5; unsigned int had:2; +#define IOMMU_MAX_PT_LEVELS 6 unsigned int paging_mode:3; uint64_t pt_root:40; bool ppr:1; --- xen/drivers/passthrough/amd/iommu_map.c.orig +++ xen/drivers/passthrough/amd/iommu_map.c @@ -260,7 +260,7 @@ static int iommu_pde_from_dfn(struct dom table = hd->arch.root_table; level = hd->arch.paging_mode; - BUG_ON( table == NULL || level < 1 || level > 6 ); + BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS ); /* * A frame number past what the current page tables can represent can't From: Jan Beulich Subject: IOMMU/x86: use per-device page tables for quarantining Devices with RMRRs / unity mapped regions, due to it being unspecified how/when these memory regions may be accessed, may not be left disconnected from the mappings of these regions (as long as it's not certain that the device has been fully quiesced). Hence even the page tables used when quarantining such devices need to have mappings of those regions. This implies installing page tables in the first place even when not in scratch-page quarantining mode. This is CVE-2022-26361 / part of XSA-400. While for the purpose here it would be sufficient to have devices with RMRRs / unity mapped regions use per-device page tables, extend this to all devices (in scratch-page quarantining mode). This allows the leaf pages to be mapped r/w, thus covering also memory writes (rather than just reads) issued by non-quiescent devices. Set up quarantine page tables as late as possible, yet early enough to not encounter failure during de-assign. This means setup generally happens in assign_device(), while (for now) the one in deassign_device() is there mainly to be on the safe side. In VT-d's DID allocation function don't require the IOMMU lock to be held anymore: All involved code paths hold pcidevs_lock, so this way we avoid the need to acquire the IOMMU lock around the new call to context_set_domain_id(). Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Reviewed-by: Kevin Tian Reviewed-by: Roger Pau Monné --- xen/arch/x86/mm/p2m.c.orig +++ xen/arch/x86/mm/p2m.c @@ -1453,7 +1453,7 @@ int set_identity_p2m_entry(struct domain struct p2m_domain *p2m = p2m_get_hostp2m(d); int ret; - if ( !paging_mode_translate(p2m->domain) ) + if ( !paging_mode_translate(d) ) { if ( !is_iommu_enabled(d) ) return 0; --- xen/include/asm-x86/pci.h.orig +++ xen/include/asm-x86/pci.h @@ -1,6 +1,8 @@ #ifndef __X86_PCI_H__ #define __X86_PCI_H__ +#include + #define CF8_BDF(cf8) ( ((cf8) & 0x00ffff00) >> 8) #define CF8_ADDR_LO(cf8) ( (cf8) & 0x000000fc) #define CF8_ADDR_HI(cf8) ( ((cf8) & 0x0f000000) >> 16) @@ -20,7 +22,18 @@ struct arch_pci_dev { * them don't race (de)initialization and hence don't strictly need any * locking. */ + union { + /* Subset of struct arch_iommu's fields, to be used in dom_io. 
*/ + struct { + uint64_t pgd_maddr; + } vtd; + struct { + struct page_info *root_table; + } amd; + }; domid_t pseudo_domid; + mfn_t leaf_mfn; + struct page_list_head pgtables_list; }; int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, --- xen/include/asm-x86/hvm/svm/amd-iommu-proto.h.orig +++ xen/include/asm-x86/hvm/svm/amd-iommu-proto.h @@ -54,7 +54,8 @@ int amd_iommu_init_late(void); int amd_iommu_update_ivrs_mapping_acpi(void); int iov_adjust_irq_affinities(void); -int amd_iommu_quarantine_init(struct domain *d); +int amd_iommu_quarantine_init(struct pci_dev *pdev); +void amd_iommu_quarantine_teardown(struct pci_dev *pdev); /* mapping functions */ int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn, --- xen/drivers/passthrough/amd/iommu_map.c.orig +++ xen/drivers/passthrough/amd/iommu_map.c @@ -539,64 +539,137 @@ int amd_iommu_reserve_domain_unity_unmap return rc; } -int __init amd_iommu_quarantine_init(struct domain *d) +static int fill_qpt(union amd_iommu_pte *this, unsigned int level, + struct page_info *pgs[IOMMU_MAX_PT_LEVELS], + struct pci_dev *pdev) { - struct domain_iommu *hd = dom_iommu(d); + unsigned int i; + int rc = 0; + + for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i ) + { + union amd_iommu_pte *pte = &this[i], *next; + + if ( !pte->pr ) + { + if ( !pgs[level] ) + { + /* + * The pgtable allocator is fine for the leaf page, as well as + * page table pages, and the resulting allocations are always + * zeroed. + */ + pgs[level] = alloc_amd_iommu_pgtable(); + if ( !pgs[level] ) + { + rc = -ENOMEM; + break; + } + + page_list_add(pgs[level], &pdev->arch.pgtables_list); + + if ( level ) + { + next = __map_domain_page(pgs[level]); + rc = fill_qpt(next, level - 1, pgs, pdev); + unmap_domain_page(next); + } + } + + /* + * PDEs are essentially a subset of PTEs, so this function + * is fine to use even at the leaf. + */ + set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level, + true, true); + } + else if ( level && pte->next_level ) + { + page_list_add(mfn_to_page(_mfn(pte->mfn)), + &pdev->arch.pgtables_list); + next = map_domain_page(_mfn(pte->mfn)); + rc = fill_qpt(next, level - 1, pgs, pdev); + unmap_domain_page(next); + } + } + + return rc; +} + +int amd_iommu_quarantine_init(struct pci_dev *pdev) +{ + struct domain_iommu *hd = dom_iommu(dom_io); unsigned long end_gfn = 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT); unsigned int level = amd_iommu_get_paging_mode(end_gfn); - union amd_iommu_pte *table; + unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf); + const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); + int rc; - if ( hd->arch.root_table ) + ASSERT(pcidevs_locked()); + ASSERT(!hd->arch.root_table); + + ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID); + + if ( pdev->arch.amd.root_table ) { - ASSERT_UNREACHABLE(); + clear_domain_page(pdev->arch.leaf_mfn); return 0; } - spin_lock(&hd->arch.mapping_lock); - - hd->arch.root_table = alloc_amd_iommu_pgtable(); - if ( !hd->arch.root_table ) - goto out; - - table = __map_domain_page(hd->arch.root_table); - while ( level ) + pdev->arch.amd.root_table = alloc_amd_iommu_pgtable(); + if ( !pdev->arch.amd.root_table ) + return -ENOMEM; + + /* Transiently install the root into DomIO, for iommu_identity_mapping(). 
*/ + hd->arch.root_table = pdev->arch.amd.root_table; + + rc = amd_iommu_reserve_domain_unity_map(dom_io, + ivrs_mappings[req_id].unity_map, + 0); + + iommu_identity_map_teardown(dom_io); + hd->arch.root_table = NULL; + + if ( rc ) + printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n", + pdev->seg, pdev->bus, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + else { - struct page_info *pg; - unsigned int i; + union amd_iommu_pte *root; + struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {}; - /* - * The pgtable allocator is fine for the leaf page, as well as - * page table pages, and the resulting allocations are always - * zeroed. - */ - pg = alloc_amd_iommu_pgtable(); - if ( !pg ) - break; + spin_lock(&hd->arch.mapping_lock); - for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) - { - union amd_iommu_pte *pde = &table[i]; + root = __map_domain_page(pdev->arch.amd.root_table); + rc = fill_qpt(root, level - 1, pgs, pdev); + unmap_domain_page(root); - /* - * PDEs are essentially a subset of PTEs, so this function - * is fine to use even at the leaf. - */ - set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1, - false, true); - } + pdev->arch.leaf_mfn = page_to_mfn(pgs[0]); - unmap_domain_page(table); - table = __map_domain_page(pg); - level--; + spin_unlock(&hd->arch.mapping_lock); } - unmap_domain_page(table); - out: - spin_unlock(&hd->arch.mapping_lock); + if ( rc ) + amd_iommu_quarantine_teardown(pdev); + + return rc; +} + +void amd_iommu_quarantine_teardown(struct pci_dev *pdev) +{ + struct page_info *pg; + + ASSERT(pcidevs_locked()); + + if ( !pdev->arch.amd.root_table ) + return; + + while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) ) + free_amd_iommu_pgtable(pg); - /* Pages leaked in failure case */ - return level ? -ENOMEM : 0; + pdev->arch.amd.root_table = NULL; } /* --- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig +++ xen/drivers/passthrough/amd/pci_amd_iommu.c @@ -125,6 +125,8 @@ static int __must_check amd_iommu_setup_ u8 bus = pdev->bus; struct domain_iommu *hd = dom_iommu(domain); const struct ivrs_mappings *ivrs_dev; + const struct page_info *root_pg; + domid_t domid; BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer); @@ -144,14 +146,25 @@ static int __must_check amd_iommu_setup_ dte = &table[req_id]; ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id]; + if ( domain != dom_io ) + { + root_pg = hd->arch.root_table; + domid = domain->domain_id; + } + else + { + root_pg = pdev->arch.amd.root_table; + domid = pdev->arch.pseudo_domid; + } + spin_lock_irqsave(&iommu->lock, flags); if ( !dte->v || !dte->tv ) { /* bind DTE to domain page-tables */ rc = amd_iommu_set_root_page_table( - dte, page_to_maddr(hd->arch.root_table), - domain->domain_id, hd->arch.paging_mode, sr_flags); + dte, page_to_maddr(root_pg), domid, + hd->arch.paging_mode, sr_flags); if ( rc ) { ASSERT(rc < 0); @@ -175,7 +188,7 @@ static int __must_check amd_iommu_setup_ amd_iommu_flush_device(iommu, req_id); } - else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.root_table)) ) + else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) ) { /* * Strictly speaking if the device is the only one with this requestor @@ -188,8 +201,8 @@ static int __must_check amd_iommu_setup_ rc = -EOPNOTSUPP; else rc = amd_iommu_set_root_page_table( - dte, page_to_maddr(hd->arch.root_table), - domain->domain_id, hd->arch.paging_mode, sr_flags); + dte, page_to_maddr(root_pg), domid, + hd->arch.paging_mode, sr_flags); if ( rc < 0 ) { spin_unlock_irqrestore(&iommu->lock, flags); @@ -208,6 +221,7 @@ static int 
__must_check amd_iommu_setup_ * intended anyway. */ !pdev->domain->is_dying && + pdev->domain != dom_io && (any_pdev_behind_iommu(pdev->domain, pdev, iommu) || pdev->phantom_stride) ) printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", @@ -238,9 +252,8 @@ static int __must_check amd_iommu_setup_ AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " "root table = %#"PRIx64", " "domain = %d, paging mode = %d\n", - req_id, pdev->type, - page_to_maddr(hd->arch.root_table), - domain->domain_id, hd->arch.paging_mode); + req_id, pdev->type, page_to_maddr(root_pg), + domid, hd->arch.paging_mode); ASSERT(pcidevs_locked()); @@ -313,7 +326,7 @@ static int iov_enable_xt(void) int amd_iommu_alloc_root(struct domain_iommu *hd) { - if ( unlikely(!hd->arch.root_table) ) + if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) ) { hd->arch.root_table = alloc_amd_iommu_pgtable(); if ( !hd->arch.root_table ) @@ -404,7 +417,7 @@ static void amd_iommu_disable_domain_dev AMD_IOMMU_DEBUG("Disable: device id = %#x, " "domain = %d, paging mode = %d\n", - req_id, domain->domain_id, + req_id, dte->domain_id, dom_iommu(domain)->arch.paging_mode); } spin_unlock_irqrestore(&iommu->lock, flags); @@ -668,6 +681,8 @@ static int amd_iommu_remove_device(u8 de amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); + amd_iommu_quarantine_teardown(pdev); + iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map); pdev->arch.pseudo_domid = DOMID_INVALID; --- xen/drivers/passthrough/iommu.c.orig +++ xen/drivers/passthrough/iommu.c @@ -450,21 +450,21 @@ int iommu_iotlb_flush_all(struct domain return rc; } -static int __init iommu_quarantine_init(void) +int iommu_quarantine_dev_init(device_t *dev) { const struct domain_iommu *hd = dom_iommu(dom_io); - int rc; - dom_io->options |= XEN_DOMCTL_CDF_iommu; + if ( !iommu_quarantine || !hd->platform_ops->quarantine_init ) + return 0; - rc = iommu_domain_init(dom_io, 0); - if ( rc ) - return rc; + return iommu_call(hd->platform_ops, quarantine_init, dev); +} - if ( !hd->platform_ops->quarantine_init ) - return 0; +static int __init iommu_quarantine_init(void) +{ + dom_io->options |= XEN_DOMCTL_CDF_iommu; - return hd->platform_ops->quarantine_init(dom_io); + return iommu_domain_init(dom_io, 0); } int __init iommu_setup(void) --- xen/drivers/passthrough/pci.c.orig +++ xen/drivers/passthrough/pci.c @@ -929,9 +929,16 @@ static int deassign_device(struct domain return -ENODEV; /* De-assignment from dom_io should de-quarantine the device */ - target = ((pdev->quarantine || iommu_quarantine) && - pdev->domain != dom_io) ? - dom_io : hardware_domain; + if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io ) + { + ret = iommu_quarantine_dev_init(pci_to_dev(pdev)); + if ( ret ) + return ret; + + target = dom_io; + } + else + target = hardware_domain; while ( pdev->phantom_stride ) { @@ -1547,6 +1554,13 @@ static int assign_device(struct domain * msixtbl_init(d); } + if ( pdev->domain != dom_io ) + { + rc = iommu_quarantine_dev_init(pci_to_dev(pdev)); + if ( rc ) + goto done; + } + pdev->fault.count = 0; if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) ) --- xen/drivers/passthrough/vtd/iommu.c.orig +++ xen/drivers/passthrough/vtd/iommu.c @@ -43,6 +43,12 @@ #include "vtd.h" #include "../ats.h" +#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \ + : (pdev)->arch.pseudo_domid) +#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \ + ? 
dom_iommu(d)->arch.pgd_maddr \ + : (pdev)->arch.vtd.pgd_maddr) + /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */ bool __read_mostly untrusted_msi; @@ -78,13 +84,18 @@ static int get_iommu_did(domid_t domid, #define DID_FIELD_WIDTH 16 #define DID_HIGH_OFFSET 8 + +/* + * This function may have "context" passed as NULL, to merely obtain a DID + * for "domid". + */ static int context_set_domain_id(struct context_entry *context, domid_t domid, struct vtd_iommu *iommu) { unsigned long nr_dom, i; int found = 0; - ASSERT(spin_is_locked(&iommu->lock)); + ASSERT(pcidevs_locked()); nr_dom = cap_ndoms(iommu->cap); i = find_first_bit(iommu->domid_bitmap, nr_dom); @@ -110,8 +121,13 @@ static int context_set_domain_id(struct } set_bit(i, iommu->domid_bitmap); - context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); - context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; + + if ( context ) + { + context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); + context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; + } + return 0; } @@ -161,8 +177,12 @@ static void check_cleanup_domid_map(stru const struct pci_dev *exclude, struct vtd_iommu *iommu) { - bool found = any_pdev_behind_iommu(d, exclude, iommu); + bool found; + + if ( d == dom_io ) + return; + found = any_pdev_behind_iommu(d, exclude, iommu); /* * Hidden devices are associated with DomXEN but usable by the hardware * domain. Hence they need considering here as well. @@ -1400,7 +1420,7 @@ int domain_context_mapping_one( domid = iommu->domid_map[prev_did]; if ( domid < DOMID_FIRST_RESERVED ) prev_dom = rcu_lock_domain_by_id(domid); - else if ( domid == DOMID_IO ) + else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK ) prev_dom = rcu_lock_domain(dom_io); if ( !prev_dom ) { @@ -1577,15 +1597,12 @@ int domain_context_mapping_one( { if ( !prev_dom ) domain_context_unmap_one(domain, iommu, bus, devfn, - domain->domain_id); + DEVICE_DOMID(domain, pdev)); else if ( prev_dom != domain ) /* Avoid infinite recursion. 
*/ - { - hd = dom_iommu(prev_dom); domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, - domain->domain_id, - hd->arch.pgd_maddr, + DEVICE_DOMID(prev_dom, pdev), + DEVICE_PGTABLE(prev_dom, pdev), mode & MAP_WITH_RMRR); - } } if ( prev_dom ) @@ -1602,7 +1619,7 @@ static int domain_context_mapping(struct { struct acpi_drhd_unit *drhd; const struct acpi_rmrr_unit *rmrr; - paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr; + paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev); domid_t orig_domid = pdev->arch.pseudo_domid; int ret = 0; unsigned int i, mode = 0; @@ -1635,7 +1652,7 @@ static int domain_context_mapping(struct break; } - if ( domain != pdev->domain ) + if ( domain != pdev->domain && pdev->domain != dom_io ) { if ( pdev->domain->is_dying ) mode |= MAP_OWNER_DYING; @@ -1676,8 +1693,8 @@ static int domain_context_mapping(struct printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); - ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev, domain->domain_id, pgd_maddr, + ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev, + DEVICE_DOMID(domain, pdev), pgd_maddr, mode); if ( ret > 0 ) ret = 0; @@ -1701,8 +1718,8 @@ static int domain_context_mapping(struct PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev, domain->domain_id, pgd_maddr, - mode); + pdev, DEVICE_DOMID(domain, pdev), + pgd_maddr, mode); if ( ret < 0 ) break; prev_present = ret; @@ -1730,8 +1747,8 @@ static int domain_context_mapping(struct */ if ( ret >= 0 ) ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - NULL, domain->domain_id, pgd_maddr, - mode); + NULL, DEVICE_DOMID(domain, pdev), + pgd_maddr, mode); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different @@ -1746,8 +1763,8 @@ static int domain_context_mapping(struct if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, - NULL, domain->domain_id, pgd_maddr, - mode); + NULL, DEVICE_DOMID(domain, pdev), + pgd_maddr, mode); if ( ret ) { @@ -1896,7 +1913,7 @@ static const struct acpi_drhd_unit *doma domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_unmap_one(domain, iommu, bus, devfn, - domain->domain_id); + DEVICE_DOMID(domain, pdev)); if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) disable_ats_device(pdev); @@ -1907,7 +1924,7 @@ static const struct acpi_drhd_unit *doma printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_unmap_one(domain, iommu, bus, devfn, - domain->domain_id); + DEVICE_DOMID(domain, pdev)); if ( ret ) break; @@ -1930,18 +1947,12 @@ static const struct acpi_drhd_unit *doma break; } + ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, + DEVICE_DOMID(domain, pdev)); /* PCIe to PCI/PCIx bridge */ - if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) - { - ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, - domain->domain_id); - if ( !ret ) - ret = domain_context_unmap_one(domain, iommu, secbus, 0, - domain->domain_id); - } - else /* Legacy PCI bridge */ - ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn, - domain->domain_id); + if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) + ret = 
domain_context_unmap_one(domain, iommu, secbus, 0, + DEVICE_DOMID(domain, pdev)); break; @@ -1987,6 +1998,25 @@ static void iommu_domain_teardown(struct cleanup_domid_map(d->domain_id, drhd->iommu); } +static void quarantine_teardown(struct pci_dev *pdev, + const struct acpi_drhd_unit *drhd) +{ + struct page_info *pg; + + ASSERT(pcidevs_locked()); + + if ( !pdev->arch.vtd.pgd_maddr ) + return; + + while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) ) + free_domheap_page(pg); + + pdev->arch.vtd.pgd_maddr = 0; + + if ( drhd ) + cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu); +} + static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn, mfn_t mfn, unsigned int flags, unsigned int *flush_flags) @@ -2209,6 +2239,8 @@ static int intel_iommu_remove_device(u8 rmrr->end_address, 0); } + quarantine_teardown(pdev, drhd); + if ( drhd ) { iommu_free_domid(pdev->arch.pseudo_domid, @@ -2888,60 +2920,139 @@ static void vtd_dump_p2m_table(struct do vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0); } -static int __init intel_iommu_quarantine_init(struct domain *d) +static int fill_qpt(struct dma_pte *this, unsigned int level, + paddr_t maddrs[6], struct pci_dev *pdev) { - struct domain_iommu *hd = dom_iommu(d); - struct dma_pte *parent; + struct domain_iommu *hd = dom_iommu(dom_io); + unsigned int i; + int rc = 0; + + for ( i = 0; !rc && i < PTE_NUM; ++i ) + { + struct dma_pte *pte = &this[i], *next; + + if ( !dma_pte_present(*pte) ) + { + if ( !maddrs[level] ) + { + /* + * The pgtable allocator is fine for the leaf page, as well as + * page table pages, and the resulting allocations are always + * zeroed. + */ + maddrs[level] = alloc_pgtable_maddr(1, hd->node); + if ( !maddrs[level] ) + { + rc = -ENOMEM; + break; + } + + page_list_add(maddr_to_page(maddrs[level]), + &pdev->arch.pgtables_list); + + if ( level ) + { + next = map_vtd_domain_page(maddrs[level]); + rc = fill_qpt(next, level - 1, maddrs, pdev); + unmap_vtd_domain_page(next); + } + } + + dma_set_pte_addr(*pte, maddrs[level]); + dma_set_pte_readable(*pte); + dma_set_pte_writable(*pte); + } + else if ( level && !dma_pte_superpage(*pte) ) + { + page_list_add(maddr_to_page(dma_pte_addr(*pte)), + &pdev->arch.pgtables_list); + next = map_vtd_domain_page(dma_pte_addr(*pte)); + rc = fill_qpt(next, level - 1, maddrs, pdev); + unmap_vtd_domain_page(next); + } + } + + return rc; +} + +static int intel_iommu_quarantine_init(struct pci_dev *pdev) +{ + struct domain_iommu *hd = dom_iommu(dom_io); + paddr_t maddr; unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); unsigned int level = agaw_to_level(agaw); + const struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; + unsigned int i, bdf; + bool rmrr_found = false; + int rc; - if ( hd->arch.pgd_maddr ) + ASSERT(pcidevs_locked()); + ASSERT(!hd->arch.pgd_maddr); + + if ( pdev->arch.vtd.pgd_maddr ) { - ASSERT_UNREACHABLE(); + clear_domain_page(pdev->arch.leaf_mfn); return 0; } - spin_lock(&hd->arch.mapping_lock); + drhd = acpi_find_matched_drhd_unit(pdev); + if ( !drhd ) + return -ENODEV; - hd->arch.pgd_maddr = alloc_pgtable_maddr(1, hd->node); - if ( !hd->arch.pgd_maddr ) - goto out; + maddr = alloc_pgtable_maddr(1, hd->node); + if ( !maddr ) + return -ENOMEM; - parent = map_vtd_domain_page(hd->arch.pgd_maddr); - while ( level ) - { - uint64_t maddr; - unsigned int offset; + rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu); - /* - * The pgtable allocator is fine for the leaf page, as well as - * 
page table pages, and the resulting allocations are always - * zeroed. - */ - maddr = alloc_pgtable_maddr(1, hd->node); - if ( !maddr ) + /* Transiently install the root into DomIO, for iommu_identity_mapping(). */ + hd->arch.pgd_maddr = maddr; + + for_each_rmrr_device ( rmrr, bdf, i ) + { + if ( rc ) break; - for ( offset = 0; offset < PTE_NUM; offset++ ) + if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf ) { - struct dma_pte *pte = &parent[offset]; + rmrr_found = true; - dma_set_pte_addr(*pte, maddr); - dma_set_pte_readable(*pte); + rc = iommu_identity_mapping(dom_io, p2m_access_rw, + rmrr->base_address, rmrr->end_address, + 0); + if ( rc ) + printk(XENLOG_ERR VTDPREFIX + "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n", + pdev->seg, pdev->bus, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); } - iommu_sync_cache(parent, PAGE_SIZE); + } - unmap_vtd_domain_page(parent); - parent = map_vtd_domain_page(maddr); - level--; + iommu_identity_map_teardown(dom_io); + hd->arch.pgd_maddr = 0; + pdev->arch.vtd.pgd_maddr = maddr; + + if ( !rc ) + { + struct dma_pte *root; + paddr_t maddrs[6] = {}; + + spin_lock(&hd->arch.mapping_lock); + + root = map_vtd_domain_page(maddr); + rc = fill_qpt(root, level - 1, maddrs, pdev); + unmap_vtd_domain_page(root); + + pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]); + + spin_unlock(&hd->arch.mapping_lock); } - unmap_vtd_domain_page(parent); - out: - spin_unlock(&hd->arch.mapping_lock); + if ( rc ) + quarantine_teardown(pdev, drhd); - /* Pages leaked in failure case */ - return level ? -ENOMEM : 0; + return rc; } const struct iommu_ops __initconstrel intel_iommu_ops = { --- xen/drivers/passthrough/vtd/iommu.h.orig +++ xen/drivers/passthrough/vtd/iommu.h @@ -509,7 +509,7 @@ struct vtd_iommu { u32 nr_pt_levels; u64 cap; u64 ecap; - spinlock_t lock; /* protect context, domain ids */ + spinlock_t lock; /* protect context */ spinlock_t register_lock; /* protect iommu register handling */ u64 root_maddr; /* root entry machine address */ nodeid_t node; --- xen/include/xen/iommu.h.orig +++ xen/include/xen/iommu.h @@ -211,7 +211,7 @@ typedef int iommu_grdm_t(xen_pfn_t start struct iommu_ops { int (*init)(struct domain *d); void (*hwdom_init)(struct domain *d); - int (*quarantine_init)(struct domain *d); + int (*quarantine_init)(device_t *dev); int (*add_device)(u8 devfn, device_t *dev); int (*enable_device)(device_t *dev); int (*remove_device)(u8 devfn, device_t *dev); @@ -331,6 +331,7 @@ int __must_check iommu_suspend(void); void iommu_resume(void); void iommu_crash_shutdown(void); int iommu_get_reserved_device_memory(iommu_grdm_t *, void *); +int iommu_quarantine_dev_init(device_t *dev); void iommu_share_p2m_table(struct domain *d);