Diffstat (limited to 'usr/src/uts/i86pc')
65 files changed, 2833 insertions, 2113 deletions
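A recurring change in the hunks below is the replacement of tests against the old x86_feature bit mask (e.g. `if (!(x86_feature & X86_MCA))`) with the newer featureset API (e.g. `if (!is_x86_feature(x86_featureset, X86FSET_MCA))`). The following is an illustrative, self-contained sketch of that call-site pattern only; it is not illumos source. The real API and the X86FSET_* constants live in <sys/x86_archext.h>, and the mock_* names and bit value used here are hypothetical stand-ins.

/*
 * Minimal sketch of the featureset-test idiom used throughout this diff.
 * All mock_* identifiers and the bit position are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define	MOCK_X86FSET_MCA	14	/* hypothetical bit position */

static int
mock_is_x86_feature(const uint64_t *featureset, unsigned int feature)
{
	/* Test one bit in a multi-word feature set. */
	return (((featureset[feature / 64] >> (feature % 64)) & 1) != 0);
}

int
main(void)
{
	uint64_t featureset[4] = { 0 };

	/* Pretend the CPU advertised MCA support. */
	featureset[MOCK_X86FSET_MCA / 64] |= 1ULL << (MOCK_X86FSET_MCA % 64);

	/*
	 * Old style (removed by this patch): if (!(x86_feature & X86_MCA))
	 * New style (added by this patch):
	 *	if (!is_x86_feature(x86_featureset, X86FSET_MCA))
	 */
	if (!mock_is_x86_feature(featureset, MOCK_X86FSET_MCA))
		(void) printf("MCA not supported\n");
	else
		(void) printf("MCA supported\n");

	return (0);
}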
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 2758749056..a287bfd209 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -22,6 +22,8 @@
 #
 # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 #
+# Copyright (c) 2010, Intel Corporation.
+#
 # This Makefile defines file modules in the directory uts/i86pc
 # and its children. These are the source files which are i86pc
 # "implementation architecture" dependent.
@@ -187,10 +189,11 @@ PCI_E_NEXUS_OBJS += npe.o npe_misc.o
 PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o
 PCINEXUS_OBJS += pci.o pci_common.o pci_kstats.o pci_tools.o
 PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \
-	mp_platform_common.o mp_platform_misc.o \
-	hpet_acpi.o apic_common.o
+	mp_platform_common.o mp_platform_misc.o \
+	hpet_acpi.o apic_common.o apic_timer.o
 APIX_OBJS += apix.o apic_regops.o psm_common.o apix_intr.o apix_utils.o \
-	apix_irm.o mp_platform_common.o hpet_acpi.o apic_common.o
+	apix_irm.o mp_platform_common.o hpet_acpi.o apic_common.o \
+	apic_timer.o

 ACPI_DRV_OBJS += acpi_drv.o acpi_video.o

diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared
index f41e91a4fc..9b910c35ef 100644
--- a/usr/src/uts/i86pc/Makefile.i86pc.shared
+++ b/usr/src/uts/i86pc/Makefile.i86pc.shared
@@ -22,7 +22,7 @@
 #
 # uts/i86pc/Makefile.i86pc
 #
-# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 #
 # This makefile contains the common definitions for the i86pc unix
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index dfff27de9f..604a2fb2c5 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -20,7 +20,7 @@
 #

 #
-# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
# # This Makefile defines the build rules for the directory uts/i86pc diff --git a/usr/src/uts/i86pc/cpu/amd_opteron/ao_main.c b/usr/src/uts/i86pc/cpu/amd_opteron/ao_main.c index dd6c2dd616..36ea92669d 100644 --- a/usr/src/uts/i86pc/cpu/amd_opteron/ao_main.c +++ b/usr/src/uts/i86pc/cpu/amd_opteron/ao_main.c @@ -64,7 +64,7 @@ ao_ms_init(cmi_hdl_t hdl, void **datap) if (ao_ms_support_disable || cmi_hdl_model(hdl) >= ao_model_limit) return (ENOTSUP); - if (!(x86_feature & X86_MCA)) + if (!is_x86_feature(x86_featureset, X86FSET_MCA)) return (ENOTSUP); if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) diff --git a/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c b/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c index 46723b1437..311eb6d12f 100644 --- a/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c +++ b/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c @@ -492,7 +492,7 @@ authamd_init(cmi_hdl_t hdl, void **datap) !authamd_supported(hdl)) return (ENOTSUP); - if (!(x86_feature & X86_MCA)) + if (!is_x86_feature(x86_featureset, X86FSET_MCA)) return (ENOTSUP); if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) diff --git a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c index 50ef45bec9..1b9e259bd8 100644 --- a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c +++ b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c @@ -1067,13 +1067,13 @@ gcpu_mca_init(cmi_hdl_t hdl) return; /* - * CPU startup code only calls cmi_mca_init if x86_feature indicates - * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier + * CPU startup code only calls cmi_mca_init if x86_featureset indicates + * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier * processors, which have their own more primitive way of doing * machine checks, will not have cmi_mca_init called since their * CPUID information will not indicate both MCA and MCE features. */ - ASSERT(x86_feature & X86_MCA); + ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA)); /* * Determine whether the IA32_MCG_CTL register is present. If it @@ -2018,13 +2018,13 @@ gcpu_mca_fini(cmi_hdl_t hdl) int i; /* - * CPU startup code only calls cmi_mca_init if x86_feature indicates - * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier + * CPU startup code only calls cmi_mca_init if x86_featureset indicates + * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier * processors, which have their own more primitive way of doing * machine checks, will not have cmi_mca_init called since their * CPUID information will not indicate both MCA and MCE features. 
*/ - if ((x86_feature & X86_MCA) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_MCA)) return; #ifndef __xpv /* diff --git a/usr/src/uts/i86pc/cpu/genuineintel/gintel_main.c b/usr/src/uts/i86pc/cpu/genuineintel/gintel_main.c index e696725f6a..2eb7faea63 100644 --- a/usr/src/uts/i86pc/cpu/genuineintel/gintel_main.c +++ b/usr/src/uts/i86pc/cpu/genuineintel/gintel_main.c @@ -112,7 +112,7 @@ gintel_init(cmi_hdl_t hdl, void **datap) if (gintel_ms_support_disable) return (ENOTSUP); - if (!(x86_feature & X86_MCA)) + if (!is_x86_feature(x86_featureset, X86FSET_MCA)) return (ENOTSUP); nb_chipset = (*pci_getl_func)(0, 0, 0, 0x0); diff --git a/usr/src/uts/i86pc/io/amd_iommu/amd_iommu_impl.c b/usr/src/uts/i86pc/io/amd_iommu/amd_iommu_impl.c index 5b466d75a3..59c004458b 100644 --- a/usr/src/uts/i86pc/io/amd_iommu/amd_iommu_impl.c +++ b/usr/src/uts/i86pc/io/amd_iommu/amd_iommu_impl.c @@ -58,6 +58,11 @@ static int amd_iommu_win(iommulib_handle_t handle, dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); +static int amd_iommu_mapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao); +static int amd_iommu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao); static int amd_iommu_map(iommulib_handle_t handle, dev_info_t *dip, dev_info_t *rdip, struct ddi_dma_req *dmareq, ddi_dma_handle_t *dma_handle); @@ -105,6 +110,8 @@ struct iommulib_ops amd_iommulib_ops = { amd_iommu_unbindhdl, amd_iommu_sync, amd_iommu_win, + amd_iommu_mapobject, + amd_iommu_unmapobject, amd_iommu_map, amd_iommu_mctl }; @@ -1913,6 +1920,23 @@ amd_iommu_mctl(iommulib_handle_t handle, dev_info_t *dip, request, offp, lenp, objpp, cache_flags)); } +/*ARGSUSED*/ +static int +amd_iommu_mapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao) +{ + return (DDI_ENOTSUP); +} + +/*ARGSUSED*/ +static int +amd_iommu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao) +{ + return (DDI_ENOTSUP); +} + uint64_t amd_iommu_reg_get64_workaround(uint64_t *regp, uint32_t bits) { diff --git a/usr/src/uts/i86pc/io/apix/apix.c b/usr/src/uts/i86pc/io/apix/apix.c index f2fdc19282..8c4ccb6a0a 100644 --- a/usr/src/uts/i86pc/io/apix/apix.c +++ b/usr/src/uts/i86pc/io/apix/apix.c @@ -475,7 +475,7 @@ apix_init_intr() if (nlvt >= 5) { /* Enable performance counter overflow interrupt */ - if ((x86_feature & X86_MSR) != X86_MSR) + if (!is_x86_feature(x86_featureset, X86FSET_MSR)) apic_enable_cpcovf_intr = 0; if (apic_enable_cpcovf_intr) { if (apic_cpcovf_vect == 0) { @@ -1609,7 +1609,7 @@ apix_set_cpu(apix_vector_t *vecp, int new_cpu, int *result) dev_info_t *dip; int inum, cap_ptr; ddi_acc_handle_t handle; - ddi_intr_msix_t *msix_p; + ddi_intr_msix_t *msix_p = NULL; ushort_t msix_ctrl; uintptr_t off; uint32_t mask; @@ -1628,7 +1628,7 @@ apix_set_cpu(apix_vector_t *vecp, int new_cpu, int *result) /* * Mask MSI-X. It's unmasked when MSI-X gets enabled. 
*/ - if (vecp->v_type == APIX_TYPE_MSIX) { + if (vecp->v_type == APIX_TYPE_MSIX && IS_VECT_ENABLED(vecp)) { if ((dip = APIX_GET_DIP(vecp)) == NULL) return (NULL); inum = vecp->v_devp->dv_inum; @@ -1651,10 +1651,13 @@ apix_set_cpu(apix_vector_t *vecp, int new_cpu, int *result) } *result = 0; - if ((newp = apix_rebind(vecp, new_cpu, 1)) == NULL) *result = EIO; + /* Restore mask bit */ + if (msix_p != NULL) + ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, mask); + return (newp); } diff --git a/usr/src/uts/i86pc/io/apix/apix_utils.c b/usr/src/uts/i86pc/io/apix/apix_utils.c index 07fc3535b9..3342306db6 100644 --- a/usr/src/uts/i86pc/io/apix/apix_utils.c +++ b/usr/src/uts/i86pc/io/apix/apix_utils.c @@ -312,7 +312,8 @@ apix_pci_msi_enable_vector(apix_vector_t *vecp, dev_info_t *dip, int type, msi_regs.mr_data = vector; msi_regs.mr_addr = target_apic_id; - intrmap_tbl[0] = vecp->v_intrmap_private; + for (i = 0; i < count; i++) + intrmap_tbl[i] = xv_intrmap_private(vecp->v_cpuid, vector + i); apic_vt_ops->apic_intrmap_alloc_entry(intrmap_tbl, dip, type, count, 0xff); for (i = 0; i < count; i++) diff --git a/usr/src/uts/i86pc/io/dr/dr_quiesce.c b/usr/src/uts/i86pc/io/dr/dr_quiesce.c index f3467f6eb1..663977da25 100644 --- a/usr/src/uts/i86pc/io/dr/dr_quiesce.c +++ b/usr/src/uts/i86pc/io/dr/dr_quiesce.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -222,6 +221,10 @@ dr_bypass_device(char *dname) { int i; char **lname; + + if (dname == NULL) + return (0); + /* check the bypass list */ for (i = 0, lname = &dr_bypass_list[i]; **lname != '\0'; lname++) { if (strcmp(dname, dr_bypass_list[i++]) == 0) @@ -707,10 +710,6 @@ dr_signal_user(int sig) void dr_resume(dr_sr_handle_t *srh) { - dr_handle_t *handle; - - handle = srh->sr_dr_handlep; - switch (srh->sr_suspend_state) { case DR_SRSTATE_FULL: @@ -780,8 +779,6 @@ dr_resume(dr_sr_handle_t *srh) break; } - i_ndi_allow_device_tree_changes(handle->h_ndi); - prom_printf("DR: resume COMPLETED\n"); } @@ -798,8 +795,6 @@ dr_suspend(dr_sr_handle_t *srh) force = dr_cmd_flags(handle) & SBD_FLAG_FORCE; - i_ndi_block_device_tree_changes(&handle->h_ndi); - prom_printf("\nDR: suspending user threads...\n"); srh->sr_suspend_state = DR_SRSTATE_USER; if (((rc = dr_stop_user_threads(srh)) != DDI_SUCCESS) && diff --git a/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE b/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE new file mode 100644 index 0000000000..ddb59bc2cf --- /dev/null +++ b/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE @@ -0,0 +1,24 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, Intel Corporation. 
+ * All rights reserved. + */ diff --git a/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE.descrip b/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..8270f005f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/fipe/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +INTEL FIPE DRIVER diff --git a/usr/src/uts/i86pc/io/hpet_acpi.c b/usr/src/uts/i86pc/io/hpet_acpi.c index 8b33cafc8a..b618e491e7 100644 --- a/usr/src/uts/i86pc/io/hpet_acpi.c +++ b/usr/src/uts/i86pc/io/hpet_acpi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/hpet_acpi.h> diff --git a/usr/src/uts/i86pc/io/immu.c b/usr/src/uts/i86pc/io/immu.c index da2fdad9d4..ed21bf8655 100644 --- a/usr/src/uts/i86pc/io/immu.c +++ b/usr/src/uts/i86pc/io/immu.c @@ -53,7 +53,6 @@ #include <sys/spl.h> #include <sys/archsystm.h> #include <sys/x86_archext.h> -#include <sys/rootnex.h> #include <sys/avl.h> #include <sys/bootconf.h> #include <sys/bootinfo.h> @@ -72,7 +71,7 @@ boolean_t immu_dvma_enable = B_TRUE; /* accessed in other files so not static */ boolean_t immu_gfxdvma_enable = B_TRUE; boolean_t immu_intrmap_enable = B_FALSE; -boolean_t immu_qinv_enable = B_FALSE; +boolean_t immu_qinv_enable = B_TRUE; /* various quirks that need working around */ @@ -98,7 +97,6 @@ immu_flags_t immu_global_dvma_flags; dev_info_t *root_devinfo; kmutex_t immu_lock; list_t immu_list; -void *immu_pgtable_cache; boolean_t immu_setup; boolean_t immu_running; boolean_t immu_quiesced; @@ -112,6 +110,11 @@ static char **unity_driver_array; static uint_t nunity; static char **xlate_driver_array; static uint_t nxlate; + +static char **premap_driver_array; +static uint_t npremap; +static char **nopremap_driver_array; +static uint_t nnopremap; /* ###################### Utility routines ############################# */ /* @@ -124,8 +127,6 @@ check_mobile4(dev_info_t *dip, void *arg) int vendor, device; int *ip = (int *)arg; - ASSERT(arg); - vendor = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "vendor-id", -1); device = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, @@ -133,7 +134,7 @@ check_mobile4(dev_info_t *dip, void *arg) if (vendor == 0x8086 && device == 0x2a40) { *ip = B_TRUE; - ddi_err(DER_NOTE, dip, "IMMU: Mobile 4 chipset detected. " + ddi_err(DER_NOTE, dip, "iommu: Mobile 4 chipset detected. " "Force setting IOMMU write buffer"); return (DDI_WALK_TERMINATE); } else { @@ -145,7 +146,12 @@ static void map_bios_rsvd_mem(dev_info_t *dip) { struct memlist *mp; - int e; + + /* + * Make sure the domain for the device is set up before + * mapping anything. 
+ */ + (void) immu_dvma_device_setup(dip, 0); memlist_read_lock(); @@ -153,15 +159,14 @@ map_bios_rsvd_mem(dev_info_t *dip) while (mp != NULL) { memrng_t mrng = {0}; - ddi_err(DER_LOG, dip, "IMMU: Mapping BIOS rsvd range " + ddi_err(DER_LOG, dip, "iommu: Mapping BIOS rsvd range " "[0x%" PRIx64 " - 0x%"PRIx64 "]\n", mp->ml_address, mp->ml_address + mp->ml_size); mrng.mrng_start = IMMU_ROUNDOWN(mp->ml_address); mrng.mrng_npages = IMMU_ROUNDUP(mp->ml_size) / IMMU_PAGESIZE; - e = immu_dvma_map(NULL, NULL, &mrng, 0, dip, IMMU_FLAGS_MEMRNG); - ASSERT(e == DDI_DMA_MAPPED || e == DDI_DMA_USE_PHYSICAL); + (void) immu_map_memrange(dip, &mrng); mp = mp->ml_next; } @@ -180,7 +185,8 @@ check_conf(dev_info_t *dip, void *arg) immu_devi_t *immu_devi; const char *dname; uint_t i; - int hasprop = 0; + int hasmapprop = 0, haspreprop = 0; + boolean_t old_premap; /* * Only PCI devices can use an IOMMU. Legacy ISA devices @@ -196,25 +202,45 @@ check_conf(dev_info_t *dip, void *arg) for (i = 0; i < nunity; i++) { if (strcmp(unity_driver_array[i], dname) == 0) { - hasprop = 1; + hasmapprop = 1; immu_devi->imd_dvma_flags |= IMMU_FLAGS_UNITY; } } for (i = 0; i < nxlate; i++) { if (strcmp(xlate_driver_array[i], dname) == 0) { - hasprop = 1; + hasmapprop = 1; immu_devi->imd_dvma_flags &= ~IMMU_FLAGS_UNITY; } } + old_premap = immu_devi->imd_use_premap; + + for (i = 0; i < nnopremap; i++) { + if (strcmp(nopremap_driver_array[i], dname) == 0) { + haspreprop = 1; + immu_devi->imd_use_premap = B_FALSE; + } + } + + for (i = 0; i < npremap; i++) { + if (strcmp(premap_driver_array[i], dname) == 0) { + haspreprop = 1; + immu_devi->imd_use_premap = B_TRUE; + } + } + /* * Report if we changed the value from the default. */ - if (hasprop && (immu_devi->imd_dvma_flags ^ immu_global_dvma_flags)) + if (hasmapprop && (immu_devi->imd_dvma_flags ^ immu_global_dvma_flags)) ddi_err(DER_LOG, dip, "using %s DVMA mapping", immu_devi->imd_dvma_flags & IMMU_FLAGS_UNITY ? DDI_DVMA_MAPTYPE_UNITY : DDI_DVMA_MAPTYPE_XLATE); + + if (haspreprop && (immu_devi->imd_use_premap != old_premap)) + ddi_err(DER_LOG, dip, "%susing premapped DVMA space", + immu_devi->imd_use_premap ? 
"" : "not "); } /* @@ -263,11 +289,10 @@ check_lpc(dev_info_t *dip, void *arg) immu_devi_t *immu_devi; immu_devi = immu_devi_get(dip); - ASSERT(immu_devi); if (immu_devi->imd_lpc == B_TRUE) { - ddi_err(DER_LOG, dip, "IMMU: Found LPC device"); + ddi_err(DER_LOG, dip, "iommu: Found LPC device"); /* This will put the immu_devi on the LPC "specials" list */ - (void) immu_dvma_get_immu(dip, IMMU_FLAGS_SLEEP); + (void) immu_dvma_device_setup(dip, IMMU_FLAGS_SLEEP); } } @@ -281,10 +306,9 @@ check_gfx(dev_info_t *dip, void *arg) immu_devi_t *immu_devi; immu_devi = immu_devi_get(dip); - ASSERT(immu_devi); if (immu_devi->imd_display == B_TRUE) { immu_devi->imd_dvma_flags |= IMMU_FLAGS_UNITY; - ddi_err(DER_LOG, dip, "IMMU: Found GFX device"); + ddi_err(DER_LOG, dip, "iommu: Found GFX device"); /* This will put the immu_devi on the GFX "specials" list */ (void) immu_dvma_get_immu(dip, IMMU_FLAGS_SLEEP); } @@ -366,9 +390,6 @@ get_conf_opt(char *bopt, boolean_t *kvar) { char *val = NULL; - ASSERT(bopt); - ASSERT(kvar); - /* * Check the rootnex.conf property * Fake up a dev_t since searching the global @@ -577,6 +598,24 @@ mapping_list_setup(void) xlate_driver_array = string_array; nxlate = nstrings; } + + if (ddi_prop_lookup_string_array( + makedevice(ddi_name_to_major("rootnex"), 0), root_devinfo, + DDI_PROP_DONTPASS | DDI_PROP_ROOTNEX_GLOBAL, + "immu-dvma-premap-drivers", + &string_array, &nstrings) == DDI_PROP_SUCCESS) { + premap_driver_array = string_array; + npremap = nstrings; + } + + if (ddi_prop_lookup_string_array( + makedevice(ddi_name_to_major("rootnex"), 0), root_devinfo, + DDI_PROP_DONTPASS | DDI_PROP_ROOTNEX_GLOBAL, + "immu-dvma-nopremap-drivers", + &string_array, &nstrings) == DDI_PROP_SUCCESS) { + nopremap_driver_array = string_array; + nnopremap = nstrings; + } } /* @@ -590,8 +629,6 @@ blacklisted_driver(void) int i; major_t maj; - ASSERT((black_array == NULL) ^ (nblacks != 0)); - /* need at least 2 strings */ if (nblacks < 2) { return (B_FALSE); @@ -625,8 +662,6 @@ blacklisted_smbios(void) char **strptr; int i; - ASSERT((black_array == NULL) ^ (nblacks != 0)); - /* need at least 4 strings for this setting */ if (nblacks < 4) { return (B_FALSE); @@ -668,7 +703,6 @@ blacklisted_smbios(void) static boolean_t blacklisted_acpi(void) { - ASSERT((black_array == NULL) ^ (nblacks != 0)); if (nblacks == 0) { return (B_FALSE); } @@ -734,9 +768,20 @@ blacklist_destroy(void) black_array = NULL; nblacks = 0; } +} - ASSERT(black_array == NULL); - ASSERT(nblacks == 0); +static char * +immu_alloc_name(const char *str, int instance) +{ + size_t slen; + char *s; + + slen = strlen(str) + IMMU_ISTRLEN + 1; + s = kmem_zalloc(slen, VM_SLEEP); + if (s != NULL) + (void) snprintf(s, slen, "%s%d", str, instance); + + return (s); } @@ -749,6 +794,8 @@ static void * immu_state_alloc(int seg, void *dmar_unit) { immu_t *immu; + char *nodename, *hcachename, *pcachename; + int instance; dmar_unit = immu_dmar_walk_units(seg, dmar_unit); if (dmar_unit == NULL) { @@ -763,10 +810,15 @@ immu_state_alloc(int seg, void *dmar_unit) mutex_enter(&(immu->immu_lock)); immu->immu_dmar_unit = dmar_unit; - immu->immu_name = ddi_strdup(immu_dmar_unit_name(dmar_unit), - KM_SLEEP); immu->immu_dip = immu_dmar_unit_dip(dmar_unit); + nodename = ddi_node_name(immu->immu_dip); + instance = ddi_get_instance(immu->immu_dip); + + immu->immu_name = immu_alloc_name(nodename, instance); + if (immu->immu_name == NULL) + return (NULL); + /* * the immu_intr_lock mutex is grabbed by the IOMMU * unit's interrupt handler so we need to use an @@ -808,9 
+860,24 @@ immu_state_alloc(int seg, void *dmar_unit) */ list_insert_tail(&immu_list, immu); + pcachename = immu_alloc_name("immu_pgtable_cache", instance); + if (pcachename == NULL) + return (NULL); + + hcachename = immu_alloc_name("immu_hdl_cache", instance); + if (hcachename == NULL) + return (NULL); + + immu->immu_pgtable_cache = kmem_cache_create(pcachename, + sizeof (pgtable_t), 0, pgtable_ctor, pgtable_dtor, NULL, immu, + NULL, 0); + immu->immu_hdl_cache = kmem_cache_create(hcachename, + sizeof (immu_hdl_priv_t), 64, immu_hdl_priv_ctor, + NULL, NULL, immu, NULL, 0); + mutex_exit(&(immu->immu_lock)); - ddi_err(DER_LOG, immu->immu_dip, "IMMU: unit setup"); + ddi_err(DER_LOG, immu->immu_dip, "unit setup"); immu_dmar_set_immu(dmar_unit, immu); @@ -824,22 +891,13 @@ immu_subsystems_setup(void) void *unit_hdl; ddi_err(DER_VERB, NULL, - "Creating state structures for Intel IOMMU units\n"); - - ASSERT(immu_setup == B_FALSE); - ASSERT(immu_running == B_FALSE); + "Creating state structures for Intel IOMMU units"); mutex_init(&immu_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&immu_list, sizeof (immu_t), offsetof(immu_t, immu_node)); mutex_enter(&immu_lock); - ASSERT(immu_pgtable_cache == NULL); - - immu_pgtable_cache = kmem_cache_create("immu_pgtable_cache", - sizeof (pgtable_t), 0, - pgtable_ctor, pgtable_dtor, NULL, NULL, NULL, 0); - unit_hdl = NULL; for (seg = 0; seg < IMMU_MAXSEG; seg++) { while (unit_hdl = immu_state_alloc(seg, unit_hdl)) { @@ -865,12 +923,10 @@ static void immu_subsystems_startup(void) { immu_t *immu; + iommulib_ops_t *iommulib_ops; mutex_enter(&immu_lock); - ASSERT(immu_setup == B_TRUE); - ASSERT(immu_running == B_FALSE); - immu_dmar_startup(); immu = list_head(&immu_list); @@ -893,6 +949,12 @@ immu_subsystems_startup(void) immu_regs_startup(immu); mutex_exit(&(immu->immu_lock)); + + iommulib_ops = kmem_alloc(sizeof (iommulib_ops_t), KM_SLEEP); + *iommulib_ops = immulib_ops; + iommulib_ops->ilops_data = (void *)immu; + (void) iommulib_iommu_register(immu->immu_dip, iommulib_ops, + &immu->immu_iommulib_handle); } mutex_exit(&immu_lock); @@ -922,11 +984,6 @@ immu_walk_ancestor( int level; int error = DDI_SUCCESS; - ASSERT(root_devinfo); - ASSERT(rdip); - ASSERT(rdip != root_devinfo); - ASSERT(func); - /* ddip and immu can be NULL */ /* Hold rdip so that branch is not detached */ @@ -969,7 +1026,6 @@ immu_init(void) char *phony_reg = "A thing of beauty is a joy forever"; /* Set some global shorthands that are needed by all of IOMMU code */ - ASSERT(root_devinfo == NULL); root_devinfo = ddi_root_node(); /* @@ -1107,7 +1163,7 @@ immu_startup(void) if (immu_setup == B_FALSE) { ddi_err(DER_WARN, NULL, "Intel IOMMU not setup, " - "skipping IOMU startup"); + "skipping IOMMU startup"); return; } @@ -1122,38 +1178,6 @@ immu_startup(void) } /* - * immu_map_sgl() - * called from rootnex_coredma_bindhdl() when Intel - * IOMMU is enabled to build DVMA cookies and map them. 
- */ -int -immu_map_sgl(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, - int prealloc_count, dev_info_t *rdip) -{ - if (immu_running == B_FALSE) { - return (DDI_DMA_USE_PHYSICAL); - } - - return (immu_dvma_map(hp, dmareq, NULL, prealloc_count, rdip, - IMMU_FLAGS_DMAHDL)); -} - -/* - * immu_unmap_sgl() - * called from rootnex_coredma_unbindhdl(), to unmap DVMA - * cookies and free them - */ -int -immu_unmap_sgl(ddi_dma_impl_t *hp, dev_info_t *rdip) -{ - if (immu_running == B_FALSE) { - return (DDI_DMA_USE_PHYSICAL); - } - - return (immu_dvma_unmap(hp, rdip)); -} - -/* * Hook to notify IOMMU code of device tree changes */ void @@ -1191,10 +1215,10 @@ immu_quiesce(void) mutex_enter(&immu_lock); - if (immu_running == B_FALSE) + if (immu_running == B_FALSE) { + mutex_exit(&immu_lock); return (DDI_SUCCESS); - - ASSERT(immu_setup == B_TRUE); + } immu = list_head(&immu_list); for (; immu; immu = list_next(&immu_list, immu)) { @@ -1205,9 +1229,9 @@ immu_quiesce(void) /* flush caches */ rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); - immu_flush_context_gbl(immu); + immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); + immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); rw_exit(&(immu->immu_ctx_rwlock)); - immu_flush_iotlb_gbl(immu); immu_regs_wbf_flush(immu); mutex_enter(&(immu->immu_lock)); @@ -1227,12 +1251,12 @@ immu_quiesce(void) mutex_exit(&(immu->immu_lock)); } - mutex_exit(&immu_lock); if (ret == DDI_SUCCESS) { immu_running = B_FALSE; immu_quiesced = B_TRUE; } + mutex_exit(&immu_lock); return (ret); } @@ -1249,11 +1273,10 @@ immu_unquiesce(void) mutex_enter(&immu_lock); - if (immu_quiesced == B_FALSE) + if (immu_quiesced == B_FALSE) { + mutex_exit(&immu_lock); return (DDI_SUCCESS); - - ASSERT(immu_setup == B_TRUE); - ASSERT(immu_running == B_FALSE); + } immu = list_head(&immu_list); for (; immu; immu = list_next(&immu_list, immu)) { @@ -1274,9 +1297,9 @@ immu_unquiesce(void) /* flush caches before unquiesce */ rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); - immu_flush_context_gbl(immu); + immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); + immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); rw_exit(&(immu->immu_ctx_rwlock)); - immu_flush_iotlb_gbl(immu); /* * Set IOMMU unit's regs to do @@ -1303,4 +1326,20 @@ immu_unquiesce(void) return (ret); } +void +immu_init_inv_wait(immu_inv_wait_t *iwp, const char *name, boolean_t sync) +{ + caddr_t vaddr; + uint64_t paddr; + + iwp->iwp_sync = sync; + + vaddr = (caddr_t)&iwp->iwp_vstatus; + paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr)); + paddr += ((uintptr_t)vaddr) & MMU_PAGEOFFSET; + + iwp->iwp_pstatus = paddr; + iwp->iwp_name = name; +} + /* ############## END Intel IOMMU entry points ################## */ diff --git a/usr/src/uts/i86pc/io/immu_dmar.c b/usr/src/uts/i86pc/io/immu_dmar.c index 734363beef..7ebcfb0ba6 100644 --- a/usr/src/uts/i86pc/io/immu_dmar.c +++ b/usr/src/uts/i86pc/io/immu_dmar.c @@ -46,7 +46,6 @@ #include <sys/apic.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> -#include <sys/iommulib.h> #include <sys/immu.h> #include <sys/smp_impldefs.h> @@ -639,14 +638,14 @@ dmar_table_print(dmar_table_t *tbl) } static void -drhd_devi_create(drhd_t *drhd, char *name) +drhd_devi_create(drhd_t *drhd, int unit) { struct ddi_parent_private_data *pdptr; struct regspec reg; dev_info_t *dip; - ndi_devi_alloc_sleep(root_devinfo, name, - DEVI_SID_NODEID, &dip); + dip = ddi_add_child(root_devinfo, IMMU_UNIT_NAME, + DEVI_SID_NODEID, unit); drhd->dr_dip = dip; @@ -702,7 +701,6 @@ dmar_devinfos_create(dmar_table_t *tbl) { list_t 
*drhd_list; drhd_t *drhd; - char name[IMMU_MAXNAMELEN]; int i, unit; for (i = 0; i < IMMU_MAXSEG; i++) { @@ -715,9 +713,7 @@ dmar_devinfos_create(dmar_table_t *tbl) drhd = list_head(drhd_list); for (unit = 0; drhd; drhd = list_next(drhd_list, drhd), unit++) { - (void) snprintf(name, sizeof (name), - "drhd%d,%d", i, unit); - drhd_devi_create(drhd, name); + drhd_devi_create(drhd, unit); } } } @@ -807,8 +803,8 @@ dmar_table_destroy(dmar_table_t *tbl) } /* free strings */ - kmem_free(tbl->tbl_oem_tblid, TBL_OEM_ID_SZ + 1); - kmem_free(tbl->tbl_oem_id, TBL_OEM_TBLID_SZ + 1); + kmem_free(tbl->tbl_oem_tblid, TBL_OEM_TBLID_SZ + 1); + kmem_free(tbl->tbl_oem_id, TBL_OEM_ID_SZ + 1); tbl->tbl_raw = NULL; /* raw ACPI table doesn't have to be freed */ mutex_destroy(&(tbl->tbl_lock)); kmem_free(tbl, sizeof (dmar_table_t)); @@ -946,7 +942,6 @@ void immu_dmar_rmrr_map(void) { int seg; - int e; int count; dev_info_t *rdip; scope_t *scope; @@ -1030,6 +1025,7 @@ immu_dmar_rmrr_map(void) } memlist_read_unlock(); + (void) immu_dvma_device_setup(rdip, 0); ddi_err(DER_LOG, rdip, "IMMU: Mapping RMRR range " @@ -1042,16 +1038,8 @@ immu_dmar_rmrr_map(void) IMMU_ROUNDUP((uintptr_t)rmrr->rm_limit - (uintptr_t)rmrr->rm_base + 1) / IMMU_PAGESIZE; - e = immu_dvma_map(NULL, NULL, &mrng, 0, rdip, - IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | - IMMU_FLAGS_MEMRNG); - /* - * dip may have unity domain or xlate domain - * If the former, PHYSICAL is returned else - * MAPPED is returned. - */ - ASSERT(e == DDI_DMA_MAPPED || - e == DDI_DMA_USE_PHYSICAL); + + (void) immu_map_memrange(rdip, &mrng); } } } @@ -1219,15 +1207,6 @@ found: return (drhd->dr_immu); } -char * -immu_dmar_unit_name(void *dmar_unit) -{ - drhd_t *drhd = (drhd_t *)dmar_unit; - - ASSERT(drhd->dr_dip); - return (ddi_node_name(drhd->dr_dip)); -} - dev_info_t * immu_dmar_unit_dip(void *dmar_unit) { diff --git a/usr/src/uts/i86pc/io/immu_dvma.c b/usr/src/uts/i86pc/io/immu_dvma.c index 59ce95439a..4dfa9c05b4 100644 --- a/usr/src/uts/i86pc/io/immu_dvma.c +++ b/usr/src/uts/i86pc/io/immu_dvma.c @@ -42,6 +42,8 @@ #include <sys/acpica.h> #include <sys/modhash.h> #include <sys/immu.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> #undef TEST @@ -71,13 +73,45 @@ static domain_t *domain_create(immu_t *immu, dev_info_t *ddip, static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus, int dev, int func, immu_flags_t immu_flags); static void destroy_immu_devi(immu_devi_t *immu_devi); -static boolean_t dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, - uint64_t nvpages, dcookie_t *dcookies, int dcount, dev_info_t *rdip, +static boolean_t dvma_map(domain_t *domain, uint64_t sdvma, + uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, immu_flags_t immu_flags); /* Extern globals */ extern struct memlist *phys_install; +/* + * iommulib interface functions. 
+ */ +static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip); +static int immu_allochdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, + int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep); +static int immu_freehdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); +static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req, + ddi_dma_cookie_t *cookiep, uint_t *ccountp); +static int immu_unbindhdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle); +static int immu_sync(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len, + uint_t cachefl); +static int immu_win(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, + off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); +static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao); +static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao); +static int immu_map(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, struct ddi_dma_req *dmareq, + ddi_dma_handle_t *dma_handle); +static int immu_mctl(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + enum ddi_dma_ctlops request, off_t *offp, size_t *lenp, + caddr_t *objpp, uint_t cachefl); /* static Globals */ @@ -106,6 +140,33 @@ static ddi_device_acc_attr_t immu_acc_attr = { DDI_STRICTORDER_ACC }; +struct iommulib_ops immulib_ops = { + IOMMU_OPS_VERSION, + INTEL_IOMMU, + "Intel IOMMU", + NULL, + immu_probe, + immu_allochdl, + immu_freehdl, + immu_bindhdl, + immu_unbindhdl, + immu_sync, + immu_win, + immu_mapobject, + immu_unmapobject, + immu_map, + immu_mctl +}; + +/* + * Fake physical address range used to set up initial prealloc mappings. + * This memory is never actually accessed. It is mapped read-only, + * and is overwritten as soon as the first DMA bind operation is + * performed. Since 0 is a special case, just start at the 2nd + * physical page. 
+ */ + +static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES }; /* globals private to this file */ static kmutex_t immu_domain_lock; @@ -124,6 +185,9 @@ typedef struct xlate { static mod_hash_t *bdf_domain_hash; +int immu_use_alh; +int immu_use_tm; + static domain_t * bdf_domain_lookup(immu_devi_t *immu_devi) { @@ -155,15 +219,12 @@ bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain) int16_t bus = immu_devi->imd_bus; int16_t devfunc = immu_devi->imd_devfunc; uintptr_t bdf = (seg << 16 | bus << 8 | devfunc); - int r; if (seg < 0 || bus < 0 || devfunc < 0) { return; } - r = mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain); - ASSERT(r != MH_ERR_DUPLICATE); - ASSERT(r == 0); + (void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain); } static int @@ -172,10 +233,6 @@ match_lpc(dev_info_t *pdip, void *arg) immu_devi_t *immu_devi; dvma_arg_t *dvap = (dvma_arg_t *)arg; - ASSERT(dvap->dva_error == DDI_FAILURE); - ASSERT(dvap->dva_ddip == NULL); - ASSERT(dvap->dva_list); - if (list_is_empty(dvap->dva_list)) { return (DDI_WALK_TERMINATE); } @@ -183,7 +240,6 @@ match_lpc(dev_info_t *pdip, void *arg) immu_devi = list_head(dvap->dva_list); for (; immu_devi; immu_devi = list_next(dvap->dva_list, immu_devi)) { - ASSERT(immu_devi->imd_dip); if (immu_devi->imd_dip == pdip) { dvap->dva_ddip = pdip; dvap->dva_error = DDI_SUCCESS; @@ -200,8 +256,6 @@ immu_devi_set_spclist(dev_info_t *dip, immu_t *immu) list_t *spclist = NULL; immu_devi_t *immu_devi; - ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_lock))); - immu_devi = IMMU_DEVI(dip); if (immu_devi->imd_display == B_TRUE) { spclist = &(immu->immu_dvma_gfx_list); @@ -226,10 +280,6 @@ immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags) immu_devi_t *new_imd; immu_devi_t *immu_devi; - ASSERT(root_devinfo); - ASSERT(dip); - ASSERT(dip != root_devinfo); - immu_devi = immu_devi_get(dip); if (immu_devi != NULL) { return (DDI_SUCCESS); @@ -305,7 +355,7 @@ get_gfx_devinfo(dev_info_t *rdip) list_t *list_gfx; /* - * The GFX device may not be on the same IMMU unit as "agpgart" + * The GFX device may not be on the same iommu unit as "agpgart" * so search globally */ immu_devi = NULL; @@ -319,16 +369,12 @@ get_gfx_devinfo(dev_info_t *rdip) } if (immu_devi == NULL) { - ddi_err(DER_WARN, rdip, "IMMU: No GFX device. " + ddi_err(DER_WARN, rdip, "iommu: No GFX device. 
" "Cannot redirect agpgart"); return (NULL); } - /* list is not empty we checked above */ - ASSERT(immu_devi); - ASSERT(immu_devi->imd_dip); - - ddi_err(DER_LOG, rdip, "IMMU: GFX redirect to %s", + ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s", ddi_node_name(immu_devi->imd_dip)); return (immu_devi->imd_dip); @@ -373,6 +419,7 @@ dma_to_immu_flags(struct ddi_dma_req *dmareq) return (flags); } +/*ARGSUSED*/ int pgtable_ctor(void *buf, void *arg, int kmflag) { @@ -381,9 +428,8 @@ pgtable_ctor(void *buf, void *arg, int kmflag) int (*dmafp)(caddr_t); caddr_t vaddr; void *next; - - ASSERT(buf); - ASSERT(arg == NULL); + uint_t flags; + immu_t *immu = arg; pgtable = (pgtable_t *)buf; @@ -394,15 +440,18 @@ pgtable_ctor(void *buf, void *arg, int kmflag) return (-1); } - ASSERT(root_devinfo); if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr, dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) { kmem_free(next, IMMU_PAGESIZE); return (-1); } + flags = DDI_DMA_CONSISTENT; + if (!immu->immu_dvma_coherent) + flags |= IOMEM_DATA_UC_WR_COMBINE; + if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE, - &immu_acc_attr, DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED, + &immu_acc_attr, flags, dmafp, NULL, &vaddr, &actual_size, &pgtable->hwpg_memhdl) != DDI_SUCCESS) { ddi_dma_free_handle(&pgtable->hwpg_dmahdl); @@ -430,16 +479,13 @@ pgtable_ctor(void *buf, void *arg, int kmflag) return (0); } +/*ARGSUSED*/ void pgtable_dtor(void *buf, void *arg) { pgtable_t *pgtable; - ASSERT(buf); - ASSERT(arg == NULL); - pgtable = (pgtable_t *)buf; - ASSERT(pgtable->swpg_next_array); /* destroy will panic if lock is held. */ rw_destroy(&(pgtable->swpg_rwlock)); @@ -447,8 +493,6 @@ pgtable_dtor(void *buf, void *arg) ddi_dma_mem_free(&pgtable->hwpg_memhdl); ddi_dma_free_handle(&pgtable->hwpg_dmahdl); kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE); - - /* don't zero out hwpg_vaddr and swpg_next_array for debugging */ } /* @@ -469,11 +513,9 @@ pgtable_alloc(immu_t *immu, immu_flags_t immu_flags) pgtable_t *pgtable; int kmflags; - ASSERT(immu); - kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; - pgtable = kmem_cache_alloc(immu_pgtable_cache, kmflags); + pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags); if (pgtable == NULL) { return (NULL); } @@ -481,22 +523,16 @@ pgtable_alloc(immu_t *immu, immu_flags_t immu_flags) } static void -pgtable_zero(immu_t *immu, pgtable_t *pgtable) +pgtable_zero(pgtable_t *pgtable) { bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE); bzero(pgtable->swpg_next_array, IMMU_PAGESIZE); - - /* Dont need to flush the write we will flush when we use the entry */ - immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE); } static void pgtable_free(immu_t *immu, pgtable_t *pgtable) { - ASSERT(immu); - ASSERT(pgtable); - - kmem_cache_free(immu_pgtable_cache, pgtable); + kmem_cache_free(immu->immu_pgtable_cache, pgtable); } /* @@ -564,6 +600,14 @@ device_is_pciex( return (is_pciex); } +static boolean_t +device_use_premap(uint_t classcode) +{ + if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET) + return (B_TRUE); + return (B_FALSE); +} + /* * immu_dvma_get_immu() @@ -591,7 +635,6 @@ immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags) /*NOTREACHED*/ } immu_devi = immu_devi_get(dip); - ASSERT(immu_devi); } mutex_enter(&(DEVI(dip)->devi_lock)); @@ -715,9 +758,9 @@ create_immu_devi(dev_info_t *rdip, int bus, int dev, int func, /* check for certain special devices */ immu_devi->imd_display = device_is_display(classcode); - immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) && (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE; + immu_devi->imd_use_premap = device_use_premap(classcode); immu_devi->imd_domain = NULL; @@ -739,9 +782,6 @@ immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp) domain_t *domain; dev_info_t *ddip; - ASSERT(rdip); - ASSERT(ddipp); - *ddipp = NULL; immu_devi = immu_devi_get(rdip); @@ -754,11 +794,8 @@ immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp) ddip = immu_devi->imd_ddip; mutex_exit(&(DEVI(rdip)->devi_lock)); - if (domain) { - ASSERT(domain->dom_did > 0); - ASSERT(ddip); + if (domain) *ddipp = ddip; - } return (domain); @@ -776,16 +813,10 @@ did_alloc(immu_t *immu, dev_info_t *rdip, { int did; - ASSERT(immu); - ASSERT(rdip); - ASSERT(rdip != root_devinfo); - did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1, (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP); if (did == 0) { - ASSERT(immu->immu_unity_domain); - ASSERT(immu->immu_unity_domain->dom_did > 0); ddi_err(DER_WARN, rdip, "device domain-id alloc error" " domain-device: %s%d. immu unit is %s. 
Using " "unity domain with domain-id (%d)", @@ -806,10 +837,6 @@ get_branch_domain(dev_info_t *pdip, void *arg) immu_t *immu; dvma_arg_t *dvp = (dvma_arg_t *)arg; - ASSERT(pdip); - ASSERT(dvp); - ASSERT(dvp->dva_rdip); - /* * The field dvp->dva_rdip is a work-in-progress * and gets updated as we walk up the ancestor @@ -828,12 +855,9 @@ get_branch_domain(dev_info_t *pdip, void *arg) } immu_devi = immu_devi_get(pdip); - ASSERT(immu_devi); immu = immu_devi->imd_immu; - if (immu == NULL) { + if (immu == NULL) immu = immu_dvma_get_immu(pdip, dvp->dva_flags); - ASSERT(immu); - } /* * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to @@ -879,10 +903,6 @@ get_branch_domain(dev_info_t *pdip, void *arg) return (DDI_WALK_TERMINATE); } - /* immu_devi either has both set or both clear */ - ASSERT(domain == NULL); - ASSERT(ddip == NULL); - /* Domain may already be set, continue walking so that ddip gets set */ if (dvp->dva_domain) { return (DDI_WALK_CONTINUE); @@ -899,7 +919,6 @@ get_branch_domain(dev_info_t *pdip, void *arg) /* Grab lock again to check if something else set immu_devi fields */ mutex_enter(&(DEVI(pdip)->devi_lock)); if (immu_devi->imd_domain != NULL) { - ASSERT(immu_devi->imd_domain == domain); dvp->dva_domain = domain; } else { dvp->dva_domain = domain; @@ -919,19 +938,9 @@ map_unity_domain(domain_t *domain) struct memlist *mp; uint64_t start; uint64_t npages; - dcookie_t dcookies[1] = {0}; + immu_dcookie_t dcookies[1] = {0}; int dcount = 0; - ASSERT(domain); - ASSERT(domain->dom_did == IMMU_UNITY_DID); - - /* - * We call into routines that grab the lock so we should - * not be called with the lock held. This does not matter - * much since, no else has a reference to this domain - */ - ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock))); - /* * UNITY arenas are a mirror of the physical memory * installed on the system. 
@@ -944,7 +953,7 @@ map_unity_domain(domain_t *domain) dcookies[0].dck_paddr = 0; dcookies[0].dck_npages = 1; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL, + (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); #endif @@ -963,17 +972,17 @@ map_unity_domain(domain_t *domain) dcookies[0].dck_paddr = start; dcookies[0].dck_npages = npages; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, start, npages, dcookies, + (void) dvma_map(domain, start, npages, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); - ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64 + ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", start, start + mp->ml_size); mp = mp->ml_next; while (mp) { - ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64 - " - 0x%" PRIx64 "]", mp->ml_address, - mp->ml_address + mp->ml_size); + ddi_err(DER_LOG, domain->dom_dip, + "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", + mp->ml_address, mp->ml_address + mp->ml_size); start = mp->ml_address; npages = mp->ml_size/IMMU_PAGESIZE + 1; @@ -981,16 +990,16 @@ map_unity_domain(domain_t *domain) dcookies[0].dck_paddr = start; dcookies[0].dck_npages = npages; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, start, npages, + (void) dvma_map(domain, start, npages, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); mp = mp->ml_next; } mp = bios_rsvd; while (mp) { - ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64 - " - 0x%" PRIx64 "]", mp->ml_address, - mp->ml_address + mp->ml_size); + ddi_err(DER_LOG, domain->dom_dip, + "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]", + mp->ml_address, mp->ml_address + mp->ml_size); start = mp->ml_address; npages = mp->ml_size/IMMU_PAGESIZE + 1; @@ -998,7 +1007,7 @@ map_unity_domain(domain_t *domain) dcookies[0].dck_paddr = start; dcookies[0].dck_npages = npages; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, start, npages, + (void) dvma_map(domain, start, npages, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); mp = mp->ml_next; @@ -1035,12 +1044,6 @@ create_xlate_arena(immu_t *immu, domain_t *domain, vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP; - /* - * No one else has access to this domain. 
- * So no domain locks needed - */ - ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock))); - /* Restrict mgaddr (max guest addr) to MGAW */ mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap); @@ -1066,11 +1069,9 @@ create_xlate_arena(immu_t *immu, domain_t *domain, size = mp->ml_size; ddi_err(DER_VERB, rdip, - "%s: Creating dvma vmem arena [0x%" PRIx64 + "iommu: %s: Creating dvma vmem arena [0x%" PRIx64 " - 0x%" PRIx64 "]", arena_name, start, start + size); - ASSERT(domain->dom_dvma_arena == NULL); - /* * We always allocate in quanta of IMMU_PAGESIZE */ @@ -1105,7 +1106,7 @@ create_xlate_arena(immu_t *immu, domain_t *domain, size = mp->ml_size; ddi_err(DER_VERB, rdip, - "%s: Adding dvma vmem span [0x%" PRIx64 + "iommu: %s: Adding dvma vmem span [0x%" PRIx64 " - 0x%" PRIx64 "]", arena_name, start, start + size); @@ -1139,13 +1140,7 @@ set_domain( domain_t *fdomain; dev_info_t *fddip; - ASSERT(dip); - ASSERT(ddip); - ASSERT(domain); - ASSERT(domain->dom_did > 0); /* must be an initialized domain */ - immu_devi = immu_devi_get(dip); - ASSERT(immu_devi); mutex_enter(&(DEVI(dip)->devi_lock)); fddip = immu_devi->imd_ddip; @@ -1187,8 +1182,6 @@ device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) dvma_arg_t dvarg = {0}; int level; - ASSERT(rdip); - *ddipp = NULL; /* @@ -1198,8 +1191,6 @@ device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) ddip = NULL; domain = immu_devi_domain(rdip, &ddip); if (domain) { - ASSERT(domain->dom_did > 0); - ASSERT(ddip); *ddipp = ddip; return (domain); } @@ -1210,7 +1201,7 @@ device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) * possible that there is no IOMMU unit for this device * - BIOS bugs are one example. */ - ddi_err(DER_WARN, rdip, "No IMMU unit found for device"); + ddi_err(DER_WARN, rdip, "No iommu unit found for device"); return (NULL); } @@ -1262,7 +1253,6 @@ device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags) if (domain == NULL) { return (NULL); } - ASSERT(domain->dom_did > 0); /*FALLTHROUGH*/ found: @@ -1283,10 +1273,6 @@ create_unity_domain(immu_t *immu) { domain_t *domain; - /* 0 is reserved by Vt-d */ - /*LINTED*/ - ASSERT(IMMU_UNITY_DID > 0); - /* domain created during boot and always use sleep flag */ domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP); @@ -1303,10 +1289,16 @@ create_unity_domain(immu_t *immu) * should never fail. */ domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); - ASSERT(domain->dom_pgtable_root); - pgtable_zero(immu, domain->dom_pgtable_root); + pgtable_zero(domain->dom_pgtable_root); + + /* + * Only map all physical memory in to the unity domain + * if passthrough is not supported. If it is supported, + * passthrough is set in the context entry instead. + */ + if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap)) + map_unity_domain(domain); - map_unity_domain(domain); /* * put it on the system-wide UNITY domain list @@ -1331,23 +1323,17 @@ domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, char mod_hash_name[128]; immu_devi_t *immu_devi; int did; - dcookie_t dcookies[1] = {0}; + immu_dcookie_t dcookies[1] = {0}; int dcount = 0; - ASSERT(immu); - ASSERT(ddip); - immu_devi = immu_devi_get(rdip); - ASSERT(immu_devi); - /* * First allocate a domainid. * This routine will never fail, since if we run out * of domains the unity domain will be allocated. 
*/ did = did_alloc(immu, rdip, ddip, immu_flags); - ASSERT(did > 0); if (did == IMMU_UNITY_DID) { /* domain overflow */ ASSERT(immu->immu_unity_domain); @@ -1370,6 +1356,7 @@ domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, domain->dom_did = did; domain->dom_immu = immu; domain->dom_maptype = IMMU_MAPTYPE_XLATE; + domain->dom_dip = ddip; /* * Create xlate DVMA arena for this domain. @@ -1386,7 +1373,7 @@ domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, domain->dom_did, immu->immu_name); /*NOTREACHED*/ } - pgtable_zero(immu, domain->dom_pgtable_root); + pgtable_zero(domain->dom_pgtable_root); /* * Since this is a immu unit-specific domain, put it on @@ -1412,7 +1399,7 @@ domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip, dcookies[0].dck_paddr = 0; dcookies[0].dck_npages = 1; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL, + (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1); #endif return (domain); @@ -1440,8 +1427,8 @@ did_init(immu_t *immu) sizeof (immu->immu_did_arena_name), "%s_domainid_arena", immu->immu_name); - ddi_err(DER_VERB, NULL, "%s: Creating domainid arena %s", - immu->immu_name, immu->immu_did_arena_name); + ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s", + immu->immu_did_arena_name); immu->immu_did_arena = vmem_create( immu->immu_did_arena_name, @@ -1470,7 +1457,6 @@ context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table, { pgtable_t *context; pgtable_t *pgtable_root; - pgtable_t *unity_pgtable_root; hw_rce_t *hw_rent; hw_rce_t *hw_cent; hw_rce_t *ctxp; @@ -1479,13 +1465,6 @@ context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table, boolean_t fill_root; boolean_t fill_ctx; - ASSERT(immu); - ASSERT(domain); - ASSERT(root_table); - ASSERT(bus >= 0); - ASSERT(devfunc >= 0); - ASSERT(domain->dom_pgtable_root); - pgtable_root = domain->dom_pgtable_root; ctxp = (hw_rce_t *)(root_table->swpg_next_array); @@ -1500,15 +1479,8 @@ context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table, rwtype = RW_READER; again: if (ROOT_GET_P(hw_rent)) { - ASSERT(ROOT_GET_CONT(hw_rent) == context->hwpg_paddr); hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) { - ASSERT(CONT_GET_P(hw_cent)); - ASSERT(CONT_GET_DID(hw_cent) == domain->dom_did); - ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw); - ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY); - ASSERT(CONT_GET_ASR(hw_cent) == - pgtable_root->hwpg_paddr); rw_exit(&(immu->immu_ctx_rwlock)); return; } else { @@ -1536,35 +1508,32 @@ again: if (fill_ctx == B_TRUE) { hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc; - unity_pgtable_root = immu->immu_unity_domain->dom_pgtable_root; - ASSERT(CONT_GET_AVAIL(hw_cent) == IMMU_CONT_UNINITED); - ASSERT(CONT_GET_P(hw_cent)); - ASSERT(CONT_GET_DID(hw_cent) == - immu->immu_unity_domain->dom_did); - ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw); - ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY); - ASSERT(CONT_GET_ASR(hw_cent) == - unity_pgtable_root->hwpg_paddr); - /* need to disable context entry before reprogramming it */ bzero(hw_cent, sizeof (hw_rce_t)); /* flush caches */ immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); - ASSERT(rw_write_held(&(immu->immu_ctx_rwlock))); sid = ((bus << 8) | devfunc); - immu_flush_context_fsi(immu, 0, sid, domain->dom_did); - - immu_regs_wbf_flush(immu); + immu_flush_context_fsi(immu, 0, sid, 
domain->dom_did, + &immu->immu_ctx_inv_wait); CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED); CONT_SET_DID(hw_cent, domain->dom_did); CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); - /*LINTED*/ - CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); + if (domain->dom_did == IMMU_UNITY_DID && + IMMU_ECAP_GET_PT(immu->immu_regs_excap)) + CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); + else + /*LINTED*/ + CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); CONT_SET_P(hw_cent); + if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) { + CONT_SET_EH(hw_cent); + if (immu_use_alh) + CONT_SET_ALH(hw_cent); + } immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t)); } rw_exit(&(immu->immu_ctx_rwlock)); @@ -1584,7 +1553,7 @@ context_create(immu_t *immu) /* Allocate a zeroed root table (4K 256b entries) */ root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); - pgtable_zero(immu, root_table); + pgtable_zero(root_table); /* * Setup context tables for all possible root table entries. @@ -1594,29 +1563,29 @@ context_create(immu_t *immu) hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr); for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) { context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP); - pgtable_zero(immu, context); - ASSERT(ROOT_GET_P(hw_rent) == 0); + pgtable_zero(context); ROOT_SET_P(hw_rent); ROOT_SET_CONT(hw_rent, context->hwpg_paddr); hw_cent = (hw_rce_t *)(context->hwpg_vaddr); for (devfunc = 0; devfunc < IMMU_CONT_NUM; devfunc++, hw_cent++) { - ASSERT(CONT_GET_P(hw_cent) == 0); pgtable_root = immu->immu_unity_domain->dom_pgtable_root; CONT_SET_DID(hw_cent, immu->immu_unity_domain->dom_did); CONT_SET_AW(hw_cent, immu->immu_dvma_agaw); CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr); - /*LINTED*/ - CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); + if (IMMU_ECAP_GET_PT(immu->immu_regs_excap)) + CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU); + else + /*LINTED*/ + CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY); CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED); CONT_SET_P(hw_cent); } immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE); *((pgtable_t **)ctxp) = context; } - immu_regs_cpu_flush(immu, root_table->hwpg_vaddr, IMMU_PAGESIZE); return (root_table); } @@ -1627,11 +1596,10 @@ context_create(immu_t *immu) static void context_init(immu_t *immu) { - ASSERT(immu); - ASSERT(immu->immu_ctx_root == NULL); - rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL); + immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE); + immu_regs_wbf_flush(immu); immu->immu_ctx_root = context_create(immu); @@ -1639,10 +1607,9 @@ context_init(immu_t *immu) immu_regs_set_root_table(immu); rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER); - immu_flush_context_gbl(immu); + immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait); + immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait); rw_exit(&(immu->immu_ctx_rwlock)); - immu_flush_iotlb_gbl(immu); - immu_regs_wbf_flush(immu); } @@ -1655,10 +1622,7 @@ find_top_pcib(dev_info_t *dip, void *arg) immu_devi_t *immu_devi; dev_info_t **pcibdipp = (dev_info_t **)arg; - ASSERT(dip); - immu_devi = immu_devi_get(dip); - ASSERT(immu_devi); if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) { *pcibdipp = dip; @@ -1678,7 +1642,6 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, int r_devfunc; int d_devfunc; immu_pcib_t d_pcib_type; - immu_pcib_t r_pcib_type; dev_info_t *pcibdip; if (ddip == NULL || rdip == NULL || @@ -1708,23 +1671,14 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, d_immu_devi = immu_devi_get(ddip); 
r_immu_devi = immu_devi_get(rdip); - ASSERT(r_immu_devi); - ASSERT(d_immu_devi); d_bus = d_immu_devi->imd_bus; d_devfunc = d_immu_devi->imd_devfunc; d_pcib_type = d_immu_devi->imd_pcib_type; r_bus = r_immu_devi->imd_bus; r_devfunc = r_immu_devi->imd_devfunc; - r_pcib_type = r_immu_devi->imd_pcib_type; - - ASSERT(d_bus >= 0); if (rdip == ddip) { - ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT || - d_pcib_type == IMMU_PCIB_PCIE_PCIE); - ASSERT(r_bus >= 0); - ASSERT(r_devfunc >= 0); /* rdip is a PCIE device. set context for it only */ context_set(immu, domain, immu->immu_ctx_root, r_bus, r_devfunc); @@ -1734,9 +1688,6 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and " "0x%lx are identical", rdip, ddip); #endif - ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT); - ASSERT(r_bus >= 0); - ASSERT(r_devfunc >= 0); /* rdip is a PCIE device. set context for it only */ context_set(immu, domain, immu->immu_ctx_root, r_bus, r_devfunc); @@ -1758,11 +1709,7 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, if (immu_walk_ancestor(rdip, ddip, find_top_pcib, &pcibdip, NULL, immu_flags) == DDI_SUCCESS && pcibdip != NULL) { - ASSERT(pcibdip); r_immu_devi = immu_devi_get(pcibdip); - ASSERT(d_immu_devi); - ASSERT(d_immu_devi->imd_pcib_type == - IMMU_PCIB_PCI_PCI); r_bus = r_immu_devi->imd_bus; r_devfunc = r_immu_devi->imd_devfunc; context_set(immu, domain, immu->immu_ctx_root, @@ -1777,7 +1724,6 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, context_set(immu, domain, immu->immu_ctx_root, d_bus, d_devfunc); } else if (d_pcib_type == IMMU_PCIB_ENDPOINT) { - ASSERT(r_pcib_type == IMMU_PCIB_NOBDF); /* * ddip is a PCIE device which has a non-PCI device under it * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata @@ -1786,7 +1732,7 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, d_devfunc); } else { ddi_err(DER_PANIC, rdip, "unknown device type. Cannot " - "set IMMU context."); + "set iommu context."); /*NOTREACHED*/ } @@ -1798,16 +1744,11 @@ immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip, /* ##################### MAPPING CODE ################################## */ +#ifdef DEBUG static boolean_t PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr, dev_info_t *rdip, immu_flags_t immu_flags) { - if (immu_flags & IMMU_FLAGS_PAGE1) { - ASSERT(paddr == 0); - } else { - ASSERT((next == NULL) ^ (paddr == 0)); - } - /* The PDTE must be set i.e. 
present bit is set */ if (!PDTE_P(pdte)) { ddi_err(DER_MODE, rdip, "No present flag"); @@ -1904,6 +1845,8 @@ PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr, return (B_TRUE); } +#endif + /*ARGSUSED*/ static void PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, @@ -1915,23 +1858,13 @@ PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, hw_pdte_t *hwp; hw_pdte_t *shwp; int idx; - hw_pdte_t pte; - - ASSERT(xlate->xlt_level == 1); pgtable = xlate->xlt_pgtable; idx = xlate->xlt_idx; - ASSERT(pgtable); - ASSERT(idx <= IMMU_PGTABLE_MAXIDX); - dvma = *dvma_ptr; npages = *npages_ptr; - ASSERT(dvma); - ASSERT(dvma % IMMU_PAGESIZE == 0); - ASSERT(npages); - /* * since a caller gets a unique dvma for a physical address, * no other concurrent thread will be writing to the same @@ -1941,45 +1874,23 @@ PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate, hwp = shwp; for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { - - pte = *hwp; - - /* Cannot clear a HW PTE that is aleady clear */ - ASSERT(PDTE_P(pte)); - PDTE_CLEAR_P(pte); - *hwp = pte; - + PDTE_CLEAR_P(*hwp); dvma += IMMU_PAGESIZE; npages--; } - -#ifdef TEST - /* dont need to flush write during unmap */ - immu_regs_cpu_flush(immu, (caddr_t)shwp, - (hwp - shwp) * sizeof (hw_pdte_t)); -#endif - *dvma_ptr = dvma; *npages_ptr = npages; xlate->xlt_idx = idx; } -/*ARGSUSED*/ static void -xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate, - int nlevels, dev_info_t *rdip) +xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels) { int level; uint64_t offbits; - /* level 0 is never used. Sanity check */ - ASSERT(xlate->xlt_level == 0); - ASSERT(xlate->xlt_idx == 0); - ASSERT(xlate->xlt_pgtable == NULL); - ASSERT(dvma % IMMU_PAGESIZE == 0); - /* * Skip the first 12 bits which is the offset into * 4K PFN (phys page frame based on IMMU_PAGESIZE) @@ -1999,41 +1910,28 @@ xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate, /* * Read the pgtables */ -static void -PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels, - dev_info_t *rdip) +static boolean_t +PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels) { pgtable_t *pgtable; pgtable_t *next; - hw_pdte_t pde; uint_t idx; - /* xlate should be at level 0 */ - ASSERT(xlate->xlt_level == 0); - ASSERT(xlate->xlt_idx == 0); - /* start with highest level pgtable i.e. root */ xlate += nlevels; - ASSERT(xlate->xlt_level == nlevels); if (xlate->xlt_pgtable == NULL) { xlate->xlt_pgtable = domain->dom_pgtable_root; } for (; xlate->xlt_level > 1; xlate--) { - idx = xlate->xlt_idx; pgtable = xlate->xlt_pgtable; - ASSERT(pgtable); - ASSERT(idx <= IMMU_PGTABLE_MAXIDX); - if ((xlate - 1)->xlt_pgtable) { continue; } - /* xlate's leafier level is not set, set it now */ - /* Lock the pgtable in read mode */ rw_enter(&(pgtable->swpg_rwlock), RW_READER); @@ -2042,16 +1940,76 @@ PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels, * already point to a leafier pgtable. 
*/ next = *(pgtable->swpg_next_array + idx); - ASSERT(next); + (xlate - 1)->xlt_pgtable = next; + rw_exit(&(pgtable->swpg_rwlock)); + if (next == NULL) + return (B_FALSE); + } + + return (B_TRUE); +} - pde = *((hw_pdte_t *)(pgtable->hwpg_vaddr) + idx); +static void +immu_fault_walk(void *arg, void *base, size_t len) +{ + uint64_t dvma, start; - ASSERT(PDTE_check(immu, pde, next, 0, rdip, 0) == B_TRUE); + dvma = *(uint64_t *)arg; + start = (uint64_t)(uintptr_t)base; - (xlate - 1)->xlt_pgtable = next; + if (dvma >= start && dvma < (start + len)) { + ddi_err(DER_WARN, NULL, + "faulting DVMA address is in vmem arena " + "(%" PRIx64 "-%" PRIx64 ")", + start, start + len); + *(uint64_t *)arg = ~0ULL; + } +} - rw_exit(&(pgtable->swpg_rwlock)); +void +immu_print_fault_info(uint_t sid, uint64_t dvma) +{ + int nlevels; + xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; + xlate_t *xlatep; + hw_pdte_t pte; + domain_t *domain; + immu_t *immu; + uint64_t dvma_arg; + + if (mod_hash_find(bdf_domain_hash, + (void *)(uintptr_t)sid, (void *)&domain) != 0) { + ddi_err(DER_WARN, NULL, + "no domain for faulting SID %08x", sid); + return; } + + immu = domain->dom_immu; + + dvma_arg = dvma; + vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk, + (void *)&dvma_arg); + if (dvma_arg != ~0ULL) + ddi_err(DER_WARN, domain->dom_dip, + "faulting DVMA address is not in vmem arena"); + + nlevels = immu->immu_dvma_nlevels; + xlate_setup(dvma, xlate, nlevels); + + if (!PDE_lookup(domain, xlate, nlevels)) { + ddi_err(DER_WARN, domain->dom_dip, + "pte not found in domid %d for faulting addr %" PRIx64, + domain->dom_did, dvma); + return; + } + + xlatep = &xlate[1]; + pte = *((hw_pdte_t *) + (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx); + + ddi_err(DER_WARN, domain->dom_dip, + "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did, + (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte)); } /*ARGSUSED*/ @@ -2061,17 +2019,11 @@ PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, { hw_pdte_t pte; - pte = *hwp; - #ifndef DEBUG - /* Set paddr */ - ASSERT(paddr % IMMU_PAGESIZE == 0); - pte = 0; + pte = immu->immu_ptemask; PDTE_SET_PADDR(pte, paddr); - PDTE_SET_READ(pte); - PDTE_SET_WRITE(pte); - *hwp = pte; #else + pte = *hwp; if (PDTE_P(pte)) { if (PDTE_PADDR(pte) != paddr) { @@ -2085,17 +2037,13 @@ PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, #endif } - /* Don't touch SW4. It is the present field */ - /* clear TM field if not reserved */ if (immu->immu_TM_reserved == B_FALSE) { PDTE_CLEAR_TM(pte); } -#ifdef DEBUG /* Clear 3rd field for system software - not used */ PDTE_CLEAR_SW3(pte); -#endif /* Set paddr */ ASSERT(paddr % IMMU_PAGESIZE == 0); @@ -2107,21 +2055,15 @@ PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, PDTE_CLEAR_SNP(pte); } -#ifdef DEBUG /* Clear SW2 field available for software */ PDTE_CLEAR_SW2(pte); -#endif -#ifdef DEBUG /* SP is don't care for PTEs. 
Clear it for cleanliness */ PDTE_CLEAR_SP(pte); -#endif -#ifdef DEBUG /* Clear SW1 field available for software */ PDTE_CLEAR_SW1(pte); -#endif /* * Now that we are done writing the PTE @@ -2136,7 +2078,10 @@ PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr, PDTE_SET_P(pte); + pte |= immu->immu_ptemask; + out: +#endif /* DEBUG */ #ifdef BUGGY_DRIVERS PDTE_SET_READ(pte); PDTE_SET_WRITE(pte); @@ -2145,16 +2090,15 @@ out: PDTE_SET_READ(pte); if (immu_flags & IMMU_FLAGS_WRITE) PDTE_SET_WRITE(pte); -#endif +#endif /* BUGGY_DRIVERS */ *hwp = pte; -#endif } /*ARGSUSED*/ static void PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, - uint64_t *dvma_ptr, uint64_t *nvpages_ptr, dcookie_t *dcookies, + uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, immu_flags_t immu_flags) { paddr_t paddr; @@ -2164,23 +2108,15 @@ PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, pgtable_t *pgtable; hw_pdte_t *hwp; hw_pdte_t *shwp; - int idx; + int idx, nset; int j; - ASSERT(xlate->xlt_level == 1); - pgtable = xlate->xlt_pgtable; idx = xlate->xlt_idx; - ASSERT(idx <= IMMU_PGTABLE_MAXIDX); - ASSERT(pgtable); - dvma = *dvma_ptr; nvpages = *nvpages_ptr; - ASSERT(dvma || (immu_flags & IMMU_FLAGS_PAGE1)); - ASSERT(nvpages); - /* * since a caller gets a unique dvma for a physical address, * no other concurrent thread will be writing to the same @@ -2195,19 +2131,15 @@ PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, nvpages -= dcookies[j].dck_npages; } - ASSERT(j >= 0); - ASSERT(nvpages); - ASSERT(nvpages <= dcookies[j].dck_npages); nppages = nvpages; paddr = dcookies[j].dck_paddr + (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE; nvpages = *nvpages_ptr; + nset = 0; for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) { - - ASSERT(paddr || (immu_flags & IMMU_FLAGS_PAGE1)); - PTE_set_one(immu, hwp, paddr, rdip, immu_flags); + nset++; ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags) == B_TRUE); @@ -2220,22 +2152,15 @@ PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, j++; } - if (j == dcount) { - ASSERT(nvpages == 0); + if (j == dcount) break; - } - ASSERT(nvpages); if (nppages == 0) { nppages = dcookies[j].dck_npages; paddr = dcookies[j].dck_paddr; } } - /* flush writes to HW PTE table */ - immu_regs_cpu_flush(immu, (caddr_t)shwp, (hwp - shwp) * - sizeof (hw_pdte_t)); - if (nvpages) { *dvma_ptr = dvma; *nvpages_ptr = nvpages; @@ -2274,7 +2199,6 @@ PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next, PDTE_CLEAR_SW3(pde); /* Set next level pgtable-paddr for PDE */ - ASSERT(next->hwpg_paddr % IMMU_PAGESIZE == 0); PDTE_CLEAR_PADDR(pde); PDTE_SET_PADDR(pde, next->hwpg_paddr); @@ -2314,8 +2238,6 @@ out: PDTE_SET_P(pde); *hwp = pde; - - immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t)); } /* @@ -2334,42 +2256,20 @@ PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels, krw_t rwtype; boolean_t set = B_FALSE; - /* xlate should be at level 0 */ - ASSERT(xlate->xlt_level == 0); - ASSERT(xlate->xlt_idx == 0); - /* start with highest level pgtable i.e. 
root */ xlate += nlevels; - ASSERT(xlate->xlt_level == nlevels); new = NULL; xlate->xlt_pgtable = domain->dom_pgtable_root; for (level = nlevels; level > 1; level--, xlate--) { - - ASSERT(xlate->xlt_level == level); - idx = xlate->xlt_idx; pgtable = xlate->xlt_pgtable; - ASSERT(pgtable); - ASSERT(idx <= IMMU_PGTABLE_MAXIDX); - - /* speculative alloc */ - if (new == NULL) { - new = pgtable_alloc(immu, immu_flags); - if (new == NULL) { - ddi_err(DER_PANIC, rdip, "pgtable alloc err"); - } - } - /* Lock the pgtable in READ mode first */ rw_enter(&(pgtable->swpg_rwlock), RW_READER); rwtype = RW_READER; again: hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx; - - ASSERT(pgtable->swpg_next_array); - next = (pgtable->swpg_next_array)[idx]; /* @@ -2377,6 +2277,19 @@ again: * if yes, verify */ if (next == NULL) { + if (new == NULL) { + + IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *, + rdip, int, level); + + new = pgtable_alloc(immu, immu_flags); + if (new == NULL) { + ddi_err(DER_PANIC, rdip, + "pgtable alloc err"); + } + pgtable_zero(new); + } + /* Change to a write lock */ if (rwtype == RW_READER && rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) { @@ -2386,18 +2299,18 @@ again: goto again; } rwtype = RW_WRITER; - pgtable_zero(immu, new); next = new; - new = NULL; (pgtable->swpg_next_array)[idx] = next; + new = NULL; PDE_set_one(immu, hwp, next, rdip, immu_flags); set = B_TRUE; rw_downgrade(&(pgtable->swpg_rwlock)); rwtype = RW_READER; - } else { + } +#ifndef BUGGY_DRIVERS + else { hw_pdte_t pde = *hwp; -#ifndef BUGGY_DRIVERS /* * If buggy driver we already set permission * READ+WRITE so nothing to do for that case @@ -2409,16 +2322,14 @@ again: if (immu_flags & IMMU_FLAGS_WRITE) PDTE_SET_WRITE(pde); -#endif - *hwp = pde; } +#endif ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags) == B_TRUE); (xlate - 1)->xlt_pgtable = next; - ASSERT(rwtype == RW_READER); rw_exit(&(pgtable->swpg_rwlock)); } @@ -2442,24 +2353,22 @@ again: * immu_flags: flags */ static boolean_t -dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snvpages, - dcookie_t *dcookies, int dcount, dev_info_t *rdip, immu_flags_t immu_flags) +dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages, + immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip, + immu_flags_t immu_flags) { uint64_t dvma; uint64_t n; + immu_t *immu = domain->dom_immu; int nlevels = immu->immu_dvma_nlevels; xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; boolean_t pde_set = B_FALSE; - ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS); - ASSERT(sdvma % IMMU_PAGESIZE == 0); - ASSERT(snvpages); - n = snvpages; dvma = sdvma; while (n > 0) { - xlate_setup(immu, dvma, xlate, nlevels, rdip); + xlate_setup(dvma, xlate, nlevels); /* Lookup or allocate PGDIRs and PGTABLEs if necessary */ if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags) @@ -2487,28 +2396,27 @@ dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snvpages, * rdip: requesting device */ static void -dvma_unmap(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snpages, +dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages, dev_info_t *rdip) { + immu_t *immu = domain->dom_immu; int nlevels = immu->immu_dvma_nlevels; xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}; uint64_t n; uint64_t dvma; - ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS); - ASSERT(sdvma != 0); - ASSERT(sdvma % IMMU_PAGESIZE == 0); - ASSERT(snpages); - dvma = sdvma; n = snpages; while (n > 0) { /* setup the xlate array */ - xlate_setup(immu, dvma, xlate, nlevels, rdip); + xlate_setup(dvma, 
xlate, nlevels); /* just lookup existing pgtables. Should never fail */ - PDE_lookup(immu, domain, xlate, nlevels, rdip); + if (!PDE_lookup(domain, xlate, nlevels)) + ddi_err(DER_PANIC, rdip, + "PTE not found for addr %" PRIx64, + (unsigned long long)dvma); /* clear all matching ptes that fit into this leaf pgtable */ PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip); @@ -2518,24 +2426,17 @@ dvma_unmap(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snpages, } static uint64_t -dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages) +dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf) { - ddi_dma_attr_t *dma_attr; uint64_t dvma; size_t xsize, align; uint64_t minaddr, maxaddr; - ASSERT(domain->dom_maptype != IMMU_MAPTYPE_UNITY); - - /* shotcuts */ - dma_attr = &(hp->dmai_attr); - /* parameters */ xsize = npages * IMMU_PAGESIZE; align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); minaddr = dma_attr->dma_attr_addr_lo; maxaddr = dma_attr->dma_attr_addr_hi + 1; - /* nocross is checked in cookie_update() */ /* handle the rollover cases */ if (maxaddr < dma_attr->dma_attr_addr_hi) { @@ -2547,426 +2448,324 @@ dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages) */ dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, xsize, align, 0, 0, (void *)(uintptr_t)minaddr, - (void *)(uintptr_t)maxaddr, VM_NOSLEEP); - - ASSERT(dvma); - ASSERT(dvma >= minaddr); - ASSERT(dvma + xsize - 1 < maxaddr); + (void *)(uintptr_t)maxaddr, kmf); return (dvma); } static void -dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages) -{ - uint64_t size = npages * IMMU_PAGESIZE; - - ASSERT(domain); - ASSERT(domain->dom_did > 0); - ASSERT(dvma); - ASSERT(npages); - - if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) { - ASSERT(domain->dom_maptype == IMMU_MAPTYPE_UNITY); - return; - } - - vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size); -} -/*ARGSUSED*/ -static void -cookie_free(rootnex_dma_t *dma, immu_t *immu, domain_t *domain, - dev_info_t *rdip) +dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr) { - int i; - uint64_t dvma; - uint64_t npages; - dvcookie_t *dvcookies = dma->dp_dvcookies; - - ASSERT(dma->dp_max_cookies); - ASSERT(dma->dp_max_dcookies); - ASSERT(dma->dp_dvmax < dma->dp_max_cookies); - ASSERT(dma->dp_dmax < dma->dp_max_dcookies); + int nlevels; + xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp; + uint64_t dvma, n; + size_t xsize, align; + uint64_t minaddr, maxaddr, dmamax; + int on, npte, pindex; + hw_pdte_t *shwp; + immu_t *immu; + domain_t *domain; - /* - * we allocated DVMA in a single chunk. 
Calculate total number - * of pages - */ - for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) { - npages += dvcookies[i].dvck_npages; - } - dvma = dvcookies[0].dvck_dvma; -#ifdef DEBUG - /* Unmap only in DEBUG mode */ - dvma_unmap(immu, domain, dvma, npages, rdip); -#endif - dvma_free(domain, dvma, npages); + /* parameters */ + domain = IMMU_DEVI(rdip)->imd_domain; + immu = domain->dom_immu; + nlevels = immu->immu_dvma_nlevels; + xsize = IMMU_NPREPTES * IMMU_PAGESIZE; + align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE); + minaddr = dma_attr->dma_attr_addr_lo; + if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG) + dmamax = dma_attr->dma_attr_seg; + else + dmamax = dma_attr->dma_attr_addr_hi; + maxaddr = dmamax + 1; - kmem_free(dma->dp_dvcookies, sizeof (dvcookie_t) * dma->dp_max_cookies); - dma->dp_dvcookies = NULL; - kmem_free(dma->dp_dcookies, sizeof (dcookie_t) * dma->dp_max_dcookies); - dma->dp_dcookies = NULL; - if (dma->dp_need_to_free_cookie == B_TRUE) { - kmem_free(dma->dp_cookies, sizeof (ddi_dma_cookie_t) * - dma->dp_max_cookies); - dma->dp_dcookies = NULL; - dma->dp_need_to_free_cookie = B_FALSE; - } + if (maxaddr < dmamax) + maxaddr = dmamax; - dma->dp_max_cookies = 0; - dma->dp_max_dcookies = 0; - dma->dp_cookie_size = 0; - dma->dp_dvmax = 0; - dma->dp_dmax = 0; -} + dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena, + xsize, align, 0, dma_attr->dma_attr_seg + 1, + (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP); -/* - * cookie_alloc() - */ -static int -cookie_alloc(rootnex_dma_t *dma, struct ddi_dma_req *dmareq, - ddi_dma_attr_t *attr, uint_t prealloc) -{ - int kmflag; - rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo); - dvcookie_t *dvcookies = dma->dp_dvcookies; - dcookie_t *dcookies = dma->dp_dcookies; - ddi_dma_cookie_t *cookies = dma->dp_cookies; - uint64_t max_cookies; - uint64_t max_dcookies; - uint64_t cookie_size; - - /* we need to allocate new array */ - if (dmareq->dmar_fp == DDI_DMA_SLEEP) { - kmflag = KM_SLEEP; - } else { - kmflag = KM_NOSLEEP; - } + ihp->ihp_predvma = dvma; + ihp->ihp_npremapped = 0; + if (dvma == 0) + return; - /* - * XXX make sure cookies size doen't exceed sinfo->si_max_cookie_size; - */ + n = IMMU_NPREPTES; + pindex = 0; /* - * figure out the rough estimate of array size - * At a minimum, each cookie must hold 1 page. - * At a maximum, it cannot exceed dma_attr_sgllen + * Set up a mapping at address 0, just so that all PDPs get allocated + * now. Although this initial mapping should never be used, + * explicitly set it to read-only, just to be safe. 
*/ - max_dcookies = dmareq->dmar_object.dmao_size + IMMU_PAGEOFFSET; - max_dcookies /= IMMU_PAGESIZE; - max_dcookies++; - max_cookies = MIN(max_dcookies, attr->dma_attr_sgllen); - - /* allocate the dvma cookie array */ - dvcookies = kmem_zalloc(sizeof (dvcookie_t) * max_cookies, kmflag); - if (dvcookies == NULL) { - return (DDI_FAILURE); - } + while (n > 0) { + xlate_setup(dvma, xlate, nlevels); - /* allocate the "phys" cookie array */ - dcookies = kmem_zalloc(sizeof (dcookie_t) * max_dcookies, kmflag); - if (dcookies == NULL) { - kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies); - dvcookies = NULL; - return (DDI_FAILURE); - } + (void) PDE_set_all(immu, domain, xlate, nlevels, rdip, + IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); - /* allocate the "real" cookie array - the one given to users */ - cookie_size = sizeof (ddi_dma_cookie_t) * max_cookies; - if (max_cookies > prealloc) { - cookies = kmem_zalloc(cookie_size, kmflag); - if (cookies == NULL) { - kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies); - kmem_free(dcookies, sizeof (dcookie_t) * max_dcookies); - goto fail; - } - dma->dp_need_to_free_cookie = B_TRUE; - } else { - /* the preallocated buffer fits this size */ - cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer; - bzero(cookies, sizeof (ddi_dma_cookie_t)* max_cookies); - dma->dp_need_to_free_cookie = B_FALSE; - } - - dma->dp_dvcookies = dvcookies; - dma->dp_dcookies = dcookies; - dma->dp_cookies = cookies; - dma->dp_cookie_size = cookie_size; - dma->dp_max_cookies = max_cookies; - dma->dp_max_dcookies = max_dcookies; - dma->dp_dvmax = 0; - dma->dp_dmax = 0; - sinfo->si_max_pages = dma->dp_max_cookies; + xlp = &xlate[1]; + shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr) + + xlp->xlt_idx; + on = n; - return (DDI_SUCCESS); + PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie, + 1, rdip, IMMU_FLAGS_READ); -fail: - dma->dp_dvcookies = NULL; - dma->dp_dcookies = NULL; - dma->dp_cookies = NULL; - dma->dp_cookie_size = 0; - dma->dp_max_cookies = 0; - dma->dp_max_dcookies = 0; - dma->dp_dvmax = 0; - dma->dp_dmax = 0; - dma->dp_need_to_free_cookie = B_FALSE; - sinfo->si_max_pages = 0; + npte = on - n; - return (DDI_FAILURE); + while (npte > 0) { + ihp->ihp_preptes[pindex++] = shwp; +#ifdef BUGGY_DRIVERS + PDTE_CLEAR_WRITE(*shwp); +#endif + shwp++; + npte--; + } + } } -/*ARGSUSED*/ static void -cookie_update(domain_t *domain, rootnex_dma_t *dma, paddr_t paddr, - int64_t psize, uint64_t maxseg, size_t nocross) +dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp) { - dvcookie_t *dvcookies = dma->dp_dvcookies; - dcookie_t *dcookies = dma->dp_dcookies; - ddi_dma_cookie_t *cookies = dma->dp_cookies; - uint64_t dvmax = dma->dp_dvmax; - uint64_t dmax = dma->dp_dmax; - - ASSERT(dvmax < dma->dp_max_cookies); - ASSERT(dmax < dma->dp_max_dcookies); - - paddr &= IMMU_PAGEMASK; - - ASSERT(paddr); - ASSERT(psize); - ASSERT(maxseg); - - /* - * check to see if this page would put us - * over the max cookie size. - */ - if (cookies[dvmax].dmac_size + psize > maxseg) { - dvmax++; /* use the next dvcookie */ - dmax++; /* also means we use the next dcookie */ - ASSERT(dvmax < dma->dp_max_cookies); - ASSERT(dmax < dma->dp_max_dcookies); - } + domain_t *domain; - /* - * check to see if this page would make us larger than - * the nocross boundary. If yes, create a new cookie - * otherwise we will fail later with vmem_xalloc() - * due to overconstrained alloc requests - * nocross == 0 implies no nocross constraint. 
- */ - if (nocross > 0) { - ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE - <= nocross); - if ((dvcookies[dvmax].dvck_npages + 1) * IMMU_PAGESIZE - > nocross) { - dvmax++; /* use the next dvcookie */ - dmax++; /* also means we use the next dcookie */ - ASSERT(dvmax < dma->dp_max_cookies); - ASSERT(dmax < dma->dp_max_dcookies); - } - ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE - <= nocross); - } + domain = IMMU_DEVI(rdip)->imd_domain; - /* - * If the cookie is empty - */ - if (dvcookies[dvmax].dvck_npages == 0) { - ASSERT(cookies[dvmax].dmac_size == 0); - ASSERT(dvcookies[dvmax].dvck_dvma == 0); - ASSERT(dvcookies[dvmax].dvck_npages - == 0); - ASSERT(dcookies[dmax].dck_paddr == 0); - ASSERT(dcookies[dmax].dck_npages == 0); - - dvcookies[dvmax].dvck_dvma = 0; - dvcookies[dvmax].dvck_npages = 1; - dcookies[dmax].dck_paddr = paddr; - dcookies[dmax].dck_npages = 1; - cookies[dvmax].dmac_size = psize; - } else { - /* Cookie not empty. Add to it */ - cookies[dma->dp_dvmax].dmac_size += psize; - ASSERT(dvcookies[dma->dp_dvmax].dvck_dvma == 0); - dvcookies[dma->dp_dvmax].dvck_npages++; - ASSERT(dcookies[dmax].dck_paddr != 0); - ASSERT(dcookies[dmax].dck_npages != 0); - - /* Check if this paddr is contiguous */ - if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) { - dcookies[dmax].dck_npages++; - } else { - /* No, we need a new dcookie */ - dmax++; - ASSERT(dcookies[dmax].dck_paddr == 0); - ASSERT(dcookies[dmax].dck_npages == 0); - dcookies[dmax].dck_paddr = paddr; - dcookies[dmax].dck_npages = 1; - } + if (ihp->ihp_predvma != 0) { + dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip); + vmem_free(domain->dom_dvma_arena, + (void *)(uintptr_t)ihp->ihp_predvma, + IMMU_NPREPTES * IMMU_PAGESIZE); } - - dma->dp_dvmax = dvmax; - dma->dp_dmax = dmax; } static void -cookie_finalize(ddi_dma_impl_t *hp, immu_t *immu, domain_t *domain, - dev_info_t *rdip, immu_flags_t immu_flags) +dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages) { - int i; - rootnex_dma_t *dma = (rootnex_dma_t *)hp->dmai_private; - dvcookie_t *dvcookies = dma->dp_dvcookies; - dcookie_t *dcookies = dma->dp_dcookies; - ddi_dma_cookie_t *cookies = dma->dp_cookies; - uint64_t npages; - uint64_t dvma; - boolean_t pde_set; - - /* First calculate the total number of pages required */ - for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) { - npages += dvcookies[i].dvck_npages; - } - - /* Now allocate dvma */ - dvma = dvma_alloc(hp, domain, npages); - - /* Now map the dvma */ - pde_set = dvma_map(immu, domain, dvma, npages, dcookies, - dma->dp_dmax + 1, rdip, immu_flags); - - /* Invalidate the IOTLB */ - immu_flush_iotlb_psi(immu, domain->dom_did, dvma, npages, - pde_set == B_TRUE ? 
TLB_IVA_WHOLE : TLB_IVA_LEAF); + uint64_t size = npages * IMMU_PAGESIZE; - /* Now setup dvcookies and real cookie addresses */ - for (i = 0; i <= dma->dp_dvmax; i++) { - dvcookies[i].dvck_dvma = dvma; - cookies[i].dmac_laddress = dvma; - ASSERT(cookies[i].dmac_size != 0); - cookies[i].dmac_type = 0; - dvma += (dvcookies[i].dvck_npages * IMMU_PAGESIZE); - } + if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) + return; -#ifdef TEST - immu_flush_iotlb_dsi(immu, domain->dom_did); -#endif + vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size); } -/* - * cookie_create() - */ static int -cookie_create(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, - ddi_dma_attr_t *a, immu_t *immu, domain_t *domain, dev_info_t *rdip, - uint_t prealloc_count, immu_flags_t immu_flags) +immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle, + immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq, + ddi_dma_obj_t *dma_out) { + domain_t *domain; + immu_t *immu; + immu_flags_t immu_flags; ddi_dma_atyp_t buftype; - uint64_t offset; + ddi_dma_obj_t *dmar_object; + ddi_dma_attr_t *attrp; + uint64_t offset, paddr, dvma, sdvma, rwmask; + size_t npages, npgalloc; + uint_t psize, size, pcnt, dmax; page_t **pparray; - uint64_t paddr; - uint_t psize; - uint_t size; - uint64_t maxseg; caddr_t vaddr; - uint_t pcnt; page_t *page; - rootnex_sglinfo_t *sglinfo; - ddi_dma_obj_t *dmar_object; - rootnex_dma_t *dma; - size_t nocross; + struct as *vas; + immu_dcookie_t *dcookies; + int pde_set; - dma = (rootnex_dma_t *)hp->dmai_private; - sglinfo = &(dma->dp_sglinfo); - dmar_object = &(dmareq->dmar_object); - maxseg = sglinfo->si_max_cookie_size; + domain = IMMU_DEVI(rdip)->imd_domain; + immu = domain->dom_immu; + immu_flags = dma_to_immu_flags(dmareq); + + attrp = &((ddi_dma_impl_t *)handle)->dmai_attr; + + dmar_object = &dmareq->dmar_object; pparray = dmar_object->dmao_obj.virt_obj.v_priv; vaddr = dmar_object->dmao_obj.virt_obj.v_addr; buftype = dmar_object->dmao_type; size = dmar_object->dmao_size; - nocross = (size_t)(a->dma_attr_seg + 1); - /* - * Allocate cookie, dvcookie and dcookie - */ - if (cookie_alloc(dma, dmareq, a, prealloc_count) != DDI_SUCCESS) { - return (DDI_FAILURE); - } - hp->dmai_cookie = dma->dp_cookies; + IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t, + buftype, uint_t, size); + + dcookies = &ihp->ihp_dcookies[0]; - pcnt = 0; + pcnt = dmax = 0; /* retrieve paddr, psize, offset from dmareq */ if (buftype == DMA_OTYP_PAGES) { page = dmar_object->dmao_obj.pp_obj.pp_pp; - ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page)); offset = dmar_object->dmao_obj.pp_obj.pp_offset & MMU_PAGEOFFSET; paddr = pfn_to_pa(page->p_pagenum) + offset; psize = MIN((MMU_PAGESIZE - offset), size); - sglinfo->si_asp = NULL; page = page->p_next; + vas = dmar_object->dmao_obj.virt_obj.v_as; } else { - ASSERT((buftype == DMA_OTYP_VADDR) || - (buftype == DMA_OTYP_BUFVADDR)); - sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as; - if (sglinfo->si_asp == NULL) { - sglinfo->si_asp = &kas; + if (vas == NULL) { + vas = &kas; } offset = (uintptr_t)vaddr & MMU_PAGEOFFSET; if (pparray != NULL) { - ASSERT(!PP_ISFREE(pparray[pcnt])); paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset; psize = MIN((MMU_PAGESIZE - offset), size); pcnt++; } else { - paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat, + paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr)) + offset; psize = MIN(size, (MMU_PAGESIZE - offset)); vaddr += psize; } } - /* save the iommu page offset */ - sglinfo->si_buf_offset = offset & IMMU_PAGEOFFSET; + npgalloc = 
IMMU_BTOPR(size + offset); - /* - * setup dvcookie and dcookie for [paddr, paddr+psize) - */ - cookie_update(domain, dma, paddr, psize, maxseg, nocross); + if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) { +#ifdef BUGGY_DRIVERS + rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask; +#else + rwmask = immu->immu_ptemask; + if (immu_flags & IMMU_FLAGS_READ) + rwmask |= PDTE_MASK_R; + if (immu_flags & IMMU_FLAGS_WRITE) + rwmask |= PDTE_MASK_W; +#endif +#ifdef DEBUG + rwmask |= PDTE_MASK_P; +#endif + sdvma = ihp->ihp_predvma; + ihp->ihp_npremapped = npgalloc; + *ihp->ihp_preptes[0] = + PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask; + } else { + ihp->ihp_npremapped = 0; + sdvma = dvma_alloc(domain, attrp, npgalloc, + dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP); + if (sdvma == 0) + return (DDI_DMA_NORESOURCES); + dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET); + dcookies[0].dck_npages = 1; + } + + IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc, + uint64_t, sdvma); + + dvma = sdvma; + pde_set = 0; + npages = 1; size -= psize; while (size > 0) { /* get the size for this page (i.e. partial or full page) */ psize = MIN(size, MMU_PAGESIZE); if (buftype == DMA_OTYP_PAGES) { /* get the paddr from the page_t */ - ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page)); paddr = pfn_to_pa(page->p_pagenum); page = page->p_next; } else if (pparray != NULL) { /* index into the array of page_t's to get the paddr */ - ASSERT(!PP_ISFREE(pparray[pcnt])); paddr = pfn_to_pa(pparray[pcnt]->p_pagenum); pcnt++; } else { /* call into the VM to get the paddr */ - paddr = pfn_to_pa(hat_getpfnum - (sglinfo->si_asp->a_hat, vaddr)); + paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr)); vaddr += psize; } - /* - * set dvcookie and dcookie for [paddr, paddr+psize) - */ - cookie_update(domain, dma, paddr, psize, maxseg, nocross); + + npages++; + + if (ihp->ihp_npremapped > 0) { + *ihp->ihp_preptes[npages - 1] = + PDTE_PADDR(paddr) | rwmask; + } else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) { + dcookies[dmax].dck_npages++; + } else { + /* No, we need a new dcookie */ + if (dmax == (IMMU_NDCK - 1)) { + /* + * Ran out of dcookies. Map them now. + */ + if (dvma_map(domain, dvma, + npages, dcookies, dmax + 1, rdip, + immu_flags)) + pde_set++; + + IMMU_DPROBE4(immu__dvmamap__early, + dev_info_t *, rdip, uint64_t, dvma, + uint_t, npages, uint_t, dmax+1); + + dvma += (npages << IMMU_PAGESHIFT); + npages = 0; + dmax = 0; + } else + dmax++; + dcookies[dmax].dck_paddr = paddr; + dcookies[dmax].dck_npages = 1; + } size -= psize; } - cookie_finalize(hp, immu, domain, rdip, immu_flags); + /* + * Finish up, mapping all, or all of the remaining, + * physical memory ranges. + */ + if (ihp->ihp_npremapped == 0 && npages > 0) { + IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \ + uint64_t, dvma, uint_t, npages, uint_t, dmax+1); + + if (dvma_map(domain, dvma, npages, dcookies, + dmax + 1, rdip, immu_flags)) + pde_set++; + } + + /* Invalidate the IOTLB */ + immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc, + pde_set > 0 ? 
TLB_IVA_WHOLE : TLB_IVA_LEAF, + &ihp->ihp_inv_wait); + + ihp->ihp_ndvseg = 1; + ihp->ihp_dvseg[0].dvs_start = sdvma; + ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size; + + dma_out->dmao_size = dmar_object->dmao_size; + dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET; + dma_out->dmao_obj.dvma_obj.dv_nseg = 1; + dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0]; + dma_out->dmao_type = DMA_OTYP_DVADDR; + + return (DDI_DMA_MAPPED); +} + +static int +immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao) +{ + uint64_t dvma, npages; + domain_t *domain; + struct dvmaseg *dvs; + + domain = IMMU_DEVI(rdip)->imd_domain; + dvs = dmao->dmao_obj.dvma_obj.dv_seg; - /* take account in the offset into the first page */ - dma->dp_cookies[0].dmac_laddress += sglinfo->si_buf_offset; + dvma = dvs[0].dvs_start; + npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off); - /* save away how many cookies we have */ - sglinfo->si_sgl_size = dma->dp_dvmax + 1; +#ifdef DEBUG + /* Unmap only in DEBUG mode */ + dvma_unmap(domain, dvma, npages, rdip); +#endif + dvma_free(domain, dvma, npages); + + IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages, + uint64_t, dvma); + +#ifdef DEBUG + /* + * In the DEBUG case, the unmap was actually done, + * but an IOTLB flush was not done. So, an explicit + * write back flush is needed. + */ + immu_regs_wbf_flush(domain->dom_immu); +#endif return (DDI_SUCCESS); } @@ -3001,7 +2800,6 @@ immu_dvma_setup(list_t *listp) nchains, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp, KM_NOSLEEP); - ASSERT(bdf_domain_hash); immu = list_head(listp); for (; immu; immu = list_next(listp, immu)) { @@ -3018,9 +2816,6 @@ immu_dvma_setup(list_t *listp) void immu_dvma_startup(immu_t *immu) { - ASSERT(immu); - ASSERT(immu->immu_dvma_running == B_FALSE); - if (immu_gfxdvma_enable == B_FALSE && immu->immu_dvma_gfx_only == B_TRUE) { return; @@ -3029,7 +2824,6 @@ immu_dvma_startup(immu_t *immu) /* * DVMA will start once IOMMU is "running" */ - ASSERT(immu->immu_dvma_running == B_FALSE); immu->immu_dvma_running = B_TRUE; } @@ -3045,7 +2839,7 @@ immu_dvma_physmem_update(uint64_t addr, uint64_t size) uint64_t start; uint64_t npages; int dcount; - dcookie_t dcookies[1] = {0}; + immu_dcookie_t dcookies[1] = {0}; domain_t *domain; /* @@ -3057,9 +2851,15 @@ immu_dvma_physmem_update(uint64_t addr, uint64_t size) mutex_enter(&immu_domain_lock); domain = list_head(&immu_unity_domain_list); for (; domain; domain = list_next(&immu_unity_domain_list, domain)) { + /* + * Nothing to do if the IOMMU supports passthrough. + */ + if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap)) + continue; /* There is no vmem_arena for unity domains. 
Just map it */ - ddi_err(DER_LOG, NULL, "IMMU: unity-domain: Adding map " + ddi_err(DER_LOG, domain->dom_dip, + "iommu: unity-domain: Adding map " "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size); start = IMMU_ROUNDOWN(addr); @@ -3068,46 +2868,21 @@ immu_dvma_physmem_update(uint64_t addr, uint64_t size) dcookies[0].dck_paddr = start; dcookies[0].dck_npages = npages; dcount = 1; - (void) dvma_map(domain->dom_immu, domain, start, npages, + (void) dvma_map(domain, start, npages, dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); } mutex_exit(&immu_domain_lock); } - int -immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, - uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags) +immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags) { - ddi_dma_attr_t *attr; - dev_info_t *ddip; - domain_t *domain; + dev_info_t *ddip, *odip; immu_t *immu; - dcookie_t dcookies[1] = {0}; - int dcount = 0; - boolean_t pde_set = B_TRUE; - int r = DDI_FAILURE; - - ASSERT(immu_enable == B_TRUE); - ASSERT(immu_running == B_TRUE || !(immu_flags & IMMU_FLAGS_DMAHDL)); - ASSERT(hp || !(immu_flags & IMMU_FLAGS_DMAHDL)); - - /* - * Intel IOMMU will only be turned on if IOMMU - * page size is a multiple of IOMMU page size - */ - - /*LINTED*/ - ASSERT(MMU_PAGESIZE % IMMU_PAGESIZE == 0); - - /* Can only do DVMA if dip is attached */ - if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "DVMA map: No device specified"); - /*NOTREACHED*/ - } + domain_t *domain; - immu_flags |= dma_to_immu_flags(dmareq); + odip = rdip; immu = immu_dvma_get_immu(rdip, immu_flags); if (immu == NULL) { @@ -3115,7 +2890,7 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, * possible that there is no IOMMU unit for this device * - BIOS bugs are one example. */ - ddi_err(DER_WARN, rdip, "No IMMU unit found for device"); + ddi_err(DER_WARN, rdip, "No iommu unit found for device"); return (DDI_DMA_NORESOURCES); } @@ -3125,7 +2900,7 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) { rdip = get_lpc_devinfo(immu, rdip, immu_flags); if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "IMMU redirect failed"); + ddi_err(DER_PANIC, rdip, "iommu redirect failed"); /*NOTREACHED*/ } } @@ -3139,7 +2914,7 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, if (strcmp(ddi_node_name(rdip), "agpgart") == 0) { rdip = get_gfx_devinfo(rdip); if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "IMMU redirect failed"); + ddi_err(DER_PANIC, rdip, "iommu redirect failed"); /*NOTREACHED*/ } } @@ -3152,11 +2927,12 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, ddip = NULL; domain = device_domain(rdip, &ddip, immu_flags); if (domain == NULL) { - ASSERT(ddip == NULL); ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device"); return (DDI_DMA_NORESOURCES); } + immu = domain->dom_immu; + /* * If a domain is found, we must also have a domain dip * which is the topmost ancestor dip of rdip that shares @@ -3168,45 +2944,8 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, return (DDI_DMA_NORESOURCES); } - immu = domain->dom_immu; - ASSERT(immu); - if (domain->dom_did == IMMU_UNITY_DID) { - ASSERT(domain == immu->immu_unity_domain); - /* mapping already done. 
Let rootnex create cookies */ - r = DDI_DMA_USE_PHYSICAL; - } else if (immu_flags & IMMU_FLAGS_DMAHDL) { - - /* if we have a DMA handle, the IOMMUs must be running */ - ASSERT(immu->immu_regs_running == B_TRUE); - ASSERT(immu->immu_dvma_running == B_TRUE); - - attr = &hp->dmai_attr; - if (attr == NULL) { - ddi_err(DER_PANIC, rdip, - "DMA handle (%p): NULL attr", hp); - /*NOTREACHED*/ - } - - if (cookie_create(hp, dmareq, attr, immu, domain, rdip, - prealloc_count, immu_flags) != DDI_SUCCESS) { - ddi_err(DER_MODE, rdip, "dvcookie_alloc: failed"); - return (DDI_DMA_NORESOURCES); - } - r = DDI_DMA_MAPPED; - } else if (immu_flags & IMMU_FLAGS_MEMRNG) { - dcookies[0].dck_paddr = mrng->mrng_start; - dcookies[0].dck_npages = mrng->mrng_npages; - dcount = 1; - pde_set = dvma_map(immu, domain, mrng->mrng_start, - mrng->mrng_npages, dcookies, dcount, rdip, immu_flags); - immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start, - mrng->mrng_npages, pde_set == B_TRUE ? - TLB_IVA_WHOLE : TLB_IVA_LEAF); - r = DDI_DMA_MAPPED; - } else { - ddi_err(DER_PANIC, rdip, "invalid flags for immu_dvma_map()"); - /*NOTREACHED*/ - } + if (odip != rdip) + set_domain(odip, ddip, domain); /* * Update the root and context entries @@ -3217,133 +2956,251 @@ immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng, return (DDI_DMA_NORESOURCES); } - immu_regs_wbf_flush(immu); - - return (r); + return (DDI_SUCCESS); } int -immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip) +immu_map_memrange(dev_info_t *rdip, memrng_t *mrng) { - ddi_dma_attr_t *attr; - rootnex_dma_t *dma; - domain_t *domain; + immu_dcookie_t dcookies[1] = {0}; + boolean_t pde_set; immu_t *immu; - dev_info_t *ddip; - immu_flags_t immu_flags; + domain_t *domain; + immu_inv_wait_t iw; - ASSERT(immu_enable == B_TRUE); - ASSERT(immu_running == B_TRUE); - ASSERT(hp); + dcookies[0].dck_paddr = mrng->mrng_start; + dcookies[0].dck_npages = mrng->mrng_npages; - /* - * Intel IOMMU will only be turned on if IOMMU - * page size is same as MMU page size - */ - /*LINTED*/ - ASSERT(MMU_PAGESIZE == IMMU_PAGESIZE); + domain = IMMU_DEVI(rdip)->imd_domain; + immu = domain->dom_immu; - /* rdip need not be attached */ - if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "DVMA unmap: No device specified"); - return (DDI_DMA_NORESOURCES); - } + pde_set = dvma_map(domain, mrng->mrng_start, + mrng->mrng_npages, dcookies, 1, rdip, + IMMU_FLAGS_READ | IMMU_FLAGS_WRITE); - /* - * Get the device domain, this should always - * succeed since there had to be a domain to - * setup DVMA. - */ - dma = (rootnex_dma_t *)hp->dmai_private; - attr = &hp->dmai_attr; - if (attr == NULL) { - ddi_err(DER_PANIC, rdip, "DMA handle (%p) has NULL attr", hp); - /*NOTREACHED*/ - } - immu_flags = dma->dp_sleep_flags; + immu_init_inv_wait(&iw, "memrange", B_TRUE); - immu = immu_dvma_get_immu(rdip, immu_flags); - if (immu == NULL) { - /* - * possible that there is no IOMMU unit for this device - * - BIOS bugs are one example. - */ - ddi_err(DER_WARN, rdip, "No IMMU unit found for device"); - return (DDI_DMA_NORESOURCES); - } + immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start, + mrng->mrng_npages, pde_set == B_TRUE ? + TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw); + + return (DDI_SUCCESS); +} + +immu_devi_t * +immu_devi_get(dev_info_t *rdip) +{ + immu_devi_t *immu_devi; + volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu); + + /* Just want atomic reads. 
No need for lock */ + immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr, + 0); + return (immu_devi); +} + +/*ARGSUSED*/ +int +immu_hdl_priv_ctor(void *buf, void *arg, int kmf) +{ + immu_hdl_priv_t *ihp; + + ihp = buf; + immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE); + + return (0); +} + +/* + * iommulib interface functions + */ +static int +immu_probe(iommulib_handle_t handle, dev_info_t *dip) +{ + immu_devi_t *immu_devi; + int ret; + if (!immu_enable) + return (DDI_FAILURE); /* - * redirect isa devices attached under lpc to lpc dip + * Make sure the device has all the IOMMU structures + * initialized. If this device goes through an IOMMU + * unit (e.g. this probe function returns success), + * this will be called at most N times, with N being + * the number of IOMMUs in the system. + * + * After that, when iommulib_nex_open succeeds, + * we can always assume that this device has all + * the structures initialized. IOMMU_USED(dip) will + * be true. There is no need to find the controlling + * IOMMU/domain again. */ - if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) { - rdip = get_lpc_devinfo(immu, rdip, immu_flags); - if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "IMMU redirect failed"); - /*NOTREACHED*/ - } - } + ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP); + if (ret != DDI_SUCCESS) + return (ret); - /* Reset immu, as redirection can change IMMU */ - immu = NULL; + immu_devi = IMMU_DEVI(dip); /* - * for gart, redirect to the real graphic devinfo + * For unity domains, there is no need to call in to + * the IOMMU code. */ - if (strcmp(ddi_node_name(rdip), "agpgart") == 0) { - rdip = get_gfx_devinfo(rdip); - if (rdip == NULL) { - ddi_err(DER_PANIC, rdip, "IMMU redirect failed"); - /*NOTREACHED*/ + if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID) + return (DDI_FAILURE); + + if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle)) + return (DDI_SUCCESS); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +immu_allochdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, + int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep) +{ + int ret; + immu_hdl_priv_t *ihp; + immu_t *immu; + + ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp, + arg, dma_handlep); + if (ret == DDI_SUCCESS) { + immu = IMMU_DEVI(rdip)->imd_immu; + + ihp = kmem_cache_alloc(immu->immu_hdl_cache, + waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP); + if (ihp == NULL) { + (void) iommulib_iommu_dma_freehdl(dip, rdip, + *dma_handlep); + return (DDI_DMA_NORESOURCES); } - } - ddip = NULL; - domain = device_domain(rdip, &ddip, immu_flags); - if (domain == NULL || domain->dom_did == 0 || ddip == NULL) { - ddi_err(DER_MODE, rdip, "Attempt to unmap DVMA for " - "a device without domain or with an uninitialized " - "domain"); - return (DDI_DMA_NORESOURCES); + if (IMMU_DEVI(rdip)->imd_use_premap) + dvma_prealloc(rdip, ihp, attr); + else { + ihp->ihp_npremapped = 0; + ihp->ihp_predvma = 0; + } + ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep, + ihp); } + return (ret); +} - /* - * immu must be set in the domain. - */ - immu = domain->dom_immu; - ASSERT(immu); - if (domain->dom_did == IMMU_UNITY_DID) { - ASSERT(domain == immu->immu_unity_domain); - /* - * domain is unity, nothing to do here, let the rootnex - * code free the cookies. 
- */ - return (DDI_DMA_USE_PHYSICAL); +/*ARGSUSED*/ +static int +immu_freehdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) +{ + immu_hdl_priv_t *ihp; + + ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); + if (ihp != NULL) { + if (IMMU_DEVI(rdip)->imd_use_premap) + dvma_prefree(rdip, ihp); + kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp); } - dma = hp->dmai_private; - if (dma == NULL) { - ddi_err(DER_PANIC, rdip, "DVMA unmap: DMA handle (%p) has " - "no private dma structure", hp); - /*NOTREACHED*/ + return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle)); +} + + +/*ARGSUSED*/ +static int +immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep, + uint_t *ccountp) +{ + int ret; + immu_hdl_priv_t *ihp; + + ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle, + dma_req, cookiep, ccountp); + + if (ret == DDI_DMA_MAPPED) { + ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); + immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait); } - cookie_free(dma, immu, domain, rdip); + return (ret); +} - /* No invalidation needed for unmap */ - immu_regs_wbf_flush(immu); +/*ARGSUSED*/ +static int +immu_unbindhdl(iommulib_handle_t handle, + dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle) +{ + return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle)); +} - return (DDI_SUCCESS); +/*ARGSUSED*/ +static int +immu_sync(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, + size_t len, uint_t cachefl) +{ + return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len, + cachefl)); } -immu_devi_t * -immu_devi_get(dev_info_t *rdip) +/*ARGSUSED*/ +static int +immu_win(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win, + off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, + uint_t *ccountp) { - immu_devi_t *immu_devi; - volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu); + return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp, + lenp, cookiep, ccountp)); +} - /* Just want atomic reads. 
No need for lock */ - immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr, - 0); - return (immu_devi); +/*ARGSUSED*/ +static int +immu_mapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao) +{ + immu_hdl_priv_t *ihp; + + ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); + + return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao)); +} + +/*ARGSUSED*/ +static int +immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao) +{ + immu_hdl_priv_t *ihp; + + ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle); + if (ihp->ihp_npremapped > 0) + return (DDI_SUCCESS); + return (immu_unmap_dvmaseg(rdip, dmao)); +} + +/*ARGSUSED*/ +static int +immu_map(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, struct ddi_dma_req *dmareq, + ddi_dma_handle_t *dma_handle) +{ + ASSERT(0); + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +immu_mctl(iommulib_handle_t handle, dev_info_t *dip, + dev_info_t *rdip, ddi_dma_handle_t dma_handle, + enum ddi_dma_ctlops request, off_t *offp, size_t *lenp, + caddr_t *objpp, uint_t cachefl) +{ + ASSERT(0); + return (DDI_FAILURE); } diff --git a/usr/src/uts/i86pc/io/immu_intrmap.c b/usr/src/uts/i86pc/io/immu_intrmap.c index 8c98573bc0..ab9f9bcbe7 100644 --- a/usr/src/uts/i86pc/io/immu_intrmap.c +++ b/usr/src/uts/i86pc/io/immu_intrmap.c @@ -38,6 +38,7 @@ typedef struct intrmap_private { immu_t *ir_immu; + immu_inv_wait_t ir_inv_wait; uint16_t ir_idx; uint32_t ir_sid_svt_sq; } intrmap_private_t; @@ -573,6 +574,7 @@ immu_intr_handler(immu_t *immu) (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7, immu_dvma_faults[MIN(fault_reason, DVMA_MAX_FAULTS)]); + immu_print_fault_info(sid, pg_addr); } else if (fault_reason < 0x27) { /* intr-remapping fault */ ddi_err(DER_WARN, idip, @@ -666,6 +668,7 @@ immu_intrmap_alloc(void **intrmap_private_tbl, dev_info_t *dip, { immu_t *immu; intrmap_t *intrmap; + immu_inv_wait_t *iwp; uint32_t idx, i; uint32_t sid_svt_sq; intrmap_private_t *intrmap_private; @@ -702,10 +705,12 @@ immu_intrmap_alloc(void **intrmap_private_tbl, dev_info_t *dip, sid_svt_sq = intrmap_private->ir_sid_svt_sq = get_sid(dip, type, ioapic_index); + iwp = &intrmap_private->ir_inv_wait; + immu_init_inv_wait(iwp, "intrmaplocal", B_TRUE); if (count == 1) { if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) { - immu_qinv_intr_one_cache(immu, idx); + immu_qinv_intr_one_cache(immu, idx, iwp); } else { immu_regs_wbf_flush(immu); } @@ -723,7 +728,7 @@ immu_intrmap_alloc(void **intrmap_private_tbl, dev_info_t *dip, } if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) { - immu_qinv_intr_caches(immu, idx, count); + immu_qinv_intr_caches(immu, idx, count, iwp); } else { immu_regs_wbf_flush(immu); } @@ -742,6 +747,7 @@ immu_intrmap_map(void *intrmap_private, void *intrmap_data, uint16_t type, int count) { immu_t *immu; + immu_inv_wait_t *iwp; intrmap_t *intrmap; ioapic_rdt_t *irdt = (ioapic_rdt_t *)intrmap_data; msi_regs_t *mregs = (msi_regs_t *)intrmap_data; @@ -755,6 +761,7 @@ immu_intrmap_map(void *intrmap_private, void *intrmap_data, uint16_t type, idx = INTRMAP_PRIVATE(intrmap_private)->ir_idx; immu = INTRMAP_PRIVATE(intrmap_private)->ir_immu; + iwp = &INTRMAP_PRIVATE(intrmap_private)->ir_inv_wait; intrmap = immu->immu_intrmap; sid_svt_sq = INTRMAP_PRIVATE(intrmap_private)->ir_sid_svt_sq; @@ -795,7 +802,7 @@ immu_intrmap_map(void *intrmap_private, void *intrmap_data, uint16_t 
type, idx * INTRMAP_RTE_SIZE, INTRMAP_RTE_SIZE); - immu_qinv_intr_one_cache(immu, idx); + immu_qinv_intr_one_cache(immu, idx, iwp); } else { for (i = 0; i < count; i++) { @@ -810,7 +817,7 @@ immu_intrmap_map(void *intrmap_private, void *intrmap_data, uint16_t type, idx++; } - immu_qinv_intr_caches(immu, idx, count); + immu_qinv_intr_caches(immu, idx, count, iwp); } } @@ -819,6 +826,7 @@ static void immu_intrmap_free(void **intrmap_privatep) { immu_t *immu; + immu_inv_wait_t *iwp; intrmap_t *intrmap; uint32_t idx; @@ -828,13 +836,14 @@ immu_intrmap_free(void **intrmap_privatep) } immu = INTRMAP_PRIVATE(*intrmap_privatep)->ir_immu; + iwp = &INTRMAP_PRIVATE(*intrmap_privatep)->ir_inv_wait; intrmap = immu->immu_intrmap; idx = INTRMAP_PRIVATE(*intrmap_privatep)->ir_idx; bzero(intrmap->intrmap_vaddr + idx * INTRMAP_RTE_SIZE, INTRMAP_RTE_SIZE); - immu_qinv_intr_one_cache(immu, idx); + immu_qinv_intr_one_cache(immu, idx, iwp); mutex_enter(&intrmap->intrmap_lock); bitset_del(&intrmap->intrmap_map, idx); @@ -928,6 +937,8 @@ immu_intrmap_setup(list_t *listp) mutex_init(&(immu->immu_intrmap_lock), NULL, MUTEX_DEFAULT, NULL); mutex_enter(&(immu->immu_intrmap_lock)); + immu_init_inv_wait(&immu->immu_intrmap_inv_wait, + "intrmapglobal", B_TRUE); immu->immu_intrmap_setup = B_TRUE; mutex_exit(&(immu->immu_intrmap_lock)); } diff --git a/usr/src/uts/i86pc/io/immu_qinv.c b/usr/src/uts/i86pc/io/immu_qinv.c index a3384a7340..de25729482 100644 --- a/usr/src/uts/i86pc/io/immu_qinv.c +++ b/usr/src/uts/i86pc/io/immu_qinv.c @@ -32,6 +32,7 @@ #include <sys/archsystm.h> #include <vm/hat_i86.h> #include <sys/types.h> +#include <sys/cpu.h> #include <sys/sysmacros.h> #include <sys/immu.h> @@ -44,10 +45,6 @@ /* status data size of invalidation wait descriptor */ #define QINV_SYNC_DATA_SIZE 0x4 -/* status data value of invalidation wait descriptor */ -#define QINV_SYNC_DATA_FENCE 1 -#define QINV_SYNC_DATA_UNFENCE 2 - /* invalidation queue head and tail */ #define QINV_IQA_HEAD(QH) BITX((QH), 18, 4) #define QINV_IQA_TAIL_SHIFT 4 @@ -58,38 +55,6 @@ typedef struct qinv_inv_dsc { uint64_t hi; } qinv_dsc_t; -/* - * struct iotlb_cache_node - * the pending data for iotlb flush - */ -typedef struct iotlb_pend_node { - dvcookie_t *icn_dvcookies; /* ptr to dvma cookie array */ - uint_t icn_count; /* valid cookie count */ - uint_t icn_array_size; /* array size */ - list_node_t node; -} qinv_iotlb_pend_node_t; - -/* - * struct iotlb_cache_head - * the pending head for the iotlb flush - */ -typedef struct iotlb_pend_head { - /* the pending node cache list */ - kmutex_t ich_mem_lock; - list_t ich_mem_list; -} qinv_iotlb_pend_head_t; - -/* - * qinv_iotlb_t - * pending data for qiueued invalidation iotlb flush - */ -typedef struct qinv_iotlb { - dvcookie_t *qinv_iotlb_dvcookies; - uint_t qinv_iotlb_count; - uint_t qinv_iotlb_size; - list_node_t qinv_iotlb_node; -} qinv_iotlb_t; - /* physical contigous pages for invalidation queue */ typedef struct qinv_mem { kmutex_t qinv_mem_lock; @@ -111,22 +76,22 @@ typedef struct qinv_mem { * * qinv_table - invalidation queue table * qinv_sync - sync status memory for invalidation wait descriptor - * qinv_iotlb_pend_node - pending iotlb node */ typedef struct qinv { qinv_mem_t qinv_table; qinv_mem_t qinv_sync; - qinv_iotlb_pend_head_t qinv_pend_head; - qinv_iotlb_pend_node_t **qinv_iotlb_pend_node; } qinv_t; +static void immu_qinv_inv_wait(immu_inv_wait_t *iwp); + static struct immu_flushops immu_qinv_flushops = { immu_qinv_context_fsi, immu_qinv_context_dsi, immu_qinv_context_gbl, 
immu_qinv_iotlb_psi, immu_qinv_iotlb_dsi, - immu_qinv_iotlb_gbl + immu_qinv_iotlb_gbl, + immu_qinv_inv_wait }; /* helper macro for making queue invalidation descriptor */ @@ -200,13 +165,8 @@ static void qinv_iotlb_common(immu_t *immu, uint_t domain_id, uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type); static void qinv_iec_common(immu_t *immu, uint_t iidx, uint_t im, uint_t g); -static uint_t qinv_alloc_sync_mem_entry(immu_t *immu); -static void qinv_wait_async_unfence(immu_t *immu, - qinv_iotlb_pend_node_t *node); -static void qinv_wait_sync(immu_t *immu); -static int qinv_wait_async_finish(immu_t *immu, int *count); -/*LINTED*/ -static void qinv_wait_async_fence(immu_t *immu); +static void immu_qinv_inv_wait(immu_inv_wait_t *iwp); +static void qinv_wait_sync(immu_t *immu, immu_inv_wait_t *iwp); /*LINTED*/ static void qinv_dev_iotlb_common(immu_t *immu, uint16_t sid, uint64_t addr, uint_t size, uint_t max_invs_pd); @@ -219,6 +179,9 @@ qinv_submit_inv_dsc(immu_t *immu, qinv_dsc_t *dsc) qinv_t *qinv; qinv_mem_t *qinv_table; uint_t tail; +#ifdef DEBUG + uint_t count = 0; +#endif qinv = (qinv_t *)immu->immu_qinv; qinv_table = &(qinv->qinv_table); @@ -231,6 +194,9 @@ qinv_submit_inv_dsc(immu_t *immu, qinv_dsc_t *dsc) qinv_table->qinv_mem_tail = 0; while (qinv_table->qinv_mem_head == qinv_table->qinv_mem_tail) { +#ifdef DEBUG + count++; +#endif /* * inv queue table exhausted, wait hardware to fetch * next descriptor @@ -239,6 +205,9 @@ qinv_submit_inv_dsc(immu_t *immu, qinv_dsc_t *dsc) immu_regs_get64(immu, IMMU_REG_INVAL_QH)); } + IMMU_DPROBE3(immu__qinv__sub, uint64_t, dsc->lo, uint64_t, dsc->hi, + uint_t, count); + bcopy(dsc, qinv_table->qinv_mem_vaddr + tail * QINV_ENTRY_SIZE, QINV_ENTRY_SIZE); @@ -331,162 +300,71 @@ qinv_iec_common(immu_t *immu, uint_t iidx, uint_t im, uint_t g) } /* - * alloc free entry from sync status table - */ -static uint_t -qinv_alloc_sync_mem_entry(immu_t *immu) -{ - qinv_mem_t *sync_mem; - uint_t tail; - qinv_t *qinv; - - qinv = (qinv_t *)immu->immu_qinv; - sync_mem = &qinv->qinv_sync; - -sync_mem_exhausted: - mutex_enter(&sync_mem->qinv_mem_lock); - tail = sync_mem->qinv_mem_tail; - sync_mem->qinv_mem_tail++; - if (sync_mem->qinv_mem_tail == sync_mem->qinv_mem_size) - sync_mem->qinv_mem_tail = 0; - - if (sync_mem->qinv_mem_head == sync_mem->qinv_mem_tail) { - /* should never happen */ - ddi_err(DER_WARN, NULL, "sync mem exhausted"); - sync_mem->qinv_mem_tail = tail; - mutex_exit(&sync_mem->qinv_mem_lock); - delay(IMMU_ALLOC_RESOURCE_DELAY); - goto sync_mem_exhausted; - } - mutex_exit(&sync_mem->qinv_mem_lock); - - return (tail); -} - -/* - * queued invalidation interface -- invalidation wait descriptor - * fence flag not set, need status data to indicate the invalidation - * wait descriptor completion - */ -static void -qinv_wait_async_unfence(immu_t *immu, qinv_iotlb_pend_node_t *node) -{ - qinv_dsc_t dsc; - qinv_mem_t *sync_mem; - uint64_t saddr; - uint_t tail; - qinv_t *qinv; - - qinv = (qinv_t *)immu->immu_qinv; - sync_mem = &qinv->qinv_sync; - tail = qinv_alloc_sync_mem_entry(immu); - - /* plant an iotlb pending node */ - qinv->qinv_iotlb_pend_node[tail] = node; - - saddr = sync_mem->qinv_mem_paddr + tail * QINV_SYNC_DATA_SIZE; - - /* - * sdata = QINV_SYNC_DATA_UNFENCE, fence = 0, sw = 1, if = 0 - * indicate the invalidation wait descriptor completion by - * performing a coherent DWORD write to the status address, - * not by generating an invalidation completion event - */ - dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_UNFENCE, 0, 1, 0); - dsc.hi = 
INV_WAIT_DSC_HIGH(saddr); - - qinv_submit_inv_dsc(immu, &dsc); -} - -/* - * queued invalidation interface -- invalidation wait descriptor - * fence flag set, indicate descriptors following the invalidation - * wait descriptor must be processed by hardware only after the - * invalidation wait descriptor completes. - */ -static void -qinv_wait_async_fence(immu_t *immu) -{ - qinv_dsc_t dsc; - - /* sw = 0, fence = 1, iflag = 0 */ - dsc.lo = INV_WAIT_DSC_LOW(0, 1, 0, 0); - dsc.hi = 0; - qinv_submit_inv_dsc(immu, &dsc); -} - -/* * queued invalidation interface -- invalidation wait descriptor * wait until the invalidation request finished */ static void -qinv_wait_sync(immu_t *immu) +qinv_wait_sync(immu_t *immu, immu_inv_wait_t *iwp) { qinv_dsc_t dsc; - qinv_mem_t *sync_mem; - uint64_t saddr; - uint_t tail; - qinv_t *qinv; volatile uint32_t *status; + uint64_t paddr; +#ifdef DEBUG + uint_t count; +#endif - qinv = (qinv_t *)immu->immu_qinv; - sync_mem = &qinv->qinv_sync; - tail = qinv_alloc_sync_mem_entry(immu); - saddr = sync_mem->qinv_mem_paddr + tail * QINV_SYNC_DATA_SIZE; - status = (uint32_t *)(sync_mem->qinv_mem_vaddr + tail * - QINV_SYNC_DATA_SIZE); + status = &iwp->iwp_vstatus; + paddr = iwp->iwp_pstatus; + + *status = IMMU_INV_DATA_PENDING; + membar_producer(); /* - * sdata = QINV_SYNC_DATA_FENCE, fence = 1, sw = 1, if = 0 + * sdata = IMMU_INV_DATA_DONE, fence = 1, sw = 1, if = 0 * indicate the invalidation wait descriptor completion by * performing a coherent DWORD write to the status address, * not by generating an invalidation completion event */ - dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_FENCE, 1, 1, 0); - dsc.hi = INV_WAIT_DSC_HIGH(saddr); + dsc.lo = INV_WAIT_DSC_LOW(IMMU_INV_DATA_DONE, 1, 1, 0); + dsc.hi = INV_WAIT_DSC_HIGH(paddr); qinv_submit_inv_dsc(immu, &dsc); - while ((*status) != QINV_SYNC_DATA_FENCE) - iommu_cpu_nop(); - *status = QINV_SYNC_DATA_UNFENCE; + if (iwp->iwp_sync) { +#ifdef DEBUG + count = 0; + while (*status != IMMU_INV_DATA_DONE) { + count++; + ht_pause(); + } + DTRACE_PROBE2(immu__wait__sync, const char *, iwp->iwp_name, + uint_t, count); +#else + while (*status != IMMU_INV_DATA_DONE) + ht_pause(); +#endif + } } -/* get already completed invalidation wait requests */ -static int -qinv_wait_async_finish(immu_t *immu, int *cnt) +static void +immu_qinv_inv_wait(immu_inv_wait_t *iwp) { - qinv_mem_t *sync_mem; - int index; - qinv_t *qinv; - volatile uint32_t *value; - - ASSERT((*cnt) == 0); - - qinv = (qinv_t *)immu->immu_qinv; - sync_mem = &qinv->qinv_sync; - - mutex_enter(&sync_mem->qinv_mem_lock); - index = sync_mem->qinv_mem_head; - value = (uint32_t *)(sync_mem->qinv_mem_vaddr + index - * QINV_SYNC_DATA_SIZE); - while (*value == QINV_SYNC_DATA_UNFENCE) { - *value = 0; - (*cnt)++; - sync_mem->qinv_mem_head++; - if (sync_mem->qinv_mem_head == sync_mem->qinv_mem_size) { - sync_mem->qinv_mem_head = 0; - value = (uint32_t *)(sync_mem->qinv_mem_vaddr); - } else - value = (uint32_t *)((char *)value + - QINV_SYNC_DATA_SIZE); + volatile uint32_t *status = &iwp->iwp_vstatus; +#ifdef DEBUG + uint_t count; + + count = 0; + while (*status != IMMU_INV_DATA_DONE) { + count++; + ht_pause(); } + DTRACE_PROBE2(immu__wait__async, const char *, iwp->iwp_name, + uint_t, count); +#else - mutex_exit(&sync_mem->qinv_mem_lock); - if ((*cnt) > 0) - return (index); - else - return (-1); + while (*status != IMMU_INV_DATA_DONE) + ht_pause(); +#endif } /* @@ -608,15 +486,6 @@ qinv_setup(immu_t *immu) mutex_init(&(qinv->qinv_table.qinv_mem_lock), NULL, MUTEX_DRIVER, NULL); 
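For reference, the reworked qinv_wait_sync()/immu_qinv_inv_wait() pair above replaces the old shared sync-memory ring (and its fenced/unfenced pending-node bookkeeping) with a per-request status word: the caller marks the word pending, queues an invalidation wait descriptor that asks the hardware to signal completion by a coherent DWORD write of the "done" value to that word's physical address, and then either polls right away (synchronous waits) or defers the poll to immu_qinv_inv_wait() (asynchronous waits). The fragment below is a minimal, self-contained model of that protocol only; model_wait_sync(), queue_wait_descriptor(), MODEL_PENDING and MODEL_DONE are stand-ins invented for this sketch, not identifiers from the driver, and the stub "hardware" completes the wait immediately.

#include <stdio.h>
#include <stdint.h>

#define	MODEL_PENDING	1u	/* stand-in for the driver's "pending" status */
#define	MODEL_DONE	2u	/* stand-in for the driver's "done" status */

struct model_inv_wait {
	volatile uint32_t status;	/* DWORD the "hardware" writes back */
};

/*
 * Stub for wait-descriptor submission: real hardware would write
 * MODEL_DONE to the supplied address only after all previously queued
 * invalidation descriptors have drained; the stub does it at once.
 */
static void
queue_wait_descriptor(volatile uint32_t *status_addr)
{
	*status_addr = MODEL_DONE;
}

static void
model_wait_sync(struct model_inv_wait *iw, int sync)
{
	iw->status = MODEL_PENDING;	/* publish "pending" before queueing */
	queue_wait_descriptor(&iw->status);

	if (sync) {
		/* synchronous caller: spin until the write-back lands */
		while (iw->status != MODEL_DONE)
			;	/* the driver relaxes the CPU here (ht_pause) */
	}
	/* an asynchronous caller polls the same word later */
}

int
main(void)
{
	struct model_inv_wait iw;

	model_wait_sync(&iw, 1);
	printf("invalidation wait completed, status=%u\n",
	    (unsigned)iw.status);
	return (0);
}

One design point the diff suggests: because each DMA handle or interrupt-remapping entry now carries its own status word, waiters no longer serialize on a global sync-memory mutex, and the pending-node cache that used to track asynchronous IOTLB flushes can be removed outright.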
mutex_init(&(qinv->qinv_sync.qinv_mem_lock), NULL, MUTEX_DRIVER, NULL); - /* - * init iotlb pend node for submitting invalidation iotlb - * queue request - */ - qinv->qinv_iotlb_pend_node = (qinv_iotlb_pend_node_t **) - kmem_zalloc(qinv->qinv_sync.qinv_mem_size - * sizeof (qinv_iotlb_pend_node_t *), KM_SLEEP); - - /* set invalidation queue structure */ immu->immu_qinv = qinv; mutex_exit(&(immu->immu_qinv_lock)); @@ -698,11 +567,11 @@ immu_qinv_startup(immu_t *immu) */ void immu_qinv_context_fsi(immu_t *immu, uint8_t function_mask, - uint16_t source_id, uint_t domain_id) + uint16_t source_id, uint_t domain_id, immu_inv_wait_t *iwp) { qinv_context_common(immu, function_mask, source_id, domain_id, CTT_INV_G_DEVICE); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* @@ -710,10 +579,10 @@ immu_qinv_context_fsi(immu_t *immu, uint8_t function_mask, * domain based context cache invalidation */ void -immu_qinv_context_dsi(immu_t *immu, uint_t domain_id) +immu_qinv_context_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp) { qinv_context_common(immu, 0, 0, domain_id, CTT_INV_G_DOMAIN); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* @@ -721,10 +590,10 @@ immu_qinv_context_dsi(immu_t *immu, uint_t domain_id) * invalidation global context cache */ void -immu_qinv_context_gbl(immu_t *immu) +immu_qinv_context_gbl(immu_t *immu, immu_inv_wait_t *iwp) { qinv_context_common(immu, 0, 0, 0, CTT_INV_G_GLOBAL); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* @@ -733,7 +602,7 @@ immu_qinv_context_gbl(immu_t *immu) */ void immu_qinv_iotlb_psi(immu_t *immu, uint_t domain_id, - uint64_t dvma, uint_t count, uint_t hint) + uint64_t dvma, uint_t count, uint_t hint, immu_inv_wait_t *iwp) { uint_t am = 0; uint_t max_am; @@ -761,6 +630,8 @@ immu_qinv_iotlb_psi(immu_t *immu, uint_t domain_id, qinv_iotlb_common(immu, domain_id, dvma, 0, hint, TLB_INV_G_DOMAIN); } + + qinv_wait_sync(immu, iwp); } /* @@ -768,10 +639,10 @@ immu_qinv_iotlb_psi(immu_t *immu, uint_t domain_id, * domain based iotlb invalidation */ void -immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id) +immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp) { qinv_iotlb_common(immu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* @@ -779,97 +650,32 @@ immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id) * global iotlb invalidation */ void -immu_qinv_iotlb_gbl(immu_t *immu) +immu_qinv_iotlb_gbl(immu_t *immu, immu_inv_wait_t *iwp) { qinv_iotlb_common(immu, 0, 0, 0, 0, TLB_INV_G_GLOBAL); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } - - -/* - * the plant wait operation for queued invalidation interface - */ -void -immu_qinv_plant(immu_t *immu, dvcookie_t *dvcookies, - uint_t count, uint_t array_size) -{ - qinv_t *qinv; - qinv_iotlb_pend_node_t *node = NULL; - qinv_iotlb_pend_head_t *head; - - qinv = (qinv_t *)immu->immu_qinv; - - head = &(qinv->qinv_pend_head); - mutex_enter(&(head->ich_mem_lock)); - node = list_head(&(head->ich_mem_list)); - if (node) { - list_remove(&(head->ich_mem_list), node); - } - mutex_exit(&(head->ich_mem_lock)); - - /* no cache, alloc one */ - if (node == NULL) { - node = kmem_zalloc(sizeof (qinv_iotlb_pend_node_t), KM_SLEEP); - } - node->icn_dvcookies = dvcookies; - node->icn_count = count; - node->icn_array_size = array_size; - - /* plant an invalidation wait descriptor, not wait its completion */ - qinv_wait_async_unfence(immu, node); -} - -/* - * the reap wait operation for queued invalidation interface - */ -void 
-immu_qinv_reap(immu_t *immu) -{ - int index, cnt = 0; - qinv_iotlb_pend_node_t *node; - qinv_iotlb_pend_head_t *head; - qinv_t *qinv; - - qinv = (qinv_t *)immu->immu_qinv; - head = &(qinv->qinv_pend_head); - - index = qinv_wait_async_finish(immu, &cnt); - - while (cnt--) { - node = qinv->qinv_iotlb_pend_node[index]; - if (node == NULL) - continue; - mutex_enter(&(head->ich_mem_lock)); - list_insert_head(&(head->ich_mem_list), node); - mutex_exit(&(head->ich_mem_lock)); - qinv->qinv_iotlb_pend_node[index] = NULL; - index++; - if (index == qinv->qinv_sync.qinv_mem_size) - index = 0; - } -} - - /* queued invalidation interface -- global invalidate interrupt entry cache */ void -immu_qinv_intr_global(immu_t *immu) +immu_qinv_intr_global(immu_t *immu, immu_inv_wait_t *iwp) { qinv_iec_common(immu, 0, 0, IEC_INV_GLOBAL); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* queued invalidation interface -- invalidate single interrupt entry cache */ void -immu_qinv_intr_one_cache(immu_t *immu, uint_t iidx) +immu_qinv_intr_one_cache(immu_t *immu, uint_t iidx, immu_inv_wait_t *iwp) { qinv_iec_common(immu, iidx, 0, IEC_INV_INDEX); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } /* queued invalidation interface -- invalidate interrupt entry caches */ void -immu_qinv_intr_caches(immu_t *immu, uint_t iidx, uint_t cnt) +immu_qinv_intr_caches(immu_t *immu, uint_t iidx, uint_t cnt, + immu_inv_wait_t *iwp) { uint_t i, mask = 0; @@ -880,7 +686,7 @@ immu_qinv_intr_caches(immu_t *immu, uint_t iidx, uint_t cnt) for (i = 0; i < cnt; i++) { qinv_iec_common(immu, iidx + cnt, 0, IEC_INV_INDEX); } - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); return; } @@ -892,13 +698,13 @@ immu_qinv_intr_caches(immu_t *immu, uint_t iidx, uint_t cnt) for (i = 0; i < cnt; i++) { qinv_iec_common(immu, iidx + cnt, 0, IEC_INV_INDEX); } - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); return; } qinv_iec_common(immu, iidx, mask, IEC_INV_INDEX); - qinv_wait_sync(immu); + qinv_wait_sync(immu, iwp); } void diff --git a/usr/src/uts/i86pc/io/immu_regs.c b/usr/src/uts/i86pc/io/immu_regs.c index 97d56a3776..dc43b0f49a 100644 --- a/usr/src/uts/i86pc/io/immu_regs.c +++ b/usr/src/uts/i86pc/io/immu_regs.c @@ -33,6 +33,7 @@ #include <sys/spl.h> #include <sys/sysmacros.h> #include <sys/immu.h> +#include <sys/cpu.h> #define get_reg32(immu, offset) ddi_get32((immu)->immu_regs_handle, \ (uint32_t *)(immu->immu_regs_addr + (offset))) @@ -45,13 +46,16 @@ ((immu)->immu_regs_handle, \ (uint64_t *)(immu->immu_regs_addr + (offset)), val) +static void immu_regs_inv_wait(immu_inv_wait_t *iwp); + struct immu_flushops immu_regs_flushops = { immu_regs_context_fsi, immu_regs_context_dsi, immu_regs_context_gbl, immu_regs_iotlb_psi, immu_regs_iotlb_dsi, - immu_regs_iotlb_gbl + immu_regs_iotlb_gbl, + immu_regs_inv_wait }; /* @@ -74,7 +78,7 @@ struct immu_flushops immu_regs_flushops = { "immu wait completion time out"); \ /*NOTREACHED*/ \ } else { \ - iommu_cpu_nop();\ + ht_pause();\ }\ }\ } @@ -118,9 +122,6 @@ iotlb_flush(immu_t *immu, uint_t domain_id, */ switch (type) { case IOTLB_PSI: - ASSERT(IMMU_CAP_GET_PSI(immu->immu_regs_cap)); - ASSERT(am <= IMMU_CAP_GET_MAMV(immu->immu_regs_cap)); - ASSERT(!(addr & IMMU_PAGEOFFSET)); command |= TLB_INV_PAGE | TLB_INV_IVT | TLB_INV_DID(domain_id); iva = addr | am | TLB_IVA_HINT(hint); @@ -149,9 +150,10 @@ iotlb_flush(immu_t *immu, uint_t domain_id, * immu_regs_iotlb_psi() * iotlb page specific invalidation */ +/*ARGSUSED*/ void immu_regs_iotlb_psi(immu_t *immu, uint_t did, uint64_t dvma, uint_t snpages, 
- uint_t hint) + uint_t hint, immu_inv_wait_t *iwp) { int dvma_am; int npg_am; @@ -163,12 +165,10 @@ immu_regs_iotlb_psi(immu_t *immu, uint_t did, uint64_t dvma, uint_t snpages, int i; if (!IMMU_CAP_GET_PSI(immu->immu_regs_cap)) { - immu_regs_iotlb_dsi(immu, did); + immu_regs_iotlb_dsi(immu, did, iwp); return; } - ASSERT(dvma % IMMU_PAGESIZE == 0); - max_am = IMMU_CAP_GET_MAMV(immu->immu_regs_cap); mutex_enter(&(immu->immu_regs_lock)); @@ -210,8 +210,9 @@ immu_regs_iotlb_psi(immu_t *immu, uint_t did, uint64_t dvma, uint_t snpages, * immu_regs_iotlb_dsi() * domain specific invalidation */ +/*ARGSUSED*/ void -immu_regs_iotlb_dsi(immu_t *immu, uint_t domain_id) +immu_regs_iotlb_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp) { mutex_enter(&(immu->immu_regs_lock)); iotlb_flush(immu, domain_id, 0, 0, 0, IOTLB_DSI); @@ -222,8 +223,9 @@ immu_regs_iotlb_dsi(immu_t *immu, uint_t domain_id) * immu_regs_iotlb_gbl() * global iotlb invalidation */ +/*ARGSUSED*/ void -immu_regs_iotlb_gbl(immu_t *immu) +immu_regs_iotlb_gbl(immu_t *immu, immu_inv_wait_t *iwp) { mutex_enter(&(immu->immu_regs_lock)); iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL); @@ -345,9 +347,6 @@ setup_regs(immu_t *immu) { int error; - ASSERT(immu); - ASSERT(immu->immu_name); - /* * This lock may be acquired by the IOMMU interrupt handler */ @@ -382,7 +381,7 @@ setup_regs(immu_t *immu) immu->immu_dvma_coherent = B_TRUE; } else { immu->immu_dvma_coherent = B_FALSE; - if (!(x86_feature & X86_CLFSH)) { + if (!is_x86_feature(x86_featureset, X86FSET_CLFSH)) { ddi_err(DER_WARN, NULL, "immu unit %s can't be enabled due to " "missing clflush functionality", immu->immu_name); @@ -396,6 +395,11 @@ setup_regs(immu_t *immu) immu->immu_SNP_reserved = immu_regs_is_SNP_reserved(immu); immu->immu_TM_reserved = immu_regs_is_TM_reserved(immu); + if (IMMU_ECAP_GET_CH(immu->immu_regs_excap) && immu_use_tm) + immu->immu_ptemask = PDTE_MASK_TM; + else + immu->immu_ptemask = 0; + /* * Check for Mobile 4 series chipset */ @@ -405,7 +409,6 @@ setup_regs(immu_t *immu) "IMMU: Mobile 4 chipset quirk detected. 
" "Force-setting RWBF"); IMMU_CAP_SET_RWBF(immu->immu_regs_cap); - ASSERT(IMMU_CAP_GET_RWBF(immu->immu_regs_cap)); } /* @@ -514,10 +517,6 @@ immu_regs_startup(immu_t *immu) return; } - ASSERT(immu->immu_regs_running == B_FALSE); - - ASSERT(MUTEX_HELD(&(immu->immu_lock))); - mutex_enter(&(immu->immu_regs_lock)); put_reg32(immu, IMMU_REG_GLOBAL_CMD, immu->immu_regs_cmdval | IMMU_GCMD_TE); @@ -527,7 +526,7 @@ immu_regs_startup(immu_t *immu) immu->immu_regs_running = B_TRUE; mutex_exit(&(immu->immu_regs_lock)); - ddi_err(DER_NOTE, NULL, "IMMU %s running", immu->immu_name); + ddi_err(DER_NOTE, NULL, "%s running", immu->immu_name); } /* @@ -543,10 +542,6 @@ immu_regs_shutdown(immu_t *immu) return; } - ASSERT(immu->immu_regs_setup == B_TRUE); - - ASSERT(MUTEX_HELD(&(immu->immu_lock))); - mutex_enter(&(immu->immu_regs_lock)); immu->immu_regs_cmdval &= ~IMMU_GCMD_TE; put_reg32(immu, IMMU_REG_GLOBAL_CMD, @@ -649,15 +644,17 @@ immu_regs_wbf_flush(immu_t *immu) void immu_regs_cpu_flush(immu_t *immu, caddr_t addr, uint_t size) { - uint64_t i; - - ASSERT(immu); + uintptr_t startline, endline; if (immu->immu_dvma_coherent == B_TRUE) return; - for (i = 0; i < size; i += x86_clflush_size, addr += x86_clflush_size) { - clflush_insn(addr); + startline = (uintptr_t)addr & ~(uintptr_t)(x86_clflush_size - 1); + endline = ((uintptr_t)addr + size - 1) & + ~(uintptr_t)(x86_clflush_size - 1); + while (startline <= endline) { + clflush_insn((caddr_t)startline); + startline += x86_clflush_size; } mfence_insn(); @@ -674,9 +671,6 @@ context_flush(immu_t *immu, uint8_t function_mask, uint64_t command = 0; uint64_t status; - ASSERT(immu); - ASSERT(rw_write_held(&(immu->immu_ctx_rwlock))); - /* * define the command */ @@ -687,15 +681,10 @@ context_flush(immu_t *immu, uint8_t function_mask, | CCMD_INV_SID(sid) | CCMD_INV_FM(function_mask); break; case CONTEXT_DSI: - ASSERT(function_mask == 0); - ASSERT(sid == 0); command |= CCMD_INV_ICC | CCMD_INV_DOMAIN | CCMD_INV_DID(did); break; case CONTEXT_GLOBAL: - ASSERT(function_mask == 0); - ASSERT(sid == 0); - ASSERT(did == 0); command |= CCMD_INV_ICC | CCMD_INV_GLOBAL; break; default: @@ -706,32 +695,43 @@ context_flush(immu_t *immu, uint8_t function_mask, } mutex_enter(&(immu->immu_regs_lock)); - ASSERT(!(get_reg64(immu, IMMU_REG_CONTEXT_CMD) & CCMD_INV_ICC)); put_reg64(immu, IMMU_REG_CONTEXT_CMD, command); wait_completion(immu, IMMU_REG_CONTEXT_CMD, get_reg64, (!(status & CCMD_INV_ICC)), status); mutex_exit(&(immu->immu_regs_lock)); } +/*ARGSUSED*/ void immu_regs_context_fsi(immu_t *immu, uint8_t function_mask, - uint16_t source_id, uint_t domain_id) + uint16_t source_id, uint_t domain_id, immu_inv_wait_t *iwp) { context_flush(immu, function_mask, source_id, domain_id, CONTEXT_FSI); } +/*ARGSUSED*/ void -immu_regs_context_dsi(immu_t *immu, uint_t domain_id) +immu_regs_context_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp) { context_flush(immu, 0, 0, domain_id, CONTEXT_DSI); } +/*ARGSUSED*/ void -immu_regs_context_gbl(immu_t *immu) +immu_regs_context_gbl(immu_t *immu, immu_inv_wait_t *iwp) { context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL); } +/* + * Nothing to do, all register operations are synchronous. 
+ */ +/*ARGSUSED*/ +static void +immu_regs_inv_wait(immu_inv_wait_t *iwp) +{ +} + void immu_regs_set_root_table(immu_t *immu) { @@ -797,8 +797,7 @@ immu_regs_intrmap_enable(immu_t *immu, uint64_t irta_reg) mutex_exit(&(immu->immu_regs_lock)); /* global flush intr entry cache */ - if (immu_qinv_enable == B_TRUE) - immu_qinv_intr_global(immu); + immu_qinv_intr_global(immu, &immu->immu_intrmap_inv_wait); /* enable interrupt remapping */ mutex_enter(&(immu->immu_regs_lock)); diff --git a/usr/src/uts/i86pc/io/isa.c b/usr/src/uts/i86pc/io/isa.c index aa5cea74f1..d2bb59ca99 100644 --- a/usr/src/uts/i86pc/io/isa.c +++ b/usr/src/uts/i86pc/io/isa.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index 134a945207..29e371f000 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -45,6 +45,7 @@ #include <sys/acpica.h> #include <sys/psm_common.h> #include <sys/apic.h> +#include <sys/apic_timer.h> #include <sys/pit.h> #include <sys/ddi.h> #include <sys/sunddi.h> diff --git a/usr/src/uts/i86pc/io/pci/pci_common.c b/usr/src/uts/i86pc/io/pci/pci_common.c index ad689868bc..1cea07237f 100644 --- a/usr/src/uts/i86pc/io/pci/pci_common.c +++ b/usr/src/uts/i86pc/io/pci/pci_common.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* diff --git a/usr/src/uts/i86pc/io/pci/pci_kstats.c b/usr/src/uts/i86pc/io/pci/pci_kstats.c index ea7fcc9dc1..6a8c365c06 100644 --- a/usr/src/uts/i86pc/io/pci/pci_kstats.c +++ b/usr/src/uts/i86pc/io/pci/pci_kstats.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
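[Editorial sketch -- not part of the patch] The immu_regs_cpu_flush() rework above rounds the start address down to a cache-line boundary and walks line by line through the line holding the last byte, so a buffer whose tail straddles a line boundary is flushed completely; the old byte-counting loop could stop one line short when addr was unaligned. (The new immu_regs_inv_wait() above is empty because register-based flushes complete synchronously.) A compilable model of the range computation follows, with CLFLUSH_SIZE and flush_line() as stand-ins for x86_clflush_size and clflush_insn(), assuming size > 0.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define CLFLUSH_SIZE	64u		/* x86_clflush_size stand-in */

static void
flush_line(uintptr_t line)		/* clflush_insn() stand-in */
{
	printf("flush line 0x%lx\n", (unsigned long)line);
}

/*
 * Flush every cache line overlapping [addr, addr + size): round the start
 * down to a line boundary and walk through the line holding the last byte,
 * as the reworked immu_regs_cpu_flush() does.
 */
static void
cpu_flush_model(const void *addr, size_t size)
{
	uintptr_t startline = (uintptr_t)addr & ~(uintptr_t)(CLFLUSH_SIZE - 1);
	uintptr_t endline = ((uintptr_t)addr + size - 1) &
	    ~(uintptr_t)(CLFLUSH_SIZE - 1);

	while (startline <= endline) {
		flush_line(startline);
		startline += CLFLUSH_SIZE;
	}
	/* the kernel issues mfence_insn() afterwards to order the flushes */
}

int
main(void)
{
	/* bytes 60..67 straddle two 64-byte lines; both lines get flushed */
	cpu_flush_model((const void *)(uintptr_t)60, 8);
	return (0);
}

For example, an 8-byte flush starting at offset 60 touches two 64-byte lines; the range form flushes both, where the old loop flushed only the first.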
*/ /* * Kstat support for X86 PCI driver diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic.c b/usr/src/uts/i86pc/io/pcplusmp/apic.c index 22553d39d3..2a7ccb1080 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c @@ -75,6 +75,7 @@ #include <sys/reboot.h> #include <sys/hpet.h> #include <sys/apic_common.h> +#include <sys/apic_timer.h> /* * Local Function Prototypes @@ -86,8 +87,6 @@ static void apic_init_intr(void); */ static int apic_probe(void); static int apic_getclkirq(int ipl); -static uint_t apic_calibrate(volatile uint32_t *addr, - uint16_t *pit_ticks_adj); static void apic_init(void); static void apic_picinit(void); static int apic_post_cpu_start(void); @@ -371,7 +370,7 @@ apic_init_intr(void) if (nlvt >= 5) { /* Enable performance counter overflow interrupt */ - if ((x86_feature & X86_MSR) != X86_MSR) + if (!is_x86_feature(x86_featureset, X86FSET_MSR)) apic_enable_cpcovf_intr = 0; if (apic_enable_cpcovf_intr) { if (apic_cpcovf_vect == 0) { diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c index 79d24ed110..0cc45ff4e0 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c @@ -70,6 +70,7 @@ #include <sys/reboot.h> #include <sys/hpet.h> #include <sys/apic_common.h> +#include <sys/apic_timer.h> static void apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt); @@ -87,17 +88,11 @@ void apic_unset_idlecpu(processorid_t); void apic_shutdown(int, int); void apic_preshutdown(int, int); processorid_t apic_get_next_processorid(processorid_t); -void apic_timer_reprogram(hrtime_t); -void apic_timer_enable(void); -void apic_timer_disable(void); hrtime_t apic_gettime(); enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP; -int apic_oneshot = 0; -int apic_oneshot_enable = 1; /* to allow disabling one-shot capability */ - /* Now the ones for Dynamic Interrupt distribution */ int apic_enable_dynamic_migration = 0; @@ -146,9 +141,6 @@ int apic_panic_on_apic_error = 0; int apic_verbose = 0; /* 0x1ff */ -/* minimum number of timer ticks to program to */ -int apic_min_timer_ticks = 1; - #ifdef DEBUG int apic_debug = 0; int apic_restrict_vector = 0; @@ -158,8 +150,6 @@ int apic_debug_msgbufindex = 0; #endif /* DEBUG */ -uint_t apic_nsec_per_intr = 0; - uint_t apic_nticks = 0; uint_t apic_skipped_redistribute = 0; @@ -167,11 +157,6 @@ uint_t last_count_read = 0; lock_t apic_gethrtime_lock; volatile int apic_hrtime_stamp = 0; volatile hrtime_t apic_nsec_since_boot = 0; -uint_t apic_hertz_count; - -uint64_t apic_ticks_per_SFnsecs; /* # of ticks in SF nsecs */ - -static hrtime_t apic_nsec_max; static hrtime_t apic_last_hrtime = 0; int apic_hrtime_error = 0; @@ -1075,7 +1060,7 @@ apic_cpu_remove(psm_cpu_request_t *reqp) * Return the number of APIC clock ticks elapsed for 8245 to decrement * (APIC_TIME_COUNT + pit_ticks_adj) ticks. 
*/ -static uint_t +uint_t apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) { uint8_t pit_tick_lo; @@ -1144,46 +1129,7 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) int apic_clkinit(int hertz) { - uint_t apic_ticks = 0; - uint_t pit_ticks; int ret; - uint16_t pit_ticks_adj; - static int firsttime = 1; - - if (firsttime) { - /* first time calibrate on CPU0 only */ - - apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); - - /* total number of PIT ticks corresponding to apic_ticks */ - pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; - - /* - * Determine the number of nanoseconds per APIC clock tick - * and then determine how many APIC ticks to interrupt at the - * desired frequency - * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s - * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s - * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) - * pic_ticks_per_SFns = - * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) - */ - apic_ticks_per_SFnsecs = - ((SF * apic_ticks * PIT_HZ) / - ((uint64_t)pit_ticks * NANOSEC)); - - /* the interval timer initial count is 32 bit max */ - apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); - firsttime = 0; - } - - if (hertz != 0) { - /* periodic */ - apic_nsec_per_intr = NANOSEC / hertz; - apic_hertz_count = APIC_NSECS_TO_TICKS(apic_nsec_per_intr); - } apic_int_busy_mark = (apic_int_busy_mark * apic_sample_factor_redistribution) / 100; @@ -1192,21 +1138,7 @@ apic_clkinit(int hertz) apic_diff_for_redistribution = (apic_diff_for_redistribution * apic_sample_factor_redistribution) / 100; - if (hertz == 0) { - /* requested one_shot */ - if (!tsc_gethrtime_enable || !apic_oneshot_enable) - return (0); - apic_oneshot = 1; - ret = (int)APIC_TICKS_TO_NSECS(1); - } else { - /* program the local APIC to interrupt at the given frequency */ - apic_reg_ops->apic_write(APIC_INIT_COUNT, apic_hertz_count); - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_TIME); - apic_oneshot = 0; - ret = NANOSEC / hertz; - } - + ret = apic_timer_init(hertz); return (ret); } @@ -1419,137 +1351,6 @@ restart_sitka_bmc: } -/* - * This function will reprogram the timer. - * - * When in oneshot mode the argument is the absolute time in future to - * generate the interrupt at. - * - * When in periodic mode, the argument is the interval at which the - * interrupts should be generated. There is no need to support the periodic - * mode timer change at this time. - */ -void -apic_timer_reprogram(hrtime_t time) -{ - hrtime_t now; - uint_t ticks; - int64_t delta; - - /* - * We should be called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - - if (!apic_oneshot) { - /* time is the interval for periodic mode */ - ticks = APIC_NSECS_TO_TICKS(time); - } else { - /* one shot mode */ - - now = gethrtime(); - delta = time - now; - - if (delta <= 0) { - /* - * requested to generate an interrupt in the past - * generate an interrupt as soon as possible - */ - ticks = apic_min_timer_ticks; - } else if (delta > apic_nsec_max) { - /* - * requested to generate an interrupt at a time - * further than what we are capable of. 
Set to max - * the hardware can handle - */ - - ticks = APIC_MAXVAL; -#ifdef DEBUG - cmn_err(CE_CONT, "apic_timer_reprogram, request at" - " %lld too far in future, current time" - " %lld \n", time, now); -#endif - } else - ticks = APIC_NSECS_TO_TICKS(delta); - } - - if (ticks < apic_min_timer_ticks) - ticks = apic_min_timer_ticks; - - apic_reg_ops->apic_write(APIC_INIT_COUNT, ticks); -} - -/* - * This function will enable timer interrupts. - */ -void -apic_timer_enable(void) -{ - /* - * We should be Called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - - if (!apic_oneshot) { - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_TIME); - } else { - /* one shot */ - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT)); - } -} - -/* - * This function will disable timer interrupts. - */ -void -apic_timer_disable(void) -{ - /* - * We should be Called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_MASK); -} - -/* - * Set timer far into the future and return timer - * current Count in nanoseconds. - */ -hrtime_t -apic_timer_stop_count(void) -{ - hrtime_t ns_val; - int enable_val, count_val; - - /* - * Should be called with interrupts disabled. - */ - ASSERT(!interrupts_enabled()); - - enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER); - if ((enable_val & AV_MASK) == AV_MASK) - return ((hrtime_t)-1); /* timer is disabled */ - - count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT); - ns_val = APIC_TICKS_TO_NSECS(count_val); - - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - - return (ns_val); -} - -/* - * Reprogram timer after Deep C-State. - */ -void -apic_timer_restart(hrtime_t time) -{ - apic_timer_reprogram(time); -} - ddi_periodic_t apic_periodic_id; /* diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c index 46f1257ecd..1a1d72fccf 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c @@ -75,7 +75,10 @@ apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, msi_regs.mr_data = vector; msi_regs.mr_addr = target_apic_id; - intrmap_tbl[0] = irq_ptr->airq_intrmap_private; + for (i = 0; i < count; i++) { + irqno = apic_vector_to_irq[vector + i]; + intrmap_tbl[i] = apic_irq_table[irqno]->airq_intrmap_private; + } apic_vt_ops->apic_intrmap_alloc_entry(intrmap_tbl, dip, type, count, 0xff); for (i = 0; i < count; i++) { diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c new file mode 100644 index 0000000000..ffc1e99f68 --- /dev/null +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c @@ -0,0 +1,399 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +#include <sys/time.h> +#include <sys/psm.h> +#include <sys/psm_common.h> +#include <sys/apic.h> +#include <sys/pit.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/cpuvar.h> +#include <sys/clock.h> +#include <sys/apic_timer.h> + +/* + * preferred apic timer mode, allow tuning from the /etc/system file. + */ +int apic_timer_preferred_mode = APIC_TIMER_MODE_DEADLINE; + +int apic_oneshot = 0; +uint_t apic_hertz_count; +uint_t apic_nsec_per_intr = 0; +uint64_t apic_ticks_per_SFnsecs; /* # of ticks in SF nsecs */ + +static int apic_min_timer_ticks = 1; /* minimum timer tick */ +static hrtime_t apic_nsec_max; + +static void periodic_timer_enable(void); +static void periodic_timer_disable(void); +static void periodic_timer_reprogram(hrtime_t); +static void oneshot_timer_enable(void); +static void oneshot_timer_disable(void); +static void oneshot_timer_reprogram(hrtime_t); +static void deadline_timer_enable(void); +static void deadline_timer_disable(void); +static void deadline_timer_reprogram(hrtime_t); + +extern int apic_clkvect; +extern uint32_t apic_divide_reg_init; + +/* + * apic timer data structure + */ +typedef struct apic_timer { + int mode; + void (*apic_timer_enable_ops)(void); + void (*apic_timer_disable_ops)(void); + void (*apic_timer_reprogram_ops)(hrtime_t); +} apic_timer_t; + +static apic_timer_t apic_timer; + +/* + * apic timer initialization + * + * For the one-shot mode request case, the function returns the + * resolution (in nanoseconds) for the hardware timer interrupt. + * If one-shot mode capability is not available, the return value + * will be 0. + */ +int +apic_timer_init(int hertz) +{ + uint_t apic_ticks = 0; + uint_t pit_ticks; + int ret, timer_mode; + uint16_t pit_ticks_adj; + static int firsttime = 1; + + if (firsttime) { + /* first time calibrate on CPU0 only */ + + apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); + apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); + + /* total number of PIT ticks corresponding to apic_ticks */ + pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; + + /* + * Determine the number of nanoseconds per APIC clock tick + * and then determine how many APIC ticks to interrupt at the + * desired frequency + * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s + * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s + * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) + * pic_ticks_per_SFns = + * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) + */ + apic_ticks_per_SFnsecs = ((SF * apic_ticks * PIT_HZ) / + ((uint64_t)pit_ticks * NANOSEC)); + + /* the interval timer initial count is 32 bit max */ + apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); + firsttime = 0; + } + + if (hertz == 0) { + /* requested one_shot */ + + /* + * return 0 if TSC is not supported. + */ + if (!tsc_gethrtime_enable) + return (0); + /* + * return 0 if one_shot is not preferred. + * here, APIC_TIMER_DEADLINE is also an one_shot mode. 
+ */ + if ((apic_timer_preferred_mode != APIC_TIMER_MODE_ONESHOT) && + (apic_timer_preferred_mode != APIC_TIMER_MODE_DEADLINE)) + return (0); + + apic_oneshot = 1; + ret = (int)APIC_TICKS_TO_NSECS(1); + if ((apic_timer_preferred_mode == APIC_TIMER_MODE_DEADLINE) && + cpuid_deadline_tsc_supported()) { + timer_mode = APIC_TIMER_MODE_DEADLINE; + } else { + timer_mode = APIC_TIMER_MODE_ONESHOT; + } + } else { + /* periodic */ + apic_nsec_per_intr = NANOSEC / hertz; + apic_hertz_count = APIC_NSECS_TO_TICKS(apic_nsec_per_intr); + + /* program the local APIC to interrupt at the given frequency */ + apic_reg_ops->apic_write(APIC_INIT_COUNT, apic_hertz_count); + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_PERIODIC); + apic_oneshot = 0; + timer_mode = APIC_TIMER_MODE_PERIODIC; + ret = NANOSEC / hertz; + } + + /* + * initialize apic_timer data structure, install the timer ops + */ + apic_timer.mode = timer_mode; + switch (timer_mode) { + default: + /* FALLTHROUGH */ + case APIC_TIMER_MODE_ONESHOT: + apic_timer.apic_timer_enable_ops = oneshot_timer_enable; + apic_timer.apic_timer_disable_ops = oneshot_timer_disable; + apic_timer.apic_timer_reprogram_ops = oneshot_timer_reprogram; + break; + + case APIC_TIMER_MODE_PERIODIC: + apic_timer.apic_timer_enable_ops = periodic_timer_enable; + apic_timer.apic_timer_disable_ops = periodic_timer_disable; + apic_timer.apic_timer_reprogram_ops = periodic_timer_reprogram; + break; + + case APIC_TIMER_MODE_DEADLINE: + apic_timer.apic_timer_enable_ops = deadline_timer_enable; + apic_timer.apic_timer_disable_ops = deadline_timer_disable; + apic_timer.apic_timer_reprogram_ops = deadline_timer_reprogram; + break; + } + + return (ret); +} + +/* + * periodic timer mode ops + */ +/* periodic timer enable */ +static void +periodic_timer_enable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_PERIODIC); +} + +/* periodic timer disable */ +static void +periodic_timer_disable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_MASK); +} + +/* periodic timer reprogram */ +static void +periodic_timer_reprogram(hrtime_t time) +{ + uint_t ticks; + /* time is the interval for periodic mode */ + ticks = APIC_NSECS_TO_TICKS(time); + + if (ticks < apic_min_timer_ticks) + ticks = apic_min_timer_ticks; + + apic_reg_ops->apic_write(APIC_INIT_COUNT, ticks); +} + +/* + * oneshot timer mode ops + */ +/* oneshot timer enable */ +static void +oneshot_timer_enable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT)); +} + +/* oneshot timer disable */ +static void +oneshot_timer_disable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_MASK); +} + +/* oneshot timer reprogram */ +static void +oneshot_timer_reprogram(hrtime_t time) +{ + hrtime_t now; + int64_t delta; + uint_t ticks; + + now = gethrtime(); + delta = time - now; + + if (delta <= 0) { + /* + * requested to generate an interrupt in the past + * generate an interrupt as soon as possible + */ + ticks = apic_min_timer_ticks; + } else if (delta > apic_nsec_max) { + /* + * requested to generate an interrupt at a time + * further than what we are capable of. 
Set to max + * the hardware can handle + */ + ticks = APIC_MAXVAL; +#ifdef DEBUG + cmn_err(CE_CONT, "apic_timer_reprogram, request at" + " %lld too far in future, current time" + " %lld \n", time, now); +#endif + } else { + ticks = APIC_NSECS_TO_TICKS(delta); + } + + if (ticks < apic_min_timer_ticks) + ticks = apic_min_timer_ticks; + + apic_reg_ops->apic_write(APIC_INIT_COUNT, ticks); +} + +/* + * deadline timer mode ops + */ +/* deadline timer enable */ +static void +deadline_timer_enable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_DEADLINE); +} + +/* deadline timer disable */ +static void +deadline_timer_disable(void) +{ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_MASK); +} + +/* deadline timer reprogram */ +static void +deadline_timer_reprogram(hrtime_t time) +{ + uint64_t ticks; + + if (time <= 0) { + /* + * generate an immediate interrupt + */ + ticks = (uint64_t)tsc_read(); + } else { + ticks = unscalehrtime(time); + } + + wrmsr(IA32_DEADLINE_TSC_MSR, ticks); +} + +/* + * This function will reprogram the timer. + * + * When in oneshot mode the argument is the absolute time in future to + * generate the interrupt at. + * + * When in periodic mode, the argument is the interval at which the + * interrupts should be generated. There is no need to support the periodic + * mode timer change at this time. + */ +void +apic_timer_reprogram(hrtime_t time) +{ + /* + * we should be Called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + apic_timer.apic_timer_reprogram_ops(time); +} + +/* + * This function will enable timer interrupts. + */ +void +apic_timer_enable(void) +{ + /* + * we should be Called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + apic_timer.apic_timer_enable_ops(); +} + +/* + * This function will disable timer interrupts. + */ +void +apic_timer_disable(void) +{ + /* + * we should be Called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + apic_timer.apic_timer_disable_ops(); +} + +/* + * Set timer far into the future and return timer + * current count in nanoseconds. + */ +hrtime_t +apic_timer_stop_count(void) +{ + hrtime_t ns_val; + int enable_val, count_val; + + /* + * Should be called with interrupts disabled. + */ + ASSERT(!interrupts_enabled()); + + enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER); + if ((enable_val & AV_MASK) == AV_MASK) + return ((hrtime_t)-1); /* timer is disabled */ + + count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT); + ns_val = APIC_TICKS_TO_NSECS(count_val); + + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); + + return (ns_val); +} + +/* + * Reprogram timer after Deep C-State. + */ +void +apic_timer_restart(hrtime_t time) +{ + apic_timer_reprogram(time); +} diff --git a/usr/src/uts/i86pc/io/rootnex.c b/usr/src/uts/i86pc/io/rootnex.c index 8416281fee..0f42739f23 100644 --- a/usr/src/uts/i86pc/io/rootnex.c +++ b/usr/src/uts/i86pc/io/rootnex.c @@ -134,8 +134,10 @@ int rootnex_prealloc_copybuf = 2; /* driver global state */ static rootnex_state_t *rootnex_state; +#ifdef DEBUG /* shortcut to rootnex counters */ static uint64_t *rootnex_cnt; +#endif /* * XXX - does x86 even need these or are they left over from the SPARC days? 
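[Editorial sketch -- not part of the patch] The new apic_timer.c selects a mode once in apic_timer_init() -- periodic, one-shot, or TSC-deadline when apic_timer_preferred_mode allows it and cpuid_deadline_tsc_supported() reports support -- and installs per-mode enable/disable/reprogram handlers in the single apic_timer ops structure, so apic_timer_enable()/apic_timer_disable()/apic_timer_reprogram() become thin indirections. A stand-alone model of that dispatch shape is below; the handlers print instead of touching APIC registers, and hrtime_t is modeled as long long.

#include <stdio.h>

typedef long long hrtime_model_t;	/* hrtime_t stand-in */

enum { MODE_ONESHOT, MODE_PERIODIC, MODE_DEADLINE };

typedef struct timer_ops {
	int mode;
	void (*enable)(void);
	void (*disable)(void);
	void (*reprogram)(hrtime_model_t);
} timer_ops_t;

static void oneshot_enable(void) { puts("LOCAL_TIMER <- vector, unmasked"); }
static void oneshot_disable(void) { puts("LOCAL_TIMER <- vector | MASK"); }
static void
oneshot_reprogram(hrtime_model_t t)
{
	printf("INIT_COUNT <- ticks for %lld ns\n", t);
}

static timer_ops_t apic_timer;		/* installed once at init time */

static void
timer_init_model(int mode)
{
	/* the kernel installs a different handler set per mode; the sketch has one */
	apic_timer.mode = mode;
	apic_timer.enable = oneshot_enable;
	apic_timer.disable = oneshot_disable;
	apic_timer.reprogram = oneshot_reprogram;
}

int
main(void)
{
	timer_init_model(MODE_ONESHOT);
	apic_timer.enable();
	apic_timer.reprogram(123456);	/* apic_timer_reprogram() indirects the same way */
	apic_timer.disable();
	return (0);
}

The deadline mode differs only in its reprogram handler, which converts the target hrtime to a TSC value with unscalehrtime() and writes IA32_DEADLINE_TSC_MSR instead of reprogramming the count register.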
@@ -149,12 +151,17 @@ static rootnex_intprop_t rootnex_intprp[] = { }; #define NROOT_INTPROPS (sizeof (rootnex_intprp) / sizeof (rootnex_intprop_t)) +/* + * If we're dom0, we're using a real device so we need to load + * the cookies with MFNs instead of PFNs. + */ #ifdef __xpv typedef maddr_t rootnex_addr_t; -#define ROOTNEX_PADDR_TO_RBASE(xinfo, pa) \ - (DOMAIN_IS_INITDOMAIN(xinfo) ? pa_to_ma(pa) : (pa)) +#define ROOTNEX_PADDR_TO_RBASE(pa) \ + (DOMAIN_IS_INITDOMAIN(xen_info) ? pa_to_ma(pa) : (pa)) #else typedef paddr_t rootnex_addr_t; +#define ROOTNEX_PADDR_TO_RBASE(pa) (pa) #endif #if !defined(__xpv) @@ -244,6 +251,14 @@ static int rootnex_coredma_win(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, uint_t win, off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp); +#if defined(__amd64) && !defined(__xpv) +static int rootnex_coredma_hdl_setprivate(dev_info_t *dip, dev_info_t *rdip, + ddi_dma_handle_t handle, void *v); +static void *rootnex_coredma_hdl_getprivate(dev_info_t *dip, dev_info_t *rdip, + ddi_dma_handle_t handle); +#endif + + static struct bus_ops rootnex_bus_ops = { BUSO_REV, rootnex_map, @@ -324,7 +339,9 @@ static iommulib_nexops_t iommulib_nexops = { rootnex_coredma_sync, rootnex_coredma_win, rootnex_dma_map, - rootnex_dma_mctl + rootnex_dma_mctl, + rootnex_coredma_hdl_setprivate, + rootnex_coredma_hdl_getprivate }; #endif @@ -369,13 +386,15 @@ static int rootnex_valid_bind_parms(ddi_dma_req_t *dmareq, ddi_dma_attr_t *attr); static void rootnex_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, rootnex_sglinfo_t *sglinfo); +static void rootnex_dvma_get_sgl(ddi_dma_obj_t *dmar_object, + ddi_dma_cookie_t *sgl, rootnex_sglinfo_t *sglinfo); static int rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, - rootnex_dma_t *dma, ddi_dma_attr_t *attr, int kmflag); + rootnex_dma_t *dma, ddi_dma_attr_t *attr, ddi_dma_obj_t *dmao, int kmflag); static int rootnex_setup_copybuf(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, rootnex_dma_t *dma, ddi_dma_attr_t *attr); static void rootnex_teardown_copybuf(rootnex_dma_t *dma); static int rootnex_setup_windows(ddi_dma_impl_t *hp, rootnex_dma_t *dma, - ddi_dma_attr_t *attr, int kmflag); + ddi_dma_attr_t *attr, ddi_dma_obj_t *dmao, int kmflag); static void rootnex_teardown_windows(rootnex_dma_t *dma); static void rootnex_init_win(ddi_dma_impl_t *hp, rootnex_dma_t *dma, rootnex_window_t *window, ddi_dma_cookie_t *cookie, off_t cur_offset); @@ -397,6 +416,7 @@ static int rootnex_dma_check(dev_info_t *dip, const void *handle, const void *comp_addr, const void *not_used); static boolean_t rootnex_need_bounce_seg(ddi_dma_obj_t *dmar_object, rootnex_sglinfo_t *sglinfo); +static struct as *rootnex_get_as(ddi_dma_obj_t *dmar_object); /* * _init() @@ -466,7 +486,9 @@ rootnex_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) rootnex_state->r_dip = dip; rootnex_state->r_err_ibc = (ddi_iblock_cookie_t)ipltospl(15); rootnex_state->r_reserved_msg_printed = B_FALSE; +#ifdef DEBUG rootnex_cnt = &rootnex_state->r_counters[0]; +#endif /* * Set minimum fm capability level for i86pc platforms and then @@ -1723,13 +1745,8 @@ rootnex_coredma_allochdl(dev_info_t *dip, dev_info_t *rdip, * best we can do with the current bind interfaces. 
*/ hp = kmem_cache_alloc(rootnex_state->r_dmahdl_cache, kmflag); - if (hp == NULL) { - if (waitfp != DDI_DMA_DONTWAIT) { - ddi_set_callback(waitfp, arg, - &rootnex_state->r_dvma_call_list_id); - } + if (hp == NULL) return (DDI_DMA_NORESOURCES); - } /* Do our pointer manipulation now, align the structures */ hp->dmai_private = (void *)(((uintptr_t)hp + @@ -1743,13 +1760,32 @@ rootnex_coredma_allochdl(dev_info_t *dip, dev_info_t *rdip, hp->dmai_error.err_fep = NULL; hp->dmai_error.err_cf = NULL; dma->dp_dip = rdip; + dma->dp_sglinfo.si_flags = attr->dma_attr_flags; dma->dp_sglinfo.si_min_addr = attr->dma_attr_addr_lo; - dma->dp_sglinfo.si_max_addr = attr->dma_attr_addr_hi; + + /* + * The BOUNCE_ON_SEG workaround is not needed when an IOMMU + * is being used. Set the upper limit to the seg value. + * There will be enough DVMA space to always get addresses + * that will match the constraints. + */ + if (IOMMU_USED(rdip) && + (attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)) { + dma->dp_sglinfo.si_max_addr = attr->dma_attr_seg; + dma->dp_sglinfo.si_flags &= ~_DDI_DMA_BOUNCE_ON_SEG; + } else + dma->dp_sglinfo.si_max_addr = attr->dma_attr_addr_hi; + hp->dmai_minxfer = attr->dma_attr_minxfer; hp->dmai_burstsizes = attr->dma_attr_burstsizes; hp->dmai_rdip = rdip; hp->dmai_attr = *attr; + if (attr->dma_attr_seg >= dma->dp_sglinfo.si_max_addr) + dma->dp_sglinfo.si_cancross = B_FALSE; + else + dma->dp_sglinfo.si_cancross = B_TRUE; + /* we don't need to worry about the SPL since we do a tryenter */ mutex_init(&dma->dp_mutex, NULL, MUTEX_DRIVER, NULL); @@ -1812,13 +1848,12 @@ rootnex_coredma_allochdl(dev_info_t *dip, dev_info_t *rdip, } dma->dp_sglinfo.si_max_cookie_size = maxsegmentsize; dma->dp_sglinfo.si_segmask = attr->dma_attr_seg; - dma->dp_sglinfo.si_flags = attr->dma_attr_flags; /* check the ddi_dma_attr arg to make sure it makes a little sense */ if (rootnex_alloc_check_parms) { e = rootnex_valid_alloc_parms(attr, maxsegmentsize); if (e != DDI_SUCCESS) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_ALLOC_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_ALLOC_FAIL]); (void) rootnex_dma_freehdl(dip, rdip, (ddi_dma_handle_t)hp); return (e); @@ -1843,31 +1878,40 @@ static int rootnex_dma_allochdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep) { - int retval; + int retval = DDI_SUCCESS; #if defined(__amd64) && !defined(__xpv) - uint_t error = ENOTSUP; - retval = iommulib_nex_open(rdip, &error); + if (IOMMU_UNITIALIZED(rdip)) { + retval = iommulib_nex_open(dip, rdip); - if (retval != DDI_SUCCESS && error == ENOTSUP) { - /* No IOMMU */ - return (rootnex_coredma_allochdl(dip, rdip, attr, waitfp, arg, - handlep)); - } else if (retval != DDI_SUCCESS) { - return (DDI_FAILURE); + if (retval != DDI_SUCCESS && retval != DDI_ENOTSUP) + return (retval); } - ASSERT(IOMMU_USED(rdip)); - - /* has an IOMMU */ - retval = iommulib_nexdma_allochdl(dip, rdip, attr, - waitfp, arg, handlep); + if (IOMMU_UNUSED(rdip)) { + retval = rootnex_coredma_allochdl(dip, rdip, attr, waitfp, arg, + handlep); + } else { + retval = iommulib_nexdma_allochdl(dip, rdip, attr, + waitfp, arg, handlep); + } #else retval = rootnex_coredma_allochdl(dip, rdip, attr, waitfp, arg, handlep); #endif - if (retval == DDI_SUCCESS) + switch (retval) { + case DDI_DMA_NORESOURCES: + if (waitfp != DDI_DMA_DONTWAIT) { + ddi_set_callback(waitfp, arg, + &rootnex_state->r_dvma_call_list_id); + } + break; + case DDI_SUCCESS: ndi_fmc_insert(rdip, DMA_HANDLE, *handlep, NULL); + break; + 
default: + break; + } return (retval); } @@ -1893,9 +1937,6 @@ rootnex_coredma_freehdl(dev_info_t *dip, dev_info_t *rdip, ROOTNEX_DPROBE1(rootnex__free__handle, uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_HDLS]); - if (rootnex_state->r_dvma_call_list_id) - ddi_run_callback(&rootnex_state->r_dvma_call_list_id); - return (DDI_SUCCESS); } @@ -1906,13 +1947,20 @@ rootnex_coredma_freehdl(dev_info_t *dip, dev_info_t *rdip, static int rootnex_dma_freehdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle) { + int ret; + ndi_fmc_remove(rdip, DMA_HANDLE, handle); #if defined(__amd64) && !defined(__xpv) - if (IOMMU_USED(rdip)) { - return (iommulib_nexdma_freehdl(dip, rdip, handle)); - } + if (IOMMU_USED(rdip)) + ret = iommulib_nexdma_freehdl(dip, rdip, handle); + else #endif - return (rootnex_coredma_freehdl(dip, rdip, handle)); + ret = rootnex_coredma_freehdl(dip, rdip, handle); + + if (rootnex_state->r_dvma_call_list_id) + ddi_run_callback(&rootnex_state->r_dvma_call_list_id); + + return (ret); } /*ARGSUSED*/ @@ -1922,21 +1970,29 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { rootnex_sglinfo_t *sinfo; + ddi_dma_obj_t *dmao; +#if defined(__amd64) && !defined(__xpv) + struct dvmaseg *dvs; + ddi_dma_cookie_t *cookie; +#endif ddi_dma_attr_t *attr; ddi_dma_impl_t *hp; rootnex_dma_t *dma; int kmflag; int e; + uint_t ncookies; hp = (ddi_dma_impl_t *)handle; dma = (rootnex_dma_t *)hp->dmai_private; + dmao = &dma->dp_dma; sinfo = &dma->dp_sglinfo; attr = &hp->dmai_attr; + /* convert the sleep flags */ if (dmareq->dmar_fp == DDI_DMA_SLEEP) { - dma->dp_sleep_flags = KM_SLEEP; + dma->dp_sleep_flags = kmflag = KM_SLEEP; } else { - dma->dp_sleep_flags = KM_NOSLEEP; + dma->dp_sleep_flags = kmflag = KM_NOSLEEP; } hp->dmai_rflags = dmareq->dmar_flags & DMP_DDIFLAGS; @@ -1953,12 +2009,12 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, */ e = mutex_tryenter(&dma->dp_mutex); if (e == 0) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); return (DDI_DMA_INUSE); } if (dma->dp_inuse) { mutex_exit(&dma->dp_mutex); - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); return (DDI_DMA_INUSE); } dma->dp_inuse = B_TRUE; @@ -1969,7 +2025,7 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, if (rootnex_bind_check_parms) { e = rootnex_valid_bind_parms(dmareq, attr); if (e != DDI_SUCCESS) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); rootnex_clean_dmahdl(hp); return (e); } @@ -1979,30 +2035,72 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, dma->dp_dma = dmareq->dmar_object; #if defined(__amd64) && !defined(__xpv) - e = immu_map_sgl(hp, dmareq, rootnex_prealloc_cookies, rdip); - switch (e) { - case DDI_DMA_MAPPED: - goto out; - case DDI_DMA_USE_PHYSICAL: - break; - case DDI_DMA_PARTIAL: - ddi_err(DER_PANIC, rdip, "Partial DVMA map"); - e = DDI_DMA_NORESOURCES; - /*FALLTHROUGH*/ - default: - ddi_err(DER_MODE, rdip, "DVMA map failed"); - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); - rootnex_clean_dmahdl(hp); - return (e); + if (IOMMU_USED(rdip)) { + dmao = &dma->dp_dvma; + e = iommulib_nexdma_mapobject(dip, rdip, handle, dmareq, dmao); + switch (e) { + case DDI_SUCCESS: + if (sinfo->si_cancross || + dmao->dmao_obj.dvma_obj.dv_nseg != 1 || + dmao->dmao_size > sinfo->si_max_cookie_size) { + dma->dp_dvma_used = B_TRUE; + 
break; + } + sinfo->si_sgl_size = 1; + hp->dmai_rflags |= DMP_NOSYNC; + + dma->dp_dvma_used = B_TRUE; + dma->dp_need_to_free_cookie = B_FALSE; + + dvs = &dmao->dmao_obj.dvma_obj.dv_seg[0]; + cookie = hp->dmai_cookie = dma->dp_cookies = + (ddi_dma_cookie_t *)dma->dp_prealloc_buffer; + cookie->dmac_laddress = dvs->dvs_start + + dmao->dmao_obj.dvma_obj.dv_off; + cookie->dmac_size = dvs->dvs_len; + cookie->dmac_type = 0; + + ROOTNEX_DPROBE1(rootnex__bind__dvmafast, dev_info_t *, + rdip); + goto fast; + case DDI_ENOTSUP: + break; + default: + rootnex_clean_dmahdl(hp); + return (e); + } } #endif /* - * Figure out a rough estimate of what maximum number of pages this - * buffer could use (a high estimate of course). + * Figure out a rough estimate of what maximum number of pages + * this buffer could use (a high estimate of course). */ sinfo->si_max_pages = mmu_btopr(dma->dp_dma.dmao_size) + 1; + if (dma->dp_dvma_used) { + /* + * The number of physical pages is the worst case. + * + * For DVMA, the worst case is the length divided + * by the maximum cookie length, plus 1. Add to that + * the number of segment boundaries potentially crossed, and + * the additional number of DVMA segments that was returned. + * + * In the normal case, for modern devices, si_cancross will + * be false, and dv_nseg will be 1, and the fast path will + * have been taken above. + */ + ncookies = (dma->dp_dma.dmao_size / sinfo->si_max_cookie_size) + + 1; + if (sinfo->si_cancross) + ncookies += + (dma->dp_dma.dmao_size / attr->dma_attr_seg) + 1; + ncookies += (dmao->dmao_obj.dvma_obj.dv_nseg - 1); + + sinfo->si_max_pages = MIN(sinfo->si_max_pages, ncookies); + } + /* * We'll use the pre-allocated cookies for any bind that will *always* * fit (more important to be consistent, we don't want to create @@ -2011,7 +2109,7 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, if (sinfo->si_max_pages <= rootnex_state->r_prealloc_cookies) { dma->dp_cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer; dma->dp_need_to_free_cookie = B_FALSE; - DTRACE_PROBE2(rootnex__bind__prealloc, dev_info_t *, rdip, + ROOTNEX_DPROBE2(rootnex__bind__prealloc, dev_info_t *, rdip, uint_t, sinfo->si_max_pages); /* @@ -2024,13 +2122,6 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, * the bind interface would speed this case up. */ } else { - /* convert the sleep flags */ - if (dmareq->dmar_fp == DDI_DMA_SLEEP) { - kmflag = KM_SLEEP; - } else { - kmflag = KM_NOSLEEP; - } - /* * Save away how much memory we allocated. If we're doing a * nosleep, the alloc could fail... @@ -2039,13 +2130,13 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, sizeof (ddi_dma_cookie_t); dma->dp_cookies = kmem_alloc(dma->dp_cookie_size, kmflag); if (dma->dp_cookies == NULL) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); rootnex_clean_dmahdl(hp); return (DDI_DMA_NORESOURCES); } dma->dp_need_to_free_cookie = B_TRUE; - DTRACE_PROBE2(rootnex__bind__alloc, dev_info_t *, rdip, uint_t, - sinfo->si_max_pages); + ROOTNEX_DPROBE2(rootnex__bind__alloc, dev_info_t *, rdip, + uint_t, sinfo->si_max_pages); } hp->dmai_cookie = dma->dp_cookies; @@ -2056,8 +2147,10 @@ rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip, * the sgl clean, or do we need to do some munging; how many pages * need to be copied, etc.) 
*/ - rootnex_get_sgl(&dmareq->dmar_object, dma->dp_cookies, - &dma->dp_sglinfo); + if (dma->dp_dvma_used) + rootnex_dvma_get_sgl(dmao, dma->dp_cookies, &dma->dp_sglinfo); + else + rootnex_get_sgl(dmao, dma->dp_cookies, &dma->dp_sglinfo); out: ASSERT(sinfo->si_sgl_size <= sinfo->si_max_pages); @@ -2076,7 +2169,8 @@ out: */ if ((sinfo->si_copybuf_req == 0) && (sinfo->si_sgl_size <= attr->dma_attr_sgllen) && - (dma->dp_dma.dmao_size < dma->dp_maxxfer)) { + (dmao->dmao_size < dma->dp_maxxfer)) { +fast: /* * If the driver supports FMA, insert the handle in the FMA DMA * handle cache. @@ -2094,10 +2188,10 @@ out: *ccountp = sinfo->si_sgl_size; hp->dmai_cookie++; hp->dmai_rflags &= ~DDI_DMA_PARTIAL; - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]); - DTRACE_PROBE3(rootnex__bind__fast, dev_info_t *, rdip, + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]); + ROOTNEX_DPROBE4(rootnex__bind__fast, dev_info_t *, rdip, uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS], - uint_t, dma->dp_dma.dmao_size); + uint_t, dmao->dmao_size, uint_t, *ccountp); return (DDI_DMA_MAPPED); @@ -2107,12 +2201,26 @@ out: * go to the slow path, we may need to alloc more memory, create * multiple windows, and munge up a sgl to make the device happy. */ - e = rootnex_bind_slowpath(hp, dmareq, dma, attr, kmflag); + + /* + * With the IOMMU mapobject method used, we should never hit + * the slow path. If we do, something is seriously wrong. + * Clean up and return an error. + */ + + if (dma->dp_dvma_used) { + (void) iommulib_nexdma_unmapobject(dip, rdip, handle, + &dma->dp_dvma); + e = DDI_DMA_NOMAPPING; + } else { + e = rootnex_bind_slowpath(hp, dmareq, dma, attr, &dma->dp_dma, + kmflag); + } if ((e != DDI_DMA_MAPPED) && (e != DDI_DMA_PARTIAL_MAP)) { if (dma->dp_need_to_free_cookie) { kmem_free(dma->dp_cookies, dma->dp_cookie_size); } - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]); rootnex_clean_dmahdl(hp); /* must be after free cookie */ return (e); } @@ -2150,9 +2258,9 @@ out: hp->dmai_cookie++; ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]); - ROOTNEX_DPROBE3(rootnex__bind__slow, dev_info_t *, rdip, uint64_t, + ROOTNEX_DPROBE4(rootnex__bind__slow, dev_info_t *, rdip, uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS], uint_t, - dma->dp_dma.dmao_size); + dmao->dmao_size, uint_t, *ccountp); return (e); } @@ -2165,14 +2273,22 @@ rootnex_dma_bindhdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, struct ddi_dma_req *dmareq, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { + int ret; #if defined(__amd64) && !defined(__xpv) - if (IOMMU_USED(rdip)) { - return (iommulib_nexdma_bindhdl(dip, rdip, handle, dmareq, - cookiep, ccountp)); - } + if (IOMMU_USED(rdip)) + ret = iommulib_nexdma_bindhdl(dip, rdip, handle, dmareq, + cookiep, ccountp); + else #endif - return (rootnex_coredma_bindhdl(dip, rdip, handle, dmareq, - cookiep, ccountp)); + ret = rootnex_coredma_bindhdl(dip, rdip, handle, dmareq, + cookiep, ccountp); + + if (ret == DDI_DMA_NORESOURCES && dmareq->dmar_fp != DDI_DMA_DONTWAIT) { + ddi_set_callback(dmareq->dmar_fp, dmareq->dmar_arg, + &rootnex_state->r_dvma_call_list_id); + } + + return (ret); } @@ -2212,15 +2328,9 @@ rootnex_coredma_unbindhdl(dev_info_t *dip, dev_info_t *rdip, rootnex_teardown_copybuf(dma); rootnex_teardown_windows(dma); -#if defined(__amd64) && !defined(__xpv) - /* - * Clean up the page tables and free the dvma - */ - e = immu_unmap_sgl(hp, rdip); - if (e != DDI_DMA_USE_PHYSICAL && e != DDI_SUCCESS) { - 
return (e); - } -#endif + if (IOMMU_USED(rdip)) + (void) iommulib_nexdma_unmapobject(dip, rdip, handle, + &dma->dp_dvma); /* * If we had to allocate space to for the worse case sgl (it didn't @@ -2237,9 +2347,6 @@ rootnex_coredma_unbindhdl(dev_info_t *dip, dev_info_t *rdip, rootnex_clean_dmahdl(hp); hp->dmai_error.err_cf = NULL; - if (rootnex_state->r_dvma_call_list_id) - ddi_run_callback(&rootnex_state->r_dvma_call_list_id); - ROOTNEX_DPROF_DEC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]); ROOTNEX_DPROBE1(rootnex__unbind, uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]); @@ -2256,12 +2363,19 @@ static int rootnex_dma_unbindhdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle) { + int ret; + #if defined(__amd64) && !defined(__xpv) - if (IOMMU_USED(rdip)) { - return (iommulib_nexdma_unbindhdl(dip, rdip, handle)); - } + if (IOMMU_USED(rdip)) + ret = iommulib_nexdma_unbindhdl(dip, rdip, handle); + else #endif - return (rootnex_coredma_unbindhdl(dip, rdip, handle)); + ret = rootnex_coredma_unbindhdl(dip, rdip, handle); + + if (rootnex_state->r_dvma_call_list_id) + ddi_run_callback(&rootnex_state->r_dvma_call_list_id); + + return (ret); } #if defined(__amd64) && !defined(__xpv) @@ -2417,6 +2531,25 @@ rootnex_coredma_clear_cookies(dev_info_t *dip, ddi_dma_handle_t handle) #endif +static struct as * +rootnex_get_as(ddi_dma_obj_t *dmao) +{ + struct as *asp; + + switch (dmao->dmao_type) { + case DMA_OTYP_VADDR: + case DMA_OTYP_BUFVADDR: + asp = dmao->dmao_obj.virt_obj.v_as; + if (asp == NULL) + asp = &kas; + break; + default: + asp = NULL; + break; + } + return (asp); +} + /* * rootnex_verify_buffer() * verify buffer wasn't free'd @@ -2472,7 +2605,7 @@ rootnex_verify_buffer(rootnex_dma_t *dma) /* For a virtual address, try to peek at each page */ } else { - if (dma->dp_sglinfo.si_asp == &kas) { + if (rootnex_get_as(&dma->dp_dma) == &kas) { for (i = 0; i < pcnt; i++) { if (ddi_peek8(NULL, vaddr, &b) == DDI_FAILURE) @@ -2484,7 +2617,7 @@ rootnex_verify_buffer(rootnex_dma_t *dma) break; default: - ASSERT(0); + cmn_err(CE_PANIC, "rootnex_verify_buffer: bad DMA object"); break; } @@ -2511,6 +2644,7 @@ rootnex_clean_dmahdl(ddi_dma_impl_t *hp) dma->dp_window = NULL; dma->dp_cbaddr = NULL; dma->dp_inuse = B_FALSE; + dma->dp_dvma_used = B_FALSE; dma->dp_need_to_free_cookie = B_FALSE; dma->dp_need_to_switch_cookies = B_FALSE; dma->dp_saved_cookies = NULL; @@ -2660,15 +2794,7 @@ rootnex_need_bounce_seg(ddi_dma_obj_t *dmar_object, rootnex_sglinfo_t *sglinfo) vaddr += psize; } -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MFNs instead of PFNs. - */ - raddr = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - raddr = paddr; -#endif + raddr = ROOTNEX_PADDR_TO_RBASE(paddr); if ((raddr + psize) > sglinfo->si_segmask) { upper_addr = B_TRUE; @@ -2702,15 +2828,7 @@ rootnex_need_bounce_seg(ddi_dma_obj_t *dmar_object, rootnex_sglinfo_t *sglinfo) vaddr += psize; } -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MFNs instead of PFNs. - */ - raddr = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - raddr = paddr; -#endif + raddr = ROOTNEX_PADDR_TO_RBASE(paddr); if ((raddr + psize) > sglinfo->si_segmask) { upper_addr = B_TRUE; @@ -2734,7 +2852,6 @@ rootnex_need_bounce_seg(ddi_dma_obj_t *dmar_object, rootnex_sglinfo_t *sglinfo) return (B_FALSE); } - /* * rootnex_get_sgl() * Called in bind fastpath to get the sgl. 
Most of this will be replaced @@ -2839,15 +2956,7 @@ rootnex_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, vaddr += psize; } -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MFNs instead of PFNs. - */ - raddr = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - raddr = paddr; -#endif + raddr = ROOTNEX_PADDR_TO_RBASE(paddr); /* * Setup the first cookie with the physical address of the page and the @@ -2921,15 +3030,7 @@ rootnex_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, vaddr += psize; } -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MFNs instead of PFNs. - */ - raddr = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - raddr = paddr; -#endif + raddr = ROOTNEX_PADDR_TO_RBASE(paddr); /* * If we are using the copy buffer for anything over the @@ -3043,6 +3144,98 @@ rootnex_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, } } +static void +rootnex_dvma_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, + rootnex_sglinfo_t *sglinfo) +{ + uint64_t offset; + uint64_t maxseg; + uint64_t dvaddr; + struct dvmaseg *dvs; + uint64_t paddr; + uint32_t psize, ssize; + uint32_t size; + uint_t cnt; + int physcontig; + + ASSERT(dmar_object->dmao_type == DMA_OTYP_DVADDR); + + /* shortcuts */ + maxseg = sglinfo->si_max_cookie_size; + size = dmar_object->dmao_size; + + cnt = 0; + sglinfo->si_bounce_on_seg = B_FALSE; + + dvs = dmar_object->dmao_obj.dvma_obj.dv_seg; + offset = dmar_object->dmao_obj.dvma_obj.dv_off; + ssize = dvs->dvs_len; + paddr = dvs->dvs_start; + paddr += offset; + psize = MIN(ssize, (maxseg - offset)); + dvaddr = paddr + psize; + ssize -= psize; + + sgl[cnt].dmac_laddress = paddr; + sgl[cnt].dmac_size = psize; + sgl[cnt].dmac_type = 0; + + size -= psize; + while (size > 0) { + if (ssize == 0) { + dvs++; + ssize = dvs->dvs_len; + dvaddr = dvs->dvs_start; + physcontig = 0; + } else + physcontig = 1; + + paddr = dvaddr; + psize = MIN(ssize, maxseg); + dvaddr += psize; + ssize -= psize; + + if (!physcontig || !(paddr & sglinfo->si_segmask) || + ((sgl[cnt].dmac_size + psize) > maxseg) || + (sgl[cnt].dmac_size == 0)) { + /* + * if we're not already in a new cookie, go to the next + * cookie. + */ + if (sgl[cnt].dmac_size != 0) { + cnt++; + } + + /* save the cookie information */ + sgl[cnt].dmac_laddress = paddr; + sgl[cnt].dmac_size = psize; + sgl[cnt].dmac_type = 0; + } else { + sgl[cnt].dmac_size += psize; + + /* + * if this exactly == the maximum cookie size, and + * it isn't the last cookie, go to the next cookie. 
+ */ + if (((sgl[cnt].dmac_size + psize) == maxseg) && + ((cnt + 1) < sglinfo->si_max_pages)) { + cnt++; + sgl[cnt].dmac_laddress = 0; + sgl[cnt].dmac_size = 0; + sgl[cnt].dmac_type = 0; + } + } + size -= psize; + } + + /* we're done, save away how many cookies the sgl has */ + if (sgl[cnt].dmac_size == 0) { + sglinfo->si_sgl_size = cnt; + } else { + sglinfo->si_sgl_size = cnt + 1; + } +} + /* * rootnex_bind_slowpath() * Call in the bind path if the calling driver can't use the sgl without @@ -3051,7 +3244,7 @@ rootnex_get_sgl(ddi_dma_obj_t *dmar_object, ddi_dma_cookie_t *sgl, */ static int rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, - rootnex_dma_t *dma, ddi_dma_attr_t *attr, int kmflag) + rootnex_dma_t *dma, ddi_dma_attr_t *attr, ddi_dma_obj_t *dmao, int kmflag) { rootnex_sglinfo_t *sinfo; rootnex_window_t *window; @@ -3088,7 +3281,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, * if we need to trim the buffers when we munge the sgl. */ if ((dma->dp_copybuf_size < sinfo->si_copybuf_req) || - (dma->dp_dma.dmao_size > dma->dp_maxxfer) || + (dmao->dmao_size > dma->dp_maxxfer) || (attr->dma_attr_sgllen < sinfo->si_sgl_size)) { dma->dp_partial_required = B_TRUE; if (attr->dma_attr_granular != 1) { @@ -3127,7 +3320,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, * we might need multiple windows, setup state to handle them. In this * code path, we will have at least one window. */ - e = rootnex_setup_windows(hp, dma, attr, kmflag); + e = rootnex_setup_windows(hp, dma, attr, dmao, kmflag); if (e != DDI_SUCCESS) { rootnex_teardown_copybuf(dma); return (e); @@ -3137,7 +3330,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, cookie = &dma->dp_cookies[0]; cur_offset = 0; rootnex_init_win(hp, dma, window, cookie, cur_offset); - if (dmareq->dmar_object.dmao_type == DMA_OTYP_PAGES) { + if (dmao->dmao_type == DMA_OTYP_PAGES) { cur_pp = dmareq->dmar_object.dmao_obj.pp_obj.pp_pp; } @@ -3149,7 +3342,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, * copy buffer, make sure we sync this window during dma_sync. 
*/ if (dma->dp_copybuf_size > 0) { - rootnex_setup_cookie(&dmareq->dmar_object, dma, cookie, + rootnex_setup_cookie(dmao, dma, cookie, cur_offset, &copybuf_used, &cur_pp); if (cookie->dmac_type & ROOTNEX_USES_COPYBUF) { window->wd_dosync = B_TRUE; @@ -3181,7 +3374,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, if (cookie->dmac_type & ROOTNEX_USES_COPYBUF) { window->wd_dosync = B_TRUE; } - DTRACE_PROBE1(rootnex__copybuf__window, dev_info_t *, + ROOTNEX_DPROBE1(rootnex__copybuf__window, dev_info_t *, dma->dp_dip); /* if the cookie cnt == max sgllen, move to the next window */ @@ -3203,7 +3396,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, if (cookie->dmac_type & ROOTNEX_USES_COPYBUF) { window->wd_dosync = B_TRUE; } - DTRACE_PROBE1(rootnex__sgllen__window, dev_info_t *, + ROOTNEX_DPROBE1(rootnex__sgllen__window, dev_info_t *, dma->dp_dip); /* else if we will be over maxxfer */ @@ -3225,7 +3418,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, if (cookie->dmac_type & ROOTNEX_USES_COPYBUF) { window->wd_dosync = B_TRUE; } - DTRACE_PROBE1(rootnex__maxxfer__window, dev_info_t *, + ROOTNEX_DPROBE1(rootnex__maxxfer__window, dev_info_t *, dma->dp_dip); /* else this cookie fits in the current window */ @@ -3235,7 +3428,7 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, } /* track our offset into the buffer, go to the next cookie */ - ASSERT(dmac_size <= dma->dp_dma.dmao_size); + ASSERT(dmac_size <= dmao->dmao_size); ASSERT(cookie->dmac_size <= dmac_size); cur_offset += dmac_size; cookie++; @@ -3257,7 +3450,6 @@ rootnex_bind_slowpath(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, return (DDI_DMA_PARTIAL_MAP); } - /* * rootnex_setup_copybuf() * Called in bind slowpath.
Figures out if we're going to use the copy @@ -3276,6 +3468,7 @@ rootnex_setup_copybuf(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, int vmflag; #endif + ASSERT(!dma->dp_dvma_used); sinfo = &dma->dp_sglinfo; @@ -3353,7 +3546,7 @@ rootnex_setup_copybuf(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, return (DDI_DMA_NORESOURCES); } - DTRACE_PROBE2(rootnex__alloc__copybuf, dev_info_t *, dma->dp_dip, + ROOTNEX_DPROBE2(rootnex__alloc__copybuf, dev_info_t *, dma->dp_dip, size_t, dma->dp_copybuf_size); return (DDI_SUCCESS); @@ -3367,7 +3560,7 @@ rootnex_setup_copybuf(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, */ static int rootnex_setup_windows(ddi_dma_impl_t *hp, rootnex_dma_t *dma, - ddi_dma_attr_t *attr, int kmflag) + ddi_dma_attr_t *attr, ddi_dma_obj_t *dmao, int kmflag) { rootnex_window_t *windowp; rootnex_sglinfo_t *sinfo; @@ -3436,8 +3629,8 @@ rootnex_setup_windows(ddi_dma_impl_t *hp, rootnex_dma_t *dma, * for remainder, and plus 2 to handle the extra pages on the * trim (see above comment about trim) */ - if (dma->dp_dma.dmao_size > dma->dp_maxxfer) { - maxxfer_win = (dma->dp_dma.dmao_size / + if (dmao->dmao_size > dma->dp_maxxfer) { + maxxfer_win = (dmao->dmao_size / dma->dp_maxxfer) + 1 + 2; } else { maxxfer_win = 0; @@ -3509,7 +3702,7 @@ rootnex_setup_windows(ddi_dma_impl_t *hp, rootnex_dma_t *dma, } dma->dp_need_to_free_window = B_TRUE; dma->dp_window_size = space_needed; - DTRACE_PROBE2(rootnex__bind__sp__alloc, dev_info_t *, + ROOTNEX_DPROBE2(rootnex__bind__sp__alloc, dev_info_t *, dma->dp_dip, size_t, space_needed); } @@ -3639,6 +3832,8 @@ rootnex_setup_cookie(ddi_dma_obj_t *dmar_object, rootnex_dma_t *dma, page_t **pplist; #endif + ASSERT(dmar_object->dmao_type != DMA_OTYP_DVADDR); + sinfo = &dma->dp_sglinfo; /* @@ -3706,15 +3901,7 @@ rootnex_setup_cookie(ddi_dma_obj_t *dmar_object, rootnex_dma_t *dma, paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, dma->dp_pgmap[pidx].pm_cbaddr)) + poff; -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MAs instead of PAs. - */ - cookie->dmac_laddress = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - cookie->dmac_laddress = paddr; -#endif + cookie->dmac_laddress = ROOTNEX_PADDR_TO_RBASE(paddr); /* if we have a kernel VA, it's easy, just save that address */ if ((dmar_object->dmao_type != DMA_OTYP_PAGES) && @@ -4186,16 +4373,8 @@ rootnex_copybuf_window_boundary(ddi_dma_impl_t *hp, rootnex_dma_t *dma, paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, dma->dp_cbaddr)) + poff; -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MAs instead of PAs. - */ (*windowp)->wd_trim.tr_first_paddr = - ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - (*windowp)->wd_trim.tr_first_paddr = paddr; -#endif + ROOTNEX_PADDR_TO_RBASE(paddr); #if !defined(__amd64) (*windowp)->wd_trim.tr_first_kaddr = dma->dp_kva; @@ -4230,15 +4409,7 @@ rootnex_copybuf_window_boundary(ddi_dma_impl_t *hp, rootnex_dma_t *dma, paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, dma->dp_pgmap[pidx + 1].pm_cbaddr)) + poff; -#ifdef __xpv - /* - * If we're dom0, we're using a real device so we need to load - * the cookies with MAs instead of PAs. 
- */ - cookie->dmac_laddress = ROOTNEX_PADDR_TO_RBASE(xen_info, paddr); -#else - cookie->dmac_laddress = paddr; -#endif + cookie->dmac_laddress = ROOTNEX_PADDR_TO_RBASE(paddr); #if !defined(__amd64) ASSERT(dma->dp_pgmap[pidx + 1].pm_mapped == B_FALSE); @@ -4392,7 +4563,7 @@ rootnex_coredma_sync(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, e = rootnex_valid_sync_parms(hp, win, offset, size, cache_flags); if (e != DDI_SUCCESS) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_SYNC_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_SYNC_FAIL]); return (DDI_FAILURE); } } @@ -4439,7 +4610,7 @@ rootnex_coredma_sync(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, if (cache_flags == DDI_DMA_SYNC_FORDEV) { fromaddr = cbpage->pm_kaddr + poff; toaddr = cbpage->pm_cbaddr + poff; - DTRACE_PROBE2(rootnex__sync__dev, + ROOTNEX_DPROBE2(rootnex__sync__dev, dev_info_t *, dma->dp_dip, size_t, psize); /* @@ -4450,7 +4621,7 @@ rootnex_coredma_sync(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, } else { fromaddr = cbpage->pm_cbaddr + poff; toaddr = cbpage->pm_kaddr + poff; - DTRACE_PROBE2(rootnex__sync__cpu, + ROOTNEX_DPROBE2(rootnex__sync__cpu, dev_info_t *, dma->dp_dip, size_t, psize); } @@ -4554,6 +4725,7 @@ rootnex_coredma_win(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, rootnex_trim_t *trim; ddi_dma_impl_t *hp; rootnex_dma_t *dma; + ddi_dma_obj_t *dmao; #if !defined(__amd64) rootnex_sglinfo_t *sinfo; rootnex_pgmap_t *pmap; @@ -4572,10 +4744,12 @@ rootnex_coredma_win(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, /* If we try and get a window which doesn't exist, return failure */ if (win >= hp->dmai_nwin) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_GETWIN_FAIL]); + ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_GETWIN_FAIL]); return (DDI_FAILURE); } + dmao = dma->dp_dvma_used ? &dma->dp_dma : &dma->dp_dvma; + /* * if we don't have any windows, and they're asking for the first * window, setup the cookie pointer to the first cookie in the bind. 
@@ -4584,12 +4758,13 @@ rootnex_coredma_win(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, */ if (dma->dp_window == NULL) { if (win != 0) { - ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_GETWIN_FAIL]); + ROOTNEX_DPROF_INC( + &rootnex_cnt[ROOTNEX_CNT_GETWIN_FAIL]); return (DDI_FAILURE); } hp->dmai_cookie = dma->dp_cookies; *offp = 0; - *lenp = dma->dp_dma.dmao_size; + *lenp = dmao->dmao_size; *ccountp = dma->dp_sglinfo.si_sgl_size; *cookiep = hp->dmai_cookie[0]; hp->dmai_cookie++; @@ -4781,6 +4956,37 @@ rootnex_dma_win(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle, cookiep, ccountp)); } +#if defined(__amd64) && !defined(__xpv) +/*ARGSUSED*/ +static int +rootnex_coredma_hdl_setprivate(dev_info_t *dip, dev_info_t *rdip, + ddi_dma_handle_t handle, void *v) +{ + ddi_dma_impl_t *hp; + rootnex_dma_t *dma; + + hp = (ddi_dma_impl_t *)handle; + dma = (rootnex_dma_t *)hp->dmai_private; + dma->dp_iommu_private = v; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void * +rootnex_coredma_hdl_getprivate(dev_info_t *dip, dev_info_t *rdip, + ddi_dma_handle_t handle) +{ + ddi_dma_impl_t *hp; + rootnex_dma_t *dma; + + hp = (ddi_dma_impl_t *)handle; + dma = (rootnex_dma_t *)hp->dmai_private; + + return (dma->dp_iommu_private); +} +#endif + /* * ************************ * obsoleted dma routines diff --git a/usr/src/uts/i86pc/ml/cpr_wakecode.s b/usr/src/uts/i86pc/ml/cpr_wakecode.s index 917ce412aa..fc58cd5d2a 100644 --- a/usr/src/uts/i86pc/ml/cpr_wakecode.s +++ b/usr/src/uts/i86pc/ml/cpr_wakecode.s @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/asm_linkage.h> @@ -673,9 +672,8 @@ kernel_wc_code: * Before proceeding, enable usage of the page table NX bit if * that's how the page tables are set up. */ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset(%rip) + jnc 1f movl $MSR_AMD_EFER, %ecx rdmsr orl $AMD_EFER_NXE, %eax @@ -1092,9 +1090,8 @@ kernel_wc_code: * Before proceeding, enable usage of the page table NX bit if * that's how the page tables are set up. */ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset + jnc 1f movl $MSR_AMD_EFER, %ecx rdmsr orl $AMD_EFER_NXE, %eax diff --git a/usr/src/uts/i86pc/ml/genassym.c b/usr/src/uts/i86pc/ml/genassym.c index 4836628401..a34ca50669 100644 --- a/usr/src/uts/i86pc/ml/genassym.c +++ b/usr/src/uts/i86pc/ml/genassym.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _GENASSYM @@ -123,6 +122,10 @@ main(int argc, char *argv[]) printf("#define\tFP_387 0x%x\n", FP_387); printf("#define\t__FP_SSE 0x%x\n", __FP_SSE); + printf("#define\tFP_FNSAVE 0x%x\n", FP_FNSAVE); + printf("#define\tFP_FXSAVE 0x%x\n", FP_FXSAVE); + printf("#define\tFP_XSAVE 0x%x\n", FP_XSAVE); + printf("#define\tAV_INT_SPURIOUS 0x%x\n", AV_INT_SPURIOUS); printf("#define\tCPU_READY 0x%x\n", CPU_READY); diff --git a/usr/src/uts/i86pc/ml/interrupt.s b/usr/src/uts/i86pc/ml/interrupt.s index 97f5acba2d..46cbf2f308 100644 --- a/usr/src/uts/i86pc/ml/interrupt.s +++ b/usr/src/uts/i86pc/ml/interrupt.s @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s index db016a55db..8aec1537e5 100644 --- a/usr/src/uts/i86pc/ml/locore.s +++ b/usr/src/uts/i86pc/ml/locore.s @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -208,7 +207,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop) /* * (We just assert this works by virtue of being here) */ - orl $X86_CPUID, x86_feature(%rip) + bts $X86FSET_CPUID, x86_featureset(%rip) /* * mlsetup() gets called with a struct regs as argument, while @@ -623,7 +622,7 @@ have_cpuid: /* * cpuid instruction present */ - orl $X86_CPUID, x86_feature + bts $X86FSET_CPUID, x86_featureset / Just to set; Ignore the CF movl $0, %eax cpuid @@ -2340,7 +2339,7 @@ cpu_vendor: .globl CyrixInstead - .globl x86_feature + .globl x86_featureset .globl x86_type .globl x86_vendor #endif diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s index 25a943870e..3ff8aa9d6d 100644 --- a/usr/src/uts/i86pc/ml/mpcore.s +++ b/usr/src/uts/i86pc/ml/mpcore.s @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -316,9 +315,8 @@ kernel_cs_code: * Before going any further, enable usage of page table NX bit if * that's how our page tables are set up. */ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset(%rip) + jnc 1f movl $MSR_AMD_EFER, %ecx rdmsr orl $AMD_EFER_NXE, %eax @@ -570,9 +568,8 @@ kernel_cs_code: * Before going any further, enable usage of page table NX bit if * that's how our page tables are set up. */ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset(%rip) + jnc 1f movl $MSR_AMD_EFER, %ecx rdmsr orl $AMD_EFER_NXE, %eax @@ -671,9 +668,8 @@ kernel_cs_code: * Before going any further, enable usage of page table NX bit if * that's how our page tables are set up. */ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset + jnc 1f movl %cr4, %ecx andl $CR4_PAE, %ecx jz 1f @@ -763,9 +759,8 @@ kernel_cs_code: * Before going any farther, enable usage of page table NX bit if * that's how our page tables are set up. 
*/ - movl x86_feature, %ecx - andl $X86_NX, %ecx - jz 1f + bt $X86FSET_NX, x86_featureset + jnc 1f movl %cr4, %ecx andl $CR4_PAE, %ecx jz 1f diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index ceefce6d3c..20e0c972d4 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -165,6 +165,7 @@ _klwp fpu_ctx fpu_regs FPU_CTX_FPU_REGS fpu_flags FPU_CTX_FPU_FLAGS + fpu_xsave_mask FPU_CTX_FPU_XSAVE_MASK fxsave_state FXSAVE_STATE_SIZE fx_fsw FXSAVE_STATE_FSW @@ -293,7 +294,6 @@ rm_platter rm_pdbr CR3OFF rm_cpu CPUNOFF rm_cr4 CR4OFF - rm_x86feature X86FEATURE rm_cpu_halt_code CPUHALTCODEOFF rm_cpu_halted CPUHALTEDOFF diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c index 103955a097..555ed9f842 100644 --- a/usr/src/uts/i86pc/os/cpr_impl.c +++ b/usr/src/uts/i86pc/os/cpr_impl.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -66,6 +65,7 @@ #include <sys/reboot.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> +#include <sys/fp.h> #define AFMT "%lx" @@ -876,8 +876,6 @@ init_real_mode_platter(int cpun, uint32_t offset, uint_t cr4, wc_desctbr_t gdt) real_mode_platter->rm_gdt_lim = gdt.limit; #if defined(__amd64) - real_mode_platter->rm_x86feature = x86_feature; - if (getcr3() > 0xffffffffUL) panic("Cannot initialize CPUs; kernel's 64-bit page tables\n" "located above 4G in physical memory (@ 0x%llx).", @@ -943,10 +941,17 @@ i_cpr_start_cpu(void) * We need to Sync PAT with cpu0's PAT. We have to do * this with interrupts disabled. */ - if (x86_feature & X86_PAT) + if (is_x86_feature(x86_featureset, X86FSET_PAT)) pat_sync(); /* + * If we use XSAVE, we need to restore XFEATURE_ENABLE_MASK register. + */ + if (fp_save_mech == FP_XSAVE) { + setup_xfem(); + } + + /* * Initialize this CPU's syscall handlers */ init_cpu_syscall(cp); @@ -994,7 +999,7 @@ i_cpr_start_cpu(void) * cmi already been init'd (during boot), so do not need to do it again */ #ifdef PM_REINITMCAONRESUME - if (x86_feature & X86_MCA) + if (is_x86_feature(x86_featureset, X86FSET_MCA)) cmi_mca_init(); #endif diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 4a86da8c3b..1a472c9e93 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -22,7 +22,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * All rights reserved. */ /* @@ -48,8 +48,8 @@ #include <sys/pg.h> #include <sys/fp.h> #include <sys/controlregs.h> -#include <sys/auxv_386.h> #include <sys/bitmap.h> +#include <sys/auxv_386.h> #include <sys/memnode.h> #include <sys/pci_cfgspace.h> @@ -67,7 +67,7 @@ * * Pass 1 of cpuid feature analysis happens just at the beginning of mlsetup() * for the boot CPU and does the basic analysis that the early kernel needs. - * x86_feature is set based on the return value of cpuid_pass1() of the boot + * x86_featureset is set based on the return value of cpuid_pass1() of the boot * CPU. * * Pass 1 includes: @@ -111,7 +111,6 @@ * to the accessor code. 
*/ -uint_t x86_feature = 0; uint_t x86_vendor = X86_VENDOR_IntelClone; uint_t x86_type = X86_TYPE_OTHER; uint_t x86_clflush_size = 0; @@ -119,7 +118,98 @@ uint_t x86_clflush_size = 0; uint_t pentiumpro_bug4046376; uint_t pentiumpro_bug4064495; +uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; + +static char *x86_feature_names[NUM_X86_FEATURES] = { + "lgpg", + "tsc", + "msr", + "mtrr", + "pge", + "de", + "cmov", + "mmx", + "mca", + "pae", + "cv8", + "pat", + "sep", + "sse", + "sse2", + "htt", + "asysc", + "nx", + "sse3", + "cx16", + "cmp", + "tscp", + "mwait", + "sse4a", + "cpuid", + "ssse3", + "sse4_1", + "sse4_2", + "1gpg", + "clfsh", + "64", + "aes", + "pclmulqdq", + "xsave", + "avx" }; + +boolean_t +is_x86_feature(void *featureset, uint_t feature) +{ + ASSERT(feature < NUM_X86_FEATURES); + return (BT_TEST((ulong_t *)featureset, feature)); +} + +void +add_x86_feature(void *featureset, uint_t feature) +{ + ASSERT(feature < NUM_X86_FEATURES); + BT_SET((ulong_t *)featureset, feature); +} + +void +remove_x86_feature(void *featureset, uint_t feature) +{ + ASSERT(feature < NUM_X86_FEATURES); + BT_CLEAR((ulong_t *)featureset, feature); +} + +boolean_t +compare_x86_featureset(void *setA, void *setB) +{ + /* + * We assume that the unused bits of the bitmap are always zero. + */ + if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +void +print_x86_featureset(void *featureset) +{ + uint_t i; + + for (i = 0; i < NUM_X86_FEATURES; i++) { + if (is_x86_feature(featureset, i)) { + cmn_err(CE_CONT, "?x86_feature: %s\n", + x86_feature_names[i]); + } + } +} + uint_t enable486; + +static size_t xsave_state_size = 0; +uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE); +boolean_t xsave_force_disable = B_FALSE; + /* * This is set to platform type Solaris is running on. */ @@ -150,6 +240,23 @@ struct mwait_info { }; /* + * xsave/xrestor info. + * + * This structure contains HW feature bits and size of the xsave save area. + * Note: the kernel will use the maximum size required for all hardware + * features. It is not optimize for potential memory savings if features at + * the end of the save area are not enabled. + */ +struct xsave_info { + uint32_t xsav_hw_features_low; /* Supported HW features */ + uint32_t xsav_hw_features_high; /* Supported HW features */ + size_t xsav_max_size; /* max size save area for HW features */ + size_t ymm_size; /* AVX: size of ymm save area */ + size_t ymm_offset; /* AVX: offset for ymm save area */ +}; + + +/* * These constants determine how many of the elements of the * cpuid we cache in the cpuid_info data structure; the * remaining elements are accessible via the cpuid instruction. @@ -230,6 +337,8 @@ struct cpuid_info { uint_t cpi_procnodeid; /* AMD: nodeID on HT, Intel: chipid */ uint_t cpi_procnodes_per_pkg; /* AMD: # of nodes in the package */ /* Intel: 1 */ + + struct xsave_info cpi_xsave; /* fn D: xsave/xrestor info */ }; @@ -332,6 +441,12 @@ static struct cpuid_info cpuid_info0; BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state) /* + * XSAVE leaf 0xD enumeration + */ +#define CPUID_LEAFD_2_YMM_OFFSET 576 +#define CPUID_LEAFD_2_YMM_SIZE 256 + +/* * Functions we consume from cpuid_subr.c; don't publish these in a header * file to try and keep people using the expected cpuid_* interfaces.
*/ @@ -542,7 +657,7 @@ is_controldom(void) #endif /* __xpv */ static void -cpuid_intel_getids(cpu_t *cpu, uint_t feature) +cpuid_intel_getids(cpu_t *cpu, void *feature) { uint_t i; uint_t chipid_shift = 0; @@ -555,7 +670,7 @@ cpuid_intel_getids(cpu_t *cpu, uint_t feature) cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift; cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1); - if (feature & X86_CMP) { + if (is_x86_feature(feature, X86FSET_CMP)) { /* * Multi-core (and possibly multi-threaded) * processors. @@ -591,7 +706,7 @@ cpuid_intel_getids(cpu_t *cpu, uint_t feature) coreid_shift++; cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift; cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; - } else if (feature & X86_HTT) { + } else if (is_x86_feature(feature, X86FSET_HTT)) { /* * Single-core multi-threaded processors. */ @@ -718,11 +833,31 @@ cpuid_amd_getids(cpu_t *cpu) } } -uint_t -cpuid_pass1(cpu_t *cpu) +/* + * Setup XFeature_Enabled_Mask register. Required by xsave feature. + */ +void +setup_xfem(void) +{ + uint64_t flags = XFEATURE_LEGACY_FP; + + ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); + + if (is_x86_feature(x86_featureset, X86FSET_SSE)) + flags |= XFEATURE_SSE; + + if (is_x86_feature(x86_featureset, X86FSET_AVX)) + flags |= XFEATURE_AVX; + + set_xcr(XFEATURE_ENABLED_MASK, flags); + + xsave_bv_all = flags; +} + +void +cpuid_pass1(cpu_t *cpu, uchar_t *featureset) { uint32_t mask_ecx, mask_edx; - uint_t feature = X86_CPUID; struct cpuid_info *cpi; struct cpuid_regs *cp; int xcpuid; @@ -730,15 +865,19 @@ cpuid_pass1(cpu_t *cpu) extern int idle_cpu_prefer_mwait; #endif - #if !defined(__xpv) determine_platform(); #endif /* * Space statically allocated for BSP, ensure pointer is set */ - if (cpu->cpu_id == 0 && cpu->cpu_m.mcpu_cpi == NULL) - cpu->cpu_m.mcpu_cpi = &cpuid_info0; + if (cpu->cpu_id == 0) { + if (cpu->cpu_m.mcpu_cpi == NULL) + cpu->cpu_m.mcpu_cpi = &cpuid_info0; + } + + add_x86_feature(featureset, X86FSET_CPUID); + cpi = cpu->cpu_m.mcpu_cpi; ASSERT(cpi != NULL); cp = &cpi->cpi_std[0]; @@ -977,8 +1116,18 @@ cpuid_pass1(cpu_t *cpu) * Do not support MONITOR/MWAIT under a hypervisor */ mask_ecx &= ~CPUID_INTC_ECX_MON; + /* + * Do not support XSAVE under a hypervisor for now + */ + xsave_force_disable = B_TRUE; + #endif /* __xpv */ + if (xsave_force_disable) { + mask_ecx &= ~CPUID_INTC_ECX_XSAVE; + mask_ecx &= ~CPUID_INTC_ECX_AVX; + } + /* * Now we've figured out the masks that determine * which bits we choose to believe, apply the masks @@ -1004,58 +1153,91 @@ cpuid_pass1(cpu_t *cpu) cp->cp_ecx |= cpuid_feature_ecx_include; cp->cp_ecx &= ~cpuid_feature_ecx_exclude; - if (cp->cp_edx & CPUID_INTC_EDX_PSE) - feature |= X86_LARGEPAGE; - if (cp->cp_edx & CPUID_INTC_EDX_TSC) - feature |= X86_TSC; - if (cp->cp_edx & CPUID_INTC_EDX_MSR) - feature |= X86_MSR; - if (cp->cp_edx & CPUID_INTC_EDX_MTRR) - feature |= X86_MTRR; - if (cp->cp_edx & CPUID_INTC_EDX_PGE) - feature |= X86_PGE; - if (cp->cp_edx & CPUID_INTC_EDX_CMOV) - feature |= X86_CMOV; - if (cp->cp_edx & CPUID_INTC_EDX_MMX) - feature |= X86_MMX; + if (cp->cp_edx & CPUID_INTC_EDX_PSE) { + add_x86_feature(featureset, X86FSET_LARGEPAGE); + } + if (cp->cp_edx & CPUID_INTC_EDX_TSC) { + add_x86_feature(featureset, X86FSET_TSC); + } + if (cp->cp_edx & CPUID_INTC_EDX_MSR) { + add_x86_feature(featureset, X86FSET_MSR); + } + if (cp->cp_edx & CPUID_INTC_EDX_MTRR) { + add_x86_feature(featureset, X86FSET_MTRR); + } + if (cp->cp_edx & CPUID_INTC_EDX_PGE) { + add_x86_feature(featureset, X86FSET_PGE); + } + if (cp->cp_edx & 
CPUID_INTC_EDX_CMOV) { + add_x86_feature(featureset, X86FSET_CMOV); + } + if (cp->cp_edx & CPUID_INTC_EDX_MMX) { + add_x86_feature(featureset, X86FSET_MMX); + } if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 && - (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) - feature |= X86_MCA; - if (cp->cp_edx & CPUID_INTC_EDX_PAE) - feature |= X86_PAE; - if (cp->cp_edx & CPUID_INTC_EDX_CX8) - feature |= X86_CX8; - if (cp->cp_ecx & CPUID_INTC_ECX_CX16) - feature |= X86_CX16; - if (cp->cp_edx & CPUID_INTC_EDX_PAT) - feature |= X86_PAT; - if (cp->cp_edx & CPUID_INTC_EDX_SEP) - feature |= X86_SEP; + (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) { + add_x86_feature(featureset, X86FSET_MCA); + } + if (cp->cp_edx & CPUID_INTC_EDX_PAE) { + add_x86_feature(featureset, X86FSET_PAE); + } + if (cp->cp_edx & CPUID_INTC_EDX_CX8) { + add_x86_feature(featureset, X86FSET_CX8); + } + if (cp->cp_ecx & CPUID_INTC_ECX_CX16) { + add_x86_feature(featureset, X86FSET_CX16); + } + if (cp->cp_edx & CPUID_INTC_EDX_PAT) { + add_x86_feature(featureset, X86FSET_PAT); + } + if (cp->cp_edx & CPUID_INTC_EDX_SEP) { + add_x86_feature(featureset, X86FSET_SEP); + } if (cp->cp_edx & CPUID_INTC_EDX_FXSR) { /* * In our implementation, fxsave/fxrstor * are prerequisites before we'll even * try and do SSE things. */ - if (cp->cp_edx & CPUID_INTC_EDX_SSE) - feature |= X86_SSE; - if (cp->cp_edx & CPUID_INTC_EDX_SSE2) - feature |= X86_SSE2; - if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) - feature |= X86_SSE3; + if (cp->cp_edx & CPUID_INTC_EDX_SSE) { + add_x86_feature(featureset, X86FSET_SSE); + } + if (cp->cp_edx & CPUID_INTC_EDX_SSE2) { + add_x86_feature(featureset, X86FSET_SSE2); + } + if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) { + add_x86_feature(featureset, X86FSET_SSE3); + } if (cpi->cpi_vendor == X86_VENDOR_Intel) { - if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) - feature |= X86_SSSE3; - if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) - feature |= X86_SSE4_1; - if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) - feature |= X86_SSE4_2; - if (cp->cp_ecx & CPUID_INTC_ECX_AES) - feature |= X86_AES; + if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) { + add_x86_feature(featureset, X86FSET_SSSE3); + } + if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) { + add_x86_feature(featureset, X86FSET_SSE4_1); + } + if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) { + add_x86_feature(featureset, X86FSET_SSE4_2); + } + if (cp->cp_ecx & CPUID_INTC_ECX_AES) { + add_x86_feature(featureset, X86FSET_AES); + } + if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) { + add_x86_feature(featureset, X86FSET_PCLMULQDQ); + } + + if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) { + add_x86_feature(featureset, X86FSET_XSAVE); + /* We only test AVX when there is XSAVE */ + if (cp->cp_ecx & CPUID_INTC_ECX_AVX) { + add_x86_feature(featureset, + X86FSET_AVX); + } + } } } - if (cp->cp_edx & CPUID_INTC_EDX_DE) - feature |= X86_DE; + if (cp->cp_edx & CPUID_INTC_EDX_DE) { + add_x86_feature(featureset, X86FSET_DE); + } #if !defined(__xpv) if (cp->cp_ecx & CPUID_INTC_ECX_MON) { @@ -1065,7 +1247,7 @@ cpuid_pass1(cpu_t *cpu) */ if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { cpi->cpi_mwait.support |= MWAIT_SUPPORT; - feature |= X86_MWAIT; + add_x86_feature(featureset, X86FSET_MWAIT); } else { extern int idle_cpu_assert_cflush_monitor; @@ -1086,11 +1268,10 @@ cpuid_pass1(cpu_t *cpu) * we only capture this for the bootcpu. 
*/ if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) { - feature |= X86_CLFSH; + add_x86_feature(featureset, X86FSET_CLFSH); x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8); } - - if (feature & X86_PAE) + if (is_x86_feature(featureset, X86FSET_PAE)) cpi->cpi_pabits = 36; /* @@ -1105,7 +1286,7 @@ cpuid_pass1(cpu_t *cpu) if (cp->cp_edx & CPUID_INTC_EDX_HTT) { cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); if (cpi->cpi_ncpu_per_chip > 1) - feature |= X86_HTT; + add_x86_feature(featureset, X86FSET_HTT); } else { cpi->cpi_ncpu_per_chip = 1; } @@ -1180,27 +1361,31 @@ cpuid_pass1(cpu_t *cpu) /* * Compute the additions to the kernel's feature word. */ - if (cp->cp_edx & CPUID_AMD_EDX_NX) - feature |= X86_NX; + if (cp->cp_edx & CPUID_AMD_EDX_NX) { + add_x86_feature(featureset, X86FSET_NX); + } /* * Regardless whether or not we boot 64-bit, * we should have a way to identify whether * the CPU is capable of running 64-bit. */ - if (cp->cp_edx & CPUID_AMD_EDX_LM) - feature |= X86_64; + if (cp->cp_edx & CPUID_AMD_EDX_LM) { + add_x86_feature(featureset, X86FSET_64); + } #if defined(__amd64) /* 1 GB large page - enable only for 64 bit kernel */ - if (cp->cp_edx & CPUID_AMD_EDX_1GPG) - feature |= X86_1GPG; + if (cp->cp_edx & CPUID_AMD_EDX_1GPG) { + add_x86_feature(featureset, X86FSET_1GPG); + } #endif if ((cpi->cpi_vendor == X86_VENDOR_AMD) && (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) && - (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) - feature |= X86_SSE4A; + (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) { + add_x86_feature(featureset, X86FSET_SSE4A); + } /* * If both the HTT and CMP_LGCY bits are set, @@ -1208,10 +1393,10 @@ cpuid_pass1(cpu_t *cpu) * "AMD CPUID Specification" for more details. */ if (cpi->cpi_vendor == X86_VENDOR_AMD && - (feature & X86_HTT) && + is_x86_feature(featureset, X86FSET_HTT) && (cp->cp_ecx & CPUID_AMD_ECX_CMP_LGCY)) { - feature &= ~X86_HTT; - feature |= X86_CMP; + remove_x86_feature(featureset, X86FSET_HTT); + add_x86_feature(featureset, X86FSET_CMP); } #if defined(__amd64) /* @@ -1220,19 +1405,22 @@ cpuid_pass1(cpu_t *cpu) * instead. In the amd64 kernel, things are -way- * better. */ - if (cp->cp_edx & CPUID_AMD_EDX_SYSC) - feature |= X86_ASYSC; + if (cp->cp_edx & CPUID_AMD_EDX_SYSC) { + add_x86_feature(featureset, X86FSET_ASYSC); + } /* * While we're thinking about system calls, note * that AMD processors don't support sysenter * in long mode at all, so don't try to program them. */ - if (x86_vendor == X86_VENDOR_AMD) - feature &= ~X86_SEP; + if (x86_vendor == X86_VENDOR_AMD) { + remove_x86_feature(featureset, X86FSET_SEP); + } #endif - if (cp->cp_edx & CPUID_AMD_EDX_TSCP) - feature |= X86_TSCP; + if (cp->cp_edx & CPUID_AMD_EDX_TSCP) { + add_x86_feature(featureset, X86FSET_TSCP); + } break; default: break; @@ -1327,20 +1515,22 @@ cpuid_pass1(cpu_t *cpu) /* * If more than one core, then this processor is CMP. */ - if (cpi->cpi_ncore_per_chip > 1) - feature |= X86_CMP; + if (cpi->cpi_ncore_per_chip > 1) { + add_x86_feature(featureset, X86FSET_CMP); + } /* * If the number of cores is the same as the number * of CPUs, then we cannot have HyperThreading. 
*/ - if (cpi->cpi_ncpu_per_chip == cpi->cpi_ncore_per_chip) - feature &= ~X86_HTT; + if (cpi->cpi_ncpu_per_chip == cpi->cpi_ncore_per_chip) { + remove_x86_feature(featureset, X86FSET_HTT); + } cpi->cpi_apicid = CPI_APIC_ID(cpi); cpi->cpi_procnodes_per_pkg = 1; - - if ((feature & (X86_HTT | X86_CMP)) == 0) { + if (is_x86_feature(featureset, X86FSET_HTT) == B_FALSE && + is_x86_feature(featureset, X86FSET_CMP) == B_FALSE) { /* * Single-core single-threaded processors. */ @@ -1354,7 +1544,7 @@ cpuid_pass1(cpu_t *cpu) cpi->cpi_procnodeid = cpi->cpi_chipid; } else if (cpi->cpi_ncpu_per_chip > 1) { if (cpi->cpi_vendor == X86_VENDOR_Intel) - cpuid_intel_getids(cpu, feature); + cpuid_intel_getids(cpu, featureset); else if (cpi->cpi_vendor == X86_VENDOR_AMD) cpuid_amd_getids(cpu); else { @@ -1380,7 +1570,6 @@ cpuid_pass1(cpu_t *cpu) pass1_done: cpi->cpi_pass = 1; - return (feature); } /* @@ -1587,6 +1776,92 @@ cpuid_pass2(cpu_t *cpu) cp = NULL; } + /* + * XSAVE enumeration + */ + if (cpi->cpi_maxeax >= 0xD && cpi->cpi_vendor == X86_VENDOR_Intel) { + struct cpuid_regs regs; + boolean_t cpuid_d_valid = B_TRUE; + + cp = &regs; + cp->cp_eax = 0xD; + cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; + + (void) __cpuid_insn(cp); + + /* + * Sanity checks for debug + */ + if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 || + (cp->cp_eax & XFEATURE_SSE) == 0) { + cpuid_d_valid = B_FALSE; + } + + cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax; + cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx; + cpi->cpi_xsave.xsav_max_size = cp->cp_ecx; + + /* + * If the hw supports AVX, get the size and offset in the save + * area for the ymm state. + */ + if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) { + cp->cp_eax = 0xD; + cp->cp_ecx = 2; + cp->cp_edx = cp->cp_ebx = 0; + + (void) __cpuid_insn(cp); + + if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET || + cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) { + cpuid_d_valid = B_FALSE; + } + + cpi->cpi_xsave.ymm_size = cp->cp_eax; + cpi->cpi_xsave.ymm_offset = cp->cp_ebx; + } + + if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { + xsave_state_size = 0; + } else if (cpuid_d_valid) { + xsave_state_size = cpi->cpi_xsave.xsav_max_size; + } else { + /* Broken CPUID 0xD, probably in HVM */ + cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid " + "value: hw_low = %d, hw_high = %d, xsave_size = %d" + ", ymm_size = %d, ymm_offset = %d\n", + cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low, + cpi->cpi_xsave.xsav_hw_features_high, + (int)cpi->cpi_xsave.xsav_max_size, + (int)cpi->cpi_xsave.ymm_size, + (int)cpi->cpi_xsave.ymm_offset); + + if (xsave_state_size != 0) { + /* + * This must be a non-boot CPU. We cannot + * continue, because boot cpu has already + * enabled XSAVE. + */ + ASSERT(cpu->cpu_id != 0); + cmn_err(CE_PANIC, "cpu%d: we have already " + "enabled XSAVE on boot cpu, cannot " + "continue.", cpu->cpu_id); + } else { + /* + * Must be from boot CPU, OK to disable XSAVE.
+ */ + ASSERT(cpu->cpu_id == 0); + remove_x86_feature(x86_featureset, + X86FSET_XSAVE); + remove_x86_feature(x86_featureset, X86FSET_AVX); + CPI_FEATURES_ECX(cpi) &= ~CPUID_INTC_ECX_XSAVE; + CPI_FEATURES_ECX(cpi) &= ~CPUID_INTC_ECX_AVX; + xsave_force_disable = B_TRUE; + } + } + } + + if ((cpi->cpi_xmaxeax & 0x80000000) == 0) goto pass2_done; @@ -1703,7 +1978,7 @@ intel_cpubrand(const struct cpuid_info *cpi) { int i; - if ((x86_feature & X86_CPUID) == 0 || + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) return ("i486"); @@ -1837,7 +2112,7 @@ intel_cpubrand(const struct cpuid_info *cpi) static const char * amd_cpubrand(const struct cpuid_info *cpi) { - if ((x86_feature & X86_CPUID) == 0 || + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || cpi->cpi_maxeax < 1 || cpi->cpi_family < 5) return ("i486 compatible"); @@ -1907,7 +2182,7 @@ amd_cpubrand(const struct cpuid_info *cpi) static const char * cyrix_cpubrand(struct cpuid_info *cpi, uint_t type) { - if ((x86_feature & X86_CPUID) == 0 || + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 || type == X86_TYPE_CYRIX_486) return ("i486 compatible"); @@ -2224,29 +2499,36 @@ cpuid_pass4(cpu_t *cpu) /* * [these require explicit kernel support] */ - if ((x86_feature & X86_SEP) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SEP)) *edx &= ~CPUID_INTC_EDX_SEP; - if ((x86_feature & X86_SSE) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE)) *edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE); - if ((x86_feature & X86_SSE2) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) *edx &= ~CPUID_INTC_EDX_SSE2; - if ((x86_feature & X86_HTT) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_HTT)) *edx &= ~CPUID_INTC_EDX_HTT; - if ((x86_feature & X86_SSE3) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE3)) *ecx &= ~CPUID_INTC_ECX_SSE3; if (cpi->cpi_vendor == X86_VENDOR_Intel) { - if ((x86_feature & X86_SSSE3) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSSE3)) *ecx &= ~CPUID_INTC_ECX_SSSE3; - if ((x86_feature & X86_SSE4_1) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1)) *ecx &= ~CPUID_INTC_ECX_SSE4_1; - if ((x86_feature & X86_SSE4_2) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2)) *ecx &= ~CPUID_INTC_ECX_SSE4_2; - if ((x86_feature & X86_AES) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_AES)) *ecx &= ~CPUID_INTC_ECX_AES; + if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ)) + *ecx &= ~CPUID_INTC_ECX_PCLMULQDQ; + if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) + *ecx &= ~(CPUID_INTC_ECX_XSAVE | + CPUID_INTC_ECX_OSXSAVE); + if (!is_x86_feature(x86_featureset, X86FSET_AVX)) + *ecx &= ~CPUID_INTC_ECX_AVX; } /* @@ -2280,6 +2562,9 @@ cpuid_pass4(cpu_t *cpu) hwcap_flags |= AV_386_AES; if (*ecx & CPUID_INTC_ECX_PCLMULQDQ) hwcap_flags |= AV_386_PCLMULQDQ; + if ((*ecx & CPUID_INTC_ECX_XSAVE) && + (*ecx & CPUID_INTC_ECX_OSXSAVE)) + hwcap_flags |= AV_386_XSAVE; } if (*ecx & CPUID_INTC_ECX_POPCNT) hwcap_flags |= AV_386_POPCNT; @@ -2326,14 +2611,14 @@ cpuid_pass4(cpu_t *cpu) */ switch (cpi->cpi_vendor) { case X86_VENDOR_Intel: - if ((x86_feature & X86_TSCP) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) *edx &= ~CPUID_AMD_EDX_TSCP; break; case X86_VENDOR_AMD: - if ((x86_feature & X86_TSCP) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_TSCP)) *edx &= ~CPUID_AMD_EDX_TSCP; - if ((x86_feature & X86_SSE4A) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE4A)) *ecx &= 
~CPUID_AMD_ECX_SSE4A; break; @@ -2349,7 +2634,7 @@ cpuid_pass4(cpu_t *cpu) *edx &= ~(CPUID_AMD_EDX_MMXamd | CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx); - if ((x86_feature & X86_NX) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_NX)) *edx &= ~CPUID_AMD_EDX_NX; #if !defined(__amd64) *edx &= ~CPUID_AMD_EDX_LM; @@ -3340,7 +3625,7 @@ intel_walk_cacheinfo(struct cpuid_info *cpi, des_b1_ct.ct_code = 0xb1; des_b1_ct.ct_assoc = 4; des_b1_ct.ct_line_size = 0; - if (x86_feature & X86_PAE) { + if (is_x86_feature(x86_featureset, X86FSET_PAE)) { des_b1_ct.ct_size = 8; des_b1_ct.ct_label = itlb2M_str; } else { @@ -3687,7 +3972,7 @@ cpuid_set_cpu_properties(void *dip, processorid_t cpu_id, "clock-frequency", (int)mul); } - if ((x86_feature & X86_CPUID) == 0) { + if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) { return; } @@ -4083,7 +4368,7 @@ cpuid_deep_cstates_supported(void) cpi = CPU->cpu_m.mcpu_cpi; - if (!(x86_feature & X86_CPUID)) + if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) return (0); switch (cpi->cpi_vendor) { @@ -4134,6 +4419,31 @@ post_startup_cpu_fixups(void) } /* + * Setup necessary registers to enable XSAVE feature on this processor. + * This function needs to be called early enough, so that no xsave/xrstor + * ops will execute on the processor before the MSRs are properly set up. + * + * Current implementation has the following assumption: + * - cpuid_pass1() is done, so that X86 features are known. + * - fpu_probe() is done, so that fp_save_mech is chosen. + */ +void +xsave_setup_msr(cpu_t *cpu) +{ + ASSERT(fp_save_mech == FP_XSAVE); + ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE)); + + /* Enable OSXSAVE in CR4. */ + setcr4(getcr4() | CR4_OSXSAVE); + /* + * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report + * correct value. + */ + cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE; + setup_xfem(); +} + +/* * Starting with the Westmere processor the local * APIC timer will continue running in all C-states, * including the deepest C-states. @@ -4145,7 +4455,7 @@ cpuid_arat_supported(void) struct cpuid_regs regs; ASSERT(cpuid_checkpass(CPU, 1)); - ASSERT(x86_feature & X86_CPUID); + ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); cpi = CPU->cpu_m.mcpu_cpi; @@ -4178,7 +4488,8 @@ cpuid_iepb_supported(struct cpu *cp) ASSERT(cpuid_checkpass(cp, 1)); - if (!(x86_feature & X86_CPUID) || !(x86_feature & X86_MSR)) { + if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) || + !(is_x86_feature(x86_featureset, X86FSET_MSR))) { return (0); } @@ -4194,6 +4505,38 @@ cpuid_iepb_supported(struct cpu *cp) return (regs.cp_ecx & CPUID_EPB_SUPPORT); } +/* + * Check support for TSC deadline timer + * + * TSC deadline timer provides a superior software programming + * model over local APIC timer that eliminates "time drifts". + * Instead of specifying a relative time, software specifies an + * absolute time as the target at which the processor should + * generate a timer event. 
+ */ +int +cpuid_deadline_tsc_supported(void) +{ + struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi; + struct cpuid_regs regs; + + ASSERT(cpuid_checkpass(CPU, 1)); + ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID)); + + switch (cpi->cpi_vendor) { + case X86_VENDOR_Intel: + if (cpi->cpi_maxeax >= 1) { + regs.cp_eax = 1; + (void) cpuid_insn(NULL, &regs); + return (regs.cp_ecx & CPUID_DEADLINE_TSC); + } else { + return (0); + } + default: + return (0); + } +} + #if defined(__amd64) && !defined(__xpv) /* * Patch in versions of bcopy for high performance Intel Nhm processors @@ -4205,7 +4548,8 @@ patch_memops(uint_t vendor) size_t cnt, i; caddr_t to, from; - if ((vendor == X86_VENDOR_Intel) && ((x86_feature & X86_SSE4_2) != 0)) { + if ((vendor == X86_VENDOR_Intel) && + is_x86_feature(x86_featureset, X86FSET_SSE4_2)) { cnt = &bcopy_patch_end - &bcopy_patch_start; to = &bcopy_ck_size; from = &bcopy_patch_start; diff --git a/usr/src/uts/i86pc/os/cpupm/pwrnow.c b/usr/src/uts/i86pc/os/cpupm/pwrnow.c index 8840d8ce34..d403c69e34 100644 --- a/usr/src/uts/i86pc/os/cpupm/pwrnow.c +++ b/usr/src/uts/i86pc/os/cpupm/pwrnow.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/x86_archext.h> @@ -217,8 +216,8 @@ pwrnow_supported() struct cpuid_regs cpu_regs; /* Required features */ - if (!(x86_feature & X86_CPUID) || - !(x86_feature & X86_MSR)) { + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || + !is_x86_feature(x86_featureset, X86FSET_MSR)) { PWRNOW_DEBUG(("No CPUID or MSR support.")); return (B_FALSE); } diff --git a/usr/src/uts/i86pc/os/cpupm/speedstep.c b/usr/src/uts/i86pc/os/cpupm/speedstep.c index 151fdc79ae..2be4529a83 100644 --- a/usr/src/uts/i86pc/os/cpupm/speedstep.c +++ b/usr/src/uts/i86pc/os/cpupm/speedstep.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2009, Intel Corporation. @@ -444,8 +443,8 @@ speedstep_supported(uint_t family, uint_t model) struct cpuid_regs cpu_regs; /* Required features */ - if (!(x86_feature & X86_CPUID) || - !(x86_feature & X86_MSR)) { + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || + !is_x86_feature(x86_featureset, X86FSET_MSR)) { return (B_FALSE); } @@ -476,8 +475,8 @@ turbo_supported(void) struct cpuid_regs cpu_regs; /* Required features */ - if (!(x86_feature & X86_CPUID) || - !(x86_feature & X86_MSR)) { + if (!is_x86_feature(x86_featureset, X86FSET_CPUID) || + !is_x86_feature(x86_featureset, X86FSET_MSR)) { return (B_FALSE); } diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c index c1a27cc466..a1ae318703 100644 --- a/usr/src/uts/i86pc/os/ddi_impl.c +++ b/usr/src/uts/i86pc/os/ddi_impl.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -1557,7 +1556,8 @@ i_ddi_cacheattr_to_hatacc(uint_t flags, uint_t *hataccp) * If write-combining is not supported, then it falls back * to uncacheable.
*/ - if (cache_attr == IOMEM_DATA_UC_WR_COMBINE && !(x86_feature & X86_PAT)) + if (cache_attr == IOMEM_DATA_UC_WR_COMBINE && + !is_x86_feature(x86_featureset, X86FSET_PAT)) cache_attr = IOMEM_DATA_UNCACHED; /* diff --git a/usr/src/uts/i86pc/os/fastboot.c b/usr/src/uts/i86pc/os/fastboot.c index 4cedb0be28..1520a6653c 100644 --- a/usr/src/uts/i86pc/os/fastboot.c +++ b/usr/src/uts/i86pc/os/fastboot.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -601,7 +600,7 @@ fastboot_build_mbi(char *mdep, fastboot_info_t *nk) static void fastboot_init_fields(fastboot_info_t *nk) { - if (x86_feature & X86_PAE) { + if (is_x86_feature(x86_featureset, X86FSET_PAE)) { nk->fi_has_pae = 1; nk->fi_shift_amt = fastboot_shift_amt_pae; nk->fi_ptes_per_table = 512; @@ -1155,9 +1154,11 @@ load_kernel_retry: goto err_out; } - if ((x86_feature & X86_64) == 0 || - (x86_feature & X86_PAE) == 0) { - cmn_err(CE_NOTE, "!Fastboot: Cannot " + if (!is_x86_feature(x86_featureset, + X86FSET_64) || + !is_x86_feature(x86_featureset, + X86FSET_PAE)) { + cmn_err(CE_NOTE, "Fastboot: Cannot " "reboot to %s: " "not a 64-bit capable system", kern_bootfile); diff --git a/usr/src/uts/i86pc/os/fpu_subr.c b/usr/src/uts/i86pc/os/fpu_subr.c index 11f226a1eb..0598b913f1 100644 --- a/usr/src/uts/i86pc/os/fpu_subr.c +++ b/usr/src/uts/i86pc/os/fpu_subr.c @@ -20,12 +20,9 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Floating point configuration. */ @@ -51,6 +48,15 @@ int fpu_exists = 1; int fp_kind = FP_387; /* + * Mechanism to save FPU state. + */ +#if defined(__amd64) +int fp_save_mech = FP_FXSAVE; +#elif defined(__i386) +int fp_save_mech = FP_FNSAVE; +#endif + +/* * The variable fpu_ignored is provided to allow other code to * determine whether emulation is being done because there is * no FPU or because of an override requested via /etc/system. @@ -142,24 +148,59 @@ fpu_probe(void) * * (Perhaps we should complain more about this case!) */ - if ((x86_feature & X86_SSE|X86_SSE2) == (X86_SSE|X86_SSE2)) { - fp_kind = __FP_SSE; + if (is_x86_feature(x86_featureset, X86FSET_SSE) && + is_x86_feature(x86_featureset, X86FSET_SSE2)) { + fp_kind |= __FP_SSE; ENABLE_SSE(); + + if (is_x86_feature(x86_featureset, X86FSET_AVX)) { + ASSERT(is_x86_feature(x86_featureset, + X86FSET_XSAVE)); + fp_kind |= __FP_AVX; + } + + if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { + fp_save_mech = FP_XSAVE; + fpsave_ctxt = xsave_ctxt; + patch_xsave(); + } } #elif defined(__i386) /* * SSE and SSE2 are both optional, and we patch kernel * code to exploit it when present. 
*/ - if (x86_feature & X86_SSE) { - fp_kind = __FP_SSE; + if (is_x86_feature(x86_featureset, X86FSET_SSE)) { + fp_kind |= __FP_SSE; + ENABLE_SSE(); + fp_save_mech = FP_FXSAVE; fpsave_ctxt = fpxsave_ctxt; - patch_sse(); - if (x86_feature & X86_SSE2) + + if (is_x86_feature(x86_featureset, X86FSET_SSE2)) { patch_sse2(); - ENABLE_SSE(); + } + + if (is_x86_feature(x86_featureset, X86FSET_AVX)) { + ASSERT(is_x86_feature(x86_featureset, + X86FSET_XSAVE)); + fp_kind |= __FP_AVX; + } + + if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) { + fp_save_mech = FP_XSAVE; + fpsave_ctxt = xsave_ctxt; + patch_xsave(); + } else { + patch_sse(); /* use fxrstor */ + } } else { - x86_feature &= ~X86_SSE2; + remove_x86_feature(x86_featureset, X86FSET_SSE2); + /* + * We will not likely to have a chip with AVX but not + * SSE. But to be safe we disable AVX if SSE is not + * enabled. + */ + remove_x86_feature(x86_featureset, X86FSET_AVX); /* * (Just in case the BIOS decided we wanted SSE * enabled when we didn't. See 4965674.) @@ -167,11 +208,11 @@ fpu_probe(void) DISABLE_SSE(); } #endif - if (x86_feature & X86_SSE2) { + if (is_x86_feature(x86_featureset, X86FSET_SSE2)) { use_sse_pagecopy = use_sse_pagezero = use_sse_copy = 1; } - if (fp_kind == __FP_SSE) { + if (fp_kind & __FP_SSE) { struct fxsave_state *fx; uint8_t fxsave_state[sizeof (struct fxsave_state) + XMM_ALIGN]; diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index faaecd20b6..91d7afcf36 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/cpuvar.h> diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c index 02596478c0..ac647bea16 100644 --- a/usr/src/uts/i86pc/os/lgrpplat.c +++ b/usr/src/uts/i86pc/os/lgrpplat.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -177,7 +176,7 @@ #include <sys/thread.h> #include <sys/types.h> #include <sys/var.h> -#include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ +#include <sys/x86_archext.h> #include <vm/hat_i86.h> #include <vm/seg_kmem.h> #include <vm/vm_dep.h> diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index 38c9f7159f..dcc82d6d9b 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -1161,7 +1161,7 @@ get_cpu_mstate(cpu_t *cpu, hrtime_t *times) int checked_rdmsr(uint_t msr, uint64_t *value) { - if ((x86_feature & X86_MSR) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_MSR)) return (ENOTSUP); *value = rdmsr(msr); return (0); @@ -1174,7 +1174,7 @@ checked_rdmsr(uint_t msr, uint64_t *value) int checked_wrmsr(uint_t msr, uint64_t value) { - if ((x86_feature & X86_MSR) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_MSR)) return (ENOTSUP); wrmsr(msr, value); return (0); @@ -1226,9 +1226,9 @@ num_phys_pages() /* cpu threshold for compressed dumps */ #ifdef _LP64 -uint_t dump_plat_mincpu = DUMP_PLAT_X86_64_MINCPU; +uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_64_MINCPU; #else -uint_t dump_plat_mincpu = DUMP_PLAT_X86_32_MINCPU; +uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_32_MINCPU; #endif int diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index bf57dd3c50..2f7e271c29 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -180,13 +179,13 @@ mlsetup(struct regs *rp) * was done in locore before mlsetup was called. Do the next * pass in C code. * - * The x86_feature bits are set here on the basis of the capabilities + * The x86_featureset is initialized here based on the capabilities * of the boot CPU. Note that if we choose to support CPUs that have * different feature sets (at which point we would almost certainly * want to set the feature bits to correspond to the feature * minimum) this value may be altered. */ - x86_feature = cpuid_pass1(cpu[0]); + cpuid_pass1(cpu[0], x86_featureset); #if !defined(__xpv) @@ -212,13 +211,16 @@ mlsetup(struct regs *rp) * The Xen hypervisor does not correctly report whether rdtscp is * supported or not, so we must assume that it is not. */ - if (get_hwenv() != HW_XEN_HVM && (x86_feature & X86_TSCP)) + if (get_hwenv() != HW_XEN_HVM && + is_x86_feature(x86_featureset, X86FSET_TSCP)) patch_tsc_read(X86_HAVE_TSCP); else if (cpuid_getvendor(CPU) == X86_VENDOR_AMD && - cpuid_getfamily(CPU) <= 0xf && (x86_feature & X86_SSE2) != 0) + cpuid_getfamily(CPU) <= 0xf && + is_x86_feature(x86_featureset, X86FSET_SSE2)) patch_tsc_read(X86_TSC_MFENCE); else if (cpuid_getvendor(CPU) == X86_VENDOR_Intel && - cpuid_getfamily(CPU) <= 6 && (x86_feature & X86_SSE2) != 0) + cpuid_getfamily(CPU) <= 6 && + is_x86_feature(x86_featureset, X86FSET_SSE2)) patch_tsc_read(X86_TSC_LFENCE); #endif /* !__xpv */ @@ -229,7 +231,7 @@ mlsetup(struct regs *rp) * or at least they do not implement it correctly. Patch them to * return 0. */ - if ((x86_feature & X86_TSC) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_TSC)) patch_tsc_read(X86_NO_TSC); #endif /* __i386 && !__xpv */ @@ -246,13 +248,13 @@ mlsetup(struct regs *rp) * (the cpuid) for the rdtscp instruction on appropriately * capable hardware. 
*/ - if (x86_feature & X86_TSC) + if (is_x86_feature(x86_featureset, X86FSET_TSC)) setcr4(getcr4() & ~CR4_TSD); - if (x86_feature & X86_TSCP) + if (is_x86_feature(x86_featureset, X86FSET_TSCP)) (void) wrmsr(MSR_AMD_TSCAUX, 0); - if (x86_feature & X86_DE) + if (is_x86_feature(x86_featureset, X86FSET_DE)) setcr4(getcr4() | CR4_DE); #endif /* __xpv */ diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index 9f9c3aae4a..d7aab080a6 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2009-2010, Intel Corporation. @@ -237,7 +237,7 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw) { switch (hw) { case PGHW_IPIPE: - if (x86_feature & (X86_HTT)) { + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { /* * Hyper-threading is SMT */ @@ -251,7 +251,8 @@ pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw) else return (0); case PGHW_CHIP: - if (x86_feature & (X86_CMP|X86_HTT)) + if (is_x86_feature(x86_featureset, X86FSET_CMP) || + is_x86_feature(x86_featureset, X86FSET_HTT)) return (1); else return (0); @@ -1017,7 +1018,8 @@ mach_init() idle_cpu = cpu_idle_adaptive; CPU->cpu_m.mcpu_idle_cpu = cpu_idle; #ifndef __xpv - if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { + if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && + idle_cpu_prefer_mwait) { CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); /* * Protect ourself from insane mwait size. @@ -1130,7 +1132,8 @@ mach_smpinit(void) if (idle_cpu_use_hlt) { disp_enq_thread = cpu_wakeup; #ifndef __xpv - if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) + if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && + idle_cpu_prefer_mwait) disp_enq_thread = cpu_wakeup_mwait; non_deep_idle_disp_enq_thread = disp_enq_thread; #endif @@ -1239,7 +1242,7 @@ mach_getcpufreq(void) uint32_t pit_counter; uint64_t processor_clks; - if (x86_feature & X86_TSC) { + if (is_x86_feature(x86_featureset, X86FSET_TSC)) { /* * We have a TSC. freq_tsc() knows how to measure the number * of clock cycles sampled against the PIT. @@ -1411,7 +1414,7 @@ mach_clkinit(int preferred_mode, int *set_mode) cpu_freq = machhztomhz(cpu_freq_hz); - if (!(x86_feature & X86_TSC) || (cpu_freq == 0)) + if (!is_x86_feature(x86_featureset, X86FSET_TSC) || (cpu_freq == 0)) tsc_gethrtime_enable = 0; #ifndef __xpv diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c index bc21812187..0429b463f6 100644 --- a/usr/src/uts/i86pc/os/mp_pc.c +++ b/usr/src/uts/i86pc/os/mp_pc.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -313,7 +312,6 @@ mach_cpucontext_xalloc(struct cpu *cp, int optype) */ rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn); rm->rm_cpu = cp->cpu_id; - rm->rm_x86feature = x86_feature; /* * For hot-adding CPU at runtime, Machine Check and Performance Counter @@ -624,7 +622,7 @@ out_enable_cmi: if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) { - if (x86_feature & X86_MCA) + if (is_x86_feature(x86_featureset, X86FSET_MCA)) cmi_mca_init(hdl); cp->cpu_m.mcpu_cmi_hdl = hdl; } diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 146f879a66..843c5ae73a 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -165,7 +164,8 @@ init_cpu_syscall(struct cpu *cp) kpreempt_disable(); #if defined(__amd64) - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) { + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_ASYSC)) { #if !defined(__lint) /* @@ -205,7 +205,8 @@ init_cpu_syscall(struct cpu *cp) * On 64-bit kernels on Nocona machines, the 32-bit syscall * variant isn't available to 32-bit applications, but sysenter is. */ - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) { + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_SEP)) { #if !defined(__lint) /* @@ -415,7 +416,8 @@ mp_cpu_configure_common(int cpun, boolean_t boot) */ cpuid_alloc_space(cp); #if !defined(__xpv) - if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { + if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && + idle_cpu_prefer_mwait) { cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(cp); cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; } else @@ -1142,7 +1144,7 @@ workaround_errata(struct cpu *cpu) if (opteron_workaround_6323525) { opteron_workaround_6323525++; #if defined(__xpv) - } else if (x86_feature & X86_SSE2) { + } else if (is_x86_feature(x86_featureset, X86FSET_SSE2)) { if (DOMAIN_IS_INITDOMAIN(xen_info)) { /* * XXPV Use dom0_msr here when extended @@ -1160,7 +1162,8 @@ workaround_errata(struct cpu *cpu) opteron_workaround_6323525++; } #else /* __xpv */ - } else if ((x86_feature & X86_SSE2) && ((opteron_get_nnodes() * + } else if (is_x86_feature(x86_featureset, X86FSET_SSE2) && + ((opteron_get_nnodes() * cpuid_get_ncpu_per_chip(cpu)) > 1)) { if ((xrdmsr(MSR_AMD_BU_CFG) & (UINT64_C(1) << 33)) == 0) opteron_workaround_6323525++; @@ -1602,8 +1605,7 @@ static void mp_startup_common(boolean_t boot) { cpu_t *cp = CPU; - uint_t new_x86_feature; - const char *fmt = "?cpu%d: %b\n"; + uchar_t new_x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; extern void cpu_event_init_cpu(cpu_t *); /* @@ -1629,13 +1631,14 @@ mp_startup_common(boolean_t boot) */ (void) (*ap_mlsetup)(); - new_x86_feature = cpuid_pass1(cp); + bzero(new_x86_featureset, BT_SIZEOFMAP(NUM_X86_FEATURES)); + cpuid_pass1(cp, new_x86_featureset); #ifndef __xpv /* * Program this cpu's PAT */ - if (x86_feature & X86_PAT) + if (is_x86_feature(x86_featureset, X86FSET_PAT)) pat_sync(); #endif @@ -1643,7 +1646,7 @@ mp_startup_common(boolean_t boot) * Set up TSC_AUX to contain the cpuid for this processor * for the rdtscp instruction. 
*/ - if (x86_feature & X86_TSCP) + if (is_x86_feature(x86_featureset, X86FSET_TSCP)) (void) wrmsr(MSR_AMD_TSCAUX, cp->cpu_id); /* @@ -1671,9 +1674,10 @@ mp_startup_common(boolean_t boot) * likely to happen once the number of processors in a configuration * gets large enough. */ - if ((x86_feature & new_x86_feature) != x86_feature) { - cmn_err(CE_CONT, fmt, cp->cpu_id, new_x86_feature, - FMT_X86_FEATURE); + if (compare_x86_featureset(x86_featureset, new_x86_featureset) == + B_FALSE) { + cmn_err(CE_CONT, "cpu%d: featureset\n", cp->cpu_id); + print_x86_featureset(new_x86_featureset); cmn_err(CE_WARN, "cpu%d feature mismatch", cp->cpu_id); } @@ -1681,7 +1685,8 @@ mp_startup_common(boolean_t boot) * We do not support cpus with mixed monitor/mwait support if the * boot cpu supports monitor/mwait. */ - if ((x86_feature & ~new_x86_feature) & X86_MWAIT) + if (is_x86_feature(x86_featureset, X86FSET_MWAIT) != + is_x86_feature(new_x86_featureset, X86FSET_MWAIT)) panic("unsupported mixed cpu monitor/mwait support detected"); /* @@ -1705,6 +1710,13 @@ mp_startup_common(boolean_t boot) */ cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED); + /* + * Setup this processor for XSAVE. + */ + if (fp_save_mech == FP_XSAVE) { + xsave_setup_msr(cp); + } + cpuid_pass2(cp); cpuid_pass3(cp); (void) cpuid_pass4(cp); @@ -1775,7 +1787,7 @@ mp_startup_common(boolean_t boot) if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU), cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL) { - if (x86_feature & X86_MCA) + if (is_x86_feature(x86_featureset, X86FSET_MCA)) cmi_mca_init(hdl); cp->cpu_m.mcpu_cmi_hdl = hdl; } @@ -1934,19 +1946,21 @@ mp_cpu_faulted_exit(struct cpu *cp) * The following two routines are used as context operators on threads belonging * to processes with a private LDT (see sysi86). Due to the rarity of such * processes, these routines are currently written for best code readability and - * organization rather than speed. We could avoid checking x86_feature at every - * context switch by installing different context ops, depending on the - * x86_feature flags, at LDT creation time -- one for each combination of fast - * syscall feature flags. + * organization rather than speed. We could avoid checking x86_featureset at + * every context switch by installing different context ops, depending on + * x86_featureset, at LDT creation time -- one for each combination of fast + * syscall features. 
*/ /*ARGSUSED*/ void cpu_fast_syscall_disable(void *arg) { - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_SEP)) cpu_sep_disable(); - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_ASYSC)) cpu_asysc_disable(); } @@ -1954,16 +1968,18 @@ cpu_fast_syscall_disable(void *arg) void cpu_fast_syscall_enable(void *arg) { - if ((x86_feature & (X86_MSR | X86_SEP)) == (X86_MSR | X86_SEP)) + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_SEP)) cpu_sep_enable(); - if ((x86_feature & (X86_MSR | X86_ASYSC)) == (X86_MSR | X86_ASYSC)) + if (is_x86_feature(x86_featureset, X86FSET_MSR) && + is_x86_feature(x86_featureset, X86FSET_ASYSC)) cpu_asysc_enable(); } static void cpu_sep_enable(void) { - ASSERT(x86_feature & X86_SEP); + ASSERT(is_x86_feature(x86_featureset, X86FSET_SEP)); ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); wrmsr(MSR_INTC_SEP_CS, (uint64_t)(uintptr_t)KCS_SEL); @@ -1972,7 +1988,7 @@ cpu_sep_enable(void) static void cpu_sep_disable(void) { - ASSERT(x86_feature & X86_SEP); + ASSERT(is_x86_feature(x86_featureset, X86FSET_SEP)); ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); /* @@ -1985,7 +2001,7 @@ cpu_sep_disable(void) static void cpu_asysc_enable(void) { - ASSERT(x86_feature & X86_ASYSC); + ASSERT(is_x86_feature(x86_featureset, X86FSET_ASYSC)); ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) | @@ -1995,7 +2011,7 @@ cpu_asysc_enable(void) static void cpu_asysc_disable(void) { - ASSERT(x86_feature & X86_ASYSC); + ASSERT(is_x86_feature(x86_featureset, X86FSET_ASYSC)); ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); /* diff --git a/usr/src/uts/i86pc/os/pci_mech1_amd.c b/usr/src/uts/i86pc/os/pci_mech1_amd.c index d45408731b..3b6eb918fe 100644 --- a/usr/src/uts/i86pc/os/pci_mech1_amd.c +++ b/usr/src/uts/i86pc/os/pci_mech1_amd.c @@ -42,7 +42,7 @@ pci_check_amd_ioecs(void) struct cpuid_regs cp; int family; - if ((x86_feature & X86_CPUID) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) return (B_FALSE); /* diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index f69b37a9f2..d8facc92e7 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -1320,7 +1320,6 @@ static void startup_kmem(void) { extern void page_set_colorequiv_arr(void); - const char *fmt = "?features: %b\n"; PRM_POINT("startup_kmem() starting..."); @@ -1429,7 +1428,7 @@ startup_kmem(void) /* * print this out early so that we know what's going on */ - cmn_err(CE_CONT, fmt, x86_feature, FMT_X86_FEATURE); + print_x86_featureset(x86_featureset); /* * Initialize bp_mapin(). 
@@ -1651,7 +1650,7 @@ startup_modules(void) if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA, xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi), xen_physcpu_strandid(cpi))) != NULL && - (x86_feature & X86_MCA)) + is_x86_feature(x86_featureset, X86FSET_MCA)) cmi_mca_init(hdl); } } @@ -1663,7 +1662,7 @@ startup_modules(void) if ((get_hwenv() != HW_XEN_HVM) && (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU), cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL && - (x86_feature & X86_MCA)) { + is_x86_feature(x86_featureset, X86FSET_MCA)) { cmi_mca_init(hdl); CPU->cpu_m.mcpu_cmi_hdl = hdl; } @@ -2194,6 +2193,13 @@ startup_end(void) PRM_POINT("configure() done"); /* + * We can now setup for XSAVE because fpu_probe is done in configure(). + */ + if (fp_save_mech == FP_XSAVE) { + xsave_setup_msr(CPU); + } + + /* * Set the isa_list string to the defined instruction sets we * support. */ @@ -2670,7 +2676,7 @@ pat_sync(void) { ulong_t cr0, cr0_orig, cr4; - if (!(x86_feature & X86_PAT)) + if (!is_x86_feature(x86_featureset, X86FSET_PAT)) return; cr0_orig = cr0 = getcr0(); cr4 = getcr4(); @@ -2993,12 +2999,13 @@ setx86isalist(void) case X86_VENDOR_Intel: case X86_VENDOR_AMD: case X86_VENDOR_TM: - if (x86_feature & X86_CMOV) { + if (is_x86_feature(x86_featureset, X86FSET_CMOV)) { /* * Pentium Pro or later */ (void) strcat(tp, "pentium_pro"); - (void) strcat(tp, x86_feature & X86_MMX ? + (void) strcat(tp, + is_x86_feature(x86_featureset, X86FSET_MMX) ? "+mmx pentium_pro " : " "); } /*FALLTHROUGH*/ @@ -3007,9 +3014,10 @@ setx86isalist(void) * The Cyrix 6x86 does not have any Pentium features * accessible while not at privilege level 0. */ - if (x86_feature & X86_CPUID) { + if (is_x86_feature(x86_featureset, X86FSET_CPUID)) { (void) strcat(tp, "pentium"); - (void) strcat(tp, x86_feature & X86_MMX ? + (void) strcat(tp, + is_x86_feature(x86_featureset, X86FSET_MMX) ? "+mmx pentium " : " "); } break; diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index a004b73055..71995cc992 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -256,17 +256,17 @@ instr_is_other_syscall(caddr_t pc, int which) { uchar_t instr[FAST_SCALL_SIZE]; - ASSERT(which == X86_SEP || which == X86_ASYSC || which == 0xCD); + ASSERT(which == X86FSET_SEP || which == X86FSET_ASYSC || which == 0xCD); if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0) return (0); switch (which) { - case X86_SEP: + case X86FSET_SEP: if (instr[0] == 0x0F && instr[1] == 0x34) return (1); break; - case X86_ASYSC: + case X86FSET_ASYSC: if (instr[0] == 0x0F && instr[1] == 0x05) return (1); break; @@ -283,9 +283,9 @@ static const char * syscall_insn_string(int syscall_insn) { switch (syscall_insn) { - case X86_SEP: + case X86FSET_SEP: return ("sysenter"); - case X86_ASYSC: + case X86FSET_ASYSC: return ("syscall"); case 0xCD: return ("int"); @@ -916,7 +916,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) * be to emulate that particular instruction. 
*/ if (p->p_ldt != NULL && - ldt_rewrite_syscall(rp, p, X86_ASYSC)) + ldt_rewrite_syscall(rp, p, X86FSET_ASYSC)) goto out; #ifdef __amd64 @@ -1018,7 +1018,8 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) case T_SIMDFPE + USER: /* SSE and SSE2 exceptions */ if (tudebug && tudebugsse) showregs(type, rp, addr); - if ((x86_feature & (X86_SSE|X86_SSE2)) == 0) { + if (!is_x86_feature(x86_featureset, X86FSET_SSE) && + !is_x86_feature(x86_featureset, X86FSET_SSE2)) { /* * There are rumours that some user instructions * on older CPUs can cause this trap to occur; in @@ -1268,7 +1269,7 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) * this will be to emulate that particular instruction. */ if (p->p_ldt != NULL && - ldt_rewrite_syscall(rp, p, X86_SEP)) + ldt_rewrite_syscall(rp, p, X86FSET_SEP)) goto out; /*FALLTHROUGH*/ diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 1f177556ea..b632cea09c 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -22,6 +22,11 @@ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + #ifndef _SYS_APIC_APIC_H #define _SYS_APIC_APIC_H @@ -74,9 +79,6 @@ extern "C" { #define APIC_INT_CMD1 0xc0 #define APIC_INT_CMD2 0xc4 -/* Timer Vector Table register */ -#define APIC_LOCAL_TIMER 0xc8 - /* Local Interrupt Vector registers */ #define APIC_CMCI_VECT 0xbc #define APIC_THERM_VECT 0xcc @@ -344,9 +346,6 @@ struct apic_io_intr { /* spurious interrupt vector register */ #define AV_UNIT_ENABLE 0x100 -/* timer vector table */ -#define AV_TIME 0x20000 /* Set timer mode to periodic */ - #define APIC_MAXVAL 0xffffffffUL #define APIC_TIME_MIN 0x5000 #define APIC_TIME_COUNT 0x4000 @@ -665,19 +664,6 @@ typedef struct { #define PSMGI_INTRBY_DEFAULT 0x4000 /* PSM specific default value */ #define PSMGI_INTRBY_FLAGS 0xc000 /* Mask for this flag */ -/* - * Use scaled-fixed-point arithmetic to calculate apic ticks. - * Round when dividing (by adding half of divisor to dividend) - * for one extra bit of precision. 
- */ - -#define SF (1ULL<<20) /* Scaling Factor: scale by 2^20 */ -#define APIC_TICKS_TO_NSECS(ticks) ((((int64_t)(ticks) * SF) + \ - apic_ticks_per_SFnsecs / 2) / \ - apic_ticks_per_SFnsecs); -#define APIC_NSECS_TO_TICKS(nsecs) (((int64_t)(nsecs) * \ - apic_ticks_per_SFnsecs + (SF/2)) / SF) - extern int apic_verbose; /* Flag definitions for apic_verbose */ @@ -839,6 +825,7 @@ extern int apic_local_mode(); extern void apic_change_eoi(); extern void apic_send_EOI(uint32_t); extern void apic_send_directed_EOI(uint32_t); +extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); extern volatile uint32_t *apicadr; /* virtual addr of local APIC */ extern int apic_forceload; diff --git a/usr/src/uts/i86pc/sys/apic_common.h b/usr/src/uts/i86pc/sys/apic_common.h index cd259d7f62..c7b6d925f1 100644 --- a/usr/src/uts/i86pc/sys/apic_common.h +++ b/usr/src/uts/i86pc/sys/apic_common.h @@ -111,9 +111,6 @@ extern int apic_panic_on_apic_error; extern int apic_verbose; -/* minimum number of timer ticks to program to */ -extern int apic_min_timer_ticks; - #ifdef DEBUG extern int apic_debug; extern int apic_restrict_vector; @@ -180,9 +177,7 @@ extern void apic_unset_idlecpu(processorid_t cpun); extern void apic_shutdown(int cmd, int fcn); extern void apic_preshutdown(int cmd, int fcn); extern processorid_t apic_get_next_processorid(processorid_t cpun); -extern void apic_timer_reprogram(hrtime_t time); -extern void apic_timer_enable(void); -extern void apic_timer_disable(void); +extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); extern int apic_error_intr(); extern void apic_cpcovf_mask_clear(void); diff --git a/usr/src/uts/i86pc/sys/apic_timer.h b/usr/src/uts/i86pc/sys/apic_timer.h new file mode 100644 index 0000000000..7f7285fb83 --- /dev/null +++ b/usr/src/uts/i86pc/sys/apic_timer.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#ifndef _SYS_APIC_TIMER_H +#define _SYS_APIC_TIMER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/time.h> + +#define IA32_DEADLINE_TSC_MSR 0x6E0 + +/* Timer Vector Table register */ +#define APIC_LOCAL_TIMER 0xc8 + +/* timer vector table */ +#define AV_PERIODIC 0x20000 /* Set timer mode to periodic */ +#define AV_DEADLINE 0x40000 /* Set timer mode to deadline */ + +#define APIC_TIMER_MODE_ONESHOT 0x0 +#define APIC_TIMER_MODE_PERIODIC 0x1 +#define APIC_TIMER_MODE_DEADLINE 0x2 /* TSC-Deadline timer mode */ + +extern int apic_oneshot; +extern uint_t apic_nsec_per_intr; +extern uint_t apic_hertz_count; +extern uint64_t apic_ticks_per_SFnsecs; + +/* + * Use scaled-fixed-point arithmetic to calculate apic ticks. + * Round when dividing (by adding half of divisor to dividend) + * for one extra bit of precision. + */ + +#define SF (1ULL<<20) /* Scaling Factor: scale by 2^20 */ +#define APIC_TICKS_TO_NSECS(ticks) ((((int64_t)(ticks) * SF) + \ + apic_ticks_per_SFnsecs / 2) / \ + apic_ticks_per_SFnsecs); +#define APIC_NSECS_TO_TICKS(nsecs) (((int64_t)(nsecs) * \ + apic_ticks_per_SFnsecs + (SF/2)) / SF) + +extern int apic_timer_init(int); +extern void apic_timer_reprogram(hrtime_t); +extern void apic_timer_enable(void); +extern void apic_timer_disable(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_APIC_TIMER_H */ diff --git a/usr/src/uts/i86pc/sys/apix.h b/usr/src/uts/i86pc/sys/apix.h index 3db39b4021..8237d2e455 100644 --- a/usr/src/uts/i86pc/sys/apix.h +++ b/usr/src/uts/i86pc/sys/apix.h @@ -30,6 +30,7 @@ #include <sys/traptrace.h> #include <sys/apic.h> #include <sys/apic_common.h> +#include <sys/apic_timer.h> #ifdef __cplusplus extern "C" { diff --git a/usr/src/uts/i86pc/sys/hpet_acpi.h b/usr/src/uts/i86pc/sys/hpet_acpi.h index 078f4e73b3..e60ebe4bba 100644 --- a/usr/src/uts/i86pc/sys/hpet_acpi.h +++ b/usr/src/uts/i86pc/sys/hpet_acpi.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _HPET_ACPI_H diff --git a/usr/src/uts/i86pc/sys/immu.h b/usr/src/uts/i86pc/sys/immu.h index db9732c8a7..70193d26e6 100644 --- a/usr/src/uts/i86pc/sys/immu.h +++ b/usr/src/uts/i86pc/sys/immu.h @@ -42,8 +42,11 @@ extern "C" { #include <sys/types.h> #include <sys/bitset.h> #include <sys/kstat.h> +#include <sys/kmem.h> #include <sys/vmem.h> #include <sys/rootnex.h> +#include <sys/iommulib.h> +#include <sys/sdt.h> /* * Some ON drivers have bugs. 
Keep this define until all such drivers @@ -63,6 +66,7 @@ typedef uint64_t hw_pdte_t; #define IMMU_PAGEMASK (~IMMU_PAGEOFFSET) #define IMMU_BTOP(b) (((uint64_t)b) >> IMMU_PAGESHIFT) #define IMMU_PTOB(p) (((uint64_t)p) << IMMU_PAGESHIFT) +#define IMMU_BTOPR(x) ((((x) + IMMU_PAGEOFFSET) >> IMMU_PAGESHIFT)) #define IMMU_PGTABLE_MAX_LEVELS (6) #define IMMU_ROUNDUP(size) (((size) + IMMU_PAGEOFFSET) & ~IMMU_PAGEOFFSET) #define IMMU_ROUNDOWN(addr) ((addr) & ~IMMU_PAGEOFFSET) @@ -71,9 +75,6 @@ typedef uint64_t hw_pdte_t; #define IMMU_PGTABLE_OFFSHIFT (IMMU_PAGESHIFT - IMMU_PGTABLE_LEVEL_STRIDE) #define IMMU_PGTABLE_MAXIDX ((IMMU_PAGESIZE / sizeof (hw_pdte_t)) - 1) -#define IMMU_ROUNDUP(size) (((size) + IMMU_PAGEOFFSET) & ~IMMU_PAGEOFFSET) -#define IMMU_ROUNDOWN(addr) ((addr) & ~IMMU_PAGEOFFSET) - /* * DMAR global defines */ @@ -145,6 +146,8 @@ typedef struct rmrr { list_node_t rm_node; } rmrr_t; +#define IMMU_UNIT_NAME "iommu" + /* * Macros based on PCI spec */ @@ -460,6 +463,12 @@ typedef struct hw_rce { #define CONT_GET_P(hcent) ((hcent)->lo & 0x1) #define CONT_SET_P(hcent) ((hcent)->lo |= 0x1) +#define CONT_GET_ALH(hcent) ((hcent)->lo & 0x20) +#define CONT_SET_ALH(hcent) ((hcent)->lo |= 0x20) + +#define CONT_GET_EH(hcent) ((hcent)->lo & 0x10) +#define CONT_SET_EH(hcent) ((hcent)->lo |= 0x10) + /* we use the bit 63 (available for system SW) as a present bit */ #define PDTE_SW4(hw_pdte) ((hw_pdte) & ((uint64_t)1<<63)) @@ -507,8 +516,41 @@ typedef struct hw_rce { #define PDTE_CLEAR_READ(hw_pdte) ((hw_pdte) &= ~(0x1)) #define PDTE_SET_READ(hw_pdte) ((hw_pdte) |= (0x1)) +#define PDTE_MASK_R ((uint64_t)1 << 0) +#define PDTE_MASK_W ((uint64_t)1 << 1) +#define PDTE_MASK_SNP ((uint64_t)1 << 11) +#define PDTE_MASK_TM ((uint64_t)1 << 62) +#define PDTE_MASK_P ((uint64_t)1 << 63) + struct immu_flushops; +/* + * Used to wait for invalidation completion. + * vstatus is the virtual address of the status word that will be written + * pstatus is the physical address + * If sync is true, then the operation will be waited on for + * completion immediately. Else, the wait interface can be called + * to wait for completion later. + */ + +#define IMMU_INV_DATA_PENDING 1 +#define IMMU_INV_DATA_DONE 2 + +typedef struct immu_inv_wait { + volatile uint32_t iwp_vstatus; + uint64_t iwp_pstatus; + boolean_t iwp_sync; + const char *iwp_name; /* ID for debugging/statistics */ +} immu_inv_wait_t; + +/* + * Used to batch IOMMU pagetable writes. + */ +typedef struct immu_dcookie { + paddr_t dck_paddr; + uint64_t dck_npages; +} immu_dcookie_t; + typedef struct immu { kmutex_t immu_lock; char *immu_name; @@ -547,10 +589,12 @@ typedef struct immu { boolean_t immu_dvma_coherent; boolean_t immu_TM_reserved; boolean_t immu_SNP_reserved; + uint64_t immu_ptemask; /* DVMA context related */ krwlock_t immu_ctx_rwlock; pgtable_t *immu_ctx_root; + immu_inv_wait_t immu_ctx_inv_wait; /* DVMA domain related */ int immu_max_domains; @@ -569,6 +613,7 @@ typedef struct immu { boolean_t immu_intrmap_running; intrmap_t *immu_intrmap; uint64_t immu_intrmap_irta_reg; + immu_inv_wait_t immu_intrmap_inv_wait; /* queued invalidation related */ kmutex_t immu_qinv_lock; @@ -582,8 +627,19 @@ typedef struct immu { list_node_t immu_node; struct immu_flushops *immu_flushops; + + kmem_cache_t *immu_hdl_cache; + kmem_cache_t *immu_pgtable_cache; + + iommulib_handle_t immu_iommulib_handle; } immu_t; +/* + * Enough space to hold the decimal number of any device instance. + * Used for device/cache names.
+ */ +#define IMMU_ISTRLEN 11 /* log10(2^31) + 1 */ + /* properties that control DVMA */ #define DDI_DVMA_MAPTYPE_ROOTNEX_PROP "immu-dvma-mapping" @@ -622,6 +678,9 @@ typedef struct domain { list_node_t dom_immu_node; mod_hash_t *dom_cookie_hash; + + /* topmost device in domain; usually the device itself (non-shared) */ + dev_info_t *dom_dip; } domain_t; typedef enum immu_pcib { @@ -652,6 +711,9 @@ typedef struct immu_devi { boolean_t imd_display; boolean_t imd_lpc; + /* set if premapped DVMA space is used */ + boolean_t imd_use_premap; + /* dmar unit to which this dip belongs */ immu_t *imd_immu; @@ -686,6 +748,21 @@ typedef struct immu_arg { dev_info_t *ima_ddip; } immu_arg_t; +#define IMMU_NDVSEG 8 +#define IMMU_NDCK 64 +#define IMMU_NPREPTES 8 + +typedef struct immu_hdl_private { + immu_inv_wait_t ihp_inv_wait; + size_t ihp_ndvseg; + struct dvmaseg ihp_dvseg[IMMU_NDVSEG]; + immu_dcookie_t ihp_dcookies[IMMU_NDCK]; + + hw_pdte_t *ihp_preptes[IMMU_NPREPTES]; + uint64_t ihp_predvma; + int ihp_npremapped; +} immu_hdl_priv_t; + /* * Invalidation operation function pointers for context and IOTLB. * These will be set to either the register or the queue invalidation @@ -693,28 +770,35 @@ typedef struct immu_arg { * both at the same time. */ struct immu_flushops { - void (*imf_context_fsi)(immu_t *, uint8_t, uint16_t, uint_t); - void (*imf_context_dsi)(immu_t *, uint_t); - void (*imf_context_gbl)(immu_t *); + void (*imf_context_fsi)(immu_t *, uint8_t, uint16_t, uint_t, + immu_inv_wait_t *); + void (*imf_context_dsi)(immu_t *, uint_t, immu_inv_wait_t *); + void (*imf_context_gbl)(immu_t *, immu_inv_wait_t *); - void (*imf_iotlb_psi)(immu_t *, uint_t, uint64_t, uint_t, uint_t); - void (*imf_iotlb_dsi)(immu_t *, uint_t); - void (*imf_iotlb_gbl)(immu_t *); + void (*imf_iotlb_psi)(immu_t *, uint_t, uint64_t, uint_t, uint_t, + immu_inv_wait_t *); + void (*imf_iotlb_dsi)(immu_t *, uint_t, immu_inv_wait_t *); + void (*imf_iotlb_gbl)(immu_t *, immu_inv_wait_t *); + + void (*imf_wait)(immu_inv_wait_t *); }; -#define immu_flush_context_fsi(i, f, s, d) \ - (i)->immu_flushops->imf_context_fsi(i, f, s, d) -#define immu_flush_context_dsi(i, d) \ - (i)->immu_flushops->imf_context_dsi(i, d) -#define immu_flush_context_gbl(i) \ - (i)->immu_flushops->imf_context_gbl(i) +#define immu_flush_context_fsi(i, f, s, d, w) \ + (i)->immu_flushops->imf_context_fsi(i, f, s, d, w) +#define immu_flush_context_dsi(i, d, w) \ + (i)->immu_flushops->imf_context_dsi(i, d, w) +#define immu_flush_context_gbl(i, w) \ + (i)->immu_flushops->imf_context_gbl(i, w) + +#define immu_flush_iotlb_psi(i, d, v, c, h, w) \ + (i)->immu_flushops->imf_iotlb_psi(i, d, v, c, h, w) +#define immu_flush_iotlb_dsi(i, d, w) \ + (i)->immu_flushops->imf_iotlb_dsi(i, d, w) +#define immu_flush_iotlb_gbl(i, w) \ + (i)->immu_flushops->imf_iotlb_gbl(i, w) -#define immu_flush_iotlb_psi(i, d, v, c, h) \ - (i)->immu_flushops->imf_iotlb_psi(i, d, v, c, h) -#define immu_flush_iotlb_dsi(i, d) \ - (i)->immu_flushops->imf_iotlb_dsi(i, d) -#define immu_flush_iotlb_gbl(i) \ - (i)->immu_flushops->imf_iotlb_gbl(i) +#define immu_flush_wait(i, w) \ + (i)->immu_flushops->imf_wait(w) /* * Globals used by IOMMU code @@ -723,11 +807,11 @@ struct immu_flushops { extern dev_info_t *root_devinfo; extern kmutex_t immu_lock; extern list_t immu_list; -extern void *immu_pgtable_cache; extern boolean_t immu_setup; extern boolean_t immu_running; extern kmutex_t ioapic_drhd_lock; extern list_t ioapic_drhd_list; +extern struct iommulib_ops immulib_ops; /* switches */ @@ -751,6 +835,9 @@ 
extern int64_t immu_flush_gran; extern immu_flags_t immu_global_dvma_flags; +extern int immu_use_tm; +extern int immu_use_alh; + /* ################### Interfaces exported outside IOMMU code ############## */ void immu_init(void); void immu_startup(void); @@ -766,12 +853,6 @@ int immu_unquiesce(void); /* ######################################################################### */ /* ################# Interfaces used within IOMMU code #################### */ - -/* functions in rootnex.c */ -int rootnex_dvcookies_alloc(ddi_dma_impl_t *hp, - struct ddi_dma_req *dmareq, dev_info_t *rdip, void *arg); -void rootnex_dvcookies_free(dvcookie_t *dvcookies, void *arg); - /* immu_dmar.c interfaces */ int immu_dmar_setup(void); int immu_dmar_parse(void); @@ -780,7 +861,6 @@ void immu_dmar_shutdown(void); void immu_dmar_destroy(void); boolean_t immu_dmar_blacklisted(char **strings_array, uint_t nstrings); immu_t *immu_dmar_get_immu(dev_info_t *rdip); -char *immu_dmar_unit_name(void *dmar_unit); dev_info_t *immu_dmar_unit_dip(void *dmar_unit); void immu_dmar_set_immu(void *dmar_unit, immu_t *immu); void *immu_dmar_walk_units(int seg, void *dmar_unit); @@ -793,6 +873,7 @@ void immu_dmar_rmrr_map(void); int immu_walk_ancestor(dev_info_t *rdip, dev_info_t *ddip, int (*func)(dev_info_t *, void *arg), void *arg, int *level, immu_flags_t immu_flags); +void immu_init_inv_wait(immu_inv_wait_t *iwp, const char *s, boolean_t sync); /* immu_regs.c interfaces */ void immu_regs_setup(list_t *immu_list); @@ -813,13 +894,14 @@ void immu_regs_wbf_flush(immu_t *immu); void immu_regs_cpu_flush(immu_t *immu, caddr_t addr, uint_t size); void immu_regs_context_fsi(immu_t *immu, uint8_t function_mask, - uint16_t source_id, uint_t domain_id); -void immu_regs_context_dsi(immu_t *immu, uint_t domain_id); -void immu_regs_context_gbl(immu_t *immu); + uint16_t source_id, uint_t domain_id, immu_inv_wait_t *iwp); +void immu_regs_context_dsi(immu_t *immu, uint_t domain_id, + immu_inv_wait_t *iwp); +void immu_regs_context_gbl(immu_t *immu, immu_inv_wait_t *iwp); void immu_regs_iotlb_psi(immu_t *immu, uint_t domain_id, - uint64_t dvma, uint_t count, uint_t hint); -void immu_regs_iotlb_dsi(immu_t *immu, uint_t domain_id); -void immu_regs_iotlb_gbl(immu_t *immu); + uint64_t dvma, uint_t count, uint_t hint, immu_inv_wait_t *iwp); +void immu_regs_iotlb_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp); +void immu_regs_iotlb_gbl(immu_t *immu, immu_inv_wait_t *iwp); void immu_regs_set_root_table(immu_t *immu); void immu_regs_qinv_enable(immu_t *immu, uint64_t qinv_reg_value); @@ -838,17 +920,22 @@ void immu_dvma_shutdown(immu_t *immu); void immu_dvma_destroy(list_t *immu_list); void immu_dvma_physmem_update(uint64_t addr, uint64_t size); -int immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *, - uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags); +int immu_map_memrange(dev_info_t *, memrng_t *); +int immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, + uint_t prealloc_count, dev_info_t *rdip); int immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip); -int immu_dvma_alloc(dvcookie_t *first_dvcookie, void *arg); -void immu_dvma_free(dvcookie_t *first_dvcookie, void *arg); int immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags); immu_devi_t *immu_devi_get(dev_info_t *dip); immu_t *immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags); int pgtable_ctor(void *buf, void *arg, int kmflag); void pgtable_dtor(void *buf, void *arg); +int immu_hdl_priv_ctor(void *buf, void *arg, int 
kmf); + +int immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags); + +void immu_print_fault_info(uint_t sid, uint64_t dvma); + /* immu_intrmap.c interfaces */ void immu_intrmap_setup(list_t *immu_list); void immu_intrmap_startup(immu_t *immu); @@ -867,19 +954,37 @@ void immu_qinv_shutdown(immu_t *immu); void immu_qinv_destroy(list_t *immu_list); void immu_qinv_context_fsi(immu_t *immu, uint8_t function_mask, - uint16_t source_id, uint_t domain_id); -void immu_qinv_context_dsi(immu_t *immu, uint_t domain_id); -void immu_qinv_context_gbl(immu_t *immu); + uint16_t source_id, uint_t domain_id, immu_inv_wait_t *iwp); +void immu_qinv_context_dsi(immu_t *immu, uint_t domain_id, + immu_inv_wait_t *iwp); +void immu_qinv_context_gbl(immu_t *immu, immu_inv_wait_t *iwp); void immu_qinv_iotlb_psi(immu_t *immu, uint_t domain_id, - uint64_t dvma, uint_t count, uint_t hint); -void immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id); -void immu_qinv_iotlb_gbl(immu_t *immu); - -void immu_qinv_intr_global(immu_t *immu); -void immu_qinv_intr_one_cache(immu_t *immu, uint_t idx); -void immu_qinv_intr_caches(immu_t *immu, uint_t idx, uint_t cnt); + uint64_t dvma, uint_t count, uint_t hint, immu_inv_wait_t *iwp); +void immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id, immu_inv_wait_t *iwp); +void immu_qinv_iotlb_gbl(immu_t *immu, immu_inv_wait_t *iwp); + +void immu_qinv_intr_global(immu_t *immu, immu_inv_wait_t *iwp); +void immu_qinv_intr_one_cache(immu_t *immu, uint_t idx, immu_inv_wait_t *iwp); +void immu_qinv_intr_caches(immu_t *immu, uint_t idx, uint_t cnt, + immu_inv_wait_t *); void immu_qinv_report_fault(immu_t *immu); +#ifdef DEBUG +#define IMMU_DPROBE1(name, type1, arg1) \ + DTRACE_PROBE1(name, type1, arg1) +#define IMMU_DPROBE2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(name, type1, arg1, type2, arg2) +#define IMMU_DPROBE3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(name, type1, arg1, type2, arg2, type3, arg3) +#define IMMU_DPROBE4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) +#else +#define IMMU_DPROBE1(name, type1, arg1) +#define IMMU_DPROBE2(name, type1, arg1, type2, arg2) +#define IMMU_DPROBE3(name, type1, arg1, type2, arg2, type3, arg3) +#define IMMU_DPROBE4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) +#endif + #ifdef __cplusplus } diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h index a783e942f7..e61f1baa84 100644 --- a/usr/src/uts/i86pc/sys/machsystm.h +++ b/usr/src/uts/i86pc/sys/machsystm.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h index 48e141126b..9ca3a4908d 100644 --- a/usr/src/uts/i86pc/sys/rm_platter.h +++ b/usr/src/uts/i86pc/sys/rm_platter.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -70,7 +69,7 @@ typedef struct rm_platter { gate_desc_t *rm_idt_base; uint_t rm_pdbr; /* cr3 value */ uint_t rm_cpu; /* easy way to know which CPU we are */ - uint_t rm_x86feature; /* X86 supported features */ + uint_t rm_filler3; uint_t rm_cr4; /* cr4 value on cpu0 */ #if defined(__amd64) /* diff --git a/usr/src/uts/i86pc/sys/rootnex.h b/usr/src/uts/i86pc/sys/rootnex.h index c0e3199828..859157d1c8 100644 --- a/usr/src/uts/i86pc/sys/rootnex.h +++ b/usr/src/uts/i86pc/sys/rootnex.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ROOTNEX_H @@ -55,16 +54,22 @@ extern "C" { #define ROOTNEX_DPROF_DEC(addr) atomic_add_64(addr, -1) #define ROOTNEX_DPROBE1(name, type1, arg1) \ DTRACE_PROBE1(name, type1, arg1) +#define ROOTNEX_DPROBE2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(name, type1, arg1, type2, arg2) #define ROOTNEX_DPROBE3(name, type1, arg1, type2, arg2, type3, arg3) \ DTRACE_PROBE3(name, type1, arg1, type2, arg2, type3, arg3) +#define ROOTNEX_DPROBE4(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + DTRACE_PROBE4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) #else #define ROOTNEX_DPROF_INC(addr) #define ROOTNEX_DPROF_DEC(addr) #define ROOTNEX_DPROBE1(name, type1, arg1) +#define ROOTNEX_DPROBE2(name, type1, arg1, type2, arg2) #define ROOTNEX_DPROBE3(name, type1, arg1, type2, arg2, type3, arg3) +#define ROOTNEX_DPROBE4(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) #endif -#define ROOTNEX_PROF_INC(addr) atomic_inc_64(addr) -#define ROOTNEX_PROF_DEC(addr) atomic_add_64(addr, -1) /* set in dmac_type to signify that this cookie uses the copy buffer */ #define ROOTNEX_USES_COPYBUF 0x80000000 @@ -84,6 +89,12 @@ typedef struct rootnex_intprop_s { */ typedef struct rootnex_sglinfo_s { /* + * Used to simplify calculations to get the maximum number + * of cookies. + */ + boolean_t si_cancross; + + /* * These are passed into rootnex_get_sgl(). * * si_min_addr - the minimum physical address @@ -210,16 +221,6 @@ typedef struct rootnex_window_s { #endif } rootnex_window_t; -typedef struct dvcookie { - uint64_t dvck_dvma; - uint64_t dvck_npages; -} dvcookie_t; - -typedef struct dcookie { - paddr_t dck_paddr; - uint64_t dck_npages; -} dcookie_t; - /* per dma handle private state */ typedef struct rootnex_dma_s { /* @@ -241,7 +242,10 @@ typedef struct rootnex_dma_s { boolean_t dp_trim_required; boolean_t dp_granularity_power_2; uint64_t dp_maxxfer; + + boolean_t dp_dvma_used; ddi_dma_obj_t dp_dma; + ddi_dma_obj_t dp_dvma; rootnex_sglinfo_t dp_sglinfo; /* @@ -315,6 +319,8 @@ typedef struct rootnex_dma_s { ddi_dma_cookie_t *dp_saved_cookies; boolean_t dp_need_to_switch_cookies; + void *dp_iommu_private; + /* * pre allocated space for the bind state, allocated during alloc * handle. 
For a lot of devices, this will save us from having to do @@ -325,18 +331,6 @@ typedef struct rootnex_dma_s { uchar_t *dp_prealloc_buffer; /* - * Intel IOMMU (immu) related state - * dv_cookies saves the dvma allocated for this handler - * max index of dvcookies in dvmax - */ - dvcookie_t *dp_dvcookies; - uint64_t dp_dvmax; - dcookie_t *dp_dcookies; - uint64_t dp_dmax; - uint64_t dp_max_cookies; - uint64_t dp_max_dcookies; - - /* * sleep flags set on bind and unset on unbind */ int dp_sleep_flags; diff --git a/usr/src/uts/i86pc/sys/smp_impldefs.h b/usr/src/uts/i86pc/sys/smp_impldefs.h index 6afce7fd6c..77a203042c 100644 --- a/usr/src/uts/i86pc/sys/smp_impldefs.h +++ b/usr/src/uts/i86pc/sys/smp_impldefs.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SMP_IMPLDEFS_H diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 092cd1659b..bc8e3e197f 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -491,7 +491,7 @@ set_max_page_level() if (!kbm_largepage_support) { lvl = 0; } else { - if (x86_feature & X86_1GPG) { + if (is_x86_feature(x86_featureset, X86FSET_1GPG)) { lvl = 2; if (chk_optimal_1gtlb && cpuid_opteron_erratum(CPU, 6671130)) { @@ -528,7 +528,8 @@ mmu_init(void) * If CPU enabled the page table global bit, use it for the kernel * This is bit 7 in CR4 (PGE - Page Global Enable). */ - if ((x86_feature & X86_PGE) != 0 && (getcr4() & CR4_PGE) != 0) + if (is_x86_feature(x86_featureset, X86FSET_PGE) && + (getcr4() & CR4_PGE) != 0) mmu.pt_global = PT_GLOBAL; /* @@ -576,10 +577,10 @@ mmu_init(void) mmu.pte_size_shift = 2; } - if (mmu.pae_hat && (x86_feature & X86_PAE) == 0) + if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE)) panic("Processor does not support PAE"); - if ((x86_feature & X86_CX8) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_CX8)) panic("Processor does not support cmpxchg8b instruction"); #if defined(__amd64) @@ -1095,7 +1096,7 @@ hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags) /* nothing to set */; } else if (cache_attr & (HAT_MERGING_OK | HAT_LOADCACHING_OK)) { PTE_SET(pte, PT_NOCACHE); - if (x86_feature & X86_PAT) + if (is_x86_feature(x86_featureset, X86FSET_PAT)) PTE_SET(pte, (level == 0) ? PT_PAT_4K : PT_PAT_LARGE); else PTE_SET(pte, PT_WRITETHRU); diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c index 3bc7eb254d..bcb2b117a3 100644 --- a/usr/src/uts/i86pc/vm/htable.c +++ b/usr/src/uts/i86pc/vm/htable.c @@ -2416,7 +2416,7 @@ x86pte_zero(htable_t *dest, uint_t entry, uint_t count) size = count << mmu.pte_size_shift; ASSERT(size > BLOCKZEROALIGN); #ifdef __i386 - if ((x86_feature & X86_SSE2) == 0) + if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) bzero(dst_va, size); else #endif diff --git a/usr/src/uts/i86pc/vm/vm_dep.h b/usr/src/uts/i86pc/vm/vm_dep.h index 753aa9d146..dbad783383 100644 --- a/usr/src/uts/i86pc/vm/vm_dep.h +++ b/usr/src/uts/i86pc/vm/vm_dep.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -55,7 +54,7 @@ extern "C" { * correct tick value. The proper routine to use is tsc_read(). 
 */ -extern hrtime_t randtick(); +extern u_longlong_t randtick(); extern uint_t page_create_update_flags_x86(uint_t); extern size_t plcnt_sz(size_t);
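
The bulk of the hunks above replace tests of the old x86_feature bitmask (x86_feature & X86_FOO) with is_x86_feature(x86_featureset, X86FSET_FOO), i.e. a lookup in a bit set sized by NUM_X86_FEATURES instead of a single word of flag bits, which is also why mp_startup_common() now compares whole feature sets and the rm_platter loses its rm_x86feature word. The following is a minimal user-land sketch of that kind of bitset query; the helper names, feature IDs, and set size are assumptions made for the example, not the kernel's own BT_*()/cpuid.c implementation.

#include <limits.h>
#include <stdio.h>

/*
 * Sketch only: a CPU feature set stored as an array of unsigned longs,
 * one bit per feature, queried the way the diff uses
 * is_x86_feature(x86_featureset, X86FSET_*).
 */
#define NUM_FEATURES	256	/* assumed upper bound on feature ids */
#define WORD_BITS	(sizeof (unsigned long) * CHAR_BIT)
#define SET_WORDS	((NUM_FEATURES + WORD_BITS - 1) / WORD_BITS)

enum { FSET_MCA, FSET_TSC, FSET_MWAIT };	/* hypothetical ids */

static int
feature_test(const unsigned long *set, unsigned int f)
{
	return (((set[f / WORD_BITS] >> (f % WORD_BITS)) & 1UL) != 0);
}

static void
feature_add(unsigned long *set, unsigned int f)
{
	set[f / WORD_BITS] |= 1UL << (f % WORD_BITS);
}

int
main(void)
{
	unsigned long fset[SET_WORDS] = { 0 };

	feature_add(fset, FSET_TSC);	/* pretend cpuid reported TSC */
	printf("TSC=%d MCA=%d\n",
	    feature_test(fset, FSET_TSC), feature_test(fset, FSET_MCA));
	return (0);
}

A whole-set comparison such as the compare_x86_featureset() call in the mp_startup_common() hunk can then reduce to a word-by-word comparison of the two arrays.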
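
The new sys/apic_timer.h keeps the scaled-fixed-point conversion removed from apic.h: the tick/nanosecond ratio (apic_ticks_per_SFnsecs) is pre-scaled by SF = 2^20, and half of the divisor is added to the dividend before each division so the result rounds rather than truncates. Below is a standalone sketch of the same arithmetic with a made-up ratio; the 209715 value (roughly 0.2 APIC ticks per nanosecond, scaled by 2^20) is an assumption for illustration, since the kernel derives the real apic_ticks_per_SFnsecs from its own timer calibration, presumably via the apic_calibrate() interface declared above.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the APIC_TICKS_TO_NSECS()/APIC_NSECS_TO_TICKS() arithmetic:
 * scale the ratio by 2^20 and round when dividing by adding half of the
 * divisor to the dividend.
 */
#define SF	(1ULL << 20)

/* assumed calibration result: ~0.2 APIC ticks per nanosecond, times 2^20 */
static uint64_t ticks_per_sf_nsecs = 209715;

static int64_t
ticks_to_nsecs(int64_t ticks)
{
	return ((ticks * SF + ticks_per_sf_nsecs / 2) / ticks_per_sf_nsecs);
}

static int64_t
nsecs_to_ticks(int64_t nsecs)
{
	return ((nsecs * ticks_per_sf_nsecs + SF / 2) / SF);
}

int
main(void)
{
	int64_t t = nsecs_to_ticks(10000);	/* program a 10 usec timeout */

	printf("10000 ns -> %lld ticks -> %lld ns\n",
	    (long long)t, (long long)ticks_to_nsecs(t));
	return (0);
}

The rounding in both directions keeps short reprogramming intervals from being systematically truncated toward zero.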