| author | Jimmy Vetayases <Jimmy.Vetayases@oracle.com> | 2010-06-24 09:34:22 -0700 |
|---|---|---|
| committer | Jimmy Vetayases <Jimmy.Vetayases@oracle.com> | 2010-06-24 09:34:22 -0700 |
| commit | 7ff178cd8db129d385d3177eb20744d3b6efc59b (patch) | |
| tree | 2104196adc12fdbdd7a78a325176de2cfad29a0c /usr | |
| parent | 4b31676f89e318c11400fc0c4defc802da29222f (diff) | |
| download | illumos-joyent-7ff178cd8db129d385d3177eb20744d3b6efc59b.tar.gz | |
PSARC/2009/505 IRM Framework Extension(s)
PSARC/2009/665 Pcitool Extensions
6669984 Solaris x86 need to provide large number of interrupt vectors for MSI/MSI-x
6866130 Interrupt Resource Management (IRM) support on x86 platforms
6876744 Need a new mdb debugger module for the new apix PSM
6881939 decouple current i86xpv interrupt implementation from i86pc
6916041 Pcitool Enhancement(1M) for the new apix PSM on X86
6957091 update pcitool(1m) manpage for 6916041
--HG--
rename : usr/src/cmd/mdb/i86pc/modules/pcplusmp/apic.c => usr/src/cmd/mdb/i86pc/modules/pcplusmp/pcplusmp.c
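PSARC/2009/665 changes pcitool's x86 interrupt syntax from `-i <ino#>` to `-i <cpu#>,<ino#>`: apix allocates vectors per CPU, so an ino# alone no longer names a unique vector. A minimal sketch of what parsing that argument shape involves, assuming hex input as pcitool expects (an illustrative helper only, not the patch's parse_ino_opts() shown in pcitool_ui.c below):

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical helper: split "<cpu#>,<ino#>" (hex) into its parts.
 * Returns 0 on success, -1 if no comma is present, mirroring the
 * new x86-only requirement that both numbers be given.
 */
static int
parse_cpu_ino(char *input, uint32_t *cpu, uint8_t *ino)
{
	char *comma = strchr(input, ',');

	if (comma == NULL)
		return (-1);	/* a bare ino# is now rejected on x86 */
	*comma = '\0';
	*cpu = (uint32_t)strtoul(input, NULL, 16);
	*ino = (uint8_t)strtoul(comma + 1, NULL, 16);
	return (0);
}
```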
Diffstat (limited to 'usr')
74 files changed, 15471 insertions, 4595 deletions
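Much of the diff below hangs off one mechanism: the generic autovec code (usr/src/uts/common/io/avintr.c) now defers to hooks that a PSM may install, which is how apix substitutes its per-CPU vector tables for the single global 256-entry table. A minimal sketch of that override pattern, with simplified types (the real addintr/remintr pointers take the full add_avintr()/rem_avintr() argument lists):

```c
/* Simplified hook type; the kernel's addintr takes many more arguments. */
typedef int (*psm_intr_add_t)(void *intr_id, int lvl, int vect);

static psm_intr_add_t addintr;	/* NULL until a PSM such as apix sets it */

static int
generic_add(void *intr_id, int lvl, int vect)
{
	/* ... original global 256-entry autovect table logic ... */
	return (1);
}

int
add_avintr_sketch(void *intr_id, int lvl, int vect)
{
	/* A PSM that installed its own registration routine wins. */
	if (addintr != NULL)
		return ((*addintr)(intr_id, lvl, vect));
	return (generic_add(intr_id, lvl, vect));
}
```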
diff --git a/usr/src/cmd/intrd/intrd.pl b/usr/src/cmd/intrd/intrd.pl index 004909327b..78276e34b3 100644 --- a/usr/src/cmd/intrd/intrd.pl +++ b/usr/src/cmd/intrd/intrd.pl @@ -21,8 +21,7 @@ # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # require 5.8.4; @@ -66,7 +65,7 @@ while ($_ = shift @ARGV) { if ($using_scengen == 0) { require Sun::Solaris::Kstat; require Sun::Solaris::Intrs; - import Sun::Solaris::Intrs(qw(intrmove is_pcplusmp)); + import Sun::Solaris::Intrs(qw(intrmove is_apic)); require Sys::Syslog; import Sys::Syslog; openlog($cmdname, 'pid', 'daemon'); @@ -937,8 +936,8 @@ sub do_reconfig($) while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) { next if ($ivec->{origcpu} == $cpuid); - if (!intrmove($ivec->{buspath}, $ivec->{ino}, - $cpuid, $ivec->{num_ino})) { + if (!intrmove($ivec->{buspath}, $ivec->{origcpu}, + $ivec->{ino}, $cpuid, $ivec->{num_ino})) { syslog('warning', "Unable to move interrupts") if $warned++ == 0; syslog('debug', "Unable to move buspath ". @@ -1275,8 +1274,8 @@ my $elem0 = $elem[0]; my $elemval = (values(%$elem0))[0]; # Use its buspath to query the system. It is assumed that either all or none -# of the busses on a system are hosted by the pcplusmp APIC. -my $pcplusmp_sys = is_pcplusmp($elemval->{buspath}); +# of the busses on a system are hosted by the pcplusmp APIC or APIX. +my $pcplusmp_sys = is_apic($elemval->{buspath}); my $stat = getstat($ks, $pcplusmp_sys); diff --git a/usr/src/cmd/mdb/common/modules/genunix/irm.c b/usr/src/cmd/mdb/common/modules/genunix/irm.c index a9ce6f59d1..bf3af41eb8 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/irm.c +++ b/usr/src/cmd/mdb/common/modules/genunix/irm.c @@ -140,8 +140,17 @@ irmpools_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_devinfo2driver((uintptr_t)pool.ipool_owner, driver, sizeof (driver)); - mdb_snprintf(devname, sizeof (devname), "%s#%d", driver, - dev.devi_instance); + /* + * Include driver instance number only if the node has an + * instance number assigned (i.e. instance != -1) to it. + * This will cover cases like rootnex driver which doesn't + * have instance number assigned to it. + */ + if (dev.devi_instance != -1) + mdb_snprintf(devname, sizeof (devname), "%s#%d", driver, + dev.devi_instance); + else + mdb_snprintf(devname, sizeof (devname), "%s", driver); mdb_printf("%0?p %-18s %-8s %-6d %-9d %-8d\n", addr, devname, irm_get_type(pool.ipool_types), pool.ipool_totsz, diff --git a/usr/src/cmd/mdb/i86pc/modules/Makefile b/usr/src/cmd/mdb/i86pc/modules/Makefile index b272183c58..675a70b2a4 100644 --- a/usr/src/cmd/mdb/i86pc/modules/Makefile +++ b/usr/src/cmd/mdb/i86pc/modules/Makefile @@ -19,10 +19,10 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # SUBDIRS = \ + apix \ pcplusmp \ uppc \ unix diff --git a/usr/src/cmd/mdb/i86pc/modules/apix/Makefile b/usr/src/cmd/mdb/i86pc/modules/apix/Makefile new file mode 100644 index 0000000000..adb7244adb --- /dev/null +++ b/usr/src/cmd/mdb/i86pc/modules/apix/Makefile @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +include $(SRC)/Makefile.master +SUBDIRS = ia32 +$(BUILD64)SUBDIRS += $(MACH64) +include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/i86pc/modules/apix/amd64/Makefile b/usr/src/cmd/mdb/i86pc/modules/apix/amd64/Makefile new file mode 100644 index 0000000000..2eadc490bb --- /dev/null +++ b/usr/src/cmd/mdb/i86pc/modules/apix/amd64/Makefile @@ -0,0 +1,42 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +MODULE = apix.so +MDBTGT = kvm + +MODSRCS = apix.c apic_common.c intr_common.c + +include ../../../../../Makefile.cmd +include ../../../../../Makefile.cmd.64 +include ../../../../intel/Makefile.amd64 +include ../../../Makefile.i86pc +include ../../../../Makefile.module + +MODSRCS_DIR = ../../common + +CPPFLAGS += -DMP -D_MACHDEP +CPPFLAGS += -I../../../../common +CPPFLAGS += -I../../common +CPPFLAGS += -I$(SRC)/uts/intel +CPPFLAGS += -I$(SRC)/uts/i86pc diff --git a/usr/src/cmd/mdb/i86pc/modules/apix/apix.c b/usr/src/cmd/mdb/i86pc/modules/apix/apix.c new file mode 100644 index 0000000000..f17f6dacb3 --- /dev/null +++ b/usr/src/cmd/mdb/i86pc/modules/apix/apix.c @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include "intr_common.h" + +/* + * Globals + */ +static apic_irq_t *irq_tbl[APIC_MAX_VECTOR+1]; +static char level_tbl[APIC_MAX_VECTOR+1]; +static apix_impl_t *d_apixs[NCPU]; +static int d_ncpus = NCPU; + + +/* + * Dump interrupt information for apix PSM. + */ +/* ARGSUSED */ +int +interrupt_dump_apix(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + int i, j; + apix_impl_t apix; + apix_vector_t apix_vector; + struct autovec av; + apic_irq_t apic_irq; + + option_flags = 0; + if (mdb_getopts(argc, argv, + 'd', MDB_OPT_SETBITS, INTR_DISPLAY_DRVR_INST, &option_flags, + 'i', MDB_OPT_SETBITS, INTR_DISPLAY_INTRSTAT, &option_flags, + NULL) != argc) + return (DCMD_USAGE); + + if (mdb_readvar(&d_apixs, "apixs") == -1) { + mdb_warn("failed to read apixs"); + return (DCMD_ERR); + } + + if (mdb_readvar(&d_ncpus, "apic_nproc") == -1) { + mdb_warn("failed to read apic_nproc"); + d_ncpus = NCPU; + } + if (d_ncpus == 0 || d_ncpus > NCPU) + d_ncpus = NCPU; + + if (mdb_readvar(&irq_tbl, "apic_irq_table") == -1) { + mdb_warn("failed to read apic_irq_table"); + return (DCMD_ERR); + } + + if (mdb_readvar(&level_tbl, "apic_level_intr") == -1) { + mdb_warn("failed to read apic_level_intr"); + return (DCMD_ERR); + } + + /* Print the header first */ + if (option_flags & INTR_DISPLAY_INTRSTAT) + mdb_printf("%<u>CPU "); + else + mdb_printf("%<u>CPU/Vect IRQ IPL Bus Trg Type " + "Share APIC/INT# "); + mdb_printf("%s %</u>\n", option_flags & INTR_DISPLAY_DRVR_INST ? + "Driver Name(s)" : "ISR"); + + /* Walk all the entries */ + for (i = 0; i < d_ncpus; i++) { + /* Read the per CPU apix entry */ + if (mdb_vread(&apix, sizeof (apix_impl_t), + (uintptr_t)d_apixs[i]) == -1) + continue; + for (j = 0; j < APIX_NVECTOR; j++) { + /* Read the vector entry */ + if (mdb_vread(&apix_vector, sizeof (apix_vector_t), + (uintptr_t)apix.x_vectbl[j]) == -1) + continue; + /* If invalid vector state; continue */ + if (apix_vector.v_state == APIX_STATE_FREED || + apix_vector.v_state == APIX_STATE_OBSOLETED) + continue; + if (apix_vector.v_type == APIX_TYPE_IPI) + continue; + if (mdb_vread(&av, sizeof (struct autovec), + (uintptr_t)(apix_vector.v_autovect)) == -1) + continue; + if ((apix_vector.v_type == APIX_TYPE_FIXED) && + (mdb_vread(&apic_irq, sizeof (apic_irq_t), + (uintptr_t)irq_tbl[apix_vector.v_inum]) == -1)) + continue; + + apix_interrupt_dump(&apix_vector, &apic_irq, &av, + NULL, level_tbl[apix_vector.v_inum]); + } + } + /* print IPIs */ + if (mdb_vread(&apix, sizeof (apix_impl_t), + (uintptr_t)d_apixs[0]) != -1) { + for (j = 0; j < APIX_NVECTOR; j++) { + /* Read the vector entry */ + if (mdb_vread(&apix_vector, sizeof (apix_vector_t), + (uintptr_t)apix.x_vectbl[j]) == -1) + continue; + /* If invalid vector state; continue */ + if (apix_vector.v_state == APIX_STATE_FREED || + apix_vector.v_state == APIX_STATE_OBSOLETED) + continue; + if (apix_vector.v_type != APIX_TYPE_IPI) + continue; + if (mdb_vread(&av, sizeof (struct autovec), + (uintptr_t)(apix_vector.v_autovect)) == -1) { + /* v_share for poke_cpu is 0 */ + if (apix_vector.v_share != 0) + continue; + } + apix_interrupt_ipi_dump(&apix_vector, &av, NULL); + } + } + + return (DCMD_OK); +} + +/* + * MDB module linkage information: + * + * We declare a list of structures describing our dcmds, and a function + * named _mdb_init to return a pointer to our module information. 
+ */ +static const mdb_dcmd_t dcmds[] = { + { "interrupts", "?[-di]", "print interrupts", interrupt_dump_apix, + interrupt_help}, + { "softint", "?[-d]", "print soft interrupts", soft_interrupt_dump, + soft_interrupt_help}, + { "apic", NULL, "print apic register contents", apic }, + { "ioapic", NULL, "print ioapic register contents", ioapic }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, NULL }; + +const mdb_modinfo_t * +_mdb_init(void) +{ + GElf_Sym sym; + + if (mdb_lookup_by_name("gld_intr", &sym) != -1) + if (GELF_ST_TYPE(sym.st_info) == STT_FUNC) + gld_intr_addr = (uintptr_t)sym.st_value; + + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/i86pc/modules/apix/ia32/Makefile b/usr/src/cmd/mdb/i86pc/modules/apix/ia32/Makefile new file mode 100644 index 0000000000..8671b51407 --- /dev/null +++ b/usr/src/cmd/mdb/i86pc/modules/apix/ia32/Makefile @@ -0,0 +1,41 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# + +MODULE = apix.so +MDBTGT = kvm + +MODSRCS = apix.c apic_common.c intr_common.c + +include ../../../../../Makefile.cmd +include ../../../../intel/Makefile.ia32 +include ../../../Makefile.i86pc +include ../../../../Makefile.module + +MODSRCS_DIR = ../../common + +CPPFLAGS += -DMP -D_MACHDEP +CPPFLAGS += -I../../../../common +CPPFLAGS += -I../../common +CPPFLAGS += -I$(SRC)/uts/intel +CPPFLAGS += -I$(SRC)/uts/i86pc diff --git a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/apic.c b/usr/src/cmd/mdb/i86pc/modules/common/apic_common.c index 2e048223ea..e4db9b61a5 100644 --- a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/apic.c +++ b/usr/src/cmd/mdb/i86pc/modules/common/apic_common.c @@ -19,87 +19,23 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "intr_common.h" -/* - * Globals - */ -static struct av_head avec_tbl[APIC_MAX_VECTOR+1]; -static apic_irq_t *irq_tbl[APIC_MAX_VECTOR+1], airq; -static char level_tbl[APIC_MAX_VECTOR+1]; - -/* - * interrupt_dump: - * - * Dump interrupt information. 
- */ -/* ARGSUSED */ -int -interrupt_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - int i; - - option_flags = 0; - if (mdb_getopts(argc, argv, - 'd', MDB_OPT_SETBITS, INTR_DISPLAY_DRVR_INST, &option_flags, - 'i', MDB_OPT_SETBITS, INTR_DISPLAY_INTRSTAT, &option_flags, - NULL) != argc) - return (DCMD_USAGE); - - if (mdb_readvar(&irq_tbl, "apic_irq_table") == -1) { - mdb_warn("failed to read apic_irq_table"); - return (DCMD_ERR); - } - - if (mdb_readvar(&level_tbl, "apic_level_intr") == -1) { - mdb_warn("failed to read apic_level_intr"); - return (DCMD_ERR); - } - - if (mdb_readvar(&avec_tbl, "autovect") == -1) { - mdb_warn("failed to read autovect"); - return (DCMD_ERR); - } - - /* Print the header first */ - if (option_flags & INTR_DISPLAY_INTRSTAT) - mdb_printf("%<u>CPU "); - else - mdb_printf( - "%<u>IRQ Vect IPL Bus Trg Type CPU Share APIC/INT# "); - mdb_printf("%s %</u>\n", option_flags & INTR_DISPLAY_DRVR_INST ? - "Driver Name(s)" : "ISR(s)"); - - /* Walk all the entries */ - for (i = 0; i < APIC_MAX_VECTOR + 1; i++) { - /* Read the entry */ - if (mdb_vread(&airq, sizeof (apic_irq_t), - (uintptr_t)irq_tbl[i]) == -1) - continue; - - apic_interrupt_dump(&airq, &avec_tbl[i], i, NULL, level_tbl[i]); - } - - return (DCMD_OK); -} /* Macros for reading/writing the IOAPIC RDT entries */ -#define READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, ipin) \ - ioapic_read(ioapic_ix, APIC_RDT_CMD + (2 * (ipin))) +#define APIC_READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, ipin) \ + apic_ioapic_read(ioapic_ix, APIC_RDT_CMD + (2 * (ipin))) -#define READ_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapic_ix, ipin) \ - ioapic_read(ioapic_ix, APIC_RDT_CMD2 + (2 * (ipin))) +#define APIC_READ_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapic_ix, ipin) \ + apic_ioapic_read(ioapic_ix, APIC_RDT_CMD2 + (2 * (ipin))) static uint32_t *ioapic_adr[MAX_IO_APIC]; -uint32_t -ioapic_read(int ioapic_ix, uint32_t reg) +static uint32_t +apic_ioapic_read(int ioapic_ix, uint32_t reg) { volatile uint32_t *ioapic; @@ -112,7 +48,7 @@ ioapic_read(int ioapic_ix, uint32_t reg) * ioapic dcmd - Print out the ioapic registers, nicely formatted. */ /*ARGSUSED*/ -static int +int ioapic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { uint32_t apic_io_max; @@ -146,15 +82,15 @@ ioapic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) for (i = 0; i < apic_io_max; i++) { /* Bits 23-16 define the maximum redirection entries */ - reg_max = ioapic_read(i, APIC_VERS_CMD); + reg_max = apic_ioapic_read(i, APIC_VERS_CMD); reg_max = (reg_max >> 16) & 0xff; mdb_printf("%4s %8s %8s\n", "reg", "high", " low"); for (reg = 0; reg <= reg_max; reg++) { uint32_t high, low; - high = READ_IOAPIC_RDT_ENTRY_HIGH_DWORD(i, reg); - low = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(i, reg); + high = APIC_READ_IOAPIC_RDT_ENTRY_HIGH_DWORD(i, reg); + low = APIC_READ_IOAPIC_RDT_ENTRY_LOW_DWORD(i, reg); mdb_printf("%2d %8x %8x\n", reg, high, low); } @@ -171,7 +107,7 @@ ioapic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) * apic dcmd - Print out the apic registers, nicely formatted. */ /*ARGSUSED*/ -static int +int apic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { uint32_t *papic; @@ -203,34 +139,3 @@ apic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } - - -/* - * MDB module linkage information: - * - * We declare a list of structures describing our dcmds, and a function - * named _mdb_init to return a pointer to our module information. 
- */ -static const mdb_dcmd_t dcmds[] = { - { "interrupts", "?[-di]", "print interrupts", interrupt_dump, - interrupt_help}, - { "softint", "?[-d]", "print soft interrupts", soft_interrupt_dump, - soft_interrupt_help}, - { "apic", NULL, "print apic register contents", apic }, - { "ioapic", NULL, "print ioapic register contents", ioapic }, - { NULL } -}; - -static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, NULL }; - -const mdb_modinfo_t * -_mdb_init(void) -{ - GElf_Sym sym; - - if (mdb_lookup_by_name("gld_intr", &sym) != -1) - if (GELF_ST_TYPE(sym.st_info) == STT_FUNC) - gld_intr_addr = (uintptr_t)sym.st_value; - - return (&modinfo); -} diff --git a/usr/src/cmd/mdb/i86pc/modules/common/intr_common.c b/usr/src/cmd/mdb/i86pc/modules/common/intr_common.c index aa5430050e..691f0c4064 100644 --- a/usr/src/cmd/mdb/i86pc/modules/common/intr_common.c +++ b/usr/src/cmd/mdb/i86pc/modules/common/intr_common.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "intr_common.h" #include <sys/multidata.h> #include <sys/gld.h> @@ -196,6 +193,21 @@ get_interrupt_type(short index) return ("Fixed"); } +static char * +get_apix_interrupt_type(short type) +{ + if (type == APIX_TYPE_IPI) + return ("IPI"); + else if (type == APIX_TYPE_FIXED) + return ("Fixed"); + else if (type == APIX_TYPE_MSI) + return ("MSI"); + else if (type == APIX_TYPE_MSIX) + return ("MSI-X"); + else + return ("Fixed"); +} + void apic_interrupt_dump(apic_irq_t *irqp, struct av_head *avp, int i, ushort_t *evtchnp, char level) @@ -298,3 +310,146 @@ apic_interrupt_dump(apic_irq_t *irqp, struct av_head *avp, } mdb_printf("\n"); } + +void +apix_interrupt_dump(apix_vector_t *vectp, apic_irq_t *irqp, + struct autovec *avp, ushort_t *evtchnp, char level) +{ + int j; + int bus_type; + char *intr_type; + char irq[4]; + char ioapic_iline[10]; + char ipl[3]; + char cpu_assigned[4]; + char cpu_vector[10]; + char evtchn[8]; + + + /* If invalid vector state; continue */ + if (vectp->v_state == APIX_STATE_FREED || + vectp->v_state == APIX_STATE_OBSOLETED) + return; + + /* use apic_interrupt_ipi_dump for IPIs */ + if (vectp->v_type == APIX_TYPE_IPI) + return; + + /* Figure out interrupt type and trigger information */ + intr_type = get_apix_interrupt_type(vectp->v_type); + + /* Figure out IOAPIC number and ILINE number */ + if (vectp->v_type != APIX_TYPE_FIXED) { + level = 0; /* MSI/MSI-X are Edge trigger */ + (void) mdb_snprintf(irq, 4, "- "); + (void) mdb_snprintf(ioapic_iline, 10, "- "); + if (vectp->v_type == APIX_TYPE_IPI) + bus_type = BUSTYPE_NONE; + else + /* statically assign MSI/X with "PCI" */ + bus_type = BUSTYPE_PCI; + } else { + (void) mdb_snprintf(irq, 4, "%d", vectp->v_inum); + bus_type = irqp->airq_iflag.bustype; + if (!irqp->airq_ioapicindex && !irqp->airq_intin_no) { + if (strcmp(intr_type, "Fixed") == 0) + (void) mdb_snprintf(ioapic_iline, 10, + "0x%x/0x%x", irqp->airq_ioapicindex, + irqp->airq_intin_no); + else + (void) mdb_snprintf(ioapic_iline, 10, "- "); + } else + (void) mdb_snprintf(ioapic_iline, 10, "0x%x/0x%x", + irqp->airq_ioapicindex, irqp->airq_intin_no); + } + + evtchn[0] = '\0'; + if (evtchnp != NULL) + (void) mdb_snprintf(evtchn, 8, "%-7hd", *evtchnp); + + (void) mdb_snprintf(cpu_assigned, 4, "%d", vectp->v_cpuid); + (void) mdb_snprintf(cpu_vector, 10, "%d/0x%x", + vectp->v_cpuid, vectp->v_vector); + 
+ /* Loop all the shared vectors */ + for (j = 0; j < vectp->v_share; ) { + /* shared interrupts with one or more ISR removed afterwards */ + if (avp->av_vector == NULL) { + if (mdb_vread(avp, sizeof (struct autovec), + (uintptr_t)avp->av_link) == -1) + break; + else + continue; + } + + (void) mdb_snprintf(ipl, 3, "%d", avp->av_prilevel); + /* Print each interrupt entry */ + if (option_flags & INTR_DISPLAY_INTRSTAT) + mdb_printf("%-4s", cpu_assigned); + else + mdb_printf("%-9s %-3s %s%-3s %-6s %-3s %-6s %-3d " + "%-9s ", cpu_vector, irq, evtchn, ipl, + (bus_type ? businfo_array[bus_type] : "-"), + (level ? "Lvl" : "Edg"), + intr_type, vectp->v_share, ioapic_iline); + + interrupt_print_isr((uintptr_t)avp->av_vector, + (uintptr_t)avp->av_intarg1, (uintptr_t)avp->av_dip); + mdb_printf("\n"); + + if (++j == vectp->v_share) + break; /* done */ + + if (mdb_vread(avp, sizeof (struct autovec), + (uintptr_t)avp->av_link) == -1) + break; + } +} + +void +apix_interrupt_ipi_dump(apix_vector_t *vectp, struct autovec *avp, + ushort_t *evtchnp) +{ + char *intr_type = "IPI"; + char ioapic_iline[10]; + char ipl[3]; + char cpu_assigned[4]; + char cpu_vector[10]; + char evtchn[8]; + + /* If invalid vector state; continue */ + if (vectp->v_state == APIX_STATE_FREED || + vectp->v_state == APIX_STATE_OBSOLETED) + return; + + if (vectp->v_type != APIX_TYPE_IPI) + return; + + /* No IOAPIC number and ILINE number info */ + (void) mdb_snprintf(ioapic_iline, 10, "- "); + + evtchn[0] = '\0'; + if (evtchnp != NULL) + (void) mdb_snprintf(evtchn, 8, "%-7hd", *evtchnp); + + /* IPI targeted ALL cpus */ + mdb_snprintf(cpu_assigned, 4, "all"); + (void) mdb_snprintf(cpu_vector, 10, "%s/0x%x", + "all", vectp->v_vector); + /* IPI is not shared interrupt, so we can get the IPL from v_pri */ + (void) mdb_snprintf(ipl, 3, "%d", vectp->v_pri); + + /* Print each interrupt entry */ + if (option_flags & INTR_DISPLAY_INTRSTAT) + mdb_printf("%-4s", cpu_assigned); + else + mdb_printf("%-9s %-3s %s%-3s %-6s %-3s %-6s %-3d %-9s ", + cpu_vector, "- ", evtchn, ipl, "- ", "Edg", + intr_type, vectp->v_share, ioapic_iline); + if (!vectp->v_share) + mdb_printf("poke_cpu"); + else + mdb_printf("%a", avp->av_vector); + + mdb_printf("\n"); +} diff --git a/usr/src/cmd/mdb/i86pc/modules/common/intr_common.h b/usr/src/cmd/mdb/i86pc/modules/common/intr_common.h index 19c90f04bc..c3f187f777 100644 --- a/usr/src/cmd/mdb/i86pc/modules/common/intr_common.h +++ b/usr/src/cmd/mdb/i86pc/modules/common/intr_common.h @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _MDB_INTR_COMMON_H #define _MDB_INTR_COMMON_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -40,6 +37,7 @@ extern "C" { #include <sys/psm_common.h> #include <sys/pic.h> #include <sys/apic.h> +#include <sys/apix.h> /* * Function prototypes @@ -48,6 +46,12 @@ void interrupt_help(void); void interrupt_print_isr(uintptr_t, uintptr_t, uintptr_t); void apic_interrupt_dump(apic_irq_t *, struct av_head *, int i, ushort_t *, char); +void apix_interrupt_dump(apix_vector_t *, apic_irq_t *, + struct autovec *, ushort_t *, char); +void apix_interrupt_ipi_dump(apix_vector_t *, struct autovec *, + ushort_t *); +int ioapic(uintptr_t, uint_t, int, const mdb_arg_t *); +int apic(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); void soft_interrupt_help(void); int soft_interrupt_dump(uintptr_t, uint_t, int, const mdb_arg_t *); @@ -61,6 +65,9 @@ int soft_interrupt_dump(uintptr_t, uint_t, int, const mdb_arg_t *); extern int option_flags; +#define BUSTYPE_PCI 0x0d +#define BUSTYPE_NONE 0x00 + /* * gld_intr_addr is used to save address of gld_intr() ISR */ diff --git a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/amd64/Makefile b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/amd64/Makefile index 88bf7fd4c8..04542d0fe1 100644 --- a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/amd64/Makefile +++ b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/amd64/Makefile @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # -#ident "%Z%%M% %I% %E% SMI" MODULE = pcplusmp.so MDBTGT = kvm -MODSRCS = apic.c intr_common.c +MODSRCS = pcplusmp.c apic_common.c intr_common.c include ../../../../../Makefile.cmd include ../../../../../Makefile.cmd.64 diff --git a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/ia32/Makefile b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/ia32/Makefile index e0e8edc45b..46899c52a1 100644 --- a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/ia32/Makefile +++ b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/ia32/Makefile @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # -#ident "%Z%%M% %I% %E% SMI" MODULE = pcplusmp.so MDBTGT = kvm -MODSRCS = apic.c intr_common.c +MODSRCS = pcplusmp.c apic_common.c intr_common.c include ../../../../../Makefile.cmd include ../../../../intel/Makefile.ia32 diff --git a/usr/src/cmd/mdb/i86pc/modules/pcplusmp/pcplusmp.c b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/pcplusmp.c new file mode 100644 index 0000000000..f5b3af0027 --- /dev/null +++ b/usr/src/cmd/mdb/i86pc/modules/pcplusmp/pcplusmp.c @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "intr_common.h" + +/* + * Globals + */ +static struct av_head avec_tbl[APIC_MAX_VECTOR+1]; +static apic_irq_t *irq_tbl[APIC_MAX_VECTOR+1], airq; +static char level_tbl[APIC_MAX_VECTOR+1]; + +/* + * Dump interrupt information for pcplusmp PSM. + */ +/* ARGSUSED */ +int +interrupt_dump_apic(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv) +{ + int i; + + option_flags = 0; + if (mdb_getopts(argc, argv, + 'd', MDB_OPT_SETBITS, INTR_DISPLAY_DRVR_INST, &option_flags, + 'i', MDB_OPT_SETBITS, INTR_DISPLAY_INTRSTAT, &option_flags, + NULL) != argc) + return (DCMD_USAGE); + + if (mdb_readvar(&irq_tbl, "apic_irq_table") == -1) { + mdb_warn("failed to read apic_irq_table"); + return (DCMD_ERR); + } + + if (mdb_readvar(&level_tbl, "apic_level_intr") == -1) { + mdb_warn("failed to read apic_level_intr"); + return (DCMD_ERR); + } + + if (mdb_readvar(&avec_tbl, "autovect") == -1) { + mdb_warn("failed to read autovect"); + return (DCMD_ERR); + } + + /* Print the header first */ + if (option_flags & INTR_DISPLAY_INTRSTAT) + mdb_printf("%<u>CPU "); + else + mdb_printf( + "%<u>IRQ Vect IPL Bus Trg Type CPU Share APIC/INT# "); + mdb_printf("%s %</u>\n", option_flags & INTR_DISPLAY_DRVR_INST ? + "Driver Name(s)" : "ISR(s)"); + + /* Walk all the entries */ + for (i = 0; i < APIC_MAX_VECTOR + 1; i++) { + /* Read the entry */ + if (mdb_vread(&airq, sizeof (apic_irq_t), + (uintptr_t)irq_tbl[i]) == -1) + continue; + + apic_interrupt_dump(&airq, &avec_tbl[i], i, NULL, level_tbl[i]); + } + + return (DCMD_OK); +} + + +/* + * MDB module linkage information: + * + * We declare a list of structures describing our dcmds, and a function + * named _mdb_init to return a pointer to our module information. + */ +static const mdb_dcmd_t dcmds[] = { + { "interrupts", "?[-di]", "print interrupts", interrupt_dump_apic, + interrupt_help}, + { "softint", "?[-d]", "print soft interrupts", soft_interrupt_dump, + soft_interrupt_help}, + { "apic", NULL, "print apic register contents", apic }, + { "ioapic", NULL, "print ioapic register contents", ioapic }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, NULL }; + +const mdb_modinfo_t * +_mdb_init(void) +{ + GElf_Sym sym; + + if (mdb_lookup_by_name("gld_intr", &sym) != -1) + if (GELF_ST_TYPE(sym.st_info) == STT_FUNC) + gld_intr_addr = (uintptr_t)sym.st_value; + + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c index cb7619aef4..f73594ef22 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <mdb/mdb_modapi.h> @@ -36,9 +35,15 @@ #include <sys/mutex.h> #include <sys/mutex_impl.h> #include "i86mmu.h" +#include <sys/apix.h> #define TT_HDLR_WIDTH 17 + +/* apix only */ +static apix_impl_t *d_apixs[NCPU]; +static int use_apix = 0; + static int ttrace_ttr_size_check(void) { @@ -267,6 +272,53 @@ ttrace_interrupt(trap_trace_rec_t *rec) return (0); } +static int +ttrace_apix_interrupt(trap_trace_rec_t *rec) +{ + struct autovec av; + apix_impl_t apix; + apix_vector_t apix_vector; + + switch (rec->ttr_regs.r_trapno) { + case T_SOFTINT: + mdb_printf("%-3s %-*s", "-", TT_HDLR_WIDTH, "(fakesoftint)"); + return (0); + default: + break; + } + + mdb_printf("%-3x ", rec->ttr_vector); + + /* Read the per CPU apix entry */ + if (mdb_vread(&apix, sizeof (apix_impl_t), + (uintptr_t)d_apixs[rec->ttr_cpuid]) == -1) { + mdb_warn("\ncouldn't read apix[%d]", rec->ttr_cpuid); + return (-1); + } + if (mdb_vread(&apix_vector, sizeof (apix_vector_t), + (uintptr_t)apix.x_vectbl[rec->ttr_vector]) == -1) { + mdb_warn("\ncouldn't read apix_vector_t[%d]", rec->ttr_vector); + return (-1); + } + if (apix_vector.v_share == 0) { + if (rec->ttr_ipl == XC_CPUPOKE_PIL) + mdb_printf("%-*s", TT_HDLR_WIDTH, "(cpupoke)"); + else + mdb_printf("%-*s", TT_HDLR_WIDTH, "(spurious)"); + } else { + if (mdb_vread(&av, sizeof (struct autovec), + (uintptr_t)(apix_vector.v_autovect)) == -1) { + mdb_warn("couldn't read autovec at %p", + (uintptr_t)apix_vector.v_autovect); + } + + mdb_printf("%-*a", TT_HDLR_WIDTH, av.av_vector); + } + + return (0); +} + + static struct { int tt_trapno; char *tt_name; @@ -505,6 +557,20 @@ ttrace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) dcmd.ttd_cpu = addr; } + if (mdb_readvar(&use_apix, "apix_enable") == -1) { + mdb_warn("failed to read apix_enable"); + use_apix = 0; + } + + if (use_apix) { + if (mdb_readvar(&d_apixs, "apixs") == -1) { + mdb_warn("\nfailed to read apixs."); + return (DCMD_ERR); + } + /* change to apix ttrace interrupt handler */ + ttrace_hdlr[4].t_hdlr = ttrace_apix_interrupt; + } + if (mdb_walk("ttrace", (mdb_walk_cb_t)ttrace_walk, &dcmd) == -1) { mdb_warn("couldn't walk 'ttrace'"); return (DCMD_ERR); diff --git a/usr/src/cmd/mdb/i86pc/modules/uppc/uppc.c b/usr/src/cmd/mdb/i86pc/modules/uppc/uppc.c index 0bd87f8a65..c198d257d3 100644 --- a/usr/src/cmd/mdb/i86pc/modules/uppc/uppc.c +++ b/usr/src/cmd/mdb/i86pc/modules/uppc/uppc.c @@ -19,12 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "intr_common.h" static struct av_head avec_tbl[APIC_MAX_VECTOR+1]; @@ -89,10 +86,10 @@ uppc_interrupt_dump(uintptr_t addr, uint_t flags, int argc, } /* - * By default, on all x86 systems ::interrupts from uppc(7d) gets - * loaded first. For APIC systems the ::interrupts from pcplusmp(7d) - * ought to be executed. Confusion stems as both modules export the - * same dcmd. + * By default, on all x86 systems ::interrupts from uppc gets + * loaded first. For APIC systems the ::interrupts from either + * apix or pcplusmp ought to be executed. Confusion stems as + * these three modules export the same dcmd. 
 */
	for (i = 0; i < MAX_ISA_IRQ + 1; i++)
		if (shared_tbl[i]) {
@@ -101,7 +98,10 @@ uppc_interrupt_dump(uintptr_t addr, uint_t flags, int argc,
 	}

 	if (found == B_FALSE) {
-		if (mdb_lookup_by_obj("pcplusmp", "apic_irq_table",
+		if (mdb_lookup_by_obj("apix", "apixs", NULL) == 0) {
+			return (mdb_call_dcmd("apix`interrupts",
+			    addr, flags, argc, argv));
+		} else if (mdb_lookup_by_obj("pcplusmp", "apic_irq_table",
 		    NULL) == 0) {
 			return (mdb_call_dcmd("pcplusmp`interrupts",
 			    addr, flags, argc, argv));
diff --git a/usr/src/cmd/pcitool/pcitool.1m b/usr/src/cmd/pcitool/pcitool.1m
index 9ffd25241e..461d0df8e4 100644
--- a/usr/src/cmd/pcitool/pcitool.1m
+++ b/usr/src/cmd/pcitool/pcitool.1m
@@ -17,16 +17,16 @@
 .\"
 .\" CDDL HEADER END
 .\"
-.\" Copyright 2010 Sun Microsystems, Inc. All rights reserved.
-.\" Use is subject to license terms.
+.\" Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+.\"
 .TH pcitool 1M
 .SH "NAME"
 pcitool \- interrupt routing tool
 .SH "SYNOPSIS"
 .PP
-/usr/sbin/pcitool \fIpci@<unit-address>\fR|\fIniu@<unit-address>\fR -i \fI<ino#> | all\fR [ -r [ -c ] | -w \fI<cpu#>\fR [ -g ] ] [ -v ] [ -q ]
+/usr/sbin/pcitool \fIpci@<unit-address>\fR|\fIniu@<unit-address>\fR -i \fI<[cpu#],ino#> | all\fR [ -r [ -c ] | -w \fI<cpu#>\fR [ -g ] ] [ -v ] [ -q ]

-/usr/sbin/pcitool \fIpci@<unit-address>\fR -m \fI<msi#> | all\fR [ -r [ -c ] | -w \fI<cpu#>\fR [ -g ] ] [ -v ] [ -q ]
+/usr/sbin/pcitool \fIpci@<unit-address>\fR -m \fI<[cpu#],msi#> | all\fR [ -r [ -c ] | -w \fI<cpu#>\fR [ -g ] ] [ -v ] [ -q ]

 /usr/sbin/pcitool [ -h ]

@@ -44,13 +44,16 @@
 specific CPU. Use the \fIpcitool -m\fR command to retrieve and reroute MSI/Xs.

 On SPARC platforms, the INO is mapped to an interrupt mondo, whereas one or
 more MSI/Xs are mapped to an INO. So, INO and MSI/Xs are individually
 retargetable. Use the "-i" option to retrieve or reroute a given INO, whereas
-use "-m" option for MSI/Xs.
+use the "-m" option for MSI/Xs.

 On x86 platforms, both INOs and MSI/Xs are mapped to the same interrupt
 vectors. Use the "-i" option to retrieve and reroute any interrupt vectors
 (both INO and MSI/Xs). So, the "-m" option is not required on x86 platforms;
 hence it is not supported.

+[cpu#] is supported only on x86 platforms, where it identifies, together with
+ino#, exactly one interrupt vector. [cpu#] is not accepted on SPARC platforms.
+
 \fIRequired privileges\fR

 The user must have all privileges in order to access interrupt
@@ -73,50 +76,48 @@
 Dump interrupt controller information with -c.

 If neither -r nor -w are provided on the command line, -r is assumed.

-The command for showing all INOs on /pci@1e,600000 is:
+The command for showing all INOs on /pci@0,0 is:

-  # pcitool /pci@1e,600000 -i all
-
-The command for showing ino 0x19 on the same root nexus, along with sample
+  # pcitool /pci@0,0 -i all
+
+The command for showing ino <0x0,0x21> on the same root nexus, along with sample
 output, is:
-
-  # pcitool /pci@1e,600000 -i 19
-
-  ino 19 mapped to cpu 0
-  Device: /pci@1e,600000/pci@0/pci@9/pci@0/scsi@1
-  Driver: mpt, instance 0
-
-  ino 19 mapped to cpu 0
-  Device: /pci@1e,600000/pci@0/pci@2/pci@0/network@4
-  Driver: bge, instance 0
+
+On x86 platforms:
+  # pcitool /pci@0,0 -i 0,21
+  0x0,0x21: mpt 0 /pci@7b,0/pci1022,7458@11/pci1000,3060@2
+
+On SPARC platforms:
+  # pcitool /pci@0,0 -i 21
+  0x0,0x21: mpt 0 /pci@7b,0/pci1022,7458@11/pci1000,3060@2

 The command for showing MSI 0x1 on the same root nexus, along with sample
 output, is:

-# pcitool /pci@1e,600000 -m 1
-
-  msi 1 mapped to cpu 0
-  Device: /pci@1e,600000/pci@0/pci@9/pci@0/scsi@1
-  Driver: mpt, instance 0
+  # pcitool /pci@0,0 -m 0x1
+  0x0,0x1: pcieb 0 /pci@7b,0/pci10de,5d@e

 -w \fI<cpu#>\fR [ -g ]

 Route the given INO or MSI/X to the given CPU. Display the new and original
 routing information. The INO or MSI/X must be specified.

-Successful rerouting ino 19 above from cpu 0 to cpu 1 gives the following
+Successfully rerouting ino 21 above from cpu 0 to cpu 1 gives the following
 output:

-  # pcitool /pci@1e,600000 -i 19 -w 1
-
-  Interrupts on ino 19 reassigned: Old cpu: 0, New cpu: 1
-
+On x86 platforms:
+  # pcitool /pci@0,0 -i 0,21 -w 1
+  0x0,0x21 -> 0x1,0x20
+
+On SPARC platforms:
+  # pcitool /pci@0,0 -i 21 -w 1
+  0x0,0x21 -> 0x1,0x21
+
 Successfully rerouting msi 1 above from cpu 1 to cpu 0 gives the following
 output:

-  # pcitool /pci@1e,600000 -m 1 -w 0
-
-  Interrupts on msi 1 reassigned: Old cpu: 1, New cpu: 0
+  # pcitool /pci@0,0 -m 1 -w 0
+  0x1,0x1 -> 0x0,0x1

 On some platforms (such as X86) multiple MSI interrupts of a single function
 need to be rerouted together. Use -g to do this. -g works only on supported
@@ -124,12 +125,16 @@
 platforms and only for groups of MSI interrupts. (A "group" of 1 is accepted.)
 When -g is used, the vector provided must be the lowest-numbered vector of the
 group. The size of the group is determined internally.
-Successful rerouting a group of INOs starting at 60 from cpu 0 to cpu 1 gives +Successful rerouting a group of INOs starting at 24 from cpu 0 to cpu 1 gives the following output: - # pcitool /pci@0,0 -i 60 -w 1 -g +On x86 platform: + # pcitool /pci@0,0 -i 3,24 -w 1 -g + 0x3,0x24 => 0x1,0x22 - Interrupts on ino group starting at ino 60 reassigned: Old cpu: 0, New cpu: 1 +On SPARC platform: + # pcitool /pci@0,0 -i 24 -w 1 -g + 0x3,0x24 => 0x1,0x22 -v diff --git a/usr/src/cmd/pcitool/pcitool.c b/usr/src/cmd/pcitool/pcitool.c index 951980ef4a..3b54d34b21 100644 --- a/usr/src/cmd/pcitool/pcitool.c +++ b/usr/src/cmd/pcitool/pcitool.c @@ -1410,18 +1410,19 @@ print_intr_info(pcitool_intr_get_t *iget_p) { int i; - if (iget_p->flags & PCITOOL_INTR_FLAG_GET_MSI) - (void) printf("\nmsi 0x%x mapped to cpu 0x%x\n", - iget_p->msi, iget_p->cpu_id); - else - (void) printf("\nino 0x%x mapped to cpu 0x%x\n", - iget_p->ino, iget_p->cpu_id); - for (i = 0; i < iget_p->num_devs; i++) { - (void) printf("Device: %s\n", iget_p->dev[i].path); - (void) printf(" Driver: %s, instance %d\n", - iget_p->dev[i].driver_name, iget_p->dev[i].dev_inst); + if (iget_p->flags & PCITOOL_INTR_FLAG_GET_MSI) + (void) printf("0x%x,0x%x: %-10s%d\t %s\n", + iget_p->cpu_id, iget_p->msi & 0xff, + iget_p->dev[i].driver_name, iget_p->dev[i].dev_inst, + iget_p->dev[i].path); + else + (void) printf("0x%x,0x%x: %-10s%d\t %s\n", + iget_p->cpu_id, iget_p->ino & 0xff, + iget_p->dev[i].driver_name, iget_p->dev[i].dev_inst, + iget_p->dev[i].path); } + } /* @@ -1511,6 +1512,7 @@ static int get_interrupts(int fd, pcitool_uiargs_t *input_args_p) { int rval = SUCCESS; /* Return status. */ + int ino, cpu_id; /* * Start with a struct with space for info of INIT_NUM_DEVS devs @@ -1559,6 +1561,7 @@ get_interrupts(int fd, pcitool_uiargs_t *input_args_p) /* Explicit INO requested. */ } else if (input_args_p->flags & INO_SPEC_FLAG) { iget_p->ino = input_args_p->intr_ino; + iget_p->cpu_id = input_args_p->old_cpu; rval = get_single_interrupt(fd, &iget_p, input_args_p); /* Return all INOs. */ } else if (input_args_p->flags & INO_ALL_FLAG) { @@ -1571,19 +1574,41 @@ get_interrupts(int fd, pcitool_uiargs_t *input_args_p) "intr info ioctl failed: %s\n", strerror(errno)); } - } else { - int ino; + free(iget_p); + return (rval); + } - /* - * Search through all interrupts. - * Display info on enabled ones. - */ + /* + * Search through all interrupts. + * Display info on enabled ones. 
+ */ + if (intr_info.ctlr_type == PCITOOL_CTLR_TYPE_APIX) { + for (cpu_id = 0; + ((cpu_id < intr_info.num_cpu) && (rval == SUCCESS)); + cpu_id++) { + for (ino = 0; + ((ino < intr_info.num_intr) && + (rval == SUCCESS)); + ino++) { + bzero(iget_p, + sizeof (pcitool_intr_get_t)); + iget_p->num_devs_ret = INIT_NUM_DEVS; + iget_p->user_version = PCITOOL_VERSION; + iget_p->cpu_id = cpu_id; + iget_p->ino = ino; + rval = get_single_interrupt( + fd, &iget_p, input_args_p); + } + } + } else { for (ino = 0; - ((ino < intr_info.num_intr) && (rval == SUCCESS)); + (ino < intr_info.num_intr) && (rval == SUCCESS); ino++) { - bzero(iget_p, sizeof (pcitool_intr_get_t)); + bzero(iget_p, + sizeof (pcitool_intr_get_t)); iget_p->num_devs_ret = INIT_NUM_DEVS; iget_p->user_version = PCITOOL_VERSION; + iget_p->cpu_id = input_args_p->old_cpu; iget_p->ino = ino; rval = get_single_interrupt( fd, &iget_p, input_args_p); @@ -1623,6 +1648,10 @@ get_interrupt_ctlr(int fd, pcitool_uiargs_t *input_args_p) case PCITOOL_CTLR_TYPE_PCPLUSMP: ctlr_type = "PCPLUSMP"; break; + case PCITOOL_CTLR_TYPE_APIX: + ctlr_type = "APIX"; + break; + default: break; } @@ -1677,6 +1706,7 @@ set_interrupts(int fd, pcitool_uiargs_t *input_args_p) } iset.cpu_id = input_args_p->intr_cpu; + iset.old_cpu = input_args_p->old_cpu; iset.user_version = PCITOOL_VERSION; iset.flags |= (input_args_p->flags & SETGRP_FLAG) ? PCITOOL_INTR_FLAG_SET_GROUP : 0; @@ -1693,14 +1723,29 @@ set_interrupts(int fd, pcitool_uiargs_t *input_args_p) rval = errno; } else { if (input_args_p->flags & SETGRP_FLAG) { - (void) printf("\nInterrupts on %s group starting " - "at %s 0x%x reassigned:", str_type, str_type, intr); + if (iset.flags == PCITOOL_INTR_FLAG_SET_MSI) + (void) printf("0x%x,0x%x => 0x%x,0x%x\n", + iset.cpu_id, + (input_args_p->intr_msi) & 0xff, + input_args_p->intr_cpu, iset.msi); + else + (void) printf("0x%x,0x%x => 0x%x,0x%x\n", + iset.cpu_id, + (input_args_p->intr_ino) & 0xff, + input_args_p->intr_cpu, iset.ino); } else { - (void) printf("\nInterrupts on %s 0x%x reassigned:", - str_type, intr); + if (iset.flags == PCITOOL_INTR_FLAG_SET_MSI) + (void) printf("0x%x,0x%x -> 0x%x,0x%x\n", + iset.cpu_id, + (input_args_p->intr_msi) & 0xff, + input_args_p->intr_cpu, iset.msi); + else + (void) printf("0x%x,0x%x -> 0x%x,0x%x\n", + iset.cpu_id, + (input_args_p->intr_ino) & 0xff, + input_args_p->intr_cpu, iset.ino); } - (void) printf(" Old cpu: 0x%x, New cpu: 0x%x\n", iset.cpu_id, - input_args_p->intr_cpu); + } return (rval); diff --git a/usr/src/cmd/pcitool/pcitool_ui.c b/usr/src/cmd/pcitool/pcitool_ui.c index cf15476de6..4a44c528a9 100644 --- a/usr/src/cmd/pcitool/pcitool_ui.c +++ b/usr/src/cmd/pcitool/pcitool_ui.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -86,7 +85,8 @@ static int extract_bdf(char *value, char **bvalue_p, char **dvalue_p, static int parse_device_opts(char *input, uint64_t *flags_arg, uint8_t *bus_arg, uint8_t *device_arg, uint8_t *func_arg, uint8_t *bank_arg); -static int parse_ino_opts(char *input, uint64_t *flags_arg, uint8_t *ino_arg); +static int parse_ino_opts(char *input, uint64_t *flags_arg, + uint32_t *cpu_arg, uint8_t *ino_arg); static int parse_msi_opts(char *input, uint64_t *flags_arg, uint16_t *msi_arg); static int parse_intr_set_opts(char *input, uint64_t *flags_arg, uint32_t *cpu_arg); @@ -259,6 +259,7 @@ get_commandline_args(int argc, char *argv[], pcitool_uiargs_t *parsed_args) /* parse input to get ino value. */ if (parse_ino_opts(optarg, &parsed_args->flags, + &parsed_args->old_cpu, &parsed_args->intr_ino) != SUCCESS) { (void) fprintf(stderr, "%s: Error parsing interrupt options\n", @@ -1286,15 +1287,47 @@ parse_device_opts( * specified options set. Other args return their respective values. */ static int -parse_ino_opts(char *input, uint64_t *flags_arg, uint8_t *ino_arg) +parse_ino_opts(char *input, uint64_t *flags_arg, uint32_t *cpu_arg, + uint8_t *ino_arg) { uint64_t value; + char *charvalue; int rval = SUCCESS; if (strcmp(input, "all") == 0) { *flags_arg |= INO_ALL_FLAG; - } else if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) { - *ino_arg = (uint8_t)value; +#ifdef __x86 + } else if (strstr(input, ",") == NULL) { + (void) fprintf(stderr, + "Interrupt format should be <cpu#,ino#>.\n"); + rval = FAILURE; +#else + } else if (strstr(input, ",") == NULL) { + if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) + *ino_arg = (uint8_t)value; + + if (*ino_arg != value) { + (void) fprintf(stderr, + "ino argument must fit into 8 bits.\n"); + rval = FAILURE; + } else { + *flags_arg |= INO_SPEC_FLAG; + } +#endif + } else if (charvalue = strtok(input, ",")) { + if ((rval = + get_value64(charvalue, &value, HEX_ONLY)) == SUCCESS) { + *cpu_arg = (int)value; + } + + input = strtok(NULL, ","); + if (input == NULL) { + (void) fprintf(stderr, "ino argument is need.\n"); + return (FAILURE); + } + + if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) + *ino_arg = (uint8_t)value; if (*ino_arg != value) { (void) fprintf(stderr, @@ -1328,8 +1361,26 @@ parse_msi_opts(char *input, uint64_t *flags_arg, uint16_t *msi_arg) if (strcmp(input, "all") == 0) { *flags_arg |= MSI_ALL_FLAG; - } else if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) { - *msi_arg = (uint16_t)value; + } else if (strstr(input, ",") == NULL) { + if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) + *msi_arg = (uint16_t)value; + + if (*msi_arg != value) { + (void) fprintf(stderr, + "msi argument must fit into 16 bits.\n"); + rval = FAILURE; + } else { + *flags_arg |= MSI_SPEC_FLAG; + } + } else if (strtok(input, ",")) { + input = strtok(NULL, ","); + if (input == NULL) { + (void) fprintf(stderr, "msi argument is need.\n"); + return (FAILURE); + } + + if ((rval = get_value64(input, &value, HEX_ONLY)) == SUCCESS) + *msi_arg = (uint16_t)value; if (*msi_arg != value) { (void) fprintf(stderr, diff --git a/usr/src/cmd/pcitool/pcitool_ui.h b/usr/src/cmd/pcitool/pcitool_ui.h index 3d5ff1b02c..b916d47b09 100644 --- a/usr/src/cmd/pcitool/pcitool_ui.h +++ b/usr/src/cmd/pcitool/pcitool_ui.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
 */

 #ifndef _PCITOOL_UI_H
 #define	_PCITOOL_UI_H

@@ -93,6 +92,7 @@ typedef struct uiargs {
 	uint32_t offset;
 	uint32_t bytedump_amt;
 	uint32_t intr_cpu;
+	uint32_t old_cpu;
 	uint8_t bus;
 	uint8_t device;
 	uint8_t function;
diff --git a/usr/src/cmd/pcitool/pcitool_usage.c b/usr/src/cmd/pcitool/pcitool_usage.c
index 31d9ac97da..1781d973ce 100644
--- a/usr/src/cmd/pcitool/pcitool_usage.c
+++ b/usr/src/cmd/pcitool/pcitool_usage.c
@@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef _PCITOOL_USAGE_TEXT_H
@@ -148,15 +147,15 @@ NULL
 static char *pcitool_usage_intr[] = {
 "Usage:",
 "Interrupt mode:",
-" %s pci@<unit-address> -i <ino#> | all [ -r [ -c ] | -w <cpu#> [ -g ] ]",
-" [ -v ] [ -q ]",
-" %s pci@<unit-address> -m <msi#> | all [ -r [ -c ] | -w <cpu#> [ -g ] ]",
-" [ -v ] [ -q ]",
+" %s pci@<unit-address> -i <[cpu#],ino#> | all",
+" [ -r [ -c ] | -w <cpu#> [ -g ] ] [ -v ] [ -q ]",
+" %s pci@<unit-address> -m <[cpu#],msi#> | all",
+" [ -r [ -c ] | -w <cpu#> [ -g ] ] [ -v ] [ -q ]",
 "",
 "where",
 "",
 "pci@<unit-address> is a node from /devices, with \"/devices\" stripped off.",
-"For example: /pci@1e,600000",
+"For example: /pci@0,0",
 "",
 "-v gives verbose output for all modes.",
 "",
@@ -171,12 +170,16 @@ static char *pcitool_usage_intr[] = {
 "Interrupt mode",
 "--------------",
 "",
-"-i <ino#> changes or retrieves current CPU for interrupts of given nexus",
-" and given INO. The special value of 'all' can be used to select all INOs.",
+"-i <[cpu#],ino#> changes or retrieves current interrupt information for the",
+" given nexus and given INO. The special value of 'all' can be used to select",
+" all INOs.",
 "",
-"-m <msi#> changes or retrieves current CPU for interrupts of given nexus",
-" and given MSI/X. The special value of 'all' can be used to select all",
-" MSI/Xs.",
+"-m <[cpu#],msi#> changes or retrieves current interrupt information for the",
+" given nexus and given MSI/X. The special value of 'all' can be used to",
+" select all MSI/Xs.",
+"",
+" Note: [cpu#] is supported only on x86 platforms, where it identifies,",
+" together with ino#, exactly one interrupt vector. [cpu#] is not supported",
+" on SPARC platforms.",
 "",
 " Note: On x86 platforms, both INOs and MSI/Xs are mapped to the same",
 " interrupt vectors. Use -i option to retrieve and reroute any interrupt",
diff --git a/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.pm b/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.pm
index 98cd3353b1..b406bd386a 100644
--- a/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.pm
+++ b/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.pm
@@ -18,10 +18,7 @@
 # CDDL HEADER END
 #

-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-#ident	"%Z%%M%	%I%	%E%	SMI"
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 #

 # Intrs.pm provides the bootstrap for the private Sun::Solaris::Intrs module.
# @@ -35,7 +32,7 @@ use DynaLoader; use vars qw($VERSION @ISA @EXPORT_OK); our @ISA = qw(Exporter DynaLoader); -our @EXPORT_OK = qw(intrmove is_pcplusmp); +our @EXPORT_OK = qw(intrmove is_apic); our $VERSION = '0.02'; bootstrap Sun::Solaris::Intrs $VERSION; diff --git a/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.xs b/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.xs index d532daf734..2ba0217f42 100644 --- a/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.xs +++ b/usr/src/cmd/perl/contrib/Sun/Solaris/Intrs/Intrs.xs @@ -54,8 +54,9 @@ MODULE = Sun::Solaris::Intrs PACKAGE = Sun::Solaris::Intrs PROTOTYPES: ENABLE int -intrmove(path, ino, cpu, num_ino) +intrmove(path, oldcpu, ino, cpu, num_ino) char *path + int oldcpu int ino int cpu int num_ino @@ -67,6 +68,7 @@ intrmove(path, ino, cpu, num_ino) if ((fd = open_dev(path)) == -1) { XSRETURN_UNDEF; } + iset.old_cpu = oldcpu; iset.ino = ino; iset.cpu_id = cpu; iset.flags = (num_ino > 1) ? PCITOOL_INTR_FLAG_SET_GROUP : 0; @@ -81,7 +83,7 @@ intrmove(path, ino, cpu, num_ino) XSRETURN_YES; int -is_pcplusmp(path) +is_apic(path) char *path INIT: @@ -101,7 +103,8 @@ is_pcplusmp(path) XSRETURN_UNDEF; } - if (iinfo.ctlr_type == PCITOOL_CTLR_TYPE_PCPLUSMP) { + if (iinfo.ctlr_type == PCITOOL_CTLR_TYPE_PCPLUSMP || + iinfo.ctlr_type == PCITOOL_CTLR_TYPE_APIX) { XSRETURN_YES; } diff --git a/usr/src/pkg/manifests/developer-debug-mdb.mf b/usr/src/pkg/manifests/developer-debug-mdb.mf index 957d1ca292..392991c2a7 100644 --- a/usr/src/pkg/manifests/developer-debug-mdb.mf +++ b/usr/src/pkg/manifests/developer-debug-mdb.mf @@ -163,6 +163,8 @@ $(i386_ONLY)file path=kernel/kmdb/uhci group=sys mode=0555 $(i386_ONLY)file path=kernel/kmdb/usba group=sys mode=0555 $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/$(ARCH64)/pcplusmp group=sys \ mode=0555 +$(i386_ONLY)file path=platform/i86pc/kernel/kmdb/$(ARCH64)/apix group=sys \ + mode=0555 $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/$(ARCH64)/unix group=sys \ mode=0555 $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/$(ARCH64)/uppc group=sys \ @@ -170,6 +172,7 @@ $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/$(ARCH64)/uppc group=sys \ $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/pcplusmp group=sys mode=0555 $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/unix group=sys mode=0555 $(i386_ONLY)file path=platform/i86pc/kernel/kmdb/uppc group=sys mode=0555 +$(i386_ONLY)file path=platform/i86pc/kernel/kmdb/apix group=sys mode=0555 $(i386_ONLY)file path=platform/i86xpv/kernel/kmdb/$(ARCH64)/unix group=sys \ mode=0555 $(i386_ONLY)file path=platform/i86xpv/kernel/kmdb/$(ARCH64)/xpv_psm group=sys \ @@ -305,6 +308,8 @@ file path=usr/lib/mdb/proc/svc.configd.so group=sys mode=0555 file path=usr/lib/mdb/proc/svc.startd.so group=sys mode=0555 $(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/$(ARCH64)/pcplusmp.so \ group=sys mode=0555 +$(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/$(ARCH64)/apix.so \ + group=sys mode=0555 $(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/$(ARCH64)/unix.so \ group=sys mode=0555 $(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/$(ARCH64)/uppc.so \ @@ -315,6 +320,8 @@ $(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/unix.so group=sys \ mode=0555 $(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/uppc.so group=sys \ mode=0555 +$(i386_ONLY)file path=usr/platform/i86pc/lib/mdb/kvm/apix.so group=sys \ + mode=0555 $(i386_ONLY)file path=usr/platform/i86xpv/lib/mdb/kvm/$(ARCH64)/unix.so \ group=sys mode=0555 $(i386_ONLY)file 
path=usr/platform/i86xpv/lib/mdb/kvm/$(ARCH64)/xpv.so \ diff --git a/usr/src/pkg/manifests/system-kernel-platform.mf b/usr/src/pkg/manifests/system-kernel-platform.mf index d33bcdd306..70c62f7126 100644 --- a/usr/src/pkg/manifests/system-kernel-platform.mf +++ b/usr/src/pkg/manifests/system-kernel-platform.mf @@ -862,10 +862,13 @@ $(i386_ONLY)file path=platform/i86pc/kernel/drv/rootnex group=sys $(i386_ONLY)file path=platform/i86pc/kernel/drv/rootnex.conf group=sys $(i386_ONLY)file path=platform/i86pc/kernel/mach/$(ARCH64)/pcplusmp group=sys \ mode=0755 +$(i386_ONLY)file path=platform/i86pc/kernel/mach/$(ARCH64)/apix group=sys \ + mode=0755 $(i386_ONLY)file path=platform/i86pc/kernel/mach/$(ARCH64)/uppc group=sys \ mode=0755 $(i386_ONLY)file path=platform/i86pc/kernel/mach/pcplusmp group=sys mode=0755 $(i386_ONLY)file path=platform/i86pc/kernel/mach/uppc group=sys mode=0755 +$(i386_ONLY)file path=platform/i86pc/kernel/mach/apix group=sys mode=0755 $(i386_ONLY)file path=platform/i86pc/kernel/misc/$(ARCH64)/acpidev group=sys \ mode=0755 $(i386_ONLY)file path=platform/i86pc/kernel/misc/$(ARCH64)/gfx_private \ diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 8c1f50dc70..4b5dec6976 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -852,10 +852,18 @@ update_etc_mach_i386() { etc_mach=$rootprefix/etc/mach test -f $etc_mach || return + grep -w "xpv_psm" $etc_mach > /dev/null 2>&1 if [ $? -ne 0 ] ; then echo 'xpv_psm' >> $etc_mach fi + + grep -w "apix" $etc_mach > /dev/null 2>&1 + if [ $? -ne 0 ] ; then + awk '/^[ ]*xpv_psm[ ]*$/{print "apix"} + {print $0}' $etc_mach > $etc_mach.tmp + mv $etc_mach.tmp $etc_mach + fi } # check and update driver class for scsi-self-identifying diff --git a/usr/src/uts/common/io/avintr.c b/usr/src/uts/common/io/avintr.c index 43b6c1025b..59d35310a3 100644 --- a/usr/src/uts/common/io/avintr.c +++ b/usr/src/uts/common/io/avintr.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -217,6 +216,13 @@ add_nmintr(int lvl, avfunc nmintr, char *name, caddr_t arg) /* * register a hardware interrupt handler. + * + * The autovect data structure only supports globally 256 interrupts. + * In order to support 256 * #LocalAPIC interrupts, a new PSM module + * apix is introduced. It defines PSM private data structures for the + * interrupt handlers. The PSM module initializes addintr to a PSM + * private function so that it could override add_avintr() to operate + * on its private data structures. 
*/ int add_avintr(void *intr_id, int lvl, avfunc xxintr, char *name, int vect, @@ -227,6 +233,11 @@ add_avintr(void *intr_id, int lvl, avfunc xxintr, char *name, int vect, int s, vectindex; /* save old spl value */ ushort_t hi_pri; + if (addintr) { + return ((*addintr)(intr_id, lvl, xxintr, name, vect, + arg1, arg2, ticksp, dip)); + } + if ((f = xxintr) == NULL) { printf("Attempt to add null vect for %s on vector %d\n", name, vect); @@ -327,7 +338,10 @@ add_avsoftintr(void *intr_id, int lvl, avfunc xxintr, char *name, return (1); } -/* insert an interrupt vector into chain */ +/* + * insert an interrupt vector into chain by its priority from high + * to low + */ static void insert_av(void *intr_id, struct av_head *vectp, avfunc f, caddr_t arg1, caddr_t arg2, uint64_t *ticksp, int pri_level, dev_info_t *dip) @@ -335,7 +349,7 @@ insert_av(void *intr_id, struct av_head *vectp, avfunc f, caddr_t arg1, /* * Protect rewrites of the list */ - struct autovec *p, *mem; + struct autovec *p, *prep, *mem; mem = kmem_zalloc(sizeof (struct autovec), KM_SLEEP); mem->av_vector = f; @@ -358,8 +372,15 @@ insert_av(void *intr_id, struct av_head *vectp, avfunc f, caddr_t arg1, } /* find where it goes in list */ + prep = NULL; for (p = vectp->avh_link; p != NULL; p = p->av_link) { - if (p->av_vector == NULL) { /* freed struct available */ + if (p->av_vector && p->av_prilevel <= pri_level) + break; + prep = p; + } + if (prep != NULL) { + if (prep->av_vector == NULL) { /* freed struct available */ + p = prep; p->av_intarg1 = arg1; p->av_intarg2 = arg2; p->av_ticksp = ticksp; @@ -381,10 +402,14 @@ insert_av(void *intr_id, struct av_head *vectp, avfunc f, caddr_t arg1, kmem_free(mem, sizeof (struct autovec)); return; } + + mem->av_link = prep->av_link; + prep->av_link = mem; + } else { + /* insert new intpt at beginning of chain */ + mem->av_link = vectp->avh_link; + vectp->avh_link = mem; } - /* insert new intpt at beginning of chain */ - mem->av_link = vectp->avh_link; - vectp->avh_link = mem; if (pri_level > (int)vectp->avh_hi_pri) { vectp->avh_hi_pri = (ushort_t)pri_level; } @@ -450,6 +475,13 @@ rem_avsoftintr(void *intr_id, int lvl, avfunc xxintr) return (av_rem_softintr(intr_id, lvl, xxintr, B_TRUE)); } +/* + * Remove specified interrupt handler. + * + * PSM module could initialize remintr to some PSM private function + * so that it could override rem_avintr() to operate on its private + * data structures. + */ void rem_avintr(void *intr_id, int lvl, avfunc xxintr, int vect) { @@ -457,6 +489,11 @@ rem_avintr(void *intr_id, int lvl, avfunc xxintr, int vect) avfunc f; int s, vectindex; /* save old spl value */ + if (remintr) { + (*remintr)(intr_id, lvl, xxintr, vect); + return; + } + if ((f = xxintr) == NULL) return; @@ -476,7 +513,7 @@ rem_avintr(void *intr_id, int lvl, avfunc xxintr, int vect) * seen each cpu not executing an interrupt at that level--so we know our * change has taken effect completely (no old state in registers, etc). */ -static void +void wait_till_seen(int ipl) { int cpu_in_chain, cix; diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 3d9d2f9b39..3d1c34b5a0 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -582,11 +581,12 @@ typedef struct mac_dladm_intr { /* Bind the interrupt to cpu_num */ static int -mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino) +mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) { pcitool_intr_set_t iset; int err; + iset.old_cpu = oldcpuid; iset.ino = ino; iset.cpu_id = cpu_num; iset.user_version = PCITOOL_VERSION; @@ -625,7 +625,8 @@ mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) * device and where it is bound etc. */ static boolean_t -mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln) +mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, + mac_dladm_intr_t *dln) { pcitool_intr_get_t *iget_p; int ipsz; @@ -643,6 +644,7 @@ mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln) iget_p->num_devs_ret = 0; iget_p->user_version = PCITOOL_VERSION; + iget_p->cpu_id = oldcpuid; iget_p->ino = ino; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, @@ -665,6 +667,7 @@ mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln) iget_p = kmem_zalloc(ipsz, KM_SLEEP); iget_p->num_devs_ret = inum; + iget_p->cpu_id = oldcpuid; iget_p->ino = ino; iget_p->user_version = PCITOOL_VERSION; err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, @@ -697,17 +700,20 @@ mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) pcitool_intr_info_t intr_info; int err; int ino; + int oldcpuid; err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, FKIOCTL, kcred, NULL); if (err != 0) return (-1); - for (ino = 0; ino < intr_info.num_intr; ino++) { - if (mac_get_single_intr(lh, ino, dln)) { - if (dln->cpu_id == cpuid) - return (0); - return (1); + for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { + for (ino = 0; ino < intr_info.num_intr; ino++) { + if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { + if (dln->cpu_id == cpuid) + return (0); + return (1); + } } } return (-1); @@ -804,7 +810,8 @@ mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) } /* cmn_note? */ if (ret != 0) - if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) { + if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) + != 0) { (void) ldi_close(lh, FREAD|FWRITE, kcred); return (B_FALSE); } diff --git a/usr/src/uts/common/io/pci-ide/pci-ide.c b/usr/src/uts/common/io/pci-ide/pci-ide.c index 87f228c170..e25a5ffc1b 100644 --- a/usr/src/uts/common/io/pci-ide/pci-ide.c +++ b/usr/src/uts/common/io/pci-ide/pci-ide.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ @@ -45,6 +44,7 @@ #include <sys/pci.h> #include <sys/promif.h> #include <sys/pci_intr_lib.h> +#include <sys/apic.h> int pciide_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); int pciide_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); @@ -110,7 +110,13 @@ static void pciide_compat_setup(dev_info_t *mydip, dev_info_t *cdip, static int pciide_pre26_rnumber_map(dev_info_t *mydip, int rnumber); static int pciide_map_rnumber(int canonical_rnumber, int pri_native, int sec_native); +static int pciide_alloc_intr(dev_info_t *, dev_info_t *, + ddi_intr_handle_impl_t *, void *); +static int pciide_free_intr(dev_info_t *, dev_info_t *, + ddi_intr_handle_impl_t *); +extern int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, + psm_intr_op_t, int *); /* * Config information @@ -716,13 +722,9 @@ pciide_intr_ops(dev_info_t *dip, dev_info_t *rdip, ddi_intr_op_t intr_op, i_ddi_get_intx_nintrs(rdip) : 1; break; case DDI_INTROP_ALLOC: - if ((ispecp = pciide_get_ispec(dip, rdip, hdlp->ih_inum)) == - NULL) - return (DDI_FAILURE); - *(int *)result = hdlp->ih_scratch1; - break; + return (pciide_alloc_intr(dip, rdip, hdlp, result)); case DDI_INTROP_FREE: - break; + return (pciide_free_intr(dip, rdip, hdlp)); case DDI_INTROP_GETPRI: if (pciide_get_pri(dip, rdip, hdlp, &pri) != DDI_SUCCESS) { *(int *)result = 0; @@ -770,6 +772,100 @@ pciide_intr_ops(dev_info_t *dip, dev_info_t *rdip, ddi_intr_op_t intr_op, return (DDI_SUCCESS); } +int +pciide_alloc_intr(dev_info_t *dip, dev_info_t *rdip, + ddi_intr_handle_impl_t *hdlp, void *result) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + int free_phdl = 0; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + if ((ispec = pciide_get_ispec(dip, rdip, hdlp->ih_inum)) == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to allocate the vector now. + */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if (hdlp->ih_private == NULL) { /* allocate phdl structure */ + free_phdl = 1; + i_ddi_alloc_intr_phdl(hdlp); + } + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + if (PCIIDE_NATIVE_MODE(rdip)) { + rdip = dip; + dip = ddi_get_parent(dip); + } else { /* get ptr to the root node */ + dip = ddi_root_node(); + } + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_ALLOC_VECTORS, result); + if (free_phdl) { /* free up the phdl structure */ + free_phdl = 0; + i_ddi_free_intr_phdl(hdlp); + } + } else { + /* + * No APIX module; fall back to the old scheme where the + * interrupt vector is allocated during ddi_enable_intr() call. + */ + *(int *)result = hdlp->ih_scratch1; + ret = DDI_SUCCESS; + } + + return (ret); +} + +int +pciide_free_intr(dev_info_t *dip, dev_info_t *rdip, + ddi_intr_handle_impl_t *hdlp) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to free up the vector now. 
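+	 *
+	 * For context, this is reached from a leaf driver's ordinary
+	 * interrupt teardown sequence (a sketch of the standard DDI
+	 * calls; the last one lands in DDI_INTROP_FREE above):
+	 *
+	 *	(void) ddi_intr_disable(hdl);
+	 *	(void) ddi_intr_remove_handler(hdl);
+	 *	(void) ddi_intr_free(hdl);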
+	 */
+	bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t));
+	info_hdl.ih_private = &type_info;
+	if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) ==
+	    PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) {
+		if ((ispec = pciide_get_ispec(dip, rdip, hdlp->ih_inum)) ==
+		    NULL)
+			return (DDI_FAILURE);
+		((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec;
+		if (PCIIDE_NATIVE_MODE(rdip)) {
+			rdip = dip;
+			dip = ddi_get_parent(dip);
+		} else {	/* get ptr to the root node */
+			dip = ddi_root_node();
+		}
+		return ((*psm_intr_ops)(rdip, hdlp,
+		    PSM_INTR_OP_FREE_VECTORS, NULL));
+	}
+
+	/*
+	 * No APIX module; fall back to the old scheme, where the
+	 * interrupt vector was already freed during the
+	 * ddi_intr_disable() call.
+	 */
+	return (DDI_SUCCESS);
+}
+
 /*
  * This is one of the places where controller specific setup needs to be
  * considered.
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index 99cf843d35..c2bf912c72 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -34,6 +34,13 @@
 #include <sys/sunndi.h>
 #include <sys/ndi_impldefs.h>	/* include prototypes */
 
+#if defined(__i386) || defined(__amd64)
+/*
+ * MSI-X allocation limit.
+ */
+extern uint_t ddi_msix_alloc_limit;
+#endif
+
 /*
  * Interrupt Resource Management (IRM).
  */
@@ -198,6 +205,90 @@ ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp,
 }
 
 /*
+ * ndi_irm_resize_pool()
+ *
+ * Nexus interface to resize an IRM pool.  If the new size drops
+ * below the number of already-reserved vectors, a rebalance
+ * operation is initiated before the pool is resized; if that
+ * rebalance fails, NDI_FAILURE is returned.
+ */
+int
+ndi_irm_resize_pool(ddi_irm_pool_t *pool_p, uint_t new_size)
+{
+	uint_t prev_size;
+
+	ASSERT(pool_p != NULL);
+
+	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
+	    " current-size 0x%x new-size 0x%x\n",
+	    (void *)pool_p, pool_p->ipool_totsz, new_size));
+
+	if (pool_p == NULL)
+		return (NDI_EINVAL);
+
+	/* Check if IRM is enabled */
+	if (!irm_enable)
+		return (NDI_FAILURE);
+
+	mutex_enter(&pool_p->ipool_lock);
+
+	/*
+	 * If we are increasing the pool size, or if the reserved
+	 * number of vectors is <= the new pool size, then simply
+	 * update the pool size and enqueue a rebalance operation
+	 * if necessary to use the new vectors.
+	 */
+	if ((pool_p->ipool_totsz < new_size) ||
+	    (pool_p->ipool_resno <= new_size)) {
+		/* set new pool size */
+		pool_p->ipool_totsz = new_size;
+		/* adjust the default allocation limit */
+		pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC,
+		    MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER));
+		/* queue a rebalance operation to use the new vectors */
+		if (pool_p->ipool_reqno > pool_p->ipool_resno)
+			i_ddi_irm_enqueue(pool_p, B_FALSE);
+		mutex_exit(&pool_p->ipool_lock);
+		return (NDI_SUCCESS);
+	}
+
+	DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p"
+	    " needs a rebalance operation\n", (void *)pool_p));
+
+	/*
+	 * A rebalance operation is required.
+	 */
+	/* save the current pool size */
+	prev_size = pool_p->ipool_totsz;
+	/* set the pool size to the desired new value */
+	pool_p->ipool_totsz = new_size;
+	/* perform the rebalance operation */
+	i_ddi_irm_enqueue(pool_p, B_TRUE);
+
+	/*
+	 * If the rebalance operation couldn't free up enough
+	 * vectors, fail the resize operation.
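+	 *
+	 * From a caller's point of view (illustrative), failure
+	 * means the pool is left at its original size:
+	 *
+	 *	if (ndi_irm_resize_pool(pool_p, new_size) != NDI_SUCCESS)
+	 *		return (DDI_FAILURE);	-- nothing was shrunk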
+ */ + if (pool_p->ipool_resno > new_size) { /* rebalance failed */ + /* restore the pool size to the previous value */ + pool_p->ipool_totsz = prev_size; + /* enqueue a rebalance operation for the original pool size */ + i_ddi_irm_enqueue(pool_p, B_FALSE); + mutex_exit(&pool_p->ipool_lock); + return (NDI_FAILURE); + } else { /* rebalance worked */ + /* adjust the default allocation limit */ + pool_p->ipool_defsz = MIN(DDI_MAX_MSIX_ALLOC, + MAX(DDI_MIN_MSIX_ALLOC, new_size / DDI_MSIX_ALLOC_DIVIDER)); + mutex_exit(&pool_p->ipool_lock); + DDI_INTR_IRMDBG((CE_CONT, "ndi_irm_resize_pool: pool_p %p" + " resized from %x to %x\n", + (void *)pool_p, prev_size, pool_p->ipool_totsz)); + return (NDI_SUCCESS); + } +} + +/* * ndi_irm_destroy() * * Nexus interface to destroy an IRM pool. Destroy the pool @@ -676,6 +767,12 @@ i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag) /* Determine new request size */ nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz); +#if defined(__i386) || defined(__amd64) + /* Use the default static limit for non-IRM drivers */ + if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) + nreq = MIN(nreq, ddi_msix_alloc_limit); +#endif + /* Update pool statistics */ pool_p->ipool_reqno -= req_p->ireq_nreq; pool_p->ipool_reqno += nreq; diff --git a/usr/src/uts/common/sys/avintr.h b/usr/src/uts/common/sys/avintr.h index 28b95fb348..1dc3825f6b 100644 --- a/usr/src/uts/common/sys/avintr.h +++ b/usr/src/uts/common/sys/avintr.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_AVINTR_H @@ -69,8 +68,15 @@ struct autovec { */ void *av_intr_id; dev_info_t *av_dip; + ushort_t av_flags; /* pending flags */ + struct autovec *av_ipl_link; /* pointer to next on ipl chain */ }; +#define AV_PENTRY_VECTMASK 0xff /* low 8 bit used for irqno */ +#define AV_PENTRY_PEND 0x100 /* pending hardware interrupt */ +#define AV_PENTRY_ONPROC 0x200 /* being serviced by CPU */ +#define AV_PENTRY_LEVEL 0x8000 /* level-triggered interrupt */ + struct av_head { struct autovec *avh_link; ushort_t avh_hi_pri; @@ -96,6 +102,7 @@ extern int rem_avsoftintr(void *intr_id, int lvl, avfunc xxintr); extern int av_softint_movepri(void *intr_id, int old_lvl); extern void update_avsoftintr_args(void *intr_id, int lvl, caddr_t arg2); extern void rem_avintr(void *intr_id, int lvl, avfunc xxintr, int vect); +extern void wait_till_seen(int ipl); extern uint_t softlevel1(caddr_t, caddr_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/ddi_intr_impl.h b/usr/src/uts/common/sys/ddi_intr_impl.h index 777dc928d2..cb5617dc8f 100644 --- a/usr/src/uts/common/sys/ddi_intr_impl.h +++ b/usr/src/uts/common/sys/ddi_intr_impl.h @@ -359,6 +359,8 @@ int32_t i_ddi_set_intr_weight(dev_info_t *, int32_t); void i_ddi_alloc_intr_phdl(ddi_intr_handle_impl_t *); void i_ddi_free_intr_phdl(ddi_intr_handle_impl_t *); +extern int irm_enable; /* global flag for IRM */ + #define DDI_INTR_ASSIGN_HDLR_N_ARGS(hdlp, func, arg1, arg2) \ hdlp->ih_cb_func = func; \ hdlp->ih_cb_arg1 = arg1; \ diff --git a/usr/src/uts/common/sys/pci_tools.h b/usr/src/uts/common/sys/pci_tools.h index 2aceaa3d3d..919b1e75ae 100644 --- a/usr/src/uts/common/sys/pci_tools.h +++ b/usr/src/uts/common/sys/pci_tools.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_PCI_TOOLS_H @@ -128,6 +127,7 @@ typedef struct pcitool_intr_set { uint32_t ino; /* interrupt to set - to kernel */ uint32_t msi; /* Specific MSI to set - to kernel */ uint32_t cpu_id; /* to: cpu to set / from: old cpu returned */ + uint32_t old_cpu; /* to/from kernel: old cpu id */ uint32_t flags; /* to kernel */ pcitool_errno_t status; /* from kernel */ } pcitool_intr_set_t; @@ -182,6 +182,7 @@ typedef struct pcitool_intr_info { uint16_t drvr_version; /* Driver version - from kernel */ uint32_t flags; /* to kernel */ uint32_t num_intr; /* Number of intrs suppt by nexus */ + uint32_t num_cpu; uint32_t ctlr_version; /* Intr ctlr HW version - from kernel */ uchar_t ctlr_type; /* A PCITOOL_CTLR_TYPE - from kernel */ } pcitool_intr_info_t; @@ -193,6 +194,7 @@ typedef struct pcitool_intr_info { #define PCITOOL_CTLR_TYPE_RISC 1 #define PCITOOL_CTLR_TYPE_UPPC 2 #define PCITOOL_CTLR_TYPE_PCPLUSMP 3 +#define PCITOOL_CTLR_TYPE_APIX 4 /* * Size and endian fields for acc_attr bitmask. diff --git a/usr/src/uts/common/sys/sunndi.h b/usr/src/uts/common/sys/sunndi.h index f17ab18a7c..6b28ba698f 100644 --- a/usr/src/uts/common/sys/sunndi.h +++ b/usr/src/uts/common/sys/sunndi.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SUNNDI_H @@ -280,7 +279,7 @@ boolean_t ndi_port_type(dev_info_t *dip, boolean_t up, uint32_t port_type); /* - * Create/Destroy Interrupt Resource Management (IRM) Pools. + * Interrupt Resource Management (IRM) Pools. */ int ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp, @@ -289,6 +288,9 @@ ndi_irm_create(dev_info_t *dip, ddi_irm_params_t *paramsp, int ndi_irm_destroy(ddi_irm_pool_t *poolp); +int +ndi_irm_resize_pool(ddi_irm_pool_t *poolp, uint_t newsize); + /* * Take a device node "Offline". * diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 8156574ad1..2758749056 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -187,7 +187,11 @@ PCI_E_NEXUS_OBJS += npe.o npe_misc.o PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o PCINEXUS_OBJS += pci.o pci_common.o pci_kstats.o pci_tools.o PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \ - mp_platform_common.o hpet_acpi.o + mp_platform_common.o mp_platform_misc.o \ + hpet_acpi.o apic_common.o +APIX_OBJS += apix.o apic_regops.o psm_common.o apix_intr.o apix_utils.o \ + apix_irm.o mp_platform_common.o hpet_acpi.o apic_common.o + ACPI_DRV_OBJS += acpi_drv.o acpi_video.o ACPINEX_OBJS += acpinex_drv.o acpinex_event.o diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared index a8fbb83fdf..f41e91a4fc 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc.shared +++ b/usr/src/uts/i86pc/Makefile.i86pc.shared @@ -22,8 +22,8 @@ # # uts/i86pc/Makefile.i86pc # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# # # This makefile contains the common definitions for the i86pc unix # and all i86pc implementation architecture dependent modules. 
@@ -250,6 +250,7 @@ MACH_NOT_YET_KMODS	= $(AUTOCONF_OBJS)
 DRV_KMODS	+= rootnex
 DRV_KMODS	+= isa
 DRV_KMODS	+= pcplusmp
+DRV_KMODS	+= apix
 DRV_KMODS	+= cpc
 DRV_KMODS	+= pci
 DRV_KMODS	+= npe
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index 592250db83..dfff27de9f 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -20,8 +20,7 @@
 #
 
 #
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 #
 
 #	This Makefile defines the build rules for the directory uts/i86pc
@@ -138,6 +137,10 @@ $(OBJS_DIR)/%.o:	$(UTSBASE)/i86pc/io/pcplusmp/%.c
 $(OBJS_DIR)/%.o:	$(UTSBASE)/i86pc/io/pcplusmp/%.s
 	$(COMPILE.s) -o $@ $<
 
+$(OBJS_DIR)/%.o:	$(UTSBASE)/i86pc/io/apix/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:	$(UTSBASE)/i86pc/io/ppm/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -366,6 +369,12 @@ $(LINTS_DIR)/%.ln:	$(UTSBASE)/i86pc/io/pcplusmp/%.c
 $(LINTS_DIR)/%.ln:	$(UTSBASE)/i86pc/io/pcplusmp/%.s
 	@($(LHEAD) $(LINT.s) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:	$(UTSBASE)/i86pc/io/apix/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:	$(UTSBASE)/i86pc/io/apix/%.s
+	@($(LHEAD) $(LINT.s) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:	$(UTSBASE)/i86pc/io/ppm/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
diff --git a/usr/src/uts/i86pc/apix/Makefile b/usr/src/uts/i86pc/apix/Makefile
new file mode 100644
index 0000000000..5e67b3f03a
--- /dev/null
+++ b/usr/src/uts/i86pc/apix/Makefile
@@ -0,0 +1,90 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/i86pc/apix/Makefile
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+# This makefile drives the production of the apix "mach"
+# kernel module.
+#
+# apix implementation architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE		= apix
+OBJECTS		= $(APIX_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(APIX_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MACH_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/i86pc/Makefile.i86pc
+
+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+DEBUG_FLGS	=
+$(NOT_RELEASE_BUILD)DEBUG_DEFS	+= $(DEBUG_FLGS)
+
+#
+# Depends on ACPI CA interpreter
+#
+LDFLAGS	+= -dy -N misc/acpica
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+# Include common targets.
+# +include $(UTSBASE)/i86pc/Makefile.targ + diff --git a/usr/src/uts/i86pc/io/apix/apix.c b/usr/src/uts/i86pc/io/apix/apix.c new file mode 100644 index 0000000000..f2fdc19282 --- /dev/null +++ b/usr/src/uts/i86pc/io/apix/apix.c @@ -0,0 +1,2564 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * PSMI 1.1 extensions are supported only in 2.6 and later versions. + * PSMI 1.2 extensions are supported only in 2.7 and later versions. + * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. + * PSMI 1.5 extensions are supported in Solaris Nevada. + * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. + */ +#define PSMI_1_7 + +#include <sys/processor.h> +#include <sys/time.h> +#include <sys/psm.h> +#include <sys/smp_impldefs.h> +#include <sys/cram.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/psm_common.h> +#include <sys/pit.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/pci.h> +#include <sys/promif.h> +#include <sys/x86_archext.h> +#include <sys/cpc_impl.h> +#include <sys/uadmin.h> +#include <sys/panic.h> +#include <sys/debug.h> +#include <sys/archsystm.h> +#include <sys/trap.h> +#include <sys/machsystm.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/rm_platter.h> +#include <sys/privregs.h> +#include <sys/note.h> +#include <sys/pci_intr_lib.h> +#include <sys/spl.h> +#include <sys/clock.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/x_call.h> +#include <sys/reboot.h> +#include <sys/mach_intr.h> +#include <sys/apix.h> +#include <sys/apix_irm_impl.h> + +static int apix_probe(); +static void apix_init(); +static void apix_picinit(void); +static int apix_intr_enter(int, int *); +static void apix_intr_exit(int, int); +static void apix_setspl(int); +static int apix_disable_intr(processorid_t); +static void apix_enable_intr(processorid_t); +static int apix_get_clkvect(int); +static int apix_get_ipivect(int, int); +static void apix_post_cyclic_setup(void *); +static int apix_post_cpu_start(); +static int apix_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *, + psm_intr_op_t, int *); + +/* + * Helper functions for apix_intr_ops() + */ +static void apix_redistribute_compute(void); +static int apix_get_pending(apix_vector_t *); +static apix_vector_t *apix_get_req_vector(ddi_intr_handle_impl_t *, ushort_t); +static int apix_get_intr_info(ddi_intr_handle_impl_t *, apic_get_intr_t *); +static char *apix_get_apic_type(void); +static int apix_intx_get_pending(int); +static void 
apix_intx_set_mask(int irqno); +static void apix_intx_clear_mask(int irqno); +static int apix_intx_get_shared(int irqno); +static void apix_intx_set_shared(int irqno, int delta); +static apix_vector_t *apix_intx_xlate_vector(dev_info_t *, int, + struct intrspec *); +static int apix_intx_alloc_vector(dev_info_t *, int, struct intrspec *); + +extern int apic_clkinit(int); + +/* IRM initialization for APIX PSM module */ +extern void apix_irm_init(void); + +extern int irm_enable; + +/* + * Local static data + */ +static struct psm_ops apix_ops = { + apix_probe, + + apix_init, + apix_picinit, + apix_intr_enter, + apix_intr_exit, + apix_setspl, + apix_addspl, + apix_delspl, + apix_disable_intr, + apix_enable_intr, + NULL, /* psm_softlvl_to_irq */ + NULL, /* psm_set_softintr */ + + apic_set_idlecpu, + apic_unset_idlecpu, + + apic_clkinit, + apix_get_clkvect, + NULL, /* psm_hrtimeinit */ + apic_gethrtime, + + apic_get_next_processorid, + apic_cpu_start, + apix_post_cpu_start, + apic_shutdown, + apix_get_ipivect, + apic_send_ipi, + + NULL, /* psm_translate_irq */ + NULL, /* psm_notify_error */ + NULL, /* psm_notify_func */ + apic_timer_reprogram, + apic_timer_enable, + apic_timer_disable, + apix_post_cyclic_setup, + apic_preshutdown, + apix_intr_ops, /* Advanced DDI Interrupt framework */ + apic_state, /* save, restore apic state for S3 */ + apic_cpu_ops, /* CPU control interface. */ +}; + +struct psm_ops *psmops = &apix_ops; + +static struct psm_info apix_psm_info = { + PSM_INFO_VER01_7, /* version */ + PSM_OWN_EXCLUSIVE, /* ownership */ + &apix_ops, /* operation */ + APIX_NAME, /* machine name */ + "apix MPv1.4 compatible", +}; + +static void *apix_hdlp; + +static int apix_is_enabled = 0; + +/* + * Flag to indicate if APIX is to be enabled only for platforms + * with specific hw feature(s). + */ +int apix_hw_chk_enable = 1; + +/* + * Hw features that are checked for enabling APIX support. + */ +#define APIX_SUPPORT_X2APIC 0x00000001 +uint_t apix_supported_hw = APIX_SUPPORT_X2APIC; + +/* + * apix_lock is used for cpu selection and vector re-binding + */ +lock_t apix_lock; +apix_impl_t *apixs[NCPU]; +/* + * Mapping between device interrupt and the allocated vector. Indexed + * by major number. + */ +apix_dev_vector_t **apix_dev_vector; +/* + * Mapping between device major number and cpu id. It gets used + * when interrupt binding policy round robin with affinity is + * applied. With that policy, devices with the same major number + * will be bound to the same CPU. + */ +processorid_t *apix_major_to_cpu; /* major to cpu mapping */ +kmutex_t apix_mutex; /* for apix_dev_vector & apix_major_to_cpu */ + +int apix_nipis = 16; /* Maximum number of IPIs */ +/* + * Maximum number of vectors in a CPU that can be used for interrupt + * allocation (including IPIs and the reserved vectors). 
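+ *
+ * Back-of-the-envelope example (illustrative only): with up to
+ * apix_cpu_nvectors usable vector slots per local APIC and N online
+ * CPUs, the vector space scales as
+ *
+ *	apix_cpu_nvectors * N
+ *
+ * entries, versus the single 256-entry autovect table of the
+ * pcplusmp model (see the avintr.c comment in this change).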
+ */ +int apix_cpu_nvectors = APIX_NVECTOR; + +/* gcpu.h */ + +extern void apic_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp); +extern void apic_change_eoi(); + +/* + * This is the loadable module wrapper + */ + +int +_init(void) +{ + if (apic_coarse_hrtime) + apix_ops.psm_gethrtime = &apic_gettime; + return (psm_mod_init(&apix_hdlp, &apix_psm_info)); +} + +int +_fini(void) +{ + return (psm_mod_fini(&apix_hdlp, &apix_psm_info)); +} + +int +_info(struct modinfo *modinfop) +{ + return (psm_mod_info(&apix_hdlp, &apix_psm_info, modinfop)); +} + +static int +apix_probe() +{ + int rval; + + if (apix_enable == 0) + return (PSM_FAILURE); + + /* check for hw features if specified */ + if (apix_hw_chk_enable) { + /* check if x2APIC mode is supported */ + if ((apix_supported_hw & APIX_SUPPORT_X2APIC) == + APIX_SUPPORT_X2APIC) { + if (!((apic_local_mode() == LOCAL_X2APIC) || + apic_detect_x2apic())) { + /* x2APIC mode is not supported in the hw */ + apix_enable = 0; + } + } + if (apix_enable == 0) + return (PSM_FAILURE); + } + + rval = apic_probe_common(apix_psm_info.p_mach_idstring); + if (rval == PSM_SUCCESS) + apix_is_enabled = 1; + else + apix_is_enabled = 0; + return (rval); +} + +/* + * Initialize the data structures needed by pcplusmpx module. + * Specifically, the data structures used by addspl() and delspl() + * routines. + */ +static void +apix_softinit() +{ + int i, *iptr; + apix_impl_t *hdlp; + int nproc; + + nproc = max(apic_nproc, apic_max_nproc); + + hdlp = kmem_zalloc(nproc * sizeof (apix_impl_t), KM_SLEEP); + for (i = 0; i < nproc; i++) { + apixs[i] = &hdlp[i]; + apixs[i]->x_cpuid = i; + LOCK_INIT_CLEAR(&apixs[i]->x_lock); + } + + /* cpu 0 is always up (for now) */ + apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; + + iptr = (int *)&apic_irq_table[0]; + for (i = 0; i <= APIC_MAX_VECTOR; i++) { + apic_level_intr[i] = 0; + *iptr++ = NULL; + } + mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); + + apix_dev_vector = kmem_zalloc(sizeof (apix_dev_vector_t *) * devcnt, + KM_SLEEP); + + if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { + apix_major_to_cpu = kmem_zalloc(sizeof (int) * devcnt, + KM_SLEEP); + for (i = 0; i < devcnt; i++) + apix_major_to_cpu[i] = IRQ_UNINIT; + } + + mutex_init(&apix_mutex, NULL, MUTEX_DEFAULT, NULL); +} + +static int +apix_get_pending_spl(void) +{ + int cpuid = CPU->cpu_id; + + return (bsrw_insn(apixs[cpuid]->x_intr_pending)); +} + +static uintptr_t +apix_get_intr_handler(int cpu, short vec) +{ + apix_vector_t *apix_vector; + + ASSERT(cpu < apic_nproc && vec < APIX_NVECTOR); + if (cpu >= apic_nproc) + return (NULL); + + apix_vector = apixs[cpu]->x_vectbl[vec]; + + return ((uintptr_t)(apix_vector->v_autovect)); +} + +#if defined(__amd64) +static unsigned char dummy_cpu_pri[MAXIPL + 1] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +#endif + +static void +apix_init() +{ + extern void (*do_interrupt_common)(struct regs *, trap_trace_rec_t *); + + APIC_VERBOSE(INIT, (CE_CONT, "apix: psm_softinit\n")); + + do_interrupt_common = apix_do_interrupt; + addintr = apix_add_avintr; + remintr = apix_rem_avintr; + get_pending_spl = apix_get_pending_spl; + get_intr_handler = apix_get_intr_handler; + psm_get_localapicid = apic_get_localapicid; + psm_get_ioapicid = apic_get_ioapicid; + + apix_softinit(); +#if defined(__amd64) + /* + * Make cpu-specific interrupt info point to cr8pri vector + */ + CPU->cpu_pri_data = dummy_cpu_pri; +#else + if (cpuid_have_cr8access(CPU)) + apic_have_32bit_cr8 = 1; +#endif /* __amd64 */ + + /* + * 
Initialize IRM pool parameters + */ + if (irm_enable) { + int i; + int lowest_irq; + int highest_irq; + + /* number of CPUs present */ + apix_irminfo.apix_ncpus = apic_nproc; + /* total number of entries in all of the IOAPICs present */ + lowest_irq = apic_io_vectbase[0]; + highest_irq = apic_io_vectend[0]; + for (i = 1; i < apic_io_max; i++) { + if (apic_io_vectbase[i] < lowest_irq) + lowest_irq = apic_io_vectbase[i]; + if (apic_io_vectend[i] > highest_irq) + highest_irq = apic_io_vectend[i]; + } + apix_irminfo.apix_ioapic_max_vectors = + highest_irq - lowest_irq + 1; + /* + * Number of available per-CPU vectors excluding + * reserved vectors for Dtrace, int80, system-call, + * fast-trap, etc. + */ + apix_irminfo.apix_per_cpu_vectors = APIX_NAVINTR - + APIX_SW_RESERVED_VECTORS; + + /* Number of vectors (pre) allocated (SCI and HPET) */ + apix_irminfo.apix_vectors_allocated = 0; + if (apic_hpet_vect != -1) + apix_irminfo.apix_vectors_allocated++; + if (apic_sci_vect != -1) + apix_irminfo.apix_vectors_allocated++; + } +} + +static void +apix_init_intr() +{ + processorid_t cpun = psm_get_cpu_id(); + uint_t nlvt; + uint32_t svr = AV_UNIT_ENABLE | APIC_SPUR_INTR; + extern void cmi_cmci_trap(void); + + apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL); + + if (apic_mode == LOCAL_APIC) { + /* + * We are running APIC in MMIO mode. + */ + if (apic_flat_model) { + apic_reg_ops->apic_write(APIC_FORMAT_REG, + APIC_FLAT_MODEL); + } else { + apic_reg_ops->apic_write(APIC_FORMAT_REG, + APIC_CLUSTER_MODEL); + } + + apic_reg_ops->apic_write(APIC_DEST_REG, + AV_HIGH_ORDER >> cpun); + } + + if (apic_directed_EOI_supported()) { + /* + * Setting the 12th bit in the Spurious Interrupt Vector + * Register suppresses broadcast EOIs generated by the local + * APIC. The suppression of broadcast EOIs happens only when + * interrupts are level-triggered. 
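+	 *
+	 * (Directed-EOI capability is advertised in bit 24 of the
+	 * local APIC version register; a hedged sketch of the kind
+	 * of test apic_directed_EOI_supported() performs:
+	 *
+	 *	ver = apic_reg_ops->apic_read(APIC_VERS_REG);
+	 *	supported = ((ver >> 24) & 1) != 0;
+	 *
+	 * the authoritative check lives in the shared APIC code.)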
+ */ + svr |= APIC_SVR_SUPPRESS_BROADCAST_EOI; + } + + /* need to enable APIC before unmasking NMI */ + apic_reg_ops->apic_write(APIC_SPUR_INT_REG, svr); + + /* + * Presence of an invalid vector with delivery mode AV_FIXED can + * cause an error interrupt, even if the entry is masked...so + * write a valid vector to LVT entries along with the mask bit + */ + + /* All APICs have timer and LINT0/1 */ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK|APIC_RESV_IRQ); + apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK|APIC_RESV_IRQ); + apic_reg_ops->apic_write(APIC_INT_VECT1, AV_NMI); /* enable NMI */ + + /* + * On integrated APICs, the number of LVT entries is + * 'Max LVT entry' + 1; on 82489DX's (non-integrated + * APICs), nlvt is "3" (LINT0, LINT1, and timer) + */ + + if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) { + nlvt = 3; + } else { + nlvt = ((apic_reg_ops->apic_read(APIC_VERS_REG) >> 16) & + 0xFF) + 1; + } + + if (nlvt >= 5) { + /* Enable performance counter overflow interrupt */ + + if ((x86_feature & X86_MSR) != X86_MSR) + apic_enable_cpcovf_intr = 0; + if (apic_enable_cpcovf_intr) { + if (apic_cpcovf_vect == 0) { + int ipl = APIC_PCINT_IPL; + + apic_cpcovf_vect = apix_get_ipivect(ipl, -1); + ASSERT(apic_cpcovf_vect); + + (void) add_avintr(NULL, ipl, + (avfunc)kcpc_hw_overflow_intr, + "apic pcint", apic_cpcovf_vect, + NULL, NULL, NULL, NULL); + kcpc_hw_overflow_intr_installed = 1; + kcpc_hw_enable_cpc_intr = + apic_cpcovf_mask_clear; + } + apic_reg_ops->apic_write(APIC_PCINT_VECT, + apic_cpcovf_vect); + } + } + + if (nlvt >= 6) { + /* Only mask TM intr if the BIOS apparently doesn't use it */ + + uint32_t lvtval; + + lvtval = apic_reg_ops->apic_read(APIC_THERM_VECT); + if (((lvtval & AV_MASK) == AV_MASK) || + ((lvtval & AV_DELIV_MODE) != AV_SMI)) { + apic_reg_ops->apic_write(APIC_THERM_VECT, + AV_MASK|APIC_RESV_IRQ); + } + } + + /* Enable error interrupt */ + + if (nlvt >= 4 && apic_enable_error_intr) { + if (apic_errvect == 0) { + int ipl = 0xf; /* get highest priority intr */ + apic_errvect = apix_get_ipivect(ipl, -1); + ASSERT(apic_errvect); + /* + * Not PSMI compliant, but we are going to merge + * with ON anyway + */ + (void) add_avintr(NULL, ipl, + (avfunc)apic_error_intr, "apic error intr", + apic_errvect, NULL, NULL, NULL, NULL); + } + apic_reg_ops->apic_write(APIC_ERR_VECT, apic_errvect); + apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); + apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); + } + + /* Enable CMCI interrupt */ + if (cmi_enable_cmci) { + mutex_enter(&cmci_cpu_setup_lock); + if (cmci_cpu_setup_registered == 0) { + mutex_enter(&cpu_lock); + register_cpu_setup_func(cmci_cpu_setup, NULL); + mutex_exit(&cpu_lock); + cmci_cpu_setup_registered = 1; + } + mutex_exit(&cmci_cpu_setup_lock); + + if (apic_cmci_vect == 0) { + int ipl = 0x2; + apic_cmci_vect = apix_get_ipivect(ipl, -1); + ASSERT(apic_cmci_vect); + + (void) add_avintr(NULL, ipl, + (avfunc)cmi_cmci_trap, "apic cmci intr", + apic_cmci_vect, NULL, NULL, NULL, NULL); + } + apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect); + } + + apic_reg_ops->apic_write_task_reg(0); +} + +static void +apix_picinit(void) +{ + int i, j; + uint_t isr; + + APIC_VERBOSE(INIT, (CE_CONT, "apix: psm_picinit\n")); + + /* + * initialize interrupt remapping before apic + * hardware initialization + */ + apic_intrmap_init(apic_mode); + if (apic_vt_ops == psm_vt_ops) + apix_mul_ioapic_method = APIC_MUL_IOAPIC_IIR; + + /* + * On UniSys Model 6520, the BIOS leaves vector 0x20 isr + * bit on without clearing it with EOI. 
Since softint + * uses vector 0x20 to interrupt itself, so softint will + * not work on this machine. In order to fix this problem + * a check is made to verify all the isr bits are clear. + * If not, EOIs are issued to clear the bits. + */ + for (i = 7; i >= 1; i--) { + isr = apic_reg_ops->apic_read(APIC_ISR_REG + (i * 4)); + if (isr != 0) + for (j = 0; ((j < 32) && (isr != 0)); j++) + if (isr & (1 << j)) { + apic_reg_ops->apic_write( + APIC_EOI_REG, 0); + isr &= ~(1 << j); + apic_error |= APIC_ERR_BOOT_EOI; + } + } + + /* set a flag so we know we have run apic_picinit() */ + apic_picinit_called = 1; + LOCK_INIT_CLEAR(&apic_gethrtime_lock); + LOCK_INIT_CLEAR(&apic_ioapic_lock); + LOCK_INIT_CLEAR(&apic_error_lock); + LOCK_INIT_CLEAR(&apic_mode_switch_lock); + + picsetup(); /* initialise the 8259 */ + + /* add nmi handler - least priority nmi handler */ + LOCK_INIT_CLEAR(&apic_nmi_lock); + + if (!psm_add_nmintr(0, (avfunc) apic_nmi_intr, + "apix NMI handler", (caddr_t)NULL)) + cmn_err(CE_WARN, "apix: Unable to add nmi handler"); + + apix_init_intr(); + + /* enable apic mode if imcr present */ + if (apic_imcrp) { + outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT); + outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC); + } + + ioapix_init_intr(IOAPIC_MASK); + + /* setup global IRM pool if applicable */ + if (irm_enable) + apix_irm_init(); +} + +static __inline__ void +apix_send_eoi(void) +{ + if (apic_mode == LOCAL_APIC) + LOCAL_APIC_WRITE_REG(APIC_EOI_REG, 0); + else + X2APIC_WRITE(APIC_EOI_REG, 0); +} + +/* + * platform_intr_enter + * + * Called at the beginning of the interrupt service routine to + * mask all level equal to and below the interrupt priority + * of the interrupting vector. An EOI should be given to + * the interrupt controller to enable other HW interrupts. + * + * Return -1 for spurious interrupts + * + */ +static int +apix_intr_enter(int ipl, int *vectorp) +{ + struct cpu *cpu = CPU; + uint32_t cpuid = CPU->cpu_id; + apic_cpus_info_t *cpu_infop; + uchar_t vector; + apix_vector_t *vecp; + int nipl = -1; + + /* + * The real vector delivered is (*vectorp + 0x20), but our caller + * subtracts 0x20 from the vector before passing it to us. + * (That's why APIC_BASE_VECT is 0x20.) 
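+	 *
+	 * Worked example: hardware delivers vector 0x28, our caller
+	 * hands us 0x08, and the assignment below restores 0x28:
+	 *
+	 *	vector = 0x08 + APIC_BASE_VECT;	-- 0x08 + 0x20 == 0x28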
+ */ + vector = *vectorp = (uchar_t)*vectorp + APIC_BASE_VECT; + + cpu_infop = &apic_cpus[cpuid]; + if (vector == APIC_SPUR_INTR) { + cpu_infop->aci_spur_cnt++; + return (APIC_INT_SPURIOUS); + } + + vecp = xv_vector(cpuid, vector); + if (vecp == NULL) { + if (APIX_IS_FAKE_INTR(vector)) + nipl = apix_rebindinfo.i_pri; + apix_send_eoi(); + return (nipl); + } + nipl = vecp->v_pri; + + /* if interrupted by the clock, increment apic_nsec_since_boot */ + if (vector == (apic_clkvect + APIC_BASE_VECT)) { + if (!apic_oneshot) { + /* NOTE: this is not MT aware */ + apic_hrtime_stamp++; + apic_nsec_since_boot += apic_nsec_per_intr; + apic_hrtime_stamp++; + last_count_read = apic_hertz_count; + apix_redistribute_compute(); + } + + apix_send_eoi(); + + return (nipl); + } + + ASSERT(vecp->v_state != APIX_STATE_OBSOLETED); + + /* pre-EOI handling for level-triggered interrupts */ + if (!APIX_IS_DIRECTED_EOI(apix_mul_ioapic_method) && + (vecp->v_type & APIX_TYPE_FIXED) && apic_level_intr[vecp->v_inum]) + apix_level_intr_pre_eoi(vecp->v_inum); + + /* send back EOI */ + apix_send_eoi(); + + cpu_infop->aci_current[nipl] = vector; + if ((nipl > ipl) && (nipl > cpu->cpu_base_spl)) { + cpu_infop->aci_curipl = (uchar_t)nipl; + cpu_infop->aci_ISR_in_progress |= 1 << nipl; + } + +#ifdef DEBUG + if (vector >= APIX_IPI_MIN) + return (nipl); /* skip IPI */ + + APIC_DEBUG_BUF_PUT(vector); + APIC_DEBUG_BUF_PUT(vecp->v_inum); + APIC_DEBUG_BUF_PUT(nipl); + APIC_DEBUG_BUF_PUT(psm_get_cpu_id()); + if ((apic_stretch_interrupts) && (apic_stretch_ISR & (1 << nipl))) + drv_usecwait(apic_stretch_interrupts); +#endif /* DEBUG */ + + return (nipl); +} + +/* + * Any changes made to this function must also change X2APIC + * version of intr_exit. + */ +static void +apix_intr_exit(int prev_ipl, int arg2) +{ + int cpuid = psm_get_cpu_id(); + apic_cpus_info_t *cpu_infop = &apic_cpus[cpuid]; + apix_impl_t *apixp = apixs[cpuid]; + + UNREFERENCED_1PARAMETER(arg2); + + cpu_infop->aci_curipl = (uchar_t)prev_ipl; + /* ISR above current pri could not be in progress */ + cpu_infop->aci_ISR_in_progress &= (2 << prev_ipl) - 1; + + if (apixp->x_obsoletes != NULL) { + if (APIX_CPU_LOCK_HELD(cpuid)) + return; + + APIX_ENTER_CPU_LOCK(cpuid); + (void) apix_obsolete_vector(apixp->x_obsoletes); + APIX_LEAVE_CPU_LOCK(cpuid); + } +} + +/* + * Mask all interrupts below or equal to the given IPL. + * Any changes made to this function must also change X2APIC + * version of setspl. + */ +static void +apix_setspl(int ipl) +{ + /* interrupts at ipl above this cannot be in progress */ + apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1; + + /* + * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf). + * Otherwise, enable all interrupts (i.e. set TPR to 0). + */ + if (ipl != XC_HI_PIL) + ipl = 0; + +#if defined(__amd64) + setcr8((ulong_t)ipl); +#else + if (apic_have_32bit_cr8) + setcr8((ulong_t)ipl); + else + apicadr[APIC_TASK_REG] = ipl << APIC_IPL_SHIFT; +#endif + + /* + * this is a patch fix for the ALR QSMP P5 machine, so that interrupts + * have enough time to come in before the priority is raised again + * during the idle() loop. + */ + if (apic_setspl_delay) + (void) apic_reg_ops->apic_get_pri(); +} + +/* + * X2APIC version of setspl. + */ +static void +x2apix_setspl(int ipl) +{ + /* interrupts at ipl above this cannot be in progress */ + apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1; + + /* + * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf). + * Otherwise, enable all interrupts (i.e. set TPR to 0). 
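+	 *
+	 * Example (assuming APIC_IPL_SHIFT is the 4-bit TPR priority
+	 * class shift): ipl == XC_HI_PIL (0xf) writes TPR = 0xf0,
+	 * masking every maskable vector, while ipl == 0 writes
+	 * TPR = 0, unmasking them all:
+	 *
+	 *	X2APIC_WRITE(APIC_TASK_REG, ipl << APIC_IPL_SHIFT);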
+ */ + if (ipl != XC_HI_PIL) + ipl = 0; + + X2APIC_WRITE(APIC_TASK_REG, ipl << APIC_IPL_SHIFT); +} + +int +apix_addspl(int virtvec, int ipl, int min_ipl, int max_ipl) +{ + uint32_t cpuid = APIX_VIRTVEC_CPU(virtvec); + uchar_t vector = (uchar_t)APIX_VIRTVEC_VECTOR(virtvec); + apix_vector_t *vecp = xv_vector(cpuid, vector); + + UNREFERENCED_3PARAMETER(ipl, min_ipl, max_ipl); + ASSERT(vecp != NULL && LOCK_HELD(&apix_lock)); + + if (vecp->v_type == APIX_TYPE_FIXED) + apix_intx_set_shared(vecp->v_inum, 1); + + /* There are more interrupts, so it's already been enabled */ + if (vecp->v_share > 1) + return (PSM_SUCCESS); + + /* return if it is not hardware interrupt */ + if (vecp->v_type == APIX_TYPE_IPI) + return (PSM_SUCCESS); + + /* + * if apix_picinit() has not been called yet, just return. + * At the end of apic_picinit(), we will call setup_io_intr(). + */ + if (!apic_picinit_called) + return (PSM_SUCCESS); + + (void) apix_setup_io_intr(vecp); + + return (PSM_SUCCESS); +} + +int +apix_delspl(int virtvec, int ipl, int min_ipl, int max_ipl) +{ + uint32_t cpuid = APIX_VIRTVEC_CPU(virtvec); + uchar_t vector = (uchar_t)APIX_VIRTVEC_VECTOR(virtvec); + apix_vector_t *vecp = xv_vector(cpuid, vector); + + UNREFERENCED_3PARAMETER(ipl, min_ipl, max_ipl); + ASSERT(vecp != NULL && LOCK_HELD(&apix_lock)); + + if (vecp->v_type == APIX_TYPE_FIXED) + apix_intx_set_shared(vecp->v_inum, -1); + + /* There are more interrupts */ + if (vecp->v_share > 1) + return (PSM_SUCCESS); + + /* return if it is not hardware interrupt */ + if (vecp->v_type == APIX_TYPE_IPI) + return (PSM_SUCCESS); + + if (!apic_picinit_called) { + cmn_err(CE_WARN, "apix: delete 0x%x before apic init", + virtvec); + return (PSM_SUCCESS); + } + + apix_disable_vector(vecp); + + return (PSM_SUCCESS); +} + +/* + * Try and disable all interrupts. We just assign interrupts to other + * processors based on policy. If any were bound by user request, we + * let them continue and return failure. We do not bother to check + * for cache affinity while rebinding. + */ +static int +apix_disable_intr(processorid_t cpun) +{ + apix_impl_t *apixp = apixs[cpun]; + apix_vector_t *vecp, *newp; + int bindcpu, i, hardbound = 0, errbound = 0, ret, loop, type; + + lock_set(&apix_lock); + + apic_cpus[cpun].aci_status &= ~APIC_CPU_INTR_ENABLE; + apic_cpus[cpun].aci_curipl = 0; + + /* if this is for SUSPEND operation, skip rebinding */ + if (apic_cpus[cpun].aci_status & APIC_CPU_SUSPEND) { + for (i = APIX_AVINTR_MIN; i <= APIX_AVINTR_MAX; i++) { + vecp = apixp->x_vectbl[i]; + if (!IS_VECT_ENABLED(vecp)) + continue; + + apix_disable_vector(vecp); + } + lock_clear(&apix_lock); + return (PSM_SUCCESS); + } + + for (i = APIX_AVINTR_MIN; i <= APIX_AVINTR_MAX; i++) { + vecp = apixp->x_vectbl[i]; + if (!IS_VECT_ENABLED(vecp)) + continue; + + if (vecp->v_flags & APIX_VECT_USER_BOUND) { + hardbound++; + continue; + } + type = vecp->v_type; + + /* + * If there are bound interrupts on this cpu, then + * rebind them to other processors. 
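+		 *
+		 * In brief, the per-vector policy applied here is:
+		 *
+		 *	APIX_VECT_USER_BOUND	left in place; PSM_FAILURE
+		 *				is reported at the end
+		 *	APIX_TYPE_MSI		moved as a group via
+		 *				apix_grp_set_cpu()
+		 *	everything else		moved singly via
+		 *				apix_set_cpu()
+		 *
+		 * with each move retried across up to apic_nproc
+		 * candidate CPUs.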
+ */ + loop = 0; + do { + bindcpu = apic_find_cpu(APIC_CPU_INTR_ENABLE); + + if (type != APIX_TYPE_MSI) + newp = apix_set_cpu(vecp, bindcpu, &ret); + else + newp = apix_grp_set_cpu(vecp, bindcpu, &ret); + } while ((newp == NULL) && (loop++ < apic_nproc)); + + if (loop >= apic_nproc) { + errbound++; + cmn_err(CE_WARN, "apix: failed to rebind vector %x/%x", + vecp->v_cpuid, vecp->v_vector); + } + } + + lock_clear(&apix_lock); + + if (hardbound || errbound) { + cmn_err(CE_WARN, "Could not disable interrupts on %d" + "due to user bound interrupts or failed operation", + cpun); + return (PSM_FAILURE); + } + + return (PSM_SUCCESS); +} + +/* + * Bind interrupts to specified CPU + */ +static void +apix_enable_intr(processorid_t cpun) +{ + apix_vector_t *vecp; + int i, ret; + processorid_t n; + + lock_set(&apix_lock); + + apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE; + + /* interrupt enabling for system resume */ + if (apic_cpus[cpun].aci_status & APIC_CPU_SUSPEND) { + for (i = APIX_AVINTR_MIN; i <= APIX_AVINTR_MAX; i++) { + vecp = xv_vector(cpun, i); + if (!IS_VECT_ENABLED(vecp)) + continue; + + apix_enable_vector(vecp); + } + apic_cpus[cpun].aci_status &= ~APIC_CPU_SUSPEND; + } + + for (n = 0; n < apic_nproc; n++) { + if (!apic_cpu_in_range(n) || n == cpun || + (apic_cpus[n].aci_status & APIC_CPU_INTR_ENABLE) == 0) + continue; + + for (i = APIX_AVINTR_MIN; i <= APIX_AVINTR_MAX; i++) { + vecp = xv_vector(n, i); + if (!IS_VECT_ENABLED(vecp) || + vecp->v_bound_cpuid != cpun) + continue; + + if (vecp->v_type != APIX_TYPE_MSI) + (void) apix_set_cpu(vecp, cpun, &ret); + else + (void) apix_grp_set_cpu(vecp, cpun, &ret); + } + } + + lock_clear(&apix_lock); +} + +/* + * Allocate vector for IPI + * type == -1 indicates it is an internal request. Do not change + * resv_vector for these requests. + */ +static int +apix_get_ipivect(int ipl, int type) +{ + uchar_t vector; + + if ((vector = apix_alloc_ipi(ipl)) > 0) { + if (type != -1) + apic_resv_vector[ipl] = vector; + return (vector); + } + apic_error |= APIC_ERR_GET_IPIVECT_FAIL; + return (-1); /* shouldn't happen */ +} + +static int +apix_get_clkvect(int ipl) +{ + int vector; + + if ((vector = apix_get_ipivect(ipl, -1)) == -1) + return (-1); + + apic_clkvect = vector - APIC_BASE_VECT; + APIC_VERBOSE(IPI, (CE_CONT, "apix: clock vector = %x\n", + apic_clkvect)); + return (vector); +} + +static int +apix_post_cpu_start() +{ + int cpun; + static int cpus_started = 1; + + /* We know this CPU + BSP started successfully. */ + cpus_started++; + + /* + * On BSP we would have enabled X2APIC, if supported by processor, + * in acpi_probe(), but on AP we do it here. + * + * We enable X2APIC mode only if BSP is running in X2APIC & the + * local APIC mode of the current CPU is MMIO (xAPIC). + */ + if (apic_mode == LOCAL_X2APIC && apic_detect_x2apic() && + apic_local_mode() == LOCAL_APIC) { + apic_enable_x2apic(); + } + + /* + * Switch back to x2apic IPI sending method for performance when target + * CPU has entered x2apic mode. 
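+	 *
+	 * (In x2APIC mode an IPI is a single 64-bit write to the ICR
+	 * MSR, destination APIC ID in bits 63:32, with no delivery-
+	 * status bit to poll; an architectural sketch, not this
+	 * module's code:
+	 *
+	 *	wrmsr(0x830, ((uint64_t)dest_apicid << 32) | vector);
+	 *
+	 * hence the performance note above.)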
+	 */
+	if (apic_mode == LOCAL_X2APIC) {
+		apic_switch_ipi_callback(B_FALSE);
+	}
+
+	splx(ipltospl(LOCK_LEVEL));
+	apix_init_intr();
+
+	/*
+	 * Some systems don't enable the internal cache on the non-boot
+	 * CPUs, so enable it here.
+	 */
+	setcr0(getcr0() & ~(CR0_CD | CR0_NW));
+
+#ifdef	DEBUG
+	APIC_AV_PENDING_SET();
+#else
+	if (apic_mode == LOCAL_APIC)
+		APIC_AV_PENDING_SET();
+#endif	/* DEBUG */
+
+	/*
+	 * We may be booting, or resuming from suspend; aci_status will
+	 * be APIC_CPU_INTR_ENABLE if coming from suspend, so we add the
+	 * APIC_CPU_ONLINE flag here rather than setting aci_status completely.
+	 */
+	cpun = psm_get_cpu_id();
+	apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
+
+	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
+
+	return (PSM_SUCCESS);
+}
+
+/*
+ * If this module needs a periodic handler for the interrupt distribution,
+ * it can be added here.  The argument to the periodic handler is not
+ * currently used, but is reserved for future use.
+ */
+static void
+apix_post_cyclic_setup(void *arg)
+{
+	UNREFERENCED_1PARAMETER(arg);
+
+	/* cpu_lock is held */
+	/* set up a periodic handler for intr redistribution */
+
+	/*
+	 * In periodic mode, intr redistribution processing is done in
+	 * apix_intr_enter() during clock interrupt processing.
+	 */
+	if (!apic_oneshot)
+		return;
+
+	/*
+	 * Register a periodic handler for the redistribution processing.
+	 * On x86, CY_LOW_LEVEL is mapped to the level 2 interrupt, so
+	 * DDI_IPL_2 should be passed to ddi_periodic_add() here.
+	 */
+	apic_periodic_id = ddi_periodic_add(
+	    (void (*)(void *))apix_redistribute_compute, NULL,
+	    apic_redistribute_sample_interval, DDI_IPL_2);
+}
+
+void
+x2apic_update_psm()
+{
+	struct psm_ops *pops = &apix_ops;
+
+	ASSERT(pops != NULL);
+
+	/*
+	 * The xxx_intr_exit() sets TPR and sends back EOI.  The
+	 * xxx_setspl() sets TPR.  These two routines are not
+	 * needed in the new design.
+	 *
+	 *	pops->psm_intr_exit = x2apic_intr_exit;
+	 *	pops->psm_setspl = x2apic_setspl;
+	 */
+	pops->psm_setspl = x2apix_setspl;
+	pops->psm_send_ipi = x2apic_send_ipi;
+
+	send_dirintf = pops->psm_send_ipi;
+
+	apic_mode = LOCAL_X2APIC;
+	apic_change_ops();
+}
+
+/*
+ * This function provides the external interface to the nexus for all
+ * functionality related to the new DDI interrupt framework.
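+ *
+ * For example, the pci-ide nexus above reaches this entry point
+ * through the psm_intr_ops hook:
+ *
+ *	(*psm_intr_ops)(rdip, hdlp, PSM_INTR_OP_ALLOC_VECTORS, result);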
+ * + * Input: + * dip - pointer to the dev_info structure of the requested device + * hdlp - pointer to the internal interrupt handle structure for the + * requested interrupt + * intr_op - opcode for this call + * result - pointer to the integer that will hold the result to be + * passed back if return value is PSM_SUCCESS + * + * Output: + * return value is either PSM_SUCCESS or PSM_FAILURE + */ +static int +apix_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, + psm_intr_op_t intr_op, int *result) +{ + int cap; + apix_vector_t *vecp, *newvecp; + struct intrspec *ispec, intr_spec; + processorid_t target; + + ispec = &intr_spec; + ispec->intrspec_pri = hdlp->ih_pri; + ispec->intrspec_vec = hdlp->ih_inum; + ispec->intrspec_func = hdlp->ih_cb_func; + + switch (intr_op) { + case PSM_INTR_OP_ALLOC_VECTORS: + switch (hdlp->ih_type) { + case DDI_INTR_TYPE_MSI: + /* allocate MSI vectors */ + *result = apix_alloc_msi(dip, hdlp->ih_inum, + hdlp->ih_scratch1, + (int)(uintptr_t)hdlp->ih_scratch2); + break; + case DDI_INTR_TYPE_MSIX: + /* allocate MSI-X vectors */ + *result = apix_alloc_msix(dip, hdlp->ih_inum, + hdlp->ih_scratch1, + (int)(uintptr_t)hdlp->ih_scratch2); + break; + case DDI_INTR_TYPE_FIXED: + /* allocate or share vector for fixed */ + if ((ihdl_plat_t *)hdlp->ih_private == NULL) { + return (PSM_FAILURE); + } + ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp; + *result = apix_intx_alloc_vector(dip, hdlp->ih_inum, + ispec); + break; + default: + return (PSM_FAILURE); + } + break; + case PSM_INTR_OP_FREE_VECTORS: + apix_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1, + hdlp->ih_type); + break; + case PSM_INTR_OP_XLATE_VECTOR: + /* + * Vectors are allocated by ALLOC and freed by FREE. + * XLATE finds and returns APIX_VIRTVEC_VECTOR(cpu, vector). + */ + *result = APIX_INVALID_VECT; + vecp = apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type); + if (vecp != NULL) { + *result = APIX_VIRTVECTOR(vecp->v_cpuid, + vecp->v_vector); + break; + } + + /* + * No vector to device mapping exists. If this is FIXED type + * then check if this IRQ is already mapped for another device + * then return the vector number for it (i.e. shared IRQ case). + * Otherwise, return PSM_FAILURE. + */ + if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) { + vecp = apix_intx_xlate_vector(dip, hdlp->ih_inum, + ispec); + *result = (vecp == NULL) ? 
APIX_INVALID_VECT : + APIX_VIRTVECTOR(vecp->v_cpuid, vecp->v_vector); + } + if (*result == APIX_INVALID_VECT) + return (PSM_FAILURE); + break; + case PSM_INTR_OP_GET_PENDING: + vecp = apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type); + if (vecp == NULL) + return (PSM_FAILURE); + + *result = apix_get_pending(vecp); + break; + case PSM_INTR_OP_CLEAR_MASK: + if (hdlp->ih_type != DDI_INTR_TYPE_FIXED) + return (PSM_FAILURE); + + vecp = apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type); + if (vecp == NULL) + return (PSM_FAILURE); + + apix_intx_clear_mask(vecp->v_inum); + break; + case PSM_INTR_OP_SET_MASK: + if (hdlp->ih_type != DDI_INTR_TYPE_FIXED) + return (PSM_FAILURE); + + vecp = apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type); + if (vecp == NULL) + return (PSM_FAILURE); + + apix_intx_set_mask(vecp->v_inum); + break; + case PSM_INTR_OP_GET_SHARED: + if (hdlp->ih_type != DDI_INTR_TYPE_FIXED) + return (PSM_FAILURE); + + vecp = apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type); + if (vecp == NULL) + return (PSM_FAILURE); + + *result = apix_intx_get_shared(vecp->v_inum); + break; + case PSM_INTR_OP_SET_PRI: + /* + * Called prior to adding the interrupt handler or when + * an interrupt handler is unassigned. + */ + if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) + return (PSM_SUCCESS); + + if (apix_get_dev_map(dip, hdlp->ih_inum, hdlp->ih_type) == NULL) + return (PSM_FAILURE); + + break; + case PSM_INTR_OP_SET_CPU: + case PSM_INTR_OP_GRP_SET_CPU: + /* + * The interrupt handle given here has been allocated + * specifically for this command, and ih_private carries + * a CPU value. + */ + *result = EINVAL; + target = (int)(intptr_t)hdlp->ih_private; + if (!apic_cpu_in_range(target)) { + DDI_INTR_IMPLDBG((CE_WARN, + "[grp_]set_cpu: cpu out of range: %d\n", target)); + return (PSM_FAILURE); + } + + lock_set(&apix_lock); + + vecp = apix_get_req_vector(hdlp, hdlp->ih_flags); + if (!IS_VECT_ENABLED(vecp)) { + DDI_INTR_IMPLDBG((CE_WARN, + "[grp]_set_cpu: invalid vector 0x%x\n", + hdlp->ih_vector)); + lock_clear(&apix_lock); + return (PSM_FAILURE); + } + + *result = 0; + + if (intr_op == PSM_INTR_OP_SET_CPU) + newvecp = apix_set_cpu(vecp, target, result); + else + newvecp = apix_grp_set_cpu(vecp, target, result); + + lock_clear(&apix_lock); + + if (newvecp == NULL) { + *result = EIO; + return (PSM_FAILURE); + } + newvecp->v_bound_cpuid = target; + hdlp->ih_vector = APIX_VIRTVECTOR(newvecp->v_cpuid, + newvecp->v_vector); + break; + + case PSM_INTR_OP_GET_INTR: + /* + * The interrupt handle given here has been allocated + * specifically for this command, and ih_private carries + * a pointer to a apic_get_intr_t. + */ + if (apix_get_intr_info(hdlp, hdlp->ih_private) != PSM_SUCCESS) + return (PSM_FAILURE); + break; + + case PSM_INTR_OP_CHECK_MSI: + /* + * Check MSI/X is supported or not at APIC level and + * masked off the MSI/X bits in hdlp->ih_type if not + * supported before return. If MSI/X is supported, + * leave the ih_type unchanged and return. + * + * hdlp->ih_type passed in from the nexus has all the + * interrupt types supported by the device. 
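+		 *
+		 * Example: a device advertising FIXED|MSI|MSIX on an
+		 * APIC with no MSI support gets everything but FIXED
+		 * masked off:
+		 *
+		 *	*result = hdlp->ih_type &
+		 *	    ~(DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX);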
+ */ + if (apic_support_msi == 0) { /* uninitialized */ + /* + * if apic_support_msi is not set, call + * apic_check_msi_support() to check whether msi + * is supported first + */ + if (apic_check_msi_support() == PSM_SUCCESS) + apic_support_msi = 1; /* supported */ + else + apic_support_msi = -1; /* not-supported */ + } + if (apic_support_msi == 1) { + if (apic_msix_enable) + *result = hdlp->ih_type; + else + *result = hdlp->ih_type & ~DDI_INTR_TYPE_MSIX; + } else + *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI | + DDI_INTR_TYPE_MSIX); + break; + case PSM_INTR_OP_GET_CAP: + cap = DDI_INTR_FLAG_PENDING; + if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) + cap |= DDI_INTR_FLAG_MASKABLE; + *result = cap; + break; + case PSM_INTR_OP_APIC_TYPE: + ((apic_get_type_t *)(hdlp->ih_private))->avgi_type = + apix_get_apic_type(); + ((apic_get_type_t *)(hdlp->ih_private))->avgi_num_intr = + APIX_IPI_MIN; + ((apic_get_type_t *)(hdlp->ih_private))->avgi_num_cpu = + apic_nproc; + hdlp->ih_ver = apic_get_apic_version(); + break; + case PSM_INTR_OP_SET_CAP: + default: + return (PSM_FAILURE); + } + + return (PSM_SUCCESS); +} + +static void +apix_cleanup_busy(void) +{ + int i, j; + apix_vector_t *vecp; + + for (i = 0; i < apic_nproc; i++) { + if (!apic_cpu_in_range(i)) + continue; + apic_cpus[i].aci_busy = 0; + for (j = APIX_AVINTR_MIN; j < APIX_AVINTR_MAX; j++) { + if ((vecp = xv_vector(i, j)) != NULL) + vecp->v_busy = 0; + } + } +} + +static void +apix_redistribute_compute(void) +{ + int i, j, max_busy; + + if (!apic_enable_dynamic_migration) + return; + + if (++apic_nticks == apic_sample_factor_redistribution) { + /* + * Time to call apic_intr_redistribute(). + * reset apic_nticks. This will cause max_busy + * to be calculated below and if it is more than + * apic_int_busy, we will do the whole thing + */ + apic_nticks = 0; + } + max_busy = 0; + for (i = 0; i < apic_nproc; i++) { + if (!apic_cpu_in_range(i)) + continue; + /* + * Check if curipl is non zero & if ISR is in + * progress + */ + if (((j = apic_cpus[i].aci_curipl) != 0) && + (apic_cpus[i].aci_ISR_in_progress & (1 << j))) { + + int vect; + apic_cpus[i].aci_busy++; + vect = apic_cpus[i].aci_current[j]; + apixs[i]->x_vectbl[vect]->v_busy++; + } + + if (!apic_nticks && + (apic_cpus[i].aci_busy > max_busy)) + max_busy = apic_cpus[i].aci_busy; + } + if (!apic_nticks) { + if (max_busy > apic_int_busy_mark) { + /* + * We could make the following check be + * skipped > 1 in which case, we get a + * redistribution at half the busy mark (due to + * double interval). Need to be able to collect + * more empirical data to decide if that is a + * good strategy. Punt for now. + */ + apix_cleanup_busy(); + apic_skipped_redistribute = 0; + } else + apic_skipped_redistribute++; + } +} + +/* + * intr_ops() service routines + */ + +static int +apix_get_pending(apix_vector_t *vecp) +{ + int bit, index, irr, pending; + + /* need to get on the bound cpu */ + mutex_enter(&cpu_lock); + affinity_set(vecp->v_cpuid); + + index = vecp->v_vector / 32; + bit = vecp->v_vector % 32; + irr = apic_reg_ops->apic_read(APIC_IRR_REG + index); + + affinity_clear(); + mutex_exit(&cpu_lock); + + pending = (irr & (1 << bit)) ? 
1 : 0; + if (!pending && vecp->v_type == APIX_TYPE_FIXED) + pending = apix_intx_get_pending(vecp->v_inum); + + return (pending); +} + +static apix_vector_t * +apix_get_req_vector(ddi_intr_handle_impl_t *hdlp, ushort_t flags) +{ + apix_vector_t *vecp; + processorid_t cpuid; + int32_t virt_vec = 0; + + switch (flags & PSMGI_INTRBY_FLAGS) { + case PSMGI_INTRBY_IRQ: + return (apix_intx_get_vector(hdlp->ih_vector)); + case PSMGI_INTRBY_VEC: + virt_vec = (virt_vec == 0) ? hdlp->ih_vector : virt_vec; + + cpuid = APIX_VIRTVEC_CPU(virt_vec); + if (!apic_cpu_in_range(cpuid)) + return (NULL); + + vecp = xv_vector(cpuid, APIX_VIRTVEC_VECTOR(virt_vec)); + break; + case PSMGI_INTRBY_DEFAULT: + vecp = apix_get_dev_map(hdlp->ih_dip, hdlp->ih_inum, + hdlp->ih_type); + break; + default: + return (NULL); + } + + return (vecp); +} + +static int +apix_get_intr_info(ddi_intr_handle_impl_t *hdlp, + apic_get_intr_t *intr_params_p) +{ + apix_vector_t *vecp; + struct autovec *av_dev; + int i; + + vecp = apix_get_req_vector(hdlp, intr_params_p->avgi_req_flags); + if (IS_VECT_FREE(vecp)) { + intr_params_p->avgi_num_devs = 0; + intr_params_p->avgi_cpu_id = 0; + intr_params_p->avgi_req_flags = 0; + return (PSM_SUCCESS); + } + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_CPUID) { + intr_params_p->avgi_cpu_id = vecp->v_cpuid; + + /* Return user bound info for intrd. */ + if (intr_params_p->avgi_cpu_id & IRQ_USER_BOUND) { + intr_params_p->avgi_cpu_id &= ~IRQ_USER_BOUND; + intr_params_p->avgi_cpu_id |= PSMGI_CPU_USER_BOUND; + } + } + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_VECTOR) + intr_params_p->avgi_vector = vecp->v_vector; + + if (intr_params_p->avgi_req_flags & + (PSMGI_REQ_NUM_DEVS | PSMGI_REQ_GET_DEVS)) + /* Get number of devices from apic_irq table shared field. */ + intr_params_p->avgi_num_devs = vecp->v_share; + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_GET_DEVS) { + + intr_params_p->avgi_req_flags |= PSMGI_REQ_NUM_DEVS; + + /* Some devices have NULL dip. Don't count these. */ + if (intr_params_p->avgi_num_devs > 0) { + for (i = 0, av_dev = vecp->v_autovect; av_dev; + av_dev = av_dev->av_link) { + if (av_dev->av_vector && av_dev->av_dip) + i++; + } + intr_params_p->avgi_num_devs = + (uint8_t)MIN(intr_params_p->avgi_num_devs, i); + } + + /* There are no viable dips to return. */ + if (intr_params_p->avgi_num_devs == 0) { + intr_params_p->avgi_dip_list = NULL; + + } else { /* Return list of dips */ + + /* Allocate space in array for that number of devs. */ + intr_params_p->avgi_dip_list = kmem_zalloc( + intr_params_p->avgi_num_devs * + sizeof (dev_info_t *), + KM_NOSLEEP); + if (intr_params_p->avgi_dip_list == NULL) { + DDI_INTR_IMPLDBG((CE_WARN, + "apix_get_vector_intr_info: no memory")); + return (PSM_FAILURE); + } + + /* + * Loop through the device list of the autovec table + * filling in the dip array. + * + * Note that the autovect table may have some special + * entries which contain NULL dips. These will be + * ignored. 
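+ *
+ * Only entries with both av_vector and av_dip set are copied,
+ * matching the counting loop above.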
+ */ + for (i = 0, av_dev = vecp->v_autovect; av_dev; + av_dev = av_dev->av_link) { + if (av_dev->av_vector && av_dev->av_dip) + intr_params_p->avgi_dip_list[i++] = + av_dev->av_dip; + } + } + } + + return (PSM_SUCCESS); +} + +static char * +apix_get_apic_type(void) +{ + return (apix_psm_info.p_mach_idstring); +} + +apix_vector_t * +apix_set_cpu(apix_vector_t *vecp, int new_cpu, int *result) +{ + apix_vector_t *newp = NULL; + dev_info_t *dip; + int inum, cap_ptr; + ddi_acc_handle_t handle; + ddi_intr_msix_t *msix_p; + ushort_t msix_ctrl; + uintptr_t off; + uint32_t mask; + + ASSERT(LOCK_HELD(&apix_lock)); + *result = ENXIO; + + /* Fail if this is an MSI intr and is part of a group. */ + if (vecp->v_type == APIX_TYPE_MSI) { + if (i_ddi_intr_get_current_nintrs(APIX_GET_DIP(vecp)) > 1) + return (NULL); + else + return (apix_grp_set_cpu(vecp, new_cpu, result)); + } + + /* + * Mask MSI-X. It's unmasked when MSI-X gets enabled. + */ + if (vecp->v_type == APIX_TYPE_MSIX) { + if ((dip = APIX_GET_DIP(vecp)) == NULL) + return (NULL); + inum = vecp->v_devp->dv_inum; + + handle = i_ddi_get_pci_config_handle(dip); + cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); + msix_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); + if ((msix_ctrl & PCI_MSIX_FUNCTION_MASK) == 0) { + /* + * Function is not masked, then mask "inum"th + * entry in the MSI-X table + */ + msix_p = i_ddi_get_msix(dip); + off = (uintptr_t)msix_p->msix_tbl_addr + (inum * + PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; + mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); + ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, + mask | 1); + } + } + + *result = 0; + + if ((newp = apix_rebind(vecp, new_cpu, 1)) == NULL) + *result = EIO; + + return (newp); +} + +/* + * Set cpu for MSIs + */ +apix_vector_t * +apix_grp_set_cpu(apix_vector_t *vecp, int new_cpu, int *result) +{ + apix_vector_t *newp, *vp; + uint32_t orig_cpu = vecp->v_cpuid; + int orig_vect = vecp->v_vector; + int i, num_vectors, cap_ptr, msi_mask_off; + uint32_t msi_pvm; + ushort_t msi_ctrl; + ddi_acc_handle_t handle; + dev_info_t *dip; + + APIC_VERBOSE(INTR, (CE_CONT, "apix_grp_set_cpu: oldcpu: %x, vector: %x," + " newcpu:%x\n", vecp->v_cpuid, vecp->v_vector, new_cpu)); + + ASSERT(LOCK_HELD(&apix_lock)); + + *result = ENXIO; + + if (vecp->v_type != APIX_TYPE_MSI) { + DDI_INTR_IMPLDBG((CE_WARN, "set_grp: intr not MSI\n")); + return (NULL); + } + + if ((dip = APIX_GET_DIP(vecp)) == NULL) + return (NULL); + + num_vectors = i_ddi_intr_get_current_nintrs(dip); + if ((num_vectors < 1) || ((num_vectors - 1) & orig_vect)) { + APIC_VERBOSE(INTR, (CE_WARN, + "set_grp: base vec not part of a grp or not aligned: " + "vec:0x%x, num_vec:0x%x\n", orig_vect, num_vectors)); + return (NULL); + } + + if (vecp->v_inum != apix_get_min_dev_inum(dip, vecp->v_type)) + return (NULL); + + *result = EIO; + for (i = 1; i < num_vectors; i++) { + if ((vp = xv_vector(orig_cpu, orig_vect + i)) == NULL) + return (NULL); +#ifdef DEBUG + /* + * Sanity check: CPU and dip is the same for all entries. 
+ * This may be called while the first MSI is being enabled, at
+ * which time add_avintr() has not yet been called for the other
+ * MSIs.
+ */
+ if ((vp->v_share != 0) &&
+ ((APIX_GET_DIP(vp) != dip) ||
+ (vp->v_cpuid != vecp->v_cpuid))) {
+ APIC_VERBOSE(INTR, (CE_WARN,
+ "set_grp: cpu or dip for vec 0x%x difft than for "
+ "vec 0x%x\n", orig_vect, orig_vect + i));
+ APIC_VERBOSE(INTR, (CE_WARN,
+ " cpu: %d vs %d, dip: 0x%p vs 0x%p\n", orig_cpu,
+ vp->v_cpuid, (void *)dip,
+ (void *)APIX_GET_DIP(vp)));
+ return (NULL);
+ }
+#endif /* DEBUG */
+ }
+
+ cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
+ handle = i_ddi_get_pci_config_handle(dip);
+ msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
+
+ /* MSI per-vector masking is supported. */
+ if (msi_ctrl & PCI_MSI_PVM_MASK) {
+ if (msi_ctrl & PCI_MSI_64BIT_MASK)
+ msi_mask_off = cap_ptr + PCI_MSI_64BIT_MASKBITS;
+ else
+ msi_mask_off = cap_ptr + PCI_MSI_32BIT_MASK;
+ msi_pvm = pci_config_get32(handle, msi_mask_off);
+ pci_config_put32(handle, msi_mask_off, (uint32_t)-1);
+ APIC_VERBOSE(INTR, (CE_CONT,
+ "set_grp: pvm supported. Mask set to 0x%x\n",
+ pci_config_get32(handle, msi_mask_off)));
+ }
+
+ if ((newp = apix_rebind(vecp, new_cpu, num_vectors)) != NULL)
+ *result = 0;
+
+ /* Reenable vectors if per-vector masking is supported. */
+ if (msi_ctrl & PCI_MSI_PVM_MASK) {
+ pci_config_put32(handle, msi_mask_off, msi_pvm);
+ APIC_VERBOSE(INTR, (CE_CONT,
+ "set_grp: pvm supported. Mask restored to 0x%x\n",
+ pci_config_get32(handle, msi_mask_off)));
+ }
+
+ return (newp);
+}
+
+void
+apix_intx_set_vector(int irqno, uint32_t cpuid, uchar_t vector)
+{
+ apic_irq_t *irqp;
+
+ mutex_enter(&airq_mutex);
+ irqp = apic_irq_table[irqno];
+ irqp->airq_cpu = cpuid;
+ irqp->airq_vector = vector;
+ apic_record_rdt_entry(irqp, irqno);
+ mutex_exit(&airq_mutex);
+}
+
+apix_vector_t *
+apix_intx_get_vector(int irqno)
+{
+ apic_irq_t *irqp;
+ uint32_t cpuid;
+ uchar_t vector;
+
+ mutex_enter(&airq_mutex);
+ irqp = apic_irq_table[irqno & 0xff];
+ if (IS_IRQ_FREE(irqp) || (irqp->airq_cpu == IRQ_UNINIT)) {
+ mutex_exit(&airq_mutex);
+ return (NULL);
+ }
+ cpuid = irqp->airq_cpu;
+ vector = irqp->airq_vector;
+ mutex_exit(&airq_mutex);
+
+ return (xv_vector(cpuid, vector));
+}
+
+/*
+ * Must be called with interrupts disabled and apic_ioapic_lock held
+ */
+void
+apix_intx_enable(int irqno)
+{
+ uchar_t ioapicindex, intin;
+ apic_irq_t *irqp = apic_irq_table[irqno];
+ ioapic_rdt_t irdt;
+ apic_cpus_info_t *cpu_infop;
+ apix_vector_t *vecp = xv_vector(irqp->airq_cpu, irqp->airq_vector);
+
+ ASSERT(LOCK_HELD(&apic_ioapic_lock) && !IS_IRQ_FREE(irqp));
+
+ ioapicindex = irqp->airq_ioapicindex;
+ intin = irqp->airq_intin_no;
+ cpu_infop = &apic_cpus[irqp->airq_cpu];
+
+ irdt.ir_lo = AV_PDEST | AV_FIXED | irqp->airq_rdt_entry;
+ irdt.ir_hi = cpu_infop->aci_local_id;
+
+ apic_vt_ops->apic_intrmap_alloc_entry(&vecp->v_intrmap_private, NULL,
+ vecp->v_type, 1, ioapicindex);
+ apic_vt_ops->apic_intrmap_map_entry(vecp->v_intrmap_private,
+ (void *)&irdt, vecp->v_type, 1);
+ apic_vt_ops->apic_intrmap_record_rdt(vecp->v_intrmap_private, &irdt);
+
+ /* write RDT entry high dword - destination */
+ WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin,
+ irdt.ir_hi);
+
+ /* Write the vector, trigger, and polarity portion of the RDT */
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin, irdt.ir_lo);
+
+ vecp->v_state = APIX_STATE_ENABLED;
+
+ APIC_VERBOSE_IOAPIC((CE_CONT, "apix_intx_enable: ioapic 0x%x"
+ " intin 0x%x rdt_low 0x%x rdt_high 0x%x\n",
+ ioapicindex, intin, irdt.ir_lo, irdt.ir_hi));
+}
+
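+/*
+ * Illustrative calling sequence for apix_intx_enable() and
+ * apix_intx_disable() (apix_intx_rebind() below is a real caller):
+ *
+ * iflag = intr_clear();
+ * lock_set(&apic_ioapic_lock);
+ * apix_intx_enable(irqno);
+ * lock_clear(&apic_ioapic_lock);
+ * intr_restore(iflag);
+ */
+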
+/*
+ * Must be called with interrupts disabled and apic_ioapic_lock held
+ */
+void
+apix_intx_disable(int irqno)
+{
+ apic_irq_t *irqp = apic_irq_table[irqno];
+ int ioapicindex, intin;
+
+ ASSERT(LOCK_HELD(&apic_ioapic_lock) && !IS_IRQ_FREE(irqp));
+ /*
+ * The assumption here is that this is safe, even for
+ * systems with IOAPICs that suffer from the hardware
+ * erratum because all devices have been quiesced before
+ * they unregister their interrupt handlers. If that
+ * assumption turns out to be false, this mask operation
+ * can induce the same erratum result we're trying to
+ * avoid.
+ */
+ ioapicindex = irqp->airq_ioapicindex;
+ intin = irqp->airq_intin_no;
+ ioapic_write(ioapicindex, APIC_RDT_CMD + 2 * intin, AV_MASK);
+
+ APIC_VERBOSE_IOAPIC((CE_CONT, "apix_intx_disable: ioapic 0x%x"
+ " intin 0x%x\n", ioapicindex, intin));
+}
+
+void
+apix_intx_free(int irqno)
+{
+ apic_irq_t *irqp;
+
+ mutex_enter(&airq_mutex);
+ irqp = apic_irq_table[irqno];
+
+ if (IS_IRQ_FREE(irqp)) {
+ mutex_exit(&airq_mutex);
+ return;
+ }
+
+ irqp->airq_mps_intr_index = FREE_INDEX;
+ irqp->airq_cpu = IRQ_UNINIT;
+ irqp->airq_vector = APIX_INVALID_VECT;
+ mutex_exit(&airq_mutex);
+}
+
+#ifdef DEBUG
+int apix_intr_deliver_timeouts = 0;
+int apix_intr_rirr_timeouts = 0;
+int apix_intr_rirr_reset_failure = 0;
+#endif
+int apix_max_reps_irr_pending = 10;
+
+#define GET_RDT_BITS(ioapic, intin, bits) \
+ (READ_IOAPIC_RDT_ENTRY_LOW_DWORD((ioapic), (intin)) & (bits))
+#define APIX_CHECK_IRR_DELAY drv_usectohz(5000)
+
+int
+apix_intx_rebind(int irqno, processorid_t cpuid, uchar_t vector)
+{
+ apic_irq_t *irqp = apic_irq_table[irqno];
+ ulong_t iflag;
+ int waited, ioapic_ix, intin_no, level, repeats, rdt_entry, masked;
+
+ ASSERT(irqp != NULL);
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ ioapic_ix = irqp->airq_ioapicindex;
+ intin_no = irqp->airq_intin_no;
+ level = apic_level_intr[irqno];
+
+ /*
+ * Wait for the delivery status bit to be cleared. This should
+ * be a very small amount of time.
+ */
+ repeats = 0;
+ do {
+ repeats++;
+
+ for (waited = 0; waited < apic_max_reps_clear_pending;
+ waited++) {
+ if (GET_RDT_BITS(ioapic_ix, intin_no, AV_PENDING) == 0)
+ break;
+ }
+ if (!level)
+ break;
+
+ /*
+ * Mask the RDT entry for level-triggered interrupts.
+ */
+ irqp->airq_rdt_entry |= AV_MASK;
+ rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
+ intin_no);
+ if ((masked = (rdt_entry & AV_MASK)) == 0) {
+ /* Mask it */
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
+ AV_MASK | rdt_entry);
+ }
+
+ /*
+ * If there was a race and an interrupt was injected
+ * just before we masked, check for that case here.
+ * Then, unmask the RDT entry and try again. If we're
+ * on our last try, don't unmask (because we want the
+ * RDT entry to remain masked for the rest of the
+ * function). 
+ */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no); + if ((masked == 0) && ((rdt_entry & AV_PENDING) != 0) && + (repeats < apic_max_reps_clear_pending)) { + /* Unmask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, rdt_entry & ~AV_MASK); + irqp->airq_rdt_entry &= ~AV_MASK; + } + } while ((rdt_entry & AV_PENDING) && + (repeats < apic_max_reps_clear_pending)); + +#ifdef DEBUG + if (GET_RDT_BITS(ioapic_ix, intin_no, AV_PENDING) != 0) + apix_intr_deliver_timeouts++; +#endif + + if (!level || !APIX_IS_MASK_RDT(apix_mul_ioapic_method)) + goto done; + + /* + * wait for remote IRR to be cleared for level-triggered + * interrupts + */ + repeats = 0; + do { + repeats++; + + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { + if (GET_RDT_BITS(ioapic_ix, intin_no, AV_REMOTE_IRR) + == 0) + break; + } + + if (GET_RDT_BITS(ioapic_ix, intin_no, AV_REMOTE_IRR) != 0) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + delay(APIX_CHECK_IRR_DELAY); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + } + } while (repeats < apix_max_reps_irr_pending); + + if (repeats >= apix_max_reps_irr_pending) { +#ifdef DEBUG + apix_intr_rirr_timeouts++; +#endif + + /* + * If we waited and the Remote IRR bit is still not cleared, + * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS + * times for this interrupt, try the last-ditch workaround: + */ + if (GET_RDT_BITS(ioapic_ix, intin_no, AV_REMOTE_IRR) != 0) { + /* + * Trying to clear the bit through normal + * channels has failed. So as a last-ditch + * effort, try to set the trigger mode to + * edge, then to level. This has been + * observed to work on many systems. + */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & ~AV_LEVEL); + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) | AV_LEVEL); + } + + if (GET_RDT_BITS(ioapic_ix, intin_no, AV_REMOTE_IRR) != 0) { +#ifdef DEBUG + apix_intr_rirr_reset_failure++; +#endif + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + prom_printf("apix: Remote IRR still " + "not clear for IOAPIC %d intin %d.\n" + "\tInterrupts to this pin may cease " + "functioning.\n", ioapic_ix, intin_no); + return (1); /* return failure */ + } + } + +done: + /* change apic_irq_table */ + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + apix_intx_set_vector(irqno, cpuid, vector); + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* reprogramme IO-APIC RDT entry */ + apix_intx_enable(irqno); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + return (0); +} + +static int +apix_intx_get_pending(int irqno) +{ + apic_irq_t *irqp; + int intin, ioapicindex, pending; + ulong_t iflag; + + mutex_enter(&airq_mutex); + irqp = apic_irq_table[irqno]; + if (IS_IRQ_FREE(irqp)) { + mutex_exit(&airq_mutex); + return (0); + } + + /* check IO-APIC delivery status */ + intin = irqp->airq_intin_no; + ioapicindex = irqp->airq_ioapicindex; + mutex_exit(&airq_mutex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + pending = (READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin) & + AV_PENDING) ? 
1 : 0;
+
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+
+ return (pending);
+}
+
+static void
+apix_intx_set_mask(int irqno)
+{
+ int intin, ioapixindex, rdt_entry;
+ ulong_t iflag;
+ apic_irq_t *irqp;
+
+ mutex_enter(&airq_mutex);
+ irqp = apic_irq_table[irqno];
+
+ ASSERT(irqp->airq_mps_intr_index != FREE_INDEX);
+
+ intin = irqp->airq_intin_no;
+ ioapixindex = irqp->airq_ioapicindex;
+ mutex_exit(&airq_mutex);
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapixindex, intin);
+
+ /* set mask */
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapixindex, intin,
+ (AV_MASK | rdt_entry));
+
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+}
+
+static void
+apix_intx_clear_mask(int irqno)
+{
+ int intin, ioapixindex, rdt_entry;
+ ulong_t iflag;
+ apic_irq_t *irqp;
+
+ mutex_enter(&airq_mutex);
+ irqp = apic_irq_table[irqno];
+
+ ASSERT(irqp->airq_mps_intr_index != FREE_INDEX);
+
+ intin = irqp->airq_intin_no;
+ ioapixindex = irqp->airq_ioapicindex;
+ mutex_exit(&airq_mutex);
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapixindex, intin);
+
+ /* clear mask */
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapixindex, intin,
+ ((~AV_MASK) & rdt_entry));
+
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+}
+
+/*
+ * For a level-triggered interrupt, mask the IRQ line. Masking means
+ * new interrupts will not be delivered; an interrupt already
+ * accepted by a local APIC is not affected.
+ */
+void
+apix_level_intr_pre_eoi(int irq)
+{
+ apic_irq_t *irqp = apic_irq_table[irq];
+ int apic_ix, intin_ix;
+
+ if (irqp == NULL)
+ return;
+
+ ASSERT(apic_level_intr[irq] == TRIGGER_MODE_LEVEL);
+
+ lock_set(&apic_ioapic_lock);
+
+ intin_ix = irqp->airq_intin_no;
+ apic_ix = irqp->airq_ioapicindex;
+
+ if (irqp->airq_cpu != CPU->cpu_id) {
+ if (!APIX_IS_MASK_RDT(apix_mul_ioapic_method))
+ ioapic_write_eoi(apic_ix, irqp->airq_vector);
+ lock_clear(&apic_ioapic_lock);
+ return;
+ }
+
+ if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_IOXAPIC) {
+ /*
+ * This is an IOxAPIC with an EOI register:
+ * change the vector to a reserved, unused vector so that
+ * the EOI from the local APIC won't clear the Remote IRR
+ * for this level-triggered interrupt. Instead, we'll
+ * manually clear it in apix_post_hardint() after ISR
+ * handling.
+ */
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(apic_ix, intin_ix,
+ (irqp->airq_rdt_entry & (~0xff)) | APIX_RESV_VECTOR);
+ } else {
+ WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(apic_ix, intin_ix,
+ AV_MASK | irqp->airq_rdt_entry);
+ }
+
+ lock_clear(&apic_ioapic_lock);
+}
+
+/*
+ * For a level-triggered interrupt, unmask the IRQ line
+ * or restore the original vector number.
+ */
+void
+apix_level_intr_post_dispatch(int irq)
+{
+ apic_irq_t *irqp = apic_irq_table[irq];
+ int apic_ix, intin_ix;
+
+ if (irqp == NULL)
+ return;
+
+ lock_set(&apic_ioapic_lock);
+
+ intin_ix = irqp->airq_intin_no;
+ apic_ix = irqp->airq_ioapicindex;
+
+ if (APIX_IS_DIRECTED_EOI(apix_mul_ioapic_method)) {
+ /*
+ * Already sent EOI back to Local APIC. 
+ * Send EOI to IO-APIC + */ + ioapic_write_eoi(apic_ix, irqp->airq_vector); + } else { + /* clear the mask or restore the vector */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(apic_ix, intin_ix, + irqp->airq_rdt_entry); + + /* send EOI to IOxAPIC */ + if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_IOXAPIC) + ioapic_write_eoi(apic_ix, irqp->airq_vector); + } + + lock_clear(&apic_ioapic_lock); +} + +static int +apix_intx_get_shared(int irqno) +{ + apic_irq_t *irqp; + int share; + + mutex_enter(&airq_mutex); + irqp = apic_irq_table[irqno]; + if (IS_IRQ_FREE(irqp) || (irqp->airq_cpu == IRQ_UNINIT)) { + mutex_exit(&airq_mutex); + return (0); + } + share = irqp->airq_share; + mutex_exit(&airq_mutex); + + return (share); +} + +static void +apix_intx_set_shared(int irqno, int delta) +{ + apic_irq_t *irqp; + + mutex_enter(&airq_mutex); + irqp = apic_irq_table[irqno]; + if (IS_IRQ_FREE(irqp)) { + mutex_exit(&airq_mutex); + return; + } + irqp->airq_share += delta; + mutex_exit(&airq_mutex); +} + +/* + * Setup IRQ table. Return IRQ no or -1 on failure + */ +static int +apix_intx_setup(dev_info_t *dip, int inum, int irqno, + struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *iflagp) +{ + int origirq = ispec->intrspec_vec; + int newirq; + short intr_index; + uchar_t ipin, ioapic, ioapicindex; + apic_irq_t *irqp; + + UNREFERENCED_1PARAMETER(inum); + + if (intrp != NULL) { + intr_index = (short)(intrp - apic_io_intrp); + ioapic = intrp->intr_destid; + ipin = intrp->intr_destintin; + + /* Find ioapicindex. If destid was ALL, we will exit with 0. */ + for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--) + if (apic_io_id[ioapicindex] == ioapic) + break; + ASSERT((ioapic == apic_io_id[ioapicindex]) || + (ioapic == INTR_ALL_APIC)); + + /* check whether this intin# has been used by another irqno */ + if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) + return (newirq); + + } else if (iflagp != NULL) { /* ACPI */ + intr_index = ACPI_INDEX; + ioapicindex = acpi_find_ioapic(irqno); + ASSERT(ioapicindex != 0xFF); + ioapic = apic_io_id[ioapicindex]; + ipin = irqno - apic_io_vectbase[ioapicindex]; + + if (apic_irq_table[irqno] && + apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) { + ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin && + apic_irq_table[irqno]->airq_ioapicindex == + ioapicindex); + return (irqno); + } + + } else { /* default configuration */ + intr_index = DEFAULT_INDEX; + ioapicindex = 0; + ioapic = apic_io_id[ioapicindex]; + ipin = (uchar_t)irqno; + } + + /* allocate a new IRQ no */ + if ((irqp = apic_irq_table[irqno]) == NULL) { + irqp = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); + apic_irq_table[irqno] = irqp; + } else { + if (irqp->airq_mps_intr_index != FREE_INDEX) { + newirq = apic_allocate_irq(apic_first_avail_irq); + if (newirq == -1) { + return (-1); + } + irqno = newirq; + irqp = apic_irq_table[irqno]; + ASSERT(irqp != NULL); + } + } + apic_max_device_irq = max(irqno, apic_max_device_irq); + apic_min_device_irq = min(irqno, apic_min_device_irq); + + irqp->airq_mps_intr_index = intr_index; + irqp->airq_ioapicindex = ioapicindex; + irqp->airq_intin_no = ipin; + irqp->airq_dip = dip; + irqp->airq_origirq = (uchar_t)origirq; + if (iflagp != NULL) + irqp->airq_iflag = *iflagp; + irqp->airq_cpu = IRQ_UNINIT; + irqp->airq_vector = 0; + + return (irqno); +} + +/* + * Setup IRQ table for non-pci devices. 
Return IRQ no or -1 on error + */ +static int +apix_intx_setup_nonpci(dev_info_t *dip, int inum, int bustype, + struct intrspec *ispec) +{ + int irqno = ispec->intrspec_vec; + int newirq, i; + iflag_t intr_flag; + ACPI_SUBTABLE_HEADER *hp; + ACPI_MADT_INTERRUPT_OVERRIDE *isop; + struct apic_io_intr *intrp; + + if (!apic_enable_acpi || apic_use_acpi_madt_only) { + int busid; + + if (bustype == 0) + bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; + + /* loop checking BUS_ISA/BUS_EISA */ + for (i = 0; i < 2; i++) { + if (((busid = apic_find_bus_id(bustype)) != -1) && + ((intrp = apic_find_io_intr_w_busid(irqno, busid)) + != NULL)) { + return (apix_intx_setup(dip, inum, irqno, + intrp, ispec, NULL)); + } + bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA; + } + + /* fall back to default configuration */ + return (-1); + } + + /* search iso entries first */ + if (acpi_iso_cnt != 0) { + hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; + i = 0; + while (i < acpi_iso_cnt) { + if (hp->Type == ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { + isop = (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; + if (isop->Bus == 0 && + isop->SourceIrq == irqno) { + newirq = isop->GlobalIrq; + intr_flag.intr_po = isop->IntiFlags & + ACPI_MADT_POLARITY_MASK; + intr_flag.intr_el = (isop->IntiFlags & + ACPI_MADT_TRIGGER_MASK) >> 2; + intr_flag.bustype = BUS_ISA; + + return (apix_intx_setup(dip, inum, + newirq, NULL, ispec, &intr_flag)); + } + i++; + } + hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + + hp->Length); + } + } + intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; + intr_flag.intr_el = INTR_EL_EDGE; + intr_flag.bustype = BUS_ISA; + return (apix_intx_setup(dip, inum, irqno, NULL, ispec, &intr_flag)); +} + + +/* + * Setup IRQ table for pci devices. Return IRQ no or -1 on error + */ +static int +apix_intx_setup_pci(dev_info_t *dip, int inum, int bustype, + struct intrspec *ispec) +{ + int busid, devid, pci_irq; + ddi_acc_handle_t cfg_handle; + uchar_t ipin; + iflag_t intr_flag; + struct apic_io_intr *intrp; + + if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) + return (-1); + + if (busid == 0 && apic_pci_bus_total == 1) + busid = (int)apic_single_pci_busid; + + if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) + return (-1); + ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; + pci_config_teardown(&cfg_handle); + + if (apic_enable_acpi && !apic_use_acpi_madt_only) { /* ACPI */ + if (apic_acpi_translate_pci_irq(dip, busid, devid, + ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) + return (-1); + + intr_flag.bustype = (uchar_t)bustype; + return (apix_intx_setup(dip, inum, pci_irq, NULL, ispec, + &intr_flag)); + } + + /* MP configuration table */ + pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); + if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) == NULL) { + pci_irq = apic_handle_pci_pci_bridge(dip, devid, ipin, &intrp); + if (pci_irq == -1) + return (-1); + } + + return (apix_intx_setup(dip, inum, pci_irq, intrp, ispec, NULL)); +} + +/* + * Translate and return IRQ no + */ +static int +apix_intx_xlate_irq(dev_info_t *dip, int inum, struct intrspec *ispec) +{ + int newirq, irqno = ispec->intrspec_vec; + int parent_is_pci_or_pciex = 0, child_is_pciex = 0; + int bustype = 0, dev_len; + char dev_type[16]; + + if (apic_defconf) { + mutex_enter(&airq_mutex); + goto defconf; + } + + if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) { + mutex_enter(&airq_mutex); + goto nonpci; + } + + /* + * use ddi_getlongprop_buf() instead of ddi_prop_lookup_string() + * to avoid extra buffer allocation. 
+ */ + dev_len = sizeof (dev_type); + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), + DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, + &dev_len) == DDI_PROP_SUCCESS) { + if ((strcmp(dev_type, "pci") == 0) || + (strcmp(dev_type, "pciex") == 0)) + parent_is_pci_or_pciex = 1; + } + + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, + &dev_len) == DDI_PROP_SUCCESS) { + if (strstr(dev_type, "pciex")) + child_is_pciex = 1; + } + + mutex_enter(&airq_mutex); + + if (parent_is_pci_or_pciex) { + bustype = child_is_pciex ? BUS_PCIE : BUS_PCI; + newirq = apix_intx_setup_pci(dip, inum, bustype, ispec); + if (newirq != -1) + goto done; + bustype = 0; + } else if (strcmp(dev_type, "isa") == 0) + bustype = BUS_ISA; + else if (strcmp(dev_type, "eisa") == 0) + bustype = BUS_EISA; + +nonpci: + newirq = apix_intx_setup_nonpci(dip, inum, bustype, ispec); + if (newirq != -1) + goto done; + +defconf: + newirq = apix_intx_setup(dip, inum, irqno, NULL, ispec, NULL); + if (newirq == -1) { + mutex_exit(&airq_mutex); + return (-1); + } +done: + ASSERT(apic_irq_table[newirq]); + mutex_exit(&airq_mutex); + return (newirq); +} + +static int +apix_intx_alloc_vector(dev_info_t *dip, int inum, struct intrspec *ispec) +{ + int irqno; + apix_vector_t *vecp; + + if ((irqno = apix_intx_xlate_irq(dip, inum, ispec)) == -1) + return (0); + + if ((vecp = apix_alloc_intx(dip, inum, irqno)) == NULL) + return (0); + + DDI_INTR_IMPLDBG((CE_CONT, "apix_intx_alloc_vector: dip=0x%p name=%s " + "irqno=0x%x cpuid=%d vector=0x%x\n", + (void *)dip, ddi_driver_name(dip), irqno, + vecp->v_cpuid, vecp->v_vector)); + + return (1); +} + +/* + * Return the vector number if the translated IRQ for this device + * has a vector mapping setup. If no IRQ setup exists or no vector is + * allocated to it then return 0. + */ +static apix_vector_t * +apix_intx_xlate_vector(dev_info_t *dip, int inum, struct intrspec *ispec) +{ + int irqno; + apix_vector_t *vecp; + + /* get the IRQ number */ + if ((irqno = apix_intx_xlate_irq(dip, inum, ispec)) == -1) + return (NULL); + + /* get the vector number if a vector is allocated to this irqno */ + vecp = apix_intx_get_vector(irqno); + + return (vecp); +} + +/* stub function */ +int +apix_loaded(void) +{ + return (apix_is_enabled); +} diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c new file mode 100644 index 0000000000..e5d072b525 --- /dev/null +++ b/usr/src/uts/i86pc/io/apix/apix_intr.c @@ -0,0 +1,968 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */
+
+#include <sys/cpuvar.h>
+#include <sys/cpu_event.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/t_lock.h>
+#include <sys/kmem.h>
+#include <sys/machlock.h>
+#include <sys/systm.h>
+#include <sys/archsystm.h>
+#include <sys/atomic.h>
+#include <sys/sdt.h>
+#include <sys/processor.h>
+#include <sys/time.h>
+#include <sys/psm.h>
+#include <sys/smp_impldefs.h>
+#include <sys/cram.h>
+#include <sys/apic.h>
+#include <sys/pit.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/pci.h>
+#include <sys/promif.h>
+#include <sys/x86_archext.h>
+#include <sys/cpc_impl.h>
+#include <sys/uadmin.h>
+#include <sys/panic.h>
+#include <sys/debug.h>
+#include <sys/trap.h>
+#include <sys/machsystm.h>
+#include <sys/sysmacros.h>
+#include <sys/rm_platter.h>
+#include <sys/privregs.h>
+#include <sys/note.h>
+#include <sys/pci_intr_lib.h>
+#include <sys/spl.h>
+#include <sys/clock.h>
+#include <sys/dditypes.h>
+#include <sys/x_call.h>
+#include <sys/reboot.h>
+#include <vm/hat_i86.h>
+#include <sys/stack.h>
+#include <sys/apix.h>
+
+static void apix_post_hardint(int);
+
+/*
+ * Insert a vector into the tail of the interrupt pending list
+ */
+static __inline__ void
+apix_insert_pending_av(apix_impl_t *apixp, struct autovec *avp, int ipl)
+{
+ struct autovec **head = apixp->x_intr_head;
+ struct autovec **tail = apixp->x_intr_tail;
+
+ avp->av_ipl_link = NULL;
+ if (tail[ipl] == NULL) {
+ head[ipl] = tail[ipl] = avp;
+ return;
+ }
+
+ tail[ipl]->av_ipl_link = avp;
+ tail[ipl] = avp;
+}
+
+/*
+ * Remove and return a vector from the head of the hardware interrupt
+ * pending list.
+ */
+static __inline__ struct autovec *
+apix_remove_pending_av(apix_impl_t *apixp, int ipl)
+{
+ struct cpu *cpu = CPU;
+ struct autovec **head = apixp->x_intr_head;
+ struct autovec **tail = apixp->x_intr_tail;
+ struct autovec *avp = head[ipl];
+
+ if (avp == NULL)
+ return (NULL);
+
+ if (avp->av_vector != NULL && avp->av_prilevel < cpu->cpu_base_spl) {
+ /*
+ * This level is blocked by the CPU's base SPL; mark it
+ * pending again and return NULL to stop handling the
+ * current IPL level.
+ */
+ apixp->x_intr_pending |= (1 << avp->av_prilevel);
+ return (NULL);
+ }
+
+ avp->av_flags &= ~AV_PENTRY_PEND;
+ avp->av_flags |= AV_PENTRY_ONPROC;
+ head[ipl] = avp->av_ipl_link;
+ avp->av_ipl_link = NULL;
+
+ if (head[ipl] == NULL)
+ tail[ipl] = NULL;
+
+ return (avp);
+}
+
+/*
+ * add_pending_hardint:
+ *
+ * Add hardware interrupts to the interrupt pending list.
+ */
+static void
+apix_add_pending_hardint(int vector)
+{
+ uint32_t cpuid = psm_get_cpu_id();
+ apix_impl_t *apixp = apixs[cpuid];
+ apix_vector_t *vecp = apixp->x_vectbl[vector];
+ struct autovec *p, *prevp = NULL;
+ int ipl;
+
+ /*
+ * An MSI interrupt that does not support per-vector masking can
+ * be triggered on a stale vector, because a rebind operation
+ * cannot reprogram the MSI address and data atomically. Add the
+ * ISRs of such a suspicious interrupt to the pending list. 
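+ *
+ * (The MSI address and data live in separate config-space
+ * registers, so a device may observe a rebind half-applied.)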
+ */ + APIX_DO_FAKE_INTR(cpuid, vector); + if (vecp == NULL) + return; + + for (p = vecp->v_autovect; p != NULL; p = p->av_link) { + if (p->av_vector == NULL) + continue; /* skip freed entry */ + + ipl = p->av_prilevel; + prevp = p; + + /* set pending at specified priority level */ + apixp->x_intr_pending |= (1 << ipl); + + if (p->av_flags & AV_PENTRY_PEND) + continue; /* already in the pending list */ + p->av_flags |= AV_PENTRY_PEND; + + /* insert into pending list by it original IPL */ + apix_insert_pending_av(apixp, p, ipl); + } + + /* last one of the linked list */ + if (prevp && ((prevp->av_flags & AV_PENTRY_LEVEL) != 0)) + prevp->av_flags |= (vector & AV_PENTRY_VECTMASK); +} + +/* + * Walk pending hardware interrupts at given priority level, invoking + * each interrupt handler as we go. + */ +extern uint64_t intr_get_time(void); + +static void +apix_dispatch_pending_autovect(uint_t ipl) +{ + uint32_t cpuid = psm_get_cpu_id(); + apix_impl_t *apixp = apixs[cpuid]; + struct autovec *av; + + while ((av = apix_remove_pending_av(apixp, ipl)) != NULL) { + uint_t r; + uint_t (*intr)() = av->av_vector; + caddr_t arg1 = av->av_intarg1; + caddr_t arg2 = av->av_intarg2; + dev_info_t *dip = av->av_dip; + uchar_t vector = av->av_flags & AV_PENTRY_VECTMASK; + + if (intr == NULL) + continue; + + /* Don't enable interrupts during x-calls */ + if (ipl != XC_HI_PIL) + sti(); + + DTRACE_PROBE4(interrupt__start, dev_info_t *, dip, + void *, intr, caddr_t, arg1, caddr_t, arg2); + r = (*intr)(arg1, arg2); + DTRACE_PROBE4(interrupt__complete, dev_info_t *, dip, + void *, intr, caddr_t, arg1, uint_t, r); + + if (av->av_ticksp && av->av_prilevel <= LOCK_LEVEL) + atomic_add_64(av->av_ticksp, intr_get_time()); + + cli(); + + if (vector) { + if ((av->av_flags & AV_PENTRY_PEND) == 0) + av->av_flags &= ~AV_PENTRY_VECTMASK; + + apix_post_hardint(vector); + } + + /* mark it as idle */ + av->av_flags &= ~AV_PENTRY_ONPROC; + } +} + +static caddr_t +apix_do_softint_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, + caddr_t stackptr) +{ + kthread_t *t, *volatile it; + struct machcpu *mcpu = &cpu->cpu_m; + hrtime_t now; + + UNREFERENCED_1PARAMETER(oldpil); + ASSERT(pil > mcpu->mcpu_pri && pil > cpu->cpu_base_spl); + + atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending, ~(1 << pil)); + + mcpu->mcpu_pri = pil; + + now = tsc_read(); + + /* + * Get set to run interrupt thread. + * There should always be an interrupt thread since we + * allocate one for each level on the CPU. + */ + it = cpu->cpu_intr_thread; + ASSERT(it != NULL); + cpu->cpu_intr_thread = it->t_link; + + /* t_intr_start could be zero due to cpu_intr_swtch_enter. */ + t = cpu->cpu_thread; + if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) { + hrtime_t intrtime = now - t->t_intr_start; + mcpu->intrstat[pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + t->t_intr_start = 0; + } + + /* + * Note that the code in kcpc_overflow_intr -relies- on the + * ordering of events here - in particular that t->t_lwp of + * the interrupt thread is set to the pinned thread *before* + * curthread is changed. + */ + it->t_lwp = t->t_lwp; + it->t_state = TS_ONPROC; + + /* + * Push interrupted thread onto list from new thread. + * Set the new thread as the current one. + * Set interrupted thread's T_SP because if it is the idle thread, + * resume() may use that stack between threads. 
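+ *
+ * The caller then switches onto the returned stack (it->t_stk)
+ * via switch_sp_and_call().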
+ */ + + ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr); + t->t_sp = (uintptr_t)stackptr; + + it->t_intr = t; + cpu->cpu_thread = it; + + /* + * Set bit for this pil in CPU's interrupt active bitmask. + */ + ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0); + cpu->cpu_intr_actv |= (1 << pil); + + /* + * Initialize thread priority level from intr_pri + */ + it->t_pil = (uchar_t)pil; + it->t_pri = (pri_t)pil + intr_pri; + it->t_intr_start = now; + + return (it->t_stk); +} + +static void +apix_do_softint_epilog(struct cpu *cpu, uint_t oldpil) +{ + struct machcpu *mcpu = &cpu->cpu_m; + kthread_t *t, *it; + uint_t pil, basespl; + hrtime_t intrtime; + hrtime_t now = tsc_read(); + + it = cpu->cpu_thread; + pil = it->t_pil; + + cpu->cpu_stats.sys.intr[pil - 1]++; + + ASSERT(cpu->cpu_intr_actv & (1 << pil)); + cpu->cpu_intr_actv &= ~(1 << pil); + + intrtime = now - it->t_intr_start; + mcpu->intrstat[pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + + /* + * If there is still an interrupted thread underneath this one + * then the interrupt was never blocked and the return is + * fairly simple. Otherwise it isn't. + */ + if ((t = it->t_intr) == NULL) { + /* + * Put thread back on the interrupt thread list. + * This was an interrupt thread, so set CPU's base SPL. + */ + set_base_spl(); + /* mcpu->mcpu_pri = cpu->cpu_base_spl; */ + + it->t_state = TS_FREE; + it->t_link = cpu->cpu_intr_thread; + cpu->cpu_intr_thread = it; + (void) splhigh(); + sti(); + swtch(); + /*NOTREACHED*/ + panic("dosoftint_epilog: swtch returned"); + } + it->t_link = cpu->cpu_intr_thread; + cpu->cpu_intr_thread = it; + it->t_state = TS_FREE; + cpu->cpu_thread = t; + if (t->t_flag & T_INTR_THREAD) + t->t_intr_start = now; + basespl = cpu->cpu_base_spl; + pil = MAX(oldpil, basespl); + mcpu->mcpu_pri = pil; +} + +/* + * Dispatch a soft interrupt + */ +static void +apix_dispatch_softint(uint_t oldpil, uint_t arg2) +{ + struct cpu *cpu = CPU; + + UNREFERENCED_1PARAMETER(arg2); + + sti(); + av_dispatch_softvect((int)cpu->cpu_thread->t_pil); + cli(); + + /* + * Must run softint_epilog() on the interrupt thread stack, since + * there may not be a return from it if the interrupt thread blocked. + */ + apix_do_softint_epilog(cpu, oldpil); +} + +/* + * Deliver any softints the current interrupt priority allows. + * Called with interrupts disabled. 
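+ *
+ * The highest pending level is taken first: bsrw_insn() returns
+ * the index of the most-significant bit set in st_pending.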
+ */ +int +apix_do_softint(struct regs *regs) +{ + struct cpu *cpu = CPU; + int oldipl; + int newipl; + volatile uint16_t pending; + caddr_t newsp; + + while ((pending = cpu->cpu_softinfo.st_pending) != 0) { + newipl = bsrw_insn(pending); + oldipl = cpu->cpu_pri; + if (newipl <= oldipl || newipl <= cpu->cpu_base_spl) + return (-1); + + newsp = apix_do_softint_prolog(cpu, newipl, oldipl, + (caddr_t)regs); + ASSERT(newsp != NULL); + switch_sp_and_call(newsp, apix_dispatch_softint, oldipl, 0); + } + + return (0); +} + +static int +apix_hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, + struct regs *rp) +{ + struct machcpu *mcpu = &cpu->cpu_m; + hrtime_t intrtime; + hrtime_t now = tsc_read(); + apix_impl_t *apixp = apixs[cpu->cpu_id]; + uint_t mask; + + ASSERT(pil > mcpu->mcpu_pri && pil > cpu->cpu_base_spl); + + if (pil == CBE_HIGH_PIL) { /* 14 */ + cpu->cpu_profile_pil = oldpil; + if (USERMODE(rp->r_cs)) { + cpu->cpu_profile_pc = 0; + cpu->cpu_profile_upc = rp->r_pc; + cpu->cpu_cpcprofile_pc = 0; + cpu->cpu_cpcprofile_upc = rp->r_pc; + } else { + cpu->cpu_profile_pc = rp->r_pc; + cpu->cpu_profile_upc = 0; + cpu->cpu_cpcprofile_pc = rp->r_pc; + cpu->cpu_cpcprofile_upc = 0; + } + } + + mcpu->mcpu_pri = pil; + + mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK; + if (mask != 0) { + int nestpil; + + /* + * We have interrupted another high-level interrupt. + * Load starting timestamp, compute interval, update + * cumulative counter. + */ + nestpil = bsrw_insn((uint16_t)mask); + intrtime = now - + mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)]; + mcpu->intrstat[nestpil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + } else { + kthread_t *t = cpu->cpu_thread; + + /* + * See if we are interrupting a low-level interrupt thread. + * If so, account for its time slice only if its time stamp + * is non-zero. + */ + if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) { + intrtime = now - t->t_intr_start; + mcpu->intrstat[t->t_pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + t->t_intr_start = 0; + } + } + + /* store starting timestamp in CPu structure for this IPL */ + mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now; + + if (pil == 15) { + /* + * To support reentrant level 15 interrupts, we maintain a + * recursion count in the top half of cpu_intr_actv. Only + * when this count hits zero do we clear the PIL 15 bit from + * the lower half of cpu_intr_actv. + */ + uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1; + (*refcntp)++; + } + + cpu->cpu_intr_actv |= (1 << pil); + /* clear pending ipl level bit */ + apixp->x_intr_pending &= ~(1 << pil); + + return (mask); +} + +static int +apix_hilevel_intr_epilog(struct cpu *cpu, uint_t oldpil) +{ + struct machcpu *mcpu = &cpu->cpu_m; + uint_t mask, pil; + hrtime_t intrtime; + hrtime_t now = tsc_read(); + + pil = mcpu->mcpu_pri; + cpu->cpu_stats.sys.intr[pil - 1]++; + + ASSERT(cpu->cpu_intr_actv & (1 << pil)); + + if (pil == 15) { + /* + * To support reentrant level 15 interrupts, we maintain a + * recursion count in the top half of cpu_intr_actv. Only + * when this count hits zero do we clear the PIL 15 bit from + * the lower half of cpu_intr_actv. 
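+ *
+ * (On little-endian x86, (uint16_t *)&cpu->cpu_intr_actv + 1
+ * addresses the upper 16 bits of that word.)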
+ */ + uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1; + + ASSERT(*refcntp > 0); + + if (--(*refcntp) == 0) + cpu->cpu_intr_actv &= ~(1 << pil); + } else { + cpu->cpu_intr_actv &= ~(1 << pil); + } + + ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0); + + intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)]; + mcpu->intrstat[pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + + /* + * Check for lower-pil nested high-level interrupt beneath + * current one. If so, place a starting timestamp in its + * pil_high_start entry. + */ + mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK; + if (mask != 0) { + int nestpil; + + /* + * find PIL of nested interrupt + */ + nestpil = bsrw_insn((uint16_t)mask); + ASSERT(nestpil < pil); + mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now; + /* + * (Another high-level interrupt is active below this one, + * so there is no need to check for an interrupt + * thread. That will be done by the lowest priority + * high-level interrupt active.) + */ + } else { + /* + * Check to see if there is a low-level interrupt active. + * If so, place a starting timestamp in the thread + * structure. + */ + kthread_t *t = cpu->cpu_thread; + + if (t->t_flag & T_INTR_THREAD) + t->t_intr_start = now; + } + + mcpu->mcpu_pri = oldpil; + if (pil < CBE_HIGH_PIL) + (void) (*setlvlx)(oldpil, 0); + + return (mask); +} + +/* + * Dispatch a hilevel interrupt (one above LOCK_LEVEL) + */ +static void +apix_dispatch_pending_hilevel(uint_t ipl, uint_t arg2) +{ + UNREFERENCED_1PARAMETER(arg2); + + apix_dispatch_pending_autovect(ipl); +} + +static __inline__ int +apix_do_pending_hilevel(struct cpu *cpu, struct regs *rp) +{ + volatile uint16_t pending; + uint_t newipl, oldipl; + caddr_t newsp; + + while ((pending = HILEVEL_PENDING(cpu)) != 0) { + newipl = bsrw_insn(pending); + ASSERT(newipl > LOCK_LEVEL && newipl > cpu->cpu_base_spl); + oldipl = cpu->cpu_pri; + if (newipl <= oldipl) + return (-1); + + /* + * High priority interrupts run on this cpu's interrupt stack. + */ + if (apix_hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) { + newsp = cpu->cpu_intr_stack; + switch_sp_and_call(newsp, apix_dispatch_pending_hilevel, + newipl, 0); + } else { /* already on the interrupt stack */ + apix_dispatch_pending_hilevel(newipl, 0); + } + (void) apix_hilevel_intr_epilog(cpu, oldipl); + } + + return (0); +} + +/* + * Get an interrupt thread and swith to it. It's called from do_interrupt(). + * The IF flag is cleared and thus all maskable interrupts are blocked at + * the time of calling. + */ +static caddr_t +apix_intr_thread_prolog(struct cpu *cpu, uint_t pil, caddr_t stackptr) +{ + apix_impl_t *apixp = apixs[cpu->cpu_id]; + struct machcpu *mcpu = &cpu->cpu_m; + hrtime_t now = tsc_read(); + kthread_t *t, *volatile it; + + ASSERT(pil > mcpu->mcpu_pri && pil > cpu->cpu_base_spl); + + apixp->x_intr_pending &= ~(1 << pil); + ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0); + cpu->cpu_intr_actv |= (1 << pil); + mcpu->mcpu_pri = pil; + + /* + * Get set to run interrupt thread. + * There should always be an interrupt thread since we + * allocate one for each level on the CPU. + */ + /* t_intr_start could be zero due to cpu_intr_swtch_enter. */ + t = cpu->cpu_thread; + if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) { + hrtime_t intrtime = now - t->t_intr_start; + mcpu->intrstat[pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + t->t_intr_start = 0; + } + + /* + * Push interrupted thread onto list from new thread. 
+ * Set the new thread as the current one. + * Set interrupted thread's T_SP because if it is the idle thread, + * resume() may use that stack between threads. + */ + + ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr); + + t->t_sp = (uintptr_t)stackptr; /* mark stack in curthread for resume */ + + /* + * Note that the code in kcpc_overflow_intr -relies- on the + * ordering of events here - in particular that t->t_lwp of + * the interrupt thread is set to the pinned thread *before* + * curthread is changed. + */ + it = cpu->cpu_intr_thread; + cpu->cpu_intr_thread = it->t_link; + it->t_intr = t; + it->t_lwp = t->t_lwp; + + /* + * (threads on the interrupt thread free list could have state + * preset to TS_ONPROC, but it helps in debugging if + * they're TS_FREE.) + */ + it->t_state = TS_ONPROC; + + cpu->cpu_thread = it; + + /* + * Initialize thread priority level from intr_pri + */ + it->t_pil = (uchar_t)pil; + it->t_pri = (pri_t)pil + intr_pri; + it->t_intr_start = now; + + return (it->t_stk); +} + +static void +apix_intr_thread_epilog(struct cpu *cpu, uint_t oldpil) +{ + struct machcpu *mcpu = &cpu->cpu_m; + kthread_t *t, *it = cpu->cpu_thread; + uint_t pil, basespl; + hrtime_t intrtime; + hrtime_t now = tsc_read(); + + pil = it->t_pil; + cpu->cpu_stats.sys.intr[pil - 1]++; + + ASSERT(cpu->cpu_intr_actv & (1 << pil)); + cpu->cpu_intr_actv &= ~(1 << pil); + + ASSERT(it->t_intr_start != 0); + intrtime = now - it->t_intr_start; + mcpu->intrstat[pil][0] += intrtime; + cpu->cpu_intracct[cpu->cpu_mstate] += intrtime; + + /* + * If there is still an interrupted thread underneath this one + * then the interrupt was never blocked and the return is + * fairly simple. Otherwise it isn't. + */ + if ((t = it->t_intr) == NULL) { + /* + * The interrupted thread is no longer pinned underneath + * the interrupt thread. This means the interrupt must + * have blocked, and the interrupted thread has been + * unpinned, and has probably been running around the + * system for a while. + * + * Since there is no longer a thread under this one, put + * this interrupt thread back on the CPU's free list and + * resume the idle thread which will dispatch the next + * thread to run. + */ + cpu->cpu_stats.sys.intrblk++; + + /* + * Put thread back on the interrupt thread list. + * This was an interrupt thread, so set CPU's base SPL. + */ + set_base_spl(); + basespl = cpu->cpu_base_spl; + mcpu->mcpu_pri = basespl; + (*setlvlx)(basespl, 0); + + it->t_state = TS_FREE; + /* + * Return interrupt thread to pool + */ + it->t_link = cpu->cpu_intr_thread; + cpu->cpu_intr_thread = it; + + (void) splhigh(); + sti(); + swtch(); + /*NOTREACHED*/ + panic("dosoftint_epilog: swtch returned"); + } + + /* + * Return interrupt thread to the pool + */ + it->t_link = cpu->cpu_intr_thread; + cpu->cpu_intr_thread = it; + it->t_state = TS_FREE; + + cpu->cpu_thread = t; + if (t->t_flag & T_INTR_THREAD) + t->t_intr_start = now; + basespl = cpu->cpu_base_spl; + mcpu->mcpu_pri = MAX(oldpil, basespl); + (*setlvlx)(mcpu->mcpu_pri, 0); +} + + +static void +apix_dispatch_pending_hardint(uint_t oldpil, uint_t arg2) +{ + struct cpu *cpu = CPU; + + UNREFERENCED_1PARAMETER(arg2); + + apix_dispatch_pending_autovect((int)cpu->cpu_thread->t_pil); + + /* + * Must run intr_thread_epilog() on the interrupt thread stack, since + * there may not be a return from it if the interrupt thread blocked. 
+ */ + apix_intr_thread_epilog(cpu, oldpil); +} + +static __inline__ int +apix_do_pending_hardint(struct cpu *cpu, struct regs *rp) +{ + volatile uint16_t pending; + uint_t newipl, oldipl; + caddr_t newsp; + + while ((pending = LOWLEVEL_PENDING(cpu)) != 0) { + newipl = bsrw_insn(pending); + ASSERT(newipl <= LOCK_LEVEL); + oldipl = cpu->cpu_pri; + if (newipl <= oldipl || newipl <= cpu->cpu_base_spl) + return (-1); + + /* + * Run this interrupt in a separate thread. + */ + newsp = apix_intr_thread_prolog(cpu, newipl, (caddr_t)rp); + ASSERT(newsp != NULL); + switch_sp_and_call(newsp, apix_dispatch_pending_hardint, + oldipl, 0); + } + + return (0); +} + +/* + * Unmask level triggered interrupts + */ +static void +apix_post_hardint(int vector) +{ + apix_vector_t *vecp = xv_vector(psm_get_cpu_id(), vector); + int irqno = vecp->v_inum; + + ASSERT(vecp->v_type == APIX_TYPE_FIXED && apic_level_intr[irqno]); + + apix_level_intr_post_dispatch(irqno); +} + +static void +apix_dispatch_by_vector(uint_t vector) +{ + struct cpu *cpu = CPU; + apix_vector_t *vecp = xv_vector(cpu->cpu_id, vector); + struct autovec *avp; + uint_t r, (*intr)(); + caddr_t arg1, arg2; + dev_info_t *dip; + + if (vecp == NULL || + (avp = vecp->v_autovect) == NULL || avp->av_vector == NULL) + return; + + avp->av_flags |= AV_PENTRY_ONPROC; + intr = avp->av_vector; + arg1 = avp->av_intarg1; + arg2 = avp->av_intarg2; + dip = avp->av_dip; + + if (avp->av_prilevel != XC_HI_PIL) + sti(); + + DTRACE_PROBE4(interrupt__start, dev_info_t *, dip, + void *, intr, caddr_t, arg1, caddr_t, arg2); + r = (*intr)(arg1, arg2); + DTRACE_PROBE4(interrupt__complete, dev_info_t *, dip, + void *, intr, caddr_t, arg1, uint_t, r); + + cli(); + avp->av_flags &= ~AV_PENTRY_ONPROC; +} + + +static void +apix_dispatch_hilevel(uint_t vector, uint_t arg2) +{ + UNREFERENCED_1PARAMETER(arg2); + + apix_dispatch_by_vector(vector); +} + +static void +apix_dispatch_lowlevel(uint_t vector, uint_t oldipl) +{ + struct cpu *cpu = CPU; + + apix_dispatch_by_vector(vector); + + /* + * Must run intr_thread_epilog() on the interrupt thread stack, since + * there may not be a return from it if the interrupt thread blocked. + */ + apix_intr_thread_epilog(cpu, oldipl); +} + +void +apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp) +{ + struct cpu *cpu = CPU; + int vector = rp->r_trapno, newipl, oldipl = cpu->cpu_pri, ret; + apix_vector_t *vecp = NULL; + +#ifdef TRAPTRACE + ttp->ttr_marker = TT_INTERRUPT; + ttp->ttr_cpuid = cpu->cpu_id; + ttp->ttr_ipl = 0xff; + ttp->ttr_pri = (uchar_t)oldipl; + ttp->ttr_spl = cpu->cpu_base_spl; + ttp->ttr_vector = 0xff; +#endif /* TRAPTRACE */ + + cpu_idle_exit(CPU_IDLE_CB_FLAG_INTR); + + ++*(uint16_t *)&cpu->cpu_m.mcpu_istamp; + + /* + * If it's a softint go do it now. + */ + if (rp->r_trapno == T_SOFTINT) { + /* + * It might be the case that when an interrupt is triggered, + * the spl is raised to high by splhigh(). Later when do_splx() + * is called to restore the spl, both hardware and software + * interrupt pending flags are check and an SOFTINT is faked + * accordingly. + */ + (void) apix_do_pending_hilevel(cpu, rp); + (void) apix_do_pending_hardint(cpu, rp); + (void) apix_do_softint(rp); + ASSERT(!interrupts_enabled()); +#ifdef TRAPTRACE + ttp->ttr_vector = T_SOFTINT; +#endif + return; + } + + /* + * Raise the interrupt priority. 
Send EOI to local APIC + */ + newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno); +#ifdef TRAPTRACE + ttp->ttr_ipl = (uchar_t)newipl; +#endif /* TRAPTRACE */ + + /* + * Bail if it is a spurious interrupt + */ + if (newipl == -1) + return; + + vector = rp->r_trapno; + vecp = xv_vector(cpu->cpu_id, vector); +#ifdef TRAPTRACE + ttp->ttr_vector = (short)vector; +#endif /* TRAPTRACE */ + + /* + * Direct dispatch for IPI, MSI, MSI-X + */ + if (vecp && vecp->v_type != APIX_TYPE_FIXED && + newipl > MAX(oldipl, cpu->cpu_base_spl)) { + caddr_t newsp; + + if (newipl > LOCK_LEVEL) { + if (apix_hilevel_intr_prolog(cpu, newipl, oldipl, rp) + == 0) { + newsp = cpu->cpu_intr_stack; + switch_sp_and_call(newsp, apix_dispatch_hilevel, + vector, 0); + } else { + apix_dispatch_hilevel(vector, 0); + } + (void) apix_hilevel_intr_epilog(cpu, oldipl); + } else { + newsp = apix_intr_thread_prolog(cpu, newipl, + (caddr_t)rp); + switch_sp_and_call(newsp, apix_dispatch_lowlevel, + vector, oldipl); + } + } else { + /* Add to per-pil pending queue */ + apix_add_pending_hardint(vector); + if (newipl <= MAX(oldipl, cpu->cpu_base_spl) || + !apixs[cpu->cpu_id]->x_intr_pending) + return; + } + + if (apix_do_pending_hilevel(cpu, rp) < 0) + return; + + do { + ret = apix_do_pending_hardint(cpu, rp); + + /* + * Deliver any pending soft interrupts. + */ + (void) apix_do_softint(rp); + } while (!ret && LOWLEVEL_PENDING(cpu)); +} diff --git a/usr/src/uts/i86pc/io/apix/apix_irm.c b/usr/src/uts/i86pc/io/apix/apix_irm.c new file mode 100644 index 0000000000..fb7295c420 --- /dev/null +++ b/usr/src/uts/i86pc/io/apix/apix_irm.c @@ -0,0 +1,597 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/ddi.h> +#include <sys/sunndi.h> +#include <sys/ddi_impldefs.h> +#include <sys/psm_types.h> +#include <sys/smp_impldefs.h> +#include <sys/apic.h> +#include <sys/processor.h> +#include <sys/apix_irm_impl.h> + +/* global variable for static default limit for non-IRM drivers */ +extern int ddi_msix_alloc_limit; + +/* Extern declarations */ +extern int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, + psm_intr_op_t, int *); + +/* + * Global variables for IRM pool configuration: + * + * (1) apix_system_max_vectors -- this would limit the maximum + * number of interrupt vectors that will be made avilable + * to the device drivers. The default value (-1) indicates + * that all the available vectors could be used. + * + * (2) apix_irm_cpu_factor -- This would specify the number of CPUs that + * should be excluded from the global IRM pool of interrupt vectors. 
+ * By default this would be zero, so vectors from all the CPUs + * present will be factored into the IRM pool. + * + * (3) apix_irm_reserve_fixed_vectors -- This would specify the number + * of vectors that should be reserved for FIXED type interrupts and + * exclude them from the IRM pool. The value can be one of the + * following: + * 0 - no reservation (default) + * <n> - a positive number for the reserved cache + * -1 - reserve the maximum needed + * + * (4) apix_irm_free_fixed_vectors -- This flag specifies if the + * vectors for FIXED type should be freed and added back + * to the IRM pool when ddi_intr_free() is called. The default + * is to add it back to the pool. + */ +int apix_system_max_vectors = -1; +int apix_irm_cpu_factor = 0; +int apix_irm_reserve_fixed_vectors = 0; +int apix_irm_free_fixed_vector = 1; + +/* info from APIX module for IRM configuration */ +apix_irm_info_t apix_irminfo; + +kmutex_t apix_irm_lock; /* global mutex for apix_irm_* data */ +ddi_irm_params_t apix_irm_params; /* IRM pool info */ +int apix_irm_cache_size = 0; /* local cache for FIXED type requests */ +int apix_irm_cpu_factor_available = 0; +int apix_irm_max_cpus = 0; +int apix_irm_cpus_used = 0; +int apix_irm_fixed_intr_vectors_used; + +extern int ncpus; + +/* local data/functions */ +static int apix_irm_chk_apix(); +int apix_irm_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *handle, + psm_intr_op_t op, int *result); +int apix_irm_disable_intr(processorid_t); +void apix_irm_enable_intr(processorid_t); +int (*psm_intr_ops_saved)(dev_info_t *dip, ddi_intr_handle_impl_t *handle, + psm_intr_op_t op, int *result) = NULL; +int (*psm_disable_intr_saved)(processorid_t) = NULL; +void (*psm_enable_intr_saved)(processorid_t) = NULL; +int apix_irm_alloc_fixed(dev_info_t *, ddi_intr_handle_impl_t *, int *); +int apix_irm_free_fixed(dev_info_t *, ddi_intr_handle_impl_t *, int *); + +/* + * Initilaize IRM pool for APIC interrupts if the PSM module + * is of APIX type. This should be called only after PSM module + * is loaded and APIC interrupt system is initialized. + */ +void +apix_irm_init(void) +{ + dev_info_t *dip; + int total_avail_vectors; + int cpus_used; + int cache_size; + + /* nothing to do if IRM is disabled */ + if (!irm_enable) + return; + + /* + * Use root devinfo node to associate the IRM pool with it + * as the pool is global to the system. + */ + dip = ddi_root_node(); + + /* + * Check if PSM module is initialized and it is APIX + * module (which supports IRM functionality). + */ + if ((psm_intr_ops == NULL) || !apix_irm_chk_apix()) { + /* not an APIX module */ + APIX_IRM_DEBUG((CE_CONT, + "apix_irm_init: APIX module not present")); + return; + } + + /* + * Now, determine the IRM pool parameters based on the + * info from APIX module and global config variables. + */ + + /* + * apix_ncpus shows all the CPUs present in the + * system but not all of them may have been enabled + * (i.e. mp_startup() may not have been called yet). + * So, use ncpus for IRM pool creation. 
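+ *
+ * As a purely hypothetical example: with cpus_used of 4,
+ * apix_per_cpu_vectors of 200, and 16 vectors already consumed
+ * by FIXED interrupts at this point, the initial pool size
+ * computed below would be
+ *
+ *	4 * 200 - 16 = 784 vectors
+ *
+ * before apix_system_max_vectors and any reserved FIXED-vector
+ * cache are applied.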
+ */ + if (apix_irminfo.apix_ncpus > ncpus) + apix_irminfo.apix_ncpus = ncpus; + + /* apply the CPU factor if possible */ + if ((apix_irm_cpu_factor > 0) && + (apix_irminfo.apix_ncpus > apix_irm_cpu_factor)) { + cpus_used = apix_irminfo.apix_ncpus - apix_irm_cpu_factor; + apix_irm_cpu_factor_available = apix_irm_cpu_factor; + } else { + cpus_used = apix_irminfo.apix_ncpus; + } + apix_irm_cpus_used = apix_irm_max_cpus = cpus_used; + + APIX_IRM_DEBUG((CE_CONT, + "apix_irm_init: %d CPUs used for IRM pool size", cpus_used)); + + total_avail_vectors = cpus_used * apix_irminfo.apix_per_cpu_vectors - + apix_irminfo.apix_vectors_allocated; + + apix_irm_fixed_intr_vectors_used = apix_irminfo.apix_vectors_allocated; + + if (total_avail_vectors <= 0) { + /* can not determine pool size */ + APIX_IRM_DEBUG((CE_NOTE, + "apix_irm_init: can not determine pool size")); + return; + } + + /* adjust the pool size as per the global config variable */ + if ((apix_system_max_vectors > 0) && + (apix_system_max_vectors < total_avail_vectors)) + total_avail_vectors = apix_system_max_vectors; + + /* pre-reserve vectors (i.e. local cache) for FIXED type if needed */ + if (apix_irm_reserve_fixed_vectors != 0) { + cache_size = apix_irm_reserve_fixed_vectors; + if ((cache_size == -1) || + (cache_size > apix_irminfo.apix_ioapic_max_vectors)) + cache_size = apix_irminfo.apix_ioapic_max_vectors; + total_avail_vectors -= cache_size; + apix_irm_cache_size = cache_size; + } + + if (total_avail_vectors <= 0) { + APIX_IRM_DEBUG((CE_NOTE, + "apix_irm_init: invalid config parameters!")); + return; + } + + /* IRM pool is used only for MSI/X interrupts */ + apix_irm_params.iparams_types = DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX; + apix_irm_params.iparams_total = total_avail_vectors; + + if (ndi_irm_create(dip, &apix_irm_params, + &apix_irm_pool_p) == NDI_SUCCESS) { + /* + * re-direct psm_intr_ops to intercept FIXED + * interrupt allocation requests. + */ + psm_intr_ops_saved = psm_intr_ops; + psm_intr_ops = apix_irm_intr_ops; + /* + * re-direct psm_enable_intr()/psm_disable_intr() to + * intercept CPU offline/online requests. + */ + psm_disable_intr_saved = psm_disable_intr; + psm_enable_intr_saved = psm_enable_intr; + psm_enable_intr = apix_irm_enable_intr; + psm_disable_intr = apix_irm_disable_intr; + + mutex_init(&apix_irm_lock, NULL, MUTEX_DRIVER, NULL); + + /* + * Set default alloc limit for non-IRM drivers + * to DDI_MIN_MSIX_ALLOC (currently defined as 8). + * + * NOTE: This is done here so that the limit of 8 vectors + * is applicable only with APIX module. For the old pcplusmp + * implementation, the current default of 2 (i.e + * DDI_DEFAULT_MSIX_ALLOC) is retained. + */ + if (ddi_msix_alloc_limit < DDI_MIN_MSIX_ALLOC) + ddi_msix_alloc_limit = DDI_MIN_MSIX_ALLOC; + } else { + APIX_IRM_DEBUG((CE_NOTE, + "apix_irm_init: ndi_irm_create() failed")); + apix_irm_pool_p = NULL; + } +} + +/* + * Check if the PSM module is "APIX" type which supports IRM feature. + * Returns 0 if it is not an APIX module. 
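+ *
+ * The check below issues PSM_INTR_OP_APIC_TYPE through psm_intr_ops
+ * and compares the reported type string with APIC_APIX_NAME,
+ * roughly:
+ *
+ *	info_hdl.ih_private = &type_info;
+ *	(*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL);
+ *	return (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0);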
+ */
+static int
+apix_irm_chk_apix(void)
+{
+ ddi_intr_handle_impl_t info_hdl;
+ apic_get_type_t type_info;
+
+ if (!psm_intr_ops)
+ return (0);
+
+ bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t));
+ info_hdl.ih_private = &type_info;
+ if (((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE,
+ NULL)) != PSM_SUCCESS) {
+ /* unknown type; assume not an APIX module */
+ return (0);
+ }
+ if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * This function intercepts PSM_INTR_OP_* requests to deal with
+ * IRM pool maintenance for FIXED type interrupts. The following
+ * commands are intercepted and the rest are simply passed back to
+ * the original psm_intr_ops function:
+ * PSM_INTR_OP_ALLOC_VECTORS
+ * PSM_INTR_OP_FREE_VECTORS
+ * Return value is either PSM_SUCCESS or PSM_FAILURE.
+ */
+int
+apix_irm_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *handle,
+ psm_intr_op_t op, int *result)
+{
+ switch (op) {
+ case PSM_INTR_OP_ALLOC_VECTORS:
+ if (handle->ih_type == DDI_INTR_TYPE_FIXED)
+ return (apix_irm_alloc_fixed(dip, handle, result));
+ else
+ break;
+ case PSM_INTR_OP_FREE_VECTORS:
+ if (handle->ih_type == DDI_INTR_TYPE_FIXED)
+ return (apix_irm_free_fixed(dip, handle, result));
+ else
+ break;
+ default:
+ break;
+ }
+
+ /* pass the request to APIX */
+ return ((*psm_intr_ops_saved)(dip, handle, op, result));
+}
+
+/*
+ * Allocate a FIXED type interrupt. The procedure for this
+ * operation is as follows:
+ *
+ * 1) Check if this IRQ is shared (i.e. the IRQ is already mapped
+ * and a vector has already been allocated). If so, no new
+ * vector is needed; simply pass the request to APIX and return.
+ * 2) Check the local cache pool for an available vector. If the
+ * cache is not empty, take a vector from there, pass the
+ * request to APIX and return.
+ * 3) Otherwise, get a vector from the IRM pool by reducing the
+ * pool size by 1. If that succeeds, pass the request to the
+ * APIX module. Otherwise return PSM_FAILURE.
+ */
+int
+apix_irm_alloc_fixed(dev_info_t *dip, ddi_intr_handle_impl_t *handle,
+ int *result)
+{
+ int vector;
+ uint_t new_pool_size;
+ int ret;
+
+ /*
+ * Check if this IRQ has been mapped (i.e. shared IRQ case)
+ * by doing PSM_INTR_OP_XLATE_VECTOR.
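+ * A successful translation means a vector already backs this
+ * IRQ (case 1 above), so the pool and cache accounting below
+ * is bypassed and the request is passed straight to APIX.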
+ */ + ret = (*psm_intr_ops_saved)(dip, handle, PSM_INTR_OP_XLATE_VECTOR, + &vector); + if (ret == PSM_SUCCESS) { + APIX_IRM_DEBUG((CE_CONT, + "apix_irm_alloc_fixed: dip %p (%s) xlated vector 0x%x", + (void *)dip, ddi_driver_name(dip), vector)); + /* (1) mapping already exists; pass the request to PSM */ + return ((*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_ALLOC_VECTORS, result)); + } + + /* check the local cache for an available vector */ + mutex_enter(&apix_irm_lock); + if (apix_irm_cache_size) { /* cache is not empty */ + --apix_irm_cache_size; + apix_irm_fixed_intr_vectors_used++; + mutex_exit(&apix_irm_lock); + /* (2) use the vector from the local cache */ + return ((*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_ALLOC_VECTORS, result)); + } + + /* (3) get a vector from the IRM pool */ + + new_pool_size = apix_irm_params.iparams_total - 1; + + APIX_IRM_DEBUG((CE_CONT, "apix_irm_alloc_fixed: dip %p (%s) resize pool" + " from %x to %x\n", (void *)dip, ddi_driver_name(dip), + apix_irm_pool_p->ipool_totsz, new_pool_size)); + + if (ndi_irm_resize_pool(apix_irm_pool_p, new_pool_size) == + NDI_SUCCESS) { + /* update the pool size info */ + apix_irm_params.iparams_total = new_pool_size; + apix_irm_fixed_intr_vectors_used++; + mutex_exit(&apix_irm_lock); + return ((*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_ALLOC_VECTORS, result)); + } + + mutex_exit(&apix_irm_lock); + + return (PSM_FAILURE); +} + +/* + * Free up the FIXED type interrupt. + * + * 1) If it is a shared vector then simply pass the request to + * APIX and return. + * 2) Otherwise, if apix_irm_free_fixed_vector is not set then add the + * vector back to the IRM pool. Otherwise, keep it in the local cache. + */ +int +apix_irm_free_fixed(dev_info_t *dip, ddi_intr_handle_impl_t *handle, + int *result) +{ + int shared; + int ret; + uint_t new_pool_size; + + /* check if it is a shared vector */ + ret = (*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_GET_SHARED, &shared); + + if ((ret == PSM_SUCCESS) && (shared > 0)) { + /* (1) it is a shared vector; simply pass the request */ + APIX_IRM_DEBUG((CE_CONT, "apix_irm_free_fixed: dip %p (%s) " + "shared %d\n", (void *)dip, ddi_driver_name(dip), shared)); + return ((*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_FREE_VECTORS, result)); + } + + ret = (*psm_intr_ops_saved)(dip, handle, + PSM_INTR_OP_FREE_VECTORS, result); + + if (ret == PSM_SUCCESS) { + mutex_enter(&apix_irm_lock); + if (apix_irm_free_fixed_vector) { + /* (2) add the vector back to IRM pool */ + new_pool_size = apix_irm_params.iparams_total + 1; + APIX_IRM_DEBUG((CE_CONT, "apix_irm_free_fixed: " + "dip %p (%s) resize pool from %x to %x\n", + (void *)dip, ddi_driver_name(dip), + apix_irm_pool_p->ipool_totsz, new_pool_size)); + if (ndi_irm_resize_pool(apix_irm_pool_p, + new_pool_size) == NDI_SUCCESS) { + /* update the pool size info */ + apix_irm_params.iparams_total = new_pool_size; + } else { + cmn_err(CE_NOTE, + "apix_irm_free_fixed: failed to add" + " a vector to IRM pool"); + } + } else { + /* keep the vector in the local cache */ + apix_irm_cache_size += 1; + } + apix_irm_fixed_intr_vectors_used--; + mutex_exit(&apix_irm_lock); + } + + return (ret); +} + +/* + * Disable the CPU for interrupts. It is assumed that this is called to + * offline/disable the CPU so that no interrupts are allocated on + * that CPU. For IRM perspective, the interrupt vectors on this + * CPU are to be excluded for any allocations. 
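+ * (A CPU that is only being suspended for suspend/resume is
+ * special-cased below and keeps its share of the pool.)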
+ * + * If APIX module is successful in migrating all the vectors + * from this CPU then reduce the IRM pool size to exclude the + * interrupt vectors for that CPU. + */ +int +apix_irm_disable_intr(processorid_t id) +{ + uint_t new_pool_size; + + /* Interrupt disabling for Suspend/Resume */ + if (apic_cpus[id].aci_status & APIC_CPU_SUSPEND) + return ((*psm_disable_intr_saved)(id)); + + mutex_enter(&apix_irm_lock); + /* + * Don't remove the CPU from the IRM pool if we have CPU factor + * available. + */ + if ((apix_irm_cpu_factor > 0) && (apix_irm_cpu_factor_available > 0)) { + apix_irm_cpu_factor_available--; + } else { + /* can't disable if there is only one CPU used */ + if (apix_irm_cpus_used == 1) { + mutex_exit(&apix_irm_lock); + return (PSM_FAILURE); + } + /* Calculate the new size for the IRM pool */ + new_pool_size = apix_irm_params.iparams_total - + apix_irminfo.apix_per_cpu_vectors; + + /* Apply the max. limit */ + if (apix_system_max_vectors > 0) { + uint_t max; + + max = apix_system_max_vectors - + apix_irm_fixed_intr_vectors_used - + apix_irm_cache_size; + + new_pool_size = MIN(new_pool_size, max); + } + + if (new_pool_size == 0) { + cmn_err(CE_WARN, "Invalid pool size 0 with " + "apix_system_max_vectors = %d", + apix_system_max_vectors); + mutex_exit(&apix_irm_lock); + return (PSM_FAILURE); + } + + if (new_pool_size != apix_irm_params.iparams_total) { + /* remove the CPU from the IRM pool */ + if (ndi_irm_resize_pool(apix_irm_pool_p, + new_pool_size) != NDI_SUCCESS) { + mutex_exit(&apix_irm_lock); + APIX_IRM_DEBUG((CE_NOTE, + "apix_irm_disable_intr: failed to resize" + " the IRM pool")); + return (PSM_FAILURE); + } + /* update the pool size info */ + apix_irm_params.iparams_total = new_pool_size; + } + + /* decrement the CPU count used by IRM pool */ + apix_irm_cpus_used--; + } + + /* + * Now, disable the CPU for interrupts. + */ + if ((*psm_disable_intr_saved)(id) != PSM_SUCCESS) { + APIX_IRM_DEBUG((CE_NOTE, + "apix_irm_disable_intr: failed to disable CPU interrupts" + " for CPU#%d", id)); + mutex_exit(&apix_irm_lock); + return (PSM_FAILURE); + } + /* decrement the CPU count enabled for interrupts */ + apix_irm_max_cpus--; + mutex_exit(&apix_irm_lock); + return (PSM_SUCCESS); +} + +/* + * Enable the CPU for interrupts. It is assumed that this function is + * called to enable/online the CPU so that interrupts could be assigned + * to it. If successful, add available vectors for that CPU to the IRM + * pool if apix_irm_cpu_factor is already satisfied. + */ +void +apix_irm_enable_intr(processorid_t id) +{ + uint_t new_pool_size; + + /* Interrupt enabling for Suspend/Resume */ + if (apic_cpus[id].aci_status & APIC_CPU_SUSPEND) { + (*psm_enable_intr_saved)(id); + return; + } + + mutex_enter(&apix_irm_lock); + + /* enable the CPU for interrupts */ + (*psm_enable_intr_saved)(id); + + /* increment the number of CPUs enabled for interrupts */ + apix_irm_max_cpus++; + + ASSERT(apix_irminfo.apix_per_cpu_vectors > 0); + + /* + * Check if the apix_irm_cpu_factor is satisfied before. + * If satisfied, add the CPU to IRM pool. + */ + if ((apix_irm_cpu_factor > 0) && + (apix_irm_cpu_factor_available < apix_irm_cpu_factor)) { + /* + * Don't add the CPU to the IRM pool. Just update + * the available CPU factor. + */ + apix_irm_cpu_factor_available++; + mutex_exit(&apix_irm_lock); + return; + } + + /* + * Add the CPU to the IRM pool. 
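+ * Under apix_irm_lock, the add below effectively performs:
+ *
+ *	new_pool_size = apix_irm_params.iparams_total +
+ *	    apix_irminfo.apix_per_cpu_vectors;
+ *	if (apix_system_max_vectors > 0)
+ *		new_pool_size = MIN(new_pool_size,
+ *		    apix_system_max_vectors -
+ *		    apix_irm_fixed_intr_vectors_used -
+ *		    apix_irm_cache_size);
+ *	(void) ndi_irm_resize_pool(apix_irm_pool_p, new_pool_size);
+ *
+ * with the resize skipped when the pool would not grow.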
+ */ + + /* increment the CPU count used by IRM */ + apix_irm_cpus_used++; + + /* Calculate the new pool size */ + new_pool_size = apix_irm_params.iparams_total + + apix_irminfo.apix_per_cpu_vectors; + + /* Apply the max. limit */ + if (apix_system_max_vectors > 0) { + uint_t max; + + max = apix_system_max_vectors - + apix_irm_fixed_intr_vectors_used - + apix_irm_cache_size; + + new_pool_size = MIN(new_pool_size, max); + } + if (new_pool_size == apix_irm_params.iparams_total) { + /* no change to pool size */ + mutex_exit(&apix_irm_lock); + return; + } + if (new_pool_size < apix_irm_params.iparams_total) { + cmn_err(CE_WARN, "new_pool_size %d is inconsistent " + "with irm_params.iparams_total %d", + new_pool_size, apix_irm_params.iparams_total); + mutex_exit(&apix_irm_lock); + return; + } + + (void) ndi_irm_resize_pool(apix_irm_pool_p, new_pool_size); + + /* update the pool size info */ + apix_irm_params.iparams_total = new_pool_size; + + mutex_exit(&apix_irm_lock); +} diff --git a/usr/src/uts/i86pc/io/apix/apix_utils.c b/usr/src/uts/i86pc/io/apix/apix_utils.c new file mode 100644 index 0000000000..2924c43714 --- /dev/null +++ b/usr/src/uts/i86pc/io/apix/apix_utils.c @@ -0,0 +1,1902 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#include <sys/processor.h> +#include <sys/time.h> +#include <sys/psm.h> +#include <sys/smp_impldefs.h> +#include <sys/cram.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/psm_common.h> +#include <sys/pit.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/pci.h> +#include <sys/promif.h> +#include <sys/x86_archext.h> +#include <sys/cpc_impl.h> +#include <sys/uadmin.h> +#include <sys/panic.h> +#include <sys/debug.h> +#include <sys/archsystm.h> +#include <sys/trap.h> +#include <sys/machsystm.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/rm_platter.h> +#include <sys/privregs.h> +#include <sys/note.h> +#include <sys/pci_intr_lib.h> +#include <sys/spl.h> +#include <sys/clock.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/x_call.h> +#include <sys/reboot.h> +#include <sys/apix.h> + +static int apix_get_avail_vector_oncpu(uint32_t, int, int); +static apix_vector_t *apix_init_vector(processorid_t, uchar_t); +static void apix_cleanup_vector(apix_vector_t *); +static void apix_insert_av(apix_vector_t *, void *, avfunc, caddr_t, caddr_t, + uint64_t *, int, dev_info_t *); +static void apix_remove_av(apix_vector_t *, struct autovec *); +static void apix_clear_dev_map(dev_info_t *, int, int); +static boolean_t apix_is_cpu_enabled(processorid_t); +static void apix_wait_till_seen(processorid_t, int); + +#define GET_INTR_INUM(ihdlp) \ + (((ihdlp) != NULL) ? ((ddi_intr_handle_impl_t *)(ihdlp))->ih_inum : 0) + +apix_rebind_info_t apix_rebindinfo = {0, 0, 0, NULL, 0, NULL}; + +/* + * Allocate IPI + * + * Return vector number or 0 on error + */ +uchar_t +apix_alloc_ipi(int ipl) +{ + apix_vector_t *vecp; + uchar_t vector; + int cpun; + int nproc; + + APIX_ENTER_CPU_LOCK(0); + + vector = apix_get_avail_vector_oncpu(0, APIX_IPI_MIN, APIX_IPI_MAX); + if (vector == 0) { + APIX_LEAVE_CPU_LOCK(0); + cmn_err(CE_WARN, "apix: no available IPI\n"); + apic_error |= APIC_ERR_GET_IPIVECT_FAIL; + return (0); + } + + nproc = max(apic_nproc, apic_max_nproc); + for (cpun = 0; cpun < nproc; cpun++) { + vecp = xv_vector(cpun, vector); + if (vecp == NULL) { + vecp = kmem_zalloc(sizeof (apix_vector_t), KM_NOSLEEP); + if (vecp == NULL) { + cmn_err(CE_WARN, "apix: No memory for ipi"); + goto fail; + } + xv_vector(cpun, vector) = vecp; + } + vecp->v_state = APIX_STATE_ALLOCED; + vecp->v_type = APIX_TYPE_IPI; + vecp->v_cpuid = vecp->v_bound_cpuid = cpun; + vecp->v_vector = vector; + vecp->v_pri = ipl; + } + APIX_LEAVE_CPU_LOCK(0); + return (vector); + +fail: + while (--cpun >= 0) + apix_cleanup_vector(xv_vector(cpun, vector)); + APIX_LEAVE_CPU_LOCK(0); + return (0); +} + +/* + * Add IPI service routine + */ +static int +apix_add_ipi(int ipl, avfunc xxintr, char *name, int vector, + caddr_t arg1, caddr_t arg2) +{ + int cpun; + apix_vector_t *vecp; + int nproc; + + ASSERT(vector >= APIX_IPI_MIN && vector <= APIX_IPI_MAX); + + nproc = max(apic_nproc, apic_max_nproc); + for (cpun = 0; cpun < nproc; cpun++) { + APIX_ENTER_CPU_LOCK(cpun); + vecp = xv_vector(cpun, vector); + apix_insert_av(vecp, NULL, xxintr, arg1, arg2, NULL, ipl, NULL); + vecp->v_state = APIX_STATE_ENABLED; + APIX_LEAVE_CPU_LOCK(cpun); + } + + APIC_VERBOSE(IPI, (CE_CONT, "apix: add ipi for %s, vector %x " + "ipl %x\n", name, vector, ipl)); + + return (1); +} + +/* + * Find and return first free vector in range (start, end) + */ +static int +apix_get_avail_vector_oncpu(uint32_t cpuid, int start, int end) +{ + int i; + apix_impl_t *apixp = apixs[cpuid]; + + 
for (i = start; i <= end; i++) { + if (APIC_CHECK_RESERVE_VECTORS(i)) + continue; + if (IS_VECT_FREE(apixp->x_vectbl[i])) + return (i); + } + + return (0); +} + +/* + * Allocate a vector on specified cpu + * + * Return NULL on error + */ +static apix_vector_t * +apix_alloc_vector_oncpu(uint32_t cpuid, dev_info_t *dip, int inum, int type) +{ + processorid_t tocpu = cpuid & ~IRQ_USER_BOUND; + apix_vector_t *vecp; + int vector; + + ASSERT(APIX_CPU_LOCK_HELD(tocpu)); + + /* find free vector */ + vector = apix_get_avail_vector_oncpu(tocpu, APIX_AVINTR_MIN, + APIX_AVINTR_MAX); + if (vector == 0) + return (NULL); + + vecp = apix_init_vector(tocpu, vector); + vecp->v_type = (ushort_t)type; + vecp->v_inum = inum; + vecp->v_flags = (cpuid & IRQ_USER_BOUND) ? APIX_VECT_USER_BOUND : 0; + + if (dip != NULL) + apix_set_dev_map(vecp, dip, inum); + + return (vecp); +} + +/* + * Allocates "count" contiguous MSI vectors starting at the proper alignment. + * Caller needs to make sure that count has to be power of 2 and should not + * be < 1. + * + * Return first vector number + */ +apix_vector_t * +apix_alloc_nvectors_oncpu(uint32_t cpuid, dev_info_t *dip, int inum, + int count, int type) +{ + int i, msibits, start = 0, navail = 0; + apix_vector_t *vecp, *startp = NULL; + processorid_t tocpu = cpuid & ~IRQ_USER_BOUND; + uint_t flags; + + ASSERT(APIX_CPU_LOCK_HELD(tocpu)); + + /* + * msibits is the no. of lower order message data bits for the + * allocated MSI vectors and is used to calculate the aligned + * starting vector + */ + msibits = count - 1; + + /* It has to be contiguous */ + for (i = APIX_AVINTR_MIN; i <= APIX_AVINTR_MAX; i++) { + if (!IS_VECT_FREE(xv_vector(tocpu, i))) + continue; + + /* + * starting vector has to be aligned accordingly for + * multiple MSIs + */ + if (msibits) + i = (i + msibits) & ~msibits; + + for (navail = 0, start = i; i <= APIX_AVINTR_MAX; i++) { + if (!IS_VECT_FREE(xv_vector(tocpu, i))) + break; + if (APIC_CHECK_RESERVE_VECTORS(i)) + break; + if (++navail == count) + goto done; + } + } + + return (NULL); + +done: + flags = (cpuid & IRQ_USER_BOUND) ? 
APIX_VECT_USER_BOUND : 0; + + for (i = 0; i < count; i++) { + if ((vecp = apix_init_vector(tocpu, start + i)) == NULL) + goto fail; + + vecp->v_type = (ushort_t)type; + vecp->v_inum = inum + i; + vecp->v_flags = flags; + + if (dip != NULL) + apix_set_dev_map(vecp, dip, inum + i); + + if (i == 0) + startp = vecp; + } + + return (startp); + +fail: + while (i-- > 0) { /* Free allocated vectors */ + vecp = xv_vector(tocpu, start + i); + apix_clear_dev_map(dip, inum + i, type); + apix_cleanup_vector(vecp); + } + return (NULL); +} + +#define APIX_WRITE_MSI_DATA(_hdl, _cap, _ctrl, _v)\ +do {\ + if ((_ctrl) & PCI_MSI_64BIT_MASK)\ + pci_config_put16((_hdl), (_cap) + PCI_MSI_64BIT_DATA, (_v));\ + else\ + pci_config_put16((_hdl), (_cap) + PCI_MSI_32BIT_DATA, (_v));\ +_NOTE(CONSTCOND)} while (0) + +static void +apix_pci_msi_enable_vector(apix_vector_t *vecp, dev_info_t *dip, int type, + int inum, int count, uchar_t vector, int target_apic_id) +{ + uint64_t msi_addr, msi_data; + ushort_t msi_ctrl; + int i, cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip); + msi_regs_t msi_regs; + void *intrmap_tbl[PCI_MSI_MAX_INTRS]; + + DDI_INTR_IMPLDBG((CE_CONT, "apix_pci_msi_enable_vector: dip=0x%p\n" + "\tdriver = %s, inum=0x%x vector=0x%x apicid=0x%x\n", (void *)dip, + ddi_driver_name(dip), inum, vector, target_apic_id)); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + msi_regs.mr_data = vector; + msi_regs.mr_addr = target_apic_id; + + intrmap_tbl[0] = vecp->v_intrmap_private; + apic_vt_ops->apic_intrmap_alloc_entry(intrmap_tbl, dip, type, + count, 0xff); + for (i = 0; i < count; i++) + xv_intrmap_private(vecp->v_cpuid, vector + i) = intrmap_tbl[i]; + + apic_vt_ops->apic_intrmap_map_entry(vecp->v_intrmap_private, + (void *)&msi_regs, type, count); + apic_vt_ops->apic_intrmap_record_msi(vecp->v_intrmap_private, + &msi_regs); + + /* MSI Address */ + msi_addr = msi_regs.mr_addr; + + /* MSI Data: MSI is edge triggered according to spec */ + msi_data = msi_regs.mr_data; + + DDI_INTR_IMPLDBG((CE_CONT, "apix_pci_msi_enable_vector: addr=0x%lx " + "data=0x%lx\n", (long)msi_addr, (long)msi_data)); + + if (type == APIX_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + + /* Set the bits to inform how many MSIs are enabled */ + msi_ctrl |= ((highbit(count) - 1) << PCI_MSI_MME_SHIFT); + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + + if ((vecp->v_flags & APIX_VECT_MASKABLE) == 0) + APIX_WRITE_MSI_DATA(handle, cap_ptr, msi_ctrl, + APIX_RESV_VECTOR); + + pci_config_put32(handle, + cap_ptr + PCI_MSI_ADDR_OFFSET, msi_addr); + if (msi_ctrl & PCI_MSI_64BIT_MASK) + pci_config_put32(handle, + cap_ptr + PCI_MSI_ADDR_OFFSET + 4, msi_addr >> 32); + + APIX_WRITE_MSI_DATA(handle, cap_ptr, msi_ctrl, msi_data); + } else if (type == APIX_TYPE_MSIX) { + uintptr_t off; + ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip); + + /* Offset into the "inum"th entry in the MSI-X table */ + off = (uintptr_t)msix_p->msix_tbl_addr + + (inum * PCI_MSIX_VECTOR_SIZE); + + ddi_put32(msix_p->msix_tbl_hdl, + (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), msi_data); + ddi_put64(msix_p->msix_tbl_hdl, + (uint64_t *)(off + PCI_MSIX_LOWER_ADDR_OFFSET), msi_addr); + } +} + +static void +apix_pci_msi_enable_mode(dev_info_t *dip, int type, int inum) +{ + ushort_t msi_ctrl; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + if (type == APIX_TYPE_MSI) { + msi_ctrl = 
pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + if ((msi_ctrl & PCI_MSI_ENABLE_BIT)) + return; + + msi_ctrl |= PCI_MSI_ENABLE_BIT; + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + + } else if (type == DDI_INTR_TYPE_MSIX) { + uintptr_t off; + uint32_t mask; + ddi_intr_msix_t *msix_p; + + msix_p = i_ddi_get_msix(dip); + + /* Offset into "inum"th entry in the MSI-X table & clear mask */ + off = (uintptr_t)msix_p->msix_tbl_addr + (inum * + PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; + + mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); + + ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask & ~1)); + + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); + + if (!(msi_ctrl & PCI_MSIX_ENABLE_BIT)) { + msi_ctrl |= PCI_MSIX_ENABLE_BIT; + pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL, + msi_ctrl); + } + } +} + +/* + * Setup interrupt, pogramming IO-APIC or MSI/X address/data. + */ +void +apix_enable_vector(apix_vector_t *vecp) +{ + int tocpu = vecp->v_cpuid, type = vecp->v_type; + apic_cpus_info_t *cpu_infop; + ulong_t iflag; + + ASSERT(tocpu < apic_nproc); + + cpu_infop = &apic_cpus[tocpu]; + if (vecp->v_flags & APIX_VECT_USER_BOUND) + cpu_infop->aci_bound++; + else + cpu_infop->aci_temp_bound++; + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { /* fixed */ + apix_intx_enable(vecp->v_inum); + } else { + int inum = vecp->v_inum; + dev_info_t *dip = APIX_GET_DIP(vecp); + int count = i_ddi_intr_get_current_nintrs(dip); + + if (type == APIX_TYPE_MSI) { /* MSI */ + if (inum == apix_get_max_dev_inum(dip, type)) { + /* last one */ + uchar_t start_inum = inum + 1 - count; + uchar_t start_vect = vecp->v_vector + 1 - count; + apix_vector_t *start_vecp = + xv_vector(vecp->v_cpuid, start_vect); + + APIC_VERBOSE(INTR, (CE_CONT, "apix: call " + "apix_pci_msi_enable_vector\n")); + apix_pci_msi_enable_vector(start_vecp, dip, + type, start_inum, count, start_vect, + cpu_infop->aci_local_id); + + APIC_VERBOSE(INTR, (CE_CONT, "apix: call " + "apix_pci_msi_enable_mode\n")); + apix_pci_msi_enable_mode(dip, type, inum); + } + } else { /* MSI-X */ + apix_pci_msi_enable_vector(vecp, dip, + type, inum, 1, vecp->v_vector, + cpu_infop->aci_local_id); + apix_pci_msi_enable_mode(dip, type, inum); + } + } + vecp->v_state = APIX_STATE_ENABLED; + apic_redist_cpu_skip &= ~(1 << tocpu); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); +} + +/* + * Disable the interrupt + */ +void +apix_disable_vector(apix_vector_t *vecp) +{ + struct autovec *avp = vecp->v_autovect; + ulong_t iflag; + + ASSERT(avp != NULL); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + switch (vecp->v_type) { + case APIX_TYPE_MSI: + ASSERT(avp->av_vector != NULL && avp->av_dip != NULL); + /* + * Disable the MSI vector + * Make sure we only disable on the last + * of the multi-MSI support + */ + if (i_ddi_intr_get_current_nenables(avp->av_dip) == 1) { + apic_pci_msi_disable_mode(avp->av_dip, + DDI_INTR_TYPE_MSI); + } + break; + case APIX_TYPE_MSIX: + ASSERT(avp->av_vector != NULL && avp->av_dip != NULL); + /* + * Disable the MSI-X vector + * needs to clear its mask and addr/data for each MSI-X + */ + apic_pci_msi_unconfigure(avp->av_dip, DDI_INTR_TYPE_MSIX, + vecp->v_inum); + /* + * Make sure we only disable on the last MSI-X + */ + if (i_ddi_intr_get_current_nenables(avp->av_dip) == 1) { + apic_pci_msi_disable_mode(avp->av_dip, + DDI_INTR_TYPE_MSIX); + } + break; + default: + apix_intx_disable(vecp->v_inum); + break; + } + + if 
(!(apic_cpus[vecp->v_cpuid].aci_status & APIC_CPU_SUSPEND)) + vecp->v_state = APIX_STATE_DISABLED; + apic_vt_ops->apic_intrmap_free_entry(&vecp->v_intrmap_private); + vecp->v_intrmap_private = NULL; + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); +} + +/* + * Mark vector as obsoleted or freed. The vector is marked + * obsoleted if there are pending requests on it. Otherwise, + * free the vector. The obsoleted vectors get freed after + * being serviced. + * + * Return 1 on being obosoleted and 0 on being freed. + */ +#define INTR_BUSY(_avp)\ + ((((volatile ushort_t)(_avp)->av_flags) &\ + (AV_PENTRY_PEND | AV_PENTRY_ONPROC)) != 0) +#define LOCAL_WITH_INTR_DISABLED(_cpuid)\ + ((_cpuid) == psm_get_cpu_id() && !interrupts_enabled()) +static uint64_t dummy_tick; + +int +apix_obsolete_vector(apix_vector_t *vecp) +{ + struct autovec *avp = vecp->v_autovect; + int repeats, tries, ipl, busy = 0, cpuid = vecp->v_cpuid; + apix_impl_t *apixp = apixs[cpuid]; + + ASSERT(APIX_CPU_LOCK_HELD(cpuid)); + + for (avp = vecp->v_autovect; avp != NULL; avp = avp->av_link) { + if (avp->av_vector == NULL) + continue; + + if (LOCAL_WITH_INTR_DISABLED(cpuid)) { + int bit, index, irr; + + if (INTR_BUSY(avp)) { + busy++; + continue; + } + + /* check IRR for pending interrupts */ + index = vecp->v_vector / 32; + bit = vecp->v_vector % 32; + irr = apic_reg_ops->apic_read(APIC_IRR_REG + index); + if ((irr & (1 << bit)) != 0) + busy++; + + if (!busy) + apix_remove_av(vecp, avp); + + continue; + } + + repeats = 0; + do { + repeats++; + for (tries = 0; tries < apic_max_reps_clear_pending; + tries++) + if (!INTR_BUSY(avp)) + break; + } while (INTR_BUSY(avp) && + (repeats < apic_max_reps_clear_pending)); + + if (INTR_BUSY(avp)) + busy++; + else { + /* + * Interrupt is not in pending list or being serviced. + * However it might be cached in Local APIC's IRR + * register. It's impossible to check another CPU's + * IRR register. Then wait till lower levels finish + * running. + */ + for (ipl = 1; ipl < MIN(LOCK_LEVEL, vecp->v_pri); ipl++) + apix_wait_till_seen(cpuid, ipl); + if (INTR_BUSY(avp)) + busy++; + } + + if (!busy) + apix_remove_av(vecp, avp); + } + + if (busy) { + apix_vector_t *tp = apixp->x_obsoletes; + + if (vecp->v_state == APIX_STATE_OBSOLETED) + return (1); + + vecp->v_state = APIX_STATE_OBSOLETED; + vecp->v_next = NULL; + if (tp == NULL) + apixp->x_obsoletes = vecp; + else { + while (tp->v_next != NULL) + tp = tp->v_next; + tp->v_next = vecp; + } + return (1); + } + + /* interrupt is not busy */ + if (vecp->v_state == APIX_STATE_OBSOLETED) { + /* remove from obsoleted list */ + apixp->x_obsoletes = vecp->v_next; + vecp->v_next = NULL; + } + apix_cleanup_vector(vecp); + return (0); +} + +/* + * Duplicate number of continuous vectors to specified target vectors. 
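+ * For each of the "count" vectors, this copies the state, type,
+ * CPU binding, flags and intrmap data to the target vector,
+ * re-inserts every active autovec entry on the target's chain,
+ * and re-establishes the device-to-vector mappings for FIXED
+ * and MSI/X interrupts.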
+ */ +static void +apix_dup_vectors(apix_vector_t *oldp, apix_vector_t *newp, int count) +{ + struct autovec *avp; + apix_vector_t *fromp, *top; + processorid_t oldcpu = oldp->v_cpuid, newcpu = newp->v_cpuid; + uchar_t oldvec = oldp->v_vector, newvec = newp->v_vector; + int i, inum; + + ASSERT(oldp->v_type != APIX_TYPE_IPI); + + for (i = 0; i < count; i++) { + fromp = xv_vector(oldcpu, oldvec + i); + top = xv_vector(newcpu, newvec + i); + ASSERT(fromp != NULL && top != NULL); + + /* copy over original one */ + top->v_state = fromp->v_state; + top->v_type = fromp->v_type; + top->v_bound_cpuid = fromp->v_bound_cpuid; + top->v_inum = fromp->v_inum; + top->v_flags = fromp->v_flags; + top->v_intrmap_private = fromp->v_intrmap_private; + + for (avp = fromp->v_autovect; avp != NULL; avp = avp->av_link) { + if (avp->av_vector == NULL) + continue; + + apix_insert_av(top, avp->av_intr_id, avp->av_vector, + avp->av_intarg1, avp->av_intarg2, avp->av_ticksp, + avp->av_prilevel, avp->av_dip); + + if (fromp->v_type == APIX_TYPE_FIXED && + avp->av_dip != NULL) { + inum = GET_INTR_INUM(avp->av_intr_id); + apix_set_dev_map(top, avp->av_dip, inum); + } + } + + if (DDI_INTR_IS_MSI_OR_MSIX(fromp->v_type) && + fromp->v_devp != NULL) + apix_set_dev_map(top, fromp->v_devp->dv_dip, + fromp->v_devp->dv_inum); + } +} + +static apix_vector_t * +apix_init_vector(processorid_t cpuid, uchar_t vector) +{ + apix_impl_t *apixp = apixs[cpuid]; + apix_vector_t *vecp = apixp->x_vectbl[vector]; + + ASSERT(IS_VECT_FREE(vecp)); + + if (vecp == NULL) { + vecp = kmem_zalloc(sizeof (apix_vector_t), KM_NOSLEEP); + if (vecp == NULL) { + cmn_err(CE_WARN, "apix: no memory to allocate vector"); + return (NULL); + } + apixp->x_vectbl[vector] = vecp; + } + vecp->v_state = APIX_STATE_ALLOCED; + vecp->v_cpuid = vecp->v_bound_cpuid = cpuid; + vecp->v_vector = vector; + + return (vecp); +} + +static void +apix_cleanup_vector(apix_vector_t *vecp) +{ + ASSERT(vecp->v_share == 0); + vecp->v_bound_cpuid = IRQ_UNINIT; + vecp->v_state = APIX_STATE_FREED; + vecp->v_type = 0; + vecp->v_flags = 0; + vecp->v_busy = 0; +} + +static void +apix_dprint_vector(apix_vector_t *vecp, dev_info_t *dip, int count) +{ +#ifdef DEBUG + major_t major; + char *name, *drv_name; + int instance, len, t_len; + char mesg[1024] = "apix: "; + + t_len = sizeof (mesg); + len = strlen(mesg); + if (dip != NULL) { + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + drv_name = ddi_major_to_name(major); + instance = ddi_get_instance(dip); + (void) snprintf(mesg + len, t_len - len, "%s (%s) instance %d ", + name, drv_name, instance); + } + len = strlen(mesg); + + switch (vecp->v_type) { + case APIX_TYPE_FIXED: + (void) snprintf(mesg + len, t_len - len, "irqno %d", + vecp->v_inum); + break; + case APIX_TYPE_MSI: + (void) snprintf(mesg + len, t_len - len, + "msi inum %d (count %d)", vecp->v_inum, count); + break; + case APIX_TYPE_MSIX: + (void) snprintf(mesg + len, t_len - len, "msi-x inum %d", + vecp->v_inum); + break; + default: + break; + + } + + APIC_VERBOSE(ALLOC, (CE_CONT, "%s allocated with vector 0x%x on " + "cpu %d\n", mesg, vecp->v_vector, vecp->v_cpuid)); +#endif /* DEBUG */ +} + +/* + * Operations on avintr + */ + +#define INIT_AUTOVEC(p, intr_id, f, arg1, arg2, ticksp, ipl, dip) \ +do { \ + (p)->av_intr_id = intr_id; \ + (p)->av_vector = f; \ + (p)->av_intarg1 = arg1; \ + (p)->av_intarg2 = arg2; \ + (p)->av_ticksp = ticksp; \ + (p)->av_prilevel = ipl; \ + (p)->av_dip = dip; \ + (p)->av_flags = 0; \ +_NOTE(CONSTCOND)} while (0) + +/* + * Insert an interrupt 
service routine into chain by its priority from + * high to low + */ +static void +apix_insert_av(apix_vector_t *vecp, void *intr_id, avfunc f, caddr_t arg1, + caddr_t arg2, uint64_t *ticksp, int ipl, dev_info_t *dip) +{ + struct autovec *p, *prep, *mem; + + APIC_VERBOSE(INTR, (CE_CONT, "apix_insert_av: dip %p, vector 0x%x, " + "cpu %d\n", (void *)dip, vecp->v_vector, vecp->v_cpuid)); + + mem = kmem_zalloc(sizeof (struct autovec), KM_SLEEP); + INIT_AUTOVEC(mem, intr_id, f, arg1, arg2, ticksp, ipl, dip); + if (vecp->v_type == APIX_TYPE_FIXED && apic_level_intr[vecp->v_inum]) + mem->av_flags |= AV_PENTRY_LEVEL; + + vecp->v_share++; + vecp->v_pri = (ipl > vecp->v_pri) ? ipl : vecp->v_pri; + if (vecp->v_autovect == NULL) { /* Nothing on list - put it at head */ + vecp->v_autovect = mem; + return; + } + + if (DDI_INTR_IS_MSI_OR_MSIX(vecp->v_type)) { /* MSI/X */ + ASSERT(vecp->v_share == 1); /* No sharing for MSI/X */ + + INIT_AUTOVEC(vecp->v_autovect, intr_id, f, arg1, arg2, ticksp, + ipl, dip); + prep = vecp->v_autovect->av_link; + vecp->v_autovect->av_link = NULL; + + /* Free the following autovect chain */ + while (prep != NULL) { + ASSERT(prep->av_vector == NULL); + + p = prep; + prep = prep->av_link; + kmem_free(p, sizeof (struct autovec)); + } + + kmem_free(mem, sizeof (struct autovec)); + return; + } + + /* find where it goes in list */ + prep = NULL; + for (p = vecp->v_autovect; p != NULL; p = p->av_link) { + if (p->av_vector && p->av_prilevel <= ipl) + break; + prep = p; + } + if (prep != NULL) { + if (prep->av_vector == NULL) { /* freed struct available */ + INIT_AUTOVEC(prep, intr_id, f, arg1, arg2, + ticksp, ipl, dip); + prep->av_flags = mem->av_flags; + kmem_free(mem, sizeof (struct autovec)); + return; + } + + mem->av_link = prep->av_link; + prep->av_link = mem; + } else { + /* insert new intpt at beginning of chain */ + mem->av_link = vecp->v_autovect; + vecp->v_autovect = mem; + } +} + +/* + * After having made a change to an autovector list, wait until we have + * seen specified cpu not executing an interrupt at that level--so we + * know our change has taken effect completely (no old state in registers, + * etc). + */ +#define APIX_CPU_ENABLED(_cp) \ + (quiesce_active == 0 && \ + (((_cp)->cpu_flags & (CPU_QUIESCED|CPU_OFFLINE)) == 0)) + +static void +apix_wait_till_seen(processorid_t cpuid, int ipl) +{ + struct cpu *cp = cpu[cpuid]; + + if (cp == NULL || LOCAL_WITH_INTR_DISABLED(cpuid)) + return; + + /* + * Don't wait if the CPU is quiesced or offlined. This can happen + * when a CPU is running pause thread but hardware triggered an + * interrupt and the interrupt gets queued. + */ + for (;;) { + if (!INTR_ACTIVE((volatile struct cpu *)cpu[cpuid], ipl) && + (!APIX_CPU_ENABLED(cp) || + !INTR_PENDING((volatile apix_impl_t *)apixs[cpuid], ipl))) + return; + } +} + +static void +apix_remove_av(apix_vector_t *vecp, struct autovec *target) +{ + int hi_pri = 0; + struct autovec *p; + + if (target == NULL) + return; + + APIC_VERBOSE(INTR, (CE_CONT, "apix_remove_av: dip %p, vector 0x%x, " + "cpu %d\n", (void *)target->av_dip, vecp->v_vector, vecp->v_cpuid)); + + for (p = vecp->v_autovect; p; p = p->av_link) { + if (p == target || p->av_vector == NULL) + continue; + hi_pri = (p->av_prilevel > hi_pri) ? p->av_prilevel : hi_pri; + } + + vecp->v_share--; + vecp->v_pri = hi_pri; + + /* + * This drops the handler from the chain, it can no longer be called. + * However, there is no guarantee that the handler is not currently + * still executing. 
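+ * The apix_wait_till_seen() call at the end of this function
+ * closes that window: it spins until the owning CPU is observed
+ * not running at the handler's priority level (or is quiesced
+ * or offline).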
+ */
+ target->av_vector = NULL;
+ /*
+ * There is a race where we could be just about to pick up the ticksp
+ * pointer to increment it after returning from the service routine
+ * in av_dispatch_autovect. Rather than NULL it out, let's just point
+ * it off to something safe so that any final tick update attempt
+ * won't fault.
+ */
+ target->av_ticksp = &dummy_tick;
+ apix_wait_till_seen(vecp->v_cpuid, target->av_prilevel);
+}
+
+static struct autovec *
+apix_find_av(apix_vector_t *vecp, void *intr_id, avfunc f)
+{
+ struct autovec *p;
+
+ for (p = vecp->v_autovect; p; p = p->av_link) {
+ if ((p->av_vector == f) && (p->av_intr_id == intr_id)) {
+ /* found the handler */
+ return (p);
+ }
+ }
+
+ return (NULL);
+}
+
+static apix_vector_t *
+apix_find_vector_by_avintr(void *intr_id, avfunc f)
+{
+ apix_vector_t *vecp;
+ processorid_t n;
+ uchar_t v;
+
+ for (n = 0; n < apic_nproc; n++) {
+ if (!apix_is_cpu_enabled(n))
+ continue;
+
+ /* scan the whole device vector range, not a single vector */
+ for (v = APIX_AVINTR_MIN; v <= APIX_AVINTR_MAX; v++) {
+ vecp = xv_vector(n, v);
+ if (vecp == NULL ||
+ vecp->v_state <= APIX_STATE_OBSOLETED)
+ continue;
+
+ if (apix_find_av(vecp, intr_id, f) != NULL)
+ return (vecp);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Add interrupt service routine.
+ *
+ * For legacy interrupts (HPET timer, ACPI SCI), the vector argument is
+ * actually an IRQ number, and a vector is allocated here. Otherwise, the
+ * vector has already been allocated. The input argument virt_vect is a
+ * virtual vector of format APIX_VIRTVECTOR(cpuid, vector).
+ *
+ * Return 1 on success, 0 on failure.
+ */
+int
+apix_add_avintr(void *intr_id, int ipl, avfunc xxintr, char *name,
+ int virt_vect, caddr_t arg1, caddr_t arg2, uint64_t *ticksp,
+ dev_info_t *dip)
+{
+ int cpuid;
+ uchar_t v = (uchar_t)APIX_VIRTVEC_VECTOR(virt_vect);
+ apix_vector_t *vecp;
+
+ if (xxintr == NULL) {
+ cmn_err(CE_WARN, "Attempt to add null for %s "
+ "on vector 0x%x,0x%x", name,
+ APIX_VIRTVEC_CPU(virt_vect),
+ APIX_VIRTVEC_VECTOR(virt_vect));
+ return (0);
+ }
+
+ if (v >= APIX_IPI_MIN) /* IPIs */
+ return (apix_add_ipi(ipl, xxintr, name, v, arg1, arg2));
+
+ if (!APIX_IS_VIRTVEC(virt_vect)) { /* got irq */
+ int irqno = virt_vect;
+ int inum = GET_INTR_INUM(intr_id);
+
+ /*
+ * Scenarios include:
+ * a. add_avintr() is called before irqp is initialized (legacy)
+ * b. irqp is initialized, vector is not allocated (fixed)
+ * c. irqp is initialized, vector is allocated (fixed & shared)
+ */
+ if ((vecp = apix_alloc_intx(dip, inum, irqno)) == NULL)
+ return (0);
+
+ cpuid = vecp->v_cpuid;
+ v = vecp->v_vector;
+ virt_vect = APIX_VIRTVECTOR(cpuid, v);
+ } else { /* got virtual vector */
+ cpuid = APIX_VIRTVEC_CPU(virt_vect);
+ vecp = xv_vector(cpuid, v);
+ ASSERT(vecp != NULL);
+ }
+
+ lock_set(&apix_lock);
+ if (vecp->v_state <= APIX_STATE_OBSOLETED) {
+ vecp = NULL;
+
+ /*
+ * Allocated-but-not-enabled interrupts normally do not get
+ * re-targeted. MSIs in the allocated state, however,
+ * could be re-targeted due to group re-targeting.
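+ * When that happens, the device-to-vector mapping (keyed by
+ * dip, inum and type) is used below, via apix_get_dev_map(),
+ * to recover the vector the interrupt now lives on.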
+ */ + if (intr_id != NULL && dip != NULL) { + ddi_intr_handle_impl_t *hdlp = intr_id; + vecp = apix_get_dev_map(dip, hdlp->ih_inum, + hdlp->ih_type); + ASSERT(vecp->v_state == APIX_STATE_ALLOCED); + } + if (vecp == NULL) { + lock_clear(&apix_lock); + cmn_err(CE_WARN, "Invalid interrupt 0x%x,0x%x " + " for %p to add", cpuid, v, intr_id); + return (0); + } + cpuid = vecp->v_cpuid; + virt_vect = APIX_VIRTVECTOR(cpuid, vecp->v_vector); + } + + APIX_ENTER_CPU_LOCK(cpuid); + apix_insert_av(vecp, intr_id, xxintr, arg1, arg2, ticksp, ipl, dip); + APIX_LEAVE_CPU_LOCK(cpuid); + + (void) apix_addspl(virt_vect, ipl, 0, 0); + + lock_clear(&apix_lock); + + return (1); +} + +/* + * Remove avintr + * + * For fixed, if it's the last one of shared interrupts, free the vector. + * For msi/x, only disable the interrupt but not free the vector, which + * is freed by PSM_XXX_FREE_XXX. + */ +void +apix_rem_avintr(void *intr_id, int ipl, avfunc xxintr, int virt_vect) +{ + avfunc f; + apix_vector_t *vecp; + struct autovec *avp; + processorid_t cpuid; + + if ((f = xxintr) == NULL) + return; + + lock_set(&apix_lock); + + if (!APIX_IS_VIRTVEC(virt_vect)) { /* got irq */ + vecp = apix_intx_get_vector(virt_vect); + virt_vect = APIX_VIRTVECTOR(vecp->v_cpuid, vecp->v_vector); + } else /* got virtual vector */ + vecp = xv_vector(APIX_VIRTVEC_CPU(virt_vect), + APIX_VIRTVEC_VECTOR(virt_vect)); + + if (vecp == NULL) { + lock_clear(&apix_lock); + cmn_err(CE_CONT, "Invalid interrupt 0x%x,0x%x to remove", + APIX_VIRTVEC_CPU(virt_vect), + APIX_VIRTVEC_VECTOR(virt_vect)); + return; + } + + if (vecp->v_state <= APIX_STATE_OBSOLETED || + ((avp = apix_find_av(vecp, intr_id, f)) == NULL)) { + /* + * It's possible that the interrupt is rebound to a + * different cpu before rem_avintr() is called. Search + * through all vectors once it happens. 
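+ * That exhaustive scan is apix_find_vector_by_avintr(), which
+ * walks every enabled CPU's vector table and matches entries
+ * on the (intr_id, handler) pair.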
+ */ + if ((vecp = apix_find_vector_by_avintr(intr_id, f)) + == NULL) { + lock_clear(&apix_lock); + cmn_err(CE_CONT, "Unknown interrupt 0x%x,0x%x " + "for %p to remove", APIX_VIRTVEC_CPU(virt_vect), + APIX_VIRTVEC_VECTOR(virt_vect), intr_id); + return; + } + virt_vect = APIX_VIRTVECTOR(vecp->v_cpuid, vecp->v_vector); + avp = apix_find_av(vecp, intr_id, f); + } + cpuid = vecp->v_cpuid; + + /* disable interrupt */ + (void) apix_delspl(virt_vect, ipl, 0, 0); + + /* remove ISR entry */ + APIX_ENTER_CPU_LOCK(cpuid); + apix_remove_av(vecp, avp); + APIX_LEAVE_CPU_LOCK(cpuid); + + lock_clear(&apix_lock); +} + +/* + * Device to vector mapping table + */ + +static void +apix_clear_dev_map(dev_info_t *dip, int inum, int type) +{ + char *name; + major_t major; + apix_dev_vector_t *dvp, *prev = NULL; + int found = 0; + + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + + mutex_enter(&apix_mutex); + + for (dvp = apix_dev_vector[major]; dvp != NULL; + prev = dvp, dvp = dvp->dv_next) { + if (dvp->dv_dip == dip && dvp->dv_inum == inum && + dvp->dv_type == type) { + found++; + break; + } + } + + if (!found) { + mutex_exit(&apix_mutex); + return; + } + + if (prev != NULL) + prev->dv_next = dvp->dv_next; + + if (apix_dev_vector[major] == dvp) + apix_dev_vector[major] = dvp->dv_next; + + dvp->dv_vector->v_devp = NULL; + + mutex_exit(&apix_mutex); + + kmem_free(dvp, sizeof (apix_dev_vector_t)); +} + +void +apix_set_dev_map(apix_vector_t *vecp, dev_info_t *dip, int inum) +{ + apix_dev_vector_t *dvp; + char *name; + major_t major; + uint32_t found = 0; + + ASSERT(dip != NULL); + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + + mutex_enter(&apix_mutex); + + for (dvp = apix_dev_vector[major]; dvp != NULL; + dvp = dvp->dv_next) { + if (dvp->dv_dip == dip && dvp->dv_inum == inum && + dvp->dv_type == vecp->v_type) { + found++; + break; + } + } + + if (found == 0) { /* not found */ + dvp = kmem_zalloc(sizeof (apix_dev_vector_t), KM_SLEEP); + dvp->dv_dip = dip; + dvp->dv_inum = inum; + dvp->dv_type = vecp->v_type; + + dvp->dv_next = apix_dev_vector[major]; + apix_dev_vector[major] = dvp; + } + dvp->dv_vector = vecp; + vecp->v_devp = dvp; + + mutex_exit(&apix_mutex); + + DDI_INTR_IMPLDBG((CE_CONT, "apix_set_dev_map: dip=0x%p " + "inum=0x%x vector=0x%x/0x%x\n", + (void *)dip, inum, vecp->v_cpuid, vecp->v_vector)); +} + +apix_vector_t * +apix_get_dev_map(dev_info_t *dip, int inum, int type) +{ + char *name; + major_t major; + apix_dev_vector_t *dvp; + apix_vector_t *vecp; + + name = ddi_get_name(dip); + if ((major = ddi_name_to_major(name)) == DDI_MAJOR_T_NONE) + return (NULL); + + mutex_enter(&apix_mutex); + for (dvp = apix_dev_vector[major]; dvp != NULL; + dvp = dvp->dv_next) { + if (dvp->dv_dip == dip && dvp->dv_inum == inum && + dvp->dv_type == type) { + vecp = dvp->dv_vector; + mutex_exit(&apix_mutex); + return (vecp); + } + } + mutex_exit(&apix_mutex); + + return (NULL); +} + +/* + * Get minimum inum for specified device, used for MSI + */ +int +apix_get_min_dev_inum(dev_info_t *dip, int type) +{ + char *name; + major_t major; + apix_dev_vector_t *dvp; + int inum = -1; + + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + + mutex_enter(&apix_mutex); + for (dvp = apix_dev_vector[major]; dvp != NULL; + dvp = dvp->dv_next) { + if (dvp->dv_dip == dip && dvp->dv_type == type) { + if (inum == -1) + inum = dvp->dv_inum; + else + inum = (dvp->dv_inum < inum) ? 
+ dvp->dv_inum : inum; + } + } + mutex_exit(&apix_mutex); + + return (inum); +} + +int +apix_get_max_dev_inum(dev_info_t *dip, int type) +{ + char *name; + major_t major; + apix_dev_vector_t *dvp; + int inum = -1; + + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + + mutex_enter(&apix_mutex); + for (dvp = apix_dev_vector[major]; dvp != NULL; + dvp = dvp->dv_next) { + if (dvp->dv_dip == dip && dvp->dv_type == type) { + if (inum == -1) + inum = dvp->dv_inum; + else + inum = (dvp->dv_inum > inum) ? + dvp->dv_inum : inum; + } + } + mutex_exit(&apix_mutex); + + return (inum); +} + +/* + * Major to cpu binding, for INTR_ROUND_ROBIN_WITH_AFFINITY cpu + * binding policy + */ + +static uint32_t +apix_get_dev_binding(dev_info_t *dip) +{ + major_t major; + char *name; + uint32_t cpu = IRQ_UNINIT; + + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + if (major < devcnt) { + mutex_enter(&apix_mutex); + cpu = apix_major_to_cpu[major]; + mutex_exit(&apix_mutex); + } + + return (cpu); +} + +static void +apix_set_dev_binding(dev_info_t *dip, uint32_t cpu) +{ + major_t major; + char *name; + + /* setup major to cpu mapping */ + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + if (apix_major_to_cpu[major] == IRQ_UNINIT) { + mutex_enter(&apix_mutex); + apix_major_to_cpu[major] = cpu; + mutex_exit(&apix_mutex); + } +} + +/* + * return the cpu to which this intr should be bound. + * Check properties or any other mechanism to see if user wants it + * bound to a specific CPU. If so, return the cpu id with high bit set. + * If not, use the policy to choose a cpu and return the id. + */ +uint32_t +apix_bind_cpu(dev_info_t *dip) +{ + int instance, instno, prop_len, bind_cpu, count; + uint_t i, rc; + major_t major; + char *name, *drv_name, *prop_val, *cptr; + char prop_name[32]; + + lock_set(&apix_lock); + + if (apic_intr_policy == INTR_LOWEST_PRIORITY) { + cmn_err(CE_WARN, "apix: unsupported interrupt binding policy " + "LOWEST PRIORITY, use ROUND ROBIN instead"); + apic_intr_policy = INTR_ROUND_ROBIN; + } + + if (apic_nproc == 1) { + lock_clear(&apix_lock); + return (0); + } + + drv_name = NULL; + rc = DDI_PROP_NOT_FOUND; + major = (major_t)-1; + if (dip != NULL) { + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + drv_name = ddi_major_to_name(major); + instance = ddi_get_instance(dip); + if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { + bind_cpu = apix_get_dev_binding(dip); + if (bind_cpu != IRQ_UNINIT) { + lock_clear(&apix_lock); + return (bind_cpu); + } + } + /* + * search for "drvname"_intpt_bind_cpus property first, the + * syntax of the property should be "a[,b,c,...]" where + * instance 0 binds to cpu a, instance 1 binds to cpu b, + * instance 3 binds to cpu c... + * ddi_getlongprop() will search /option first, then / + * if "drvname"_intpt_bind_cpus doesn't exist, then find + * intpt_bind_cpus property. 
The syntax is the same, and + * it applies to all the devices if its "drvname" specific + * property doesn't exist + */ + (void) strcpy(prop_name, drv_name); + (void) strcat(prop_name, "_intpt_bind_cpus"); + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, + (caddr_t)&prop_val, &prop_len); + if (rc != DDI_PROP_SUCCESS) { + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, + "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); + } + } + if (rc == DDI_PROP_SUCCESS) { + for (i = count = 0; i < (prop_len - 1); i++) + if (prop_val[i] == ',') + count++; + if (prop_val[i-1] != ',') + count++; + /* + * if somehow the binding instances defined in the + * property are not enough for this instno., then + * reuse the pattern for the next instance until + * it reaches the requested instno + */ + instno = instance % count; + i = 0; + cptr = prop_val; + while (i < instno) + if (*cptr++ == ',') + i++; + bind_cpu = stoi(&cptr); + kmem_free(prop_val, prop_len); + /* if specific cpu is bogus, then default to cpu 0 */ + if (bind_cpu >= apic_nproc) { + cmn_err(CE_WARN, "apix: %s=%s: CPU %d not present", + prop_name, prop_val, bind_cpu); + bind_cpu = 0; + } else { + /* indicate that we are bound at user request */ + bind_cpu |= IRQ_USER_BOUND; + } + /* + * no need to check apic_cpus[].aci_status, if specific cpu is + * not up, then post_cpu_start will handle it. + */ + } else { + bind_cpu = apic_get_next_bind_cpu(); + } + + lock_clear(&apix_lock); + + return ((uint32_t)bind_cpu); +} + +static boolean_t +apix_is_cpu_enabled(processorid_t cpuid) +{ + apic_cpus_info_t *cpu_infop; + + cpu_infop = &apic_cpus[cpuid]; + + if ((cpu_infop->aci_status & APIC_CPU_INTR_ENABLE) == 0) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Must be called with apix_lock held. This function can be + * called from above lock level by apix_intr_redistribute(). + * + * Arguments: + * vecp : Vector to be rebound + * tocpu : Target cpu. IRQ_UNINIT means target is vecp->v_cpuid. 
+ * count : Number of continuous vectors + * + * Return new vector being bound to + */ +apix_vector_t * +apix_rebind(apix_vector_t *vecp, processorid_t newcpu, int count) +{ + apix_vector_t *newp, *oldp; + processorid_t oldcpu = vecp->v_cpuid; + uchar_t newvec, oldvec = vecp->v_vector; + int i; + + ASSERT(LOCK_HELD(&apix_lock) && count > 0); + + if (!apix_is_cpu_enabled(newcpu)) + return (NULL); + + if (vecp->v_cpuid == newcpu) /* rebind to the same cpu */ + return (vecp); + + APIX_ENTER_CPU_LOCK(oldcpu); + APIX_ENTER_CPU_LOCK(newcpu); + + /* allocate vector */ + if (count == 1) + newp = apix_alloc_vector_oncpu(newcpu, NULL, 0, vecp->v_type); + else { + ASSERT(vecp->v_type == APIX_TYPE_MSI); + newp = apix_alloc_nvectors_oncpu(newcpu, NULL, 0, count, + vecp->v_type); + } + if (newp == NULL) { + APIX_LEAVE_CPU_LOCK(newcpu); + APIX_LEAVE_CPU_LOCK(oldcpu); + return (NULL); + } + + newvec = newp->v_vector; + apix_dup_vectors(vecp, newp, count); + + APIX_LEAVE_CPU_LOCK(newcpu); + APIX_LEAVE_CPU_LOCK(oldcpu); + + if (!DDI_INTR_IS_MSI_OR_MSIX(vecp->v_type)) { + ASSERT(count == 1); + if (apix_intx_rebind(vecp->v_inum, newcpu, newvec) != 0) { + struct autovec *avp; + int inum; + + /* undo duplication */ + APIX_ENTER_CPU_LOCK(oldcpu); + APIX_ENTER_CPU_LOCK(newcpu); + for (avp = newp->v_autovect; avp != NULL; + avp = avp->av_link) { + if (avp->av_dip != NULL) { + inum = GET_INTR_INUM(avp->av_intr_id); + apix_set_dev_map(vecp, avp->av_dip, + inum); + } + apix_remove_av(newp, avp); + } + apix_cleanup_vector(newp); + APIX_LEAVE_CPU_LOCK(newcpu); + APIX_LEAVE_CPU_LOCK(oldcpu); + APIC_VERBOSE(REBIND, (CE_CONT, "apix: rebind fixed " + "interrupt 0x%x to cpu %d failed\n", + vecp->v_inum, newcpu)); + return (NULL); + } + + APIX_ENTER_CPU_LOCK(oldcpu); + (void) apix_obsolete_vector(vecp); + APIX_LEAVE_CPU_LOCK(oldcpu); + APIC_VERBOSE(REBIND, (CE_CONT, "apix: rebind fixed interrupt" + " 0x%x/0x%x to 0x%x/0x%x\n", + oldcpu, oldvec, newcpu, newvec)); + return (newp); + } + + for (i = 0; i < count; i++) { + oldp = xv_vector(oldcpu, oldvec + i); + newp = xv_vector(newcpu, newvec + i); + + if (newp->v_share > 0) { + APIX_SET_REBIND_INFO(oldp, newp); + + apix_enable_vector(newp); + + APIX_CLR_REBIND_INFO(); + } + + APIX_ENTER_CPU_LOCK(oldcpu); + (void) apix_obsolete_vector(oldp); + APIX_LEAVE_CPU_LOCK(oldcpu); + } + APIC_VERBOSE(REBIND, (CE_CONT, "apix: rebind vector 0x%x/0x%x " + "to 0x%x/0x%x, count=%d\n", + oldcpu, oldvec, newcpu, newvec, count)); + + return (xv_vector(newcpu, newvec)); +} + +/* + * Senarios include: + * a. add_avintr() is called before irqp initialized (legacy) + * b. irqp is initialized, vector is not allocated (fixed interrupts) + * c. irqp is initialized, vector is allocated (shared interrupts) + */ +apix_vector_t * +apix_alloc_intx(dev_info_t *dip, int inum, int irqno) +{ + apic_irq_t *irqp; + apix_vector_t *vecp; + + /* + * Allocate IRQ. 
Caller is later responsible for the + * initialization + */ + mutex_enter(&airq_mutex); + if ((irqp = apic_irq_table[irqno]) == NULL) { + /* allocate irq */ + irqp = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); + irqp->airq_mps_intr_index = FREE_INDEX; + apic_irq_table[irqno] = irqp; + } + if (irqp->airq_mps_intr_index == FREE_INDEX) { + irqp->airq_mps_intr_index = DEFAULT_INDEX; + irqp->airq_cpu = IRQ_UNINIT; + irqp->airq_origirq = (uchar_t)irqno; + } + + mutex_exit(&airq_mutex); + + /* + * allocate vector + */ + if (irqp->airq_cpu == IRQ_UNINIT) { + uint32_t bindcpu, cpuid; + + /* select cpu by system policy */ + bindcpu = apix_bind_cpu(dip); + cpuid = bindcpu & ~IRQ_USER_BOUND; + + /* allocate vector */ + APIX_ENTER_CPU_LOCK(cpuid); + + if ((vecp = apix_alloc_vector_oncpu(bindcpu, dip, inum, + APIX_TYPE_FIXED)) == NULL) { + cmn_err(CE_WARN, "No interrupt vector for irq %x", + irqno); + APIX_LEAVE_CPU_LOCK(cpuid); + return (NULL); + } + vecp->v_inum = irqno; + vecp->v_flags |= APIX_VECT_MASKABLE; + + apix_intx_set_vector(irqno, vecp->v_cpuid, vecp->v_vector); + + APIX_LEAVE_CPU_LOCK(cpuid); + } else { + vecp = xv_vector(irqp->airq_cpu, irqp->airq_vector); + ASSERT(!IS_VECT_FREE(vecp)); + + if (dip != NULL) + apix_set_dev_map(vecp, dip, inum); + } + + if ((dip != NULL) && + (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) && + ((vecp->v_flags & APIX_VECT_USER_BOUND) == 0)) + apix_set_dev_binding(dip, vecp->v_cpuid); + + apix_dprint_vector(vecp, dip, 1); + + return (vecp); +} + +int +apix_alloc_msi(dev_info_t *dip, int inum, int count, int behavior) +{ + int i, cap_ptr, rcount = count; + apix_vector_t *vecp; + processorid_t bindcpu, cpuid; + ushort_t msi_ctrl; + ddi_acc_handle_t handle; + + DDI_INTR_IMPLDBG((CE_CONT, "apix_alloc_msi_vectors: dip=0x%p " + "inum=0x%x count=0x%x behavior=%d\n", + (void *)dip, inum, count, behavior)); + + if (count > 1) { + if (behavior == DDI_INTR_ALLOC_STRICT && + apic_multi_msi_enable == 0) + return (0); + if (apic_multi_msi_enable == 0) + count = 1; + } + + /* Check whether it supports per-vector masking */ + cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); + handle = i_ddi_get_pci_config_handle(dip); + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + + /* bind to cpu */ + bindcpu = apix_bind_cpu(dip); + cpuid = bindcpu & ~IRQ_USER_BOUND; + + /* if not ISP2, then round it down */ + if (!ISP2(rcount)) + rcount = 1 << (highbit(rcount) - 1); + + APIX_ENTER_CPU_LOCK(cpuid); + for (vecp = NULL; rcount > 0; rcount >>= 1) { + vecp = apix_alloc_nvectors_oncpu(bindcpu, dip, inum, rcount, + APIX_TYPE_MSI); + if (vecp != NULL || behavior == DDI_INTR_ALLOC_STRICT) + break; + } + for (i = 0; vecp && i < rcount; i++) + xv_vector(vecp->v_cpuid, vecp->v_vector + i)->v_flags |= + (msi_ctrl & PCI_MSI_PVM_MASK) ? 
+
+int
+apix_alloc_msix(dev_info_t *dip, int inum, int count, int behavior)
+{
+	apix_vector_t *vecp;
+	processorid_t bindcpu, cpuid;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		/* select cpu by system policy */
+		bindcpu = apix_bind_cpu(dip);
+		cpuid = bindcpu & ~IRQ_USER_BOUND;
+
+		/* allocate vector */
+		APIX_ENTER_CPU_LOCK(cpuid);
+		if ((vecp = apix_alloc_vector_oncpu(bindcpu, dip, inum + i,
+		    APIX_TYPE_MSIX)) == NULL) {
+			APIX_LEAVE_CPU_LOCK(cpuid);
+			APIC_VERBOSE(INTR, (CE_CONT, "apix_alloc_msix: "
+			    "allocate msix for device dip=%p, inum=%d on"
+			    " cpu %d failed", (void *)dip, inum + i, bindcpu));
+			break;
+		}
+		vecp->v_flags |= APIX_VECT_MASKABLE;
+		APIX_LEAVE_CPU_LOCK(cpuid);
+
+		/* major to cpu mapping */
+		if ((i == 0) &&
+		    (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) &&
+		    ((vecp->v_flags & APIX_VECT_USER_BOUND) == 0))
+			apix_set_dev_binding(dip, vecp->v_cpuid);
+
+		apix_dprint_vector(vecp, dip, 1);
+	}
+
+	if (i < count && behavior == DDI_INTR_ALLOC_STRICT) {
+		APIC_VERBOSE(INTR, (CE_WARN, "apix_alloc_msix: "
+		    "strictly allocate %d vectors failed, got %d\n",
+		    count, i));
+		apix_free_vectors(dip, inum, i, APIX_TYPE_MSIX);
+		i = 0;
+	}
+
+	return (i);
+}
+
+/*
+ * Free the vectors allocated by apix_alloc_xxx(); this is also the
+ * rollback path when a strict multi-vector allocation partially fails.
+ */
+void
+apix_free_vectors(dev_info_t *dip, int inum, int count, int type)
+{
+	int i, cpuid;
+	apix_vector_t *vecp;
+
+	DDI_INTR_IMPLDBG((CE_CONT, "apix_free_vectors: dip: %p inum: %x "
+	    "count: %x type: %x\n",
+	    (void *)dip, inum, count, type));
+
+	lock_set(&apix_lock);
+
+	for (i = 0; i < count; i++, inum++) {
+		if ((vecp = apix_get_dev_map(dip, inum, type)) == NULL) {
+			lock_clear(&apix_lock);
+			DDI_INTR_IMPLDBG((CE_CONT, "apix_free_vectors: "
+			    "dip=0x%p inum=0x%x type=0x%x apix_find_intr() "
+			    "failed\n", (void *)dip, inum, type));
+			continue;
+		}
+
+		APIX_ENTER_CPU_LOCK(vecp->v_cpuid);
+		cpuid = vecp->v_cpuid;
+
+		DDI_INTR_IMPLDBG((CE_CONT, "apix_free_vectors: "
+		    "dip=0x%p inum=0x%x type=0x%x vector 0x%x (share %d)\n",
+		    (void *)dip, inum, type, vecp->v_vector, vecp->v_share));
+
+		/* tear down device interrupt to vector mapping */
+		apix_clear_dev_map(dip, inum, type);
+
+		if (vecp->v_type == APIX_TYPE_FIXED) {
+			if (vecp->v_share > 0) {	/* share IRQ line */
+				APIX_LEAVE_CPU_LOCK(cpuid);
+				continue;
+			}
+
+			/* Free apic_irq_table entry */
+			apix_intx_free(vecp->v_inum);
+		}
+
+		/* free vector */
+		apix_cleanup_vector(vecp);
+
+		APIX_LEAVE_CPU_LOCK(cpuid);
+	}
+
+	lock_clear(&apix_lock);
+}
+
+/*
+ * Must be called with apix_lock held
+ */
+apix_vector_t *
+apix_setup_io_intr(apix_vector_t *vecp)
+{
+	processorid_t bindcpu;
+	int ret;
+
+	ASSERT(LOCK_HELD(&apix_lock));
+
+	/*
+	 * If interrupts are enabled on the CPU, program the IOAPIC RDT
+	 * entry or the MSI/X address/data to enable the interrupt.
+	 */
+	if (apix_is_cpu_enabled(vecp->v_cpuid)) {
+		apix_enable_vector(vecp);
+		return (vecp);
+	}
+
+	/*
+	 * The CPU is not up or its interrupts are disabled. Fall back to
+	 * the first available CPU.
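+	 * (Illustrative note, editorial: a vector bound to a CPU whose
+	 * interrupts have been fenced off, e.g. with psradm -i, takes
+	 * this path and is retargeted rather than left on the quiesced
+	 * CPU.)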
+ */ + bindcpu = apic_find_cpu(APIC_CPU_INTR_ENABLE); + + if (vecp->v_type == APIX_TYPE_MSI) + return (apix_grp_set_cpu(vecp, bindcpu, &ret)); + + return (apix_set_cpu(vecp, bindcpu, &ret)); +} + +/* + * For interrupts which call add_avintr() before apic is initialized. + * ioapix_setup_intr() will + * - allocate vector + * - copy over ISR + */ +static void +ioapix_setup_intr(int irqno, iflag_t *flagp) +{ + extern struct av_head autovect[]; + apix_vector_t *vecp; + apic_irq_t *irqp; + uchar_t ioapicindex, ipin; + ulong_t iflag; + struct autovec *avp; + + irqp = apic_irq_table[irqno]; + ioapicindex = acpi_find_ioapic(irqno); + ASSERT(ioapicindex != 0xFF); + ipin = irqno - apic_io_vectbase[ioapicindex]; + + if ((irqp != NULL) && (irqp->airq_mps_intr_index == ACPI_INDEX)) { + ASSERT(irqp->airq_intin_no == ipin && + irqp->airq_ioapicindex == ioapicindex); + vecp = xv_vector(irqp->airq_cpu, irqp->airq_vector); + ASSERT(!IS_VECT_FREE(vecp)); + } else { + vecp = apix_alloc_intx(NULL, 0, irqno); + + irqp = apic_irq_table[irqno]; + irqp->airq_mps_intr_index = ACPI_INDEX; + irqp->airq_ioapicindex = ioapicindex; + irqp->airq_intin_no = ipin; + irqp->airq_iflag = *flagp; + irqp->airq_share++; + apic_record_rdt_entry(irqp, irqno); + } + + /* copy over autovect */ + for (avp = autovect[irqno].avh_link; avp; avp = avp->av_link) + apix_insert_av(vecp, avp->av_intr_id, avp->av_vector, + avp->av_intarg1, avp->av_intarg2, avp->av_ticksp, + avp->av_prilevel, avp->av_dip); + + /* Program I/O APIC */ + iflag = intr_clear(); + lock_set(&apix_lock); + + (void) apix_setup_io_intr(vecp); + + lock_clear(&apix_lock); + intr_restore(iflag); + + APIC_VERBOSE_IOAPIC((CE_CONT, "apix: setup ioapic, irqno %x " + "(ioapic %x, ipin %x) is bound to cpu %x, vector %x\n", + irqno, ioapicindex, ipin, irqp->airq_cpu, irqp->airq_vector)); +} + +void +ioapix_init_intr(int mask_apic) +{ + int ioapicindex; + int i, j; + + /* mask interrupt vectors */ + for (j = 0; j < apic_io_max && mask_apic; j++) { + int intin_max; + + ioapicindex = j; + /* Bits 23-16 define the maximum redirection entries */ + intin_max = (ioapic_read(ioapicindex, APIC_VERS_CMD) >> 16) + & 0xff; + for (i = 0; i <= intin_max; i++) + ioapic_write(ioapicindex, APIC_RDT_CMD + 2 * i, + AV_MASK); + } + + /* + * Hack alert: deal with ACPI SCI interrupt chicken/egg here + */ + if (apic_sci_vect > 0) + ioapix_setup_intr(apic_sci_vect, &apic_sci_flags); + + /* + * Hack alert: deal with ACPI HPET interrupt chicken/egg here. + */ + if (apic_hpet_vect > 0) + ioapix_setup_intr(apic_hpet_vect, &apic_hpet_flags); +} diff --git a/usr/src/uts/i86pc/io/hpet_acpi.c b/usr/src/uts/i86pc/io/hpet_acpi.c index b051205091..8b33cafc8a 100644 --- a/usr/src/uts/i86pc/io/hpet_acpi.c +++ b/usr/src/uts/i86pc/io/hpet_acpi.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/hpet_acpi.h> @@ -36,6 +35,52 @@ #include <sys/archsystm.h> #include <sys/cpupart.h> +static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags); +static boolean_t hpet_install_proxy(void); +static boolean_t hpet_callback(int code); +static boolean_t hpet_cpr(int code); +static boolean_t hpet_resume(void); +static void hpet_cst_callback(uint32_t code); +static boolean_t hpet_deep_idle_config(int code); +static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table); +static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len); +static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table); +static int hpet_start_main_counter(hpet_info_t *hip); +static int hpet_stop_main_counter(hpet_info_t *hip); +static uint64_t hpet_read_main_counter_value(hpet_info_t *hip); +static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value); +static uint64_t hpet_read_gen_cap(hpet_info_t *hip); +static uint64_t hpet_read_gen_config(hpet_info_t *hip); +static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip); +static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n); +static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf); +/* LINTED E_STATIC_UNUSED */ +static uint64_t hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n); +/* LINTED E_STATIC_UNUSED */ +static void hpet_write_gen_cap(hpet_info_t *hip, uint64_t l); +static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l); +static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l); +static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l); +static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l); +static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n); +static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n); +/* LINTED E_STATIC_UNUSED */ +static void hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l); +static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip); +static int hpet_timer_available(uint32_t allocated_timers, uint32_t n); +static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n); +static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, + uint32_t interrupt); +static uint_t hpet_isr(char *arg); +static uint32_t hpet_install_interrupt_handler(uint_t (*func)(char *), + int vector); +static void hpet_uninstall_interrupt_handler(void); +static void hpet_expire_all(void); +static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time); +static boolean_t hpet_use_hpet_timer(hrtime_t *expire); +static void hpet_use_lapic_timer(hrtime_t expire); +static void hpet_init_proxy_data(void); + /* * hpet_state_lock is used to synchronize disabling/enabling deep c-states * and to synchronize suspend/resume. diff --git a/usr/src/uts/i86pc/io/immu_dmar.c b/usr/src/uts/i86pc/io/immu_dmar.c index deebce3072..734363beef 100644 --- a/usr/src/uts/i86pc/io/immu_dmar.c +++ b/usr/src/uts/i86pc/io/immu_dmar.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Portions Copyright (c) 2010, Oracle and/or its affiliates. - * All rights reserved. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -49,6 +48,7 @@ #include <sys/acpica.h> #include <sys/iommulib.h> #include <sys/immu.h> +#include <sys/smp_impldefs.h> static void dmar_table_destroy(dmar_table_t *tbl); @@ -1274,7 +1274,7 @@ immu_dmar_ioapic_sid(int ioapic_ix) { ioapic_drhd_t *idt; - idt = ioapic_drhd_lookup(apic_io_id[ioapic_ix]); + idt = ioapic_drhd_lookup(psm_get_ioapicid(ioapic_ix)); if (idt == NULL) { ddi_err(DER_PANIC, NULL, "cannot determine source-id for " "IOAPIC (index = %d)", ioapic_ix); @@ -1290,7 +1290,7 @@ immu_dmar_ioapic_immu(int ioapic_ix) { ioapic_drhd_t *idt; - idt = ioapic_drhd_lookup(apic_io_id[ioapic_ix]); + idt = ioapic_drhd_lookup(psm_get_ioapicid(ioapic_ix)); if (idt) { return (idt->ioapic_drhd ? idt->ioapic_drhd->dr_immu : NULL); } diff --git a/usr/src/uts/i86pc/io/immu_intrmap.c b/usr/src/uts/i86pc/io/immu_intrmap.c index e96c919d4b..e47a16414f 100644 --- a/usr/src/uts/i86pc/io/immu_intrmap.c +++ b/usr/src/uts/i86pc/io/immu_intrmap.c @@ -20,8 +20,7 @@ */ /* - * Portions Copyright (c) 2010, Oracle and/or its affiliates. - * All rights reserved. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -43,8 +42,7 @@ typedef struct intrmap_private { uint32_t ir_sid_svt_sq; } intrmap_private_t; -#define INTRMAP_PRIVATE(airq) ((intrmap_private_t *)airq->airq_intrmap_private) -#define AIRQ_PRIVATE(airq) (airq->airq_intrmap_private) +#define INTRMAP_PRIVATE(intrmap) ((intrmap_private_t *)intrmap) /* interrupt remapping table entry */ typedef struct intrmap_rte { @@ -135,11 +133,13 @@ static char *immu_intrmap_faults[] = { /* Function prototypes */ static int immu_intrmap_init(int apic_mode); static void immu_intrmap_switchon(int suppress_brdcst_eoi); -static void immu_intrmap_alloc(apic_irq_t *irq_ptr); -static void immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data); -static void immu_intrmap_free(apic_irq_t *irq_ptr); -static void immu_intrmap_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt); -static void immu_intrmap_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs); +static void immu_intrmap_alloc(void **intrmap_private_tbl, dev_info_t *dip, + uint16_t type, int count, uchar_t ioapic_index); +static void immu_intrmap_map(void *intrmap_private, void *intrmap_data, + uint16_t type, int count); +static void immu_intrmap_free(void **intrmap_privatep); +static void immu_intrmap_rdt(void *intrmap_private, ioapic_rdt_t *irdt); +static void immu_intrmap_msi(void *intrmap_private, msi_regs_t *mregs); static struct apic_intrmap_ops intrmap_ops = { immu_intrmap_init, @@ -383,24 +383,19 @@ init_unit(immu_t *immu) return (DDI_SUCCESS); } -static void -get_immu(apic_irq_t *irq_ptr) +static immu_t * +get_immu(dev_info_t *dip, uint16_t type, uchar_t ioapic_index) { immu_t *immu = NULL; - ASSERT(INTRMAP_PRIVATE(irq_ptr)->ir_immu == NULL); - - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { - immu = immu_dmar_ioapic_immu(irq_ptr->airq_ioapicindex); + if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { + immu = immu_dmar_ioapic_immu(ioapic_index); } else { - if (irq_ptr->airq_dip != NULL) { - immu = immu_dmar_get_immu(irq_ptr->airq_dip); - } + if (dip != NULL) + immu = immu_dmar_get_immu(dip); } - if (immu && (immu->immu_intrmap_running == B_TRUE)) { - INTRMAP_PRIVATE(irq_ptr)->ir_immu = immu; - } + return (immu); } static int @@ -437,26 +432,25 @@ intrmap_top_pcibridge(dev_info_t *rdip) } /* function to get interrupt request source id */ -static void -get_sid(apic_irq_t *irq_ptr) +static uint32_t +get_sid(dev_info_t *dip, uint16_t type, uchar_t ioapic_index) { - dev_info_t *dip, 
*pdip; + dev_info_t *pdip; immu_devi_t *immu_devi; uint16_t sid; uchar_t svt, sq; if (!intrmap_enable_sid_verify) { - return; + return (0); } - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { /* for interrupt through I/O APIC */ - sid = immu_dmar_ioapic_sid(irq_ptr->airq_ioapicindex); + sid = immu_dmar_ioapic_sid(ioapic_index); svt = SVT_ALL_VERIFY; sq = SQ_VERIFY_ALL; } else { /* MSI/MSI-X interrupt */ - dip = irq_ptr->airq_dip; ASSERT(dip); pdip = intrmap_top_pcibridge(dip); ASSERT(pdip); @@ -476,8 +470,7 @@ get_sid(apic_irq_t *irq_ptr) } } - INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq = - sid | (svt << 18) | (sq << 16); + return (sid | (svt << 18) | (sq << 16)); } static void @@ -667,52 +660,49 @@ immu_intrmap_switchon(int suppress_brdcst_eoi) /* alloc remapping entry for the interrupt */ static void -immu_intrmap_alloc(apic_irq_t *irq_ptr) +immu_intrmap_alloc(void **intrmap_private_tbl, dev_info_t *dip, + uint16_t type, int count, uchar_t ioapic_index) { immu_t *immu; intrmap_t *intrmap; - uint32_t idx, cnt, i; - uint_t vector, irqno; + uint32_t idx, i; uint32_t sid_svt_sq; + intrmap_private_t *intrmap_private; - if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE || - AIRQ_PRIVATE(irq_ptr) != NULL) { + if (intrmap_private_tbl[0] == INTRMAP_DISABLE || + intrmap_private_tbl[0] != NULL) { return; } - AIRQ_PRIVATE(irq_ptr) = + intrmap_private_tbl[0] = kmem_zalloc(sizeof (intrmap_private_t), KM_SLEEP); + intrmap_private = INTRMAP_PRIVATE(intrmap_private_tbl[0]); - get_immu(irq_ptr); - - immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu; - if (immu == NULL) { + immu = get_immu(dip, type, ioapic_index); + if ((immu != NULL) && (immu->immu_intrmap_running == B_TRUE)) { + intrmap_private->ir_immu = immu; + } else { goto intrmap_disable; } intrmap = immu->immu_intrmap; - if (irq_ptr->airq_mps_intr_index == MSI_INDEX) { - cnt = irq_ptr->airq_intin_no; - } else { - cnt = 1; - } - - if (cnt == 1) { + if (count == 1) { idx = alloc_tbl_entry(intrmap); } else { - idx = alloc_tbl_multi_entries(intrmap, cnt); + idx = alloc_tbl_multi_entries(intrmap, count); } if (idx == INTRMAP_IDX_FULL) { goto intrmap_disable; } - INTRMAP_PRIVATE(irq_ptr)->ir_idx = idx; + intrmap_private->ir_idx = idx; - get_sid(irq_ptr); + sid_svt_sq = intrmap_private->ir_sid_svt_sq = + get_sid(dip, type, ioapic_index); - if (cnt == 1) { + if (count == 1) { if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) { immu_qinv_intr_one_cache(immu, idx); } else { @@ -721,26 +711,18 @@ immu_intrmap_alloc(apic_irq_t *irq_ptr) return; } - sid_svt_sq = INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq; - - vector = irq_ptr->airq_vector; - - for (i = 1; i < cnt; i++) { - irqno = apic_vector_to_irq[vector + i]; - irq_ptr = apic_irq_table[irqno]; - - ASSERT(irq_ptr); - - AIRQ_PRIVATE(irq_ptr) = + for (i = 1; i < count; i++) { + intrmap_private_tbl[i] = kmem_zalloc(sizeof (intrmap_private_t), KM_SLEEP); - INTRMAP_PRIVATE(irq_ptr)->ir_immu = immu; - INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq = sid_svt_sq; - INTRMAP_PRIVATE(irq_ptr)->ir_idx = idx + i; + INTRMAP_PRIVATE(intrmap_private_tbl[i])->ir_immu = immu; + INTRMAP_PRIVATE(intrmap_private_tbl[i])->ir_sid_svt_sq = + sid_svt_sq; + INTRMAP_PRIVATE(intrmap_private_tbl[i])->ir_idx = idx + i; } if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) { - immu_qinv_intr_caches(immu, idx, cnt); + immu_qinv_intr_caches(immu, idx, count); } else { immu_regs_wbf_flush(immu); } @@ -748,41 +730,34 @@ immu_intrmap_alloc(apic_irq_t *irq_ptr) return; intrmap_disable: - kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof 
(intrmap_private_t)); - AIRQ_PRIVATE(irq_ptr) = INTRMAP_DISABLE; + kmem_free(intrmap_private_tbl[0], sizeof (intrmap_private_t)); + intrmap_private_tbl[0] = INTRMAP_DISABLE; } /* remapping the interrupt */ static void -immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data) +immu_intrmap_map(void *intrmap_private, void *intrmap_data, uint16_t type, + int count) { immu_t *immu; intrmap_t *intrmap; ioapic_rdt_t *irdt = (ioapic_rdt_t *)intrmap_data; msi_regs_t *mregs = (msi_regs_t *)intrmap_data; intrmap_rte_t irte; - uint_t idx, i, cnt; + uint_t idx, i; uint32_t dst, sid_svt_sq; uchar_t vector, dlm, tm, rh, dm; - if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE) { + if (intrmap_private == INTRMAP_DISABLE) return; - } - - if (irq_ptr->airq_mps_intr_index == MSI_INDEX) { - cnt = irq_ptr->airq_intin_no; - } else { - cnt = 1; - } - idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx; - immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu; + idx = INTRMAP_PRIVATE(intrmap_private)->ir_idx; + immu = INTRMAP_PRIVATE(intrmap_private)->ir_immu; intrmap = immu->immu_intrmap; - sid_svt_sq = INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq; - vector = irq_ptr->airq_vector; + sid_svt_sq = INTRMAP_PRIVATE(intrmap_private)->ir_sid_svt_sq; - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { dm = RDT_DM(irdt->ir_lo); rh = 0; tm = RDT_TM(irdt->ir_lo); @@ -795,18 +770,22 @@ immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data) if (intrmap_suppress_brdcst_eoi) { tm = TRIGGER_MODE_EDGE; } + + vector = RDT_VECTOR(irdt->ir_lo); } else { dm = MSI_ADDR_DM_PHYSICAL; rh = MSI_ADDR_RH_FIXED; tm = TRIGGER_MODE_EDGE; dlm = 0; dst = mregs->mr_addr; + + vector = mregs->mr_data & 0xff; } if (intrmap_apic_mode == LOCAL_APIC) dst = (dst & 0xFF) << 8; - if (cnt == 1) { + if (count == 1) { irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1); irte.hi = IRTE_HIGH(sid_svt_sq); @@ -818,8 +797,7 @@ immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data) immu_qinv_intr_one_cache(immu, idx); } else { - vector = irq_ptr->airq_vector; - for (i = 0; i < cnt; i++) { + for (i = 0; i < count; i++) { irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1); irte.hi = IRTE_HIGH(sid_svt_sq); @@ -831,26 +809,26 @@ immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data) idx++; } - immu_qinv_intr_caches(immu, idx, cnt); + immu_qinv_intr_caches(immu, idx, count); } } /* free the remapping entry */ static void -immu_intrmap_free(apic_irq_t *irq_ptr) +immu_intrmap_free(void **intrmap_privatep) { immu_t *immu; intrmap_t *intrmap; uint32_t idx; - if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE) { - AIRQ_PRIVATE(irq_ptr) = NULL; + if (*intrmap_privatep == INTRMAP_DISABLE || *intrmap_privatep == NULL) { + *intrmap_privatep = NULL; return; } - immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu; + immu = INTRMAP_PRIVATE(*intrmap_privatep)->ir_immu; intrmap = immu->immu_intrmap; - idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx; + idx = INTRMAP_PRIVATE(*intrmap_privatep)->ir_idx; bzero(intrmap->intrmap_vaddr + idx * INTRMAP_RTE_SIZE, INTRMAP_RTE_SIZE); @@ -864,24 +842,23 @@ immu_intrmap_free(apic_irq_t *irq_ptr) } mutex_exit(&intrmap->intrmap_lock); - kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof (intrmap_private_t)); - AIRQ_PRIVATE(irq_ptr) = NULL; + kmem_free(*intrmap_privatep, sizeof (intrmap_private_t)); + *intrmap_privatep = NULL; } /* record the ioapic rdt entry */ static void -immu_intrmap_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt) +immu_intrmap_rdt(void *intrmap_private, ioapic_rdt_t *irdt) { uint32_t rdt_entry, tm, pol, idx, vector; rdt_entry = 
irdt->ir_lo; - if (INTRMAP_PRIVATE(irq_ptr) != NULL && - INTRMAP_PRIVATE(irq_ptr) != INTRMAP_DISABLE) { - idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx; + if (intrmap_private != INTRMAP_DISABLE && intrmap_private != NULL) { + idx = INTRMAP_PRIVATE(intrmap_private)->ir_idx; tm = RDT_TM(rdt_entry); pol = RDT_POL(rdt_entry); - vector = irq_ptr->airq_vector; + vector = RDT_VECTOR(rdt_entry); irdt->ir_lo = (tm << INTRMAP_IOAPIC_TM_SHIFT) | (pol << INTRMAP_IOAPIC_POL_SHIFT) | ((idx >> 15) << INTRMAP_IOAPIC_IDX15_SHIFT) | @@ -896,13 +873,12 @@ immu_intrmap_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt) /* record the msi interrupt structure */ /*ARGSUSED*/ static void -immu_intrmap_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs) +immu_intrmap_msi(void *intrmap_private, msi_regs_t *mregs) { uint_t idx; - if (INTRMAP_PRIVATE(irq_ptr) != NULL && - INTRMAP_PRIVATE(irq_ptr) != INTRMAP_DISABLE) { - idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx; + if (intrmap_private != INTRMAP_DISABLE && intrmap_private != NULL) { + idx = INTRMAP_PRIVATE(intrmap_private)->ir_idx; mregs->mr_data = 0; mregs->mr_addr = MSI_ADDR_HDR | @@ -979,21 +955,27 @@ immu_intr_register(immu_t *immu) uint32_t msi_data; uint32_t uaddr; uint32_t msi_addr; + uint32_t localapic_id = 0; + + if (psm_get_localapicid) + localapic_id = psm_get_localapicid(0); msi_addr = (MSI_ADDR_HDR | - apic_cpus[0].aci_local_id & 0xFF) << ((MSI_ADDR_DEST_SHIFT) | + ((localapic_id & 0xFF) << MSI_ADDR_DEST_SHIFT) | (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT)); if (intrmap_apic_mode == LOCAL_X2APIC) { - uaddr = (apic_cpus[0].aci_local_id & 0xFFFFFF00); + uaddr = localapic_id & 0xFFFFFF00; } else { uaddr = 0; } /* Dont need to hold immu_intr_lock since we are in boot */ - irq = psm_get_ipivect(IMMU_INTR_IPL, -1); - vect = apic_irq_table[irq]->airq_vector; + irq = vect = psm_get_ipivect(IMMU_INTR_IPL, -1); + if (psm_xlate_vector_by_irq != NULL) + vect = psm_xlate_vector_by_irq(irq); + msi_data = ((MSI_DATA_DELIVERY_FIXED << MSI_DATA_DELIVERY_SHIFT) | vect); diff --git a/usr/src/uts/i86pc/io/isa.c b/usr/src/uts/i86pc/io/isa.c index 8080b0c2b1..aa5cea74f1 100644 --- a/usr/src/uts/i86pc/io/isa.c +++ b/usr/src/uts/i86pc/io/isa.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -48,6 +47,7 @@ #include <sys/pci.h> #include <sys/note.h> #include <sys/boot_console.h> +#include <sys/apic.h> #if defined(__xpv) #include <sys/hypervisor.h> #include <sys/evtchn_impl.h> @@ -55,6 +55,7 @@ extern int console_hypervisor_device; #endif + extern int pseudo_isa; extern int isa_resource_setup(void); extern int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, @@ -157,6 +158,8 @@ isa_ctlops(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); static int isa_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, ddi_intr_handle_impl_t *hdlp, void *result); +static int isa_alloc_intr_fixed(dev_info_t *, ddi_intr_handle_impl_t *, void *); +static int isa_free_intr_fixed(dev_info_t *, ddi_intr_handle_impl_t *); struct bus_ops isa_bus_ops = { BUSO_REV, @@ -774,13 +777,11 @@ isa_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, return (DDI_FAILURE); break; case DDI_INTROP_ALLOC: - if ((ispec = isa_get_ispec(rdip, hdlp->ih_inum)) == NULL) - return (DDI_FAILURE); - hdlp->ih_pri = ispec->intrspec_pri; - *(int *)result = hdlp->ih_scratch1; - break; + ASSERT(hdlp->ih_type == DDI_INTR_TYPE_FIXED); + return (isa_alloc_intr_fixed(rdip, hdlp, result)); case DDI_INTROP_FREE: - break; + ASSERT(hdlp->ih_type == DDI_INTR_TYPE_FIXED); + return (isa_free_intr_fixed(rdip, hdlp)); case DDI_INTROP_GETPRI: if ((ispec = isa_get_ispec(rdip, hdlp->ih_inum)) == NULL) return (DDI_FAILURE); @@ -900,6 +901,97 @@ isa_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, return (DDI_SUCCESS); } +/* + * Allocate interrupt vector for FIXED (legacy) type. + */ +static int +isa_alloc_intr_fixed(dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp, + void *result) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + int free_phdl = 0; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + if ((ispec = isa_get_ispec(rdip, hdlp->ih_inum)) == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to allocate the vector now. + */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if (hdlp->ih_private == NULL) { /* allocate phdl structure */ + free_phdl = 1; + i_ddi_alloc_intr_phdl(hdlp); + } + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_ALLOC_VECTORS, result); + if (free_phdl) { /* free up the phdl structure */ + free_phdl = 0; + i_ddi_free_intr_phdl(hdlp); + hdlp->ih_private = NULL; + } + } else { + /* + * No APIX module; fall back to the old scheme where the + * interrupt vector is allocated during ddi_enable_intr() call. + */ + hdlp->ih_pri = ispec->intrspec_pri; + *(int *)result = hdlp->ih_scratch1; + ret = DDI_SUCCESS; + } + + return (ret); +} + +/* + * Free up interrupt vector for FIXED (legacy) type. + */ +static int +isa_free_intr_fixed(dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to free up the vector now. 
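+ *
+ * (Editorial note: with the older pcplusmp PSM the PSM_INTR_OP_APIC_TYPE
+ * probe below does not report APIC_APIX_NAME, so this call succeeds as a
+ * no-op and the vector is still released at ddi_disable_intr() time, as
+ * before.)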
+ */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if ((ispec = isa_get_ispec(rdip, hdlp->ih_inum)) == NULL) + return (DDI_FAILURE); + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_FREE_VECTORS, NULL); + } else { + /* + * No APIX module; fall back to the old scheme where + * the interrupt vector was already freed during + * ddi_disable_intr() call. + */ + ret = DDI_SUCCESS; + } + + return (ret); +} + static void isa_vendor(uint32_t id, char *vendor) { diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index c33e10c9ff..134a945207 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -80,31 +80,20 @@ static struct apic_mpfps_hdr *apic_find_fps_sig(caddr_t fptr, int size); static int apic_checksum(caddr_t bptr, int len); static int apic_find_bus_type(char *bus); static int apic_find_bus(int busid); -static int apic_find_bus_id(int bustype); static struct apic_io_intr *apic_find_io_intr(int irqno); static int apic_find_free_irq(int start, int end); -static void apic_mark_vector(uchar_t oldvector, uchar_t newvector); -static void apic_xlate_vector_free_timeout_handler(void *arg); -static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, - int new_bind_cpu, int apicindex, int intin_no, int which_irq, - struct ioapic_reprogram_data *drep); -static void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); -static struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); -static int apic_find_intin(uchar_t ioapic, uchar_t intin); -static int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, - int child_ipin, struct apic_io_intr **intrp); -static int apic_setup_irq_table(dev_info_t *dip, int irqno, - struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp, - int type); +struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); static void apic_set_pwroff_method_from_mpcnfhdr(struct apic_mp_cnf_hdr *hdrp); static void apic_free_apic_cpus(void); -static void apic_try_deferred_reprogram(int ipl, int vect); -static void delete_defer_repro_ent(int which_irq); -static void apic_ioapic_wait_pending_clear(int ioapicindex, - int intin_no); static boolean_t apic_is_ioapic_AMD_813x(uint32_t physaddr); static int apic_acpi_enter_apicmode(void); +int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, + int child_ipin, struct apic_io_intr **intrp); +int apic_find_bus_id(int bustype); +int apic_find_intin(uchar_t ioapic, uchar_t intin); +void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); + int apic_debug_mps_id = 0; /* 1 - print MPS ID strings */ /* ACPI SCI interrupt configuration; -1 if SCI not used */ @@ -120,33 +109,21 @@ iflag_t apic_hpet_flags; /* * psm name pointer */ -static char *psm_name; +char *psm_name; /* ACPI support routines */ static int acpi_probe(char *); static int apic_acpi_irq_configure(acpi_psm_lnk_t *acpipsmlnkp, dev_info_t *dip, int *pci_irqp, iflag_t *intr_flagp); -static int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, +int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, int ipin, int *pci_irqp, iflag_t *intr_flagp); -static uchar_t acpi_find_ioapic(int irq); +uchar_t acpi_find_ioapic(int irq); static int 
acpi_intr_compatible(iflag_t iflag1, iflag_t iflag2); -/* - * number of bits per byte, from <sys/param.h> - */ -#define UCHAR_MAX ((1 << NBBY) - 1) - /* Max wait time (in repetitions) for flags to clear in an RDT entry. */ int apic_max_reps_clear_pending = 1000; -/* The irq # is implicit in the array index: */ -struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1]; -/* - * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info - * is indexed by IRQ number, NOT by vector number. - */ - int apic_intr_policy = INTR_ROUND_ROBIN; int apic_next_bind_cpu = 1; /* For round robin assignment */ @@ -192,9 +169,6 @@ int apic_redistribute_sample_interval = NANOSEC / 100; /* 10 millisec */ */ int apic_sample_factor_redistribution = 101; -/* timeout for xlate_vector, mark_vector */ -int apic_revector_timeout = 16 * 10000; /* 160 millisec */ - int apic_redist_cpu_skip = 0; int apic_num_imbalance = 0; int apic_num_rebind = 0; @@ -230,15 +204,15 @@ int apic_unconditional_srs = 1; int apic_prefer_crs = 1; -uchar_t apic_io_id[MAX_IO_APIC]; +uchar_t apic_io_id[MAX_IO_APIC]; volatile uint32_t *apicioadr[MAX_IO_APIC]; -static uchar_t apic_io_ver[MAX_IO_APIC]; -static uchar_t apic_io_vectbase[MAX_IO_APIC]; -static uchar_t apic_io_vectend[MAX_IO_APIC]; +uchar_t apic_io_ver[MAX_IO_APIC]; +uchar_t apic_io_vectbase[MAX_IO_APIC]; +uchar_t apic_io_vectend[MAX_IO_APIC]; uchar_t apic_reserved_irqlist[MAX_ISA_IRQ + 1]; uint32_t apic_physaddr[MAX_IO_APIC]; -static boolean_t ioapic_mask_workaround[MAX_IO_APIC]; +boolean_t ioapic_mask_workaround[MAX_IO_APIC]; /* * First available slot to be used as IRQ index into the apic_irq_table @@ -252,45 +226,20 @@ int apic_first_avail_irq = APIC_FIRST_FREE_IRQ; */ lock_t apic_ioapic_lock; -/* - * apic_defer_reprogram_lock ensures that only one processor is handling - * deferred interrupt programming at *_intr_exit time. - */ -static lock_t apic_defer_reprogram_lock; +int apic_io_max = 0; /* no. of i/o apics enabled */ -/* - * The current number of deferred reprogrammings outstanding - */ -uint_t apic_reprogram_outstanding = 0; - -#ifdef DEBUG -/* - * Counters that keep track of deferred reprogramming stats - */ -uint_t apic_intr_deferrals = 0; -uint_t apic_intr_deliver_timeouts = 0; -uint_t apic_last_ditch_reprogram_failures = 0; -uint_t apic_deferred_setup_failures = 0; -uint_t apic_defer_repro_total_retries = 0; -uint_t apic_defer_repro_successes = 0; -uint_t apic_deferred_spurious_enters = 0; -#endif - -static int apic_io_max = 0; /* no. of i/o apics enabled */ - -static struct apic_io_intr *apic_io_intrp = 0; +struct apic_io_intr *apic_io_intrp = NULL; static struct apic_bus *apic_busp; -uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; uchar_t apic_resv_vector[MAXIPL+1]; char apic_level_intr[APIC_MAX_VECTOR+1]; -static uint32_t eisa_level_intr_mask = 0; +uint32_t eisa_level_intr_mask = 0; /* At least MSB will be set if EISA bus */ -static int apic_pci_bus_total = 0; -static uchar_t apic_single_pci_busid = 0; +int apic_pci_bus_total = 0; +uchar_t apic_single_pci_busid = 0; /* * airq_mutex protects additions to the apic_irq_table - the first @@ -307,15 +256,6 @@ apic_irq_t *apic_irq_table[APIC_MAX_VECTOR+1]; int apic_max_device_irq = 0; int apic_min_device_irq = APIC_MAX_VECTOR; -/* - * Following declarations are for revectoring; used when ISRs at different - * IPLs share an irq. 
- */ -static lock_t apic_revector_lock; -int apic_revector_pending = 0; -static uchar_t *apic_oldvec_to_newvec; -static uchar_t *apic_newvec_to_oldvec; - typedef struct prs_irq_list_ent { int list_prio; int32_t irq; @@ -335,8 +275,8 @@ int apic_enable_acpi = 0; static ACPI_TABLE_MADT *acpi_mapic_dtp = NULL; /* ACPI Interrupt Source Override Structure ptr */ -static ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop = NULL; -static int acpi_iso_cnt = 0; +ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop = NULL; +int acpi_iso_cnt = 0; /* ACPI Non-maskable Interrupt Sources ptr */ static ACPI_MADT_NMI_SOURCE *acpi_nmi_sp = NULL; @@ -385,7 +325,7 @@ apic_probe_common(char *modname) { uint32_t mpct_addr, ebda_start = 0, base_mem_end; caddr_t biosdatap; - caddr_t mpct; + caddr_t mpct = 0; caddr_t fptr; int i, mpct_size, mapsize, retval = PSM_FAILURE; ushort_t ebda_seg, base_mem_size; @@ -421,7 +361,7 @@ apic_probe_common(char *modname) */ biosdatap = psm_map_phys(0x400, 0x20, PROT_READ); if (!biosdatap) - return (retval); + goto apic_ret; fpsp = (struct apic_mpfps_hdr *)NULL; mapsize = MPFPS_RAM_WIN_LEN; /*LINTED: pointer cast may result in improper alignment */ @@ -469,20 +409,20 @@ apic_probe_common(char *modname) if (!(fpsp = apic_find_fps_sig(fptr, MPFPS_ROM_WIN_LEN))) { psm_unmap_phys(fptr, MPFPS_ROM_WIN_LEN); - return (retval); + goto apic_ret; } } } if (apic_checksum((caddr_t)fpsp, fpsp->mpfps_length * 16) != 0) { psm_unmap_phys(fptr, MPFPS_ROM_WIN_LEN); - return (retval); + goto apic_ret; } apic_spec_rev = fpsp->mpfps_spec_rev; if ((apic_spec_rev != 04) && (apic_spec_rev != 01)) { psm_unmap_phys(fptr, MPFPS_ROM_WIN_LEN); - return (retval); + goto apic_ret; } /* check IMCR is present or not */ @@ -491,7 +431,10 @@ apic_probe_common(char *modname) /* check default configuration (dual CPUs) */ if ((apic_defconf = fpsp->mpfps_featinfo1) != 0) { psm_unmap_phys(fptr, mapsize); - return (apic_handle_defconf()); + if ((retval = apic_handle_defconf()) != PSM_SUCCESS) + return (retval); + + goto apic_ret; } /* MP Configuration Table */ @@ -508,12 +451,12 @@ apic_probe_common(char *modname) hdrp = (struct apic_mp_cnf_hdr *)psm_map_phys(mpct_addr, sizeof (struct apic_mp_cnf_hdr), PROT_READ); if (!hdrp) - return (retval); + goto apic_ret; /* check mp configuration table signature PCMP */ if (hdrp->mpcnf_sig != 0x504d4350) { psm_unmap_phys((caddr_t)hdrp, sizeof (struct apic_mp_cnf_hdr)); - return (retval); + goto apic_ret; } mpct_size = (int)hdrp->mpcnf_tbl_length; @@ -523,7 +466,7 @@ apic_probe_common(char *modname) if ((retval == PSM_SUCCESS) && !apic_use_acpi_madt_only) { /* This is an ACPI machine No need for further checks */ - return (retval); + goto apic_ret; } /* @@ -533,12 +476,11 @@ apic_probe_common(char *modname) */ mpct = psm_map_phys(mpct_addr, mpct_size, PROT_READ); if (!mpct) - return (retval); + goto apic_ret; if (apic_checksum(mpct, mpct_size) != 0) goto apic_fail1; - /*LINTED: pointer cast may result in improper alignment */ hdrp = (struct apic_mp_cnf_hdr *)mpct; apicadr = (uint32_t *)mapin_apic((uint32_t)hdrp->mpcnf_local_apic, @@ -549,17 +491,36 @@ apic_probe_common(char *modname) /* Parse all information in the tables */ bypass_cpu_and_ioapics_in_mptables = (retval == PSM_SUCCESS); if (apic_parse_mpct(mpct, bypass_cpu_and_ioapics_in_mptables) == - PSM_SUCCESS) - return (PSM_SUCCESS); + PSM_SUCCESS) { + retval = PSM_SUCCESS; + goto apic_ret; + } + +apic_fail1: + psm_unmap_phys(mpct, mpct_size); + mpct = NULL; + +apic_ret: + if (retval == PSM_SUCCESS) { + extern int apic_ioapic_method_probe(); + + if 
((retval = apic_ioapic_method_probe()) == PSM_SUCCESS) + return (PSM_SUCCESS); + } for (i = 0; i < apic_io_max; i++) mapout_ioapic((caddr_t)apicioadr[i], APIC_IO_MEMLEN); - if (apic_cpus) + if (apic_cpus) { kmem_free(apic_cpus, apic_cpus_size); - if (apicadr) + apic_cpus = NULL; + } + if (apicadr) { mapout_apic((caddr_t)apicadr, APIC_LOCAL_MEMLEN); -apic_fail1: - psm_unmap_phys(mpct, mpct_size); + apicadr = NULL; + } + if (mpct) + psm_unmap_phys(mpct, mpct_size); + return (retval); } @@ -1036,11 +997,9 @@ apic_handle_defconf() apic_free_apic_cpus(); plat_dr_disable_cpu(); - /*LINTED: pointer cast may result in improper alignment */ - apicioadr[0] = mapin_ioapic(APIC_IO_ADDR, + apicioadr[0] = (void *)mapin_ioapic(APIC_IO_ADDR, APIC_IO_MEMLEN, PROT_READ | PROT_WRITE); - /*LINTED: pointer cast may result in improper alignment */ - apicadr = (uint32_t *)psm_map_phys(APIC_LOCAL_ADDR, + apicadr = (void *)psm_map_phys(APIC_LOCAL_ADDR, APIC_LOCAL_MEMLEN, PROT_READ); apic_cpus_size = 2 * sizeof (*apic_cpus); apic_cpus = (apic_cpus_info_t *) @@ -1080,7 +1039,6 @@ apic_handle_defconf() return (PSM_SUCCESS); apic_handle_defconf_fail: - apic_free_apic_cpus(); if (apicadr) mapout_apic((caddr_t)apicadr, APIC_LOCAL_MEMLEN); if (apicioadr[0]) @@ -1206,9 +1164,8 @@ apic_parse_mpct(caddr_t mpct, int bypass_cpus_and_ioapics) if (ioapicp->io_flags & IOAPIC_FLAGS_EN) { apic_io_id[apic_io_max] = ioapicp->io_apicid; apic_io_ver[apic_io_max] = ioapicp->io_version; - /*LINTED: pointer cast may result in improper alignment */ apicioadr[apic_io_max] = - mapin_ioapic( + (void *)mapin_ioapic( (uint32_t)ioapicp->io_apic_addr, APIC_IO_MEMLEN, PROT_READ | PROT_WRITE); @@ -1284,17 +1241,12 @@ apic_cpu_in_range(int cpu) return (B_TRUE); } -/* - * Must be called with interrupts disabled and the apic_ioapic_lock held. - */ processorid_t apic_get_next_bind_cpu(void) { int i, count; processorid_t cpuid = 0; - ASSERT(LOCK_HELD(&apic_ioapic_lock)); - for (count = 0; count < apic_nproc; count++) { if (apic_next_bind_cpu >= apic_nproc) { apic_next_bind_cpu = 0; @@ -1367,754 +1319,6 @@ apic_checksum(caddr_t bptr, int len) return ((int)cksum); } - -/* - * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable - * are also set to NULL. vector->irq is set to a value which cannot map - * to a real irq to show that it is free. - */ -void -apic_init_common() -{ - int i, j, indx; - int *iptr; - - /* - * Initialize apic_ipls from apic_vectortoipl. This array is - * used in apic_intr_enter to determine the IPL to use for the - * corresponding vector. On some systems, due to hardware errata - * and interrupt sharing, the IPL may not correspond to the IPL listed - * in apic_vectortoipl (see apic_addspl and apic_delspl). - */ - for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { - indx = i * APIC_VECTOR_PER_IPL; - - for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) - apic_ipls[indx] = apic_vectortoipl[i]; - } - - /* cpu 0 is always up (for now) */ - apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; - - iptr = (int *)&apic_irq_table[0]; - for (i = 0; i <= APIC_MAX_VECTOR; i++) { - apic_level_intr[i] = 0; - *iptr++ = NULL; - apic_vector_to_irq[i] = APIC_RESV_IRQ; - - /* These *must* be initted to B_TRUE! */ - apic_reprogram_info[i].done = B_TRUE; - apic_reprogram_info[i].irqp = NULL; - apic_reprogram_info[i].tries = 0; - apic_reprogram_info[i].bindcpu = 0; - } - - /* - * Allocate a dummy irq table entry for the reserved entry. 
- * This takes care of the race between removing an irq and - * clock detecting a CPU in that irq during interrupt load - * sampling. - */ - apic_irq_table[APIC_RESV_IRQ] = - kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP); - - mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); -} - -void -ioapic_init_intr(int mask_apic) -{ - int ioapic_ix; - struct intrspec ispec; - apic_irq_t *irqptr; - int i, j; - ulong_t iflag; - - LOCK_INIT_CLEAR(&apic_revector_lock); - LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); - - /* mask interrupt vectors */ - for (j = 0; j < apic_io_max && mask_apic; j++) { - int intin_max; - - ioapic_ix = j; - /* Bits 23-16 define the maximum redirection entries */ - intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16) - & 0xff; - for (i = 0; i <= intin_max; i++) - ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK); - } - - /* - * Hack alert: deal with ACPI SCI interrupt chicken/egg here - */ - if (apic_sci_vect > 0) { - /* - * acpica has already done add_avintr(); we just - * to finish the job by mimicing translate_irq() - * - * Fake up an intrspec and setup the tables - */ - ispec.intrspec_vec = apic_sci_vect; - ispec.intrspec_pri = SCI_IPL; - - if (apic_setup_irq_table(NULL, apic_sci_vect, NULL, - &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) { - cmn_err(CE_WARN, "!apic: SCI setup failed"); - return; - } - irqptr = apic_irq_table[apic_sci_vect]; - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - /* Program I/O APIC */ - (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - irqptr->airq_share++; - } - -#if !defined(__xpv) - /* - * Hack alert: deal with ACPI HPET interrupt chicken/egg here. - */ - if (apic_hpet_vect > 0) { - /* - * hpet has already done add_avintr(); we just need - * to finish the job by mimicing translate_irq() - * - * Fake up an intrspec and setup the tables - */ - ispec.intrspec_vec = apic_hpet_vect; - ispec.intrspec_pri = CBE_HIGH_PIL; - - if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL, - &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) { - cmn_err(CE_WARN, "!apic: HPET setup failed"); - return; - } - irqptr = apic_irq_table[apic_hpet_vect]; - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - /* Program I/O APIC */ - (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE); - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - irqptr->airq_share++; - } -#endif /* !defined(__xpv) */ -} - -/* - * Add mask bits to disable interrupt vector from happening - * at or above IPL. In addition, it should remove mask bits - * to enable interrupt vectors below the given IPL. - * - * Both add and delspl are complicated by the fact that different interrupts - * may share IRQs. This can happen in two ways. - * 1. The same H/W line is shared by more than 1 device - * 1a. with interrupts at different IPLs - * 1b. with interrupts at same IPL - * 2. We ran out of vectors at a given IPL and started sharing vectors. - * 1b and 2 should be handled gracefully, except for the fact some ISRs - * will get called often when no interrupt is pending for the device. - * For 1a, we just hope that the machine blows up with the person who - * set it up that way!. In the meantime, we handle it at the higher IPL. 
- */ -/*ARGSUSED*/ -int -apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl) -{ - uchar_t vector; - ulong_t iflag; - apic_irq_t *irqptr, *irqheadptr; - int irqindex; - - ASSERT(max_ipl <= UCHAR_MAX); - irqindex = IRQINDEX(irqno); - - if ((irqindex == -1) || (!apic_irq_table[irqindex])) - return (PSM_FAILURE); - - mutex_enter(&airq_mutex); - irqptr = irqheadptr = apic_irq_table[irqindex]; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x " - "vector=0x%x\n", (void *)irqptr->airq_dip, - irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); - - while (irqptr) { - if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) - break; - irqptr = irqptr->airq_next; - } - irqptr->airq_share++; - - mutex_exit(&airq_mutex); - - /* return if it is not hardware interrupt */ - if (irqptr->airq_mps_intr_index == RESERVE_INDEX) - return (PSM_SUCCESS); - - /* Or if there are more interupts at a higher IPL */ - if (ipl != max_ipl) - return (PSM_SUCCESS); - - /* - * if apic_picinit() has not been called yet, just return. - * At the end of apic_picinit(), we will call setup_io_intr(). - */ - - if (!apic_picinit_called) - return (PSM_SUCCESS); - - /* - * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate, - * return failure. Not very elegant, but then we hope the - * machine will blow up with ... - */ - if (irqptr->airq_ipl != max_ipl && - !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { - - vector = apic_allocate_vector(max_ipl, irqindex, 1); - if (vector == 0) { - irqptr->airq_share--; - return (PSM_FAILURE); - } - irqptr = irqheadptr; - apic_mark_vector(irqptr->airq_vector, vector); - while (irqptr) { - irqptr->airq_vector = vector; - irqptr->airq_ipl = (uchar_t)max_ipl; - /* - * reprogram irq being added and every one else - * who is not in the UNINIT state - */ - if ((VIRTIRQ(irqindex, irqptr->airq_share_id) == - irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) { - apic_record_rdt_entry(irqptr, irqindex); - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - (void) apic_setup_io_intr(irqptr, irqindex, - B_FALSE); - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - } - irqptr = irqptr->airq_next; - } - return (PSM_SUCCESS); - - } else if (irqptr->airq_ipl != max_ipl && - ioapic_mask_workaround[irqptr->airq_ioapicindex]) { - /* - * We cannot upgrade the vector, but we can change - * the IPL that this vector induces. - * - * Note that we subtract APIC_BASE_VECT from the vector - * here because this array is used in apic_intr_enter - * (no need to add APIC_BASE_VECT in that hot code - * path since we can do it in the rarely-executed path - * here). - */ - apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] = - (uchar_t)max_ipl; - - irqptr = irqheadptr; - while (irqptr) { - irqptr->airq_ipl = (uchar_t)max_ipl; - irqptr = irqptr->airq_next; - } - - return (PSM_SUCCESS); - } - - ASSERT(irqptr); - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE); - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - return (PSM_SUCCESS); -} - -/* - * Recompute mask bits for the given interrupt vector. - * If there is no interrupt servicing routine for this - * vector, this function should disable interrupt vector - * from happening at all IPLs. If there are still - * handlers using the given vector, this function should - * disable the given vector from happening below the lowest - * IPL of the remaining hadlers. 
- */ -/*ARGSUSED*/ -int -apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl) -{ - uchar_t vector; - uint32_t bind_cpu; - int intin, irqindex; - int ioapic_ix; - apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp; - ulong_t iflag; - - mutex_enter(&airq_mutex); - irqindex = IRQINDEX(irqno); - irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex]; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x " - "vector=0x%x\n", (void *)irqptr->airq_dip, - irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); - - while (irqptr) { - if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) - break; - preirqptr = irqptr; - irqptr = irqptr->airq_next; - } - ASSERT(irqptr); - - irqptr->airq_share--; - - mutex_exit(&airq_mutex); - - /* - * If there are more interrupts at a higher IPL, we don't need - * to disable anything. - */ - if (ipl < max_ipl) - return (PSM_SUCCESS); - - /* return if it is not hardware interrupt */ - if (irqptr->airq_mps_intr_index == RESERVE_INDEX) - return (PSM_SUCCESS); - - if (!apic_picinit_called) { - /* - * Clear irq_struct. If two devices shared an intpt - * line & 1 unloaded before picinit, we are hosed. But, then - * we hope the machine will survive. - */ - irqptr->airq_mps_intr_index = FREE_INDEX; - irqptr->airq_temp_cpu = IRQ_UNINIT; - apic_free_vector(irqptr->airq_vector); - return (PSM_SUCCESS); - } - /* - * Downgrade vector to new max_ipl if needed. If we cannot allocate, - * use old IPL. Not very elegant, but it should work. - */ - if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) && - !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { - apic_irq_t *irqp; - if (vector = apic_allocate_vector(max_ipl, irqno, 1)) { - apic_mark_vector(irqheadptr->airq_vector, vector); - irqp = irqheadptr; - while (irqp) { - irqp->airq_vector = vector; - irqp->airq_ipl = (uchar_t)max_ipl; - if (irqp->airq_temp_cpu != IRQ_UNINIT) { - apic_record_rdt_entry(irqp, irqindex); - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - (void) apic_setup_io_intr(irqp, - irqindex, B_FALSE); - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - } - irqp = irqp->airq_next; - } - } - - } else if (irqptr->airq_ipl != max_ipl && - max_ipl != PSM_INVALID_IPL && - ioapic_mask_workaround[irqptr->airq_ioapicindex]) { - - /* - * We cannot downgrade the IPL of the vector below the vector's - * hardware priority. If we did, it would be possible for a - * higher-priority hardware vector to interrupt a CPU running at an IPL - * lower than the hardware priority of the interrupting vector (but - * higher than the soft IPL of this IRQ). When this happens, we would - * then try to drop the IPL BELOW what it was (effectively dropping - * below base_spl) which would be potentially catastrophic. - * - * (e.g. Suppose the hardware vector associated with this IRQ is 0x40 - * (hardware IPL of 4). Further assume that the old IPL of this IRQ - * was 4, but the new IPL is 1. If we forced vector 0x40 to result in - * an IPL of 1, it would be possible for the processor to be executing - * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting - * the currently-executing ISR. When apic_intr_enter consults - * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1 - * so even though the processor was running at IPL 4, an IPL 1 - * interrupt will have interrupted it, which must not happen)). 
- * - * Effectively, this means that the hardware priority corresponding to - * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's - * hardware priority. - * - * (In the above example, then, after removal of the IPL 4 device's - * interrupt handler, the new IPL will continue to be 4 because the - * hardware priority that IPL 1 implies is lower than the hardware - * priority of the vector used.) - */ - /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */ - const int apic_ipls_index = irqptr->airq_vector - - APIC_BASE_VECT; - const int vect_inherent_hwpri = irqptr->airq_vector >> - APIC_IPL_SHIFT; - - /* - * If there are still devices using this IRQ, determine the - * new ipl to use. - */ - if (irqptr->airq_share) { - int vect_desired_hwpri, hwpri; - - ASSERT(max_ipl < MAXIPL); - vect_desired_hwpri = apic_ipltopri[max_ipl] >> - APIC_IPL_SHIFT; - - /* - * If the desired IPL's hardware priority is lower - * than that of the vector, use the hardware priority - * of the vector to determine the new IPL. - */ - hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ? - vect_inherent_hwpri : vect_desired_hwpri; - - /* - * Now, to get the right index for apic_vectortoipl, - * we need to subtract APIC_BASE_VECT from the - * hardware-vector-equivalent (in hwpri). Since hwpri - * is already shifted, we shift APIC_BASE_VECT before - * doing the subtraction. - */ - hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT); - - ASSERT(hwpri >= 0); - ASSERT(hwpri < MAXIPL); - max_ipl = apic_vectortoipl[hwpri]; - apic_ipls[apic_ipls_index] = max_ipl; - - irqp = irqheadptr; - while (irqp) { - irqp->airq_ipl = (uchar_t)max_ipl; - irqp = irqp->airq_next; - } - } else { - /* - * No more devices on this IRQ, so reset this vector's - * element in apic_ipls to the original IPL for this - * vector - */ - apic_ipls[apic_ipls_index] = - apic_vectortoipl[vect_inherent_hwpri]; - } - } - - /* - * If there are still active interrupts, we are done. - */ - if (irqptr->airq_share) - return (PSM_SUCCESS); - - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - if (irqptr->airq_mps_intr_index == MSI_INDEX) { - /* - * Disable the MSI vector - * Make sure we only disable on the last - * of the multi-MSI support - */ - if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { - apic_pci_msi_disable_mode(irqptr->airq_dip, - DDI_INTR_TYPE_MSI); - } - } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) { - /* - * Disable the MSI-X vector - * needs to clear its mask and addr/data for each MSI-X - */ - apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX, - irqptr->airq_origirq); - /* - * Make sure we only disable on the last MSI-X - */ - if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { - apic_pci_msi_disable_mode(irqptr->airq_dip, - DDI_INTR_TYPE_MSIX); - } - } else { - /* - * The assumption here is that this is safe, even for - * systems with IOAPICs that suffer from the hardware - * erratum because all devices have been quiesced before - * they unregister their interrupt handlers. If that - * assumption turns out to be false, this mask operation - * can induce the same erratum result we're trying to - * avoid. - */ - ioapic_ix = irqptr->airq_ioapicindex; - intin = irqptr->airq_intin_no; - ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK); - } - -#if !defined(__xpv) - apic_vt_ops->apic_intrmap_free_entry(irqptr); -#endif - - /* - * This irq entry is the only one in the chain. 
- */ - if (irqheadptr->airq_next == NULL) { - ASSERT(irqheadptr == irqptr); - bind_cpu = irqptr->airq_temp_cpu; - if (((uint32_t)bind_cpu != IRQ_UNBOUND) && - ((uint32_t)bind_cpu != IRQ_UNINIT)) { - ASSERT(apic_cpu_in_range(bind_cpu)); - if (bind_cpu & IRQ_USER_BOUND) { - /* If hardbound, temp_cpu == cpu */ - bind_cpu &= ~IRQ_USER_BOUND; - apic_cpus[bind_cpu].aci_bound--; - } else - apic_cpus[bind_cpu].aci_temp_bound--; - } - irqptr->airq_temp_cpu = IRQ_UNINIT; - irqptr->airq_mps_intr_index = FREE_INDEX; - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - apic_free_vector(irqptr->airq_vector); - return (PSM_SUCCESS); - } - - /* - * If we get here, we are sharing the vector and there are more than - * one active irq entries in the chain. - */ - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - mutex_enter(&airq_mutex); - /* Remove the irq entry from the chain */ - if (irqptr == irqheadptr) { /* The irq entry is at the head */ - apic_irq_table[irqindex] = irqptr->airq_next; - } else { - preirqptr->airq_next = irqptr->airq_next; - } - /* Free the irq entry */ - kmem_free(irqptr, sizeof (apic_irq_t)); - mutex_exit(&airq_mutex); - - return (PSM_SUCCESS); -} - -/* - * apic_introp_xlate() replaces apic_translate_irq() and is - * called only from apic_intr_ops(). With the new ADII framework, - * the priority can no longer be retrieved through i_ddi_get_intrspec(). - * It has to be passed in from the caller. - * - * Return value: - * Success: irqno for the given device - * Failure: -1 - */ -int -apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type) -{ - char dev_type[16]; - int dev_len, pci_irq, newirq, bustype, devid, busid, i; - int irqno = ispec->intrspec_vec; - ddi_acc_handle_t cfg_handle; - uchar_t ipin; - struct apic_io_intr *intrp; - iflag_t intr_flag; - ACPI_SUBTABLE_HEADER *hp; - ACPI_MADT_INTERRUPT_OVERRIDE *isop; - apic_irq_t *airqp; - int parent_is_pci_or_pciex = 0; - int child_is_pciex = 0; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s " - "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type, - irqno)); - - dev_len = sizeof (dev_type); - if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), - DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, - &dev_len) == DDI_PROP_SUCCESS) { - if ((strcmp(dev_type, "pci") == 0) || - (strcmp(dev_type, "pciex") == 0)) - parent_is_pci_or_pciex = 1; - } - - if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, - DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, - &dev_len) == DDI_PROP_SUCCESS) { - if (strstr(dev_type, "pciex")) - child_is_pciex = 1; - } - - - if (DDI_INTR_IS_MSI_OR_MSIX(type)) { - if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) { - airqp->airq_iflag.bustype = - child_is_pciex ? 
BUS_PCIE : BUS_PCI; - return (apic_vector_to_irq[airqp->airq_vector]); - } - return (apic_setup_irq_table(dip, irqno, NULL, ispec, - NULL, type)); - } - - bustype = 0; - - /* check if we have already translated this irq */ - mutex_enter(&airq_mutex); - newirq = apic_min_device_irq; - for (; newirq <= apic_max_device_irq; newirq++) { - airqp = apic_irq_table[newirq]; - while (airqp) { - if ((airqp->airq_dip == dip) && - (airqp->airq_origirq == irqno) && - (airqp->airq_mps_intr_index != FREE_INDEX)) { - - mutex_exit(&airq_mutex); - return (VIRTIRQ(newirq, airqp->airq_share_id)); - } - airqp = airqp->airq_next; - } - } - mutex_exit(&airq_mutex); - - if (apic_defconf) - goto defconf; - - if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) - goto nonpci; - - if (parent_is_pci_or_pciex) { - /* pci device */ - if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) - goto nonpci; - if (busid == 0 && apic_pci_bus_total == 1) - busid = (int)apic_single_pci_busid; - - if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) - return (-1); - ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; - pci_config_teardown(&cfg_handle); - if (apic_enable_acpi && !apic_use_acpi_madt_only) { - if (apic_acpi_translate_pci_irq(dip, busid, devid, - ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) - return (-1); - - intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI; - return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, - &intr_flag, type)); - } else { - pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); - if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) - == NULL) { - if ((pci_irq = apic_handle_pci_pci_bridge(dip, - devid, ipin, &intrp)) == -1) - return (-1); - } - return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, - NULL, type)); - } - } else if (strcmp(dev_type, "isa") == 0) - bustype = BUS_ISA; - else if (strcmp(dev_type, "eisa") == 0) - bustype = BUS_EISA; - -nonpci: - if (apic_enable_acpi && !apic_use_acpi_madt_only) { - /* search iso entries first */ - if (acpi_iso_cnt != 0) { - hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; - i = 0; - while (i < acpi_iso_cnt) { - if (hp->Type == - ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { - isop = - (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; - if (isop->Bus == 0 && - isop->SourceIrq == irqno) { - newirq = isop->GlobalIrq; - intr_flag.intr_po = - isop->IntiFlags & - ACPI_MADT_POLARITY_MASK; - intr_flag.intr_el = - (isop->IntiFlags & - ACPI_MADT_TRIGGER_MASK) - >> 2; - intr_flag.bustype = BUS_ISA; - - return (apic_setup_irq_table( - dip, newirq, NULL, ispec, - &intr_flag, type)); - - } - i++; - } - hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + - hp->Length); - } - } - intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; - intr_flag.intr_el = INTR_EL_EDGE; - intr_flag.bustype = BUS_ISA; - return (apic_setup_irq_table(dip, irqno, NULL, ispec, - &intr_flag, type)); - } else { - if (bustype == 0) - bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; - for (i = 0; i < 2; i++) { - if (((busid = apic_find_bus_id(bustype)) != -1) && - ((intrp = apic_find_io_intr_w_busid(irqno, busid)) - != NULL)) { - if ((newirq = apic_setup_irq_table(dip, irqno, - intrp, ispec, NULL, type)) != -1) { - return (newirq); - } - goto defconf; - } - bustype = (bustype == BUS_EISA) ? 
BUS_ISA : BUS_EISA; - } - } - -/* MPS default configuration */ -defconf: - newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); - if (newirq == -1) - return (-1); - ASSERT(IRQINDEX(newirq) == irqno); - ASSERT(apic_irq_table[irqno]); - return (newirq); -} - - - - - - /* * On machines with PCI-PCI bridges, a device behind a PCI-PCI bridge * needs special handling. We may need to chase up the device tree, @@ -2125,7 +1329,7 @@ defconf: * We handle both cases in the search below. */ /* this is the non-acpi version */ -static int +int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, int child_ipin, struct apic_io_intr **intrp) { @@ -2176,17 +1380,14 @@ apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, int child_ipin, /*LINTED: function will not fall off the bottom */ } - - - -static uchar_t +uchar_t acpi_find_ioapic(int irq) { int i; for (i = 0; i < apic_io_max; i++) { if (irq >= apic_io_vectbase[i] && irq <= apic_io_vectend[i]) - return (i); + return ((uchar_t)i); } return (0xFF); /* shouldn't happen */ } @@ -2232,407 +1433,7 @@ acpi_intr_compatible(iflag_t iflag1, iflag_t iflag2) return (0); } -/* - * Attempt to share vector with someone else - */ -static int -apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, - uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) -{ -#ifdef DEBUG - apic_irq_t *tmpirqp = NULL; -#endif /* DEBUG */ - apic_irq_t *irqptr, dummyirq; - int newirq, chosen_irq = -1, share = 127; - int lowest, highest, i; - uchar_t share_id; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " - "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); - - highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; - lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; - - if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ - lowest -= APIC_VECTOR_PER_IPL; - dummyirq.airq_mps_intr_index = intr_index; - dummyirq.airq_ioapicindex = ioapicindex; - dummyirq.airq_intin_no = ipin; - if (intr_flagp) - dummyirq.airq_iflag = *intr_flagp; - apic_record_rdt_entry(&dummyirq, irqno); - for (i = lowest; i <= highest; i++) { - newirq = apic_vector_to_irq[i]; - if (newirq == APIC_RESV_IRQ) - continue; - irqptr = apic_irq_table[newirq]; - - if ((dummyirq.airq_rdt_entry & 0xFF00) != - (irqptr->airq_rdt_entry & 0xFF00)) - /* not compatible */ - continue; - - if (irqptr->airq_share < share) { - share = irqptr->airq_share; - chosen_irq = newirq; - } - } - if (chosen_irq != -1) { - /* - * Assign a share id which is free or which is larger - * than the largest one. 
- */ - share_id = 1; - mutex_enter(&airq_mutex); - irqptr = apic_irq_table[chosen_irq]; - while (irqptr) { - if (irqptr->airq_mps_intr_index == FREE_INDEX) { - share_id = irqptr->airq_share_id; - break; - } - if (share_id <= irqptr->airq_share_id) - share_id = irqptr->airq_share_id + 1; -#ifdef DEBUG - tmpirqp = irqptr; -#endif /* DEBUG */ - irqptr = irqptr->airq_next; - } - if (!irqptr) { - irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); - irqptr->airq_temp_cpu = IRQ_UNINIT; - irqptr->airq_next = - apic_irq_table[chosen_irq]->airq_next; - apic_irq_table[chosen_irq]->airq_next = irqptr; -#ifdef DEBUG - tmpirqp = apic_irq_table[chosen_irq]; -#endif /* DEBUG */ - } - irqptr->airq_mps_intr_index = intr_index; - irqptr->airq_ioapicindex = ioapicindex; - irqptr->airq_intin_no = ipin; - if (intr_flagp) - irqptr->airq_iflag = *intr_flagp; - irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; - irqptr->airq_share_id = share_id; - apic_record_rdt_entry(irqptr, irqno); - *irqptrp = irqptr; -#ifdef DEBUG - /* shuffle the pointers to test apic_delspl path */ - if (tmpirqp) { - tmpirqp->airq_next = irqptr->airq_next; - irqptr->airq_next = apic_irq_table[chosen_irq]; - apic_irq_table[chosen_irq] = irqptr; - } -#endif /* DEBUG */ - mutex_exit(&airq_mutex); - return (VIRTIRQ(chosen_irq, share_id)); - } - return (-1); -} - -/* - * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry - * is used already, we will try to allocate a new irqno. - * - * Return value: - * Success: irqno - * Failure: -1 - */ -static int -apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp, - struct intrspec *ispec, iflag_t *intr_flagp, int type) -{ - int origirq = ispec->intrspec_vec; - uchar_t ipl = ispec->intrspec_pri; - int newirq, intr_index; - uchar_t ipin, ioapic, ioapicindex, vector; - apic_irq_t *irqptr; - major_t major; - dev_info_t *sdip; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d " - "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq)); - - ASSERT(ispec != NULL); - - major = (dip != NULL) ? ddi_driver_major(dip) : 0; - - if (DDI_INTR_IS_MSI_OR_MSIX(type)) { - /* MSI/X doesn't need to setup ioapic stuffs */ - ioapicindex = 0xff; - ioapic = 0xff; - ipin = (uchar_t)0xff; - intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX : - MSIX_INDEX; - mutex_enter(&airq_mutex); - if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) { - mutex_exit(&airq_mutex); - /* need an irq for MSI/X to index into autovect[] */ - cmn_err(CE_WARN, "No interrupt irq: %s instance %d", - ddi_get_name(dip), ddi_get_instance(dip)); - return (-1); - } - mutex_exit(&airq_mutex); - - } else if (intrp != NULL) { - intr_index = (int)(intrp - apic_io_intrp); - ioapic = intrp->intr_destid; - ipin = intrp->intr_destintin; - /* Find ioapicindex. If destid was ALL, we will exit with 0. 
*/ - for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--) - if (apic_io_id[ioapicindex] == ioapic) - break; - ASSERT((ioapic == apic_io_id[ioapicindex]) || - (ioapic == INTR_ALL_APIC)); - - /* check whether this intin# has been used by another irqno */ - if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) { - return (newirq); - } - - } else if (intr_flagp != NULL) { - /* ACPI case */ - intr_index = ACPI_INDEX; - ioapicindex = acpi_find_ioapic(irqno); - ASSERT(ioapicindex != 0xFF); - ioapic = apic_io_id[ioapicindex]; - ipin = irqno - apic_io_vectbase[ioapicindex]; - if (apic_irq_table[irqno] && - apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) { - ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin && - apic_irq_table[irqno]->airq_ioapicindex == - ioapicindex); - return (irqno); - } - - } else { - /* default configuration */ - ioapicindex = 0; - ioapic = apic_io_id[ioapicindex]; - ipin = (uchar_t)irqno; - intr_index = DEFAULT_INDEX; - } - - if (ispec == NULL) { - APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n", - irqno)); - } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) { - if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index, - ipl, ioapicindex, ipin, &irqptr)) != -1) { - irqptr->airq_ipl = ipl; - irqptr->airq_origirq = (uchar_t)origirq; - irqptr->airq_dip = dip; - irqptr->airq_major = major; - sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip; - /* This is OK to do really */ - if (sdip == NULL) { - cmn_err(CE_WARN, "Sharing vectors: %s" - " instance %d and SCI", - ddi_get_name(dip), ddi_get_instance(dip)); - } else { - cmn_err(CE_WARN, "Sharing vectors: %s" - " instance %d and %s instance %d", - ddi_get_name(sdip), ddi_get_instance(sdip), - ddi_get_name(dip), ddi_get_instance(dip)); - } - return (newirq); - } - /* try high priority allocation now that share has failed */ - if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) { - cmn_err(CE_WARN, "No interrupt vector: %s instance %d", - ddi_get_name(dip), ddi_get_instance(dip)); - return (-1); - } - } - - mutex_enter(&airq_mutex); - if (apic_irq_table[irqno] == NULL) { - irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); - irqptr->airq_temp_cpu = IRQ_UNINIT; - apic_irq_table[irqno] = irqptr; - } else { - irqptr = apic_irq_table[irqno]; - if (irqptr->airq_mps_intr_index != FREE_INDEX) { - /* - * The slot is used by another irqno, so allocate - * a free irqno for this interrupt - */ - newirq = apic_allocate_irq(apic_first_avail_irq); - if (newirq == -1) { - mutex_exit(&airq_mutex); - return (-1); - } - irqno = newirq; - irqptr = apic_irq_table[irqno]; - if (irqptr == NULL) { - irqptr = kmem_zalloc(sizeof (apic_irq_t), - KM_SLEEP); - irqptr->airq_temp_cpu = IRQ_UNINIT; - apic_irq_table[irqno] = irqptr; - } - vector = apic_modify_vector(vector, newirq); - } - } - apic_max_device_irq = max(irqno, apic_max_device_irq); - apic_min_device_irq = min(irqno, apic_min_device_irq); - mutex_exit(&airq_mutex); - irqptr->airq_ioapicindex = ioapicindex; - irqptr->airq_intin_no = ipin; - irqptr->airq_ipl = ipl; - irqptr->airq_vector = vector; - irqptr->airq_origirq = (uchar_t)origirq; - irqptr->airq_share_id = 0; - irqptr->airq_mps_intr_index = (short)intr_index; - irqptr->airq_dip = dip; - irqptr->airq_major = major; - irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin); - if (intr_flagp) - irqptr->airq_iflag = *intr_flagp; - - if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { - /* setup I/O APIC entry for non-MSI/X interrupts */ - apic_record_rdt_entry(irqptr, irqno); - } - return (irqno); 
-} - -/* - * return the cpu to which this intr should be bound. - * Check properties or any other mechanism to see if user wants it - * bound to a specific CPU. If so, return the cpu id with high bit set. - * If not, use the policy to choose a cpu and return the id. - */ -uint32_t -apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) -{ - int instance, instno, prop_len, bind_cpu, count; - uint_t i, rc; - uint32_t cpu; - major_t major; - char *name, *drv_name, *prop_val, *cptr; - char prop_name[32]; - ulong_t iflag; - - - if (apic_intr_policy == INTR_LOWEST_PRIORITY) - return (IRQ_UNBOUND); - - if (apic_nproc == 1) - return (0); - - drv_name = NULL; - rc = DDI_PROP_NOT_FOUND; - major = (major_t)-1; - if (dip != NULL) { - name = ddi_get_name(dip); - major = ddi_name_to_major(name); - drv_name = ddi_major_to_name(major); - instance = ddi_get_instance(dip); - if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { - i = apic_min_device_irq; - for (; i <= apic_max_device_irq; i++) { - - if ((i == irq) || (apic_irq_table[i] == NULL) || - (apic_irq_table[i]->airq_mps_intr_index - == FREE_INDEX)) - continue; - - if ((apic_irq_table[i]->airq_major == major) && - (!(apic_irq_table[i]->airq_cpu & - IRQ_USER_BOUND))) { - - cpu = apic_irq_table[i]->airq_cpu; - - cmn_err(CE_CONT, - "!%s: %s (%s) instance #%d " - "irq 0x%x vector 0x%x ioapic 0x%x " - "intin 0x%x is bound to cpu %d\n", - psm_name, - name, drv_name, instance, irq, - apic_irq_table[irq]->airq_vector, - ioapicid, intin, cpu); - return (cpu); - } - } - } - /* - * search for "drvname"_intpt_bind_cpus property first, the - * syntax of the property should be "a[,b,c,...]" where - * instance 0 binds to cpu a, instance 1 binds to cpu b, - * instance 3 binds to cpu c... - * ddi_getlongprop() will search /option first, then / - * if "drvname"_intpt_bind_cpus doesn't exist, then find - * intpt_bind_cpus property. The syntax is the same, and - * it applies to all the devices if its "drvname" specific - * property doesn't exist - */ - (void) strcpy(prop_name, drv_name); - (void) strcat(prop_name, "_intpt_bind_cpus"); - rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, - (caddr_t)&prop_val, &prop_len); - if (rc != DDI_PROP_SUCCESS) { - rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, - "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); - } - } - if (rc == DDI_PROP_SUCCESS) { - for (i = count = 0; i < (prop_len - 1); i++) - if (prop_val[i] == ',') - count++; - if (prop_val[i-1] != ',') - count++; - /* - * if somehow the binding instances defined in the - * property are not enough for this instno., then - * reuse the pattern for the next instance until - * it reaches the requested instno - */ - instno = instance % count; - i = 0; - cptr = prop_val; - while (i < instno) - if (*cptr++ == ',') - i++; - bind_cpu = stoi(&cptr); - kmem_free(prop_val, prop_len); - /* if specific CPU is bogus, then default to next cpu */ - if (!apic_cpu_in_range(bind_cpu)) { - cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", - psm_name, prop_name, prop_val, bind_cpu); - rc = DDI_PROP_NOT_FOUND; - } else { - /* indicate that we are bound at user request */ - bind_cpu |= IRQ_USER_BOUND; - } - /* - * no need to check apic_cpus[].aci_status, if specific CPU is - * not up, then post_cpu_start will handle it. 
- */ - } - if (rc != DDI_PROP_SUCCESS) { - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - bind_cpu = apic_get_next_bind_cpu(); - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - } - - if (drv_name != NULL) - cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " - "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", - psm_name, name, drv_name, instance, irq, - apic_irq_table[irq]->airq_vector, ioapicid, intin, - bind_cpu & ~IRQ_USER_BOUND); - else - cmn_err(CE_CONT, "!%s: irq 0x%x " - "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", - psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, - intin, bind_cpu & ~IRQ_USER_BOUND); - - return ((uint32_t)bind_cpu); -} - -static struct apic_io_intr * +struct apic_io_intr * apic_find_io_intr_w_busid(int irqno, int busid) { struct apic_io_intr *intrp; @@ -2709,7 +1510,7 @@ apic_find_bus(int busid) return (0); } -static int +int apic_find_bus_id(int bustype) { struct apic_bus *busp; @@ -2749,7 +1550,7 @@ apic_find_io_intr(int irqno) * Check if the given ioapicindex intin combination has already been assigned * an irq. If so return irqno. Else -1 */ -static int +int apic_find_intin(uchar_t ioapic, uchar_t intin) { apic_irq_t *irqptr; @@ -2832,118 +1633,11 @@ apic_find_free_irq(int start, int end) return (-1); } - -/* - * Mark vector as being in the process of being deleted. Interrupts - * may still come in on some CPU. The moment an interrupt comes with - * the new vector, we know we can free the old one. Called only from - * addspl and delspl with interrupts disabled. Because an interrupt - * can be shared, but no interrupt from either device may come in, - * we also use a timeout mechanism, which we arbitrarily set to - * apic_revector_timeout microseconds. - */ -static void -apic_mark_vector(uchar_t oldvector, uchar_t newvector) -{ - ulong_t iflag; - - iflag = intr_clear(); - lock_set(&apic_revector_lock); - if (!apic_oldvec_to_newvec) { - apic_oldvec_to_newvec = - kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, - KM_NOSLEEP); - - if (!apic_oldvec_to_newvec) { - /* - * This failure is not catastrophic. - * But, the oldvec will never be freed. - */ - apic_error |= APIC_ERR_MARK_VECTOR_FAIL; - lock_clear(&apic_revector_lock); - intr_restore(iflag); - return; - } - apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR]; - } - - /* See if we already did this for drivers which do double addintrs */ - if (apic_oldvec_to_newvec[oldvector] != newvector) { - apic_oldvec_to_newvec[oldvector] = newvector; - apic_newvec_to_oldvec[newvector] = oldvector; - apic_revector_pending++; - } - lock_clear(&apic_revector_lock); - intr_restore(iflag); - (void) timeout(apic_xlate_vector_free_timeout_handler, - (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout)); -} - -/* - * xlate_vector is called from intr_enter if revector_pending is set. - * It will xlate it if needed and mark the old vector as free. - */ -uchar_t -apic_xlate_vector(uchar_t vector) -{ - uchar_t newvector, oldvector = 0; - - lock_set(&apic_revector_lock); - /* Do we really need to do this ? */ - if (!apic_revector_pending) { - lock_clear(&apic_revector_lock); - return (vector); - } - if ((newvector = apic_oldvec_to_newvec[vector]) != 0) - oldvector = vector; - else { - /* - * The incoming vector is new . 
See if a stale entry is - * remaining - */ - if ((oldvector = apic_newvec_to_oldvec[vector]) != 0) - newvector = vector; - } - - if (oldvector) { - apic_revector_pending--; - apic_oldvec_to_newvec[oldvector] = 0; - apic_newvec_to_oldvec[newvector] = 0; - apic_free_vector(oldvector); - lock_clear(&apic_revector_lock); - /* There could have been more than one reprogramming! */ - return (apic_xlate_vector(newvector)); - } - lock_clear(&apic_revector_lock); - return (vector); -} - -void -apic_xlate_vector_free_timeout_handler(void *arg) -{ - ulong_t iflag; - uchar_t oldvector, newvector; - - oldvector = (uchar_t)(uintptr_t)arg; - iflag = intr_clear(); - lock_set(&apic_revector_lock); - if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) { - apic_free_vector(oldvector); - apic_oldvec_to_newvec[oldvector] = 0; - apic_newvec_to_oldvec[newvector] = 0; - apic_revector_pending--; - } - - lock_clear(&apic_revector_lock); - intr_restore(iflag); -} - - /* * compute the polarity, trigger mode and vector for programming into * the I/O apic and record in airq_rdt_entry. */ -static void +void apic_record_rdt_entry(apic_irq_t *irqptr, int irq) { int ioapicindex, bus_type, vector; @@ -3016,778 +1710,15 @@ apic_record_rdt_entry(apic_irq_t *irqptr, int irq) io_po = 0; if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) - printf("setio: ioapic=%x intin=%x level=%x po=%x vector=%x\n", - ioapicindex, irqptr->airq_intin_no, level, io_po, vector); + prom_printf("setio: ioapic=0x%x intin=0x%x level=0x%x po=0x%x " + "vector=0x%x cpu=0x%x\n\n", ioapicindex, + irqptr->airq_intin_no, level, io_po, vector, + irqptr->airq_cpu); irqptr->airq_rdt_entry = level|io_po|vector; } -/* - * Bind interrupt corresponding to irq_ptr to bind_cpu. - * Must be called with interrupts disabled and apic_ioapic_lock held - */ -int -apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, - struct ioapic_reprogram_data *drep) -{ - int ioapicindex, intin_no; - uint32_t airq_temp_cpu; - apic_cpus_info_t *cpu_infop; - uint32_t rdt_entry; - int which_irq; - ioapic_rdt_t irdt; - - which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; - - intin_no = irq_ptr->airq_intin_no; - ioapicindex = irq_ptr->airq_ioapicindex; - airq_temp_cpu = irq_ptr->airq_temp_cpu; - if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { - if (airq_temp_cpu & IRQ_USER_BOUND) - /* Mask off high bit so it can be used as array index */ - airq_temp_cpu &= ~IRQ_USER_BOUND; - - ASSERT(apic_cpu_in_range(airq_temp_cpu)); - } - - /* - * Can't bind to a CPU that's not accepting interrupts: - */ - cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; - if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) - return (1); - - /* - * If we are about to change the interrupt vector for this interrupt, - * and this interrupt is level-triggered, attached to an IOAPIC, - * has been delivered to a CPU and that CPU has not handled it - * yet, we cannot reprogram the IOAPIC now. - */ - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { - - rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, - intin_no); - - if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && - apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, - bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { - - return (0); - } - - /* - * NOTE: We do not unmask the RDT here, as an interrupt MAY - * still come in before we have a chance to reprogram it below. - * The reprogramming below will simultaneously change and - * unmask the RDT entry. 
- */ - - if ((uint32_t)bind_cpu == IRQ_UNBOUND) { - irdt.ir_lo = AV_LDEST | AV_LOPRI | - irq_ptr->airq_rdt_entry; -#if !defined(__xpv) - irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET; - - apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr); - apic_vt_ops->apic_intrmap_map_entry( - irq_ptr, (void *)&irdt); - apic_vt_ops->apic_intrmap_record_rdt(irq_ptr, &irdt); - - /* Write the RDT entry -- no specific CPU binding */ - WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, - irdt.ir_hi | AV_TOALL); -#else - WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, - AV_TOALL); -#endif - if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != - IRQ_UNBOUND) - apic_cpus[airq_temp_cpu].aci_temp_bound--; - - /* - * Write the vector, trigger, and polarity portion of - * the RDT - */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, - irdt.ir_lo); - - irq_ptr->airq_temp_cpu = IRQ_UNBOUND; - return (0); - } - } - - if (bind_cpu & IRQ_USER_BOUND) { - cpu_infop->aci_bound++; - } else { - cpu_infop->aci_temp_bound++; - } - ASSERT(apic_cpu_in_range(bind_cpu)); - - if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { - apic_cpus[airq_temp_cpu].aci_temp_bound--; - } - if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { - - irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; - irdt.ir_hi = cpu_infop->aci_local_id; - -#if !defined(__xpv) - apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr); - apic_vt_ops->apic_intrmap_map_entry(irq_ptr, (void *)&irdt); - apic_vt_ops->apic_intrmap_record_rdt(irq_ptr, &irdt); - - /* Write the RDT entry -- bind to a specific CPU: */ - WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, - irdt.ir_hi); -#else - /* Write the RDT entry -- bind to a specific CPU: */ - WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, - irdt.ir_hi << APIC_ID_BIT_OFFSET); -#endif - /* Write the vector, trigger, and polarity portion of the RDT */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, - irdt.ir_lo); - - } else { - int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? - DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; - if (type == DDI_INTR_TYPE_MSI) { - if (irq_ptr->airq_ioapicindex == - irq_ptr->airq_origirq) { - /* first one */ - DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " - "apic_pci_msi_enable_vector\n")); - apic_pci_msi_enable_vector(irq_ptr, - type, which_irq, irq_ptr->airq_vector, - irq_ptr->airq_intin_no, - cpu_infop->aci_local_id); - } - if ((irq_ptr->airq_ioapicindex + - irq_ptr->airq_intin_no - 1) == - irq_ptr->airq_origirq) { /* last one */ - DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " - "apic_pci_msi_enable_mode\n")); - apic_pci_msi_enable_mode(irq_ptr->airq_dip, - type, which_irq); - } - } else { /* MSI-X */ - apic_pci_msi_enable_vector(irq_ptr, type, - irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, - cpu_infop->aci_local_id); - apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, - irq_ptr->airq_origirq); - } - } - irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; - apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); - return (0); -} - -static void -apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) -{ - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) - & AV_REMOTE_IRR) != 0) { - /* - * Trying to clear the bit through normal - * channels has failed. So as a last-ditch - * effort, try to set the trigger mode to - * edge, then to level. This has been - * observed to work on many systems. 
- */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no, - READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no) & ~AV_LEVEL); - - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no, - READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no) | AV_LEVEL); - - /* - * If the bit's STILL set, this interrupt may - * be hosed. - */ - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no) & AV_REMOTE_IRR) != 0) { - - prom_printf("%s: Remote IRR still " - "not clear for IOAPIC %d intin %d.\n" - "\tInterrupts to this pin may cease " - "functioning.\n", psm_name, ioapic_ix, - intin_no); -#ifdef DEBUG - apic_last_ditch_reprogram_failures++; -#endif - } - } -} - -/* - * This function is protected by apic_ioapic_lock coupled with the - * fact that interrupts are disabled. - */ -static void -delete_defer_repro_ent(int which_irq) -{ - ASSERT(which_irq >= 0); - ASSERT(which_irq <= 255); - - if (apic_reprogram_info[which_irq].done) - return; - - apic_reprogram_info[which_irq].done = B_TRUE; - -#ifdef DEBUG - apic_defer_repro_total_retries += - apic_reprogram_info[which_irq].tries; - - apic_defer_repro_successes++; -#endif - - if (--apic_reprogram_outstanding == 0) { - - setlvlx = psm_intr_exit_fn(); - } -} - - -/* - * Interrupts must be disabled during this function to prevent - * self-deadlock. Interrupts are disabled because this function - * is called from apic_check_stuck_interrupt(), which is called - * from apic_rebind(), which requires its caller to disable interrupts. - */ -static void -add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) -{ - ASSERT(which_irq >= 0); - ASSERT(which_irq <= 255); - - /* - * On the off-chance that there's already a deferred - * reprogramming on this irq, check, and if so, just update the - * CPU and irq pointer to which the interrupt is targeted, then return. - */ - if (!apic_reprogram_info[which_irq].done) { - apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; - apic_reprogram_info[which_irq].irqp = irq_ptr; - return; - } - - apic_reprogram_info[which_irq].irqp = irq_ptr; - apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; - apic_reprogram_info[which_irq].tries = 0; - /* - * This must be the last thing set, since we're not - * grabbing any locks, apic_try_deferred_reprogram() will - * make its decision about using this entry iff done - * is false. - */ - apic_reprogram_info[which_irq].done = B_FALSE; - - /* - * If there were previously no deferred reprogrammings, change - * setlvlx to call apic_try_deferred_reprogram() - */ - if (++apic_reprogram_outstanding == 1) { - - setlvlx = apic_try_deferred_reprogram; - } -} - -static void -apic_try_deferred_reprogram(int prev_ipl, int irq) -{ - int reproirq; - ulong_t iflag; - struct ioapic_reprogram_data *drep; - - (*psm_intr_exit_fn())(prev_ipl, irq); - - if (!lock_try(&apic_defer_reprogram_lock)) { - return; - } - - /* - * Acquire the apic_ioapic_lock so that any other operations that - * may affect the apic_reprogram_info state are serialized. - * It's still possible for the last deferred reprogramming to clear - * between the time we entered this function and the time we get to - * the for loop below. In that case, *setlvlx will have been set - * back to *_intr_exit and drep will be NULL. (There's no way to - * stop that from happening -- we would need to grab a lock before - * calling *setlvlx, which is neither realistic nor prudent). - */ - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - /* - * For each deferred RDT entry, try to reprogram it now. 
Note that - * there is no lock acquisition to read apic_reprogram_info because - * '.done' is set only after the other fields in the structure are set. - */ - - drep = NULL; - for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { - if (apic_reprogram_info[reproirq].done == B_FALSE) { - drep = &apic_reprogram_info[reproirq]; - break; - } - } - - /* - * Either we found a deferred action to perform, or - * we entered this function spuriously, after *setlvlx - * was restored to point to *_intr_exit. Any other - * permutation is invalid. - */ - ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); - - /* - * Though we can't really do anything about errors - * at this point, keep track of them for reporting. - * Note that it is very possible for apic_setup_io_intr - * to re-register this very timeout if the Remote IRR bit - * has not yet cleared. - */ - -#ifdef DEBUG - if (drep != NULL) { - if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { - apic_deferred_setup_failures++; - } - } else { - apic_deferred_spurious_enters++; - } -#else - if (drep != NULL) - (void) apic_setup_io_intr(drep, reproirq, B_TRUE); -#endif - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - lock_clear(&apic_defer_reprogram_lock); -} - -static void -apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) -{ - int waited; - - /* - * Wait for the delivery pending bit to clear. - */ - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & - (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { - - /* - * If we're still waiting on the delivery of this interrupt, - * continue to wait here until it is delivered (this should be - * a very small amount of time, but include a timeout just in - * case). - */ - for (waited = 0; waited < apic_max_reps_clear_pending; - waited++) { - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no) & AV_PENDING) == 0) { - break; - } - } - } -} - - -/* - * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR - * bit set. Calls functions that modify the function that setlvlx points to, - * so that the reprogramming can be retried very shortly. - * - * This function will mask the RDT entry if the interrupt is level-triggered. - * (The caller is responsible for unmasking the RDT entry.) - * - * Returns non-zero if the caller should defer IOAPIC reprogramming. - */ -static int -apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, - int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, - struct ioapic_reprogram_data *drep) -{ - int32_t rdt_entry; - int waited; - int reps = 0; - - /* - * Wait for the delivery pending bit to clear. - */ - do { - ++reps; - - apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); - - /* - * Mask the RDT entry, but only if it's a level-triggered - * interrupt - */ - rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no); - if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { - - /* Mask it */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, - AV_MASK | rdt_entry); - } - - if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { - /* - * If there was a race and an interrupt was injected - * just before we masked, check for that case here. - * Then, unmask the RDT entry and try again. If we're - * on our last try, don't unmask (because we want the - * RDT entry to remain masked for the rest of the - * function). 
- */ - rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no); - if ((rdt_entry & AV_PENDING) && - (reps < apic_max_reps_clear_pending)) { - /* Unmask it */ - WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no, rdt_entry & ~AV_MASK); - } - } - - } while ((rdt_entry & AV_PENDING) && - (reps < apic_max_reps_clear_pending)); - -#ifdef DEBUG - if (rdt_entry & AV_PENDING) - apic_intr_deliver_timeouts++; -#endif - - /* - * If the remote IRR bit is set, then the interrupt has been sent - * to a CPU for processing. We have no choice but to wait for - * that CPU to process the interrupt, at which point the remote IRR - * bit will be cleared. - */ - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & - (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) { - - /* - * If the CPU that this RDT is bound to is NOT the current - * CPU, wait until that CPU handles the interrupt and ACKs - * it. If this interrupt is not bound to any CPU (that is, - * if it's bound to the logical destination of "anyone"), it - * may have been delivered to the current CPU so handle that - * case by deferring the reprogramming (below). - */ - if ((old_bind_cpu != IRQ_UNBOUND) && - (old_bind_cpu != IRQ_UNINIT) && - (old_bind_cpu != psm_get_cpu_id())) { - for (waited = 0; waited < apic_max_reps_clear_pending; - waited++) { - if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, - intin_no) & AV_REMOTE_IRR) == 0) { - - delete_defer_repro_ent(which_irq); - - /* Remote IRR has cleared! */ - return (0); - } - } - } - - /* - * If we waited and the Remote IRR bit is still not cleared, - * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS - * times for this interrupt, try the last-ditch workaround: - */ - if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) { - - apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no); - - /* Mark this one as reprogrammed: */ - delete_defer_repro_ent(which_irq); - - return (0); - } else { -#ifdef DEBUG - apic_intr_deferrals++; -#endif - - /* - * If waiting for the Remote IRR bit (above) didn't - * allow it to clear, defer the reprogramming. - * Add a new deferred-programming entry if the - * caller passed a NULL one (and update the existing one - * in case anything changed). - */ - add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu); - if (drep) - drep->tries++; - - /* Inform caller to defer IOAPIC programming: */ - return (1); - } - - } - - /* Remote IRR is clear */ - delete_defer_repro_ent(which_irq); - - return (0); -} - -/* - * Called to migrate all interrupts at an irq to another cpu. - * Must be called with interrupts disabled and apic_ioapic_lock held - */ int -apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu) -{ - apic_irq_t *irqptr = irq_ptr; - int retval = 0; - - while (irqptr) { - if (irqptr->airq_temp_cpu != IRQ_UNINIT) - retval |= apic_rebind(irqptr, bind_cpu, NULL); - irqptr = irqptr->airq_next; - } - - return (retval); -} - -/* - * apic_intr_redistribute does all the messy computations for identifying - * which interrupt to move to which CPU. Currently we do just one interrupt - * at a time. This reduces the time we spent doing all this within clock - * interrupt. When it is done in idle, we could do more than 1. - * First we find the most busy and the most free CPU (time in ISR only) - * skipping those CPUs that has been identified as being ineligible (cpu_skip) - * Then we look for IRQs which are closest to the difference between the - * most busy CPU and the average ISR load. 
We try to find one whose load - * is less than difference.If none exists, then we chose one larger than the - * difference, provided it does not make the most idle CPU worse than the - * most busy one. In the end, we clear all the busy fields for CPUs. For - * IRQs, they are cleared as they are scanned. - */ -void -apic_intr_redistribute() -{ - int busiest_cpu, most_free_cpu; - int cpu_free, cpu_busy, max_busy, min_busy; - int min_free, diff; - int average_busy, cpus_online; - int i, busy; - ulong_t iflag; - apic_cpus_info_t *cpu_infop; - apic_irq_t *min_busy_irq = NULL; - apic_irq_t *max_busy_irq = NULL; - - busiest_cpu = most_free_cpu = -1; - cpu_free = cpu_busy = max_busy = average_busy = 0; - min_free = apic_sample_factor_redistribution; - cpus_online = 0; - /* - * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu - * without ioapic_lock. That is OK as we are just doing statistical - * sampling anyway and any inaccuracy now will get corrected next time - * The call to rebind which actually changes things will make sure - * we are consistent. - */ - for (i = 0; i < apic_nproc; i++) { - if (apic_cpu_in_range(i) && - !(apic_redist_cpu_skip & (1 << i)) && - (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) { - - cpu_infop = &apic_cpus[i]; - /* - * If no unbound interrupts or only 1 total on this - * CPU, skip - */ - if (!cpu_infop->aci_temp_bound || - (cpu_infop->aci_bound + cpu_infop->aci_temp_bound) - == 1) { - apic_redist_cpu_skip |= 1 << i; - continue; - } - - busy = cpu_infop->aci_busy; - average_busy += busy; - cpus_online++; - if (max_busy < busy) { - max_busy = busy; - busiest_cpu = i; - } - if (min_free > busy) { - min_free = busy; - most_free_cpu = i; - } - if (busy > apic_int_busy_mark) { - cpu_busy |= 1 << i; - } else { - if (busy < apic_int_free_mark) - cpu_free |= 1 << i; - } - } - } - if ((cpu_busy && cpu_free) || - (max_busy >= (min_free + apic_diff_for_redistribution))) { - - apic_num_imbalance++; -#ifdef DEBUG - if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { - prom_printf( - "redistribute busy=%x free=%x max=%x min=%x", - cpu_busy, cpu_free, max_busy, min_free); - } -#endif /* DEBUG */ - - - average_busy /= cpus_online; - - diff = max_busy - average_busy; - min_busy = max_busy; /* start with the max possible value */ - max_busy = 0; - min_busy_irq = max_busy_irq = NULL; - i = apic_min_device_irq; - for (; i <= apic_max_device_irq; i++) { - apic_irq_t *irq_ptr; - /* Change to linked list per CPU ? */ - if ((irq_ptr = apic_irq_table[i]) == NULL) - continue; - /* Check for irq_busy & decide which one to move */ - /* Also zero them for next round */ - if ((irq_ptr->airq_temp_cpu == busiest_cpu) && - irq_ptr->airq_busy) { - if (irq_ptr->airq_busy < diff) { - /* - * Check for least busy CPU, - * best fit or what ? 
- */ - if (max_busy < irq_ptr->airq_busy) { - /* - * Most busy within the - * required differential - */ - max_busy = irq_ptr->airq_busy; - max_busy_irq = irq_ptr; - } - } else { - if (min_busy > irq_ptr->airq_busy) { - /* - * least busy, but more than - * the reqd diff - */ - if (min_busy < - (diff + average_busy - - min_free)) { - /* - * Making sure new cpu - * will not end up - * worse - */ - min_busy = - irq_ptr->airq_busy; - - min_busy_irq = irq_ptr; - } - } - } - } - irq_ptr->airq_busy = 0; - } - - if (max_busy_irq != NULL) { -#ifdef DEBUG - if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { - prom_printf("rebinding %x to %x", - max_busy_irq->airq_vector, most_free_cpu); - } -#endif /* DEBUG */ - iflag = intr_clear(); - if (lock_try(&apic_ioapic_lock)) { - if (apic_rebind_all(max_busy_irq, - most_free_cpu) == 0) { - /* Make change permenant */ - max_busy_irq->airq_cpu = - (uint32_t)most_free_cpu; - } - lock_clear(&apic_ioapic_lock); - } - intr_restore(iflag); - - } else if (min_busy_irq != NULL) { -#ifdef DEBUG - if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) { - prom_printf("rebinding %x to %x", - min_busy_irq->airq_vector, most_free_cpu); - } -#endif /* DEBUG */ - - iflag = intr_clear(); - if (lock_try(&apic_ioapic_lock)) { - if (apic_rebind_all(min_busy_irq, - most_free_cpu) == 0) { - /* Make change permenant */ - min_busy_irq->airq_cpu = - (uint32_t)most_free_cpu; - } - lock_clear(&apic_ioapic_lock); - } - intr_restore(iflag); - - } else { - if (cpu_busy != (1 << busiest_cpu)) { - apic_redist_cpu_skip |= 1 << busiest_cpu; - /* - * We leave cpu_skip set so that next time we - * can choose another cpu - */ - } - } - apic_num_rebind++; - } else { - /* - * found nothing. Could be that we skipped over valid CPUs - * or we have balanced everything. If we had a variable - * ticks_for_redistribution, it could be increased here. - * apic_int_busy, int_free etc would also need to be - * changed. 
- */ - if (apic_redist_cpu_skip) - apic_redist_cpu_skip = 0; - } - for (i = 0; i < apic_nproc; i++) { - if (apic_cpu_in_range(i)) { - apic_cpus[i].aci_busy = 0; - } - } -} - -void -apic_cleanup_busy() -{ - int i; - apic_irq_t *irq_ptr; - - for (i = 0; i < apic_nproc; i++) { - if (apic_cpu_in_range(i)) { - apic_cpus[i].aci_busy = 0; - } - } - - for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { - if ((irq_ptr = apic_irq_table[i]) != NULL) - irq_ptr->airq_busy = 0; - } -} - - -static int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, int ipin, int *pci_irqp, iflag_t *intr_flagp) { @@ -4268,7 +2199,6 @@ apic_is_ioapic_AMD_813x(uint32_t physaddr) if (did == DEVID_8131_IOAPIC || did == DEVID_8132_IOAPIC) { - rv = B_TRUE; done = B_TRUE; } @@ -4324,7 +2254,7 @@ apic_acpi_enter_apicmode(void) static void apic_save_state(struct apic_state *sp) { - int i; + int i, cpuid; ulong_t iflag; PMD(PMD_SX, ("apic_save_state %p\n", (void *)sp)) @@ -4347,7 +2277,7 @@ apic_save_state(struct apic_state *sp) /* * If on the boot processor then save the IOAPICs' IDs */ - if (psm_get_cpu_id() == 0) { + if ((cpuid = psm_get_cpu_id()) == 0) { iflag = intr_clear(); lock_set(&apic_ioapic_lock); @@ -4358,6 +2288,9 @@ apic_save_state(struct apic_state *sp) lock_clear(&apic_ioapic_lock); intr_restore(iflag); } + + /* apic_state() is currently invoked only in Suspend/Resume */ + apic_cpus[cpuid].aci_status |= APIC_CPU_SUSPEND; } static void diff --git a/usr/src/uts/i86pc/io/mp_platform_misc.c b/usr/src/uts/i86pc/io/mp_platform_misc.c new file mode 100644 index 0000000000..af0e0cff00 --- /dev/null +++ b/usr/src/uts/i86pc/io/mp_platform_misc.c @@ -0,0 +1,2213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * PSMI 1.1 extensions are supported only in 2.6 and later versions. + * PSMI 1.2 extensions are supported only in 2.7 and later versions. + * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. + * PSMI 1.5 extensions are supported in Solaris Nevada. + * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. 
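+ *
+ * (For orientation, not part of the PSMI history above: the "#define
+ * PSMI_1_7" just below selects that revision, and it appears before any
+ * of the PSMI headers, such as <sys/psm.h>, are included.)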
+ */
+#define PSMI_1_7
+
+#include <sys/processor.h>
+#include <sys/time.h>
+#include <sys/psm.h>
+#include <sys/smp_impldefs.h>
+#include <sys/inttypes.h>
+#include <sys/cram.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/psm_common.h>
+#include <sys/apic.h>
+#include <sys/apic_common.h>
+#include <sys/pit.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/pci.h>
+#include <sys/promif.h>
+#include <sys/x86_archext.h>
+#include <sys/cpc_impl.h>
+#include <sys/uadmin.h>
+#include <sys/panic.h>
+#include <sys/debug.h>
+#include <sys/archsystm.h>
+#include <sys/trap.h>
+#include <sys/machsystm.h>
+#include <sys/cpuvar.h>
+#include <sys/rm_platter.h>
+#include <sys/privregs.h>
+#include <sys/cyclic.h>
+#include <sys/note.h>
+#include <sys/pci_intr_lib.h>
+#include <sys/sunndi.h>
+#include <sys/hpet.h>
+#include <sys/clock.h>
+
+/*
+ * The part of mp_platform_common.c that is used only by pcplusmp & xpv_psm,
+ * but not by apix.
+ * These functions may be moved to xpv_psm later, when apix and pcplusmp
+ * are merged together.
+ */
+
+/*
+ * Local Function Prototypes
+ */
+static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
+static void apic_xlate_vector_free_timeout_handler(void *arg);
+static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
+    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
+    struct ioapic_reprogram_data *drep);
+static int apic_setup_irq_table(dev_info_t *dip, int irqno,
+    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
+    int type);
+static void apic_try_deferred_reprogram(int ipl, int vect);
+static void delete_defer_repro_ent(int which_irq);
+static void apic_ioapic_wait_pending_clear(int ioapicindex,
+    int intin_no);
+
+extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
+    int ipin, int *pci_irqp, iflag_t *intr_flagp);
+extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
+    int child_ipin, struct apic_io_intr **intrp);
+extern uchar_t acpi_find_ioapic(int irq);
+extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
+extern int apic_find_bus_id(int bustype);
+extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
+extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);
+
+extern int apic_sci_vect;
+extern iflag_t apic_sci_flags;
+/* ACPI HPET interrupt configuration; -1 if HPET not used */
+extern int apic_hpet_vect;
+extern iflag_t apic_hpet_flags;
+extern int apic_intr_policy;
+extern char *psm_name;
+
+/*
+ * maximum value of a uchar_t (UINT8_MAX comes in via <sys/inttypes.h>)
+ */
+#define UCHAR_MAX UINT8_MAX
+
+/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
+extern int apic_max_reps_clear_pending;
+
+/* The irq # is implicit in the array index: */
+struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
+/*
+ * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info
+ * is indexed by IRQ number, NOT by vector number.
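+ *
+ * For example (numbers purely illustrative): a fixed interrupt on IRQ 9
+ * that was assigned vector 0x83 is tracked as apic_reprogram_info[9], not
+ * as apic_reprogram_info[0x83]; the vector itself is reached through the
+ * irq entry, roughly:
+ *
+ *    drep = &apic_reprogram_info[9];
+ *    vector = apic_irq_table[9]->airq_vector;    (0x83 in this example)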
+ */ + +extern int apic_int_busy_mark; +extern int apic_int_free_mark; +extern int apic_diff_for_redistribution; +extern int apic_sample_factor_redistribution; +extern int apic_redist_cpu_skip; +extern int apic_num_imbalance; +extern int apic_num_rebind; + +/* timeout for xlate_vector, mark_vector */ +int apic_revector_timeout = 16 * 10000; /* 160 millisec */ + +extern int apic_defconf; +extern int apic_irq_translate; + +extern int apic_use_acpi_madt_only; /* 1=ONLY use MADT from ACPI */ + +extern uchar_t apic_io_vectbase[MAX_IO_APIC]; + +extern boolean_t ioapic_mask_workaround[MAX_IO_APIC]; + +/* + * First available slot to be used as IRQ index into the apic_irq_table + * for those interrupts (like MSI/X) that don't have a physical IRQ. + */ +extern int apic_first_avail_irq; + +/* + * apic_defer_reprogram_lock ensures that only one processor is handling + * deferred interrupt programming at *_intr_exit time. + */ +static lock_t apic_defer_reprogram_lock; + +/* + * The current number of deferred reprogrammings outstanding + */ +uint_t apic_reprogram_outstanding = 0; + +#ifdef DEBUG +/* + * Counters that keep track of deferred reprogramming stats + */ +uint_t apic_intr_deferrals = 0; +uint_t apic_intr_deliver_timeouts = 0; +uint_t apic_last_ditch_reprogram_failures = 0; +uint_t apic_deferred_setup_failures = 0; +uint_t apic_defer_repro_total_retries = 0; +uint_t apic_defer_repro_successes = 0; +uint_t apic_deferred_spurious_enters = 0; +#endif + +extern int apic_io_max; +extern struct apic_io_intr *apic_io_intrp; + +uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; + +extern uint32_t eisa_level_intr_mask; + /* At least MSB will be set if EISA bus */ + +extern int apic_pci_bus_total; +extern uchar_t apic_single_pci_busid; + +/* + * Following declarations are for revectoring; used when ISRs at different + * IPLs share an irq. + */ +static lock_t apic_revector_lock; +int apic_revector_pending = 0; +static uchar_t *apic_oldvec_to_newvec; +static uchar_t *apic_newvec_to_oldvec; + +/* ACPI Interrupt Source Override Structure ptr */ +extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; +extern int acpi_iso_cnt; + +/* + * Auto-configuration routines + */ + +/* + * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable + * are also set to NULL. vector->irq is set to a value which cannot map + * to a real irq to show that it is free. + */ +void +apic_init_common(void) +{ + int i, j, indx; + int *iptr; + + /* + * Initialize apic_ipls from apic_vectortoipl. This array is + * used in apic_intr_enter to determine the IPL to use for the + * corresponding vector. On some systems, due to hardware errata + * and interrupt sharing, the IPL may not correspond to the IPL listed + * in apic_vectortoipl (see apic_addspl and apic_delspl). + */ + for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { + indx = i * APIC_VECTOR_PER_IPL; + + for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) + apic_ipls[indx] = apic_vectortoipl[i]; + } + + /* cpu 0 is always up (for now) */ + apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; + + iptr = (int *)&apic_irq_table[0]; + for (i = 0; i <= APIC_MAX_VECTOR; i++) { + apic_level_intr[i] = 0; + *iptr++ = NULL; + apic_vector_to_irq[i] = APIC_RESV_IRQ; + + /* These *must* be initted to B_TRUE! */ + apic_reprogram_info[i].done = B_TRUE; + apic_reprogram_info[i].irqp = NULL; + apic_reprogram_info[i].tries = 0; + apic_reprogram_info[i].bindcpu = 0; + } + + /* + * Allocate a dummy irq table entry for the reserved entry. 
+ * This takes care of the race between removing an irq and
+ * clock detecting a CPU in that irq during interrupt load
+ * sampling.
+ */
+    apic_irq_table[APIC_RESV_IRQ] =
+        kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
+
+    mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+ioapic_init_intr(int mask_apic)
+{
+    int ioapic_ix;
+    struct intrspec ispec;
+    apic_irq_t *irqptr;
+    int i, j;
+    ulong_t iflag;
+
+    LOCK_INIT_CLEAR(&apic_revector_lock);
+    LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);
+
+    /* mask interrupt vectors */
+    for (j = 0; j < apic_io_max && mask_apic; j++) {
+        int intin_max;
+
+        ioapic_ix = j;
+        /* Bits 23-16 define the maximum redirection entries */
+        intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
+            & 0xff;
+        for (i = 0; i <= intin_max; i++)
+            ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
+    }
+
+    /*
+     * Hack alert: deal with ACPI SCI interrupt chicken/egg here
+     */
+    if (apic_sci_vect > 0) {
+        /*
+         * acpica has already done add_avintr(); we just need
+         * to finish the job by mimicking translate_irq()
+         *
+         * Fake up an intrspec and setup the tables
+         */
+        ispec.intrspec_vec = apic_sci_vect;
+        ispec.intrspec_pri = SCI_IPL;
+
+        if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
+            &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
+            cmn_err(CE_WARN, "!apic: SCI setup failed");
+            return;
+        }
+        irqptr = apic_irq_table[apic_sci_vect];
+
+        iflag = intr_clear();
+        lock_set(&apic_ioapic_lock);
+
+        /* Program I/O APIC */
+        (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);
+
+        lock_clear(&apic_ioapic_lock);
+        intr_restore(iflag);
+
+        irqptr->airq_share++;
+    }
+
+    /*
+     * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
+     */
+    if (apic_hpet_vect > 0) {
+        /*
+         * hpet has already done add_avintr(); we just need
+         * to finish the job by mimicking translate_irq()
+         *
+         * Fake up an intrspec and setup the tables
+         */
+        ispec.intrspec_vec = apic_hpet_vect;
+        ispec.intrspec_pri = CBE_HIGH_PIL;
+
+        if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
+            &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
+            cmn_err(CE_WARN, "!apic: HPET setup failed");
+            return;
+        }
+        irqptr = apic_irq_table[apic_hpet_vect];
+
+        iflag = intr_clear();
+        lock_set(&apic_ioapic_lock);
+
+        /* Program I/O APIC */
+        (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);
+
+        lock_clear(&apic_ioapic_lock);
+        intr_restore(iflag);
+
+        irqptr->airq_share++;
+    }
+}
+
+/*
+ * Add mask bits to disable the interrupt vector from happening
+ * at or above this IPL. In addition, it should remove mask bits
+ * to enable interrupt vectors below the given IPL.
+ *
+ * Both add and delspl are complicated by the fact that different interrupts
+ * may share IRQs. This can happen in two ways.
+ * 1. The same H/W line is shared by more than 1 device
+ * 1a. with interrupts at different IPLs
+ * 1b. with interrupts at same IPL
+ * 2. We ran out of vectors at a given IPL and started sharing vectors.
+ * 1b and 2 should be handled gracefully, except for the fact that some ISRs
+ * will get called often when no interrupt is pending for the device.
+ * For 1a, we handle it at the higher IPL.
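+ *
+ * A concrete (hypothetical) instance of 1a: a NIC handler at IPL 6 and a
+ * disk handler at IPL 5 wired to the same line end up sharing one irq
+ * chain, and apic_addspl_common() below keeps the shared vector programmed
+ * at the higher IPL (6) for as long as the IPL 6 handler stays registered.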
+ */
+/*ARGSUSED*/
+int
+apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
+{
+    uchar_t vector;
+    ulong_t iflag;
+    apic_irq_t *irqptr, *irqheadptr;
+    int irqindex;
+
+    ASSERT(max_ipl <= UCHAR_MAX);
+    irqindex = IRQINDEX(irqno);
+
+    if ((irqindex == -1) || (!apic_irq_table[irqindex]))
+        return (PSM_FAILURE);
+
+    mutex_enter(&airq_mutex);
+    irqptr = irqheadptr = apic_irq_table[irqindex];
+
+    DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
+        "vector=0x%x\n", (void *)irqptr->airq_dip,
+        irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
+
+    while (irqptr) {
+        if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
+            break;
+        irqptr = irqptr->airq_next;
+    }
+    irqptr->airq_share++;
+
+    mutex_exit(&airq_mutex);
+
+    /* return if it is not a hardware interrupt */
+    if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
+        return (PSM_SUCCESS);
+
+    /* Or if there are more interrupts at a higher IPL */
+    if (ipl != max_ipl)
+        return (PSM_SUCCESS);
+
+    /*
+     * if apic_picinit() has not been called yet, just return.
+     * At the end of apic_picinit(), we will call setup_io_intr().
+     */
+
+    if (!apic_picinit_called)
+        return (PSM_SUCCESS);
+
+    /*
+     * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
+     * return failure.
+     */
+    if (irqptr->airq_ipl != max_ipl &&
+        !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
+
+        vector = apic_allocate_vector(max_ipl, irqindex, 1);
+        if (vector == 0) {
+            irqptr->airq_share--;
+            return (PSM_FAILURE);
+        }
+        irqptr = irqheadptr;
+        apic_mark_vector(irqptr->airq_vector, vector);
+        while (irqptr) {
+            irqptr->airq_vector = vector;
+            irqptr->airq_ipl = (uchar_t)max_ipl;
+            /*
+             * reprogram irq being added and every one else
+             * who is not in the UNINIT state
+             */
+            if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
+                irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
+                apic_record_rdt_entry(irqptr, irqindex);
+
+                iflag = intr_clear();
+                lock_set(&apic_ioapic_lock);
+
+                (void) apic_setup_io_intr(irqptr, irqindex,
+                    B_FALSE);
+
+                lock_clear(&apic_ioapic_lock);
+                intr_restore(iflag);
+            }
+            irqptr = irqptr->airq_next;
+        }
+        return (PSM_SUCCESS);
+
+    } else if (irqptr->airq_ipl != max_ipl &&
+        ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
+        /*
+         * We cannot upgrade the vector, but we can change
+         * the IPL that this vector induces.
+         *
+         * Note that we subtract APIC_BASE_VECT from the vector
+         * here because this array is used in apic_intr_enter
+         * (no need to add APIC_BASE_VECT in that hot code
+         * path since we can do it in the rarely-executed path
+         * here).
+         */
+        apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
+            (uchar_t)max_ipl;
+
+        irqptr = irqheadptr;
+        while (irqptr) {
+            irqptr->airq_ipl = (uchar_t)max_ipl;
+            irqptr = irqptr->airq_next;
+        }
+
+        return (PSM_SUCCESS);
+    }
+
+    ASSERT(irqptr);
+
+    iflag = intr_clear();
+    lock_set(&apic_ioapic_lock);
+
+    (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);
+
+    lock_clear(&apic_ioapic_lock);
+    intr_restore(iflag);
+
+    return (PSM_SUCCESS);
+}
+
+/*
+ * Recompute mask bits for the given interrupt vector.
+ * If there is no interrupt servicing routine for this
+ * vector, this function should disable the interrupt vector
+ * from happening at all IPLs. If there are still
+ * handlers using the given vector, this function should
+ * disable the given vector from happening below the lowest
+ * IPL of the remaining handlers.
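+ *
+ * For example (illustrative): if handlers at IPL 5 and IPL 3 were sharing
+ * the vector and the IPL 5 handler is removed, the vector only needs to be
+ * kept from happening below IPL 3 afterwards (subject to the hardware
+ * priority floor discussed in the body of this function).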
+ */
+/*ARGSUSED*/
+int
+apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
+{
+    uchar_t vector;
+    uint32_t bind_cpu;
+    int intin, irqindex;
+    int ioapic_ix;
+    apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp;
+    ulong_t iflag;
+
+    mutex_enter(&airq_mutex);
+    irqindex = IRQINDEX(irqno);
+    irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];
+
+    DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
+        "vector=0x%x\n", (void *)irqptr->airq_dip,
+        irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
+
+    while (irqptr) {
+        if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
+            break;
+        preirqptr = irqptr;
+        irqptr = irqptr->airq_next;
+    }
+    ASSERT(irqptr);
+
+    irqptr->airq_share--;
+
+    mutex_exit(&airq_mutex);
+
+    /*
+     * If there are more interrupts at a higher IPL, we don't need
+     * to disable anything.
+     */
+    if (ipl < max_ipl)
+        return (PSM_SUCCESS);
+
+    /* return if it is not a hardware interrupt */
+    if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
+        return (PSM_SUCCESS);
+
+    if (!apic_picinit_called) {
+        /*
+         * Clear irq_struct. If two devices shared an intpt
+         * line & 1 unloaded before picinit, we are hosed. But, then
+         * we hope the machine survives.
+         */
+        irqptr->airq_mps_intr_index = FREE_INDEX;
+        irqptr->airq_temp_cpu = IRQ_UNINIT;
+        apic_free_vector(irqptr->airq_vector);
+        return (PSM_SUCCESS);
+    }
+    /*
+     * Downgrade vector to new max_ipl if needed. If we cannot allocate,
+     * use old IPL. Not very elegant, but it should work.
+     */
+    if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
+        !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
+        apic_irq_t *irqp;
+        if (vector = apic_allocate_vector(max_ipl, irqno, 1)) {
+            apic_mark_vector(irqheadptr->airq_vector, vector);
+            irqp = irqheadptr;
+            while (irqp) {
+                irqp->airq_vector = vector;
+                irqp->airq_ipl = (uchar_t)max_ipl;
+                if (irqp->airq_temp_cpu != IRQ_UNINIT) {
+                    apic_record_rdt_entry(irqp, irqindex);
+
+                    iflag = intr_clear();
+                    lock_set(&apic_ioapic_lock);
+
+                    (void) apic_setup_io_intr(irqp,
+                        irqindex, B_FALSE);
+
+                    lock_clear(&apic_ioapic_lock);
+                    intr_restore(iflag);
+                }
+                irqp = irqp->airq_next;
+            }
+        }
+
+    } else if (irqptr->airq_ipl != max_ipl &&
+        max_ipl != PSM_INVALID_IPL &&
+        ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
+
+        /*
+         * We cannot downgrade the IPL of the vector below the vector's
+         * hardware priority. If we did, it would be possible for a
+         * higher-priority hardware vector to interrupt a CPU running at an IPL
+         * lower than the hardware priority of the interrupting vector (but
+         * higher than the soft IPL of this IRQ). When this happens, we would
+         * then try to drop the IPL BELOW what it was (effectively dropping
+         * below base_spl) which would be potentially catastrophic.
+         *
+         * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
+         * (hardware IPL of 4). Further assume that the old IPL of this IRQ
+         * was 4, but the new IPL is 1. If we forced vector 0x40 to result in
+         * an IPL of 1, it would be possible for the processor to be executing
+         * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
+         * the currently-executing ISR. When apic_intr_enter consults
+         * apic_ipls[], it will return 1, bringing the IPL of the CPU down to 1
+         * so even though the processor was running at IPL 3, an IPL 1
+         * interrupt will have interrupted it, which must not happen.)
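+ *
+ * (Arithmetic behind that example, assuming the usual APIC_IPL_SHIFT of 4:
+ * a vector's inherent hardware priority class is its high-order bits,
+ * vector >> APIC_IPL_SHIFT, so 0x40 >> 4 == 4; no software table entry can
+ * make a 0x4x vector behave like an IPL 1 vector in hardware.)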
+ *
+ * Effectively, this means that the hardware priority corresponding to
+ * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
+ * hardware priority.
+ *
+ * (In the above example, then, after removal of the IPL 4 device's
+ * interrupt handler, the new IPL will continue to be 4 because the
+ * hardware priority that IPL 1 implies is lower than the hardware
+ * priority of the vector used.)
+ */
+ /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
+ const int apic_ipls_index = irqptr->airq_vector -
+ APIC_BASE_VECT;
+ const int vect_inherent_hwpri = irqptr->airq_vector >>
+ APIC_IPL_SHIFT;
+
+ /*
+ * If there are still devices using this IRQ, determine the
+ * new ipl to use.
+ */
+ if (irqptr->airq_share) {
+ int vect_desired_hwpri, hwpri;
+
+ ASSERT(max_ipl < MAXIPL);
+ vect_desired_hwpri = apic_ipltopri[max_ipl] >>
+ APIC_IPL_SHIFT;
+
+ /*
+ * If the desired IPL's hardware priority is lower
+ * than that of the vector, use the hardware priority
+ * of the vector to determine the new IPL.
+ */
+ hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
+ vect_inherent_hwpri : vect_desired_hwpri;
+
+ /*
+ * Now, to get the right index for apic_vectortoipl,
+ * we need to subtract APIC_BASE_VECT from the
+ * hardware-vector-equivalent (in hwpri). Since hwpri
+ * is already shifted, we shift APIC_BASE_VECT before
+ * doing the subtraction.
+ */
+ hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);
+
+ ASSERT(hwpri >= 0);
+ ASSERT(hwpri < MAXIPL);
+ max_ipl = apic_vectortoipl[hwpri];
+ apic_ipls[apic_ipls_index] = max_ipl;
+
+ irqp = irqheadptr;
+ while (irqp) {
+ irqp->airq_ipl = (uchar_t)max_ipl;
+ irqp = irqp->airq_next;
+ }
+ } else {
+ /*
+ * No more devices on this IRQ, so reset this vector's
+ * element in apic_ipls to the original IPL for this
+ * vector.
+ */
+ apic_ipls[apic_ipls_index] =
+ apic_vectortoipl[vect_inherent_hwpri];
+ }
+ }
+
+ /*
+ * If there are still active interrupts, we are done.
+ */
+ if (irqptr->airq_share)
+ return (PSM_SUCCESS);
+
+ iflag = intr_clear();
+ lock_set(&apic_ioapic_lock);
+
+ if (irqptr->airq_mps_intr_index == MSI_INDEX) {
+ /*
+ * Disable the MSI vector. Make sure we only disable on
+ * the last of the multi-MSI support.
+ */
+ if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
+ apic_pci_msi_disable_mode(irqptr->airq_dip,
+ DDI_INTR_TYPE_MSI);
+ }
+ } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
+ /*
+ * Disable the MSI-X vector; we need to clear its mask
+ * and addr/data for each MSI-X.
+ */
+ apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
+ irqptr->airq_origirq);
+ /*
+ * Make sure we only disable on the last MSI-X
+ */
+ if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
+ apic_pci_msi_disable_mode(irqptr->airq_dip,
+ DDI_INTR_TYPE_MSIX);
+ }
+ } else {
+ /*
+ * The assumption here is that this is safe, even for
+ * systems with IOAPICs that suffer from the hardware
+ * erratum because all devices have been quiesced before
+ * they unregister their interrupt handlers. If that
+ * assumption turns out to be false, this mask operation
+ * can induce the same erratum result we're trying to
+ * avoid.
+ */
+ ioapic_ix = irqptr->airq_ioapicindex;
+ intin = irqptr->airq_intin_no;
+ ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
+ }
+
+ apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private);
+
+ /*
+ * This irq entry is the only one in the chain.
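+ * Tearing it down must therefore also release the vector and drop
+ * the bound or temp-bound CPU accounting, which the block below
+ * does before freeing the slot.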
+ */
+ if (irqheadptr->airq_next == NULL) {
+ ASSERT(irqheadptr == irqptr);
+ bind_cpu = irqptr->airq_temp_cpu;
+ if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
+ ((uint32_t)bind_cpu != IRQ_UNINIT)) {
+ ASSERT(apic_cpu_in_range(bind_cpu));
+ if (bind_cpu & IRQ_USER_BOUND) {
+ /* If hardbound, temp_cpu == cpu */
+ bind_cpu &= ~IRQ_USER_BOUND;
+ apic_cpus[bind_cpu].aci_bound--;
+ } else
+ apic_cpus[bind_cpu].aci_temp_bound--;
+ }
+ irqptr->airq_temp_cpu = IRQ_UNINIT;
+ irqptr->airq_mps_intr_index = FREE_INDEX;
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+ apic_free_vector(irqptr->airq_vector);
+ return (PSM_SUCCESS);
+ }
+
+ /*
+ * If we get here, we are sharing the vector and there is more than
+ * one active irq entry in the chain.
+ */
+ lock_clear(&apic_ioapic_lock);
+ intr_restore(iflag);
+
+ mutex_enter(&airq_mutex);
+ /* Remove the irq entry from the chain */
+ if (irqptr == irqheadptr) { /* The irq entry is at the head */
+ apic_irq_table[irqindex] = irqptr->airq_next;
+ } else {
+ preirqptr->airq_next = irqptr->airq_next;
+ }
+ /* Free the irq entry */
+ kmem_free(irqptr, sizeof (apic_irq_t));
+ mutex_exit(&airq_mutex);
+
+ return (PSM_SUCCESS);
+}
+
+/*
+ * apic_introp_xlate() replaces apic_translate_irq() and is
+ * called only from apic_intr_ops(). With the new ADII framework,
+ * the priority can no longer be retrieved through i_ddi_get_intrspec().
+ * It has to be passed in from the caller.
+ *
+ * Return value:
+ * Success: irqno for the given device
+ * Failure: -1
+ */
+int
+apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
+{
+ char dev_type[16];
+ int dev_len, pci_irq, newirq, bustype, devid, busid, i;
+ int irqno = ispec->intrspec_vec;
+ ddi_acc_handle_t cfg_handle;
+ uchar_t ipin;
+ struct apic_io_intr *intrp;
+ iflag_t intr_flag;
+ ACPI_SUBTABLE_HEADER *hp;
+ ACPI_MADT_INTERRUPT_OVERRIDE *isop;
+ apic_irq_t *airqp;
+ int parent_is_pci_or_pciex = 0;
+ int child_is_pciex = 0;
+
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
+ "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
+ irqno));
+
+ dev_len = sizeof (dev_type);
+ if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
+ DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
+ &dev_len) == DDI_PROP_SUCCESS) {
+ if ((strcmp(dev_type, "pci") == 0) ||
+ (strcmp(dev_type, "pciex") == 0))
+ parent_is_pci_or_pciex = 1;
+ }
+
+ if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
+ &dev_len) == DDI_PROP_SUCCESS) {
+ if (strstr(dev_type, "pciex"))
+ child_is_pciex = 1;
+ }
+
+ if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
+ if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
+ airqp->airq_iflag.bustype =
+ child_is_pciex ? 
BUS_PCIE : BUS_PCI; + return (apic_vector_to_irq[airqp->airq_vector]); + } + return (apic_setup_irq_table(dip, irqno, NULL, ispec, + NULL, type)); + } + + bustype = 0; + + /* check if we have already translated this irq */ + mutex_enter(&airq_mutex); + newirq = apic_min_device_irq; + for (; newirq <= apic_max_device_irq; newirq++) { + airqp = apic_irq_table[newirq]; + while (airqp) { + if ((airqp->airq_dip == dip) && + (airqp->airq_origirq == irqno) && + (airqp->airq_mps_intr_index != FREE_INDEX)) { + + mutex_exit(&airq_mutex); + return (VIRTIRQ(newirq, airqp->airq_share_id)); + } + airqp = airqp->airq_next; + } + } + mutex_exit(&airq_mutex); + + if (apic_defconf) + goto defconf; + + if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) + goto nonpci; + + if (parent_is_pci_or_pciex) { + /* pci device */ + if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) + goto nonpci; + if (busid == 0 && apic_pci_bus_total == 1) + busid = (int)apic_single_pci_busid; + + if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) + return (-1); + ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; + pci_config_teardown(&cfg_handle); + if (apic_enable_acpi && !apic_use_acpi_madt_only) { + if (apic_acpi_translate_pci_irq(dip, busid, devid, + ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) + return (-1); + + intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI; + return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, + &intr_flag, type)); + } else { + pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); + if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) + == NULL) { + if ((pci_irq = apic_handle_pci_pci_bridge(dip, + devid, ipin, &intrp)) == -1) + return (-1); + } + return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, + NULL, type)); + } + } else if (strcmp(dev_type, "isa") == 0) + bustype = BUS_ISA; + else if (strcmp(dev_type, "eisa") == 0) + bustype = BUS_EISA; + +nonpci: + if (apic_enable_acpi && !apic_use_acpi_madt_only) { + /* search iso entries first */ + if (acpi_iso_cnt != 0) { + hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; + i = 0; + while (i < acpi_iso_cnt) { + if (hp->Type == + ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { + isop = + (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; + if (isop->Bus == 0 && + isop->SourceIrq == irqno) { + newirq = isop->GlobalIrq; + intr_flag.intr_po = + isop->IntiFlags & + ACPI_MADT_POLARITY_MASK; + intr_flag.intr_el = + (isop->IntiFlags & + ACPI_MADT_TRIGGER_MASK) + >> 2; + intr_flag.bustype = BUS_ISA; + + return (apic_setup_irq_table( + dip, newirq, NULL, ispec, + &intr_flag, type)); + + } + i++; + } + hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + + hp->Length); + } + } + intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; + intr_flag.intr_el = INTR_EL_EDGE; + intr_flag.bustype = BUS_ISA; + return (apic_setup_irq_table(dip, irqno, NULL, ispec, + &intr_flag, type)); + } else { + if (bustype == 0) /* not initialized */ + bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; + for (i = 0; i < 2; i++) { + if (((busid = apic_find_bus_id(bustype)) != -1) && + ((intrp = apic_find_io_intr_w_busid(irqno, busid)) + != NULL)) { + if ((newirq = apic_setup_irq_table(dip, irqno, + intrp, ispec, NULL, type)) != -1) { + return (newirq); + } + goto defconf; + } + bustype = (bustype == BUS_EISA) ? 
BUS_ISA : BUS_EISA; + } + } + +/* MPS default configuration */ +defconf: + newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); + if (newirq == -1) + return (-1); + ASSERT(IRQINDEX(newirq) == irqno); + ASSERT(apic_irq_table[irqno]); + return (newirq); +} + +/* + * Attempt to share vector with someone else + */ +static int +apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, + uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) +{ +#ifdef DEBUG + apic_irq_t *tmpirqp = NULL; +#endif /* DEBUG */ + apic_irq_t *irqptr, dummyirq; + int newirq, chosen_irq = -1, share = 127; + int lowest, highest, i; + uchar_t share_id; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " + "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); + + highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; + lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; + + if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ + lowest -= APIC_VECTOR_PER_IPL; + dummyirq.airq_mps_intr_index = intr_index; + dummyirq.airq_ioapicindex = ioapicindex; + dummyirq.airq_intin_no = ipin; + if (intr_flagp) + dummyirq.airq_iflag = *intr_flagp; + apic_record_rdt_entry(&dummyirq, irqno); + for (i = lowest; i <= highest; i++) { + newirq = apic_vector_to_irq[i]; + if (newirq == APIC_RESV_IRQ) + continue; + irqptr = apic_irq_table[newirq]; + + if ((dummyirq.airq_rdt_entry & 0xFF00) != + (irqptr->airq_rdt_entry & 0xFF00)) + /* not compatible */ + continue; + + if (irqptr->airq_share < share) { + share = irqptr->airq_share; + chosen_irq = newirq; + } + } + if (chosen_irq != -1) { + /* + * Assign a share id which is free or which is larger + * than the largest one. + */ + share_id = 1; + mutex_enter(&airq_mutex); + irqptr = apic_irq_table[chosen_irq]; + while (irqptr) { + if (irqptr->airq_mps_intr_index == FREE_INDEX) { + share_id = irqptr->airq_share_id; + break; + } + if (share_id <= irqptr->airq_share_id) + share_id = irqptr->airq_share_id + 1; +#ifdef DEBUG + tmpirqp = irqptr; +#endif /* DEBUG */ + irqptr = irqptr->airq_next; + } + if (!irqptr) { + irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); + irqptr->airq_temp_cpu = IRQ_UNINIT; + irqptr->airq_next = + apic_irq_table[chosen_irq]->airq_next; + apic_irq_table[chosen_irq]->airq_next = irqptr; +#ifdef DEBUG + tmpirqp = apic_irq_table[chosen_irq]; +#endif /* DEBUG */ + } + irqptr->airq_mps_intr_index = intr_index; + irqptr->airq_ioapicindex = ioapicindex; + irqptr->airq_intin_no = ipin; + if (intr_flagp) + irqptr->airq_iflag = *intr_flagp; + irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; + irqptr->airq_share_id = share_id; + apic_record_rdt_entry(irqptr, irqno); + *irqptrp = irqptr; +#ifdef DEBUG + /* shuffle the pointers to test apic_delspl path */ + if (tmpirqp) { + tmpirqp->airq_next = irqptr->airq_next; + irqptr->airq_next = apic_irq_table[chosen_irq]; + apic_irq_table[chosen_irq] = irqptr; + } +#endif /* DEBUG */ + mutex_exit(&airq_mutex); + return (VIRTIRQ(chosen_irq, share_id)); + } + return (-1); +} + +/* + * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry + * is used already, we will try to allocate a new irqno. 
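+ * (For example, if the requested slot is already owned by another
+ * interrupt source, the entry is redirected to the next free slot
+ * at or above apic_first_avail_irq; see the FREE_INDEX check
+ * below.)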
+ *
+ * Return value:
+ * Success: irqno
+ * Failure: -1
+ */
+static int
+apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
+ struct intrspec *ispec, iflag_t *intr_flagp, int type)
+{
+ int origirq = ispec->intrspec_vec;
+ uchar_t ipl = ispec->intrspec_pri;
+ int newirq, intr_index;
+ uchar_t ipin, ioapic, ioapicindex, vector;
+ apic_irq_t *irqptr;
+ major_t major;
+ dev_info_t *sdip;
+
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
+ "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));
+
+ ASSERT(ispec != NULL);
+
+ major = (dip != NULL) ? ddi_driver_major(dip) : 0;
+
+ if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
+ /* MSI/X doesn't need any ioapic setup */
+ ioapicindex = 0xff;
+ ioapic = 0xff;
+ ipin = (uchar_t)0xff;
+ intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
+ MSIX_INDEX;
+ mutex_enter(&airq_mutex);
+ if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
+ mutex_exit(&airq_mutex);
+ /* need an irq for MSI/X to index into autovect[] */
+ cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
+ ddi_get_name(dip), ddi_get_instance(dip));
+ return (-1);
+ }
+ mutex_exit(&airq_mutex);
+
+ } else if (intrp != NULL) {
+ intr_index = (int)(intrp - apic_io_intrp);
+ ioapic = intrp->intr_destid;
+ ipin = intrp->intr_destintin;
+ /* Find ioapicindex. If destid was ALL, we will exit with 0. */
+ for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
+ if (apic_io_id[ioapicindex] == ioapic)
+ break;
+ ASSERT((ioapic == apic_io_id[ioapicindex]) ||
+ (ioapic == INTR_ALL_APIC));
+
+ /* check whether this intin# has been used by another irqno */
+ if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
+ return (newirq);
+ }
+
+ } else if (intr_flagp != NULL) {
+ /* ACPI case */
+ intr_index = ACPI_INDEX;
+ ioapicindex = acpi_find_ioapic(irqno);
+ ASSERT(ioapicindex != 0xFF);
+ ioapic = apic_io_id[ioapicindex];
+ ipin = irqno - apic_io_vectbase[ioapicindex];
+ if (apic_irq_table[irqno] &&
+ apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
+ ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
+ apic_irq_table[irqno]->airq_ioapicindex ==
+ ioapicindex);
+ return (irqno);
+ }
+
+ } else {
+ /* default configuration */
+ ioapicindex = 0;
+ ioapic = apic_io_id[ioapicindex];
+ ipin = (uchar_t)irqno;
+ intr_index = DEFAULT_INDEX;
+ }
+
+ if (ispec == NULL) {
+ APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
+ irqno));
+ } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
+ if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
+ ipl, ioapicindex, ipin, &irqptr)) != -1) {
+ irqptr->airq_ipl = ipl;
+ irqptr->airq_origirq = (uchar_t)origirq;
+ irqptr->airq_dip = dip;
+ irqptr->airq_major = major;
+ sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
+ /* This is really OK to do */
+ if (sdip == NULL) {
+ cmn_err(CE_WARN, "Sharing vectors: %s"
+ " instance %d and SCI",
+ ddi_get_name(dip), ddi_get_instance(dip));
+ } else {
+ cmn_err(CE_WARN, "Sharing vectors: %s"
+ " instance %d and %s instance %d",
+ ddi_get_name(sdip), ddi_get_instance(sdip),
+ ddi_get_name(dip), ddi_get_instance(dip));
+ }
+ return (newirq);
+ }
+ /* try high priority allocation now that share has failed */
+ if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
+ cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
+ ddi_get_name(dip), ddi_get_instance(dip));
+ return (-1);
+ }
+ }
+
+ mutex_enter(&airq_mutex);
+ if (apic_irq_table[irqno] == NULL) {
+ irqptr = kmem_zalloc(sizeof 
(apic_irq_t), KM_SLEEP);
+ irqptr->airq_temp_cpu = IRQ_UNINIT;
+ apic_irq_table[irqno] = irqptr;
+ } else {
+ irqptr = apic_irq_table[irqno];
+ if (irqptr->airq_mps_intr_index != FREE_INDEX) {
+ /*
+ * The slot is used by another irqno, so allocate
+ * a free irqno for this interrupt
+ */
+ newirq = apic_allocate_irq(apic_first_avail_irq);
+ if (newirq == -1) {
+ mutex_exit(&airq_mutex);
+ return (-1);
+ }
+ irqno = newirq;
+ irqptr = apic_irq_table[irqno];
+ if (irqptr == NULL) {
+ irqptr = kmem_zalloc(sizeof (apic_irq_t),
+ KM_SLEEP);
+ irqptr->airq_temp_cpu = IRQ_UNINIT;
+ apic_irq_table[irqno] = irqptr;
+ }
+ vector = apic_modify_vector(vector, newirq);
+ }
+ }
+ apic_max_device_irq = max(irqno, apic_max_device_irq);
+ apic_min_device_irq = min(irqno, apic_min_device_irq);
+ mutex_exit(&airq_mutex);
+ irqptr->airq_ioapicindex = ioapicindex;
+ irqptr->airq_intin_no = ipin;
+ irqptr->airq_ipl = ipl;
+ irqptr->airq_vector = vector;
+ irqptr->airq_origirq = (uchar_t)origirq;
+ irqptr->airq_share_id = 0;
+ irqptr->airq_mps_intr_index = (short)intr_index;
+ irqptr->airq_dip = dip;
+ irqptr->airq_major = major;
+ irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
+ if (intr_flagp)
+ irqptr->airq_iflag = *intr_flagp;
+
+ if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
+ /* setup I/O APIC entry for non-MSI/X interrupts */
+ apic_record_rdt_entry(irqptr, irqno);
+ }
+ return (irqno);
+}
+
+/*
+ * return the cpu to which this intr should be bound.
+ * Check properties or any other mechanism to see if user wants it
+ * bound to a specific CPU. If so, return the cpu id with high bit set.
+ * If not, use the policy to choose a cpu and return the id.
+ */
+uint32_t
+apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
+{
+ int instance, instno, prop_len, bind_cpu, count;
+ uint_t i, rc;
+ uint32_t cpu;
+ major_t major;
+ char *name, *drv_name, *prop_val, *cptr;
+ char prop_name[32];
+ ulong_t iflag;
+
+
+ if (apic_intr_policy == INTR_LOWEST_PRIORITY)
+ return (IRQ_UNBOUND);
+
+ if (apic_nproc == 1)
+ return (0);
+
+ drv_name = NULL;
+ rc = DDI_PROP_NOT_FOUND;
+ major = (major_t)-1;
+ if (dip != NULL) {
+ name = ddi_get_name(dip);
+ major = ddi_name_to_major(name);
+ drv_name = ddi_major_to_name(major);
+ instance = ddi_get_instance(dip);
+ if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
+ i = apic_min_device_irq;
+ for (; i <= apic_max_device_irq; i++) {
+
+ if ((i == irq) || (apic_irq_table[i] == NULL) ||
+ (apic_irq_table[i]->airq_mps_intr_index
+ == FREE_INDEX))
+ continue;
+
+ if ((apic_irq_table[i]->airq_major == major) &&
+ (!(apic_irq_table[i]->airq_cpu &
+ IRQ_USER_BOUND))) {
+
+ cpu = apic_irq_table[i]->airq_cpu;
+
+ cmn_err(CE_CONT,
+ "!%s: %s (%s) instance #%d "
+ "irq 0x%x vector 0x%x ioapic 0x%x "
+ "intin 0x%x is bound to cpu %d\n",
+ psm_name,
+ name, drv_name, instance, irq,
+ apic_irq_table[irq]->airq_vector,
+ ioapicid, intin, cpu);
+ return (cpu);
+ }
+ }
+ }
+ /*
+ * search for "drvname"_intpt_bind_cpus property first; the
+ * syntax of the property should be "a[,b,c,...]" where
+ * instance 0 binds to cpu a, instance 1 binds to cpu b,
+ * instance 2 binds to cpu c...
+ * ddi_getlongprop() will search /option first, then /
+ * if "drvname"_intpt_bind_cpus doesn't exist, then find
+ * intpt_bind_cpus property. 
The syntax is the same, and + * it applies to all the devices if its "drvname" specific + * property doesn't exist + */ + (void) strcpy(prop_name, drv_name); + (void) strcat(prop_name, "_intpt_bind_cpus"); + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, + (caddr_t)&prop_val, &prop_len); + if (rc != DDI_PROP_SUCCESS) { + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, + "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); + } + } + if (rc == DDI_PROP_SUCCESS) { + for (i = count = 0; i < (prop_len - 1); i++) + if (prop_val[i] == ',') + count++; + if (prop_val[i-1] != ',') + count++; + /* + * if somehow the binding instances defined in the + * property are not enough for this instno., then + * reuse the pattern for the next instance until + * it reaches the requested instno + */ + instno = instance % count; + i = 0; + cptr = prop_val; + while (i < instno) + if (*cptr++ == ',') + i++; + bind_cpu = stoi(&cptr); + kmem_free(prop_val, prop_len); + /* if specific CPU is bogus, then default to next cpu */ + if (!apic_cpu_in_range(bind_cpu)) { + cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", + psm_name, prop_name, prop_val, bind_cpu); + rc = DDI_PROP_NOT_FOUND; + } else { + /* indicate that we are bound at user request */ + bind_cpu |= IRQ_USER_BOUND; + } + /* + * no need to check apic_cpus[].aci_status, if specific CPU is + * not up, then post_cpu_start will handle it. + */ + } + if (rc != DDI_PROP_SUCCESS) { + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + bind_cpu = apic_get_next_bind_cpu(); + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } + + if (drv_name != NULL) + cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " + "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", + psm_name, name, drv_name, instance, irq, + apic_irq_table[irq]->airq_vector, ioapicid, intin, + bind_cpu & ~IRQ_USER_BOUND); + else + cmn_err(CE_CONT, "!%s: irq 0x%x " + "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", + psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, + intin, bind_cpu & ~IRQ_USER_BOUND); + + return ((uint32_t)bind_cpu); +} + +/* + * Mark vector as being in the process of being deleted. Interrupts + * may still come in on some CPU. The moment an interrupt comes with + * the new vector, we know we can free the old one. Called only from + * addspl and delspl with interrupts disabled. Because an interrupt + * can be shared, but no interrupt from either device may come in, + * we also use a timeout mechanism, which we arbitrarily set to + * apic_revector_timeout microseconds. + */ +static void +apic_mark_vector(uchar_t oldvector, uchar_t newvector) +{ + ulong_t iflag; + + iflag = intr_clear(); + lock_set(&apic_revector_lock); + if (!apic_oldvec_to_newvec) { + apic_oldvec_to_newvec = + kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, + KM_NOSLEEP); + + if (!apic_oldvec_to_newvec) { + /* + * This failure is not catastrophic. + * But, the oldvec will never be freed. 
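+ * We note the failure in apic_error (APIC_ERR_MARK_VECTOR_FAIL,
+ * set just below) so the leak is at least visible post mortem.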
+ */
+ apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
+ lock_clear(&apic_revector_lock);
+ intr_restore(iflag);
+ return;
+ }
+ apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
+ }
+
+ /* See if we already did this for drivers which do double addintrs */
+ if (apic_oldvec_to_newvec[oldvector] != newvector) {
+ apic_oldvec_to_newvec[oldvector] = newvector;
+ apic_newvec_to_oldvec[newvector] = oldvector;
+ apic_revector_pending++;
+ }
+ lock_clear(&apic_revector_lock);
+ intr_restore(iflag);
+ (void) timeout(apic_xlate_vector_free_timeout_handler,
+ (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
+}
+
+/*
+ * xlate_vector is called from intr_enter if revector_pending is set.
+ * It will xlate it if needed and mark the old vector as free.
+ */
+uchar_t
+apic_xlate_vector(uchar_t vector)
+{
+ uchar_t newvector, oldvector = 0;
+
+ lock_set(&apic_revector_lock);
+ /* Do we really need to do this? */
+ if (!apic_revector_pending) {
+ lock_clear(&apic_revector_lock);
+ return (vector);
+ }
+ if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
+ oldvector = vector;
+ else {
+ /*
+ * The incoming vector is new. See if a stale entry
+ * remains
+ */
+ if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
+ newvector = vector;
+ }
+
+ if (oldvector) {
+ apic_revector_pending--;
+ apic_oldvec_to_newvec[oldvector] = 0;
+ apic_newvec_to_oldvec[newvector] = 0;
+ apic_free_vector(oldvector);
+ lock_clear(&apic_revector_lock);
+ /* There could have been more than one reprogramming! */
+ return (apic_xlate_vector(newvector));
+ }
+ lock_clear(&apic_revector_lock);
+ return (vector);
+}
+
+void
+apic_xlate_vector_free_timeout_handler(void *arg)
+{
+ ulong_t iflag;
+ uchar_t oldvector, newvector;
+
+ oldvector = (uchar_t)(uintptr_t)arg;
+ iflag = intr_clear();
+ lock_set(&apic_revector_lock);
+ if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
+ apic_free_vector(oldvector);
+ apic_oldvec_to_newvec[oldvector] = 0;
+ apic_newvec_to_oldvec[newvector] = 0;
+ apic_revector_pending--;
+ }
+
+ lock_clear(&apic_revector_lock);
+ intr_restore(iflag);
+}
+
+/*
+ * Bind interrupt corresponding to irq_ptr to bind_cpu.
+ * Must be called with interrupts disabled and apic_ioapic_lock held
+ */
+int
+apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
+ struct ioapic_reprogram_data *drep)
+{
+ int ioapicindex, intin_no;
+ uint32_t airq_temp_cpu;
+ apic_cpus_info_t *cpu_infop;
+ uint32_t rdt_entry;
+ int which_irq;
+ ioapic_rdt_t irdt;
+
+ which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
+
+ intin_no = irq_ptr->airq_intin_no;
+ ioapicindex = irq_ptr->airq_ioapicindex;
+ airq_temp_cpu = irq_ptr->airq_temp_cpu;
+ if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
+ if (airq_temp_cpu & IRQ_USER_BOUND)
+ /* Mask off high bit so it can be used as array index */
+ airq_temp_cpu &= ~IRQ_USER_BOUND;
+
+ ASSERT(apic_cpu_in_range(airq_temp_cpu));
+ }
+
+ /*
+ * Can't bind to a CPU that's not accepting interrupts:
+ */
+ cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
+ if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
+ return (1);
+
+ /*
+ * If we are about to change the interrupt vector for this interrupt,
+ * and this interrupt is level-triggered, attached to an IOAPIC,
+ * has been delivered to a CPU and that CPU has not handled it
+ * yet, we cannot reprogram the IOAPIC now.
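+ * That condition shows up as the Remote IRR bit in the RDT entry;
+ * apic_check_stuck_interrupt() makes the determination and, if
+ * needed, arranges for a deferred retry of the reprogramming.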
+ */ + if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, + intin_no); + + if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && + apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, + bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { + + return (0); + } + + /* + * NOTE: We do not unmask the RDT here, as an interrupt MAY + * still come in before we have a chance to reprogram it below. + * The reprogramming below will simultaneously change and + * unmask the RDT entry. + */ + + if ((uint32_t)bind_cpu == IRQ_UNBOUND) { + irdt.ir_lo = AV_LDEST | AV_LOPRI | + irq_ptr->airq_rdt_entry; + + irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET; + + apic_vt_ops->apic_intrmap_alloc_entry( + &irq_ptr->airq_intrmap_private, NULL, + DDI_INTR_TYPE_FIXED, 1, ioapicindex); + apic_vt_ops->apic_intrmap_map_entry( + irq_ptr->airq_intrmap_private, (void *)&irdt, + DDI_INTR_TYPE_FIXED, 1); + apic_vt_ops->apic_intrmap_record_rdt( + irq_ptr->airq_intrmap_private, &irdt); + + /* Write the RDT entry -- no specific CPU binding */ + WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, + irdt.ir_hi | AV_TOALL); + + if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != + IRQ_UNBOUND) + apic_cpus[airq_temp_cpu].aci_temp_bound--; + + /* + * Write the vector, trigger, and polarity portion of + * the RDT + */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, + irdt.ir_lo); + + irq_ptr->airq_temp_cpu = IRQ_UNBOUND; + return (0); + } + } + + if (bind_cpu & IRQ_USER_BOUND) { + cpu_infop->aci_bound++; + } else { + cpu_infop->aci_temp_bound++; + } + ASSERT(apic_cpu_in_range(bind_cpu)); + + if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { + apic_cpus[airq_temp_cpu].aci_temp_bound--; + } + if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; + irdt.ir_hi = cpu_infop->aci_local_id; + + apic_vt_ops->apic_intrmap_alloc_entry( + &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED, + 1, ioapicindex); + apic_vt_ops->apic_intrmap_map_entry( + irq_ptr->airq_intrmap_private, + (void *)&irdt, DDI_INTR_TYPE_FIXED, 1); + apic_vt_ops->apic_intrmap_record_rdt( + irq_ptr->airq_intrmap_private, &irdt); + + /* Write the RDT entry -- bind to a specific CPU: */ + WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, + irdt.ir_hi); + + /* Write the vector, trigger, and polarity portion of the RDT */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, + irdt.ir_lo); + + } else { + int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? 
+ DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; + if (type == DDI_INTR_TYPE_MSI) { + if (irq_ptr->airq_ioapicindex == + irq_ptr->airq_origirq) { + /* first one */ + DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " + "apic_pci_msi_enable_vector\n")); + apic_pci_msi_enable_vector(irq_ptr, + type, which_irq, irq_ptr->airq_vector, + irq_ptr->airq_intin_no, + cpu_infop->aci_local_id); + } + if ((irq_ptr->airq_ioapicindex + + irq_ptr->airq_intin_no - 1) == + irq_ptr->airq_origirq) { /* last one */ + DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " + "apic_pci_msi_enable_mode\n")); + apic_pci_msi_enable_mode(irq_ptr->airq_dip, + type, which_irq); + } + } else { /* MSI-X */ + apic_pci_msi_enable_vector(irq_ptr, type, + irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, + cpu_infop->aci_local_id); + apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, + irq_ptr->airq_origirq); + } + } + irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; + apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); + return (0); +} + +static void +apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) +{ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) + & AV_REMOTE_IRR) != 0) { + /* + * Trying to clear the bit through normal + * channels has failed. So as a last-ditch + * effort, try to set the trigger mode to + * edge, then to level. This has been + * observed to work on many systems. + */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & ~AV_LEVEL); + + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) | AV_LEVEL); + + /* + * If the bit's STILL set, this interrupt may + * be hosed. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_REMOTE_IRR) != 0) { + + prom_printf("%s: Remote IRR still " + "not clear for IOAPIC %d intin %d.\n" + "\tInterrupts to this pin may cease " + "functioning.\n", psm_name, ioapic_ix, + intin_no); +#ifdef DEBUG + apic_last_ditch_reprogram_failures++; +#endif + } + } +} + +/* + * This function is protected by apic_ioapic_lock coupled with the + * fact that interrupts are disabled. + */ +static void +delete_defer_repro_ent(int which_irq) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + ASSERT(LOCK_HELD(&apic_ioapic_lock)); + + if (apic_reprogram_info[which_irq].done) + return; + + apic_reprogram_info[which_irq].done = B_TRUE; + +#ifdef DEBUG + apic_defer_repro_total_retries += + apic_reprogram_info[which_irq].tries; + + apic_defer_repro_successes++; +#endif + + if (--apic_reprogram_outstanding == 0) { + + setlvlx = psm_intr_exit_fn(); + } +} + + +/* + * Interrupts must be disabled during this function to prevent + * self-deadlock. Interrupts are disabled because this function + * is called from apic_check_stuck_interrupt(), which is called + * from apic_rebind(), which requires its caller to disable interrupts. + */ +static void +add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + ASSERT(!interrupts_enabled()); + + /* + * On the off-chance that there's already a deferred + * reprogramming on this irq, check, and if so, just update the + * CPU and irq pointer to which the interrupt is targeted, then return. 
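+ * (An entry is pending exactly while its 'done' flag is B_FALSE;
+ * compare delete_defer_repro_ent() above.)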
+ */ + if (!apic_reprogram_info[which_irq].done) { + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].irqp = irq_ptr; + return; + } + + apic_reprogram_info[which_irq].irqp = irq_ptr; + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].tries = 0; + /* + * This must be the last thing set, since we're not + * grabbing any locks, apic_try_deferred_reprogram() will + * make its decision about using this entry iff done + * is false. + */ + apic_reprogram_info[which_irq].done = B_FALSE; + + /* + * If there were previously no deferred reprogrammings, change + * setlvlx to call apic_try_deferred_reprogram() + */ + if (++apic_reprogram_outstanding == 1) { + + setlvlx = apic_try_deferred_reprogram; + } +} + +static void +apic_try_deferred_reprogram(int prev_ipl, int irq) +{ + int reproirq; + ulong_t iflag; + struct ioapic_reprogram_data *drep; + + (*psm_intr_exit_fn())(prev_ipl, irq); + + if (!lock_try(&apic_defer_reprogram_lock)) { + return; + } + + /* + * Acquire the apic_ioapic_lock so that any other operations that + * may affect the apic_reprogram_info state are serialized. + * It's still possible for the last deferred reprogramming to clear + * between the time we entered this function and the time we get to + * the for loop below. In that case, *setlvlx will have been set + * back to *_intr_exit and drep will be NULL. (There's no way to + * stop that from happening -- we would need to grab a lock before + * calling *setlvlx, which is neither realistic nor prudent). + */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* + * For each deferred RDT entry, try to reprogram it now. Note that + * there is no lock acquisition to read apic_reprogram_info because + * '.done' is set only after the other fields in the structure are set. + */ + + drep = NULL; + for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { + if (apic_reprogram_info[reproirq].done == B_FALSE) { + drep = &apic_reprogram_info[reproirq]; + break; + } + } + + /* + * Either we found a deferred action to perform, or + * we entered this function spuriously, after *setlvlx + * was restored to point to *_intr_exit. Any other + * permutation is invalid. + */ + ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); + + /* + * Though we can't really do anything about errors + * at this point, keep track of them for reporting. + * Note that it is very possible for apic_setup_io_intr + * to re-register this very timeout if the Remote IRR bit + * has not yet cleared. + */ + +#ifdef DEBUG + if (drep != NULL) { + if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { + apic_deferred_setup_failures++; + } + } else { + apic_deferred_spurious_enters++; + } +#else + if (drep != NULL) + (void) apic_setup_io_intr(drep, reproirq, B_TRUE); +#endif + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + lock_clear(&apic_defer_reprogram_lock); +} + +static void +apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) +{ + int waited; + + /* + * Wait for the delivery pending bit to clear. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & + (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { + + /* + * If we're still waiting on the delivery of this interrupt, + * continue to wait here until it is delivered (this should be + * a very small amount of time, but include a timeout just in + * case). 
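+ * The polling loop below bounds that wait at
+ * apic_max_reps_clear_pending iterations.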
+ */ + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_PENDING) == 0) { + break; + } + } + } +} + + +/* + * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR + * bit set. Calls functions that modify the function that setlvlx points to, + * so that the reprogramming can be retried very shortly. + * + * This function will mask the RDT entry if the interrupt is level-triggered. + * (The caller is responsible for unmasking the RDT entry.) + * + * Returns non-zero if the caller should defer IOAPIC reprogramming. + */ +static int +apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, + int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, + struct ioapic_reprogram_data *drep) +{ + int32_t rdt_entry; + int waited; + int reps = 0; + + /* + * Wait for the delivery pending bit to clear. + */ + do { + ++reps; + + apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); + + /* + * Mask the RDT entry, but only if it's a level-triggered + * interrupt + */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no); + if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { + + /* Mask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, + AV_MASK | rdt_entry); + } + + if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { + /* + * If there was a race and an interrupt was injected + * just before we masked, check for that case here. + * Then, unmask the RDT entry and try again. If we're + * on our last try, don't unmask (because we want the + * RDT entry to remain masked for the rest of the + * function). + */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no); + if ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)) { + /* Unmask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, rdt_entry & ~AV_MASK); + } + } + + } while ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)); + +#ifdef DEBUG + if (rdt_entry & AV_PENDING) + apic_intr_deliver_timeouts++; +#endif + + /* + * If the remote IRR bit is set, then the interrupt has been sent + * to a CPU for processing. We have no choice but to wait for + * that CPU to process the interrupt, at which point the remote IRR + * bit will be cleared. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & + (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) { + + /* + * If the CPU that this RDT is bound to is NOT the current + * CPU, wait until that CPU handles the interrupt and ACKs + * it. If this interrupt is not bound to any CPU (that is, + * if it's bound to the logical destination of "anyone"), it + * may have been delivered to the current CPU so handle that + * case by deferring the reprogramming (below). + */ + if ((old_bind_cpu != IRQ_UNBOUND) && + (old_bind_cpu != IRQ_UNINIT) && + (old_bind_cpu != psm_get_cpu_id())) { + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_REMOTE_IRR) == 0) { + + delete_defer_repro_ent(which_irq); + + /* Remote IRR has cleared! 
*/
+ return (0);
+ }
+ }
+ }
+
+ /*
+ * If we waited and the Remote IRR bit is still not cleared,
+ * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
+ * times for this interrupt, try the last-ditch workaround:
+ */
+ if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
+
+ apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
+
+ /* Mark this one as reprogrammed: */
+ delete_defer_repro_ent(which_irq);
+
+ return (0);
+ } else {
+#ifdef DEBUG
+ apic_intr_deferrals++;
+#endif
+
+ /*
+ * If waiting for the Remote IRR bit (above) didn't
+ * allow it to clear, defer the reprogramming.
+ * Add a new deferred-programming entry if the
+ * caller passed a NULL one (and update the existing one
+ * in case anything changed).
+ */
+ add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
+ if (drep)
+ drep->tries++;
+
+ /* Inform caller to defer IOAPIC programming: */
+ return (1);
+ }
+
+ }
+
+ /* Remote IRR is clear */
+ delete_defer_repro_ent(which_irq);
+
+ return (0);
+}
+
+/*
+ * Called to migrate all interrupts at an irq to another cpu.
+ * Must be called with interrupts disabled and apic_ioapic_lock held
+ */
+int
+apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
+{
+ apic_irq_t *irqptr = irq_ptr;
+ int retval = 0;
+
+ while (irqptr) {
+ if (irqptr->airq_temp_cpu != IRQ_UNINIT)
+ retval |= apic_rebind(irqptr, bind_cpu, NULL);
+ irqptr = irqptr->airq_next;
+ }
+
+ return (retval);
+}
+
+/*
+ * apic_intr_redistribute does all the messy computations for identifying
+ * which interrupt to move to which CPU. Currently we do just one interrupt
+ * at a time. This reduces the time we spend doing all this within the
+ * clock interrupt. When it is done in idle, we could do more than one.
+ * First we find the most busy and the most free CPU (time in ISR only),
+ * skipping those CPUs that have been identified as ineligible (cpu_skip).
+ * Then we look for IRQs which are closest to the difference between the
+ * most busy CPU and the average ISR load. We try to find one whose load
+ * is less than the difference. If none exists, we choose one larger than
+ * the difference, provided it does not make the most idle CPU worse than
+ * the most busy one. In the end, we clear all the busy fields for CPUs.
+ * For IRQs, they are cleared as they are scanned.
+ */
+void
+apic_intr_redistribute(void)
+{
+ int busiest_cpu, most_free_cpu;
+ int cpu_free, cpu_busy, max_busy, min_busy;
+ int min_free, diff;
+ int average_busy, cpus_online;
+ int i, busy;
+ ulong_t iflag;
+ apic_cpus_info_t *cpu_infop;
+ apic_irq_t *min_busy_irq = NULL;
+ apic_irq_t *max_busy_irq = NULL;
+
+ busiest_cpu = most_free_cpu = -1;
+ cpu_free = cpu_busy = max_busy = average_busy = 0;
+ min_free = apic_sample_factor_redistribution;
+ cpus_online = 0;
+ /*
+ * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
+ * without ioapic_lock. That is OK as we are just doing statistical
+ * sampling anyway and any inaccuracy now will get corrected next time.
+ * The call to rebind which actually changes things will make sure
+ * we are consistent.
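+ *
+ * As a worked example: with ISR-busy samples of 80 on the busiest
+ * CPU and 20 on the most free CPU of two online CPUs, the average
+ * is 50 and diff is 30. An IRQ on the busiest CPU with busy below
+ * 30 is the preferred move; failing that, the least busy IRQ above
+ * 30 qualifies only while its load stays under diff + average -
+ * min_free (60 here), which is exactly what keeps the target CPU
+ * from ending up busier than the source was.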
+ */
+ for (i = 0; i < apic_nproc; i++) {
+ if (apic_cpu_in_range(i) &&
+ !(apic_redist_cpu_skip & (1 << i)) &&
+ (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
+
+ cpu_infop = &apic_cpus[i];
+ /*
+ * If no unbound interrupts or only 1 total on this
+ * CPU, skip
+ */
+ if (!cpu_infop->aci_temp_bound ||
+ (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
+ == 1) {
+ apic_redist_cpu_skip |= 1 << i;
+ continue;
+ }
+
+ busy = cpu_infop->aci_busy;
+ average_busy += busy;
+ cpus_online++;
+ if (max_busy < busy) {
+ max_busy = busy;
+ busiest_cpu = i;
+ }
+ if (min_free > busy) {
+ min_free = busy;
+ most_free_cpu = i;
+ }
+ if (busy > apic_int_busy_mark) {
+ cpu_busy |= 1 << i;
+ } else {
+ if (busy < apic_int_free_mark)
+ cpu_free |= 1 << i;
+ }
+ }
+ }
+ if ((cpu_busy && cpu_free) ||
+ (max_busy >= (min_free + apic_diff_for_redistribution))) {
+
+ apic_num_imbalance++;
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf(
+ "redistribute busy=%x free=%x max=%x min=%x",
+ cpu_busy, cpu_free, max_busy, min_free);
+ }
+#endif /* DEBUG */
+
+
+ average_busy /= cpus_online;
+
+ diff = max_busy - average_busy;
+ min_busy = max_busy; /* start with the max possible value */
+ max_busy = 0;
+ min_busy_irq = max_busy_irq = NULL;
+ i = apic_min_device_irq;
+ for (; i <= apic_max_device_irq; i++) {
+ apic_irq_t *irq_ptr;
+ /* Change to linked list per CPU? */
+ if ((irq_ptr = apic_irq_table[i]) == NULL)
+ continue;
+ /* Check for irq_busy & decide which one to move */
+ /* Also zero them for next round */
+ if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
+ irq_ptr->airq_busy) {
+ if (irq_ptr->airq_busy < diff) {
+ /*
+ * Check for least busy CPU,
+ * best fit or what?
+ */
+ if (max_busy < irq_ptr->airq_busy) {
+ /*
+ * Most busy within the
+ * required differential
+ */
+ max_busy = irq_ptr->airq_busy;
+ max_busy_irq = irq_ptr;
+ }
+ } else {
+ if (min_busy > irq_ptr->airq_busy) {
+ /*
+ * least busy, but more than
+ * the reqd diff
+ */
+ if (min_busy <
+ (diff + average_busy -
+ min_free)) {
+ /*
+ * Making sure new cpu
+ * will not end up
+ * worse
+ */
+ min_busy =
+ irq_ptr->airq_busy;
+
+ min_busy_irq = irq_ptr;
+ }
+ }
+ }
+ }
+ irq_ptr->airq_busy = 0;
+ }
+
+ if (max_busy_irq != NULL) {
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf("rebinding %x to %x",
+ max_busy_irq->airq_vector, most_free_cpu);
+ }
+#endif /* DEBUG */
+ iflag = intr_clear();
+ if (lock_try(&apic_ioapic_lock)) {
+ if (apic_rebind_all(max_busy_irq,
+ most_free_cpu) == 0) {
+ /* Make change permanent */
+ max_busy_irq->airq_cpu =
+ (uint32_t)most_free_cpu;
+ }
+ lock_clear(&apic_ioapic_lock);
+ }
+ intr_restore(iflag);
+
+ } else if (min_busy_irq != NULL) {
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf("rebinding %x to %x",
+ min_busy_irq->airq_vector, most_free_cpu);
+ }
+#endif /* DEBUG */
+
+ iflag = intr_clear();
+ if (lock_try(&apic_ioapic_lock)) {
+ if (apic_rebind_all(min_busy_irq,
+ most_free_cpu) == 0) {
+ /* Make change permanent */
+ min_busy_irq->airq_cpu =
+ (uint32_t)most_free_cpu;
+ }
+ lock_clear(&apic_ioapic_lock);
+ }
+ intr_restore(iflag);
+
+ } else {
+ if (cpu_busy != (1 << busiest_cpu)) {
+ apic_redist_cpu_skip |= 1 << busiest_cpu;
+ /*
+ * We leave cpu_skip set so that next time we
+ * can choose another cpu
+ */
+ }
+ apic_num_rebind++;
+ } else {
+ /*
+ * found nothing. Could be that we skipped over valid CPUs
+ * or we have balanced everything. 
If we had a variable + * ticks_for_redistribution, it could be increased here. + * apic_int_busy, int_free etc would also need to be + * changed. + */ + if (apic_redist_cpu_skip) + apic_redist_cpu_skip = 0; + } + for (i = 0; i < apic_nproc; i++) { + if (apic_cpu_in_range(i)) { + apic_cpus[i].aci_busy = 0; + } + } +} + +void +apic_cleanup_busy(void) +{ + int i; + apic_irq_t *irq_ptr; + + for (i = 0; i < apic_nproc; i++) { + if (apic_cpu_in_range(i)) { + apic_cpus[i].aci_busy = 0; + } + } + + for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { + if ((irq_ptr = apic_irq_table[i]) != NULL) + irq_ptr->airq_busy = 0; + } +} diff --git a/usr/src/uts/i86pc/io/pci/pci_common.c b/usr/src/uts/i86pc/io/pci/pci_common.c index ab74bd7bed..ad689868bc 100644 --- a/usr/src/uts/i86pc/io/pci/pci_common.c +++ b/usr/src/uts/i86pc/io/pci/pci_common.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* @@ -59,10 +58,15 @@ static int pci_enable_intr(dev_info_t *, dev_info_t *, ddi_intr_handle_impl_t *, uint32_t); static void pci_disable_intr(dev_info_t *, dev_info_t *, ddi_intr_handle_impl_t *, uint32_t); +static int pci_alloc_intr_fixed(dev_info_t *, dev_info_t *, + ddi_intr_handle_impl_t *, void *); +static int pci_free_intr_fixed(dev_info_t *, dev_info_t *, + ddi_intr_handle_impl_t *); -/* Extern decalration for pcplusmp module */ +/* Extern declarations for PSM module */ extern int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t, int *); +extern ddi_irm_pool_t *apix_irm_pool_p; /* * pci_name_child: @@ -205,6 +209,7 @@ pci_common_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, ihdl_plat_t *ihdl_plat_datap; ddi_intr_handle_t *h_array; ddi_acc_handle_t handle; + apic_get_intr_t intrinfo; DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: pdip 0x%p, rdip 0x%p, op %x handle 0x%p\n", @@ -264,7 +269,7 @@ pci_common_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: " "rdip: 0x%p supported types: 0x%x\n", (void *)rdip, - *(int *)result)); + types)); /* * Export any MSI/MSI-X cap locations via properties @@ -302,9 +307,14 @@ SUPPORTED_TYPES_OUT: } break; case DDI_INTROP_ALLOC: + + /* + * FIXED type + */ + if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) + return (pci_alloc_intr_fixed(pdip, rdip, hdlp, result)); /* * MSI or MSIX (figure out number of vectors available) - * FIXED interrupts: just return available interrupts */ if (DDI_INTR_IS_MSI_OR_MSIX(hdlp->ih_type) && (psm_intr_ops != NULL) && @@ -411,12 +421,6 @@ SUPPORTED_TYPES_OUT: ++pcieb_intr_pri_counter; } - } else if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) { - /* Figure out if this device supports MASKING */ - pci_rval = pci_intx_get_cap(rdip, &pci_status); - if (pci_rval == DDI_SUCCESS && pci_status) - hdlp->ih_cap |= pci_status; - *(int *)result = 1; /* DDI_INTR_TYPE_FIXED */ } else return (DDI_FAILURE); break; @@ -446,7 +450,10 @@ SUPPORTED_TYPES_OUT: i_ddi_set_msix(hdlp->ih_dip, NULL); } } - } + } else if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) { + return (pci_free_intr_fixed(pdip, rdip, hdlp)); + } else + return (DDI_FAILURE); break; case DDI_INTROP_GETPRI: /* Get the priority */ @@ -532,7 +539,7 @@ SUPPORTED_TYPES_OUT: else if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) pci_rval = pci_intx_get_cap(rdip, &pci_status); - /* next check with pcplusmp */ + /* next check with PSM module */ if (psm_intr_ops != 
NULL) psm_rval = (*psm_intr_ops)(rdip, hdlp, PSM_INTR_OP_GET_CAP, &psm_status); @@ -660,7 +667,7 @@ SUPPORTED_TYPES_OUT: pci_status = pci_intx_clr_mask(rdip); } - /* For MSI/X; no need to check with pcplusmp */ + /* For MSI/X; no need to check with PSM module */ if (hdlp->ih_type != DDI_INTR_TYPE_FIXED) return (pci_status); @@ -669,7 +676,7 @@ SUPPORTED_TYPES_OUT: pci_status == DDI_SUCCESS) break; - /* For fixed interrupts only: confer with pcplusmp next */ + /* For fixed interrupts only: confer with PSM module next */ if (psm_intr_ops != NULL) { /* If interrupt is shared; do nothing */ psm_rval = (*psm_intr_ops)(rdip, hdlp, @@ -678,7 +685,7 @@ SUPPORTED_TYPES_OUT: if (psm_rval == PSM_FAILURE || psm_status == 1) return (pci_status); - /* Now, pcplusmp should try to set/clear the mask */ + /* Now, PSM module should try to set/clear the mask */ if (intr_op == DDI_INTROP_SETMASK) psm_rval = (*psm_intr_ops)(rdip, hdlp, PSM_INTR_OP_SET_MASK, NULL); @@ -698,7 +705,7 @@ SUPPORTED_TYPES_OUT: else if (hdlp->ih_type == DDI_INTR_TYPE_FIXED) pci_rval = pci_intx_get_pending(rdip, &pci_status); - /* On failure; next try with pcplusmp */ + /* On failure; next try with PSM module */ if (pci_rval != DDI_SUCCESS && psm_intr_ops != NULL) psm_rval = (*psm_intr_ops)(rdip, hdlp, PSM_INTR_OP_GET_PENDING, &psm_status); @@ -722,27 +729,44 @@ SUPPORTED_TYPES_OUT: case DDI_INTROP_GETTARGET: DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: GETTARGET\n")); - /* Note hdlp->ih_vector is actually an irq */ - if ((rv = pci_get_cpu_from_vecirq(hdlp->ih_vector, IS_IRQ)) == - -1) + bcopy(hdlp, &tmp_hdl, sizeof (ddi_intr_handle_impl_t)); + tmp_hdl.ih_private = (void *)&intrinfo; + intrinfo.avgi_req_flags = PSMGI_INTRBY_DEFAULT; + intrinfo.avgi_req_flags |= PSMGI_REQ_CPUID; + + if ((*psm_intr_ops)(rdip, &tmp_hdl, PSM_INTR_OP_GET_INTR, + NULL) == PSM_FAILURE) return (DDI_FAILURE); - *(int *)result = rv; + + *(int *)result = intrinfo.avgi_cpu_id; DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: GETTARGET " - "vector = 0x%x, cpu = 0x%x\n", hdlp->ih_vector, rv)); + "vector = 0x%x, cpu = 0x%x\n", hdlp->ih_vector, + *(int *)result)); break; case DDI_INTROP_SETTARGET: DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: SETTARGET\n")); - /* hdlp->ih_vector is actually an irq */ - tmp_hdl.ih_vector = hdlp->ih_vector; - tmp_hdl.ih_flags = PSMGI_INTRBY_IRQ; + bcopy(hdlp, &tmp_hdl, sizeof (ddi_intr_handle_impl_t)); tmp_hdl.ih_private = (void *)(uintptr_t)*(int *)result; - psm_rval = (*psm_intr_ops)(rdip, &tmp_hdl, PSM_INTR_OP_SET_CPU, - &psm_status); + tmp_hdl.ih_flags = PSMGI_INTRBY_DEFAULT; - if (psm_rval != PSM_SUCCESS) + if ((*psm_intr_ops)(rdip, &tmp_hdl, PSM_INTR_OP_SET_CPU, + &psm_status) == PSM_FAILURE) return (DDI_FAILURE); + + hdlp->ih_vector = tmp_hdl.ih_vector; + DDI_INTR_NEXDBG((CE_CONT, "pci_common_intr_ops: SETTARGET " + "vector = 0x%x\n", hdlp->ih_vector)); break; + case DDI_INTROP_GETPOOL: + /* + * For MSI/X interrupts use global IRM pool if available. + */ + if (apix_irm_pool_p && DDI_INTR_IS_MSI_OR_MSIX(hdlp->ih_type)) { + *(ddi_irm_pool_t **)result = apix_irm_pool_p; + return (DDI_SUCCESS); + } + return (DDI_ENOTSUP); default: return (i_ddi_intr_ops(pdip, rdip, intr_op, hdlp, result)); } @@ -750,6 +774,107 @@ SUPPORTED_TYPES_OUT: return (DDI_SUCCESS); } +/* + * Allocate a vector for FIXED type interrupt. 
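+ * With the apix PSM the vector is allocated right here through
+ * PSM_INTR_OP_ALLOC_VECTORS; with any other PSM the allocation
+ * still happens later, in the ddi_enable_intr() path.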
+ */ +int +pci_alloc_intr_fixed(dev_info_t *pdip, dev_info_t *rdip, + ddi_intr_handle_impl_t *hdlp, void *result) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + int free_phdl = 0; + int pci_rval; + int pci_status = 0; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + /* Figure out if this device supports MASKING */ + pci_rval = pci_intx_get_cap(rdip, &pci_status); + if (pci_rval == DDI_SUCCESS && pci_status) + hdlp->ih_cap |= pci_status; + + /* + * If the PSM module is "APIX" then pass the request for + * allocating the vector now. + */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + ispec = (struct intrspec *)pci_intx_get_ispec(pdip, rdip, + (int)hdlp->ih_inum); + if (ispec == NULL) + return (DDI_FAILURE); + if (hdlp->ih_private == NULL) { /* allocate phdl structure */ + free_phdl = 1; + i_ddi_alloc_intr_phdl(hdlp); + } + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_ALLOC_VECTORS, result); + if (free_phdl) { /* free up the phdl structure */ + free_phdl = 0; + i_ddi_free_intr_phdl(hdlp); + hdlp->ih_private = NULL; + } + } else { + /* + * No APIX module; fall back to the old scheme where the + * interrupt vector is allocated during ddi_enable_intr() call. + */ + *(int *)result = 1; + ret = DDI_SUCCESS; + } + + return (ret); +} + +/* + * Free up the vector for FIXED (legacy) type interrupt. + */ +static int +pci_free_intr_fixed(dev_info_t *pdip, dev_info_t *rdip, + ddi_intr_handle_impl_t *hdlp) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request to it + * to free up the vector now. + */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + ispec = (struct intrspec *)pci_intx_get_ispec(pdip, rdip, + (int)hdlp->ih_inum); + if (ispec == NULL) + return (DDI_FAILURE); + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_FREE_VECTORS, NULL); + } else { + /* + * No APIX module; fall back to the old scheme where + * the interrupt vector was already freed during + * ddi_disable_intr() call. + */ + ret = DDI_SUCCESS; + } + + return (ret); +} + int pci_get_intr_from_vecirq(apic_get_intr_t *intrinfo_p, int vecirq, boolean_t is_irq) @@ -765,7 +890,7 @@ pci_get_intr_from_vecirq(apic_get_intr_t *intrinfo_p, * global interrupt handling. */ get_info_ii_hdl.ih_private = intrinfo_p; - get_info_ii_hdl.ih_vector = (ushort_t)vecirq; + get_info_ii_hdl.ih_vector = vecirq; if ((*psm_intr_ops)(NULL, &get_info_ii_hdl, PSM_INTR_OP_GET_INTR, NULL) == PSM_FAILURE) @@ -779,8 +904,8 @@ int pci_get_cpu_from_vecirq(int vecirq, boolean_t is_irq) { int rval; - apic_get_intr_t intrinfo; + intrinfo.avgi_req_flags = PSMGI_REQ_CPUID; rval = pci_get_intr_from_vecirq(&intrinfo, vecirq, is_irq); @@ -825,8 +950,7 @@ pci_enable_intr(dev_info_t *pdip, dev_info_t *rdip, hdlp->ih_cb_arg2, &ihdl_plat_datap->ip_ticks, rdip)) return (DDI_FAILURE); - /* Note this really is an irq. 
*/ - hdlp->ih_vector = (ushort_t)irq; + hdlp->ih_vector = irq; return (DDI_SUCCESS); } diff --git a/usr/src/uts/i86pc/io/pci/pci_kstats.c b/usr/src/uts/i86pc/io/pci/pci_kstats.c index d31c6f29d9..ea7fcc9dc1 100644 --- a/usr/src/uts/i86pc/io/pci/pci_kstats.c +++ b/usr/src/uts/i86pc/io/pci/pci_kstats.c @@ -19,12 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Kstat support for X86 PCI driver */ @@ -68,6 +64,9 @@ static char ih_buspath[MAXPATHLEN]; static uint32_t pci_ks_inst; static kmutex_t pci_ks_template_lock; +extern int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, + psm_intr_op_t, int *); + /*ARGSUSED*/ static int pci_ih_ks_update(kstat_t *ksp, int rw) @@ -75,7 +74,7 @@ pci_ih_ks_update(kstat_t *ksp, int rw) pci_kstat_private_t *private_data = (pci_kstat_private_t *)ksp->ks_private; dev_info_t *rootnex_dip = private_data->rootnex_dip; - ddi_intr_handle_impl_t *ih_p = private_data->hdlp; + ddi_intr_handle_impl_t tmp_hdl, *ih_p = private_data->hdlp; dev_info_t *dip = ih_p->ih_dip; int maxlen = sizeof (pci_ks_template.ihks_name.value.c); apic_get_intr_t intrinfo; @@ -88,11 +87,6 @@ pci_ih_ks_update(kstat_t *ksp, int rw) kstat_named_setstr(&pci_ks_template.ihks_buspath, ih_buspath); /* - * ih_p->ih_vector really has an IRQ. Ask pci_get_intr_from_vecirq to - * return a vector since that's what PCItool will require intrd to use. - * - * PCItool will change the CPU routing of the IRQ that vector maps to. - * * Note that although possibly multiple vectors can map to an IRQ, the * vector returned below will always be the same for a given IRQ * specified, and so all kstats for a given IRQ will report the same @@ -107,10 +101,14 @@ pci_ih_ks_update(kstat_t *ksp, int rw) * It is also possible that the vector is for a dummy interrupt. * pci_get_intr_from_vecirq will return failure in this case. */ - intrinfo.avgi_cpu_id = 0; /* In case pci_get_intr_from_vecirq fails */ + bcopy(ih_p, &tmp_hdl, sizeof (ddi_intr_handle_impl_t)); + tmp_hdl.ih_private = (void *)&intrinfo; + intrinfo.avgi_cpu_id = 0; /* In case psm_intr_ops fails */ intrinfo.avgi_req_flags = PSMGI_REQ_CPUID | PSMGI_REQ_VECTOR; + intrinfo.avgi_req_flags |= PSMGI_INTRBY_DEFAULT; + if ((ih_p->ih_state != DDI_IHDL_STATE_ENABLE) || - (pci_get_intr_from_vecirq(&intrinfo, ih_p->ih_vector, IS_IRQ) != + ((*psm_intr_ops)(NULL, &tmp_hdl, PSM_INTR_OP_GET_INTR, NULL) != DDI_SUCCESS) || (intrinfo.avgi_cpu_id & PSMGI_CPU_FLAGS)) { diff --git a/usr/src/uts/i86pc/io/pci/pci_tools.c b/usr/src/uts/i86pc/io/pci/pci_tools.c index b0e89f8b0f..4cdfbf7591 100644 --- a/usr/src/uts/i86pc/io/pci/pci_tools.c +++ b/usr/src/uts/i86pc/io/pci/pci_tools.c @@ -36,6 +36,7 @@ #include <sys/pci_tools.h> #include <io/pci/pci_tools_ext.h> #include <sys/apic.h> +#include <sys/apix.h> #include <io/pci/pci_var.h> #include <sys/pci_impl.h> #include <sys/promif.h> @@ -131,6 +132,7 @@ pcitool_set_intr(dev_info_t *dip, void *arg, int mode) int ret, result; size_t copyinout_size; int rval = SUCCESS; + apic_get_type_t type_info; /* Version 1 of pcitool_intr_set_t doesn't have flags. 
*/ copyinout_size = (size_t)&iset.flags - (size_t)&iset; @@ -160,20 +162,38 @@ pcitool_set_intr(dev_info_t *dip, void *arg, int mode) goto done_set_intr; } - if (iset.ino > APIC_MAX_VECTOR) { - rval = EINVAL; - iset.status = PCITOOL_INVALID_INO; + info_hdl.ih_private = &type_info; + + if ((*psm_intr_ops)(NULL, &info_hdl, + PSM_INTR_OP_APIC_TYPE, NULL) != PSM_SUCCESS) { + rval = ENOTSUP; + iset.status = PCITOOL_IO_ERROR; goto done_set_intr; } - iset.status = PCITOOL_SUCCESS; + if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if (iset.old_cpu > type_info.avgi_num_cpu) { + rval = EINVAL; + iset.status = PCITOOL_INVALID_CPUID; + goto done_set_intr; + } + old_cpu = iset.old_cpu; + } else { + if ((old_cpu = + pci_get_cpu_from_vecirq(iset.ino, IS_VEC)) == -1) { + iset.status = PCITOOL_IO_ERROR; + rval = EINVAL; + goto done_set_intr; + } + } - if ((old_cpu = pci_get_cpu_from_vecirq(iset.ino, IS_VEC)) == -1) { - iset.status = PCITOOL_IO_ERROR; + if (iset.ino > type_info.avgi_num_intr) { rval = EINVAL; + iset.status = PCITOOL_INVALID_INO; goto done_set_intr; } + iset.status = PCITOOL_SUCCESS; old_cpu &= ~PSMGI_CPU_USER_BOUND; @@ -181,7 +201,11 @@ pcitool_set_intr(dev_info_t *dip, void *arg, int mode) * For this locally-declared and used handle, ih_private will contain a * CPU value, not an ihdl_plat_t as used for global interrupt handling. */ - info_hdl.ih_vector = iset.ino; + if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + info_hdl.ih_vector = APIX_VIRTVECTOR(old_cpu, iset.ino); + } else { + info_hdl.ih_vector = iset.ino; + } info_hdl.ih_private = (void *)(uintptr_t)iset.cpu_id; info_hdl.ih_flags = PSMGI_INTRBY_VEC; if (pcitool_debug) @@ -222,6 +246,11 @@ pcitool_set_intr(dev_info_t *dip, void *arg, int mode) /* Return original CPU. */ iset.cpu_id = old_cpu; + /* Return new vector */ + if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + iset.ino = APIX_VIRTVEC_VECTOR(info_hdl.ih_vector); + } + done_set_intr: iset.drvr_version = PCITOOL_VERSION; if (ddi_copyout(&iset, arg, copyinout_size, mode) != DDI_SUCCESS) @@ -256,6 +285,7 @@ pcitool_get_intr(dev_info_t *dip, void *arg, int mode) ddi_intr_handle_impl_t info_hdl; apic_get_intr_t intr_info; + apic_get_type_t type_info; /* Read in just the header part, no array section. */ if (ddi_copyin(arg, &partial_iget, PCITOOL_IGET_SIZE(0), mode) != @@ -269,8 +299,28 @@ pcitool_get_intr(dev_info_t *dip, void *arg, int mode) goto done_get_intr; } + info_hdl.ih_private = &type_info; + + if ((*psm_intr_ops)(NULL, &info_hdl, + PSM_INTR_OP_APIC_TYPE, NULL) != PSM_SUCCESS) { + iget->status = PCITOOL_IO_ERROR; + iget->num_devs_ret = 0; + rval = EINVAL; + goto done_get_intr; + } + + if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if (partial_iget.cpu_id > type_info.avgi_num_cpu) { + partial_iget.status = PCITOOL_INVALID_CPUID; + partial_iget.num_devs_ret = 0; + rval = EINVAL; + goto done_get_intr; + } + } + /* Validate argument. */ - if (partial_iget.ino > APIC_MAX_VECTOR) { + if ((partial_iget.ino & APIX_VIRTVEC_VECMASK) > + type_info.avgi_num_intr) { partial_iget.status = PCITOOL_INVALID_INO; partial_iget.num_devs_ret = 0; rval = EINVAL; @@ -287,7 +337,13 @@ pcitool_get_intr(dev_info_t *dip, void *arg, int mode) * global interrupt handling. 
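
[Editor's sketch, not part of the patch] Under apix, pcitool_set_intr() above composes ih_vector with APIX_VIRTVECTOR(cpu, ino) and recovers the vector with APIX_VIRTVEC_VECTOR(). The real macro bodies live in <sys/apix.h>; the self-contained model below assumes the conventional layout of the CPU id above an 8-bit vector field, which is consistent with the APIX_VIRTVEC_VECMASK test in pcitool_get_intr() further on:

	#include <stdio.h>
	#include <stdint.h>

	#define	VIRTVEC_SHIFT		8
	#define	VIRTVEC_VECMASK		0xffU
	#define	VIRTVECTOR(cpu, v)	(((uint32_t)(cpu) << VIRTVEC_SHIFT) | (v))
	#define	VIRTVEC_VECTOR(vv)	((vv) & VIRTVEC_VECMASK)
	#define	VIRTVEC_CPU(vv)		((vv) >> VIRTVEC_SHIFT)

	int
	main(void)
	{
		uint32_t vv = VIRTVECTOR(3, 0x21);	/* vector 0x21 on CPU 3 */

		printf("virtual vector 0x%x -> cpu %u, vector 0x%x\n",
		    vv, VIRTVEC_CPU(vv), VIRTVEC_VECTOR(vv));
		return (0);
	}
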
*/ info_hdl.ih_private = &intr_info; - info_hdl.ih_vector = partial_iget.ino; + + if (strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + info_hdl.ih_vector = + APIX_VIRTVECTOR(partial_iget.cpu_id, partial_iget.ino); + } else { + info_hdl.ih_vector = partial_iget.ino; + } /* Caller wants device information returned. */ if (num_devs_ret > 0) { @@ -393,6 +449,7 @@ pcitool_intr_info(dev_info_t *dip, void *arg, int mode) pcitool_intr_info_t intr_info; ddi_intr_handle_impl_t info_hdl; int rval = SUCCESS; + apic_get_type_t type_info; /* If we need user_version, and to ret same user version as passed in */ if (ddi_copyin(arg, &intr_info, sizeof (pcitool_intr_info_t), mode) != @@ -405,22 +462,31 @@ pcitool_intr_info(dev_info_t *dip, void *arg, int mode) if (intr_info.flags & PCITOOL_INTR_FLAG_GET_MSI) return (ENOTSUP); + info_hdl.ih_private = &type_info; + /* For UPPC systems, psm_intr_ops has no entry for APIC_TYPE. */ if ((rval = (*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL)) != PSM_SUCCESS) { intr_info.ctlr_type = PCITOOL_CTLR_TYPE_UPPC; intr_info.ctlr_version = 0; - + intr_info.num_intr = APIC_MAX_VECTOR; } else { intr_info.ctlr_version = (uint32_t)info_hdl.ih_ver; - if (strcmp((char *)info_hdl.ih_private, - APIC_PCPLUSMP_NAME) == 0) + intr_info.num_cpu = type_info.avgi_num_cpu; + if (strcmp(type_info.avgi_type, + APIC_PCPLUSMP_NAME) == 0) { intr_info.ctlr_type = PCITOOL_CTLR_TYPE_PCPLUSMP; - else + intr_info.num_intr = type_info.avgi_num_intr; + } else if (strcmp(type_info.avgi_type, + APIC_APIX_NAME) == 0) { + intr_info.ctlr_type = PCITOOL_CTLR_TYPE_APIX; + intr_info.num_intr = type_info.avgi_num_intr; + } else { intr_info.ctlr_type = PCITOOL_CTLR_TYPE_UNKNOWN; + intr_info.num_intr = APIC_MAX_VECTOR; + } } - intr_info.num_intr = APIC_MAX_VECTOR; intr_info.drvr_version = PCITOOL_VERSION; if (ddi_copyout(&intr_info, arg, sizeof (pcitool_intr_info_t), mode) != DDI_SUCCESS) { diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic.c b/usr/src/uts/i86pc/io/pcplusmp/apic.c index f5a0d5bbdf..22553d39d3 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c @@ -27,6 +27,7 @@ * All rights reserved. */ + /* * PSMI 1.1 extensions are supported only in 2.6 and later versions. * PSMI 1.2 extensions are supported only in 2.7 and later versions. 
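
[Editor's sketch, not part of the patch] pcitool_intr_info() below reduces to a three-way classification: no APIC_TYPE op means UPPC, otherwise the returned name string selects pcplusmp or apix. Distilled into a hypothetical helper:

	static int
	classify_ctlr(const char *avgi_type)
	{
		if (avgi_type == NULL)		/* APIC_TYPE op failed */
			return (PCITOOL_CTLR_TYPE_UPPC);
		if (strcmp(avgi_type, APIC_PCPLUSMP_NAME) == 0)
			return (PCITOOL_CTLR_TYPE_PCPLUSMP);
		if (strcmp(avgi_type, APIC_APIX_NAME) == 0)
			return (PCITOOL_CTLR_TYPE_APIX);
		return (PCITOOL_CTLR_TYPE_UNKNOWN);
	}
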
@@ -73,94 +74,32 @@ #include <sys/x_call.h> #include <sys/reboot.h> #include <sys/hpet.h> +#include <sys/apic_common.h> /* * Local Function Prototypes */ -static void apic_init_intr(); -static void apic_nmi_intr(caddr_t arg, struct regs *rp); -static processorid_t apic_find_cpu(int flag); +static void apic_init_intr(void); /* * standard MP entries */ -static int apic_probe(); -static int apic_clkinit(); +static int apic_probe(void); static int apic_getclkirq(int ipl); static uint_t apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj); -static hrtime_t apic_gettime(); -static hrtime_t apic_gethrtime(); -static void apic_init(); +static void apic_init(void); static void apic_picinit(void); -static int apic_cpu_start(processorid_t cpuid, caddr_t ctx); -static int apic_cpu_stop(processorid_t cpuid, caddr_t ctx); -static int apic_cpu_add(psm_cpu_request_t *reqp); -static int apic_cpu_remove(psm_cpu_request_t *reqp); -static int apic_cpu_ops(psm_cpu_request_t *reqp); static int apic_post_cpu_start(void); -static void apic_send_ipi(int cpun, int ipl); -static void apic_set_idlecpu(processorid_t cpun); -static void apic_unset_idlecpu(processorid_t cpun); static int apic_intr_enter(int ipl, int *vect); static void apic_setspl(int ipl); static void x2apic_setspl(int ipl); -static void apic_switch_ipi_callback(boolean_t enter); static int apic_addspl(int ipl, int vector, int min_ipl, int max_ipl); static int apic_delspl(int ipl, int vector, int min_ipl, int max_ipl); -static void apic_shutdown(int cmd, int fcn); -static void apic_preshutdown(int cmd, int fcn); static int apic_disable_intr(processorid_t cpun); static void apic_enable_intr(processorid_t cpun); -static processorid_t apic_get_next_processorid(processorid_t cpun); static int apic_get_ipivect(int ipl, int type); -static void apic_timer_reprogram(hrtime_t time); -static void apic_timer_enable(void); -static void apic_timer_disable(void); static void apic_post_cyclic_setup(void *arg); -static void apic_intrmap_init(int apic_mode); -static void apic_record_ioapic_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt); -static void apic_record_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs); - -static int apic_oneshot = 0; -int apic_oneshot_enable = 1; /* to allow disabling one-shot capability */ - -/* Now the ones for Dynamic Interrupt distribution */ -int apic_enable_dynamic_migration = 0; - -extern int apic_have_32bit_cr8; - -/* - * These variables are frequently accessed in apic_intr_enter(), - * apic_intr_exit and apic_setspl, so group them together - */ -volatile uint32_t *apicadr = NULL; /* virtual addr of local APIC */ -int apic_setspl_delay = 1; /* apic_setspl - delay enable */ -int apic_clkvect; - -/* vector at which error interrupts come in */ -int apic_errvect; -int apic_enable_error_intr = 1; -int apic_error_display_delay = 100; - -/* vector at which performance counter overflow interrupts come in */ -int apic_cpcovf_vect; -int apic_enable_cpcovf_intr = 1; - -/* maximum loop count when sending Start IPIs. 
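
[Editor's sketch, not part of the patch] The long run of deleted prototypes and globals here is not lost code: it reappears in the new apic_common.c later in this patch, with the sharing mediated by the new <sys/apic_common.h> include. The header itself is not shown in this section, but given the definitions in apic_common.c it must export declarations of roughly this shape:

	/* Assumed shape of <sys/apic_common.h>; names taken from apic_common.c. */
	extern volatile uint32_t *apicadr;	/* virtual addr of local APIC */
	extern int		apic_oneshot;
	extern int		apic_clkvect;
	extern int		apic_errvect;
	extern lock_t		apic_gethrtime_lock;
	extern apic_cpus_info_t	*apic_cpus;

	extern int		apic_clkinit(int);
	extern hrtime_t		apic_gethrtime(void);
	extern void		apic_send_ipi(int, int);
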
*/ -int apic_sipi_max_loop_count = 0x1000; - -/* vector at which CMCI interrupts come in */ -int apic_cmci_vect; -extern int cmi_enable_cmci; -extern void cmi_cmci_trap(void); - -static kmutex_t cmci_cpu_setup_lock; /* protects cmci_cpu_setup_registered */ -static int cmci_cpu_setup_registered; - -/* number of CPUs in power-on transition state */ -static int apic_poweron_cnt = 0; -static lock_t apic_mode_switch_lock; /* * The following vector assignments influence the value of ipltopri and @@ -212,20 +151,8 @@ uchar_t apic_ipls[APIC_AVAIL_VECTOR]; /* * Patchable global variables. */ -int apic_forceload = 0; - -int apic_coarse_hrtime = 1; /* 0 - use accurate slow gethrtime() */ - /* 1 - use gettime() for performance */ -int apic_flat_model = 0; /* 0 - clustered. 1 - flat */ int apic_enable_hwsoftint = 0; /* 0 - disable, 1 - enable */ int apic_enable_bind_log = 1; /* 1 - display interrupt binding log */ -int apic_panic_on_nmi = 0; -int apic_panic_on_apic_error = 0; - -int apic_verbose = 0; - -/* minimum number of timer ticks to program to */ -int apic_min_timer_ticks = 1; /* * Local static data @@ -273,6 +200,7 @@ static struct psm_ops apic_ops = { apic_cpu_ops, /* CPU control interface. */ }; +struct psm_ops *psmops = &apic_ops; static struct psm_info apic_psm_info = { PSM_INFO_VER01_7, /* version */ @@ -284,25 +212,6 @@ static struct psm_info apic_psm_info = { static void *apic_hdlp; -#ifdef DEBUG -int apic_debug = 0; -int apic_restrict_vector = 0; - -int apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE]; -int apic_debug_msgbufindex = 0; - -#endif /* DEBUG */ - -apic_cpus_info_t *apic_cpus; - -cpuset_t apic_cpumask; -uint_t apic_picinit_called; - -/* Flag to indicate that we need to shut down all processors */ -static uint_t apic_shutdown_processors; - -uint_t apic_nsec_per_intr = 0; - /* * apic_let_idle_redistribute can have the following values: * 0 - If clock decremented it from 1 to 0, clock has to call redistribute. @@ -310,93 +219,10 @@ uint_t apic_nsec_per_intr = 0; */ int apic_num_idle_redistributions = 0; static int apic_let_idle_redistribute = 0; -static uint_t apic_nticks = 0; -static uint_t apic_skipped_redistribute = 0; /* to gather intr data and redistribute */ static void apic_redistribute_compute(void); -static uint_t last_count_read = 0; -static lock_t apic_gethrtime_lock; -volatile int apic_hrtime_stamp = 0; -volatile hrtime_t apic_nsec_since_boot = 0; -static uint_t apic_hertz_count; - -uint64_t apic_ticks_per_SFnsecs; /* # of ticks in SF nsecs */ - -static hrtime_t apic_nsec_max; - -static hrtime_t apic_last_hrtime = 0; -int apic_hrtime_error = 0; -int apic_remote_hrterr = 0; -int apic_num_nmis = 0; -int apic_apic_error = 0; -int apic_num_apic_errors = 0; -int apic_num_cksum_errors = 0; - -int apic_error = 0; -static int apic_cmos_ssb_set = 0; - -/* use to make sure only one cpu handles the nmi */ -static lock_t apic_nmi_lock; -/* use to make sure only one cpu handles the error interrupt */ -static lock_t apic_error_lock; - -static struct { - uchar_t cntl; - uchar_t data; -} aspen_bmc[] = { - { CC_SMS_WR_START, 0x18 }, /* NetFn/LUN */ - { CC_SMS_WR_NEXT, 0x24 }, /* Cmd SET_WATCHDOG_TIMER */ - { CC_SMS_WR_NEXT, 0x84 }, /* DataByte 1: SMS/OS no log */ - { CC_SMS_WR_NEXT, 0x2 }, /* DataByte 2: Power Down */ - { CC_SMS_WR_NEXT, 0x0 }, /* DataByte 3: no pre-timeout */ - { CC_SMS_WR_NEXT, 0x0 }, /* DataByte 4: timer expir. 
*/ - { CC_SMS_WR_NEXT, 0xa }, /* DataByte 5: init countdown */ - { CC_SMS_WR_END, 0x0 }, /* DataByte 6: init countdown */ - - { CC_SMS_WR_START, 0x18 }, /* NetFn/LUN */ - { CC_SMS_WR_END, 0x22 } /* Cmd RESET_WATCHDOG_TIMER */ -}; - -static struct { - int port; - uchar_t data; -} sitka_bmc[] = { - { SMS_COMMAND_REGISTER, SMS_WRITE_START }, - { SMS_DATA_REGISTER, 0x18 }, /* NetFn/LUN */ - { SMS_DATA_REGISTER, 0x24 }, /* Cmd SET_WATCHDOG_TIMER */ - { SMS_DATA_REGISTER, 0x84 }, /* DataByte 1: SMS/OS no log */ - { SMS_DATA_REGISTER, 0x2 }, /* DataByte 2: Power Down */ - { SMS_DATA_REGISTER, 0x0 }, /* DataByte 3: no pre-timeout */ - { SMS_DATA_REGISTER, 0x0 }, /* DataByte 4: timer expir. */ - { SMS_DATA_REGISTER, 0xa }, /* DataByte 5: init countdown */ - { SMS_COMMAND_REGISTER, SMS_WRITE_END }, - { SMS_DATA_REGISTER, 0x0 }, /* DataByte 6: init countdown */ - - { SMS_COMMAND_REGISTER, SMS_WRITE_START }, - { SMS_DATA_REGISTER, 0x18 }, /* NetFn/LUN */ - { SMS_COMMAND_REGISTER, SMS_WRITE_END }, - { SMS_DATA_REGISTER, 0x22 } /* Cmd RESET_WATCHDOG_TIMER */ -}; - -/* Patchable global variables. */ -int apic_kmdb_on_nmi = 0; /* 0 - no, 1 - yes enter kmdb */ -uint32_t apic_divide_reg_init = 0; /* 0 - divide by 2 */ - -/* default apic ops without interrupt remapping */ -static apic_intrmap_ops_t apic_nointrmap_ops = { - (int (*)(int))return_instr, - (void (*)(int))return_instr, - (void (*)(apic_irq_t *))return_instr, - (void (*)(apic_irq_t *, void *))return_instr, - (void (*)(apic_irq_t *))return_instr, - apic_record_ioapic_rdt, - apic_record_msi, -}; - -apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops; - /* * This is the loadable module wrapper */ @@ -421,19 +247,37 @@ _info(struct modinfo *modinfop) return (psm_mod_info(&apic_hdlp, &apic_psm_info, modinfop)); } - static int -apic_probe() +apic_probe(void) { + /* check if apix is initialized */ + if (apix_enable && apix_loaded()) + return (PSM_FAILURE); + else + apix_enable = 0; /* continue using pcplusmp PSM */ + return (apic_probe_common(apic_psm_info.p_mach_idstring)); } +static uchar_t +apic_xlate_vector_by_irq(uchar_t irq) +{ + if (apic_irq_table[irq] == NULL) + return (0); + + return (apic_irq_table[irq]->airq_vector); +} + void -apic_init() +apic_init(void) { int i; int j = 1; + psm_get_ioapicid = apic_get_ioapicid; + psm_get_localapicid = apic_get_localapicid; + psm_xlate_vector_by_irq = apic_xlate_vector_by_irq; + apic_ipltopri[0] = APIC_VECTOR_PER_IPL; /* leave 0 for idle */ for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { if ((i < ((APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL) - 1)) && @@ -462,141 +306,8 @@ apic_init() #endif /* __amd64 */ } -/* - * handler for APIC Error interrupt. Just print a warning and continue - */ -static int -apic_error_intr() -{ - uint_t error0, error1, error; - uint_t i; - - /* - * We need to write before read as per 7.4.17 of system prog manual. - * We do both and or the results to be safe - */ - error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS); - apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); - error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS); - error = error0 | error1; - - /* - * Clear the APIC error status (do this on all cpus that enter here) - * (two writes are required due to the semantics of accessing the - * error status register.) 
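
[Editor's sketch, not part of the patch] apic_init() above now installs apic_xlate_vector_by_irq into the psm_xlate_vector_by_irq hook, giving outside consumers (the new mdb modules, for instance) an IRQ-to-vector translation without reaching into apic_irq_table directly. A hypothetical consumer would call through the hook like this:

	static uchar_t
	irq_to_vector(uchar_t irq)
	{
		if (psm_xlate_vector_by_irq == NULL)
			return (0);	/* PSM keeps no irq table */
		return ((*psm_xlate_vector_by_irq)(irq));
	}
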
- */ - apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); - apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); - - /* - * Prevent more than 1 CPU from handling error interrupt causing - * double printing (interleave of characters from multiple - * CPU's when using prom_printf) - */ - if (lock_try(&apic_error_lock) == 0) - return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); - if (error) { -#if DEBUG - if (apic_debug) - debug_enter("pcplusmp: APIC Error interrupt received"); -#endif /* DEBUG */ - if (apic_panic_on_apic_error) - cmn_err(CE_PANIC, - "APIC Error interrupt on CPU %d. Status = %x\n", - psm_get_cpu_id(), error); - else { - if ((error & ~APIC_CS_ERRORS) == 0) { - /* cksum error only */ - apic_error |= APIC_ERR_APIC_ERROR; - apic_apic_error |= error; - apic_num_apic_errors++; - apic_num_cksum_errors++; - } else { - /* - * prom_printf is the best shot we have of - * something which is problem free from - * high level/NMI type of interrupts - */ - prom_printf("APIC Error interrupt on CPU %d. " - "Status 0 = %x, Status 1 = %x\n", - psm_get_cpu_id(), error0, error1); - apic_error |= APIC_ERR_APIC_ERROR; - apic_apic_error |= error; - apic_num_apic_errors++; - for (i = 0; i < apic_error_display_delay; i++) { - tenmicrosec(); - } - /* - * provide more delay next time limited to - * roughly 1 clock tick time - */ - if (apic_error_display_delay < 500) - apic_error_display_delay *= 2; - } - } - lock_clear(&apic_error_lock); - return (DDI_INTR_CLAIMED); - } else { - lock_clear(&apic_error_lock); - return (DDI_INTR_UNCLAIMED); - } - /* NOTREACHED */ -} - -/* - * Turn off the mask bit in the performance counter Local Vector Table entry. - */ static void -apic_cpcovf_mask_clear(void) -{ - apic_reg_ops->apic_write(APIC_PCINT_VECT, - (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK)); -} - -/*ARGSUSED*/ -static int -apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) -{ - apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect); - return (0); -} - -/*ARGSUSED*/ -static int -apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) -{ - apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK); - return (0); -} - -/*ARGSUSED*/ -static int -cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg) -{ - cpuset_t cpu_set; - - CPUSET_ONLY(cpu_set, cpuid); - - switch (what) { - case CPU_ON: - xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set), - (xc_func_t)apic_cmci_enable); - break; - - case CPU_OFF: - xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set), - (xc_func_t)apic_cmci_disable); - break; - - default: - break; - } - - return (0); -} - -static void -apic_init_intr() +apic_init_intr(void) { processorid_t cpun = psm_get_cpu_id(); uint_t nlvt; @@ -748,27 +459,6 @@ apic_init_intr() } static void -apic_disable_local_apic() -{ - apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL); - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK); - - /* local intr reg 0 */ - apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK); - - /* disable NMI */ - apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK); - - /* and error interrupt */ - apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK); - - /* and perf counter intr */ - apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK); - - apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR); -} - -static void apic_picinit(void) { int i, j; @@ -834,184 +524,9 @@ apic_picinit(void) ioapic_init_intr(IOAPIC_MASK); } -static void -apic_cpu_send_SIPI(processorid_t cpun, boolean_t start) -{ - int loop_count; - uint32_t vector; - uint_t apicid; - ulong_t iflag; - - apicid = 
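
[Editor's sketch, not part of the patch] The CMCI plumbing deleted above moves to apic_common.c intact. The pattern worth keeping in mind: LVT registers are strictly per-CPU, so toggling the CMCI vector on another CPU requires a cross-call, exactly as the deleted cmci_cpu_setup() does on CPU_ON/CPU_OFF events:

	static void
	cmci_set_on_cpu(int cpuid, boolean_t enable)
	{
		cpuset_t cpu_set;

		CPUSET_ONLY(cpu_set, cpuid);
		/* Run the LVT write on the target CPU itself. */
		xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
		    enable ? (xc_func_t)apic_cmci_enable :
		    (xc_func_t)apic_cmci_disable);
	}
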
apic_cpus[cpun].aci_local_id; - - /* - * Interrupts on current CPU will be disabled during the - * steps in order to avoid unwanted side effects from - * executing interrupt handlers on a problematic BIOS. - */ - iflag = intr_clear(); - - if (start) { - outb(CMOS_ADDR, SSB); - outb(CMOS_DATA, BIOS_SHUTDOWN); - } - - /* - * According to X2APIC specification in section '2.3.5.1' of - * Interrupt Command Register Semantics, the semantics of - * programming the Interrupt Command Register to dispatch an interrupt - * is simplified. A single MSR write to the 64-bit ICR is required - * for dispatching an interrupt. Specifically, with the 64-bit MSR - * interface to ICR, system software is not required to check the - * status of the delivery status bit prior to writing to the ICR - * to send an IPI. With the removal of the Delivery Status bit, - * system software no longer has a reason to read the ICR. It remains - * readable only to aid in debugging. - */ #ifdef DEBUG - APIC_AV_PENDING_SET(); -#else - if (apic_mode == LOCAL_APIC) { - APIC_AV_PENDING_SET(); - } -#endif /* DEBUG */ - - /* for integrated - make sure there is one INIT IPI in buffer */ - /* for external - it will wake up the cpu */ - apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET); - - /* If only 1 CPU is installed, PENDING bit will not go low */ - for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) { - if (apic_mode == LOCAL_APIC && - apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING) - apic_ret(); - else - break; - } - - apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET); - drv_usecwait(20000); /* 20 milli sec */ - - if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) { - /* integrated apic */ - - vector = (rm_platter_pa >> MMU_PAGESHIFT) & - (APIC_VECTOR_MASK | APIC_IPL_MASK); - - /* to offset the INIT IPI queue up in the buffer */ - apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP); - drv_usecwait(200); /* 20 micro sec */ - - /* - * send the second SIPI (Startup IPI) as recommended by Intel - * software development manual. - */ - apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP); - drv_usecwait(200); /* 20 micro sec */ - } - - intr_restore(iflag); -} - -/*ARGSUSED1*/ -static int -apic_cpu_start(processorid_t cpun, caddr_t arg) -{ - ASSERT(MUTEX_HELD(&cpu_lock)); - - if (!apic_cpu_in_range(cpun)) { - return (EINVAL); - } - - /* - * Switch to apic_common_send_ipi for safety during starting other CPUs. - */ - if (apic_mode == LOCAL_X2APIC) { - apic_switch_ipi_callback(B_TRUE); - } - - apic_cmos_ssb_set = 1; - apic_cpu_send_SIPI(cpun, B_TRUE); - - return (0); -} - -/* - * Put CPU into halted state with interrupts disabled. - */ -/*ARGSUSED1*/ -static int -apic_cpu_stop(processorid_t cpun, caddr_t arg) -{ - int rc; - cpu_t *cp; - extern cpuset_t cpu_ready_set; - extern void cpu_idle_intercept_cpu(cpu_t *cp); - - ASSERT(MUTEX_HELD(&cpu_lock)); - - if (!apic_cpu_in_range(cpun)) { - return (EINVAL); - } - if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) { - return (ENOTSUP); - } - - cp = cpu_get(cpun); - ASSERT(cp != NULL); - ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0); - ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0); - ASSERT((cp->cpu_flags & CPU_ENABLE) == 0); - - /* Clear CPU_READY flag to disable cross calls. 
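
[Editor's sketch, not part of the patch] The deleted apic_cpu_send_SIPI() follows the classic INIT/SIPI/SIPI startup protocol; it now lives in common code. An outline of the integrated-APIC case, omitting the pending-bit polling and CMOS shutdown-byte setup for brevity:

	static void
	start_cpu_outline(uint_t apicid, uint32_t startup_vec)
	{
		ulong_t iflag = intr_clear();	/* no handlers mid-sequence */

		/* 1. INIT assert, deassert, then let the target settle. */
		apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);
		apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
		drv_usecwait(20000);		/* 20 ms */

		/* 2. Two STARTUP IPIs, per Intel's recommendation. */
		apic_reg_ops->apic_write_int_cmd(apicid, startup_vec | AV_STARTUP);
		drv_usecwait(200);
		apic_reg_ops->apic_write_int_cmd(apicid, startup_vec | AV_STARTUP);
		drv_usecwait(200);

		intr_restore(iflag);
	}

The startup vector encodes the real-mode platter page, as in the deleted code: (rm_platter_pa >> MMU_PAGESHIFT) masked to the vector bits.
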
*/ - cp->cpu_flags &= ~CPU_READY; - CPUSET_ATOMIC_DEL(cpu_ready_set, cpun); - rc = xc_flush_cpu(cp); - if (rc != 0) { - CPUSET_ATOMIC_ADD(cpu_ready_set, cpun); - cp->cpu_flags |= CPU_READY; - return (rc); - } - - /* Intercept target CPU at a safe point before powering it off. */ - cpu_idle_intercept_cpu(cp); - - apic_cpu_send_SIPI(cpun, B_FALSE); - cp->cpu_flags &= ~CPU_RUNNING; - - return (0); -} - -static int -apic_cpu_ops(psm_cpu_request_t *reqp) -{ - if (reqp == NULL) { - return (EINVAL); - } - - switch (reqp->pcr_cmd) { - case PSM_CPU_ADD: - return (apic_cpu_add(reqp)); - - case PSM_CPU_REMOVE: - return (apic_cpu_remove(reqp)); - - case PSM_CPU_STOP: - return (apic_cpu_stop(reqp->req.cpu_stop.cpuid, - reqp->req.cpu_stop.ctx)); - - default: - return (ENOTSUP); - } -} - -#ifdef DEBUG -int apic_break_on_cpu = 9; -int apic_stretch_interrupts = 0; -int apic_stretch_ISR = 1 << 3; /* IPL of 3 matches nothing now */ - void -apic_break() +apic_break(void) { } #endif /* DEBUG */ @@ -1250,225 +765,6 @@ x2apic_setspl(int ipl) apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1; } -/* - * generates an interprocessor interrupt to another CPU. Any changes made to - * this routine must be accompanied by similar changes to - * apic_common_send_ipi(). - */ -static void -apic_send_ipi(int cpun, int ipl) -{ - int vector; - ulong_t flag; - - vector = apic_resv_vector[ipl]; - - ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR)); - - flag = intr_clear(); - - APIC_AV_PENDING_SET(); - - apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id, - vector); - - intr_restore(flag); -} - - -/*ARGSUSED*/ -static void -apic_set_idlecpu(processorid_t cpun) -{ -} - -/*ARGSUSED*/ -static void -apic_unset_idlecpu(processorid_t cpun) -{ -} - - -void -apic_ret() -{ -} - -/* - * If apic_coarse_time == 1, then apic_gettime() is used instead of - * apic_gethrtime(). This is used for performance instead of accuracy. - */ - -static hrtime_t -apic_gettime() -{ - int old_hrtime_stamp; - hrtime_t temp; - - /* - * In one-shot mode, we do not keep time, so if anyone - * calls psm_gettime() directly, we vector over to - * gethrtime(). - * one-shot mode MUST NOT be enabled if this psm is the source of - * hrtime. - */ - - if (apic_oneshot) - return (gethrtime()); - - -gettime_again: - while ((old_hrtime_stamp = apic_hrtime_stamp) & 1) - apic_ret(); - - temp = apic_nsec_since_boot; - - if (apic_hrtime_stamp != old_hrtime_stamp) { /* got an interrupt */ - goto gettime_again; - } - return (temp); -} - -/* - * Here we return the number of nanoseconds since booting. Note every - * clock interrupt increments apic_nsec_since_boot by the appropriate - * amount. - */ -static hrtime_t -apic_gethrtime() -{ - int curr_timeval, countval, elapsed_ticks; - int old_hrtime_stamp, status; - hrtime_t temp; - uint32_t cpun; - ulong_t oflags; - - /* - * In one-shot mode, we do not keep time, so if anyone - * calls psm_gethrtime() directly, we vector over to - * gethrtime(). - * one-shot mode MUST NOT be enabled if this psm is the source of - * hrtime. - */ - - if (apic_oneshot) - return (gethrtime()); - - oflags = intr_clear(); /* prevent migration */ - - cpun = apic_reg_ops->apic_read(APIC_LID_REG); - if (apic_mode == LOCAL_APIC) - cpun >>= APIC_ID_BIT_OFFSET; - - lock_set(&apic_gethrtime_lock); - -gethrtime_again: - while ((old_hrtime_stamp = apic_hrtime_stamp) & 1) - apic_ret(); - - /* - * Check to see which CPU we are on. Note the time is kept on - * the local APIC of CPU 0. 
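
[Editor's sketch, not part of the patch] The tear-down ordering in the deleted apic_cpu_stop() (also moved to common code) matters: cross calls must be fenced off and drained before the target CPU is parked, and the rollback path restores CPU_READY if the drain fails:

	static int
	stop_cpu_outline(cpu_t *cp, processorid_t cpun)
	{
		int rc;

		cp->cpu_flags &= ~CPU_READY;		/* no new cross calls */
		CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);

		if ((rc = xc_flush_cpu(cp)) != 0) {	/* drain pending ones */
			CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
			cp->cpu_flags |= CPU_READY;	/* roll back */
			return (rc);
		}

		cpu_idle_intercept_cpu(cp);		/* park at a safe point */
		apic_cpu_send_SIPI(cpun, B_FALSE);	/* INIT, no startup */
		cp->cpu_flags &= ~CPU_RUNNING;
		return (0);
	}
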
If on CPU 0, simply read the current - * counter. If on another CPU, issue a remote read command to CPU 0. - */ - if (cpun == apic_cpus[0].aci_local_id) { - countval = apic_reg_ops->apic_read(APIC_CURR_COUNT); - } else { -#ifdef DEBUG - APIC_AV_PENDING_SET(); -#else - if (apic_mode == LOCAL_APIC) - APIC_AV_PENDING_SET(); -#endif /* DEBUG */ - - apic_reg_ops->apic_write_int_cmd( - apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE); - - while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1)) - & AV_READ_PENDING) { - apic_ret(); - } - - if (status & AV_REMOTE_STATUS) /* 1 = valid */ - countval = apic_reg_ops->apic_read(APIC_REMOTE_READ); - else { /* 0 = invalid */ - apic_remote_hrterr++; - /* - * return last hrtime right now, will need more - * testing if change to retry - */ - temp = apic_last_hrtime; - - lock_clear(&apic_gethrtime_lock); - - intr_restore(oflags); - - return (temp); - } - } - if (countval > last_count_read) - countval = 0; - else - last_count_read = countval; - - elapsed_ticks = apic_hertz_count - countval; - - curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks); - temp = apic_nsec_since_boot + curr_timeval; - - if (apic_hrtime_stamp != old_hrtime_stamp) { /* got an interrupt */ - /* we might have clobbered last_count_read. Restore it */ - last_count_read = apic_hertz_count; - goto gethrtime_again; - } - - if (temp < apic_last_hrtime) { - /* return last hrtime if error occurs */ - apic_hrtime_error++; - temp = apic_last_hrtime; - } - else - apic_last_hrtime = temp; - - lock_clear(&apic_gethrtime_lock); - intr_restore(oflags); - - return (temp); -} - -/* apic NMI handler */ -/*ARGSUSED*/ -static void -apic_nmi_intr(caddr_t arg, struct regs *rp) -{ - if (apic_shutdown_processors) { - apic_disable_local_apic(); - return; - } - - apic_error |= APIC_ERR_NMI; - - if (!lock_try(&apic_nmi_lock)) - return; - apic_num_nmis++; - - if (apic_kmdb_on_nmi && psm_debugger()) { - debug_enter("NMI received: entering kmdb\n"); - } else if (apic_panic_on_nmi) { - /* Keep panic from entering kmdb. */ - nopanicdebug = 1; - panic("NMI received\n"); - } else { - /* - * prom_printf is the best shot we have of something which is - * problem free from high level/NMI type of interrupts - */ - prom_printf("NMI received\n"); - } - - lock_clear(&apic_nmi_lock); -} - /*ARGSUSED*/ static int apic_addspl(int irqno, int ipl, int min_ipl, int max_ipl) @@ -1483,7 +779,7 @@ apic_delspl(int irqno, int ipl, int min_ipl, int max_ipl) } static int -apic_post_cpu_start() +apic_post_cpu_start(void) { int cpun; static int cpus_started = 1; @@ -1539,241 +835,6 @@ apic_post_cpu_start() return (PSM_SUCCESS); } -processorid_t -apic_get_next_processorid(processorid_t cpu_id) -{ - - int i; - - if (cpu_id == -1) - return ((processorid_t)0); - - for (i = cpu_id + 1; i < NCPU; i++) { - if (apic_cpu_in_range(i)) - return (i); - } - - return ((processorid_t)-1); -} - -static int -apic_cpu_add(psm_cpu_request_t *reqp) -{ - int i, rv = 0; - ulong_t iflag; - boolean_t first = B_TRUE; - uchar_t localver; - uint32_t localid, procid; - processorid_t cpuid = (processorid_t)-1; - mach_cpu_add_arg_t *ap; - - ASSERT(reqp != NULL); - reqp->req.cpu_add.cpuid = (processorid_t)-1; - - /* Check whether CPU hotplug is supported. 
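
[Editor's sketch, not part of the patch] The deleted NMI handler illustrates a constraint that survives the move to apic_common.c: at NMI level the only safe serialization primitive is lock_try(), which never blocks. The gate keeps multiple CPUs from interleaving their console output:

	static void
	nmi_gate_demo(void)
	{
		if (!lock_try(&apic_nmi_lock))
			return;		/* another CPU is already reporting */

		apic_num_nmis++;
		/* prom_printf is the safest output path at high PIL/NMI. */
		prom_printf("NMI received\n");

		lock_clear(&apic_nmi_lock);
	}
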
*/ - if (!plat_dr_support_cpu() || apic_max_nproc == -1) { - return (ENOTSUP); - } - - ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp; - switch (ap->type) { - case MACH_CPU_ARG_LOCAL_APIC: - localid = ap->arg.apic.apic_id; - procid = ap->arg.apic.proc_id; - if (localid >= 255 || procid > 255) { - cmn_err(CE_WARN, - "!apic: apicid(%u) or procid(%u) is invalid.", - localid, procid); - return (EINVAL); - } - break; - - case MACH_CPU_ARG_LOCAL_X2APIC: - localid = ap->arg.apic.apic_id; - procid = ap->arg.apic.proc_id; - if (localid >= UINT32_MAX) { - cmn_err(CE_WARN, - "!apic: x2apicid(%u) is invalid.", localid); - return (EINVAL); - } else if (localid >= 255 && apic_mode == LOCAL_APIC) { - cmn_err(CE_WARN, "!apic: system is in APIC mode, " - "can't support x2APIC processor."); - return (ENOTSUP); - } - break; - - default: - cmn_err(CE_WARN, - "!apic: unknown argument type %d to apic_cpu_add().", - ap->type); - return (EINVAL); - } - - /* Use apic_ioapic_lock to sync with apic_find_next_cpu_intr. */ - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - /* Check whether local APIC id already exists. */ - for (i = 0; i < apic_nproc; i++) { - if (!CPU_IN_SET(apic_cpumask, i)) - continue; - if (apic_cpus[i].aci_local_id == localid) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - cmn_err(CE_WARN, - "!apic: local apic id %u already exists.", - localid); - return (EEXIST); - } else if (apic_cpus[i].aci_processor_id == procid) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - cmn_err(CE_WARN, - "!apic: processor id %u already exists.", - (int)procid); - return (EEXIST); - } - - /* - * There's no local APIC version number available in MADT table, - * so assume that all CPUs are homogeneous and use local APIC - * version number of the first existing CPU. - */ - if (first) { - first = B_FALSE; - localver = apic_cpus[i].aci_local_ver; - } - } - ASSERT(first == B_FALSE); - - /* - * Try to assign the same cpuid if APIC id exists in the dirty cache. - */ - for (i = 0; i < apic_max_nproc; i++) { - if (CPU_IN_SET(apic_cpumask, i)) { - ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0); - continue; - } - ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE); - if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) && - apic_cpus[i].aci_local_id == localid && - apic_cpus[i].aci_processor_id == procid) { - cpuid = i; - break; - } - } - - /* Avoid the dirty cache and allocate fresh slot if possible. */ - if (cpuid == (processorid_t)-1) { - for (i = 0; i < apic_max_nproc; i++) { - if ((apic_cpus[i].aci_status & APIC_CPU_FREE) && - (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) { - cpuid = i; - break; - } - } - } - - /* Try to find any free slot as last resort. 
*/ - if (cpuid == (processorid_t)-1) { - for (i = 0; i < apic_max_nproc; i++) { - if (apic_cpus[i].aci_status & APIC_CPU_FREE) { - cpuid = i; - break; - } - } - } - - if (cpuid == (processorid_t)-1) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - cmn_err(CE_NOTE, - "!apic: failed to allocate cpu id for processor %u.", - procid); - rv = EAGAIN; - } else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - cmn_err(CE_NOTE, - "!apic: failed to build mapping for processor %u.", - procid); - rv = EBUSY; - } else { - ASSERT(cpuid >= 0 && cpuid < NCPU); - ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus); - bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0])); - apic_cpus[cpuid].aci_processor_id = procid; - apic_cpus[cpuid].aci_local_id = localid; - apic_cpus[cpuid].aci_local_ver = localver; - CPUSET_ATOMIC_ADD(apic_cpumask, cpuid); - if (cpuid >= apic_nproc) { - apic_nproc = cpuid + 1; - } - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - reqp->req.cpu_add.cpuid = cpuid; - } - - return (rv); -} - -static int -apic_cpu_remove(psm_cpu_request_t *reqp) -{ - int i; - ulong_t iflag; - processorid_t cpuid; - - /* Check whether CPU hotplug is supported. */ - if (!plat_dr_support_cpu() || apic_max_nproc == -1) { - return (ENOTSUP); - } - - cpuid = reqp->req.cpu_remove.cpuid; - - /* Use apic_ioapic_lock to sync with apic_find_next_cpu_intr. */ - iflag = intr_clear(); - lock_set(&apic_ioapic_lock); - - if (!apic_cpu_in_range(cpuid)) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - cmn_err(CE_WARN, - "!apic: cpuid %d doesn't exist in apic_cpus array.", - cpuid); - return (ENODEV); - } - ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0); - - if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) { - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - return (ENOENT); - } - - if (cpuid == apic_nproc - 1) { - /* - * We are removing the highest numbered cpuid so we need to - * find the next highest cpuid as the new value for apic_nproc. - */ - for (i = apic_nproc; i > 0; i--) { - if (CPU_IN_SET(apic_cpumask, i - 1)) { - apic_nproc = i; - break; - } - } - /* at least one CPU left */ - ASSERT(i > 0); - } - CPUSET_ATOMIC_DEL(apic_cpumask, cpuid); - /* mark slot as free and keep it in the dirty cache */ - apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY; - - lock_clear(&apic_ioapic_lock); - intr_restore(iflag); - - return (0); -} - /* * type == -1 indicates it is an internal request. Do not change * resv_vector for these requests @@ -1815,359 +876,6 @@ apic_getclkirq(int ipl) return (irq); } - -/* - * Return the number of APIC clock ticks elapsed for 8245 to decrement - * (APIC_TIME_COUNT + pit_ticks_adj) ticks. - */ -static uint_t -apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) -{ - uint8_t pit_tick_lo; - uint16_t pit_tick, target_pit_tick; - uint32_t start_apic_tick, end_apic_tick; - ulong_t iflag; - uint32_t reg; - - reg = addr + APIC_CURR_COUNT - apicadr; - - iflag = intr_clear(); - - do { - pit_tick_lo = inb(PITCTR0_PORT); - pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; - } while (pit_tick < APIC_TIME_MIN || - pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX); - - /* - * Wait for the 8254 to decrement by 5 ticks to ensure - * we didn't start in the middle of a tick. - * Compare with 0x10 for the wrap around case. 
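
[Editor's sketch, not part of the patch] The slot search deleted from apic_cpu_add() is a three-pass preference order: a dirty slot whose cached ids match (so a re-added CPU regains its old cpuid), then a never-used free slot, then any free slot. A self-contained model:

	#include <stdio.h>

	#define	NSLOT	8
	#define	FREE	0x1
	#define	DIRTY	0x2

	static int
	pick_slot(const int *status, const int *cached_id, int nslot, int id)
	{
		int i;

		for (i = 0; i < nslot; i++)	/* 1: matching dirty slot */
			if ((status[i] & (FREE | DIRTY)) == (FREE | DIRTY) &&
			    cached_id[i] == id)
				return (i);
		for (i = 0; i < nslot; i++)	/* 2: clean free slot */
			if ((status[i] & FREE) && !(status[i] & DIRTY))
				return (i);
		for (i = 0; i < nslot; i++)	/* 3: any free slot */
			if (status[i] & FREE)
				return (i);
		return (-1);
	}

	int
	main(void)
	{
		int status[NSLOT] = { 0, FREE | DIRTY, FREE, 0, 0, 0, 0, 0 };
		int cached[NSLOT] = { 10, 11, -1, 13, -1, -1, -1, -1 };

		/* Re-adding apic id 11 lands back in its old slot 1. */
		printf("id 11 -> slot %d\n", pick_slot(status, cached, NSLOT, 11));
		/* A new id prefers the clean free slot 2. */
		printf("id 42 -> slot %d\n", pick_slot(status, cached, NSLOT, 42));
		return (0);
	}
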
- */ - target_pit_tick = pit_tick - 5; - do { - pit_tick_lo = inb(PITCTR0_PORT); - pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; - } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - - start_apic_tick = apic_reg_ops->apic_read(reg); - - /* - * Wait for the 8254 to decrement by - * (APIC_TIME_COUNT + pit_ticks_adj) ticks - */ - target_pit_tick = pit_tick - APIC_TIME_COUNT; - do { - pit_tick_lo = inb(PITCTR0_PORT); - pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; - } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - - end_apic_tick = apic_reg_ops->apic_read(reg); - - *pit_ticks_adj = target_pit_tick - pit_tick; - - intr_restore(iflag); - - return (start_apic_tick - end_apic_tick); -} - -/* - * Initialise the APIC timer on the local APIC of CPU 0 to the desired - * frequency. Note at this stage in the boot sequence, the boot processor - * is the only active processor. - * hertz value of 0 indicates a one-shot mode request. In this case - * the function returns the resolution (in nanoseconds) for the hardware - * timer interrupt. If one-shot mode capability is not available, - * the return value will be 0. apic_enable_oneshot is a global switch - * for disabling the functionality. - * A non-zero positive value for hertz indicates a periodic mode request. - * In this case the hardware will be programmed to generate clock interrupts - * at hertz frequency and returns the resolution of interrupts in - * nanosecond. - */ - -static int -apic_clkinit(int hertz) -{ - uint_t apic_ticks = 0; - uint_t pit_ticks; - int ret; - uint16_t pit_ticks_adj; - static int firsttime = 1; - - if (firsttime) { - /* first time calibrate on CPU0 only */ - - apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); - - /* total number of PIT ticks corresponding to apic_ticks */ - pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; - - /* - * Determine the number of nanoseconds per APIC clock tick - * and then determine how many APIC ticks to interrupt at the - * desired frequency - * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s - * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s - * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) - * pic_ticks_per_SFns = - * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) - */ - apic_ticks_per_SFnsecs = - ((SF * apic_ticks * PIT_HZ) / - ((uint64_t)pit_ticks * NANOSEC)); - - /* the interval timer initial count is 32 bit max */ - apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); - firsttime = 0; - } - - if (hertz != 0) { - /* periodic */ - apic_nsec_per_intr = NANOSEC / hertz; - apic_hertz_count = APIC_NSECS_TO_TICKS(apic_nsec_per_intr); - } - - apic_int_busy_mark = (apic_int_busy_mark * - apic_sample_factor_redistribution) / 100; - apic_int_free_mark = (apic_int_free_mark * - apic_sample_factor_redistribution) / 100; - apic_diff_for_redistribution = (apic_diff_for_redistribution * - apic_sample_factor_redistribution) / 100; - - if (hertz == 0) { - /* requested one_shot */ - if (!tsc_gethrtime_enable || !apic_oneshot_enable) - return (0); - apic_oneshot = 1; - ret = (int)APIC_TICKS_TO_NSECS(1); - } else { - /* program the local APIC to interrupt at the given frequency */ - apic_reg_ops->apic_write(APIC_INIT_COUNT, apic_hertz_count); - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_TIME); - apic_oneshot = 0; - ret = NANOSEC / hertz; - } - - return (ret); - -} - -/* - * apic_preshutdown: - * 
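
[Editor's sketch, not part of the patch] The calibration arithmetic deleted from apic_clkinit() uses the fixed-rate 8254 PIT as the reference clock: count APIC timer ticks while the PIT counts a known number of its own. Self-contained model of the formula from the deleted comment; SF's real value comes from <sys/apic.h>, so the 2^20 here is only an assumption to make the demo print plausible numbers:

	#include <stdio.h>
	#include <stdint.h>

	#define	PIT_HZ	1193182ULL	/* 8254 input clock, Hz */
	#define	NANOSEC	1000000000ULL
	#define	SF	(1ULL << 20)	/* hypothetical scale factor */

	int
	main(void)
	{
		uint64_t apic_ticks = 1665000;	/* APIC ticks observed ...   */
		uint64_t pit_ticks = 16384;	/* ... while PIT counted these */

		/* ticks_per_SFns = (SF * apic_ticks * PIT_HZ) / (pit * 1e9) */
		uint64_t per_sfns = (SF * apic_ticks * PIT_HZ) /
		    (pit_ticks * NANOSEC);

		printf("APIC ticks per SF nanoseconds: %llu\n",
		    (unsigned long long)per_sfns);
		printf("implied APIC timer rate: %.1f MHz\n",
		    (double)apic_ticks * PIT_HZ / pit_ticks / 1e6);
		return (0);
	}
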
Called early in shutdown whilst we can still access filesystems to do - * things like loading modules which will be required to complete shutdown - * after filesystems are all unmounted. - */ -static void -apic_preshutdown(int cmd, int fcn) -{ - APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n", - cmd, fcn, apic_poweroff_method, apic_enable_acpi)); - - if ((cmd != A_SHUTDOWN) || (fcn != AD_POWEROFF)) { - return; - } -} - -static void -apic_shutdown(int cmd, int fcn) -{ - int restarts, attempts; - int i; - uchar_t byte; - ulong_t iflag; - - hpet_acpi_fini(); - - /* Send NMI to all CPUs except self to do per processor shutdown */ - iflag = intr_clear(); -#ifdef DEBUG - APIC_AV_PENDING_SET(); -#else - if (apic_mode == LOCAL_APIC) - APIC_AV_PENDING_SET(); -#endif /* DEBUG */ - apic_shutdown_processors = 1; - apic_reg_ops->apic_write(APIC_INT_CMD1, - AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF); - - /* restore cmos shutdown byte before reboot */ - if (apic_cmos_ssb_set) { - outb(CMOS_ADDR, SSB); - outb(CMOS_DATA, 0); - } - - ioapic_disable_redirection(); - - /* disable apic mode if imcr present */ - if (apic_imcrp) { - outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT); - outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC); - } - - apic_disable_local_apic(); - - intr_restore(iflag); - - /* remainder of function is for shutdown cases only */ - if (cmd != A_SHUTDOWN) - return; - - /* - * Switch system back into Legacy-Mode if using ACPI and - * not powering-off. Some BIOSes need to remain in ACPI-mode - * for power-off to succeed (Dell Dimension 4600) - * Do not disable ACPI while doing fastreboot - */ - if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT) - (void) AcpiDisable(); - - if (fcn == AD_FASTREBOOT) { - apic_reg_ops->apic_write(APIC_INT_CMD1, - AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF); - } - - /* remainder of function is for shutdown+poweroff case only */ - if (fcn != AD_POWEROFF) - return; - - switch (apic_poweroff_method) { - case APIC_POWEROFF_VIA_RTC: - - /* select the extended NVRAM bank in the RTC */ - outb(CMOS_ADDR, RTC_REGA); - byte = inb(CMOS_DATA); - outb(CMOS_DATA, (byte | EXT_BANK)); - - outb(CMOS_ADDR, PFR_REG); - - /* for Predator must toggle the PAB bit */ - byte = inb(CMOS_DATA); - - /* - * clear power active bar, wakeup alarm and - * kickstart - */ - byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG); - outb(CMOS_DATA, byte); - - /* delay before next write */ - drv_usecwait(1000); - - /* for S40 the following would suffice */ - byte = inb(CMOS_DATA); - - /* power active bar control bit */ - byte |= PAB_CBIT; - outb(CMOS_DATA, byte); - - break; - - case APIC_POWEROFF_VIA_ASPEN_BMC: - restarts = 0; -restart_aspen_bmc: - if (++restarts == 3) - break; - attempts = 0; - do { - byte = inb(MISMIC_FLAG_REGISTER); - byte &= MISMIC_BUSY_MASK; - if (byte != 0) { - drv_usecwait(1000); - if (attempts >= 3) - goto restart_aspen_bmc; - ++attempts; - } - } while (byte != 0); - outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS); - byte = inb(MISMIC_FLAG_REGISTER); - byte |= 0x1; - outb(MISMIC_FLAG_REGISTER, byte); - i = 0; - for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0])); - i++) { - attempts = 0; - do { - byte = inb(MISMIC_FLAG_REGISTER); - byte &= MISMIC_BUSY_MASK; - if (byte != 0) { - drv_usecwait(1000); - if (attempts >= 3) - goto restart_aspen_bmc; - ++attempts; - } - } while (byte != 0); - outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl); - outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data); - byte = inb(MISMIC_FLAG_REGISTER); - byte |= 0x1; - outb(MISMIC_FLAG_REGISTER, byte); - } - break; 
- - case APIC_POWEROFF_VIA_SITKA_BMC: - restarts = 0; -restart_sitka_bmc: - if (++restarts == 3) - break; - attempts = 0; - do { - byte = inb(SMS_STATUS_REGISTER); - byte &= SMS_STATE_MASK; - if ((byte == SMS_READ_STATE) || - (byte == SMS_WRITE_STATE)) { - drv_usecwait(1000); - if (attempts >= 3) - goto restart_sitka_bmc; - ++attempts; - } - } while ((byte == SMS_READ_STATE) || - (byte == SMS_WRITE_STATE)); - outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS); - i = 0; - for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0])); - i++) { - attempts = 0; - do { - byte = inb(SMS_STATUS_REGISTER); - byte &= SMS_IBF_MASK; - if (byte != 0) { - drv_usecwait(1000); - if (attempts >= 3) - goto restart_sitka_bmc; - ++attempts; - } - } while (byte != 0); - outb(sitka_bmc[i].port, sitka_bmc[i].data); - } - break; - - case APIC_POWEROFF_NONE: - - /* If no APIC direct method, we will try using ACPI */ - if (apic_enable_acpi) { - if (acpi_poweroff() == 1) - return; - } else - return; - - break; - } - /* - * Wait a limited time here for power to go off. - * If the power does not go off, then there was a - * problem and we should continue to the halt which - * prints a message for the user to press a key to - * reboot. - */ - drv_usecwait(7000000); /* wait seven seconds */ - -} - /* * Try and disable all interrupts. We just assign interrupts to other * processors based on policy. If any were bound by user request, we @@ -2267,144 +975,13 @@ apic_enable_intr(processorid_t cpun) } } + if (apic_cpus[cpun].aci_status & APIC_CPU_SUSPEND) + apic_cpus[cpun].aci_status &= ~APIC_CPU_SUSPEND; + lock_clear(&apic_ioapic_lock); intr_restore(iflag); } - -/* - * This function will reprogram the timer. - * - * When in oneshot mode the argument is the absolute time in future to - * generate the interrupt at. - * - * When in periodic mode, the argument is the interval at which the - * interrupts should be generated. There is no need to support the periodic - * mode timer change at this time. - */ -static void -apic_timer_reprogram(hrtime_t time) -{ - hrtime_t now; - uint_t ticks; - int64_t delta; - - /* - * We should be called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - - if (!apic_oneshot) { - /* time is the interval for periodic mode */ - ticks = APIC_NSECS_TO_TICKS(time); - } else { - /* one shot mode */ - - now = gethrtime(); - delta = time - now; - - if (delta <= 0) { - /* - * requested to generate an interrupt in the past - * generate an interrupt as soon as possible - */ - ticks = apic_min_timer_ticks; - } else if (delta > apic_nsec_max) { - /* - * requested to generate an interrupt at a time - * further than what we are capable of. Set to max - * the hardware can handle - */ - - ticks = APIC_MAXVAL; -#ifdef DEBUG - cmn_err(CE_CONT, "apic_timer_reprogram, request at" - " %lld too far in future, current time" - " %lld \n", time, now); -#endif - } else - ticks = APIC_NSECS_TO_TICKS(delta); - } - - if (ticks < apic_min_timer_ticks) - ticks = apic_min_timer_ticks; - - apic_reg_ops->apic_write(APIC_INIT_COUNT, ticks); -} - -/* - * This function will enable timer interrupts. - */ -static void -apic_timer_enable(void) -{ - /* - * We should be Called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - - if (!apic_oneshot) { - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_TIME); - } else { - /* one shot */ - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT)); - } -} - -/* - * This function will disable timer interrupts. 
- */ -static void -apic_timer_disable(void) -{ - /* - * We should be Called from high PIL context (CBE_HIGH_PIL), - * so kpreempt is disabled. - */ - apic_reg_ops->apic_write(APIC_LOCAL_TIMER, - (apic_clkvect + APIC_BASE_VECT) | AV_MASK); -} - -/* - * Set timer far into the future and return timer - * current Count in nanoseconds. - */ -hrtime_t -apic_timer_stop_count(void) -{ - hrtime_t ns_val; - int enable_val, count_val; - - /* - * Should be called with interrupts disabled. - */ - ASSERT(!interrupts_enabled()); - - enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER); - if ((enable_val & AV_MASK) == AV_MASK) - return ((hrtime_t)-1); /* timer is disabled */ - - count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT); - ns_val = APIC_TICKS_TO_NSECS(count_val); - - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - - return (ns_val); -} - -/* - * Reprogram timer after Deep C-State. - */ -void -apic_timer_restart(hrtime_t time) -{ - apic_timer_reprogram(time); -} - -ddi_periodic_t apic_periodic_id; - /* * If this module needs a periodic handler for the interrupt distribution, it * can be added here. The argument to the periodic handler is not currently @@ -2500,37 +1077,6 @@ apic_redistribute_compute(void) */ /* - * map an apic for memory-mapped access - */ -uint32_t * -mapin_apic(uint32_t addr, size_t len, int flags) -{ - /*LINTED: pointer cast may result in improper alignment */ - return ((uint32_t *)psm_map_phys(addr, len, flags)); -} - -uint32_t * -mapin_ioapic(uint32_t addr, size_t len, int flags) -{ - return (mapin_apic(addr, len, flags)); -} - -/* - * unmap an apic - */ -void -mapout_apic(caddr_t addr, size_t len) -{ - psm_unmap_phys(addr, len); -} - -void -mapout_ioapic(caddr_t addr, size_t len) -{ - mapout_apic(addr, len); -} - -/* * Check to make sure there are enough irq slots */ int @@ -2763,71 +1309,6 @@ apic_free_vector(uchar_t vector) apic_vector_to_irq[vector] = APIC_RESV_IRQ; } -uint32_t -ioapic_read(int ioapic_ix, uint32_t reg) -{ - volatile uint32_t *ioapic; - - ioapic = apicioadr[ioapic_ix]; - ioapic[APIC_IO_REG] = reg; - return (ioapic[APIC_IO_DATA]); -} - -void -ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value) -{ - volatile uint32_t *ioapic; - - ioapic = apicioadr[ioapic_ix]; - ioapic[APIC_IO_REG] = reg; - ioapic[APIC_IO_DATA] = value; -} - -void -ioapic_write_eoi(int ioapic_ix, uint32_t value) -{ - volatile uint32_t *ioapic; - - ioapic = apicioadr[ioapic_ix]; - ioapic[APIC_IO_EOI] = value; -} - -/* - * Round-robin algorithm to find the next CPU with interrupts enabled. - * It can't share the same static variable apic_next_bind_cpu with - * apic_get_next_bind_cpu(), since that will cause all interrupts to be - * bound to CPU1 at boot time. During boot, only CPU0 is online with - * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu() - * are called. However, the pcplusmp driver assumes that there will be - * boot_ncpus CPUs configured eventually so it tries to distribute all - * interrupts among CPU0 - CPU[boot_ncpus - 1]. Thus to prevent all - * interrupts being targetted at CPU1, we need to use a dedicated static - * variable for find_next_cpu() instead of sharing apic_next_bind_cpu. 
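
[Editor's sketch, not part of the patch] apic_timer_stop_count(), deleted above and moved to common code, is the deep C-state hand-off: capture the remaining time, then park the timer at its maximum count so it cannot fire while the CPU is down. The essence:

	static hrtime_t
	timer_park_demo(void)
	{
		int lvt, count;

		ASSERT(!interrupts_enabled());

		lvt = apic_reg_ops->apic_read(APIC_LOCAL_TIMER);
		if ((lvt & AV_MASK) == AV_MASK)
			return ((hrtime_t)-1);	/* timer already disabled */

		count = apic_reg_ops->apic_read(APIC_CURR_COUNT);
		/* Park far in the future; apic_timer_restart() reprograms. */
		apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

		return (APIC_TICKS_TO_NSECS(count));
	}
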
- */ - -static processorid_t -apic_find_cpu(int flag) -{ - int i; - static processorid_t acid = 0; - - ASSERT(LOCK_HELD(&apic_ioapic_lock)); - - /* Find the first CPU with the passed-in flag set */ - for (i = 0; i < apic_nproc; i++) { - if (++acid >= apic_nproc) { - acid = 0; - } - if (apic_cpu_in_range(acid) && - (apic_cpus[acid].aci_status & flag)) { - break; - } - } - - ASSERT((apic_cpus[acid].aci_status & flag) != 0); - return (acid); -} - /* * Call rebind to do the actual programming. * Must be called with interrupts disabled and apic_ioapic_lock held @@ -2837,7 +1318,7 @@ apic_find_cpu(int flag) * p is of the type 'apic_irq_t *'. * * apic_ioapic_lock must be held across this call, as it protects apic_rebind - * and it protects apic_find_next_cpu_intr() from a race in which a CPU can be + * and it protects apic_get_next_bind_cpu() from a race in which a CPU can be * taken offline after a cpu is selected, but before apic_rebind is called to * bind interrupts to it. */ @@ -2879,52 +1360,13 @@ apic_modify_vector(uchar_t vector, int irq) } char * -apic_get_apic_type() +apic_get_apic_type(void) { return (apic_psm_info.p_mach_idstring); } -/* - * Switch between safe and x2APIC IPI sending method. - * CPU may power on in xapic mode or x2apic mode. If CPU needs to send IPI to - * other CPUs before entering x2APIC mode, it still needs to xAPIC method. - * Before sending StartIPI to target CPU, psm_send_ipi will be changed to - * apic_common_send_ipi, which detects current local APIC mode and use right - * method to send IPI. If some CPUs fail to start up, apic_poweron_cnt - * won't return to zero, so apic_common_send_ipi will always be used. - * psm_send_ipi can't be simply changed back to x2apic_send_ipi if some CPUs - * failed to start up because those failed CPUs may recover itself later at - * unpredictable time. - */ -static void -apic_switch_ipi_callback(boolean_t enter) -{ - ulong_t iflag; - struct psm_ops *pops = &apic_ops; - - iflag = intr_clear(); - lock_set(&apic_mode_switch_lock); - if (enter) { - ASSERT(apic_poweron_cnt >= 0); - if (apic_poweron_cnt == 0) { - pops->psm_send_ipi = apic_common_send_ipi; - send_dirintf = pops->psm_send_ipi; - } - apic_poweron_cnt++; - } else { - ASSERT(apic_poweron_cnt > 0); - apic_poweron_cnt--; - if (apic_poweron_cnt == 0) { - pops->psm_send_ipi = x2apic_send_ipi; - send_dirintf = pops->psm_send_ipi; - } - } - lock_clear(&apic_mode_switch_lock); - intr_restore(iflag); -} - void -x2apic_update_psm() +x2apic_update_psm(void) { struct psm_ops *pops = &apic_ops; @@ -2932,67 +1374,10 @@ x2apic_update_psm() pops->psm_intr_exit = x2apic_intr_exit; pops->psm_setspl = x2apic_setspl; + pops->psm_send_ipi = x2apic_send_ipi; send_dirintf = pops->psm_send_ipi; apic_mode = LOCAL_X2APIC; apic_change_ops(); } - -static void -apic_intrmap_init(int apic_mode) -{ - int suppress_brdcst_eoi = 0; - - if (psm_vt_ops != NULL) { - /* - * Since X2APIC requires the use of interrupt remapping - * (though this is not documented explicitly in the Intel - * documentation (yet)), initialize interrupt remapping - * support before initializing the X2APIC unit. - */ - if (((apic_intrmap_ops_t *)psm_vt_ops)-> - apic_intrmap_init(apic_mode) == DDI_SUCCESS) { - - apic_vt_ops = psm_vt_ops; - - /* - * We leverage the interrupt remapping engine to - * suppress broadcast EOI; thus we must send the - * directed EOI with the directed-EOI handler. 
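
[Editor's sketch, not part of the patch] The ioapic_read()/ioapic_write() accessors deleted above use the IOAPIC's indirect protocol: write a register index into the select window, then touch the data window. Because the select+data pair is not atomic, callers hold apic_ioapic_lock around sequences like the masked write below; the APIC_RDT_CMD + 2*intin indexing is the usual redirection-entry layout assumed from <sys/apic.h>:

	static void
	ioapic_mask_rdt_demo(volatile uint32_t *ioapic, int intin)
	{
		uint32_t reg = APIC_RDT_CMD + 2 * intin;	/* low RDT word */
		uint32_t rdt;

		ioapic[APIC_IO_REG] = reg;	/* select */
		rdt = ioapic[APIC_IO_DATA];	/* read */

		ioapic[APIC_IO_REG] = reg;	/* reselect (defensive) */
		ioapic[APIC_IO_DATA] = rdt | AV_MASK;
	}
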
- */ - if (apic_directed_EOI_supported() == 0) { - suppress_brdcst_eoi = 1; - } - - apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi); - - if (apic_detect_x2apic()) { - apic_enable_x2apic(); - } - - if (apic_directed_EOI_supported() == 0) { - apic_set_directed_EOI_handler(); - } - } - } -} - -/*ARGSUSED*/ -static void -apic_record_ioapic_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt) -{ - irdt->ir_hi <<= APIC_ID_BIT_OFFSET; -} - -/*ARGSUSED*/ -static void -apic_record_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs) -{ - mregs->mr_addr = MSI_ADDR_HDR | - (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | - (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) | - (mregs->mr_addr << MSI_ADDR_DEST_SHIFT); - mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | - mregs->mr_data; -} diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c new file mode 100644 index 0000000000..79d24ed110 --- /dev/null +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c @@ -0,0 +1,1924 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * PSMI 1.1 extensions are supported only in 2.6 and later versions. + * PSMI 1.2 extensions are supported only in 2.7 and later versions. + * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. + * PSMI 1.5 extensions are supported in Solaris Nevada. + * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. + */ +#define PSMI_1_7 + +#include <sys/processor.h> +#include <sys/time.h> +#include <sys/psm.h> +#include <sys/smp_impldefs.h> +#include <sys/cram.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/psm_common.h> +#include <sys/apic.h> +#include <sys/pit.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/pci.h> +#include <sys/promif.h> +#include <sys/x86_archext.h> +#include <sys/cpc_impl.h> +#include <sys/uadmin.h> +#include <sys/panic.h> +#include <sys/debug.h> +#include <sys/archsystm.h> +#include <sys/trap.h> +#include <sys/machsystm.h> +#include <sys/sysmacros.h> +#include <sys/cpuvar.h> +#include <sys/rm_platter.h> +#include <sys/privregs.h> +#include <sys/note.h> +#include <sys/pci_intr_lib.h> +#include <sys/spl.h> +#include <sys/clock.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/x_call.h> +#include <sys/reboot.h> +#include <sys/hpet.h> +#include <sys/apic_common.h> + +static void apic_record_ioapic_rdt(void *intrmap_private, + ioapic_rdt_t *irdt); +static void apic_record_msi(void *intrmap_private, msi_regs_t *mregs); + +/* + * Common routines between pcplusmp & apix (taken from apic.c). 
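
[Editor's sketch, not part of the patch] apic_record_msi(), deleted from apic.c here (a reworked version appears in apic_common.c below), shows the default no-IOMMU MSI composition: on entry mr_addr holds the bare destination APIC id and mr_data the bare vector, and the routine ORs in the fixed header and mode bits:

	static void
	msi_compose_demo(msi_regs_t *mregs)
	{
		mregs->mr_addr = MSI_ADDR_HDR |			/* 0xFEExxxxx base */
		    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |	/* no redirection */
		    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
		    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);	/* dest APIC id */
		mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
		    mregs->mr_data;				/* vector bits */
	}

An interrupt-remapping implementation overrides this via apic_vt_ops; the new apic_nointrmap_ops table above is the fallback.
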
+ */ + +int apic_clkinit(int); +hrtime_t apic_gethrtime(void); +void apic_send_ipi(int, int); +void apic_set_idlecpu(processorid_t); +void apic_unset_idlecpu(processorid_t); +void apic_shutdown(int, int); +void apic_preshutdown(int, int); +processorid_t apic_get_next_processorid(processorid_t); +void apic_timer_reprogram(hrtime_t); +void apic_timer_enable(void); +void apic_timer_disable(void); + +hrtime_t apic_gettime(); + +enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP; + +int apic_oneshot = 0; +int apic_oneshot_enable = 1; /* to allow disabling one-shot capability */ + +/* Now the ones for Dynamic Interrupt distribution */ +int apic_enable_dynamic_migration = 0; + +/* maximum loop count when sending Start IPIs. */ +int apic_sipi_max_loop_count = 0x1000; + +/* + * These variables are frequently accessed in apic_intr_enter(), + * apic_intr_exit and apic_setspl, so group them together + */ +volatile uint32_t *apicadr = NULL; /* virtual addr of local APIC */ +int apic_setspl_delay = 1; /* apic_setspl - delay enable */ +int apic_clkvect; + +/* vector at which error interrupts come in */ +int apic_errvect; +int apic_enable_error_intr = 1; +int apic_error_display_delay = 100; + +/* vector at which performance counter overflow interrupts come in */ +int apic_cpcovf_vect; +int apic_enable_cpcovf_intr = 1; + +/* vector at which CMCI interrupts come in */ +int apic_cmci_vect; +extern int cmi_enable_cmci; +extern void cmi_cmci_trap(void); + +kmutex_t cmci_cpu_setup_lock; /* protects cmci_cpu_setup_registered */ +int cmci_cpu_setup_registered; + +/* number of CPUs in power-on transition state */ +static int apic_poweron_cnt = 0; +lock_t apic_mode_switch_lock; + +/* + * Patchable global variables. + */ +int apic_forceload = 0; + +int apic_coarse_hrtime = 1; /* 0 - use accurate slow gethrtime() */ + +int apic_flat_model = 0; /* 0 - clustered. 1 - flat */ +int apic_panic_on_nmi = 0; +int apic_panic_on_apic_error = 0; + +int apic_verbose = 0; /* 0x1ff */ + +/* minimum number of timer ticks to program to */ +int apic_min_timer_ticks = 1; + +#ifdef DEBUG +int apic_debug = 0; +int apic_restrict_vector = 0; + +int apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE]; +int apic_debug_msgbufindex = 0; + +#endif /* DEBUG */ + +uint_t apic_nsec_per_intr = 0; + +uint_t apic_nticks = 0; +uint_t apic_skipped_redistribute = 0; + +uint_t last_count_read = 0; +lock_t apic_gethrtime_lock; +volatile int apic_hrtime_stamp = 0; +volatile hrtime_t apic_nsec_since_boot = 0; +uint_t apic_hertz_count; + +uint64_t apic_ticks_per_SFnsecs; /* # of ticks in SF nsecs */ + +static hrtime_t apic_nsec_max; + +static hrtime_t apic_last_hrtime = 0; +int apic_hrtime_error = 0; +int apic_remote_hrterr = 0; +int apic_num_nmis = 0; +int apic_apic_error = 0; +int apic_num_apic_errors = 0; +int apic_num_cksum_errors = 0; + +int apic_error = 0; + +static int apic_cmos_ssb_set = 0; + +/* use to make sure only one cpu handles the nmi */ +lock_t apic_nmi_lock; +/* use to make sure only one cpu handles the error interrupt */ +lock_t apic_error_lock; + +static struct { + uchar_t cntl; + uchar_t data; +} aspen_bmc[] = { + { CC_SMS_WR_START, 0x18 }, /* NetFn/LUN */ + { CC_SMS_WR_NEXT, 0x24 }, /* Cmd SET_WATCHDOG_TIMER */ + { CC_SMS_WR_NEXT, 0x84 }, /* DataByte 1: SMS/OS no log */ + { CC_SMS_WR_NEXT, 0x2 }, /* DataByte 2: Power Down */ + { CC_SMS_WR_NEXT, 0x0 }, /* DataByte 3: no pre-timeout */ + { CC_SMS_WR_NEXT, 0x0 }, /* DataByte 4: timer expir. 
*/ + { CC_SMS_WR_NEXT, 0xa }, /* DataByte 5: init countdown */ + { CC_SMS_WR_END, 0x0 }, /* DataByte 6: init countdown */ + + { CC_SMS_WR_START, 0x18 }, /* NetFn/LUN */ + { CC_SMS_WR_END, 0x22 } /* Cmd RESET_WATCHDOG_TIMER */ +}; + +static struct { + int port; + uchar_t data; +} sitka_bmc[] = { + { SMS_COMMAND_REGISTER, SMS_WRITE_START }, + { SMS_DATA_REGISTER, 0x18 }, /* NetFn/LUN */ + { SMS_DATA_REGISTER, 0x24 }, /* Cmd SET_WATCHDOG_TIMER */ + { SMS_DATA_REGISTER, 0x84 }, /* DataByte 1: SMS/OS no log */ + { SMS_DATA_REGISTER, 0x2 }, /* DataByte 2: Power Down */ + { SMS_DATA_REGISTER, 0x0 }, /* DataByte 3: no pre-timeout */ + { SMS_DATA_REGISTER, 0x0 }, /* DataByte 4: timer expir. */ + { SMS_DATA_REGISTER, 0xa }, /* DataByte 5: init countdown */ + { SMS_COMMAND_REGISTER, SMS_WRITE_END }, + { SMS_DATA_REGISTER, 0x0 }, /* DataByte 6: init countdown */ + + { SMS_COMMAND_REGISTER, SMS_WRITE_START }, + { SMS_DATA_REGISTER, 0x18 }, /* NetFn/LUN */ + { SMS_COMMAND_REGISTER, SMS_WRITE_END }, + { SMS_DATA_REGISTER, 0x22 } /* Cmd RESET_WATCHDOG_TIMER */ +}; + +/* Patchable global variables. */ +int apic_kmdb_on_nmi = 0; /* 0 - no, 1 - yes enter kmdb */ +uint32_t apic_divide_reg_init = 0; /* 0 - divide by 2 */ + +/* default apic ops without interrupt remapping */ +static apic_intrmap_ops_t apic_nointrmap_ops = { + (int (*)(int))return_instr, + (void (*)(int))return_instr, + (void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr, + (void (*)(void *, void *, uint16_t, int))return_instr, + (void (*)(void **))return_instr, + apic_record_ioapic_rdt, + apic_record_msi, +}; + +apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops; +apic_cpus_info_t *apic_cpus = NULL; +cpuset_t apic_cpumask; +uint_t apic_picinit_called; + +/* Flag to indicate that we need to shut down all processors */ +static uint_t apic_shutdown_processors; + +/* + * Probe the ioapic method for apix module. Called in apic_probe_common() + */ +int +apic_ioapic_method_probe() +{ + if (apix_enable == 0) + return (PSM_SUCCESS); + + /* + * Set IOAPIC EOI handling method. The priority from low to high is: + * 1. IOxAPIC: with EOI register + * 2. IOMMU interrupt mapping + * 3. Mask-Before-EOI method for systems without boot + * interrupt routing, such as systems with only one IOAPIC; + * NVIDIA CK8-04/MCP55 systems; systems with bridge solution + * which disables the boot interrupt routing already. + * 4. Directed EOI + */ + if (apic_io_ver[0] >= 0x20) + apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC; + if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max)) + apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK; + if (apic_directed_EOI_supported()) + apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI; + + /* fall back to pcplusmp */ + if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) { + /* make sure apix is after pcplusmp in /etc/mach */ + apix_enable = 0; /* go ahead with pcplusmp install next */ + return (PSM_FAILURE); + } + + return (PSM_SUCCESS); +} + +/* + * handler for APIC Error interrupt. Just print a warning and continue + */ +int +apic_error_intr() +{ + uint_t error0, error1, error; + uint_t i; + + /* + * We need to write before read as per 7.4.17 of system prog manual. 
+ * We do both and or the results to be safe + */ + error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS); + apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); + error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS); + error = error0 | error1; + + /* + * Clear the APIC error status (do this on all cpus that enter here) + * (two writes are required due to the semantics of accessing the + * error status register.) + */ + apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); + apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0); + + /* + * Prevent more than 1 CPU from handling error interrupt causing + * double printing (interleave of characters from multiple + * CPU's when using prom_printf) + */ + if (lock_try(&apic_error_lock) == 0) + return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); + if (error) { +#if DEBUG + if (apic_debug) + debug_enter("pcplusmp: APIC Error interrupt received"); +#endif /* DEBUG */ + if (apic_panic_on_apic_error) + cmn_err(CE_PANIC, + "APIC Error interrupt on CPU %d. Status = %x", + psm_get_cpu_id(), error); + else { + if ((error & ~APIC_CS_ERRORS) == 0) { + /* cksum error only */ + apic_error |= APIC_ERR_APIC_ERROR; + apic_apic_error |= error; + apic_num_apic_errors++; + apic_num_cksum_errors++; + } else { + /* + * prom_printf is the best shot we have of + * something which is problem free from + * high level/NMI type of interrupts + */ + prom_printf("APIC Error interrupt on CPU %d. " + "Status 0 = %x, Status 1 = %x\n", + psm_get_cpu_id(), error0, error1); + apic_error |= APIC_ERR_APIC_ERROR; + apic_apic_error |= error; + apic_num_apic_errors++; + for (i = 0; i < apic_error_display_delay; i++) { + tenmicrosec(); + } + /* + * provide more delay next time limited to + * roughly 1 clock tick time + */ + if (apic_error_display_delay < 500) + apic_error_display_delay *= 2; + } + } + lock_clear(&apic_error_lock); + return (DDI_INTR_CLAIMED); + } else { + lock_clear(&apic_error_lock); + return (DDI_INTR_UNCLAIMED); + } +} + +/* + * Turn off the mask bit in the performance counter Local Vector Table entry. 
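+ * Editor's note (illustrative sketch, not part of this commit): LVT
+ * entries share a common mask bit, and the read-modify-write below
+ * clears only that bit while preserving the programmed vector and
+ * delivery bits, i.e. conceptually:
+ *
+ *	new_lvt = old_lvt & ~APIC_LVT_MASK;	(unmask, keep the rest)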
+ */ +void +apic_cpcovf_mask_clear(void) +{ + apic_reg_ops->apic_write(APIC_PCINT_VECT, + (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK)); +} + +/*ARGSUSED*/ +static int +apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) +{ + apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect); + return (0); +} + +/*ARGSUSED*/ +static int +apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) +{ + apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK); + return (0); +} + +/*ARGSUSED*/ +int +cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg) +{ + cpuset_t cpu_set; + + CPUSET_ONLY(cpu_set, cpuid); + + switch (what) { + case CPU_ON: + xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set), + (xc_func_t)apic_cmci_enable); + break; + + case CPU_OFF: + xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set), + (xc_func_t)apic_cmci_disable); + break; + + default: + break; + } + + return (0); +} + +static void +apic_disable_local_apic(void) +{ + apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL); + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK); + + /* local intr reg 0 */ + apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK); + + /* disable NMI */ + apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK); + + /* and error interrupt */ + apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK); + + /* and perf counter intr */ + apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK); + + apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR); +} + +static void +apic_cpu_send_SIPI(processorid_t cpun, boolean_t start) +{ + int loop_count; + uint32_t vector; + uint_t apicid; + ulong_t iflag; + + apicid = apic_cpus[cpun].aci_local_id; + + /* + * Interrupts on current CPU will be disabled during the + * steps in order to avoid unwanted side effects from + * executing interrupt handlers on a problematic BIOS. + */ + iflag = intr_clear(); + + if (start) { + outb(CMOS_ADDR, SSB); + outb(CMOS_DATA, BIOS_SHUTDOWN); + } + + /* + * According to X2APIC specification in section '2.3.5.1' of + * Interrupt Command Register Semantics, the semantics of + * programming the Interrupt Command Register to dispatch an interrupt + * is simplified. A single MSR write to the 64-bit ICR is required + * for dispatching an interrupt. Specifically, with the 64-bit MSR + * interface to ICR, system software is not required to check the + * status of the delivery status bit prior to writing to the ICR + * to send an IPI. With the removal of the Delivery Status bit, + * system software no longer has a reason to read the ICR. It remains + * readable only to aid in debugging. 
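+ * Editor's sketch (illustrative; APIC_INT_CMD2 is assumed from
+ * apic.h, the other names are used in this file). The two dispatch
+ * protocols differ roughly as follows:
+ *
+ *	xAPIC:	write the destination to APIC_INT_CMD2 and the command
+ *		to APIC_INT_CMD1, then poll AV_PENDING in APIC_INT_CMD1
+ *		before issuing the next IPI
+ *	x2APIC:	a single 64-bit MSR write carries both destination and
+ *		command; no AV_PENDING poll is needed (or possible)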
+ */ +#ifdef DEBUG + APIC_AV_PENDING_SET(); +#else + if (apic_mode == LOCAL_APIC) { + APIC_AV_PENDING_SET(); + } +#endif /* DEBUG */ + + /* for integrated - make sure there is one INIT IPI in buffer */ + /* for external - it will wake up the cpu */ + apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET); + + /* If only 1 CPU is installed, PENDING bit will not go low */ + for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) { + if (apic_mode == LOCAL_APIC && + apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING) + apic_ret(); + else + break; + } + + apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET); + drv_usecwait(20000); /* 20 milli sec */ + + if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) { + /* integrated apic */ + + vector = (rm_platter_pa >> MMU_PAGESHIFT) & + (APIC_VECTOR_MASK | APIC_IPL_MASK); + + /* to offset the INIT IPI queue up in the buffer */ + apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP); + drv_usecwait(200); /* 20 micro sec */ + + /* + * send the second SIPI (Startup IPI) as recommended by Intel + * software development manual. + */ + apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP); + drv_usecwait(200); /* 20 micro sec */ + } + + intr_restore(iflag); +} + +/*ARGSUSED1*/ +int +apic_cpu_start(processorid_t cpun, caddr_t arg) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!apic_cpu_in_range(cpun)) { + return (EINVAL); + } + + /* + * Switch to apic_common_send_ipi for safety during starting other CPUs. + */ + if (apic_mode == LOCAL_X2APIC) { + apic_switch_ipi_callback(B_TRUE); + } + + apic_cmos_ssb_set = 1; + apic_cpu_send_SIPI(cpun, B_TRUE); + + return (0); +} + +/* + * Put CPU into halted state with interrupts disabled. + */ +/*ARGSUSED1*/ +int +apic_cpu_stop(processorid_t cpun, caddr_t arg) +{ + int rc; + cpu_t *cp; + extern cpuset_t cpu_ready_set; + extern void cpu_idle_intercept_cpu(cpu_t *cp); + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!apic_cpu_in_range(cpun)) { + return (EINVAL); + } + if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) { + return (ENOTSUP); + } + + cp = cpu_get(cpun); + ASSERT(cp != NULL); + ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0); + ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0); + ASSERT((cp->cpu_flags & CPU_ENABLE) == 0); + + /* Clear CPU_READY flag to disable cross calls. */ + cp->cpu_flags &= ~CPU_READY; + CPUSET_ATOMIC_DEL(cpu_ready_set, cpun); + rc = xc_flush_cpu(cp); + if (rc != 0) { + CPUSET_ATOMIC_ADD(cpu_ready_set, cpun); + cp->cpu_flags |= CPU_READY; + return (rc); + } + + /* Intercept target CPU at a safe point before powering it off. */ + cpu_idle_intercept_cpu(cp); + + apic_cpu_send_SIPI(cpun, B_FALSE); + cp->cpu_flags &= ~CPU_RUNNING; + + return (0); +} + +int +apic_cpu_ops(psm_cpu_request_t *reqp) +{ + if (reqp == NULL) { + return (EINVAL); + } + + switch (reqp->pcr_cmd) { + case PSM_CPU_ADD: + return (apic_cpu_add(reqp)); + + case PSM_CPU_REMOVE: + return (apic_cpu_remove(reqp)); + + case PSM_CPU_STOP: + return (apic_cpu_stop(reqp->req.cpu_stop.cpuid, + reqp->req.cpu_stop.ctx)); + + default: + return (ENOTSUP); + } +} + +#ifdef DEBUG +int apic_break_on_cpu = 9; +int apic_stretch_interrupts = 0; +int apic_stretch_ISR = 1 << 3; /* IPL of 3 matches nothing now */ +#endif /* DEBUG */ + +/* + * generates an interprocessor interrupt to another CPU. Any changes made to + * this routine must be accompanied by similar changes to + * apic_common_send_ipi(). 
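+ * Editor's usage sketch (illustrative, assuming the PSM installs this
+ * routine as its psm_send_ipi op): callers pass an interrupt priority
+ * level, not a vector; the vector is looked up in apic_resv_vector[].
+ * E.g. send_dirint() in i86pc/os/intr.c reaches here through the
+ * send_dirintf hook:
+ *
+ *	send_dirint(cpuid, ipl);	(dispatches via *send_dirintf)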
+ */ +void +apic_send_ipi(int cpun, int ipl) +{ + int vector; + ulong_t flag; + + vector = apic_resv_vector[ipl]; + + ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR)); + + flag = intr_clear(); + + APIC_AV_PENDING_SET(); + + apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id, + vector); + + intr_restore(flag); +} + + +/*ARGSUSED*/ +void +apic_set_idlecpu(processorid_t cpun) +{ +} + +/*ARGSUSED*/ +void +apic_unset_idlecpu(processorid_t cpun) +{ +} + + +void +apic_ret() +{ +} + +/* + * If apic_coarse_time == 1, then apic_gettime() is used instead of + * apic_gethrtime(). This is used for performance instead of accuracy. + */ + +hrtime_t +apic_gettime() +{ + int old_hrtime_stamp; + hrtime_t temp; + + /* + * In one-shot mode, we do not keep time, so if anyone + * calls psm_gettime() directly, we vector over to + * gethrtime(). + * one-shot mode MUST NOT be enabled if this psm is the source of + * hrtime. + */ + + if (apic_oneshot) + return (gethrtime()); + + +gettime_again: + while ((old_hrtime_stamp = apic_hrtime_stamp) & 1) + apic_ret(); + + temp = apic_nsec_since_boot; + + if (apic_hrtime_stamp != old_hrtime_stamp) { /* got an interrupt */ + goto gettime_again; + } + return (temp); +} + +/* + * Here we return the number of nanoseconds since booting. Note every + * clock interrupt increments apic_nsec_since_boot by the appropriate + * amount. + */ +hrtime_t +apic_gethrtime(void) +{ + int curr_timeval, countval, elapsed_ticks; + int old_hrtime_stamp, status; + hrtime_t temp; + uint32_t cpun; + ulong_t oflags; + + /* + * In one-shot mode, we do not keep time, so if anyone + * calls psm_gethrtime() directly, we vector over to + * gethrtime(). + * one-shot mode MUST NOT be enabled if this psm is the source of + * hrtime. + */ + + if (apic_oneshot) + return (gethrtime()); + + oflags = intr_clear(); /* prevent migration */ + + cpun = apic_reg_ops->apic_read(APIC_LID_REG); + if (apic_mode == LOCAL_APIC) + cpun >>= APIC_ID_BIT_OFFSET; + + lock_set(&apic_gethrtime_lock); + +gethrtime_again: + while ((old_hrtime_stamp = apic_hrtime_stamp) & 1) + apic_ret(); + + /* + * Check to see which CPU we are on. Note the time is kept on + * the local APIC of CPU 0. If on CPU 0, simply read the current + * counter. If on another CPU, issue a remote read command to CPU 0. + */ + if (cpun == apic_cpus[0].aci_local_id) { + countval = apic_reg_ops->apic_read(APIC_CURR_COUNT); + } else { +#ifdef DEBUG + APIC_AV_PENDING_SET(); +#else + if (apic_mode == LOCAL_APIC) + APIC_AV_PENDING_SET(); +#endif /* DEBUG */ + + apic_reg_ops->apic_write_int_cmd( + apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE); + + while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1)) + & AV_READ_PENDING) { + apic_ret(); + } + + if (status & AV_REMOTE_STATUS) /* 1 = valid */ + countval = apic_reg_ops->apic_read(APIC_REMOTE_READ); + else { /* 0 = invalid */ + apic_remote_hrterr++; + /* + * return last hrtime right now, will need more + * testing if change to retry + */ + temp = apic_last_hrtime; + + lock_clear(&apic_gethrtime_lock); + + intr_restore(oflags); + + return (temp); + } + } + if (countval > last_count_read) + countval = 0; + else + last_count_read = countval; + + elapsed_ticks = apic_hertz_count - countval; + + curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks); + temp = apic_nsec_since_boot + curr_timeval; + + if (apic_hrtime_stamp != old_hrtime_stamp) { /* got an interrupt */ + /* we might have clobbered last_count_read. 
Restore it */ + last_count_read = apic_hertz_count; + goto gethrtime_again; + } + + if (temp < apic_last_hrtime) { + /* return last hrtime if error occurs */ + apic_hrtime_error++; + temp = apic_last_hrtime; + } + else + apic_last_hrtime = temp; + + lock_clear(&apic_gethrtime_lock); + intr_restore(oflags); + + return (temp); +} + +/* apic NMI handler */ +/*ARGSUSED*/ +void +apic_nmi_intr(caddr_t arg, struct regs *rp) +{ + if (apic_shutdown_processors) { + apic_disable_local_apic(); + return; + } + + apic_error |= APIC_ERR_NMI; + + if (!lock_try(&apic_nmi_lock)) + return; + apic_num_nmis++; + + if (apic_kmdb_on_nmi && psm_debugger()) { + debug_enter("NMI received: entering kmdb\n"); + } else if (apic_panic_on_nmi) { + /* Keep panic from entering kmdb. */ + nopanicdebug = 1; + panic("NMI received\n"); + } else { + /* + * prom_printf is the best shot we have of something which is + * problem free from high level/NMI type of interrupts + */ + prom_printf("NMI received\n"); + } + + lock_clear(&apic_nmi_lock); +} + +processorid_t +apic_get_next_processorid(processorid_t cpu_id) +{ + + int i; + + if (cpu_id == -1) + return ((processorid_t)0); + + for (i = cpu_id + 1; i < NCPU; i++) { + if (apic_cpu_in_range(i)) + return (i); + } + + return ((processorid_t)-1); +} + +int +apic_cpu_add(psm_cpu_request_t *reqp) +{ + int i, rv = 0; + ulong_t iflag; + boolean_t first = B_TRUE; + uchar_t localver; + uint32_t localid, procid; + processorid_t cpuid = (processorid_t)-1; + mach_cpu_add_arg_t *ap; + + ASSERT(reqp != NULL); + reqp->req.cpu_add.cpuid = (processorid_t)-1; + + /* Check whether CPU hotplug is supported. */ + if (!plat_dr_support_cpu() || apic_max_nproc == -1) { + return (ENOTSUP); + } + + ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp; + switch (ap->type) { + case MACH_CPU_ARG_LOCAL_APIC: + localid = ap->arg.apic.apic_id; + procid = ap->arg.apic.proc_id; + if (localid >= 255 || procid > 255) { + cmn_err(CE_WARN, + "!apic: apicid(%u) or procid(%u) is invalid.", + localid, procid); + return (EINVAL); + } + break; + + case MACH_CPU_ARG_LOCAL_X2APIC: + localid = ap->arg.apic.apic_id; + procid = ap->arg.apic.proc_id; + if (localid >= UINT32_MAX) { + cmn_err(CE_WARN, + "!apic: x2apicid(%u) is invalid.", localid); + return (EINVAL); + } else if (localid >= 255 && apic_mode == LOCAL_APIC) { + cmn_err(CE_WARN, "!apic: system is in APIC mode, " + "can't support x2APIC processor."); + return (ENOTSUP); + } + break; + + default: + cmn_err(CE_WARN, + "!apic: unknown argument type %d to apic_cpu_add().", + ap->type); + return (EINVAL); + } + + /* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* Check whether local APIC id already exists. */ + for (i = 0; i < apic_nproc; i++) { + if (!CPU_IN_SET(apic_cpumask, i)) + continue; + if (apic_cpus[i].aci_local_id == localid) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: local apic id %u already exists.", + localid); + return (EEXIST); + } else if (apic_cpus[i].aci_processor_id == procid) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: processor id %u already exists.", + (int)procid); + return (EEXIST); + } + + /* + * There's no local APIC version number available in MADT table, + * so assume that all CPUs are homogeneous and use local APIC + * version number of the first existing CPU. 
+ */ + if (first) { + first = B_FALSE; + localver = apic_cpus[i].aci_local_ver; + } + } + ASSERT(first == B_FALSE); + + /* + * Try to assign the same cpuid if APIC id exists in the dirty cache. + */ + for (i = 0; i < apic_max_nproc; i++) { + if (CPU_IN_SET(apic_cpumask, i)) { + ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0); + continue; + } + ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE); + if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) && + apic_cpus[i].aci_local_id == localid && + apic_cpus[i].aci_processor_id == procid) { + cpuid = i; + break; + } + } + + /* Avoid the dirty cache and allocate fresh slot if possible. */ + if (cpuid == (processorid_t)-1) { + for (i = 0; i < apic_max_nproc; i++) { + if ((apic_cpus[i].aci_status & APIC_CPU_FREE) && + (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) { + cpuid = i; + break; + } + } + } + + /* Try to find any free slot as last resort. */ + if (cpuid == (processorid_t)-1) { + for (i = 0; i < apic_max_nproc; i++) { + if (apic_cpus[i].aci_status & APIC_CPU_FREE) { + cpuid = i; + break; + } + } + } + + if (cpuid == (processorid_t)-1) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_NOTE, + "!apic: failed to allocate cpu id for processor %u.", + procid); + rv = EAGAIN; + } else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_NOTE, + "!apic: failed to build mapping for processor %u.", + procid); + rv = EBUSY; + } else { + ASSERT(cpuid >= 0 && cpuid < NCPU); + ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus); + bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0])); + apic_cpus[cpuid].aci_processor_id = procid; + apic_cpus[cpuid].aci_local_id = localid; + apic_cpus[cpuid].aci_local_ver = localver; + CPUSET_ATOMIC_ADD(apic_cpumask, cpuid); + if (cpuid >= apic_nproc) { + apic_nproc = cpuid + 1; + } + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + reqp->req.cpu_add.cpuid = cpuid; + } + + return (rv); +} + +int +apic_cpu_remove(psm_cpu_request_t *reqp) +{ + int i; + ulong_t iflag; + processorid_t cpuid; + + /* Check whether CPU hotplug is supported. */ + if (!plat_dr_support_cpu() || apic_max_nproc == -1) { + return (ENOTSUP); + } + + cpuid = reqp->req.cpu_remove.cpuid; + + /* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + if (!apic_cpu_in_range(cpuid)) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: cpuid %d doesn't exist in apic_cpus array.", + cpuid); + return (ENODEV); + } + ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0); + + if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + return (ENOENT); + } + + if (cpuid == apic_nproc - 1) { + /* + * We are removing the highest numbered cpuid so we need to + * find the next highest cpuid as the new value for apic_nproc. + */ + for (i = apic_nproc; i > 0; i--) { + if (CPU_IN_SET(apic_cpumask, i - 1)) { + apic_nproc = i; + break; + } + } + /* at least one CPU left */ + ASSERT(i > 0); + } + CPUSET_ATOMIC_DEL(apic_cpumask, cpuid); + /* mark slot as free and keep it in the dirty cache */ + apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY; + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + return (0); +} + +/* + * Return the number of APIC clock ticks elapsed for 8245 to decrement + * (APIC_TIME_COUNT + pit_ticks_adj) ticks. 
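+ * Editor's worked example (illustrative): the 8254 PIT decrements at
+ * PIT_HZ (about 1.193182 MHz), so if the local APIC counter moves by
+ * apic_ticks while the PIT moves by pit_ticks, the APIC timer runs at
+ *
+ *	apic_freq = apic_ticks * PIT_HZ / pit_ticks
+ *
+ * which is the relationship apic_clkinit() below folds, scaled by SF,
+ * into apic_ticks_per_SFnsecs.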
+ */ +static uint_t +apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) +{ + uint8_t pit_tick_lo; + uint16_t pit_tick, target_pit_tick; + uint32_t start_apic_tick, end_apic_tick; + ulong_t iflag; + uint32_t reg; + + reg = addr + APIC_CURR_COUNT - apicadr; + + iflag = intr_clear(); + + do { + pit_tick_lo = inb(PITCTR0_PORT); + pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; + } while (pit_tick < APIC_TIME_MIN || + pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX); + + /* + * Wait for the 8254 to decrement by 5 ticks to ensure + * we didn't start in the middle of a tick. + * Compare with 0x10 for the wrap around case. + */ + target_pit_tick = pit_tick - 5; + do { + pit_tick_lo = inb(PITCTR0_PORT); + pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; + } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); + + start_apic_tick = apic_reg_ops->apic_read(reg); + + /* + * Wait for the 8254 to decrement by + * (APIC_TIME_COUNT + pit_ticks_adj) ticks + */ + target_pit_tick = pit_tick - APIC_TIME_COUNT; + do { + pit_tick_lo = inb(PITCTR0_PORT); + pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; + } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); + + end_apic_tick = apic_reg_ops->apic_read(reg); + + *pit_ticks_adj = target_pit_tick - pit_tick; + + intr_restore(iflag); + + return (start_apic_tick - end_apic_tick); +} + +/* + * Initialise the APIC timer on the local APIC of CPU 0 to the desired + * frequency. Note at this stage in the boot sequence, the boot processor + * is the only active processor. + * hertz value of 0 indicates a one-shot mode request. In this case + * the function returns the resolution (in nanoseconds) for the hardware + * timer interrupt. If one-shot mode capability is not available, + * the return value will be 0. apic_enable_oneshot is a global switch + * for disabling the functionality. + * A non-zero positive value for hertz indicates a periodic mode request. + * In this case the hardware will be programmed to generate clock interrupts + * at hertz frequency and returns the resolution of interrupts in + * nanosecond. 
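+ * Editor's usage sketch (illustrative, restating the contract above):
+ *
+ *	res = apic_clkinit(0);	 one-shot: res is ns per timer tick,
+ *				 or 0 if one-shot mode is unavailable
+ *	res = apic_clkinit(hz);	 periodic: res is NANOSEC / hz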
+ */ + +int +apic_clkinit(int hertz) +{ + uint_t apic_ticks = 0; + uint_t pit_ticks; + int ret; + uint16_t pit_ticks_adj; + static int firsttime = 1; + + if (firsttime) { + /* first time calibrate on CPU0 only */ + + apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); + apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); + + /* total number of PIT ticks corresponding to apic_ticks */ + pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; + + /* + * Determine the number of nanoseconds per APIC clock tick + * and then determine how many APIC ticks to interrupt at the + * desired frequency + * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s + * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s + * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) + * pic_ticks_per_SFns = + * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) + */ + apic_ticks_per_SFnsecs = + ((SF * apic_ticks * PIT_HZ) / + ((uint64_t)pit_ticks * NANOSEC)); + + /* the interval timer initial count is 32 bit max */ + apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); + firsttime = 0; + } + + if (hertz != 0) { + /* periodic */ + apic_nsec_per_intr = NANOSEC / hertz; + apic_hertz_count = APIC_NSECS_TO_TICKS(apic_nsec_per_intr); + } + + apic_int_busy_mark = (apic_int_busy_mark * + apic_sample_factor_redistribution) / 100; + apic_int_free_mark = (apic_int_free_mark * + apic_sample_factor_redistribution) / 100; + apic_diff_for_redistribution = (apic_diff_for_redistribution * + apic_sample_factor_redistribution) / 100; + + if (hertz == 0) { + /* requested one_shot */ + if (!tsc_gethrtime_enable || !apic_oneshot_enable) + return (0); + apic_oneshot = 1; + ret = (int)APIC_TICKS_TO_NSECS(1); + } else { + /* program the local APIC to interrupt at the given frequency */ + apic_reg_ops->apic_write(APIC_INIT_COUNT, apic_hertz_count); + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_TIME); + apic_oneshot = 0; + ret = NANOSEC / hertz; + } + + return (ret); + +} + +/* + * apic_preshutdown: + * Called early in shutdown whilst we can still access filesystems to do + * things like loading modules which will be required to complete shutdown + * after filesystems are all unmounted. + */ +void +apic_preshutdown(int cmd, int fcn) +{ + APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n", + cmd, fcn, apic_poweroff_method, apic_enable_acpi)); +} + +void +apic_shutdown(int cmd, int fcn) +{ + int restarts, attempts; + int i; + uchar_t byte; + ulong_t iflag; + + hpet_acpi_fini(); + + /* Send NMI to all CPUs except self to do per processor shutdown */ + iflag = intr_clear(); +#ifdef DEBUG + APIC_AV_PENDING_SET(); +#else + if (apic_mode == LOCAL_APIC) + APIC_AV_PENDING_SET(); +#endif /* DEBUG */ + apic_shutdown_processors = 1; + apic_reg_ops->apic_write(APIC_INT_CMD1, + AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF); + + /* restore cmos shutdown byte before reboot */ + if (apic_cmos_ssb_set) { + outb(CMOS_ADDR, SSB); + outb(CMOS_DATA, 0); + } + + ioapic_disable_redirection(); + + /* disable apic mode if imcr present */ + if (apic_imcrp) { + outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT); + outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC); + } + + apic_disable_local_apic(); + + intr_restore(iflag); + + /* remainder of function is for shutdown cases only */ + if (cmd != A_SHUTDOWN) + return; + + /* + * Switch system back into Legacy-Mode if using ACPI and + * not powering-off. 
Some BIOSes need to remain in ACPI-mode + * for power-off to succeed (Dell Dimension 4600) + * Do not disable ACPI while doing fastreboot + */ + if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT) + (void) AcpiDisable(); + + if (fcn == AD_FASTREBOOT) { + apic_reg_ops->apic_write(APIC_INT_CMD1, + AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF); + } + + /* remainder of function is for shutdown+poweroff case only */ + if (fcn != AD_POWEROFF) + return; + + switch (apic_poweroff_method) { + case APIC_POWEROFF_VIA_RTC: + + /* select the extended NVRAM bank in the RTC */ + outb(CMOS_ADDR, RTC_REGA); + byte = inb(CMOS_DATA); + outb(CMOS_DATA, (byte | EXT_BANK)); + + outb(CMOS_ADDR, PFR_REG); + + /* for Predator must toggle the PAB bit */ + byte = inb(CMOS_DATA); + + /* + * clear power active bar, wakeup alarm and + * kickstart + */ + byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG); + outb(CMOS_DATA, byte); + + /* delay before next write */ + drv_usecwait(1000); + + /* for S40 the following would suffice */ + byte = inb(CMOS_DATA); + + /* power active bar control bit */ + byte |= PAB_CBIT; + outb(CMOS_DATA, byte); + + break; + + case APIC_POWEROFF_VIA_ASPEN_BMC: + restarts = 0; +restart_aspen_bmc: + if (++restarts == 3) + break; + attempts = 0; + do { + byte = inb(MISMIC_FLAG_REGISTER); + byte &= MISMIC_BUSY_MASK; + if (byte != 0) { + drv_usecwait(1000); + if (attempts >= 3) + goto restart_aspen_bmc; + ++attempts; + } + } while (byte != 0); + outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS); + byte = inb(MISMIC_FLAG_REGISTER); + byte |= 0x1; + outb(MISMIC_FLAG_REGISTER, byte); + i = 0; + for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0])); + i++) { + attempts = 0; + do { + byte = inb(MISMIC_FLAG_REGISTER); + byte &= MISMIC_BUSY_MASK; + if (byte != 0) { + drv_usecwait(1000); + if (attempts >= 3) + goto restart_aspen_bmc; + ++attempts; + } + } while (byte != 0); + outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl); + outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data); + byte = inb(MISMIC_FLAG_REGISTER); + byte |= 0x1; + outb(MISMIC_FLAG_REGISTER, byte); + } + break; + + case APIC_POWEROFF_VIA_SITKA_BMC: + restarts = 0; +restart_sitka_bmc: + if (++restarts == 3) + break; + attempts = 0; + do { + byte = inb(SMS_STATUS_REGISTER); + byte &= SMS_STATE_MASK; + if ((byte == SMS_READ_STATE) || + (byte == SMS_WRITE_STATE)) { + drv_usecwait(1000); + if (attempts >= 3) + goto restart_sitka_bmc; + ++attempts; + } + } while ((byte == SMS_READ_STATE) || + (byte == SMS_WRITE_STATE)); + outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS); + i = 0; + for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0])); + i++) { + attempts = 0; + do { + byte = inb(SMS_STATUS_REGISTER); + byte &= SMS_IBF_MASK; + if (byte != 0) { + drv_usecwait(1000); + if (attempts >= 3) + goto restart_sitka_bmc; + ++attempts; + } + } while (byte != 0); + outb(sitka_bmc[i].port, sitka_bmc[i].data); + } + break; + + case APIC_POWEROFF_NONE: + + /* If no APIC direct method, we will try using ACPI */ + if (apic_enable_acpi) { + if (acpi_poweroff() == 1) + return; + } else + return; + + break; + } + /* + * Wait a limited time here for power to go off. + * If the power does not go off, then there was a + * problem and we should continue to the halt which + * prints a message for the user to press a key to + * reboot. + */ + drv_usecwait(7000000); /* wait seven seconds */ + +} + +/* + * This function will reprogram the timer. + * + * When in oneshot mode the argument is the absolute time in future to + * generate the interrupt at. 
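+ * For example (editor's illustration, not part of this commit), to
+ * request a one-shot interrupt roughly one millisecond from now:
+ *
+ *	apic_timer_reprogram(gethrtime() + 1000000);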
+ * + * When in periodic mode, the argument is the interval at which the + * interrupts should be generated. There is no need to support the periodic + * mode timer change at this time. + */ +void +apic_timer_reprogram(hrtime_t time) +{ + hrtime_t now; + uint_t ticks; + int64_t delta; + + /* + * We should be called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + + if (!apic_oneshot) { + /* time is the interval for periodic mode */ + ticks = APIC_NSECS_TO_TICKS(time); + } else { + /* one shot mode */ + + now = gethrtime(); + delta = time - now; + + if (delta <= 0) { + /* + * requested to generate an interrupt in the past + * generate an interrupt as soon as possible + */ + ticks = apic_min_timer_ticks; + } else if (delta > apic_nsec_max) { + /* + * requested to generate an interrupt at a time + * further than what we are capable of. Set to max + * the hardware can handle + */ + + ticks = APIC_MAXVAL; +#ifdef DEBUG + cmn_err(CE_CONT, "apic_timer_reprogram, request at" + " %lld too far in future, current time" + " %lld \n", time, now); +#endif + } else + ticks = APIC_NSECS_TO_TICKS(delta); + } + + if (ticks < apic_min_timer_ticks) + ticks = apic_min_timer_ticks; + + apic_reg_ops->apic_write(APIC_INIT_COUNT, ticks); +} + +/* + * This function will enable timer interrupts. + */ +void +apic_timer_enable(void) +{ + /* + * We should be Called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + + if (!apic_oneshot) { + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_TIME); + } else { + /* one shot */ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT)); + } +} + +/* + * This function will disable timer interrupts. + */ +void +apic_timer_disable(void) +{ + /* + * We should be Called from high PIL context (CBE_HIGH_PIL), + * so kpreempt is disabled. + */ + apic_reg_ops->apic_write(APIC_LOCAL_TIMER, + (apic_clkvect + APIC_BASE_VECT) | AV_MASK); +} + +/* + * Set timer far into the future and return timer + * current Count in nanoseconds. + */ +hrtime_t +apic_timer_stop_count(void) +{ + hrtime_t ns_val; + int enable_val, count_val; + + /* + * Should be called with interrupts disabled. + */ + ASSERT(!interrupts_enabled()); + + enable_val = apic_reg_ops->apic_read(APIC_LOCAL_TIMER); + if ((enable_val & AV_MASK) == AV_MASK) + return ((hrtime_t)-1); /* timer is disabled */ + + count_val = apic_reg_ops->apic_read(APIC_CURR_COUNT); + ns_val = APIC_TICKS_TO_NSECS(count_val); + + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); + + return (ns_val); +} + +/* + * Reprogram timer after Deep C-State. + */ +void +apic_timer_restart(hrtime_t time) +{ + apic_timer_reprogram(time); +} + +ddi_periodic_t apic_periodic_id; + +/* + * The following functions are in the platform specific file so that they + * can be different functions depending on whether we are running on + * bare metal or a hypervisor. 
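+ * Editor's note (illustrative): on bare metal these end up as thin
+ * wrappers over psm_map_phys()/psm_unmap_phys(); a hypervisor
+ * platform file may supply different implementations under the same
+ * names.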
+ */ + +/* + * map an apic for memory-mapped access + */ +uint32_t * +mapin_apic(uint32_t addr, size_t len, int flags) +{ + return ((void *)psm_map_phys(addr, len, flags)); +} + +uint32_t * +mapin_ioapic(uint32_t addr, size_t len, int flags) +{ + return (mapin_apic(addr, len, flags)); +} + +/* + * unmap an apic + */ +void +mapout_apic(caddr_t addr, size_t len) +{ + psm_unmap_phys(addr, len); +} + +void +mapout_ioapic(caddr_t addr, size_t len) +{ + mapout_apic(addr, len); +} + +uint32_t +ioapic_read(int ioapic_ix, uint32_t reg) +{ + volatile uint32_t *ioapic; + + ioapic = apicioadr[ioapic_ix]; + ioapic[APIC_IO_REG] = reg; + return (ioapic[APIC_IO_DATA]); +} + +void +ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value) +{ + volatile uint32_t *ioapic; + + ioapic = apicioadr[ioapic_ix]; + ioapic[APIC_IO_REG] = reg; + ioapic[APIC_IO_DATA] = value; +} + +void +ioapic_write_eoi(int ioapic_ix, uint32_t value) +{ + volatile uint32_t *ioapic; + + ioapic = apicioadr[ioapic_ix]; + ioapic[APIC_IO_EOI] = value; +} + +/* + * Round-robin algorithm to find the next CPU with interrupts enabled. + * It can't share the same static variable apic_next_bind_cpu with + * apic_get_next_bind_cpu(), since that will cause all interrupts to be + * bound to CPU1 at boot time. During boot, only CPU0 is online with + * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu() + * are called. However, the pcplusmp driver assumes that there will be + * boot_ncpus CPUs configured eventually so it tries to distribute all + * interrupts among CPU0 - CPU[boot_ncpus - 1]. Thus to prevent all + * interrupts being targetted at CPU1, we need to use a dedicated static + * variable for find_next_cpu() instead of sharing apic_next_bind_cpu. + */ + +processorid_t +apic_find_cpu(int flag) +{ + int i; + static processorid_t acid = 0; + + /* Find the first CPU with the passed-in flag set */ + for (i = 0; i < apic_nproc; i++) { + if (++acid >= apic_nproc) { + acid = 0; + } + if (apic_cpu_in_range(acid) && + (apic_cpus[acid].aci_status & flag)) { + break; + } + } + + ASSERT((apic_cpus[acid].aci_status & flag) != 0); + return (acid); +} + +/* + * Switch between safe and x2APIC IPI sending method. + * CPU may power on in xapic mode or x2apic mode. If CPU needs to send IPI to + * other CPUs before entering x2APIC mode, it still needs to xAPIC method. + * Before sending StartIPI to target CPU, psm_send_ipi will be changed to + * apic_common_send_ipi, which detects current local APIC mode and use right + * method to send IPI. If some CPUs fail to start up, apic_poweron_cnt + * won't return to zero, so apic_common_send_ipi will always be used. + * psm_send_ipi can't be simply changed back to x2apic_send_ipi if some CPUs + * failed to start up because those failed CPUs may recover itself later at + * unpredictable time. 
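+ * Editor's sketch of the expected pairing (illustrative; the B_FALSE
+ * call is assumed to happen once the started CPU is fully online):
+ *
+ *	apic_switch_ipi_callback(B_TRUE);	enter: safe IPI method
+ *	... send INIT/STARTUP IPIs and wait for the CPU ...
+ *	apic_switch_ipi_callback(B_FALSE);	exit: x2apic_send_ipi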
+ */ +void +apic_switch_ipi_callback(boolean_t enter) +{ + ulong_t iflag; + struct psm_ops *pops = psmops; + + iflag = intr_clear(); + lock_set(&apic_mode_switch_lock); + if (enter) { + ASSERT(apic_poweron_cnt >= 0); + if (apic_poweron_cnt == 0) { + pops->psm_send_ipi = apic_common_send_ipi; + send_dirintf = pops->psm_send_ipi; + } + apic_poweron_cnt++; + } else { + ASSERT(apic_poweron_cnt > 0); + apic_poweron_cnt--; + if (apic_poweron_cnt == 0) { + pops->psm_send_ipi = x2apic_send_ipi; + send_dirintf = pops->psm_send_ipi; + } + } + lock_clear(&apic_mode_switch_lock); + intr_restore(iflag); +} + +void +apic_intrmap_init(int apic_mode) +{ + int suppress_brdcst_eoi = 0; + + if (psm_vt_ops != NULL) { + /* + * Since X2APIC requires the use of interrupt remapping + * (though this is not documented explicitly in the Intel + * documentation (yet)), initialize interrupt remapping + * support before initializing the X2APIC unit. + */ + if (((apic_intrmap_ops_t *)psm_vt_ops)-> + apic_intrmap_init(apic_mode) == DDI_SUCCESS) { + + apic_vt_ops = psm_vt_ops; + + /* + * We leverage the interrupt remapping engine to + * suppress broadcast EOI; thus we must send the + * directed EOI with the directed-EOI handler. + */ + if (apic_directed_EOI_supported() == 0) { + suppress_brdcst_eoi = 1; + } + + apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi); + + if (apic_detect_x2apic()) { + apic_enable_x2apic(); + } + + if (apic_directed_EOI_supported() == 0) { + apic_set_directed_EOI_handler(); + } + } + } +} + +/*ARGSUSED*/ +static void +apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt) +{ + irdt->ir_hi <<= APIC_ID_BIT_OFFSET; +} + +/*ARGSUSED*/ +static void +apic_record_msi(void *intrmap_private, msi_regs_t *mregs) +{ + mregs->mr_addr = MSI_ADDR_HDR | + (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | + (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) | + (mregs->mr_addr << MSI_ADDR_DEST_SHIFT); + mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | + mregs->mr_data; +} + +/* + * Functions from apic_introp.c + * + * Those functions are used by apic_intr_ops(). + */ + +/* + * MSI support flag: + * reflects whether MSI is supported at APIC level + * it can also be patched through /etc/system + * + * 0 = default value - don't know and need to call apic_check_msi_support() + * to find out then set it accordingly + * 1 = supported + * -1 = not supported + */ +int apic_support_msi = 0; + +/* Multiple vector support for MSI-X */ +int apic_msix_enable = 1; + +/* Multiple vector support for MSI */ +int apic_multi_msi_enable = 1; + +/* + * check whether the system supports MSI + * + * If PCI-E capability is found, then this must be a PCI-E system. + * Since MSI is required for PCI-E system, it returns PSM_SUCCESS + * to indicate this system supports MSI. 
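+ * Editor's sketch (illustrative) of how a caller would typically
+ * latch the tri-state apic_support_msi flag described above:
+ *
+ *	if (apic_support_msi == 0)
+ *		apic_support_msi =
+ *		    (apic_check_msi_support() == PSM_SUCCESS) ? 1 : -1;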
+ */ +int +apic_check_msi_support() +{ + dev_info_t *cdip; + char dev_type[16]; + int dev_len; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n")); + + /* + * check whether the first level children of root_node have + * PCI-E capability + */ + for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL; + cdip = ddi_get_next_sibling(cdip)) { + + DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p," + " driver: %s, binding: %s, nodename: %s\n", (void *)cdip, + ddi_driver_name(cdip), ddi_binding_name(cdip), + ddi_node_name(cdip))); + dev_len = sizeof (dev_type); + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS, + "device_type", (caddr_t)dev_type, &dev_len) + != DDI_PROP_SUCCESS) + continue; + if (strcmp(dev_type, "pciex") == 0) + return (PSM_SUCCESS); + } + + /* MSI is not supported on this system */ + DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' " + "device_type found\n")); + return (PSM_FAILURE); +} + +/* + * apic_pci_msi_unconfigure: + * + * This and next two interfaces are copied from pci_intr_lib.c + * Do ensure that these two files stay in sync. + * These needed to be copied over here to avoid a deadlock situation on + * certain mp systems that use MSI interrupts. + * + * IMPORTANT regards next three interfaces: + * i) are called only for MSI/X interrupts. + * ii) called with interrupts disabled, and must not block + */ +void +apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum) +{ + ushort_t msi_ctrl; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + if (type == DDI_INTR_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + msi_ctrl &= (~PCI_MSI_MME_MASK); + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0); + + if (msi_ctrl & PCI_MSI_64BIT_MASK) { + pci_config_put16(handle, + cap_ptr + PCI_MSI_64BIT_DATA, 0); + pci_config_put32(handle, + cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0); + } else { + pci_config_put16(handle, + cap_ptr + PCI_MSI_32BIT_DATA, 0); + } + + } else if (type == DDI_INTR_TYPE_MSIX) { + uintptr_t off; + uint32_t mask; + ddi_intr_msix_t *msix_p = i_ddi_get_msix(rdip); + + ASSERT(msix_p != NULL); + + /* Offset into "inum"th entry in the MSI-X table & mask it */ + off = (uintptr_t)msix_p->msix_tbl_addr + (inum * + PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; + + mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); + + ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1)); + + /* Offset into the "inum"th entry in the MSI-X table */ + off = (uintptr_t)msix_p->msix_tbl_addr + + (inum * PCI_MSIX_VECTOR_SIZE); + + /* Reset the "data" and "addr" bits */ + ddi_put32(msix_p->msix_tbl_hdl, + (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0); + ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0); + } +} + +/* + * apic_pci_msi_disable_mode: + */ +void +apic_pci_msi_disable_mode(dev_info_t *rdip, int type) +{ + ushort_t msi_ctrl; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + if (type == DDI_INTR_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + if (!(msi_ctrl & PCI_MSI_ENABLE_BIT)) + return; + + msi_ctrl &= ~PCI_MSI_ENABLE_BIT; /* MSI disable */ + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + + } else if (type == DDI_INTR_TYPE_MSIX) { + msi_ctrl = 
pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); + if (msi_ctrl & PCI_MSIX_ENABLE_BIT) { + msi_ctrl &= ~PCI_MSIX_ENABLE_BIT; + pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL, + msi_ctrl); + } + } +} + +uint32_t +apic_get_localapicid(uint32_t cpuid) +{ + ASSERT(cpuid < apic_nproc && apic_cpus != NULL); + + return (apic_cpus[cpuid].aci_local_id); +} + +uchar_t +apic_get_ioapicid(uchar_t ioapicindex) +{ + ASSERT(ioapicindex < MAX_IO_APIC); + + return (apic_io_id[ioapicindex]); +} diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c index 7663263abb..46f1257ecd 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c @@ -38,6 +38,7 @@ #include <sys/trap.h> #include <sys/pci.h> #include <sys/pci_intr_lib.h> +#include <sys/apic_common.h> extern struct av_head autovect[]; @@ -47,24 +48,6 @@ extern struct av_head autovect[]; apic_irq_t *apic_find_irq(dev_info_t *, struct intrspec *, int); /* - * MSI support flag: - * reflects whether MSI is supported at APIC level - * it can also be patched through /etc/system - * - * 0 = default value - don't know and need to call apic_check_msi_support() - * to find out then set it accordingly - * 1 = supported - * -1 = not supported - */ -int apic_support_msi = 0; - -/* Multiple vector support for MSI */ -int apic_multi_msi_enable = 1; - -/* Multiple vector support for MSI-X */ -int apic_msix_enable = 1; - -/* * apic_pci_msi_enable_vector: * Set the address/data fields in the MSI/X capability structure * XXX: MSI-X support @@ -79,9 +62,9 @@ apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, dev_info_t *dip = irq_ptr->airq_dip; int cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip); -#if !defined(__xpv) msi_regs_t msi_regs; -#endif /* ! __xpv */ + int irqno, i; + void *intrmap_tbl[PCI_MSI_MAX_INTRS]; DDI_INTR_IMPLDBG((CE_CONT, "apic_pci_msi_enable_vector: dip=0x%p\n" "\tdriver = %s, inum=0x%x vector=0x%x apicid=0x%x\n", (void *)dip, @@ -89,29 +72,28 @@ apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, ASSERT((handle != NULL) && (cap_ptr != 0)); -#if !defined(__xpv) msi_regs.mr_data = vector; msi_regs.mr_addr = target_apic_id; - apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr); - apic_vt_ops->apic_intrmap_map_entry(irq_ptr, (void *)&msi_regs); - apic_vt_ops->apic_intrmap_record_msi(irq_ptr, &msi_regs); + intrmap_tbl[0] = irq_ptr->airq_intrmap_private; + apic_vt_ops->apic_intrmap_alloc_entry(intrmap_tbl, dip, type, + count, 0xff); + for (i = 0; i < count; i++) { + irqno = apic_vector_to_irq[vector + i]; + apic_irq_table[irqno]->airq_intrmap_private = + intrmap_tbl[i]; + } + + apic_vt_ops->apic_intrmap_map_entry(irq_ptr->airq_intrmap_private, + (void *)&msi_regs, type, count); + apic_vt_ops->apic_intrmap_record_msi(irq_ptr->airq_intrmap_private, + &msi_regs); /* MSI Address */ msi_addr = msi_regs.mr_addr; /* MSI Data: MSI is edge triggered according to spec */ msi_data = msi_regs.mr_data; -#else - /* MSI Address */ - msi_addr = (MSI_ADDR_HDR | - (target_apic_id << MSI_ADDR_DEST_SHIFT)); - msi_addr |= ((MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | - (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT)); - - /* MSI Data: MSI is edge triggered according to spec */ - msi_data = ((MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | vector); -#endif /* ! 
__xpv */ DDI_INTR_IMPLDBG((CE_CONT, "apic_pci_msi_enable_vector: addr=0x%lx " "data=0x%lx\n", (long)msi_addr, (long)msi_data)); @@ -123,7 +105,6 @@ apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, msi_ctrl |= ((highbit(count) -1) << PCI_MSI_MME_SHIFT); pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); -#if !defined(__xpv) /* * Only set vector if not on hypervisor */ @@ -154,13 +135,9 @@ apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), msi_data); ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)(off + PCI_MSIX_LOWER_ADDR_OFFSET), msi_addr); -#endif /* ! __xpv */ } } - -#if !defined(__xpv) - /* * This function returns the no. of vectors available for the pri. * dip is not used at this moment. If we really don't need that, @@ -198,8 +175,6 @@ apic_navail_vector(dev_info_t *dip, int pri) return (navail); } -#endif /* ! __xpv */ - /* * Finds "count" contiguous MSI vectors starting at the proper alignment * at "pri". @@ -288,9 +263,6 @@ apic_find_irq(dev_info_t *dip, struct intrspec *ispec, int type) return (NULL); } - -#if !defined(__xpv) - /* * This function will return the pending bit of the irqp. * It either comes from the IRR register of the APIC or the RDT @@ -424,117 +396,6 @@ apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type) } } -#endif /* ! __xpv */ - -/* - * check whether the system supports MSI - * - * If PCI-E capability is found, then this must be a PCI-E system. - * Since MSI is required for PCI-E system, it returns PSM_SUCCESS - * to indicate this system supports MSI. - */ -int -apic_check_msi_support() -{ - dev_info_t *cdip; - char dev_type[16]; - int dev_len; - - DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n")); - - /* - * check whether the first level children of root_node have - * PCI-E capability - */ - for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL; - cdip = ddi_get_next_sibling(cdip)) { - - DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p," - " driver: %s, binding: %s, nodename: %s\n", (void *)cdip, - ddi_driver_name(cdip), ddi_binding_name(cdip), - ddi_node_name(cdip))); - dev_len = sizeof (dev_type); - if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS, - "device_type", (caddr_t)dev_type, &dev_len) - != DDI_PROP_SUCCESS) - continue; - if (strcmp(dev_type, "pciex") == 0) - return (PSM_SUCCESS); - } - - /* MSI is not supported on this system */ - DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' " - "device_type found\n")); - return (PSM_FAILURE); -} - -#if !defined(__xpv) - -/* - * apic_pci_msi_unconfigure: - * - * This and next two interfaces are copied from pci_intr_lib.c - * Do ensure that these two files stay in sync. - * These needed to be copied over here to avoid a deadlock situation on - * certain mp systems that use MSI interrupts. - * - * IMPORTANT regards next three interfaces: - * i) are called only for MSI/X interrupts. 
- * ii) called with interrupts disabled, and must not block - */ -void -apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum) -{ - ushort_t msi_ctrl; - int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); - ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); - - ASSERT((handle != NULL) && (cap_ptr != 0)); - - if (type == DDI_INTR_TYPE_MSI) { - msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); - msi_ctrl &= (~PCI_MSI_MME_MASK); - pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); - pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0); - - if (msi_ctrl & PCI_MSI_64BIT_MASK) { - pci_config_put16(handle, - cap_ptr + PCI_MSI_64BIT_DATA, 0); - pci_config_put32(handle, - cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0); - } else { - pci_config_put16(handle, - cap_ptr + PCI_MSI_32BIT_DATA, 0); - } - - } else if (type == DDI_INTR_TYPE_MSIX) { - uintptr_t off; - uint32_t mask; - ddi_intr_msix_t *msix_p = i_ddi_get_msix(rdip); - - ASSERT(msix_p != NULL); - - /* Offset into "inum"th entry in the MSI-X table & mask it */ - off = (uintptr_t)msix_p->msix_tbl_addr + (inum * - PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; - - mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); - - ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1)); - - /* Offset into the "inum"th entry in the MSI-X table */ - off = (uintptr_t)msix_p->msix_tbl_addr + - (inum * PCI_MSIX_VECTOR_SIZE); - - /* Reset the "data" and "addr" bits */ - ddi_put32(msix_p->msix_tbl_hdl, - (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0); - ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0); - } -} - -#endif /* __xpv */ - /* * apic_pci_msi_enable_mode: */ @@ -582,38 +443,6 @@ apic_pci_msi_enable_mode(dev_info_t *rdip, int type, int inum) } } -/* - * apic_pci_msi_disable_mode: - */ -void -apic_pci_msi_disable_mode(dev_info_t *rdip, int type) -{ - ushort_t msi_ctrl; - int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); - ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); - - ASSERT((handle != NULL) && (cap_ptr != 0)); - - if (type == DDI_INTR_TYPE_MSI) { - msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); - if (!(msi_ctrl & PCI_MSI_ENABLE_BIT)) - return; - - msi_ctrl &= ~PCI_MSI_ENABLE_BIT; /* MSI disable */ - pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); - - } else if (type == DDI_INTR_TYPE_MSIX) { - msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); - if (msi_ctrl & PCI_MSIX_ENABLE_BIT) { - msi_ctrl &= ~PCI_MSIX_ENABLE_BIT; - pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL, - msi_ctrl); - } - } -} - -#if !defined(__xpv) - static int apic_set_cpu(int irqno, int cpu, int *result) { @@ -800,21 +629,6 @@ set_grp_intr_done: return (PSM_SUCCESS); } -#else /* __xpv */ - -/* - * We let the hypervisor deal with msi configutation - * so just stub this out. - */ - -/* ARGSUSED */ -void -apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum) -{ -} - -#endif /* __xpv */ - int apic_get_vector_intr_info(int vecirq, apic_get_intr_t *intr_params_p) { @@ -915,9 +729,6 @@ apic_get_vector_intr_info(int vecirq, apic_get_intr_t *intr_params_p) return (PSM_SUCCESS); } - -#if !defined(__xpv) - /* * This function provides external interface to the nexus for all * functionalities related to the new DDI interrupt framework. 
@@ -1093,7 +904,7 @@ apic_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, *result = EINVAL; return (PSM_FAILURE); } - if (!(hdlp->ih_flags & PSMGI_INTRBY_IRQ)) + if ((hdlp->ih_flags & PSMGI_INTRBY_FLAGS) == PSMGI_INTRBY_VEC) hdlp->ih_vector = apic_vector_to_irq[hdlp->ih_vector]; if (intr_op == PSM_INTR_OP_SET_CPU) { if (apic_set_cpu(hdlp->ih_vector, new_cpu, result) != @@ -1116,7 +927,12 @@ apic_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, return (PSM_FAILURE); break; case PSM_INTR_OP_APIC_TYPE: - hdlp->ih_private = apic_get_apic_type(); + ((apic_get_type_t *)(hdlp->ih_private))->avgi_type = + apic_get_apic_type(); + ((apic_get_type_t *)(hdlp->ih_private))->avgi_num_intr = + APIC_MAX_VECTOR; + ((apic_get_type_t *)(hdlp->ih_private))->avgi_num_cpu = + boot_ncpus; hdlp->ih_ver = apic_get_apic_version(); break; case PSM_INTR_OP_SET_CAP: @@ -1125,4 +941,3 @@ apic_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, } return (PSM_SUCCESS); } -#endif /* !__xpv */ diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c index bd0de9346b..7a3dd8a733 100644 --- a/usr/src/uts/i86pc/io/psm/psm_common.c +++ b/usr/src/uts/i86pc/io/psm/psm_common.c @@ -102,7 +102,7 @@ extern int goany(void); #define NEXT_PRT_ITEM(p) \ - (ACPI_PCI_ROUTING_TABLE *)(((char *)(p)) + (p)->Length) + (void *)(((char *)(p)) + (p)->Length) static int acpi_get_gsiv(dev_info_t *dip, ACPI_HANDLE pciobj, int devno, int ipin, @@ -384,6 +384,7 @@ psm_node_has_prt(ACPI_HANDLE *ah) return (AcpiGetHandle(ah, "_PRT", &rh) == AE_OK); } + /* * Look first for an ACPI PCI bus node matching busid, then for a _PRT on the * parent node; then drop into the bridge-chasing code (which will also @@ -512,7 +513,7 @@ acpi_set_irq_resource(acpi_psm_lnk_t *acpipsmlnkp, int irq) switch (srsp->Type) { case ACPI_RESOURCE_TYPE_IRQ: srsp->Data.Irq.InterruptCount = 1; - srsp->Data.Irq.Interrupts[0] = irq; + srsp->Data.Irq.Interrupts[0] = (uint8_t)irq; break; case ACPI_RESOURCE_TYPE_EXTENDED_IRQ: srsp->Data.ExtendedIrq.InterruptCount = 1; @@ -855,7 +856,7 @@ acpi_new_irq_cache_ent(int bus, int dev, int ipin, int pci_irq, ep->dev = (uchar_t)dev; ep->ipin = (uchar_t)ipin; ep->flags = *intr_flagp; - ep->irq = pci_irq; + ep->irq = (uchar_t)pci_irq; ASSERT(acpipsmlnkp != NULL); ep->lnkobj = acpipsmlnkp->lnkobj; mutex_exit(&acpi_irq_cache_mutex); diff --git a/usr/src/uts/i86pc/io/rootnex.c b/usr/src/uts/i86pc/io/rootnex.c index 68c6122d07..8416281fee 100644 --- a/usr/src/uts/i86pc/io/rootnex.c +++ b/usr/src/uts/i86pc/io/rootnex.c @@ -60,6 +60,7 @@ #include <vm/hat_i86.h> #include <sys/ddifm.h> #include <sys/ddi_isa.h> +#include <sys/apic.h> #ifdef __xpv #include <sys/bootinfo.h> @@ -157,7 +158,7 @@ typedef paddr_t rootnex_addr_t; #endif #if !defined(__xpv) -char _depends_on[] = "mach/pcplusmp misc/iommulib misc/acpica"; +char _depends_on[] = "misc/iommulib misc/acpica"; #endif static struct cb_ops rootnex_cb_ops = { @@ -212,6 +213,9 @@ static int rootnex_fm_init(dev_info_t *dip, dev_info_t *tdip, int tcap, ddi_iblock_cookie_t *ibc); static int rootnex_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, ddi_intr_handle_impl_t *hdlp, void *result); +static int rootnex_alloc_intr_fixed(dev_info_t *, ddi_intr_handle_impl_t *, + void *); +static int rootnex_free_intr_fixed(dev_info_t *, ddi_intr_handle_impl_t *); static int rootnex_coredma_allochdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr, int (*waitfp)(caddr_t), caddr_t arg, @@ -1361,7 +1365,6 @@ rootnex_intr_ops(dev_info_t *pdip, 
dev_info_t *rdip, ddi_intr_op_t intr_op, ddi_intr_handle_impl_t *hdlp, void *result) { struct intrspec *ispec; - struct ddi_parent_private_data *pdp; DDI_INTR_NEXDBG((CE_CONT, "rootnex_intr_ops: pdip = %p, rdip = %p, intr_op = %x, hdlp = %p\n", @@ -1387,30 +1390,11 @@ rootnex_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, return (DDI_FAILURE); break; case DDI_INTROP_ALLOC: - if ((ispec = rootnex_get_ispec(rdip, hdlp->ih_inum)) == NULL) - return (DDI_FAILURE); - hdlp->ih_pri = ispec->intrspec_pri; - *(int *)result = hdlp->ih_scratch1; - break; + ASSERT(hdlp->ih_type == DDI_INTR_TYPE_FIXED); + return (rootnex_alloc_intr_fixed(rdip, hdlp, result)); case DDI_INTROP_FREE: - pdp = ddi_get_parent_data(rdip); - /* - * Special case for 'pcic' driver' only. - * If an intrspec was created for it, clean it up here - * See detailed comments on this in the function - * rootnex_get_ispec(). - */ - if (pdp->par_intr && strcmp(ddi_get_name(rdip), "pcic") == 0) { - kmem_free(pdp->par_intr, sizeof (struct intrspec) * - pdp->par_nintr); - /* - * Set it to zero; so that - * DDI framework doesn't free it again - */ - pdp->par_intr = NULL; - pdp->par_nintr = 0; - } - break; + ASSERT(hdlp->ih_type == DDI_INTR_TYPE_FIXED); + return (rootnex_free_intr_fixed(rdip, hdlp)); case DDI_INTROP_GETPRI: if ((ispec = rootnex_get_ispec(rdip, hdlp->ih_inum)) == NULL) return (DDI_FAILURE); @@ -1590,6 +1574,117 @@ rootnex_get_ispec(dev_info_t *rdip, int inum) return ((struct intrspec *)&pdp->par_intr[inum]); } +/* + * Allocate interrupt vector for FIXED (legacy) type. + */ +static int +rootnex_alloc_intr_fixed(dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp, + void *result) +{ + struct intrspec *ispec; + ddi_intr_handle_impl_t info_hdl; + int ret; + int free_phdl = 0; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + if ((ispec = rootnex_get_ispec(rdip, hdlp->ih_inum)) == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to allocate the vector now. + */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if (hdlp->ih_private == NULL) { /* allocate phdl structure */ + free_phdl = 1; + i_ddi_alloc_intr_phdl(hdlp); + } + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_ALLOC_VECTORS, result); + if (free_phdl) { /* free up the phdl structure */ + free_phdl = 0; + i_ddi_free_intr_phdl(hdlp); + hdlp->ih_private = NULL; + } + } else { + /* + * No APIX module; fall back to the old scheme where the + * interrupt vector is allocated during ddi_enable_intr() call. + */ + hdlp->ih_pri = ispec->intrspec_pri; + *(int *)result = hdlp->ih_scratch1; + ret = DDI_SUCCESS; + } + + return (ret); +} + +/* + * Free up interrupt vector for FIXED (legacy) type. + */ +static int +rootnex_free_intr_fixed(dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp) +{ + struct intrspec *ispec; + struct ddi_parent_private_data *pdp; + ddi_intr_handle_impl_t info_hdl; + int ret; + apic_get_type_t type_info; + + if (psm_intr_ops == NULL) + return (DDI_FAILURE); + + /* + * If the PSM module is "APIX" then pass the request for it + * to free up the vector now. 
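+ * (Editor's note, illustrative: the PSM type is discovered with the
+ * same PSM_INTR_OP_APIC_TYPE query used by rootnex_alloc_intr_fixed()
+ * above, comparing avgi_type against APIC_APIX_NAME.)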
+ */ + bzero(&info_hdl, sizeof (ddi_intr_handle_impl_t)); + info_hdl.ih_private = &type_info; + if ((*psm_intr_ops)(NULL, &info_hdl, PSM_INTR_OP_APIC_TYPE, NULL) == + PSM_SUCCESS && strcmp(type_info.avgi_type, APIC_APIX_NAME) == 0) { + if ((ispec = rootnex_get_ispec(rdip, hdlp->ih_inum)) == NULL) + return (DDI_FAILURE); + ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp = ispec; + ret = (*psm_intr_ops)(rdip, hdlp, + PSM_INTR_OP_FREE_VECTORS, NULL); + } else { + /* + * No APIX module; fall back to the old scheme where + * the interrupt vector was already freed during + * ddi_disable_intr() call. + */ + ret = DDI_SUCCESS; + } + + pdp = ddi_get_parent_data(rdip); + + /* + * Special case for 'pcic' driver' only. + * If an intrspec was created for it, clean it up here + * See detailed comments on this in the function + * rootnex_get_ispec(). + */ + if (pdp->par_intr && strcmp(ddi_get_name(rdip), "pcic") == 0) { + kmem_free(pdp->par_intr, sizeof (struct intrspec) * + pdp->par_nintr); + /* + * Set it to zero; so that + * DDI framework doesn't free it again + */ + pdp->par_intr = NULL; + pdp->par_nintr = 0; + } + + return (ret); +} + /* * ****************** diff --git a/usr/src/uts/i86pc/ml/interrupt.s b/usr/src/uts/i86pc/ml/interrupt.s index 0a95adf429..97f5acba2d 100644 --- a/usr/src/uts/i86pc/ml/interrupt.s +++ b/usr/src/uts/i86pc/ml/interrupt.s @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -30,8 +29,6 @@ /* Copyright (c) 1987, 1988 Microsoft Corporation */ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/asm_linkage.h> #include <sys/asm_misc.h> #include <sys/regset.h> @@ -102,7 +99,7 @@ _interrupt(void) #endif movq %rsp, %rdi /* pass struct regs pointer */ - call do_interrupt + call *do_interrupt_common jmp _sys_rtt_ints_disabled /*NOTREACHED*/ @@ -131,7 +128,7 @@ _interrupt(void) pushl %esi /* pass traptrace record pointer */ pushl %ebp /* pass struct regs pointer */ - call do_interrupt /* interrupt service routine */ + call *do_interrupt_common /* interrupt service routine */ addl $8, %esp /* pop args off of stack */ jmp _sys_rtt_ints_disabled diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index fc0ef9e260..faaecd20b6 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #include <sys/cpuvar.h> @@ -83,6 +82,11 @@ ulong_t lastcli[NCPU]; #endif +void do_interrupt(struct regs *rp, trap_trace_rec_t *ttp); + +void (*do_interrupt_common)(struct regs *, trap_trace_rec_t *) = do_interrupt; +uintptr_t (*get_intr_handler)(int, short) = NULL; + /* * Set cpu's base SPL level to the highest active interrupt level */ @@ -1103,6 +1107,11 @@ send_dirint(int cpuid, int int_level) (*send_dirintf)(cpuid, int_level); } +#define IS_FAKE_SOFTINT(flag, newpri) \ + (((flag) & PS_IE) && \ + (((*get_pending_spl)() > (newpri)) || \ + bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > (newpri))) + /* * do_splx routine, takes new ipl to set * returns the old ipl. @@ -1130,8 +1139,7 @@ do_splx(int newpri) * If we are going to reenable interrupts see if new priority level * allows pending softint delivery. 
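The interrupt.s hunks above turn the hard-wired `call do_interrupt` into an indirect call through do_interrupt_common, which intr.c initializes to the legacy do_interrupt(). A hedged sketch of the expected hand-off — the init routine here is hypothetical; apix_do_interrupt() itself is declared later in this patch, in apix.h:

extern void apix_do_interrupt(struct regs *, trap_trace_rec_t *);

static void
apix_hook_intr_path(void)
{
	/* _interrupt() now dispatches through this pointer */
	do_interrupt_common = apix_do_interrupt;
}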
*/ - if ((flag & PS_IE) && - bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri) + if (IS_FAKE_SOFTINT(flag, newpri)) fakesoftint(); ASSERT(!interrupts_enabled()); intr_restore(flag); @@ -1164,8 +1172,7 @@ splr(int newpri) /* * See if new priority level allows pending softint delivery */ - if ((flag & PS_IE) && - bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri) + if (IS_FAKE_SOFTINT(flag, newpri)) fakesoftint(); } intr_restore(flag); diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index e9d08a9fa7..38c9f7159f 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -141,6 +140,13 @@ extern void audit_enterprom(int); extern void audit_exitprom(int); /* + * Tunable to enable apix PSM; if set to 0, pcplusmp PSM will be used. + */ +int apix_enable = 1; + +int apic_nvidia_io_max = 0; /* no. of NVIDIA i/o apics */ + +/* * Occassionally the kernel knows better whether to power-off or reboot. */ int force_shutdown_method = AD_UNKNOWN; diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c index cf43c302b0..9f9c3aae4a 100644 --- a/usr/src/uts/i86pc/os/mp_machdep.c +++ b/usr/src/uts/i86pc/os/mp_machdep.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2009-2010, Intel Corporation. @@ -123,6 +123,10 @@ void (*send_dirintf)() = return_instr; void (*setspl)(int) = (void (*)(int))return_instr; int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr; int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr; +int (*get_pending_spl)(void) = (int (*)(void))return_instr; +int (*addintr)(void *, int, avfunc, char *, int, caddr_t, caddr_t, + uint64_t *, dev_info_t *) = NULL; +void (*remintr)(void *, int, avfunc, int) = NULL; void (*kdisetsoftint)(int, struct av_softinfo *)= (void (*)(int, struct av_softinfo *))return_instr; void (*setsoftint)(int, struct av_softinfo *)= @@ -141,6 +145,9 @@ void (*gethrestimef)(timestruc_t *) = pc_gethrestime; void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL; int (*psm_get_clockirq)(int) = NULL; int (*psm_get_ipivect)(int, int) = NULL; +uchar_t (*psm_get_ioapicid)(uchar_t) = NULL; +uint32_t (*psm_get_localapicid)(uint32_t) = NULL; +uchar_t (*psm_xlate_vector_by_irq)(uchar_t) = NULL; int (*psm_clkinit)(int) = NULL; void (*psm_timer_reprogram)(hrtime_t) = NULL; @@ -158,6 +165,9 @@ void (*hrtime_tick)(void) = return_instr; int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **) = mach_cpu_create_devinfo; int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **) = NULL; +/* global IRM pool for APIX (PSM) module */ +ddi_irm_pool_t *apix_irm_pool_p = NULL; + /* * True if the generic TSC code is our source of hrtime, rather than whatever * the PSM can provide. diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index c679eb2626..a004b73055 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. 
All rights reserved. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -2132,7 +2131,14 @@ dump_ttrace(void) case TT_INTERRUPT: printf(fmt2, "intr", rec->ttr_vector); - vec = (&autovect[rec->ttr_vector])->avh_link; + if (get_intr_handler != NULL) + vec = (struct autovec *) + (*get_intr_handler) + (rec->ttr_cpuid, rec->ttr_vector); + else + vec = + autovect[rec->ttr_vector].avh_link; + if (vec != NULL) { sym = kobj_getsymname( (uintptr_t)vec->av_vector, &off); diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index ef57e65249..1f177556ea 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -36,6 +36,7 @@ extern "C" { #include <sys/psm_common.h> #define APIC_PCPLUSMP_NAME "pcplusmp" +#define APIC_APIX_NAME "apix" #define APIC_IO_ADDR 0xfec00000 #define APIC_LOCAL_ADDR 0xfee00000 @@ -525,10 +526,11 @@ typedef struct apic_cpus_info { } apic_cpus_info_t; #pragma pack() -#define APIC_CPU_ONLINE 1 -#define APIC_CPU_INTR_ENABLE 2 -#define APIC_CPU_FREE 4 /* APIC CPU slot is free */ -#define APIC_CPU_DIRTY 8 /* Slot was once used */ +#define APIC_CPU_ONLINE 0x1 +#define APIC_CPU_INTR_ENABLE 0x2 +#define APIC_CPU_FREE 0x4 /* APIC CPU slot is free */ +#define APIC_CPU_DIRTY 0x8 /* Slot was once used */ +#define APIC_CPU_SUSPEND 0x10 /* * APIC ops to support various flavors of APIC like APIC and x2APIC. @@ -561,11 +563,12 @@ typedef struct msi_regs { typedef struct apic_intrmap_ops { int (*apic_intrmap_init)(int); void (*apic_intrmap_enable)(int); - void (*apic_intrmap_alloc_entry)(apic_irq_t *); - void (*apic_intrmap_map_entry)(apic_irq_t *, void *); - void (*apic_intrmap_free_entry)(apic_irq_t *); - void (*apic_intrmap_record_rdt)(apic_irq_t *, ioapic_rdt_t *); - void (*apic_intrmap_record_msi)(apic_irq_t *, msi_regs_t *); + void (*apic_intrmap_alloc_entry)(void **, dev_info_t *, uint16_t, + int, uchar_t); + void (*apic_intrmap_map_entry)(void *, void *, uint16_t, int); + void (*apic_intrmap_free_entry)(void **); + void (*apic_intrmap_record_rdt)(void *, ioapic_rdt_t *); + void (*apic_intrmap_record_msi)(void *, msi_regs_t *); } apic_intrmap_ops_t; /* @@ -642,6 +645,13 @@ typedef struct { /* Contains num_devs elements. */ } apic_get_intr_t; +/* Used by PSM_INTR_OP_GET_TYPE to return platform information. */ +typedef struct { + char *avgi_type; /* platform type - from kernel */ + uint32_t avgi_num_intr; /* max intr number - from kernel */ + uint32_t avgi_num_cpu; /* max cpu number - from kernel */ +} apic_get_type_t; + /* Masks for avgi_req_flags. */ #define PSMGI_REQ_CPUID 0x1 /* Request CPU ID */ #define PSMGI_REQ_NUM_DEVS 0x2 /* Request num of devices on vector */ @@ -652,7 +662,8 @@ typedef struct { /* Other flags */ #define PSMGI_INTRBY_VEC 0 /* Vec passed. xlate to IRQ needed */ #define PSMGI_INTRBY_IRQ 0x8000 /* IRQ passed. no xlate needed */ -#define PSMGI_INTRBY_FLAGS 0x8000 /* Mask for this flag */ +#define PSMGI_INTRBY_DEFAULT 0x4000 /* PSM specific default value */ +#define PSMGI_INTRBY_FLAGS 0xc000 /* Mask for this flag */ /* * Use scaled-fixed-point arithmetic to calculate apic ticks. 
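The widened PSMGI_INTRBY_FLAGS mask (0xc000) is what motivates the earlier apic_intr_ops() change from `!(ih_flags & PSMGI_INTRBY_IRQ)` to an explicit comparison against PSMGI_INTRBY_VEC: once PSMGI_INTRBY_DEFAULT exists, the old single-bit test would mistranslate a default-flagged handle. A sketch of the decoding pattern (control flow illustrative):

uint_t intrby = hdlp->ih_flags & PSMGI_INTRBY_FLAGS;

if (intrby == PSMGI_INTRBY_VEC) {
	/* caller passed a vector; translate it to an IRQ */
	hdlp->ih_vector = apic_vector_to_irq[hdlp->ih_vector];
} else if (intrby == PSMGI_INTRBY_IRQ) {
	/* caller already passed an IRQ; no translation needed */
}
/* else PSMGI_INTRBY_DEFAULT: the PSM applies its own default */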
@@ -674,25 +685,19 @@ extern int apic_verbose; #define APIC_VERBOSE_IRQ_FLAG 0x00000002 #define APIC_VERBOSE_POWEROFF_FLAG 0x00000004 #define APIC_VERBOSE_POWEROFF_PAUSE_FLAG 0x00000008 +#define APIC_VERBOSE_INIT 0x00000010 +#define APIC_VERBOSE_REBIND 0x00000020 +#define APIC_VERBOSE_ALLOC 0x00000040 +#define APIC_VERBOSE_IPI 0x00000080 +#define APIC_VERBOSE_INTR 0x00000100 - -#define APIC_VERBOSE_IOAPIC(fmt) \ - if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) \ - cmn_err fmt; - +/* required test to wait until APIC command is sent on the bus */ #define APIC_AV_PENDING_SET() \ while (apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING) \ apic_ret(); -#define APIC_VERBOSE_IRQ(fmt) \ - if (apic_verbose & APIC_VERBOSE_IRQ_FLAG) \ - cmn_err fmt; +#ifdef DEBUG -#define APIC_VERBOSE_POWEROFF(fmt) \ - if (apic_verbose & APIC_VERBOSE_POWEROFF_FLAG) \ - prom_printf fmt; - -#ifdef DEBUG #define DENT 0x0001 extern int apic_debug; /* @@ -714,7 +719,23 @@ extern int apic_debug_msgbufindex; if (apic_debug_msgbufindex >= (APIC_DEBUG_MSGBUFSIZE - NCPU)) \ apic_debug_msgbufindex = 0; -#endif /* DEBUG */ +#define APIC_VERBOSE(flag, fmt) \ + if (apic_verbose & APIC_VERBOSE_##flag) \ + cmn_err fmt; + +#define APIC_VERBOSE_POWEROFF(fmt) \ + if (apic_verbose & APIC_VERBOSE_POWEROFF_FLAG) \ + prom_printf fmt; + +#else /* DEBUG */ + +#define APIC_VERBOSE(flag, fmt) +#define APIC_VERBOSE_POWEROFF(fmt) + +#endif /* DEBUG */ + +#define APIC_VERBOSE_IOAPIC(fmt) APIC_VERBOSE(IOAPIC_FLAG, fmt) +#define APIC_VERBOSE_IRQ(fmt) APIC_VERBOSE(IRQ_FLAG, fmt) extern int apic_error; /* values which apic_error can take. Not catastrophic, but may help debug */ @@ -748,8 +769,6 @@ extern int apic_error; #define INTR_ROUND_ROBIN 1 #define INTR_LOWEST_PRIORITY 2 - - struct ioapic_reprogram_data { boolean_t done; apic_irq_t *irqp; @@ -856,6 +875,7 @@ extern int apic_next_bind_cpu; extern int apic_redistribute_sample_interval; extern int apic_multi_msi_enable; extern int apic_sci_vect; +extern int apic_hpet_vect; extern uchar_t apic_ipls[]; extern apic_reg_ops_t *apic_reg_ops; extern int apic_mode; diff --git a/usr/src/uts/i86pc/sys/apic_common.h b/usr/src/uts/i86pc/sys/apic_common.h new file mode 100644 index 0000000000..cd259d7f62 --- /dev/null +++ b/usr/src/uts/i86pc/sys/apic_common.h @@ -0,0 +1,205 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
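The macro consolidation in the apic.h hunk above replaces the per-category APIC_VERBOSE_* macros with one flag-parameterized APIC_VERBOSE() that token-pastes its first argument onto APIC_VERBOSE_, and compiles to nothing in non-DEBUG builds. A usage sketch (message text and variables illustrative):

/* emits via cmn_err() only when apic_verbose has APIC_VERBOSE_ALLOC set */
APIC_VERBOSE(ALLOC, (CE_CONT, "allocated vector 0x%x for cpu %d\n",
    vector, cpuid));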
+ */ + +#ifndef _SYS_APIC_COMMON_H +#define _SYS_APIC_COMMON_H + +#include <sys/psm_types.h> +#include <sys/avintr.h> +#include <sys/privregs.h> +#include <sys/pci.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Functions & Variables common to pcplusmp & apix + */ + +#include <sys/psm_common.h> + +/* Methods for multiple IOAPIC */ +enum apic_ioapic_method_type { + APIC_MUL_IOAPIC_NONE, /* use to disable pcplusmp fallback */ + APIC_MUL_IOAPIC_MASK, /* Set RT Entry Mask bit before EOI */ + APIC_MUL_IOAPIC_DEOI, /* Directed EOI */ + APIC_MUL_IOAPIC_IOXAPIC, /* IOxAPIC */ + APIC_MUL_IOAPIC_IIR, /* IOMMU interrup remapping */ + APIC_MUL_IOAPIC_PCPLUSMP /* Fall back to old pcplusmp */ +}; + +#define APIX_IS_DIRECTED_EOI(type) \ + ((type) == APIC_MUL_IOAPIC_DEOI || (type) == APIC_MUL_IOAPIC_IIR) +#define APIX_IS_MASK_RDT(type) \ + ((type) == APIC_MUL_IOAPIC_NONE || (type) == APIC_MUL_IOAPIC_MASK) + +extern int apix_enable; +extern int apix_loaded(void); +extern enum apic_ioapic_method_type apix_mul_ioapic_method; + +extern int apic_oneshot; +/* to allow disabling one-shot capability */ +extern int apic_oneshot_enable; + +/* Now the ones for Dynamic Interrupt distribution */ +extern int apic_enable_dynamic_migration; + +extern int apic_have_32bit_cr8; + +extern struct psm_ops *psmops; + +/* + * These variables are frequently accessed in apic_intr_enter(), + * apic_intr_exit and apic_setspl, so group them together + */ +extern volatile uint32_t *apicadr; /* virtual addr of local APIC */ +extern uchar_t apic_io_vectbase[MAX_IO_APIC]; +extern uchar_t apic_io_vectend[MAX_IO_APIC]; +extern uchar_t apic_io_ver[MAX_IO_APIC]; +extern int apic_io_max; +extern int apic_nvidia_io_max; +extern int apic_setspl_delay; /* apic_setspl - delay enable */ +extern int apic_clkvect; + +/* vector at which error interrupts come in */ +extern int apic_errvect; +extern int apic_enable_error_intr; +extern int apic_error_display_delay; + +/* vector at which performance counter overflow interrupts come in */ +extern int apic_cpcovf_vect; +extern int apic_enable_cpcovf_intr; + +/* vector at which CMCI interrupts come in */ +extern int apic_cmci_vect; +extern int cmi_enable_cmci; +extern void cmi_cmci_trap(void); + +extern kmutex_t cmci_cpu_setup_lock; /* protects cmci_cpu_setup_registered */ +extern int cmci_cpu_setup_registered; + +extern int apic_forceload; + +extern int apic_coarse_hrtime; /* 0 - use accurate slow gethrtime() */ + /* 1 - use gettime() for performance */ +extern int apic_flat_model; /* 0 - clustered. 
1 - flat */ + +extern int apic_panic_on_nmi; +extern int apic_panic_on_apic_error; + +extern int apic_verbose; + +/* minimum number of timer ticks to program to */ +extern int apic_min_timer_ticks; + +#ifdef DEBUG +extern int apic_debug; +extern int apic_restrict_vector; + +extern int apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE]; +extern int apic_debug_msgbufindex; + +#endif /* DEBUG */ + +extern uint_t apic_nsec_per_intr; +extern uint_t apic_nticks; +extern uint_t apic_skipped_redistribute; + +extern uint_t last_count_read; +extern lock_t apic_mode_switch_lock; +extern lock_t apic_gethrtime_lock; +extern volatile int apic_hrtime_stamp; +extern volatile hrtime_t apic_nsec_since_boot; +extern uint_t apic_hertz_count; + +extern uint64_t apic_ticks_per_SFnsecs; /* # of ticks in SF nsecs */ + +extern int apic_hrtime_error; +extern int apic_remote_hrterr; +extern int apic_num_nmis; +extern int apic_apic_error; +extern int apic_num_apic_errors; +extern int apic_num_cksum_errors; + +extern int apic_error; + +/* use to make sure only one cpu handles the nmi */ +extern lock_t apic_nmi_lock; +/* use to make sure only one cpu handles the error interrupt */ +extern lock_t apic_error_lock; + +/* Patchable global variables. */ +extern int apic_kmdb_on_nmi; /* 0 - no, 1 - yes enter kmdb */ +extern uint32_t apic_divide_reg_init; /* 0 - divide by 2 */ + +extern apic_intrmap_ops_t *apic_vt_ops; + +#ifdef DEBUG +extern int apic_break_on_cpu; +extern int apic_stretch_interrupts; +extern int apic_stretch_ISR; /* IPL of 3 matches nothing now */ +#endif + +extern ddi_periodic_t apic_periodic_id; + +extern void apic_nmi_intr(caddr_t arg, struct regs *rp); +extern int apic_clkinit(); +extern hrtime_t apic_gettime(); +extern hrtime_t apic_gethrtime(); +extern int apic_cpu_start(processorid_t cpuid, caddr_t ctx); +extern int apic_cpu_stop(processorid_t cpuid, caddr_t ctx); +extern int apic_cpu_add(psm_cpu_request_t *reqp); +extern int apic_cpu_remove(psm_cpu_request_t *reqp); +extern int apic_cpu_ops(psm_cpu_request_t *reqp); +extern void apic_switch_ipi_callback(boolean_t enter); +extern void apic_send_ipi(int cpun, int ipl); +extern void apic_set_idlecpu(processorid_t cpun); +extern void apic_unset_idlecpu(processorid_t cpun); +extern void apic_shutdown(int cmd, int fcn); +extern void apic_preshutdown(int cmd, int fcn); +extern processorid_t apic_get_next_processorid(processorid_t cpun); +extern void apic_timer_reprogram(hrtime_t time); +extern void apic_timer_enable(void); +extern void apic_timer_disable(void); + +extern int apic_error_intr(); +extern void apic_cpcovf_mask_clear(void); +extern int cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg); +extern void apic_intrmap_init(int apic_mode); +extern processorid_t apic_find_cpu(int flag); +extern processorid_t apic_get_next_bind_cpu(void); + +extern int apic_support_msi; +extern int apic_multi_msi_enable; +extern int apic_msix_enable; + +extern uint32_t apic_get_localapicid(uint32_t cpuid); +extern uchar_t apic_get_ioapicid(uchar_t ioapicindex); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_APIC_COMMON_H */ diff --git a/usr/src/uts/i86pc/sys/apix.h b/usr/src/uts/i86pc/sys/apix.h new file mode 100644 index 0000000000..3db39b4021 --- /dev/null +++ b/usr/src/uts/i86pc/sys/apix.h @@ -0,0 +1,352 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef __SYS_APIX_APIX_H +#define __SYS_APIX_APIX_H + +#include <sys/note.h> +#include <sys/avintr.h> +#include <sys/traptrace.h> +#include <sys/apic.h> +#include <sys/apic_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef DEBUG +#ifndef TRAPTRACE +#define TRAPTRACE +#endif +#endif + +#define APIX_NAME "apix" + +#define APIX_NVECTOR 256 /* max number of per-cpu vectors */ +#define APIX_NIRQ 256 /* maximum number of IRQs */ +#define APIX_INVALID_VECT 0 /* invalid vector */ + +/* vector type */ +#define APIX_TYPE_FIXED DDI_INTR_TYPE_FIXED /* 1 */ +#define APIX_TYPE_MSI DDI_INTR_TYPE_MSI /* 2 */ +#define APIX_TYPE_MSIX DDI_INTR_TYPE_MSIX /* 4 */ +#define APIX_TYPE_IPI 8 + +/* vector states */ +enum { + APIX_STATE_FREED = 0, + APIX_STATE_OBSOLETED, /* 1 */ + APIX_STATE_ALLOCED, /* 2 */ + APIX_STATE_ENABLED, /* 3 */ + APIX_STATE_DISABLED /* 4 */ +}; +#define IS_VECT_FREE(p) \ + (((p) == NULL) || ((p)->v_state == APIX_STATE_FREED)) +#define IS_VECT_OBSOL(p) \ + (((p) != NULL) && ((p)->v_state == APIX_STATE_OBSOLETED)) +#define IS_VECT_ENABLED(p) \ + (((p) != NULL) && ((p)->v_state == APIX_STATE_ENABLED)) + +/* flags */ +#define APIX_VECT_USER_BOUND 0x1 +#define APIX_VECT_MASKABLE 0x2 + +/* + * Number of interrupt vectors reserved by software on each LOCAL APIC: + * 1. Dtrace + * 2. int80 + * 3. system-call + * 4. fast-trap + * 5. 
apix-reserved + */ +#define APIX_SW_RESERVED_VECTORS 5 + +/* + * Macros to help deal with shared interrupts and to differentiate + * between vector and irq number when passing arguments to interfaces + * xxx_avintr() + */ +#define APIX_VIRTVEC_VECMASK 0xff +#define APIX_VIRTVEC_FLAG 0x80000000 +#define APIX_VIRTVECTOR(cpuid, v) \ + (APIX_VIRTVEC_FLAG | ((cpuid) << 8) | (v)) +#define APIX_IS_VIRTVEC(vv) \ + ((vv) & APIX_VIRTVEC_FLAG) +#define APIX_VIRTVEC_VECTOR(vv) \ + (((uchar_t)(vv)) & APIX_VIRTVEC_VECMASK) +#define APIX_VIRTVEC_CPU(vv) \ + (((uint32_t)(vv) & ~APIX_VIRTVEC_FLAG) >> 8) + +struct apix_dev_vector; +typedef struct apix_vector { + ushort_t v_state; + ushort_t v_type; /* interrupt type */ + processorid_t v_cpuid; /* current target cpu */ + uchar_t v_vector; /* vector */ + uchar_t v_share; /* intrs at this vector */ + int v_inum; /* irq for fixed, inum for msi/x */ + uint_t v_flags; + processorid_t v_bound_cpuid; /* binding cpu */ + uint_t v_busy; /* How frequently did clock */ + /* find us in this */ + uint_t v_pri; /* maximum priority */ + struct autovec *v_autovect; /* ISR linked list */ + void *v_intrmap_private; /* intr remap data */ + struct apix_dev_vector *v_devp; /* pointer to device */ + struct apix_vector *v_next; /* next on per-cpu obosoletes chain */ +} apix_vector_t; + +typedef struct apix_impl { + processorid_t x_cpuid; /* cpu number */ + + uint16_t x_intr_pending; /* pending intr by IPL */ + /* pointer to head of interrupt pending list */ + struct autovec *x_intr_head[PIL_MAX + 1]; + /* pointer to tail of interrupt pending list */ + struct autovec *x_intr_tail[PIL_MAX + 1]; + + apix_vector_t *x_obsoletes; /* obosoleted vectors */ + apix_vector_t *x_vectbl[APIX_NVECTOR]; /* vector table */ + + lock_t x_lock; +} apix_impl_t; + +#define HILEVEL_PENDING(cpu) \ + (apixs[(cpu)->cpu_id]->x_intr_pending & CPU_INTR_ACTV_HIGH_LEVEL_MASK) +#define LOWLEVEL_PENDING(cpu) \ + (apixs[(cpu)->cpu_id]->x_intr_pending & ~CPU_INTR_ACTV_HIGH_LEVEL_MASK) +#define IS_HILEVEL_RUNNING(cpu) \ + (((ushort_t)((cpu)->intr_actv)) & CPU_INTR_ACTV_HIGH_LEVEL_MASK) +#define IS_LOWLEVEL_RUNNING(cpu) \ + (((ushort_t)((cpu)->intr_actv)) & ~CPU_INTR_ACTV_HIGH_LEVEL_MASK) + +#define INTR_PENDING(apixp, ipl) \ + ((ipl) <= LOCK_LEVEL ? \ + ((apixp)->x_intr_pending & (1 << (ipl))) : \ + ((apixp)->x_intr_pending >> (LOCK_LEVEL + 1))) + +/* + * We need a way to find allocated vector for a device. One option + * is to maintain a mapping table in pcplusmp. Another option would + * be to record vector or irq with interrupt handler hdlp->ih_vector or + * hdlp->ih_irq. + * Second option requires interface changes, such as, a new interface + * for noticing vector changes caused by interrupt re-targeting. + * Currently we choose the first option cause it doesn't require + * new interfaces. 
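The APIX_VIRTVEC_* macros above pack a CPU id and a per-CPU vector into a single value whose high bit marks it as virtual, so the xxx_avintr() interfaces can tell it apart from a plain IRQ number. A worked example with illustrative values:

uint32_t vv = APIX_VIRTVECTOR(3, 0x42);		/* 0x80000342 */

ASSERT(APIX_IS_VIRTVEC(vv));			/* high bit set */
ASSERT(APIX_VIRTVEC_CPU(vv) == 3);		/* bits 8 and up */
ASSERT(APIX_VIRTVEC_VECTOR(vv) == 0x42);	/* low 8 bits */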
+ */ +typedef struct apix_dev_vector { + dev_info_t *dv_dip; + int dv_inum; /* interrupt number */ + int dv_type; /* interrupt type */ + apix_vector_t *dv_vector; /* vector */ + struct apix_dev_vector *dv_next; /* per major chain */ +} apix_dev_vector_t; + +extern lock_t apix_lock; +extern apix_impl_t *apixs[]; +extern int apix_nipis; +extern int apix_cpu_nvectors; +extern apix_dev_vector_t **apix_dev_vector; +extern processorid_t *apix_major_to_cpu; +extern kmutex_t apix_mutex; + +#define xv_vector(cpu, v) apixs[(cpu)]->x_vectbl[(v)] +#define xv_intrmap_private(cpu, v) (xv_vector(cpu, v))->v_intrmap_private + +#define APIX_IPI_MAX APIC_MAX_VECTOR +#define APIX_IPI_MIN (APIX_NVECTOR - apix_nipis) +#define APIX_AVINTR_MIN 0x20 +#define APIX_NAVINTR \ + (apix_cpu_nvectors - apix_nipis - APIX_AVINTR_MIN) +#define APIX_AVINTR_MAX \ + ((APIX_NAVINTR <= 0) ? 0 : \ + (((APIX_AVINTR_MIN + APIX_NAVINTR) > APIX_IPI_MIN) ? \ + (APIX_IPI_MIN - 2) : \ + (APIX_AVINTR_MIN + APIX_NAVINTR - 2))) +#define APIX_RESV_VECTOR (APIX_AVINTR_MAX + 1) + +#define IS_VALID_AVINTR(v) \ + ((v) >= APIX_AVINTR_MIN && (v) <= APIX_AVINTR_MAX) + +#define APIX_ENTER_CPU_LOCK(cpuid) lock_set(&apixs[(cpuid)]->x_lock) +#define APIX_LEAVE_CPU_LOCK(cpuid) lock_clear(&apixs[(cpuid)]->x_lock) +#define APIX_CPU_LOCK_HELD(cpuid) LOCK_HELD(&apixs[(cpuid)]->x_lock) + +/* Get dip for msi/x */ +#define APIX_GET_DIP(v) \ + ((v)->v_devp->dv_dip) + +/* + * For irq + */ +extern apic_irq_t *apic_irq_table[APIC_MAX_VECTOR+1]; +#define IS_IRQ_FREE(p) \ + ((p) == NULL || ((p)->airq_mps_intr_index == FREE_INDEX)) + +#define UNREFERENCED_1PARAMETER(_p) _NOTE(ARGUNUSED(_p)) +#define UNREFERENCED_3PARAMETER(_p, _q, _r) _NOTE(ARGUNUSED(_p, _q, _r)) + +/* + * From mp_platform_common.c + */ +extern int apic_intr_policy; +extern iflag_t apic_sci_flags; +extern int apic_hpet_vect; +extern iflag_t apic_hpet_flags; +extern int apic_redist_cpu_skip; +extern int apic_num_imbalance; +extern int apic_num_rebind; +extern struct apic_io_intr *apic_io_intrp; +extern int apic_use_acpi_madt_only; +extern uint32_t eisa_level_intr_mask; +extern int apic_pci_bus_total; +extern uchar_t apic_single_pci_busid; + +extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; +extern int acpi_iso_cnt; + +extern int apic_defconf; +extern int apic_irq_translate; + +extern int apic_max_reps_clear_pending; + +extern int apic_probe_common(char *modname); +extern uchar_t acpi_find_ioapic(int irq); +extern int apic_find_bus_id(int bustype); +extern int apic_find_intin(uchar_t ioapic, uchar_t intin); +extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); +extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, + int ipin, int *pci_irqp, iflag_t *intr_flagp); +extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, + int child_ipin, struct apic_io_intr **intrp); +extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); + +/* + * From apic_regops.c + */ +extern int apic_have_32bit_cr8; + +/* + * apix_intr.c + */ +extern void apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp); + +/* + * apix_utils.c + */ + +typedef struct apix_rebind_info { + int i_go; /* if rebinding op is in progress */ + uint_t i_pri; + processorid_t i_old_cpuid; + struct autovec *i_old_av; + processorid_t i_new_cpuid; + struct autovec *i_new_av; +} apix_rebind_info_t; + +extern struct apix_rebind_info apix_rebindinfo; + +#define APIX_SET_REBIND_INFO(_ovp, _nvp)\ + if (((_ovp)->v_flags & APIX_VECT_MASKABLE) == 0) {\ + apix_rebindinfo.i_pri = 
(_ovp)->v_pri;\ + apix_rebindinfo.i_old_cpuid = (_ovp)->v_cpuid;\ + apix_rebindinfo.i_old_av = (_ovp)->v_autovect;\ + apix_rebindinfo.i_new_cpuid = (_nvp)->v_cpuid;\ + apix_rebindinfo.i_new_av = (_nvp)->v_autovect;\ + apix_rebindinfo.i_go = 1;\ + } + +#define APIX_CLR_REBIND_INFO() \ + apix_rebindinfo.i_go = 0 + +#define APIX_IS_FAKE_INTR(_vector)\ + (apix_rebindinfo.i_go && (_vector) == APIX_RESV_VECTOR) + +#define APIX_DO_FAKE_INTR(_cpu, _vector)\ + if (APIX_IS_FAKE_INTR(_vector)) {\ + struct autovec *tp;\ + if ((_cpu) == apix_rebindinfo.i_old_cpuid)\ + tp = apix_rebindinfo.i_old_av;\ + else if ((_cpu) == apix_rebindinfo.i_new_cpuid)\ + tp = apix_rebindinfo.i_new_av;\ + if (tp->av_vector != NULL &&\ + (tp->av_flags & AV_PENTRY_PEND) == 0) {\ + tp->av_flags |= AV_PENTRY_PEND;\ + apix_insert_pending_av(apixs[(_cpu)], tp,\ + tp->av_prilevel);\ + apixs[(_cpu)]->x_intr_pending |=\ + (1 << tp->av_prilevel);\ + }\ + } + +extern int apix_add_avintr(void *intr_id, int ipl, avfunc xxintr, char *name, + int vector, caddr_t arg1, caddr_t arg2, uint64_t *ticksp, dev_info_t *dip); +extern void apix_rem_avintr(void *intr_id, int ipl, avfunc xxintr, + int virt_vect); + +extern uint32_t apix_bind_cpu_locked(dev_info_t *dip); +extern apix_vector_t *apix_rebind(apix_vector_t *vecp, processorid_t tocpu, + int count); + +extern uchar_t apix_alloc_ipi(int ipl); +extern apix_vector_t *apix_alloc_intx(dev_info_t *dip, int inum, int irqno); +extern int apix_alloc_msi(dev_info_t *dip, int inum, int count, int behavior); +extern int apix_alloc_msix(dev_info_t *dip, int inum, int count, int behavior); +extern void apix_free_vectors(dev_info_t *dip, int inum, int count, int type); +extern void apix_enable_vector(apix_vector_t *vecp); +extern void apix_disable_vector(apix_vector_t *vecp); +extern int apix_obsolete_vector(apix_vector_t *vecp); +extern int apix_find_cont_vector_oncpu(uint32_t cpuid, int count); + +extern void apix_set_dev_map(apix_vector_t *vecp, dev_info_t *dip, int inum); +extern apix_vector_t *apix_get_dev_map(dev_info_t *dip, int inum, int type); +extern apix_vector_t *apix_setup_io_intr(apix_vector_t *vecp); +extern void ioapix_init_intr(int mask_apic); +extern int apix_get_min_dev_inum(dev_info_t *dip, int type); +extern int apix_get_max_dev_inum(dev_info_t *dip, int type); + +/* + * apix.c + */ +extern int apix_addspl(int virtvec, int ipl, int min_ipl, int max_ipl); +extern int apix_delspl(int virtvec, int ipl, int min_ipl, int max_ipl); +extern void apix_intx_set_vector(int irqno, uint32_t cpuid, uchar_t vector); +extern apix_vector_t *apix_intx_get_vector(int irqno); +extern void apix_intx_enable(int irqno); +extern void apix_intx_disable(int irqno); +extern void apix_intx_free(int irqno); +extern int apix_intx_rebind(int irqno, processorid_t cpuid, uchar_t vector); +extern apix_vector_t *apix_set_cpu(apix_vector_t *vecp, int new_cpu, + int *result); +extern apix_vector_t *apix_grp_set_cpu(apix_vector_t *vecp, int new_cpu, + int *result); +extern void apix_level_intr_pre_eoi(int irq); +extern void apix_level_intr_post_dispatch(int irq); + +#ifdef __cplusplus +} +#endif + +#endif /* __SYS_APIX_APIX_H */ diff --git a/usr/src/uts/i86pc/sys/apix_irm_impl.h b/usr/src/uts/i86pc/sys/apix_irm_impl.h new file mode 100644 index 0000000000..a7638fbd36 --- /dev/null +++ b/usr/src/uts/i86pc/sys/apix_irm_impl.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
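A hedged sketch tying several apix.h pieces together: consulting a CPU's vector table under its per-CPU lock. The helper name is hypothetical; APIX_ENTER_CPU_LOCK(), xv_vector() and IS_VECT_FREE() are the definitions above:

static apix_vector_t *
apix_lookup_vector(processorid_t cpuid, uchar_t vector)
{
	apix_vector_t *vecp;

	APIX_ENTER_CPU_LOCK(cpuid);
	vecp = xv_vector(cpuid, vector);
	if (IS_VECT_FREE(vecp))
		vecp = NULL;		/* slot never used, or freed */
	APIX_LEAVE_CPU_LOCK(cpuid);

	/* a real caller would hold the lock while dereferencing vecp */
	return (vecp);
}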
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _APIX_IRM_IMPL_H +#define _APIX_IRM_IMPL_H + +#include <sys/types.h> +#include <sys/ddi_intr_impl.h> +#include <sys/psm_types.h> +#include <sys/apix.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define APIX_IRM_DEBUG(args) DDI_INTR_IRMDBG(args) + +typedef struct apix_irm_info { + int apix_ncpus; /* # of available CPUs (boot time) */ + int apix_per_cpu_vectors; /* # of available vectors per CPU */ + int apix_ioapic_max_vectors; /* max # of vectors used by IOAPICs */ + int apix_vectors_allocated; /* # of vectors (pre) allocated */ +} apix_irm_info_t; + +extern apix_irm_info_t apix_irminfo; +extern int apix_system_max_vectors; +extern int apix_irm_cpu_factor; +extern ddi_irm_pool_t *apix_irm_pool_p; + +#ifdef __cplusplus +} +#endif + +#endif /* _APIX_IRM_IMPL_H */ diff --git a/usr/src/uts/i86pc/sys/hpet_acpi.h b/usr/src/uts/i86pc/sys/hpet_acpi.h index c85707787e..078f4e73b3 100644 --- a/usr/src/uts/i86pc/sys/hpet_acpi.h +++ b/usr/src/uts/i86pc/sys/hpet_acpi.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
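apix_irm_info_t above carries only raw counts; how they combine into an IRM pool size is not shown in this hunk. One plausible derivation, purely as an assumption for illustration — the helper and its arithmetic are not from the patch:

static int
apix_irm_avail_vectors(void)
{
	apix_irm_info_t *ip = &apix_irminfo;

	/* boot-time supply minus IOAPIC usage and pre-allocations */
	return (ip->apix_ncpus * ip->apix_per_cpu_vectors -
	    ip->apix_ioapic_max_vectors - ip->apix_vectors_allocated);
}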
*/ #ifndef _HPET_ACPI_H @@ -281,50 +280,6 @@ typedef hrtime_t hpet_proxy_t; extern ACPI_TABLE_HPET *hpet_table; extern hpet_info_t hpet_info; -static int hpet_init_proxy(int *hpet_vect, iflag_t *hpet_flags); -static boolean_t hpet_install_proxy(void); -static boolean_t hpet_callback(int code); -static boolean_t hpet_cpr(int code); -static boolean_t hpet_resume(void); -static void hpet_cst_callback(uint32_t code); -static boolean_t hpet_deep_idle_config(int code); -static int hpet_validate_table(ACPI_TABLE_HPET *hpet_table); -static boolean_t hpet_checksum_table(unsigned char *table, unsigned int len); -static void *hpet_memory_map(ACPI_TABLE_HPET *hpet_table); -static int hpet_start_main_counter(hpet_info_t *hip); -static int hpet_stop_main_counter(hpet_info_t *hip); -static uint64_t hpet_read_main_counter_value(hpet_info_t *hip); -static uint64_t hpet_set_leg_rt_cnf(hpet_info_t *hip, uint32_t new_value); -static uint64_t hpet_read_gen_cap(hpet_info_t *hip); -static uint64_t hpet_read_gen_config(hpet_info_t *hip); -static uint64_t hpet_read_gen_intrpt_stat(hpet_info_t *hip); -static uint64_t hpet_read_timer_N_config(hpet_info_t *hip, uint_t n); -static hpet_TN_conf_cap_t hpet_convert_timer_N_config(uint64_t conf); -static uint64_t hpet_read_timer_N_comp(hpet_info_t *hip, uint_t n); -static void hpet_write_gen_cap(hpet_info_t *hip, uint64_t l); -static void hpet_write_gen_config(hpet_info_t *hip, uint64_t l); -static void hpet_write_gen_intrpt_stat(hpet_info_t *hip, uint64_t l); -static void hpet_write_timer_N_config(hpet_info_t *hip, uint_t n, uint64_t l); -static void hpet_write_timer_N_comp(hpet_info_t *hip, uint_t n, uint64_t l); -static void hpet_disable_timer(hpet_info_t *hip, uint32_t timer_n); -static void hpet_enable_timer(hpet_info_t *hip, uint32_t timer_n); -static void hpet_write_main_counter_value(hpet_info_t *hip, uint64_t l); -static int hpet_get_FSB_intr_capable_timer(hpet_info_t *hip, uint32_t mask); -static int hpet_get_IOAPIC_intr_capable_timer(hpet_info_t *hip); -static int hpet_timer_available(uint32_t allocated_timers, uint32_t n); -static void hpet_timer_alloc(uint32_t *allocated_timers, uint32_t n); -static void hpet_timer_set_up(hpet_info_t *hip, uint32_t timer_n, - uint32_t interrupt); -static uint_t hpet_isr(char *arg); -static uint32_t hpet_install_interrupt_handler(uint_t (*func)(char *), - int vector); -static void hpet_uninstall_interrupt_handler(void); -static void hpet_expire_all(void); -static boolean_t hpet_guaranteed_schedule(hrtime_t required_wakeup_time); -static boolean_t hpet_use_hpet_timer(hrtime_t *expire); -static void hpet_use_lapic_timer(hrtime_t expire); -static void hpet_init_proxy_data(void); - #endif /* defined(_KERNEL) */ #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h index 8c79243a29..a783e942f7 100644 --- a/usr/src/uts/i86pc/sys/machsystm.h +++ b/usr/src/uts/i86pc/sys/machsystm.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -127,6 +126,12 @@ extern void do_interrupt(struct regs *, trap_trace_rec_t *); extern void memscrub_disable(void); /* + * Interrupt handling hooks + */ +extern void (*do_interrupt_common)(struct regs *, trap_trace_rec_t *); +extern uintptr_t (*get_intr_handler)(int, short); + +/* * Dispatcher hooks. 
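get_intr_handler, declared among the machsystm.h hooks above, is what dump_ttrace() (earlier in this changeset) consults before falling back to the global autovect[] table. A hedged sketch of a PSM-side implementation — only the pointer's (int, short) signature is established by the patch; the body is an assumption built on the apix.h definitions:

static uintptr_t
apix_get_intr_handler(int cpu, short vec)
{
	apix_vector_t *vecp = xv_vector(cpu, (uchar_t)vec);

	if (vecp == NULL)
		return (0);
	return ((uintptr_t)vecp->v_autovect);	/* head of the ISR chain */
}

/* at PSM attach (assumed): get_intr_handler = apix_get_intr_handler; */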
*/ void (*idle_cpu)(); diff --git a/usr/src/uts/i86pc/sys/smp_impldefs.h b/usr/src/uts/i86pc/sys/smp_impldefs.h index 800c8d4e93..6afce7fd6c 100644 --- a/usr/src/uts/i86pc/sys/smp_impldefs.h +++ b/usr/src/uts/i86pc/sys/smp_impldefs.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SMP_IMPLDEFS_H @@ -70,6 +69,9 @@ extern void (*psm_timer_enable)(void); /* timer enable */ extern void (*psm_timer_disable)(void); /* timer disable */ extern void (*psm_post_cyclic_setup)(void *arg); /* psm cyclic setup */ extern int (*psm_state)(psm_state_request_t *); /* psm state save/restore */ +extern uchar_t (*psm_get_ioapicid)(uchar_t); /* get io-apic id */ +extern uint32_t (*psm_get_localapicid)(uint32_t); /* get local-apic id */ +extern uchar_t (*psm_xlate_vector_by_irq)(uchar_t); /* get vector for an irq */ extern int (*slvltovect)(int); /* ipl interrupt priority level */ extern int (*setlvl)(int, int *); /* set intr pri represented by vect */ @@ -77,6 +79,10 @@ extern void (*setlvlx)(int, int); /* set intr pri to specified level */ extern void (*setspl)(int); /* mask intr below or equal given ipl */ extern int (*addspl)(int, int, int, int); /* add intr mask of vector */ extern int (*delspl)(int, int, int, int); /* delete intr mask of vector */ +extern int (*get_pending_spl)(void); /* get highest pending ipl */ +extern int (*addintr)(void *, int, avfunc, char *, int, caddr_t, caddr_t, + uint64_t *, dev_info_t *); /* replacement of add_avintr */ +extern void (*remintr)(void *, int, avfunc, int); /* replace of rem_avintr */ /* trigger a software intr */ extern void (*setsoftint)(int, struct av_softinfo *); diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index e9501d7684..3735a03f45 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -20,8 +20,7 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # # This Makefile defines file modules in the directory uts/i86xpv @@ -199,7 +198,8 @@ PRIVCMD_OBJS += seg_mf.o privcmd.o privcmd_hcall.o ROOTNEX_OBJS += rootnex.o XPVTOD_OBJS += xpvtod.o XPV_AUTOCONFIG_OBJS += xpv_autoconfig.o -XPV_PSM_OBJS += xpv_psm.o mp_platform_common.o apic_introp.o apic_regops.o psm_common.o +XPV_PSM_OBJS += xpv_psm.o mp_platform_common.o mp_platform_xpv.o \ + apic_regops.o psm_common.o xpv_intr.o XPV_UPPC_OBJS += xpv_uppc.o psm_common.o XENBUS_OBJS += xenbus_dev.o XENCONS_OBJS += xencons.o diff --git a/usr/src/uts/i86xpv/io/psm/mp_platform_xpv.c b/usr/src/uts/i86xpv/io/psm/mp_platform_xpv.c new file mode 100644 index 0000000000..5d30358825 --- /dev/null +++ b/usr/src/uts/i86xpv/io/psm/mp_platform_xpv.c @@ -0,0 +1,2152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * PSMI 1.1 extensions are supported only in 2.6 and later versions. + * PSMI 1.2 extensions are supported only in 2.7 and later versions. + * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. + * PSMI 1.5 extensions are supported in Solaris Nevada. + * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. + */ +#define PSMI_1_7 + +#include <sys/processor.h> +#include <sys/time.h> +#include <sys/psm.h> +#include <sys/smp_impldefs.h> +#include <sys/cram.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/psm_common.h> +#include <sys/apic.h> +#include <sys/apic_common.h> +#include <sys/pit.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/pci.h> +#include <sys/promif.h> +#include <sys/x86_archext.h> +#include <sys/cpc_impl.h> +#include <sys/uadmin.h> +#include <sys/panic.h> +#include <sys/debug.h> +#include <sys/archsystm.h> +#include <sys/trap.h> +#include <sys/machsystm.h> +#include <sys/cpuvar.h> +#include <sys/rm_platter.h> +#include <sys/privregs.h> +#include <sys/cyclic.h> +#include <sys/note.h> +#include <sys/pci_intr_lib.h> +#include <sys/sunndi.h> + + +/* + * Local Function Prototypes + */ +static void apic_mark_vector(uchar_t oldvector, uchar_t newvector); +static void apic_xlate_vector_free_timeout_handler(void *arg); +static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, + int new_bind_cpu, int apicindex, int intin_no, int which_irq, + struct ioapic_reprogram_data *drep); +static int apic_setup_irq_table(dev_info_t *dip, int irqno, + struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp, + int type); +static void apic_try_deferred_reprogram(int ipl, int vect); +static void delete_defer_repro_ent(int which_irq); +static void apic_ioapic_wait_pending_clear(int ioapicindex, + int intin_no); + +extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid, + int ipin, int *pci_irqp, iflag_t *intr_flagp); +extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno, + int child_ipin, struct apic_io_intr **intrp); +extern uchar_t acpi_find_ioapic(int irq); +extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid); +extern int apic_find_bus_id(int bustype); +extern int apic_find_intin(uchar_t ioapic, uchar_t intin); +extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq); + +extern int apic_sci_vect; +extern iflag_t apic_sci_flags; +extern int apic_intr_policy; +extern char *psm_name; + +/* + * number of bits per byte, from <sys/param.h> + */ +#define UCHAR_MAX ((1 << NBBY) - 1) + +/* Max wait time (in repetitions) for flags to clear in an RDT entry. */ +extern int apic_max_reps_clear_pending; + +/* The irq # is implicit in the array index: */ +struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1]; +/* + * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info + * is indexed by IRQ number, NOT by vector number. 
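Given the warning above that apic_reprogram_info is indexed by IRQ number, not by vector, a short sketch of the access pattern (irqno illustrative):

struct ioapic_reprogram_data *drep;

ASSERT(irqno >= 0 && irqno <= APIC_MAX_VECTOR);
drep = &apic_reprogram_info[irqno];	/* NOT apic_reprogram_info[vector] */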
+ */ + +extern int apic_int_busy_mark; +extern int apic_int_free_mark; +extern int apic_diff_for_redistribution; +extern int apic_sample_factor_redistribution; +extern int apic_redist_cpu_skip; +extern int apic_num_imbalance; +extern int apic_num_rebind; + +/* timeout for xlate_vector, mark_vector */ +int apic_revector_timeout = 16 * 10000; /* 160 millisec */ + +extern int apic_defconf; +extern int apic_irq_translate; + +extern int apic_use_acpi_madt_only; /* 1=ONLY use MADT from ACPI */ + +extern uchar_t apic_io_vectbase[MAX_IO_APIC]; + +extern boolean_t ioapic_mask_workaround[MAX_IO_APIC]; + +/* + * First available slot to be used as IRQ index into the apic_irq_table + * for those interrupts (like MSI/X) that don't have a physical IRQ. + */ +extern int apic_first_avail_irq; + +/* + * apic_defer_reprogram_lock ensures that only one processor is handling + * deferred interrupt programming at *_intr_exit time. + */ +static lock_t apic_defer_reprogram_lock; + +/* + * The current number of deferred reprogrammings outstanding + */ +uint_t apic_reprogram_outstanding = 0; + +#ifdef DEBUG +/* + * Counters that keep track of deferred reprogramming stats + */ +uint_t apic_intr_deferrals = 0; +uint_t apic_intr_deliver_timeouts = 0; +uint_t apic_last_ditch_reprogram_failures = 0; +uint_t apic_deferred_setup_failures = 0; +uint_t apic_defer_repro_total_retries = 0; +uint_t apic_defer_repro_successes = 0; +uint_t apic_deferred_spurious_enters = 0; +#endif + +extern int apic_io_max; +extern struct apic_io_intr *apic_io_intrp; + +uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1]; + +extern uint32_t eisa_level_intr_mask; + /* At least MSB will be set if EISA bus */ + +extern int apic_pci_bus_total; +extern uchar_t apic_single_pci_busid; + +/* + * Following declarations are for revectoring; used when ISRs at different + * IPLs share an irq. + */ +static lock_t apic_revector_lock; +int apic_revector_pending = 0; +static uchar_t *apic_oldvec_to_newvec; +static uchar_t *apic_newvec_to_oldvec; + +/* ACPI Interrupt Source Override Structure ptr */ +ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop; +extern int acpi_iso_cnt; + +/* + * Auto-configuration routines + */ + +/* + * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable + * are also set to NULL. vector->irq is set to a value which cannot map + * to a real irq to show that it is free. + */ +void +apic_init_common(void) +{ + int i, j, indx; + int *iptr; + + /* + * Initialize apic_ipls from apic_vectortoipl. This array is + * used in apic_intr_enter to determine the IPL to use for the + * corresponding vector. On some systems, due to hardware errata + * and interrupt sharing, the IPL may not correspond to the IPL listed + * in apic_vectortoipl (see apic_addspl and apic_delspl). + */ + for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) { + indx = i * APIC_VECTOR_PER_IPL; + + for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++) + apic_ipls[indx] = apic_vectortoipl[i]; + } + + /* cpu 0 is always up (for now) */ + apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE; + + iptr = (int *)&apic_irq_table[0]; + for (i = 0; i <= APIC_MAX_VECTOR; i++) { + apic_level_intr[i] = 0; + *iptr++ = NULL; + apic_vector_to_irq[i] = APIC_RESV_IRQ; + + /* These *must* be initted to B_TRUE! */ + apic_reprogram_info[i].done = B_TRUE; + apic_reprogram_info[i].irqp = NULL; + apic_reprogram_info[i].tries = 0; + apic_reprogram_info[i].bindcpu = 0; + } + + /* + * Allocate a dummy irq table entry for the reserved entry. 
+ * This takes care of the race between removing an irq and + * clock detecting a CPU in that irq during interrupt load + * sampling. + */ + apic_irq_table[APIC_RESV_IRQ] = + kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); + + mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL); +} + +void +ioapic_init_intr(int mask_apic) +{ + int ioapic_ix; + struct intrspec ispec; + apic_irq_t *irqptr; + int i, j; + ulong_t iflag; + + LOCK_INIT_CLEAR(&apic_revector_lock); + LOCK_INIT_CLEAR(&apic_defer_reprogram_lock); + + /* mask interrupt vectors */ + for (j = 0; j < apic_io_max && mask_apic; j++) { + int intin_max; + + ioapic_ix = j; + /* Bits 23-16 define the maximum redirection entries */ + intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16) + & 0xff; + for (i = 0; i <= intin_max; i++) + ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK); + } + + /* + * Hack alert: deal with ACPI SCI interrupt chicken/egg here + */ + if (apic_sci_vect > 0) { + /* + * acpica has already done add_avintr(); we just + * to finish the job by mimicing translate_irq() + * + * Fake up an intrspec and setup the tables + */ + ispec.intrspec_vec = apic_sci_vect; + ispec.intrspec_pri = SCI_IPL; + + if (apic_setup_irq_table(NULL, apic_sci_vect, NULL, + &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) { + cmn_err(CE_WARN, "!apic: SCI setup failed"); + return; + } + irqptr = apic_irq_table[apic_sci_vect]; + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* Program I/O APIC */ + (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + irqptr->airq_share++; + } +} + +/* + * Add mask bits to disable interrupt vector from happening + * at or above IPL. In addition, it should remove mask bits + * to enable interrupt vectors below the given IPL. + * + * Both add and delspl are complicated by the fact that different interrupts + * may share IRQs. This can happen in two ways. + * 1. The same H/W line is shared by more than 1 device + * 1a. with interrupts at different IPLs + * 1b. with interrupts at same IPL + * 2. We ran out of vectors at a given IPL and started sharing vectors. + * 1b and 2 should be handled gracefully, except for the fact some ISRs + * will get called often when no interrupt is pending for the device. + * For 1a, we handle it at the higher IPL. + */ +/*ARGSUSED*/ +int +apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl) +{ + uchar_t vector; + ulong_t iflag; + apic_irq_t *irqptr, *irqheadptr; + int irqindex; + + ASSERT(max_ipl <= UCHAR_MAX); + irqindex = IRQINDEX(irqno); + + if ((irqindex == -1) || (!apic_irq_table[irqindex])) + return (PSM_FAILURE); + + mutex_enter(&airq_mutex); + irqptr = irqheadptr = apic_irq_table[irqindex]; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x " + "vector=0x%x\n", (void *)irqptr->airq_dip, + irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); + + while (irqptr) { + if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) + break; + irqptr = irqptr->airq_next; + } + irqptr->airq_share++; + + mutex_exit(&airq_mutex); + + /* return if it is not hardware interrupt */ + if (irqptr->airq_mps_intr_index == RESERVE_INDEX) + return (PSM_SUCCESS); + + /* Or if there are more interupts at a higher IPL */ + if (ipl != max_ipl) + return (PSM_SUCCESS); + + /* + * if apic_picinit() has not been called yet, just return. + * At the end of apic_picinit(), we will call setup_io_intr(). 
+ */ + + if (!apic_picinit_called) + return (PSM_SUCCESS); + + /* + * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate, + * return failure. + */ + if (irqptr->airq_ipl != max_ipl && + !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { + + vector = apic_allocate_vector(max_ipl, irqindex, 1); + if (vector == 0) { + irqptr->airq_share--; + return (PSM_FAILURE); + } + irqptr = irqheadptr; + apic_mark_vector(irqptr->airq_vector, vector); + while (irqptr) { + irqptr->airq_vector = vector; + irqptr->airq_ipl = (uchar_t)max_ipl; + /* + * reprogram irq being added and every one else + * who is not in the UNINIT state + */ + if ((VIRTIRQ(irqindex, irqptr->airq_share_id) == + irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) { + apic_record_rdt_entry(irqptr, irqindex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_setup_io_intr(irqptr, irqindex, + B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } + irqptr = irqptr->airq_next; + } + return (PSM_SUCCESS); + + } else if (irqptr->airq_ipl != max_ipl && + ioapic_mask_workaround[irqptr->airq_ioapicindex]) { + /* + * We cannot upgrade the vector, but we can change + * the IPL that this vector induces. + * + * Note that we subtract APIC_BASE_VECT from the vector + * here because this array is used in apic_intr_enter + * (no need to add APIC_BASE_VECT in that hot code + * path since we can do it in the rarely-executed path + * here). + */ + apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] = + (uchar_t)max_ipl; + + irqptr = irqheadptr; + while (irqptr) { + irqptr->airq_ipl = (uchar_t)max_ipl; + irqptr = irqptr->airq_next; + } + + return (PSM_SUCCESS); + } + + ASSERT(irqptr); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + return (PSM_SUCCESS); +} + +/* + * Recompute mask bits for the given interrupt vector. + * If there is no interrupt servicing routine for this + * vector, this function should disable interrupt vector + * from happening at all IPLs. If there are still + * handlers using the given vector, this function should + * disable the given vector from happening below the lowest + * IPL of the remaining hadlers. + */ +/*ARGSUSED*/ +int +apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl) +{ + uchar_t vector; + uint32_t bind_cpu; + int intin, irqindex; + int ioapic_ix; + apic_irq_t *irqptr, *preirqptr, *irqheadptr, *irqp; + ulong_t iflag; + + mutex_enter(&airq_mutex); + irqindex = IRQINDEX(irqno); + irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex]; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x " + "vector=0x%x\n", (void *)irqptr->airq_dip, + irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector)); + + while (irqptr) { + if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno) + break; + preirqptr = irqptr; + irqptr = irqptr->airq_next; + } + ASSERT(irqptr); + + irqptr->airq_share--; + + mutex_exit(&airq_mutex); + + /* + * If there are more interrupts at a higher IPL, we don't need + * to disable anything. + */ + if (ipl < max_ipl) + return (PSM_SUCCESS); + + /* return if it is not hardware interrupt */ + if (irqptr->airq_mps_intr_index == RESERVE_INDEX) + return (PSM_SUCCESS); + + if (!apic_picinit_called) { + /* + * Clear irq_struct. If two devices shared an intpt + * line & 1 unloaded before picinit, we are hosed. But, then + * we hope the machine survive. 
+ */ + irqptr->airq_mps_intr_index = FREE_INDEX; + irqptr->airq_temp_cpu = IRQ_UNINIT; + apic_free_vector(irqptr->airq_vector); + return (PSM_SUCCESS); + } + /* + * Downgrade vector to new max_ipl if needed. If we cannot allocate, + * use old IPL. Not very elegant, but it should work. + */ + if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) && + !ioapic_mask_workaround[irqptr->airq_ioapicindex]) { + apic_irq_t *irqp; + if (vector = apic_allocate_vector(max_ipl, irqno, 1)) { + apic_mark_vector(irqheadptr->airq_vector, vector); + irqp = irqheadptr; + while (irqp) { + irqp->airq_vector = vector; + irqp->airq_ipl = (uchar_t)max_ipl; + if (irqp->airq_temp_cpu != IRQ_UNINIT) { + apic_record_rdt_entry(irqp, irqindex); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + (void) apic_setup_io_intr(irqp, + irqindex, B_FALSE); + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } + irqp = irqp->airq_next; + } + } + + } else if (irqptr->airq_ipl != max_ipl && + max_ipl != PSM_INVALID_IPL && + ioapic_mask_workaround[irqptr->airq_ioapicindex]) { + + /* + * We cannot downgrade the IPL of the vector below the vector's + * hardware priority. If we did, it would be possible for a + * higher-priority hardware vector to interrupt a CPU running at an IPL + * lower than the hardware priority of the interrupting vector (but + * higher than the soft IPL of this IRQ). When this happens, we would + * then try to drop the IPL BELOW what it was (effectively dropping + * below base_spl) which would be potentially catastrophic. + * + * (e.g. Suppose the hardware vector associated with this IRQ is 0x40 + * (hardware IPL of 4). Further assume that the old IPL of this IRQ + * was 4, but the new IPL is 1. If we forced vector 0x40 to result in + * an IPL of 1, it would be possible for the processor to be executing + * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting + * the currently-executing ISR. When apic_intr_enter consults + * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1 + * so even though the processor was running at IPL 4, an IPL 1 + * interrupt will have interrupted it, which must not happen)). + * + * Effectively, this means that the hardware priority corresponding to + * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's + * hardware priority. + * + * (In the above example, then, after removal of the IPL 4 device's + * interrupt handler, the new IPL will continue to be 4 because the + * hardware priority that IPL 1 implies is lower than the hardware + * priority of the vector used.) + */ + /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */ + const int apic_ipls_index = irqptr->airq_vector - + APIC_BASE_VECT; + const int vect_inherent_hwpri = irqptr->airq_vector >> + APIC_IPL_SHIFT; + + /* + * If there are still devices using this IRQ, determine the + * new ipl to use. + */ + if (irqptr->airq_share) { + int vect_desired_hwpri, hwpri; + + ASSERT(max_ipl < MAXIPL); + vect_desired_hwpri = apic_ipltopri[max_ipl] >> + APIC_IPL_SHIFT; + + /* + * If the desired IPL's hardware priority is lower + * than that of the vector, use the hardware priority + * of the vector to determine the new IPL. + */ + hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ? + vect_inherent_hwpri : vect_desired_hwpri; + + /* + * Now, to get the right index for apic_vectortoipl, + * we need to subtract APIC_BASE_VECT from the + * hardware-vector-equivalent (in hwpri). 
Since hwpri + * is already shifted, we shift APIC_BASE_VECT before + * doing the subtraction. + */ + hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT); + + ASSERT(hwpri >= 0); + ASSERT(hwpri < MAXIPL); + max_ipl = apic_vectortoipl[hwpri]; + apic_ipls[apic_ipls_index] = max_ipl; + + irqp = irqheadptr; + while (irqp) { + irqp->airq_ipl = (uchar_t)max_ipl; + irqp = irqp->airq_next; + } + } else { + /* + * No more devices on this IRQ, so reset this vector's + * element in apic_ipls to the original IPL for this + * vector + */ + apic_ipls[apic_ipls_index] = + apic_vectortoipl[vect_inherent_hwpri]; + } + } + + /* + * If there are still active interrupts, we are done. + */ + if (irqptr->airq_share) + return (PSM_SUCCESS); + + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + if (irqptr->airq_mps_intr_index == MSI_INDEX) { + /* + * Disable the MSI vector + * Make sure we only disable on the last + * of the multi-MSI support + */ + if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { + apic_pci_msi_disable_mode(irqptr->airq_dip, + DDI_INTR_TYPE_MSI); + } + } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) { + /* + * Disable the MSI-X vector + * needs to clear its mask and addr/data for each MSI-X + */ + apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX, + irqptr->airq_origirq); + /* + * Make sure we only disable on the last MSI-X + */ + if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) { + apic_pci_msi_disable_mode(irqptr->airq_dip, + DDI_INTR_TYPE_MSIX); + } + } else { + /* + * The assumption here is that this is safe, even for + * systems with IOAPICs that suffer from the hardware + * erratum because all devices have been quiesced before + * they unregister their interrupt handlers. If that + * assumption turns out to be false, this mask operation + * can induce the same erratum result we're trying to + * avoid. + */ + ioapic_ix = irqptr->airq_ioapicindex; + intin = irqptr->airq_intin_no; + ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK); + } + + /* + * This irq entry is the only one in the chain. + */ + if (irqheadptr->airq_next == NULL) { + ASSERT(irqheadptr == irqptr); + bind_cpu = irqptr->airq_temp_cpu; + if (((uint32_t)bind_cpu != IRQ_UNBOUND) && + ((uint32_t)bind_cpu != IRQ_UNINIT)) { + ASSERT(apic_cpu_in_range(bind_cpu)); + if (bind_cpu & IRQ_USER_BOUND) { + /* If hardbound, temp_cpu == cpu */ + bind_cpu &= ~IRQ_USER_BOUND; + apic_cpus[bind_cpu].aci_bound--; + } else + apic_cpus[bind_cpu].aci_temp_bound--; + } + irqptr->airq_temp_cpu = IRQ_UNINIT; + irqptr->airq_mps_intr_index = FREE_INDEX; + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + apic_free_vector(irqptr->airq_vector); + return (PSM_SUCCESS); + } + + /* + * If we get here, we are sharing the vector and there are more than + * one active irq entries in the chain. + */ + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + mutex_enter(&airq_mutex); + /* Remove the irq entry from the chain */ + if (irqptr == irqheadptr) { /* The irq entry is at the head */ + apic_irq_table[irqindex] = irqptr->airq_next; + } else { + preirqptr->airq_next = irqptr->airq_next; + } + /* Free the irq entry */ + kmem_free(irqptr, sizeof (apic_irq_t)); + mutex_exit(&airq_mutex); + + return (PSM_SUCCESS); +} + +/* + * apic_introp_xlate() replaces apic_translate_irq() and is + * called only from apic_intr_ops(). With the new ADII framework, + * the priority can no longer be retrieved through i_ddi_get_intrspec(). + * It has to be passed in from the caller. 
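+ * (It arrives here in ispec->intrspec_pri, which apic_setup_irq_table()
+ * consumes below.)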
+ * + * Return value: + * Success: irqno for the given device + * Failure: -1 + */ +int +apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type) +{ + char dev_type[16]; + int dev_len, pci_irq, newirq, bustype, devid, busid, i; + int irqno = ispec->intrspec_vec; + ddi_acc_handle_t cfg_handle; + uchar_t ipin; + struct apic_io_intr *intrp; + iflag_t intr_flag; + ACPI_SUBTABLE_HEADER *hp; + ACPI_MADT_INTERRUPT_OVERRIDE *isop; + apic_irq_t *airqp; + int parent_is_pci_or_pciex = 0; + int child_is_pciex = 0; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s " + "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type, + irqno)); + + dev_len = sizeof (dev_type); + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip), + DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type, + &dev_len) == DDI_PROP_SUCCESS) { + if ((strcmp(dev_type, "pci") == 0) || + (strcmp(dev_type, "pciex") == 0)) + parent_is_pci_or_pciex = 1; + } + + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type, + &dev_len) == DDI_PROP_SUCCESS) { + if (strstr(dev_type, "pciex")) + child_is_pciex = 1; + } + + if (DDI_INTR_IS_MSI_OR_MSIX(type)) { + if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) { + airqp->airq_iflag.bustype = + child_is_pciex ? BUS_PCIE : BUS_PCI; + return (apic_vector_to_irq[airqp->airq_vector]); + } + return (apic_setup_irq_table(dip, irqno, NULL, ispec, + NULL, type)); + } + + bustype = 0; + + /* check if we have already translated this irq */ + mutex_enter(&airq_mutex); + newirq = apic_min_device_irq; + for (; newirq <= apic_max_device_irq; newirq++) { + airqp = apic_irq_table[newirq]; + while (airqp) { + if ((airqp->airq_dip == dip) && + (airqp->airq_origirq == irqno) && + (airqp->airq_mps_intr_index != FREE_INDEX)) { + + mutex_exit(&airq_mutex); + return (VIRTIRQ(newirq, airqp->airq_share_id)); + } + airqp = airqp->airq_next; + } + } + mutex_exit(&airq_mutex); + + if (apic_defconf) + goto defconf; + + if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi)) + goto nonpci; + + if (parent_is_pci_or_pciex) { + /* pci device */ + if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0) + goto nonpci; + if (busid == 0 && apic_pci_bus_total == 1) + busid = (int)apic_single_pci_busid; + + if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS) + return (-1); + ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA; + pci_config_teardown(&cfg_handle); + if (apic_enable_acpi && !apic_use_acpi_madt_only) { + if (apic_acpi_translate_pci_irq(dip, busid, devid, + ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS) + return (-1); + + intr_flag.bustype = child_is_pciex ? 
BUS_PCIE : BUS_PCI; + return (apic_setup_irq_table(dip, pci_irq, NULL, ispec, + &intr_flag, type)); + } else { + pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3); + if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid)) + == NULL) { + if ((pci_irq = apic_handle_pci_pci_bridge(dip, + devid, ipin, &intrp)) == -1) + return (-1); + } + return (apic_setup_irq_table(dip, pci_irq, intrp, ispec, + NULL, type)); + } + } else if (strcmp(dev_type, "isa") == 0) + bustype = BUS_ISA; + else if (strcmp(dev_type, "eisa") == 0) + bustype = BUS_EISA; + +nonpci: + if (apic_enable_acpi && !apic_use_acpi_madt_only) { + /* search iso entries first */ + if (acpi_iso_cnt != 0) { + hp = (ACPI_SUBTABLE_HEADER *)acpi_isop; + i = 0; + while (i < acpi_iso_cnt) { + if (hp->Type == + ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) { + isop = + (ACPI_MADT_INTERRUPT_OVERRIDE *) hp; + if (isop->Bus == 0 && + isop->SourceIrq == irqno) { + newirq = isop->GlobalIrq; + intr_flag.intr_po = + isop->IntiFlags & + ACPI_MADT_POLARITY_MASK; + intr_flag.intr_el = + (isop->IntiFlags & + ACPI_MADT_TRIGGER_MASK) + >> 2; + intr_flag.bustype = BUS_ISA; + + return (apic_setup_irq_table( + dip, newirq, NULL, ispec, + &intr_flag, type)); + + } + i++; + } + hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) + + hp->Length); + } + } + intr_flag.intr_po = INTR_PO_ACTIVE_HIGH; + intr_flag.intr_el = INTR_EL_EDGE; + intr_flag.bustype = BUS_ISA; + return (apic_setup_irq_table(dip, irqno, NULL, ispec, + &intr_flag, type)); + } else { + if (bustype == 0) /* not initialized */ + bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA; + for (i = 0; i < 2; i++) { + if (((busid = apic_find_bus_id(bustype)) != -1) && + ((intrp = apic_find_io_intr_w_busid(irqno, busid)) + != NULL)) { + if ((newirq = apic_setup_irq_table(dip, irqno, + intrp, ispec, NULL, type)) != -1) { + return (newirq); + } + goto defconf; + } + bustype = (bustype == BUS_EISA) ? 
BUS_ISA : BUS_EISA; + } + } + +/* MPS default configuration */ +defconf: + newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type); + if (newirq == -1) + return (-1); + ASSERT(IRQINDEX(newirq) == irqno); + ASSERT(apic_irq_table[irqno]); + return (newirq); +} + +/* + * Attempt to share vector with someone else + */ +static int +apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl, + uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp) +{ +#ifdef DEBUG + apic_irq_t *tmpirqp = NULL; +#endif /* DEBUG */ + apic_irq_t *irqptr, dummyirq; + int newirq, chosen_irq = -1, share = 127; + int lowest, highest, i; + uchar_t share_id; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x " + "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl)); + + highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK; + lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL; + + if (highest < lowest) /* Both ipl and ipl-1 map to same pri */ + lowest -= APIC_VECTOR_PER_IPL; + dummyirq.airq_mps_intr_index = intr_index; + dummyirq.airq_ioapicindex = ioapicindex; + dummyirq.airq_intin_no = ipin; + if (intr_flagp) + dummyirq.airq_iflag = *intr_flagp; + apic_record_rdt_entry(&dummyirq, irqno); + for (i = lowest; i <= highest; i++) { + newirq = apic_vector_to_irq[i]; + if (newirq == APIC_RESV_IRQ) + continue; + irqptr = apic_irq_table[newirq]; + + if ((dummyirq.airq_rdt_entry & 0xFF00) != + (irqptr->airq_rdt_entry & 0xFF00)) + /* not compatible */ + continue; + + if (irqptr->airq_share < share) { + share = irqptr->airq_share; + chosen_irq = newirq; + } + } + if (chosen_irq != -1) { + /* + * Assign a share id which is free or which is larger + * than the largest one. + */ + share_id = 1; + mutex_enter(&airq_mutex); + irqptr = apic_irq_table[chosen_irq]; + while (irqptr) { + if (irqptr->airq_mps_intr_index == FREE_INDEX) { + share_id = irqptr->airq_share_id; + break; + } + if (share_id <= irqptr->airq_share_id) + share_id = irqptr->airq_share_id + 1; +#ifdef DEBUG + tmpirqp = irqptr; +#endif /* DEBUG */ + irqptr = irqptr->airq_next; + } + if (!irqptr) { + irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP); + irqptr->airq_temp_cpu = IRQ_UNINIT; + irqptr->airq_next = + apic_irq_table[chosen_irq]->airq_next; + apic_irq_table[chosen_irq]->airq_next = irqptr; +#ifdef DEBUG + tmpirqp = apic_irq_table[chosen_irq]; +#endif /* DEBUG */ + } + irqptr->airq_mps_intr_index = intr_index; + irqptr->airq_ioapicindex = ioapicindex; + irqptr->airq_intin_no = ipin; + if (intr_flagp) + irqptr->airq_iflag = *intr_flagp; + irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector; + irqptr->airq_share_id = share_id; + apic_record_rdt_entry(irqptr, irqno); + *irqptrp = irqptr; +#ifdef DEBUG + /* shuffle the pointers to test apic_delspl path */ + if (tmpirqp) { + tmpirqp->airq_next = irqptr->airq_next; + irqptr->airq_next = apic_irq_table[chosen_irq]; + apic_irq_table[chosen_irq] = irqptr; + } +#endif /* DEBUG */ + mutex_exit(&airq_mutex); + return (VIRTIRQ(chosen_irq, share_id)); + } + return (-1); +} + +/* + * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry + * is used already, we will try to allocate a new irqno. 
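+ * (In the MSI/MSI-X case no intrp or intr_flagp is passed in, and a
+ * free irqno is always taken starting from apic_first_avail_irq.)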
+ * + * Return value: + * Success: irqno + * Failure: -1 + */ +static int +apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp, + struct intrspec *ispec, iflag_t *intr_flagp, int type) +{ + int origirq = ispec->intrspec_vec; + uchar_t ipl = ispec->intrspec_pri; + int newirq, intr_index; + uchar_t ipin, ioapic, ioapicindex, vector; + apic_irq_t *irqptr; + major_t major; + dev_info_t *sdip; + + DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d " + "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq)); + + ASSERT(ispec != NULL); + + major = (dip != NULL) ? ddi_driver_major(dip) : 0; + + if (DDI_INTR_IS_MSI_OR_MSIX(type)) { + /* MSI/X doesn't need to setup ioapic stuffs */ + ioapicindex = 0xff; + ioapic = 0xff; + ipin = (uchar_t)0xff; + intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX : + MSIX_INDEX; + mutex_enter(&airq_mutex); + if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) { + mutex_exit(&airq_mutex); + /* need an irq for MSI/X to index into autovect[] */ + cmn_err(CE_WARN, "No interrupt irq: %s instance %d", + ddi_get_name(dip), ddi_get_instance(dip)); + return (-1); + } + mutex_exit(&airq_mutex); + + } else if (intrp != NULL) { + intr_index = (int)(intrp - apic_io_intrp); + ioapic = intrp->intr_destid; + ipin = intrp->intr_destintin; + /* Find ioapicindex. If destid was ALL, we will exit with 0. */ + for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--) + if (apic_io_id[ioapicindex] == ioapic) + break; + ASSERT((ioapic == apic_io_id[ioapicindex]) || + (ioapic == INTR_ALL_APIC)); + + /* check whether this intin# has been used by another irqno */ + if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) { + return (newirq); + } + + } else if (intr_flagp != NULL) { + /* ACPI case */ + intr_index = ACPI_INDEX; + ioapicindex = acpi_find_ioapic(irqno); + ASSERT(ioapicindex != 0xFF); + ioapic = apic_io_id[ioapicindex]; + ipin = irqno - apic_io_vectbase[ioapicindex]; + if (apic_irq_table[irqno] && + apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) { + ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin && + apic_irq_table[irqno]->airq_ioapicindex == + ioapicindex); + return (irqno); + } + + } else { + /* default configuration */ + ioapicindex = 0; + ioapic = apic_io_id[ioapicindex]; + ipin = (uchar_t)irqno; + intr_index = DEFAULT_INDEX; + } + + if (ispec == NULL) { + APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n", + irqno)); + } else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) { + if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index, + ipl, ioapicindex, ipin, &irqptr)) != -1) { + irqptr->airq_ipl = ipl; + irqptr->airq_origirq = (uchar_t)origirq; + irqptr->airq_dip = dip; + irqptr->airq_major = major; + sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip; + /* This is OK to do really */ + if (sdip == NULL) { + cmn_err(CE_WARN, "Sharing vectors: %s" + " instance %d and SCI", + ddi_get_name(dip), ddi_get_instance(dip)); + } else { + cmn_err(CE_WARN, "Sharing vectors: %s" + " instance %d and %s instance %d", + ddi_get_name(sdip), ddi_get_instance(sdip), + ddi_get_name(dip), ddi_get_instance(dip)); + } + return (newirq); + } + /* try high priority allocation now that share has failed */ + if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) { + cmn_err(CE_WARN, "No interrupt vector: %s instance %d", + ddi_get_name(dip), ddi_get_instance(dip)); + return (-1); + } + } + + mutex_enter(&airq_mutex); + if (apic_irq_table[irqno] == NULL) { + irqptr = kmem_zalloc(sizeof 
(apic_irq_t), KM_SLEEP); + irqptr->airq_temp_cpu = IRQ_UNINIT; + apic_irq_table[irqno] = irqptr; + } else { + irqptr = apic_irq_table[irqno]; + if (irqptr->airq_mps_intr_index != FREE_INDEX) { + /* + * The slot is used by another irqno, so allocate + * a free irqno for this interrupt + */ + newirq = apic_allocate_irq(apic_first_avail_irq); + if (newirq == -1) { + mutex_exit(&airq_mutex); + return (-1); + } + irqno = newirq; + irqptr = apic_irq_table[irqno]; + if (irqptr == NULL) { + irqptr = kmem_zalloc(sizeof (apic_irq_t), + KM_SLEEP); + irqptr->airq_temp_cpu = IRQ_UNINIT; + apic_irq_table[irqno] = irqptr; + } + vector = apic_modify_vector(vector, newirq); + } + } + apic_max_device_irq = max(irqno, apic_max_device_irq); + apic_min_device_irq = min(irqno, apic_min_device_irq); + mutex_exit(&airq_mutex); + irqptr->airq_ioapicindex = ioapicindex; + irqptr->airq_intin_no = ipin; + irqptr->airq_ipl = ipl; + irqptr->airq_vector = vector; + irqptr->airq_origirq = (uchar_t)origirq; + irqptr->airq_share_id = 0; + irqptr->airq_mps_intr_index = (short)intr_index; + irqptr->airq_dip = dip; + irqptr->airq_major = major; + irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin); + if (intr_flagp) + irqptr->airq_iflag = *intr_flagp; + + if (!DDI_INTR_IS_MSI_OR_MSIX(type)) { + /* setup I/O APIC entry for non-MSI/X interrupts */ + apic_record_rdt_entry(irqptr, irqno); + } + return (irqno); +} + +/* + * return the cpu to which this intr should be bound. + * Check properties or any other mechanism to see if user wants it + * bound to a specific CPU. If so, return the cpu id with high bit set. + * If not, use the policy to choose a cpu and return the id. + */ +uint32_t +apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) +{ + int instance, instno, prop_len, bind_cpu, count; + uint_t i, rc; + uint32_t cpu; + major_t major; + char *name, *drv_name, *prop_val, *cptr; + char prop_name[32]; + ulong_t iflag; + + + if (apic_intr_policy == INTR_LOWEST_PRIORITY) + return (IRQ_UNBOUND); + + if (apic_nproc == 1) + return (0); + + drv_name = NULL; + rc = DDI_PROP_NOT_FOUND; + major = (major_t)-1; + if (dip != NULL) { + name = ddi_get_name(dip); + major = ddi_name_to_major(name); + drv_name = ddi_major_to_name(major); + instance = ddi_get_instance(dip); + if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) { + i = apic_min_device_irq; + for (; i <= apic_max_device_irq; i++) { + + if ((i == irq) || (apic_irq_table[i] == NULL) || + (apic_irq_table[i]->airq_mps_intr_index + == FREE_INDEX)) + continue; + + if ((apic_irq_table[i]->airq_major == major) && + (!(apic_irq_table[i]->airq_cpu & + IRQ_USER_BOUND))) { + + cpu = apic_irq_table[i]->airq_cpu; + + cmn_err(CE_CONT, + "!%s: %s (%s) instance #%d " + "irq 0x%x vector 0x%x ioapic 0x%x " + "intin 0x%x is bound to cpu %d\n", + psm_name, + name, drv_name, instance, irq, + apic_irq_table[irq]->airq_vector, + ioapicid, intin, cpu); + return (cpu); + } + } + } + /* + * search for "drvname"_intpt_bind_cpus property first, the + * syntax of the property should be "a[,b,c,...]" where + * instance 0 binds to cpu a, instance 1 binds to cpu b, + * instance 3 binds to cpu c... + * ddi_getlongprop() will search /option first, then / + * if "drvname"_intpt_bind_cpus doesn't exist, then find + * intpt_bind_cpus property. 
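+ * (For example, a hypothetical driver "foo" with a property setting of
+ * foo_intpt_bind_cpus="1,3,5" would have its instances spread over
+ * CPUs 1, 3 and 5, wrapping around via the modulo arithmetic below.)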
The syntax is the same, and + * it applies to all the devices if its "drvname" specific + * property doesn't exist + */ + (void) strcpy(prop_name, drv_name); + (void) strcat(prop_name, "_intpt_bind_cpus"); + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name, + (caddr_t)&prop_val, &prop_len); + if (rc != DDI_PROP_SUCCESS) { + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, + "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len); + } + } + if (rc == DDI_PROP_SUCCESS) { + for (i = count = 0; i < (prop_len - 1); i++) + if (prop_val[i] == ',') + count++; + if (prop_val[i-1] != ',') + count++; + /* + * if somehow the binding instances defined in the + * property are not enough for this instno., then + * reuse the pattern for the next instance until + * it reaches the requested instno + */ + instno = instance % count; + i = 0; + cptr = prop_val; + while (i < instno) + if (*cptr++ == ',') + i++; + bind_cpu = stoi(&cptr); + kmem_free(prop_val, prop_len); + /* if specific CPU is bogus, then default to next cpu */ + if (!apic_cpu_in_range(bind_cpu)) { + cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", + psm_name, prop_name, prop_val, bind_cpu); + rc = DDI_PROP_NOT_FOUND; + } else { + /* indicate that we are bound at user request */ + bind_cpu |= IRQ_USER_BOUND; + } + /* + * no need to check apic_cpus[].aci_status, if specific CPU is + * not up, then post_cpu_start will handle it. + */ + } + if (rc != DDI_PROP_SUCCESS) { + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + bind_cpu = apic_get_next_bind_cpu(); + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } + + if (drv_name != NULL) + cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " + "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", + psm_name, name, drv_name, instance, irq, + apic_irq_table[irq]->airq_vector, ioapicid, intin, + bind_cpu & ~IRQ_USER_BOUND); + else + cmn_err(CE_CONT, "!%s: irq 0x%x " + "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", + psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid, + intin, bind_cpu & ~IRQ_USER_BOUND); + + return ((uint32_t)bind_cpu); +} + +/* + * Mark vector as being in the process of being deleted. Interrupts + * may still come in on some CPU. The moment an interrupt comes with + * the new vector, we know we can free the old one. Called only from + * addspl and delspl with interrupts disabled. Because an interrupt + * can be shared, but no interrupt from either device may come in, + * we also use a timeout mechanism, which we arbitrarily set to + * apic_revector_timeout microseconds. + */ +static void +apic_mark_vector(uchar_t oldvector, uchar_t newvector) +{ + ulong_t iflag; + + iflag = intr_clear(); + lock_set(&apic_revector_lock); + if (!apic_oldvec_to_newvec) { + apic_oldvec_to_newvec = + kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2, + KM_NOSLEEP); + + if (!apic_oldvec_to_newvec) { + /* + * This failure is not catastrophic. + * But, the oldvec will never be freed. 
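+ * (It simply remains allocated for the life of the system.)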
+ */ + apic_error |= APIC_ERR_MARK_VECTOR_FAIL; + lock_clear(&apic_revector_lock); + intr_restore(iflag); + return; + } + apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR]; + } + + /* See if we already did this for drivers which do double addintrs */ + if (apic_oldvec_to_newvec[oldvector] != newvector) { + apic_oldvec_to_newvec[oldvector] = newvector; + apic_newvec_to_oldvec[newvector] = oldvector; + apic_revector_pending++; + } + lock_clear(&apic_revector_lock); + intr_restore(iflag); + (void) timeout(apic_xlate_vector_free_timeout_handler, + (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout)); +} + +/* + * xlate_vector is called from intr_enter if revector_pending is set. + * It will xlate it if needed and mark the old vector as free. + */ +uchar_t +apic_xlate_vector(uchar_t vector) +{ + uchar_t newvector, oldvector = 0; + + lock_set(&apic_revector_lock); + /* Do we really need to do this ? */ + if (!apic_revector_pending) { + lock_clear(&apic_revector_lock); + return (vector); + } + if ((newvector = apic_oldvec_to_newvec[vector]) != 0) + oldvector = vector; + else { + /* + * The incoming vector is new . See if a stale entry is + * remaining + */ + if ((oldvector = apic_newvec_to_oldvec[vector]) != 0) + newvector = vector; + } + + if (oldvector) { + apic_revector_pending--; + apic_oldvec_to_newvec[oldvector] = 0; + apic_newvec_to_oldvec[newvector] = 0; + apic_free_vector(oldvector); + lock_clear(&apic_revector_lock); + /* There could have been more than one reprogramming! */ + return (apic_xlate_vector(newvector)); + } + lock_clear(&apic_revector_lock); + return (vector); +} + +void +apic_xlate_vector_free_timeout_handler(void *arg) +{ + ulong_t iflag; + uchar_t oldvector, newvector; + + oldvector = (uchar_t)(uintptr_t)arg; + iflag = intr_clear(); + lock_set(&apic_revector_lock); + if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) { + apic_free_vector(oldvector); + apic_oldvec_to_newvec[oldvector] = 0; + apic_newvec_to_oldvec[newvector] = 0; + apic_revector_pending--; + } + + lock_clear(&apic_revector_lock); + intr_restore(iflag); +} + +/* + * Bind interrupt corresponding to irq_ptr to bind_cpu. + * Must be called with interrupts disabled and apic_ioapic_lock held + */ +int +apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, + struct ioapic_reprogram_data *drep) +{ + int ioapicindex, intin_no; + uint32_t airq_temp_cpu; + apic_cpus_info_t *cpu_infop; + uint32_t rdt_entry; + int which_irq; + ioapic_rdt_t irdt; + + which_irq = apic_vector_to_irq[irq_ptr->airq_vector]; + + intin_no = irq_ptr->airq_intin_no; + ioapicindex = irq_ptr->airq_ioapicindex; + airq_temp_cpu = irq_ptr->airq_temp_cpu; + if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) { + if (airq_temp_cpu & IRQ_USER_BOUND) + /* Mask off high bit so it can be used as array index */ + airq_temp_cpu &= ~IRQ_USER_BOUND; + + ASSERT(apic_cpu_in_range(airq_temp_cpu)); + } + + /* + * Can't bind to a CPU that's not accepting interrupts: + */ + cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND]; + if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE)) + return (1); + + /* + * If we are about to change the interrupt vector for this interrupt, + * and this interrupt is level-triggered, attached to an IOAPIC, + * has been delivered to a CPU and that CPU has not handled it + * yet, we cannot reprogram the IOAPIC now. 
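+ * (That situation is detected via the RDT entry's Remote IRR bit in
+ * apic_check_stuck_interrupt() below, and the reprogramming is then
+ * deferred until the bit clears.)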
+ */ + if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, + intin_no); + + if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) && + apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu, + bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) { + + return (0); + } + + /* + * NOTE: We do not unmask the RDT here, as an interrupt MAY + * still come in before we have a chance to reprogram it below. + * The reprogramming below will simultaneously change and + * unmask the RDT entry. + */ + + if ((uint32_t)bind_cpu == IRQ_UNBOUND) { + irdt.ir_lo = AV_LDEST | AV_LOPRI | + irq_ptr->airq_rdt_entry; + + WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, + AV_TOALL); + + if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != + IRQ_UNBOUND) + apic_cpus[airq_temp_cpu].aci_temp_bound--; + + /* + * Write the vector, trigger, and polarity portion of + * the RDT + */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, + irdt.ir_lo); + + irq_ptr->airq_temp_cpu = IRQ_UNBOUND; + return (0); + } + } + + if (bind_cpu & IRQ_USER_BOUND) { + cpu_infop->aci_bound++; + } else { + cpu_infop->aci_temp_bound++; + } + ASSERT(apic_cpu_in_range(bind_cpu)); + + if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { + apic_cpus[airq_temp_cpu].aci_temp_bound--; + } + if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) { + + irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry; + irdt.ir_hi = cpu_infop->aci_local_id; + + /* Write the RDT entry -- bind to a specific CPU: */ + WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no, + irdt.ir_hi << APIC_ID_BIT_OFFSET); + + /* Write the vector, trigger, and polarity portion of the RDT */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no, + irdt.ir_lo); + + } else { + int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ? + DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX; + if (type == DDI_INTR_TYPE_MSI) { + if (irq_ptr->airq_ioapicindex == + irq_ptr->airq_origirq) { + /* first one */ + DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " + "apic_pci_msi_enable_vector\n")); + apic_pci_msi_enable_vector(irq_ptr, + type, which_irq, irq_ptr->airq_vector, + irq_ptr->airq_intin_no, + cpu_infop->aci_local_id); + } + if ((irq_ptr->airq_ioapicindex + + irq_ptr->airq_intin_no - 1) == + irq_ptr->airq_origirq) { /* last one */ + DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call " + "apic_pci_msi_enable_mode\n")); + apic_pci_msi_enable_mode(irq_ptr->airq_dip, + type, which_irq); + } + } else { /* MSI-X */ + apic_pci_msi_enable_vector(irq_ptr, type, + irq_ptr->airq_origirq, irq_ptr->airq_vector, 1, + cpu_infop->aci_local_id); + apic_pci_msi_enable_mode(irq_ptr->airq_dip, type, + irq_ptr->airq_origirq); + } + } + irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu; + apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND)); + return (0); +} + +static void +apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no) +{ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) + & AV_REMOTE_IRR) != 0) { + /* + * Trying to clear the bit through normal + * channels has failed. So as a last-ditch + * effort, try to set the trigger mode to + * edge, then to level. This has been + * observed to work on many systems. 
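+ * (Presumably the transient switch to edge mode causes the IOAPIC
+ * to discard its stale Remote IRR state for the pin.)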
+ */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & ~AV_LEVEL); + + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, + READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) | AV_LEVEL); + + /* + * If the bit's STILL set, this interrupt may + * be hosed. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_REMOTE_IRR) != 0) { + + prom_printf("%s: Remote IRR still " + "not clear for IOAPIC %d intin %d.\n" + "\tInterrupts to this pin may cease " + "functioning.\n", psm_name, ioapic_ix, + intin_no); +#ifdef DEBUG + apic_last_ditch_reprogram_failures++; +#endif + } + } +} + +/* + * This function is protected by apic_ioapic_lock coupled with the + * fact that interrupts are disabled. + */ +static void +delete_defer_repro_ent(int which_irq) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + ASSERT(LOCK_HELD(&apic_ioapic_lock)); + + if (apic_reprogram_info[which_irq].done) + return; + + apic_reprogram_info[which_irq].done = B_TRUE; + +#ifdef DEBUG + apic_defer_repro_total_retries += + apic_reprogram_info[which_irq].tries; + + apic_defer_repro_successes++; +#endif + + if (--apic_reprogram_outstanding == 0) { + + setlvlx = psm_intr_exit_fn(); + } +} + + +/* + * Interrupts must be disabled during this function to prevent + * self-deadlock. Interrupts are disabled because this function + * is called from apic_check_stuck_interrupt(), which is called + * from apic_rebind(), which requires its caller to disable interrupts. + */ +static void +add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu) +{ + ASSERT(which_irq >= 0); + ASSERT(which_irq <= 255); + ASSERT(!interrupts_enabled()); + + /* + * On the off-chance that there's already a deferred + * reprogramming on this irq, check, and if so, just update the + * CPU and irq pointer to which the interrupt is targeted, then return. + */ + if (!apic_reprogram_info[which_irq].done) { + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].irqp = irq_ptr; + return; + } + + apic_reprogram_info[which_irq].irqp = irq_ptr; + apic_reprogram_info[which_irq].bindcpu = new_bind_cpu; + apic_reprogram_info[which_irq].tries = 0; + /* + * This must be the last thing set, since we're not + * grabbing any locks, apic_try_deferred_reprogram() will + * make its decision about using this entry iff done + * is false. + */ + apic_reprogram_info[which_irq].done = B_FALSE; + + /* + * If there were previously no deferred reprogrammings, change + * setlvlx to call apic_try_deferred_reprogram() + */ + if (++apic_reprogram_outstanding == 1) { + + setlvlx = apic_try_deferred_reprogram; + } +} + +static void +apic_try_deferred_reprogram(int prev_ipl, int irq) +{ + int reproirq; + ulong_t iflag; + struct ioapic_reprogram_data *drep; + + (*psm_intr_exit_fn())(prev_ipl, irq); + + if (!lock_try(&apic_defer_reprogram_lock)) { + return; + } + + /* + * Acquire the apic_ioapic_lock so that any other operations that + * may affect the apic_reprogram_info state are serialized. + * It's still possible for the last deferred reprogramming to clear + * between the time we entered this function and the time we get to + * the for loop below. In that case, *setlvlx will have been set + * back to *_intr_exit and drep will be NULL. (There's no way to + * stop that from happening -- we would need to grab a lock before + * calling *setlvlx, which is neither realistic nor prudent). 
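+ * (If that happens, the scan below finds no pending entry and the
+ * call is simply counted as a spurious enter.)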
+ */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* + * For each deferred RDT entry, try to reprogram it now. Note that + * there is no lock acquisition to read apic_reprogram_info because + * '.done' is set only after the other fields in the structure are set. + */ + + drep = NULL; + for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) { + if (apic_reprogram_info[reproirq].done == B_FALSE) { + drep = &apic_reprogram_info[reproirq]; + break; + } + } + + /* + * Either we found a deferred action to perform, or + * we entered this function spuriously, after *setlvlx + * was restored to point to *_intr_exit. Any other + * permutation is invalid. + */ + ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn()); + + /* + * Though we can't really do anything about errors + * at this point, keep track of them for reporting. + * Note that it is very possible for apic_setup_io_intr + * to re-register this very timeout if the Remote IRR bit + * has not yet cleared. + */ + +#ifdef DEBUG + if (drep != NULL) { + if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) { + apic_deferred_setup_failures++; + } + } else { + apic_deferred_spurious_enters++; + } +#else + if (drep != NULL) + (void) apic_setup_io_intr(drep, reproirq, B_TRUE); +#endif + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + lock_clear(&apic_defer_reprogram_lock); +} + +static void +apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no) +{ + int waited; + + /* + * Wait for the delivery pending bit to clear. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & + (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) { + + /* + * If we're still waiting on the delivery of this interrupt, + * continue to wait here until it is delivered (this should be + * a very small amount of time, but include a timeout just in + * case). + */ + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_PENDING) == 0) { + break; + } + } + } +} + + +/* + * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR + * bit set. Calls functions that modify the function that setlvlx points to, + * so that the reprogramming can be retried very shortly. + * + * This function will mask the RDT entry if the interrupt is level-triggered. + * (The caller is responsible for unmasking the RDT entry.) + * + * Returns non-zero if the caller should defer IOAPIC reprogramming. + */ +static int +apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu, + int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq, + struct ioapic_reprogram_data *drep) +{ + int32_t rdt_entry; + int waited; + int reps = 0; + + /* + * Wait for the delivery pending bit to clear. + */ + do { + ++reps; + + apic_ioapic_wait_pending_clear(ioapic_ix, intin_no); + + /* + * Mask the RDT entry, but only if it's a level-triggered + * interrupt + */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no); + if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) { + + /* Mask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no, + AV_MASK | rdt_entry); + } + + if ((rdt_entry & AV_LEVEL) == AV_LEVEL) { + /* + * If there was a race and an interrupt was injected + * just before we masked, check for that case here. + * Then, unmask the RDT entry and try again. If we're + * on our last try, don't unmask (because we want the + * RDT entry to remain masked for the rest of the + * function). 
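+ * (While masked, a pending level interrupt cannot be delivered, so
+ * AV_PENDING would never clear; unmasking lets it drain before the
+ * next iteration.)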
+ */ + rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no); + if ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)) { + /* Unmask it */ + WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no, rdt_entry & ~AV_MASK); + } + } + + } while ((rdt_entry & AV_PENDING) && + (reps < apic_max_reps_clear_pending)); + +#ifdef DEBUG + if (rdt_entry & AV_PENDING) + apic_intr_deliver_timeouts++; +#endif + + /* + * If the remote IRR bit is set, then the interrupt has been sent + * to a CPU for processing. We have no choice but to wait for + * that CPU to process the interrupt, at which point the remote IRR + * bit will be cleared. + */ + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) & + (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) { + + /* + * If the CPU that this RDT is bound to is NOT the current + * CPU, wait until that CPU handles the interrupt and ACKs + * it. If this interrupt is not bound to any CPU (that is, + * if it's bound to the logical destination of "anyone"), it + * may have been delivered to the current CPU so handle that + * case by deferring the reprogramming (below). + */ + if ((old_bind_cpu != IRQ_UNBOUND) && + (old_bind_cpu != IRQ_UNINIT) && + (old_bind_cpu != psm_get_cpu_id())) { + for (waited = 0; waited < apic_max_reps_clear_pending; + waited++) { + if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, + intin_no) & AV_REMOTE_IRR) == 0) { + + delete_defer_repro_ent(which_irq); + + /* Remote IRR has cleared! */ + return (0); + } + } + } + + /* + * If we waited and the Remote IRR bit is still not cleared, + * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS + * times for this interrupt, try the last-ditch workaround: + */ + if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) { + + apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no); + + /* Mark this one as reprogrammed: */ + delete_defer_repro_ent(which_irq); + + return (0); + } else { +#ifdef DEBUG + apic_intr_deferrals++; +#endif + + /* + * If waiting for the Remote IRR bit (above) didn't + * allow it to clear, defer the reprogramming. + * Add a new deferred-programming entry if the + * caller passed a NULL one (and update the existing one + * in case anything changed). + */ + add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu); + if (drep) + drep->tries++; + + /* Inform caller to defer IOAPIC programming: */ + return (1); + } + + } + + /* Remote IRR is clear */ + delete_defer_repro_ent(which_irq); + + return (0); +} + +/* + * Called to migrate all interrupts at an irq to another cpu. + * Must be called with interrupts disabled and apic_ioapic_lock held + */ +int +apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu) +{ + apic_irq_t *irqptr = irq_ptr; + int retval = 0; + + while (irqptr) { + if (irqptr->airq_temp_cpu != IRQ_UNINIT) + retval |= apic_rebind(irqptr, bind_cpu, NULL); + irqptr = irqptr->airq_next; + } + + return (retval); +} + +/* + * apic_intr_redistribute does all the messy computations for identifying + * which interrupt to move to which CPU. Currently we do just one interrupt + * at a time. This reduces the time we spent doing all this within clock + * interrupt. When it is done in idle, we could do more than 1. + * First we find the most busy and the most free CPU (time in ISR only) + * skipping those CPUs that has been identified as being ineligible (cpu_skip) + * Then we look for IRQs which are closest to the difference between the + * most busy CPU and the average ISR load. 
We try to find one whose load
+ * is less than the difference. If none exists, then we choose one larger
+ * than the difference, provided it does not make the most idle CPU worse
+ * than the most busy one. In the end, we clear all the busy fields for
+ * CPUs. For IRQs, they are cleared as they are scanned.
+ */
+void
+apic_intr_redistribute(void)
+{
+ int busiest_cpu, most_free_cpu;
+ int cpu_free, cpu_busy, max_busy, min_busy;
+ int min_free, diff;
+ int average_busy, cpus_online;
+ int i, busy;
+ ulong_t iflag;
+ apic_cpus_info_t *cpu_infop;
+ apic_irq_t *min_busy_irq = NULL;
+ apic_irq_t *max_busy_irq = NULL;
+
+ busiest_cpu = most_free_cpu = -1;
+ cpu_free = cpu_busy = max_busy = average_busy = 0;
+ min_free = apic_sample_factor_redistribution;
+ cpus_online = 0;
+ /*
+ * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
+ * without ioapic_lock. That is OK as we are just doing statistical
+ * sampling anyway and any inaccuracy now will get corrected next time.
+ * The call to rebind, which actually changes things, will make sure
+ * we are consistent.
+ */
+ for (i = 0; i < apic_nproc; i++) {
+ if (apic_cpu_in_range(i) &&
+ !(apic_redist_cpu_skip & (1 << i)) &&
+ (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
+
+ cpu_infop = &apic_cpus[i];
+ /*
+ * If no unbound interrupts or only 1 total on this
+ * CPU, skip
+ */
+ if (!cpu_infop->aci_temp_bound ||
+ (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
+ == 1) {
+ apic_redist_cpu_skip |= 1 << i;
+ continue;
+ }
+
+ busy = cpu_infop->aci_busy;
+ average_busy += busy;
+ cpus_online++;
+ if (max_busy < busy) {
+ max_busy = busy;
+ busiest_cpu = i;
+ }
+ if (min_free > busy) {
+ min_free = busy;
+ most_free_cpu = i;
+ }
+ if (busy > apic_int_busy_mark) {
+ cpu_busy |= 1 << i;
+ } else {
+ if (busy < apic_int_free_mark)
+ cpu_free |= 1 << i;
+ }
+ }
+ }
+ if ((cpu_busy && cpu_free) ||
+ (max_busy >= (min_free + apic_diff_for_redistribution))) {
+
+ apic_num_imbalance++;
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf(
+ "redistribute busy=%x free=%x max=%x min=%x",
+ cpu_busy, cpu_free, max_busy, min_free);
+ }
+#endif /* DEBUG */
+
+
+ average_busy /= cpus_online;
+
+ diff = max_busy - average_busy;
+ min_busy = max_busy; /* start with the max possible value */
+ max_busy = 0;
+ min_busy_irq = max_busy_irq = NULL;
+ i = apic_min_device_irq;
+ for (; i <= apic_max_device_irq; i++) {
+ apic_irq_t *irq_ptr;
+ /* Change to linked list per CPU ? */
+ if ((irq_ptr = apic_irq_table[i]) == NULL)
+ continue;
+ /* Check for irq_busy & decide which one to move */
+ /* Also zero them for next round */
+ if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
+ irq_ptr->airq_busy) {
+ if (irq_ptr->airq_busy < diff) {
+ /*
+ * Check for least busy CPU,
+ * best fit or what ? 
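+ * (Current policy: among IRQs whose load fits under the
+ * differential, remember the busiest; it is the first
+ * choice for rebinding below.)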
+ */
+ if (max_busy < irq_ptr->airq_busy) {
+ /*
+ * Most busy within the
+ * required differential
+ */
+ max_busy = irq_ptr->airq_busy;
+ max_busy_irq = irq_ptr;
+ }
+ } else {
+ if (min_busy > irq_ptr->airq_busy) {
+ /*
+ * least busy, but more than
+ * the reqd diff
+ */
+ if (min_busy <
+ (diff + average_busy -
+ min_free)) {
+ /*
+ * Making sure new cpu
+ * will not end up
+ * worse
+ */
+ min_busy =
+ irq_ptr->airq_busy;
+
+ min_busy_irq = irq_ptr;
+ }
+ }
+ }
+ }
+ irq_ptr->airq_busy = 0;
+ }
+
+ if (max_busy_irq != NULL) {
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf("rebinding %x to %x",
+ max_busy_irq->airq_vector, most_free_cpu);
+ }
+#endif /* DEBUG */
+ iflag = intr_clear();
+ if (lock_try(&apic_ioapic_lock)) {
+ if (apic_rebind_all(max_busy_irq,
+ most_free_cpu) == 0) {
+ /* Make the change permanent */
+ max_busy_irq->airq_cpu =
+ (uint32_t)most_free_cpu;
+ }
+ lock_clear(&apic_ioapic_lock);
+ }
+ intr_restore(iflag);
+
+ } else if (min_busy_irq != NULL) {
+#ifdef DEBUG
+ if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
+ prom_printf("rebinding %x to %x",
+ min_busy_irq->airq_vector, most_free_cpu);
+ }
+#endif /* DEBUG */
+
+ iflag = intr_clear();
+ if (lock_try(&apic_ioapic_lock)) {
+ if (apic_rebind_all(min_busy_irq,
+ most_free_cpu) == 0) {
+ /* Make the change permanent */
+ min_busy_irq->airq_cpu =
+ (uint32_t)most_free_cpu;
+ }
+ lock_clear(&apic_ioapic_lock);
+ }
+ intr_restore(iflag);
+
+ } else {
+ if (cpu_busy != (1 << busiest_cpu)) {
+ apic_redist_cpu_skip |= 1 << busiest_cpu;
+ /*
+ * We leave cpu_skip set so that next time we
+ * can choose another cpu
+ */
+ }
+ }
+ apic_num_rebind++;
+ } else {
+ /*
+ * Found nothing. Could be that we skipped over valid CPUs
+ * or we have balanced everything. If we had a variable
+ * ticks_for_redistribution, it could be increased here.
+ * apic_int_busy, int_free etc would also need to be
+ * changed.
+ */
+ if (apic_redist_cpu_skip)
+ apic_redist_cpu_skip = 0;
+ }
+ for (i = 0; i < apic_nproc; i++) {
+ if (apic_cpu_in_range(i)) {
+ apic_cpus[i].aci_busy = 0;
+ }
+ }
+}
+
+void
+apic_cleanup_busy(void)
+{
+ int i;
+ apic_irq_t *irq_ptr;
+
+ for (i = 0; i < apic_nproc; i++) {
+ if (apic_cpu_in_range(i)) {
+ apic_cpus[i].aci_busy = 0;
+ }
+ }
+
+ for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
+ if ((irq_ptr = apic_irq_table[i]) != NULL)
+ irq_ptr->airq_busy = 0;
+ }
+}
+
+int
+apic_ioapic_method_probe()
+{
+ return (PSM_SUCCESS);
+}
diff --git a/usr/src/uts/i86xpv/io/psm/xpv_intr.c b/usr/src/uts/i86xpv/io/psm/xpv_intr.c
new file mode 100644
index 0000000000..99fc66866c
--- /dev/null
+++ b/usr/src/uts/i86xpv/io/psm/xpv_intr.c
@@ -0,0 +1,363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/clock.h>
+#include <sys/machlock.h>
+#include <sys/smp_impldefs.h>
+#include <sys/uadmin.h>
+#include <sys/promif.h>
+#include <sys/psm.h>
+#include <sys/psm_common.h>
+#include <sys/atomic.h>
+#include <sys/apic.h>
+#include <sys/archsystm.h>
+#include <sys/mach_intr.h>
+#include <sys/modctl.h>
+#include <sys/sysmacros.h>
+#include <sys/pci_intr_lib.h>
+
+
+/* Multiple vector support for MSI */
+int apic_multi_msi_enable = 1;
+
+/* Multiple vector support for MSI-X */
+int apic_msix_enable = 1;
+
+/*
+ * check whether the system supports MSI
+ *
+ * If PCI-E capability is found, then this must be a PCI-E system.
+ * Since MSI is required for a PCI-E system, return PSM_SUCCESS
+ * to indicate that this system supports MSI.
+ */
+int
+apic_check_msi_support()
+{
+ dev_info_t *cdip;
+ char dev_type[16];
+ int dev_len;
+
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
+
+ /*
+ * check whether the first-level children of root_node have
+ * PCI-E capability
+ */
+ for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
+ cdip = ddi_get_next_sibling(cdip)) {
+
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
+ " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
+ ddi_driver_name(cdip), ddi_binding_name(cdip),
+ ddi_node_name(cdip)));
+ dev_len = sizeof (dev_type);
+ if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
+ "device_type", (caddr_t)dev_type, &dev_len)
+ != DDI_PROP_SUCCESS)
+ continue;
+ if (strcmp(dev_type, "pciex") == 0)
+ return (PSM_SUCCESS);
+ }
+
+ /* MSI is not supported on this system */
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
+ "device_type found\n"));
+ return (PSM_FAILURE);
+}
+
+
+/*
+ * Find the apic_irq_t associated with the given dip, ispec and type.
+ */
+apic_irq_t *
+apic_find_irq(dev_info_t *dip, struct intrspec *ispec, int type)
+{
+ apic_irq_t *irqp;
+ int i;
+
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_find_irq: dip=0x%p vec=0x%x "
+ "ipl=0x%x type=0x%x\n", (void *)dip, ispec->intrspec_vec,
+ ispec->intrspec_pri, type));
+
+ for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
+ for (irqp = apic_irq_table[i]; irqp; irqp = irqp->airq_next) {
+ if ((irqp->airq_dip == dip) &&
+ (irqp->airq_origirq == ispec->intrspec_vec) &&
+ (irqp->airq_ipl == ispec->intrspec_pri)) {
+ if (type == DDI_INTR_TYPE_MSI) {
+ if (irqp->airq_mps_intr_index ==
+ MSI_INDEX)
+ return (irqp);
+ } else if (type == DDI_INTR_TYPE_MSIX) {
+ if (irqp->airq_mps_intr_index ==
+ MSIX_INDEX)
+ return (irqp);
+ } else
+ return (irqp);
+ }
+ }
+ }
+ DDI_INTR_IMPLDBG((CE_CONT, "apic_find_irq: return NULL\n"));
+ return (NULL);
+}
+
+
+int
+apic_get_vector_intr_info(int vecirq, apic_get_intr_t *intr_params_p)
+{
+ struct autovec *av_dev;
+ uchar_t irqno;
+ int i;
+ apic_irq_t *irq_p;
+
+ /* Sanity check the vector/irq argument. */
+ ASSERT((vecirq >= 0) && (vecirq <= APIC_MAX_VECTOR));
+
+ mutex_enter(&airq_mutex);
+
+ /*
+ * Convert the vecirq arg to an irq using vector_to_irq table
+ * if the arg is a vector. Pass thru if already an irq. 
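+ * (The PSMGI_INTRBY_VEC flag in avgi_req_flags tells us which form
+ * the caller passed.)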
+ */ + if ((intr_params_p->avgi_req_flags & PSMGI_INTRBY_FLAGS) == + PSMGI_INTRBY_VEC) + irqno = apic_vector_to_irq[vecirq]; + else + irqno = vecirq; + + irq_p = apic_irq_table[irqno]; + + if ((irq_p == NULL) || + ((irq_p->airq_mps_intr_index != RESERVE_INDEX) && + ((irq_p->airq_temp_cpu == IRQ_UNBOUND) || + (irq_p->airq_temp_cpu == IRQ_UNINIT)))) { + mutex_exit(&airq_mutex); + return (PSM_FAILURE); + } + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_CPUID) { + + /* Get the (temp) cpu from apic_irq table, indexed by irq. */ + intr_params_p->avgi_cpu_id = irq_p->airq_temp_cpu; + + /* Return user bound info for intrd. */ + if (intr_params_p->avgi_cpu_id & IRQ_USER_BOUND) { + intr_params_p->avgi_cpu_id &= ~IRQ_USER_BOUND; + intr_params_p->avgi_cpu_id |= PSMGI_CPU_USER_BOUND; + } + } + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_VECTOR) + intr_params_p->avgi_vector = irq_p->airq_vector; + + if (intr_params_p->avgi_req_flags & + (PSMGI_REQ_NUM_DEVS | PSMGI_REQ_GET_DEVS)) + /* Get number of devices from apic_irq table shared field. */ + intr_params_p->avgi_num_devs = irq_p->airq_share; + + if (intr_params_p->avgi_req_flags & PSMGI_REQ_GET_DEVS) { + + intr_params_p->avgi_req_flags |= PSMGI_REQ_NUM_DEVS; + + /* Some devices have NULL dip. Don't count these. */ + if (intr_params_p->avgi_num_devs > 0) { + for (i = 0, av_dev = autovect[irqno].avh_link; + av_dev; av_dev = av_dev->av_link) + if (av_dev->av_vector && av_dev->av_dip) + i++; + intr_params_p->avgi_num_devs = + MIN(intr_params_p->avgi_num_devs, i); + } + + /* There are no viable dips to return. */ + if (intr_params_p->avgi_num_devs == 0) + intr_params_p->avgi_dip_list = NULL; + + else { /* Return list of dips */ + + /* Allocate space in array for that number of devs. */ + intr_params_p->avgi_dip_list = kmem_zalloc( + intr_params_p->avgi_num_devs * + sizeof (dev_info_t *), + KM_SLEEP); + + /* + * Loop through the device list of the autovec table + * filling in the dip array. + * + * Note that the autovect table may have some special + * entries which contain NULL dips. These will be + * ignored. 
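+ * (avgi_num_devs was already trimmed with MIN() above to count only
+ * entries with both a handler and a dip, so the list cannot overflow.)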
+ */ + for (i = 0, av_dev = autovect[irqno].avh_link; + av_dev; av_dev = av_dev->av_link) + if (av_dev->av_vector && av_dev->av_dip) + intr_params_p->avgi_dip_list[i++] = + av_dev->av_dip; + } + } + + mutex_exit(&airq_mutex); + + return (PSM_SUCCESS); +} + + +/* + * apic_pci_msi_enable_vector: + * Set the address/data fields in the MSI/X capability structure + * XXX: MSI-X support + */ +/* ARGSUSED */ +void +apic_pci_msi_enable_vector(apic_irq_t *irq_ptr, int type, int inum, int vector, + int count, int target_apic_id) +{ + uint64_t msi_addr, msi_data; + ushort_t msi_ctrl; + dev_info_t *dip = irq_ptr->airq_dip; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip); + + DDI_INTR_IMPLDBG((CE_CONT, "apic_pci_msi_enable_vector: dip=0x%p\n" + "\tdriver = %s, inum=0x%x vector=0x%x apicid=0x%x\n", (void *)dip, + ddi_driver_name(dip), inum, vector, target_apic_id)); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + /* MSI Address */ + msi_addr = (MSI_ADDR_HDR | + (target_apic_id << MSI_ADDR_DEST_SHIFT)); + msi_addr |= ((MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | + (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT)); + + /* MSI Data: MSI is edge triggered according to spec */ + msi_data = ((MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | vector); + + DDI_INTR_IMPLDBG((CE_CONT, "apic_pci_msi_enable_vector: addr=0x%lx " + "data=0x%lx\n", (long)msi_addr, (long)msi_data)); + + if (type == DDI_INTR_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + + /* Set the bits to inform how many MSIs are enabled */ + msi_ctrl |= ((highbit(count) -1) << PCI_MSI_MME_SHIFT); + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + } +} + + +/* + * apic_pci_msi_disable_mode: + */ +void +apic_pci_msi_disable_mode(dev_info_t *rdip, int type) +{ + ushort_t msi_ctrl; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + if (type == DDI_INTR_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + if (!(msi_ctrl & PCI_MSI_ENABLE_BIT)) + return; + + msi_ctrl &= ~PCI_MSI_ENABLE_BIT; /* MSI disable */ + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + + } else if (type == DDI_INTR_TYPE_MSIX) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL); + if (msi_ctrl & PCI_MSIX_ENABLE_BIT) { + msi_ctrl &= ~PCI_MSIX_ENABLE_BIT; + pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL, + msi_ctrl); + } + } +} + + +/* + * apic_pci_msi_enable_mode: + */ +void +apic_pci_msi_enable_mode(dev_info_t *rdip, int type, int inum) +{ + ushort_t msi_ctrl; + int cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip); + ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(rdip); + + ASSERT((handle != NULL) && (cap_ptr != 0)); + + if (type == DDI_INTR_TYPE_MSI) { + msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL); + if ((msi_ctrl & PCI_MSI_ENABLE_BIT)) + return; + + msi_ctrl |= PCI_MSI_ENABLE_BIT; + pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl); + + } else if (type == DDI_INTR_TYPE_MSIX) { + uintptr_t off; + uint32_t mask; + ddi_intr_msix_t *msix_p; + + msix_p = i_ddi_get_msix(rdip); + + ASSERT(msix_p != NULL); + + /* Offset into "inum"th entry in the MSI-X table & clear mask */ + off = (uintptr_t)msix_p->msix_tbl_addr + (inum * + PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET; + + mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off); + + ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask & ~1)); + + 
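+ /*
+ * With the per-vector mask bit now clear, make sure the
+ * function-wide MSI-X enable bit in the message control
+ * register is set as well; the device will not signal this
+ * vector until both are in place.
+ */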
+ msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
+
+ if (!(msi_ctrl & PCI_MSIX_ENABLE_BIT)) {
+ msi_ctrl |= PCI_MSIX_ENABLE_BIT;
+ pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
+ msi_ctrl);
+ }
+ }
+}
+
+
+/*
+ * We let the hypervisor deal with MSI configuration
+ * so just stub this out.
+ */
+
+/* ARGSUSED */
+void
+apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
+{
+}
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index 7c0adae494..6953d15bb5 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1411,6 +1411,15 @@ fcnname/**/_info: \
 END_MODULE(elfexec);
 #endif
 
+/*
+ * Stub(s) for APIX module.
+ */
+#ifndef APIX_MODULE
+ MODULE(apix,mach);
+ WSTUB(apix, apix_loaded, nomod_zero);
+ END_MODULE(apix);
+#endif
+
 / this is just a marker for the area of text that contains stubs
 
 ENTRY_NP(stubs_end)
diff --git a/usr/src/uts/intel/ia32/sys/traptrace.h b/usr/src/uts/intel/ia32/sys/traptrace.h
index dc83c20187..3183f2f580 100644
--- a/usr/src/uts/intel/ia32/sys/traptrace.h
+++ b/usr/src/uts/intel/ia32/sys/traptrace.h
@@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
 
 #ifndef _IA32_SYS_TRAPTRACE_H
@@ -64,6 +63,7 @@ typedef struct {
 greg_t ttr_cr2;
 union _ttr_info {
 struct _idt_entry {
+ int cpuid;
 short vector;
 uchar_t ipl;
 uchar_t spl;
@@ -81,6 +81,7 @@ typedef struct {
 pc_t ttr_stack[TTR_STACK_DEPTH];
 } trap_trace_rec_t;
 
+#define ttr_cpuid ttr_info.idt_entry.cpuid
 #define ttr_vector ttr_info.idt_entry.vector
 #define ttr_ipl ttr_info.idt_entry.ipl
 #define ttr_spl ttr_info.idt_entry.spl
diff --git a/usr/src/uts/intel/io/pci/pci_boot.c b/usr/src/uts/intel/io/pci/pci_boot.c
index 850a5124d6..a0dcdaa7ef 100644
--- a/usr/src/uts/intel/io/pci/pci_boot.c
+++ b/usr/src/uts/intel/io/pci/pci_boot.c
@@ -99,6 +99,7 @@ struct pci_devfunc {
 boolean_t reprogram; /* this device needs to be reprogrammed */
 };
 
+extern int apic_nvidia_io_max;
 extern int pseudo_isa;
 extern int pci_bios_maxbus;
 static uchar_t max_dev_pci = 32; /* PCI standard */
@@ -2058,15 +2059,17 @@ process_devfunc(uchar_t bus, uchar_t dev, uchar_t func, uchar_t header,
 pci_bus_res[bus].privdata = entry;
 }
 
- if (config_op == CONFIG_INFO &&
- IS_CLASS_IOAPIC(basecl, subcl, progcl)) {
+ if (IS_CLASS_IOAPIC(basecl, subcl, progcl)) {
 create_ioapic_node(bus, dev, func, vendorid, deviceid);
 }
 
- /* check for ck8-04 based PCI ISA bridge only */
+ /* check for NVIDIA CK8-04/MCP55 based LPC bridge */
 if (NVIDIA_IS_LPC_BRIDGE(vendorid, deviceid) && (dev == 1) &&
- (func == 0))
+ (func == 0)) {
 add_nvidia_isa_bridge_props(dip, bus, dev, func);
+ /* each LPC bridge has an integrated IOAPIC */
+ apic_nvidia_io_max++;
+ }
 
 if (pciex && is_pci_bridge)
 (void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, "model",
diff --git a/usr/src/uts/intel/os/mach b/usr/src/uts/intel/os/mach
index 4080a6bd2f..ccec5ddf21 100644
--- a/usr/src/uts/intel/os/mach
+++ b/usr/src/uts/intel/os/mach
@@ -1,7 +1,3 @@
-#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
@@ -21,8 +17,18 @@
 #
 # CDDL HEADER END
 #
-#pragma ident "%Z%%M% %I% %E% SMI"
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+#
+# CAUTION! 
The order of modules specified here is very important. If the
+# order is not correct it can result in unexpected system behavior. The
+# loading of modules is in the reverse order specified here (i.e. the last
+# entry is loaded first and the first entry loaded last).
+#
 pcplusmp
+apix
 xpv_psm
