131 files changed, 22855 insertions, 2347 deletions
diff --git a/usr/src/Makefile.lint b/usr/src/Makefile.lint
index 90b76afe47..2182a61ff9 100644
--- a/usr/src/Makefile.lint
+++ b/usr/src/Makefile.lint
@@ -477,11 +477,13 @@ $(CLOSED_BUILD)COMMON_SUBDIRS += \
 	$(CLOSED)/lib/libc_i18n
 
 i386_SUBDIRS= \
+	cmd/acpihpd \
 	cmd/biosdev \
 	cmd/rtc \
 	cmd/ucodeadm \
 	lib/brand/lx \
 	lib/cfgadm_plugins/sata \
+	lib/cfgadm_plugins/sbd \
 	lib/libfdisk
 
 sparc_SUBDIRS= \
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 2d0ce5e6d5..b493fa6ef1 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -476,6 +476,7 @@ ODE
 	$(CLOSED)/cmd/tail \
 	$(CLOSED)/cmd/tr_xpg4
 
 i386_SUBDIRS= \
+	acpihpd \
 	addbadsec \
 	biosdev \
 	diskscan \
diff --git a/usr/src/cmd/acpihpd/Makefile b/usr/src/cmd/acpihpd/Makefile
new file mode 100644
index 0000000000..33a26fb2be
--- /dev/null
+++ b/usr/src/cmd/acpihpd/Makefile
@@ -0,0 +1,62 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# Copyright (c) 2010, Intel Corporation.
+# All rights reserved.
+#
+# cmd/acpihpd/Makefile
+#
+
+PLATFORM = i86pc
+MANIFEST = acpihpd.xml
+SVCMETHOD = svc-acpihpd
+
+include ../Makefile.cmd
+
+$(64ONLY)SUBDIRS= $(MACH)
+
+ROOTMANIFESTDIR = $(ROOTSVCPLATFORM)/${PLATFORM}
+
+all	:= TARGET = all
+install	:= TARGET = install
+clean	:= TARGET = clean
+clobber	:= TARGET = clobber
+lint	:= TARGET = lint
+
+.KEEP_STATE:
+
+all: $(SUBDIRS)
+
+install: $(SUBDIRS) $(ROOTMANIFEST) $(ROOTSVCMETHOD)
+
+clean clobber lint: $(SUBDIRS)
+
+check: $(CHKMANIFEST)
+
+$(SUBDIRS): FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/acpihpd/Makefile.com b/usr/src/cmd/acpihpd/Makefile.com
new file mode 100644
index 0000000000..287a48d896
--- /dev/null
+++ b/usr/src/cmd/acpihpd/Makefile.com
@@ -0,0 +1,46 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# Copyright (c) 2010, Intel Corporation.
+# All rights reserved.
+#
+# cmd/acpihpd/Makefile.com
+#
+
+LIBS = acpihpd
+PLATFORM = i86pc
+
+include ../../Makefile.cmd
+include ../../../Makefile.psm
+
+OBJS = acpihpd.o notify.o
+SRCS = $(OBJS:%.o=../%.c)
+LINTFILES = $(SRCS:%.c=%.ln)
+CLOBBERFILES = $(LIBS)
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS += -I$(USR_PSM_INCL_DIR)
+LDLIBS += -lsysevent -lnvpair -lcfgadm
+
+FILEMODE = 0755
diff --git a/usr/src/cmd/acpihpd/Makefile.targ b/usr/src/cmd/acpihpd/Makefile.targ
new file mode 100644
index 0000000000..89da583847
--- /dev/null
+++ b/usr/src/cmd/acpihpd/Makefile.targ
@@ -0,0 +1,62 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# Copyright (c) 2010, Intel Corporation.
+# All rights reserved.
+#
+# cmd/acpihpd/Makefile.targ
+#
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+clean:
+	$(RM) $(OBJS)
+
+lint:
+	$(LINT.c) $(SRCS) $(LDLIBS)
+
+$(LIBS): $(OBJS)
+	$(LINK.c) $(OBJS) -o $@ $(LDLIBS)
+	$(POST_PROCESS)
+
+%.o: ../%.c
+	$(COMPILE.c) $<
+	$(POST_PROCESS_O)
+
+$(USR_PLAT_DIR):
+	-$(INS.dir)
+
+$(USR_PSM_DIR): $(USR_PLAT_DIR)
+	-$(INS.dir)
+
+$(USR_PSM_LIB_DIR): $(USR_PSM_DIR)
+	-$(INS.dir)
+
+$(USR_PSM_LIB_DIR)/%: % $(USR_PSM_LIB_DIR)
+	$(INS.file)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/acpihpd/acpihpd.c b/usr/src/cmd/acpihpd/acpihpd.c
new file mode 100644
index 0000000000..6beee85f54
--- /dev/null
+++ b/usr/src/cmd/acpihpd/acpihpd.c
@@ -0,0 +1,325 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dr.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <syslog.h>
+#include <string.h>
+#include <strings.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <config_admin.h>
+#include <libscf.h>
+#include <libsysevent.h>
+#include <stdarg.h>
+
+/* Signal handler type */
+typedef void (sig_handler_t)(int);
+
+#define	ACPIHPD_PID_FILE	"/var/run/acpihpd.pid"	/* lock file path */
+
+/* Program Name */
+char *g_prog_name;
+int g_debuglevel = 0;
+
+static int s_pid_fd;
+static sysevent_handle_t *s_acpihpd_hdl;
+
+static int daemon_init(void);
+static void daemon_quit(int);
+static int set_sig_handler(int, sig_handler_t *);
+static int acpihpd_init(void);
+static void acpihpd_fini(void);
+static void acpihpd_event(sysevent_t *);
+extern void notify_hotplug(sysevent_t *ev);
+void debug_print(int, const char *, ...);
+
+int
+main(int argc, char *argv[])
+{
+	int c;
+
+	/* Get Program Name */
+	if ((g_prog_name = strrchr(argv[0], '/')) == NULL) {
+		g_prog_name = argv[0];
+	} else {
+		g_prog_name++;
+	}
+
+	while ((c = getopt(argc, argv, ":d:")) != -1) {
+		switch (c) {
+		case 'd':
+			g_debuglevel = atoi(optarg);
+			if ((g_debuglevel < 0) || (g_debuglevel > 2)) {
+				g_debuglevel = 0;
+			}
+			break;
+
+		case ':':
+			syslog(LOG_ERR,
+			    "missed argument for option %c.", optopt);
+			break;
+
+		case '?':
+			syslog(LOG_ERR, "unrecognized option %c.", optopt);
+			break;
+		}
+	}
+
+	s_acpihpd_hdl = NULL;
+
+	/* Check the daemon running lock and initialize the signal */
+	if (daemon_init() != 0) {
+		debug_print(0, "%s could not startup!", g_prog_name);
+		exit(SMF_EXIT_ERR_FATAL);
+	}
+
+	/* Subscribe to the hotplug event */
+	if (acpihpd_init() != 0) {
+		debug_print(0, "%s could not startup!", g_prog_name);
+		daemon_quit(SMF_EXIT_ERR_FATAL);
+	}
+
+	debug_print(2, "daemon is running.");
+	/*CONSTCOND*/
+	while (1) {
+		(void) pause();
+	}
+
+	return (SMF_EXIT_OK);
+}
+
+static int
+daemon_init(void)
+{
+	int i, ret;
+	pid_t pid;
+	char pid_str[32];
+
+	if (geteuid() != 0) {
+		debug_print(0, "must be root to execute %s", g_prog_name);
+		return (1);
+	}
+
+	if ((pid = fork()) < 0) {
+		return (1);
+	}
+
+	if (pid > 0) {
+		/* Parent to exit. */
+		exit(SMF_EXIT_OK);
+	}
+
+	(void) setsid();
+	(void) chdir("/");
+	(void) umask(0);
+	(void) closefrom(0);
+	(void) open("/dev/null", O_RDONLY);
+	(void) open("/dev/null", O_WRONLY);
+	(void) dup(1);
+	(void) openlog(g_prog_name, LOG_PID, LOG_DAEMON);
+
+	/*
+	 * Create the lock file for singleton
+	 */
+	if ((s_pid_fd = open(ACPIHPD_PID_FILE, O_RDWR | O_CREAT, 0644)) < 0) {
+		debug_print(0, "could not create pid file: %s",
+		    strerror(errno));
+		return (1);
+	}
+
+	if (lockf(s_pid_fd, F_TLOCK, 0L) < 0) {
+		if (errno == EACCES || errno == EAGAIN) {
+			debug_print(0, "another acpihpd is already running");
+		} else {
+			debug_print(0, "could not lock pid file");
+		}
+
+		return (1);
+	}
+
+	(void) ftruncate(s_pid_fd, 0);
+	i = sprintf(pid_str, "%ld", (long)getpid());
+	while ((ret = write(s_pid_fd, pid_str, i)) != i) {
+		if (errno == EINTR) {
+			continue;
+		}
+		if (ret < 0) {
+			debug_print(0, "pid file write failed: %s",
+			    strerror(errno));
+			return (1);
+		}
+	}
+
+	if (set_sig_handler(SIGTERM, (sig_handler_t *)daemon_quit) != 0) {
+		debug_print(2, "could not set signal handler(SIGTERM)");
+		return (1);
+	}
+
+	if (set_sig_handler(SIGQUIT, (sig_handler_t *)daemon_quit) != 0) {
+		debug_print(2, "could not set signal handler(SIGQUIT)");
+		return (1);
+	}
+
+	if (set_sig_handler(SIGINT, (sig_handler_t *)daemon_quit) != 0) {
+		debug_print(2, "could not set signal handler(SIGINT)");
+		return (1);
+	}
+
+	if (set_sig_handler(SIGCHLD, SIG_IGN) != 0) {
+		debug_print(2, "could not set signal handler(SIGCHLD)");
+		return (1);
+	}
+
+	return (0);
+}
+
+static void
+daemon_quit(int signo)
+{
+	int status = 0;
+	id_t pgid;
+
+	debug_print(1, "daemon quit [signal#:%d].", signo);
+
+	acpihpd_fini();
+	(void) set_sig_handler(SIGTERM, SIG_IGN);
+	pgid = getpgrp();
+	(void) kill(-pgid, SIGTERM);
+	(void) close(s_pid_fd);
+	(void) unlink(ACPIHPD_PID_FILE);
+
+	if (signo < 0) {
+		status = signo;
+	}
+	_exit(status);
+}
+
+static int
+set_sig_handler(int sig, sig_handler_t *handler)
+{
+	struct sigaction act;
+
+	act.sa_handler = handler;
+	act.sa_flags = 0;
+	if (sig == SIGCHLD && handler == SIG_IGN) {
+		act.sa_flags |= SA_NOCLDWAIT;
+	}
+
+	(void) sigemptyset(&act.sa_mask);
+	if (sigaction(sig, &act, NULL) < 0) {
+		return (1);
+	}
+
+	return (0);
+}
+
+static int
+acpihpd_init(void)
+{
+	const char *subclass = ESC_DR_REQ;
+
+	debug_print(2, "acpihpd_init");
+
+	if ((s_acpihpd_hdl = sysevent_bind_handle(acpihpd_event)) == NULL) {
+		debug_print(2, "could not bind to sysevent.");
+		return (-1);
+	}
+
+	if (sysevent_subscribe_event(s_acpihpd_hdl, EC_DR, &subclass, 1) != 0) {
+		debug_print(2, "could not subscribe an event.");
+		sysevent_unbind_handle(s_acpihpd_hdl);
+		s_acpihpd_hdl = NULL;
+		return (-1);
+	}
+
+	return (0);
+}
+
+static void
+acpihpd_fini(void)
+{
+	debug_print(2, "acpihpd_fini");
+
+	if (s_acpihpd_hdl != NULL) {
+		sysevent_unsubscribe_event(s_acpihpd_hdl, EC_DR);
+		sysevent_unbind_handle(s_acpihpd_hdl);
+	}
+}
+
+static void
+acpihpd_event(sysevent_t *ev)
+{
+	debug_print(2, "*** got an event ***");
+
+	/* Inform cfgadm of the hot-plug event. */
+	notify_hotplug(ev);
+}
+
+void
+debug_print(int level, const char *fmt, ...)
+{
+	va_list ap;
+	int pri, pr_out = 0;
+
+	if (level <= g_debuglevel) {
+		switch (level) {
+		case 0:
+			pri = LOG_ERR;
+			pr_out = 1;
+			break;
+
+		case 1:
+			pri = LOG_NOTICE;
+			pr_out = 1;
+			break;
+
+		case 2:
+			pri = LOG_DEBUG;
+			pr_out = 1;
+			break;
+		}
+
+		if (pr_out) {
+			va_start(ap, fmt);
+			vsyslog(pri, fmt, ap);
+			va_end(ap);
+		}
+	}
+}
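/*
 * The daemon above reduces to a small libsysevent pattern: bind a
 * delivery handle to a callback, then subscribe to class EC_DR with
 * subclass ESC_DR_REQ.  A minimal standalone consumer of the same
 * interfaces is sketched below for reference; the handler name and
 * the pause() loop are illustrative, not part of the patch.
 */
#include <libsysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dr.h>
#include <stdio.h>
#include <unistd.h>

static void
dr_req_handler(sysevent_t *ev)
{
	/* Runs on a libsysevent delivery thread, one call per event. */
	(void) printf("DR event: class=%s subclass=%s\n",
	    sysevent_get_class_name(ev), sysevent_get_subclass_name(ev));
}

int
main(void)
{
	const char *subclass = ESC_DR_REQ;
	sysevent_handle_t *hdl;

	if ((hdl = sysevent_bind_handle(dr_req_handler)) == NULL)
		return (1);
	if (sysevent_subscribe_event(hdl, EC_DR, &subclass, 1) != 0) {
		sysevent_unbind_handle(hdl);
		return (1);
	}
	for (;;)
		(void) pause();		/* events arrive asynchronously */
	/* NOTREACHED */
}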
+{ + va_list ap; + int pri, pr_out = 0; + + if (level <= g_debuglevel) { + switch (level) { + case 0: + pri = LOG_ERR; + pr_out = 1; + break; + + case 1: + pri = LOG_NOTICE; + pr_out = 1; + break; + + case 2: + pri = LOG_DEBUG; + pr_out = 1; + break; + } + + if (pr_out) { + va_start(ap, fmt); + vsyslog(pri, fmt, ap); + va_end(ap); + } + } +} diff --git a/usr/src/cmd/acpihpd/acpihpd.xml b/usr/src/cmd/acpihpd/acpihpd.xml new file mode 100644 index 0000000000..e8e799de08 --- /dev/null +++ b/usr/src/cmd/acpihpd/acpihpd.xml @@ -0,0 +1,93 @@ +<?xml version="1.0"?> +<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1"> +<!-- + + Copyright (c) 2010, Intel Corporation. + All rights reserved. + + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. + See the License for the specific language governing permissions + and limitations under the License. + + When distributing Covered Code, include this CDDL HEADER in each + file and include the License file at usr/src/OPENSOLARIS.LICENSE. + If applicable, add the following below this CDDL HEADER, with the + fields enclosed by brackets "[]" replaced with your own identifying + information: Portions Copyright [yyyy] [name of copyright owner] + + CDDL HEADER END + + Note: + NOTE: This service manifest is not editable; its contents will + be overwritten by package or patch operations, including + operating system upgrade. Make customizations in a different + file. +--> + +<service_bundle type='manifest' name='SUNWacpihpd:acpihpd'> + +<service + name='platform/i86pc/acpihpd' + type='service' + version='1'> + + <create_default_instance enabled='true' /> + + <single_instance/> + + <!-- + acpihpd should be started after syseventd is running. + --> + <dependency + name='startup_req' + grouping='require_all' + restart_on='none' + type='service'> + <service_fmri value='svc:/system/sysevent' /> + </dependency> + + <exec_method + type='method' + name='start' + exec='/lib/svc/method/svc-acpihpd' + timeout_seconds='60' > + <method_context> + <method_credential user='root' group='root' + privileges='basic,!proc_info,sys_config,sys_mount,sys_devices' /> + </method_context> + </exec_method> + + <exec_method + type='method' + name='stop' + exec=':kill' + timeout_seconds='60' > + <method_context> + <method_credential user='root' group='root' /> + </method_context> + </exec_method> + + <stability value='Unstable' /> + + <template> + <common_name> + <loctext xml:lang='C'> + Intel ACPI hot-plug daemon + </loctext> + </common_name> + <documentation> + <manpage title='acpihpd' section='1M' + manpath='/usr/share/man' /> + </documentation> + </template> +</service> + +</service_bundle> + diff --git a/usr/src/cmd/acpihpd/i386/Makefile b/usr/src/cmd/acpihpd/i386/Makefile new file mode 100644 index 0000000000..4356b38b82 --- /dev/null +++ b/usr/src/cmd/acpihpd/i386/Makefile @@ -0,0 +1,37 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2010, Intel Corporation. +# All rights reserved. +# +# cmd/acpihpd/i386/Makefile +# + +include ../Makefile.com + +.KEEP_STATE: + +install: all $(USR_PSM_LIBS) + +include ../Makefile.targ diff --git a/usr/src/cmd/acpihpd/notify.c b/usr/src/cmd/acpihpd/notify.c new file mode 100644 index 0000000000..138b1a233d --- /dev/null +++ b/usr/src/cmd/acpihpd/notify.c @@ -0,0 +1,353 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <config_admin.h> +#include <strings.h> +#include <syslog.h> +#include <libsysevent.h> +#include <libdevinfo.h> +#include <libnvpair.h> +#include <assert.h> +#include <errno.h> +#include <unistd.h> +#include <stropts.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dr.h> +#include <sys/sbd_ioctl.h> +#include <sys/acpidev.h> + +#define PMCONFIG_PATH "/usr/sbin/pmconfig" + +#define CFGADM_CMD_ASSIGN "assign" +#define CFGADM_CMD_POWERON "poweron" +#define CFGADM_CMD_PASSTHRU "passthru" + +#define STATUS_INPROGRESS 0 +#define STATUS_SUCCESS 1 +#define STATUS_FAILURE 2 +#define STATUS_NOOP 3 + +static char *s_status_array[] = { + ACPIDEV_CMD_OST_INPROGRESS, + ACPIDEV_CMD_OST_SUCCESS, + ACPIDEV_CMD_OST_FAILURE, + ACPIDEV_CMD_OST_NOOP +}; + +extern void debug_print(int, const char *, ...); + +/*ARGSUSED*/ +static int +confirm_no(void *appdata_ptr, const char *message) +{ + return (0); +} + +/*ARGSUSED*/ +static int +message_output(void *appdata_ptr, const char *message) +{ + debug_print(2, "cfgadm message: %s", message); + return (CFGA_OK); +} + +static char * +plat_opt_str_alloc(int cmd, char *acpi_event_type, int status) +{ + char *opt; + size_t len; + + if (cmd == SBD_CMD_PASSTHRU) { + len = strlen(s_status_array[status]) + + strlen(ACPIDEV_EVENT_TYPE_ATTR_NAME) + + strlen(acpi_event_type) + 10; + if ((opt = malloc(len)) != NULL) { + (void) snprintf(opt, len, "%s %s=%s", + s_status_array[status], + ACPIDEV_EVENT_TYPE_ATTR_NAME, + acpi_event_type); + debug_print(2, "plat_opt_str_alloc = '%s'", opt); + } + } else { + len = strlen("platform=''") + + strlen(s_status_array[status]) + + strlen(ACPIDEV_EVENT_TYPE_ATTR_NAME) + + strlen(acpi_event_type) + 10; + if ((opt = malloc(len)) != NULL) { + (void) snprintf(opt, len, "platform='%s %s=%s'", + s_status_array[status], + ACPIDEV_EVENT_TYPE_ATTR_NAME, + acpi_event_type); + debug_print(2, "plat_opt_str_alloc = '%s'", opt); + } + } + + return (opt); +} + +static int +cfgadm_cmd_wrapper(int cmd, int apid_num, char **apids, + char *acpi_event_type, int status, + struct cfga_confirm *confirm, struct cfga_msg *message) +{ + cfga_err_t ret; + char *plat_opts; + char *estrp = NULL; + + assert(apid_num == 1); + assert(apids != NULL); + + plat_opts = plat_opt_str_alloc(cmd, acpi_event_type, status); + if (plat_opts == NULL) { + debug_print(0, + "failed to generate platform option string for cfgadm"); + return (-1); + } + + switch (cmd) { + case SBD_CMD_CONNECT: + ret = config_change_state(CFGA_CMD_CONNECT, apid_num, apids, + plat_opts, confirm, message, &estrp, 0); + if (ret != CFGA_OK) { + debug_print(0, "cfgadm('connect', '%s') failed, " + "ret = %d, errstr = '%s'", apids[0], ret, estrp); + } + break; + + case SBD_CMD_CONFIGURE: + ret = config_change_state(CFGA_CMD_CONFIGURE, apid_num, apids, + plat_opts, confirm, message, &estrp, 0); + if (ret != CFGA_OK) { + debug_print(0, "cfgadm('configure', '%s') failed, " + "ret = %d, errstr = '%s'", apids[0], ret, estrp); + } + break; + + case SBD_CMD_ASSIGN: + ret = config_private_func(CFGADM_CMD_ASSIGN, apid_num, apids, + plat_opts, confirm, message, &estrp, 0); + if (ret != CFGA_OK) { + debug_print(0, "cfgadm('assign', '%s') failed, " + "ret = %d, errstr = '%s'", apids[0], ret, estrp); + } + break; + + case SBD_CMD_POWERON: + ret = config_private_func(CFGADM_CMD_POWERON, apid_num, apids, + plat_opts, confirm, message, &estrp, 0); + if (ret != CFGA_OK) { + 
debug_print(0, "cfgadm('poweron', '%s') failed, " + "ret = %d, errstr = '%s'", apids[0], ret, estrp); + } + break; + + case SBD_CMD_PASSTHRU: + ret = config_private_func(CFGADM_CMD_PASSTHRU, apid_num, apids, + plat_opts, confirm, message, &estrp, 0); + if (ret != CFGA_OK) { + debug_print(0, "cfgadm('passthru', '%s') failed, " + "ret = %d, errstr = '%s'", apids[0], ret, estrp); + } + break; + + default: + debug_print(2, "unknown command (%d) to cfgadm_cmd_wrapper()"); + ret = CFGA_ERROR; + break; + } + + if (plat_opts != NULL) + free(plat_opts); + + return (ret == CFGA_OK ? 0 : -1); +} + +static int +event_process(char *ap_id, char *req, char *acpi_event_type) +{ + char *apids[1]; + struct cfga_msg message; + struct cfga_confirm confirm; + + if (strcmp(req, DR_REQ_INCOMING_RES) != 0) { + debug_print(2, + "Event is not supported (ap_id = %s, req = %s)", + ap_id, req); + return (-1); + } + + apids[0] = ap_id; + (void) memset(&confirm, 0, sizeof (confirm)); + confirm.confirm = confirm_no; + (void) memset(&message, 0, sizeof (message)); + message.message_routine = message_output; + + if (cfgadm_cmd_wrapper(SBD_CMD_ASSIGN, 1, apids, + acpi_event_type, STATUS_NOOP, &confirm, &message) != 0) { + goto L_ERR; + } + syslog(LOG_NOTICE, + "board '%s' has been assigned successfully", ap_id); + (void) cfgadm_cmd_wrapper(SBD_CMD_PASSTHRU, 1, apids, + acpi_event_type, STATUS_INPROGRESS, &confirm, &message); + + if (cfgadm_cmd_wrapper(SBD_CMD_POWERON, 1, apids, + acpi_event_type, STATUS_NOOP, &confirm, &message) != 0) { + goto L_ERR_SIG; + } + syslog(LOG_NOTICE, + "board '%s' has been powered on successfully", ap_id); + (void) cfgadm_cmd_wrapper(SBD_CMD_PASSTHRU, 1, apids, + acpi_event_type, STATUS_INPROGRESS, &confirm, &message); + + if (cfgadm_cmd_wrapper(SBD_CMD_CONNECT, 1, apids, + acpi_event_type, STATUS_INPROGRESS, &confirm, &message) != 0) { + goto L_ERR_SIG; + } + syslog(LOG_NOTICE, + "board '%s' has been connected successfully", ap_id); + (void) cfgadm_cmd_wrapper(SBD_CMD_PASSTHRU, 1, apids, + acpi_event_type, STATUS_INPROGRESS, &confirm, &message); + + if (cfgadm_cmd_wrapper(SBD_CMD_CONFIGURE, 1, apids, + acpi_event_type, STATUS_INPROGRESS, &confirm, &message) != 0) { + goto L_ERR_SIG; + } + syslog(LOG_NOTICE, + "board '%s' has been configured successfully", ap_id); + (void) cfgadm_cmd_wrapper(SBD_CMD_PASSTHRU, 1, apids, + acpi_event_type, STATUS_SUCCESS, &confirm, &message); + + (void) system(PMCONFIG_PATH); + syslog(LOG_NOTICE, + "board '%s' has been added into system successfully", ap_id); + + return (0); + +L_ERR_SIG: + (void) cfgadm_cmd_wrapper(SBD_CMD_PASSTHRU, 1, apids, + acpi_event_type, STATUS_FAILURE, &confirm, &message); +L_ERR: + syslog(LOG_ERR, "failed to add board '%s' into system", ap_id); + + return (-1); +} + +void +notify_hotplug(sysevent_t *ev) +{ + char *vendor = NULL; + nvlist_t *attr_list = NULL; + char *class, *subclass; + char *ap_id, *req, *acpi_event_type; + + vendor = sysevent_get_vendor_name(ev); + debug_print(2, "message_vendor = '%s'", vendor ? vendor : "unknown"); + if (vendor == NULL || strcmp(vendor, SUNW_VENDOR) != 0) { + debug_print(2, + "vendor id of message is not '%s'", SUNW_VENDOR); + goto L_EXIT; + } + + class = sysevent_get_class_name(ev); + debug_print(2, "message_class = '%s'", class ? class : "unknown"); + if (class == NULL || strcmp(class, EC_DR) != 0) { + debug_print(2, "class of message is not '%s'", EC_DR); + goto L_EXIT; + } + + subclass = sysevent_get_subclass_name(ev); + debug_print(2, + "message_subclass = '%s'", subclass ? 
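/*
 * Each step of event_process() above funnels into libcfgadm.  Stripped
 * of the passthru status reporting, a single state change against one
 * attachment point follows the shape sketched below; the "SB0" ap_id
 * is a hypothetical placeholder, and the callbacks mirror confirm_no()
 * and message_output() from notify.c.
 */
#include <config_admin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
confirm_no(void *arg, const char *msg)
{
	return (0);		/* never confirm destructive prompts */
}

static int
print_msg(void *arg, const char *msg)
{
	(void) printf("cfgadm: %s\n", msg);
	return (CFGA_OK);
}

int
main(void)
{
	char *apids[1] = { "SB0" };	/* hypothetical board ap_id */
	struct cfga_confirm confirm;
	struct cfga_msg message;
	char *errstr = NULL;

	(void) memset(&confirm, 0, sizeof (confirm));
	confirm.confirm = confirm_no;
	(void) memset(&message, 0, sizeof (message));
	message.message_routine = print_msg;

	/* Roughly equivalent to: cfgadm -c configure SB0 */
	if (config_change_state(CFGA_CMD_CONFIGURE, 1, apids, NULL,
	    &confirm, &message, &errstr, 0) != CFGA_OK) {
		(void) fprintf(stderr, "configure failed: %s\n",
		    errstr != NULL ? errstr : "unknown");
		free(errstr);
		return (1);
	}
	return (0);
}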
diff --git a/usr/src/cmd/acpihpd/svc-acpihpd b/usr/src/cmd/acpihpd/svc-acpihpd
new file mode 100644
index 0000000000..f45f05ed06
--- /dev/null
+++ b/usr/src/cmd/acpihpd/svc-acpihpd
@@ -0,0 +1,39 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Intel Corporation.
+# All rights reserved.
+#
+
+. /lib/svc/share/smf_include.sh
+
+PLATFORM="i86pc"
+ACPIHPD=/usr/platform/${PLATFORM}/lib/acpihpd
+
+/usr/sbin/prtconf -D /devices/fw/acpidr@0 > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+	svcadm disable -t $SMF_FMRI
+	sleep 5&
+	exit $SMF_EXIT_OK
+fi
+
+$ACPIHPD && exit $SMF_EXIT_OK || exit $SMF_EXIT_ERR_FATAL
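/*
 * The method script above gates the service on the ACPI DR nexus by
 * probing "prtconf -D /devices/fw/acpidr@0"; the temporary
 * "svcadm disable -t" keeps SMF from restarting the daemon on hardware
 * that lacks the nexus.  Programmatically, a similar check can be
 * approximated with libdevinfo; the snapshot path and flag choice
 * below are assumptions for illustration, not taken from the patch.
 */
#include <libdevinfo.h>
#include <stdio.h>

int
main(void)
{
	di_node_t node;

	/* Snapshot only the subtree rooted at the ACPI DR node. */
	node = di_init("/fw/acpidr@0", DINFOSUBTREE);
	if (node == DI_NODE_NIL) {
		(void) printf("acpidr: absent\n");
		return (1);
	}
	di_fini(node);
	(void) printf("acpidr: present\n");
	return (0);
}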
diff --git a/usr/src/lib/cfgadm_plugins/Makefile b/usr/src/lib/cfgadm_plugins/Makefile
index bb37297b11..9fc029e2ec 100644
--- a/usr/src/lib/cfgadm_plugins/Makefile
+++ b/usr/src/lib/cfgadm_plugins/Makefile
@@ -27,8 +27,8 @@
 include $(SRC)/Makefile.master
 
-COMMON_SUBDIRS= scsi sdcard pci usb ib fp shp
-sparc_SUBDIRS= sbd ac sysctrl
+COMMON_SUBDIRS= scsi sdcard pci usb ib fp shp sbd
+sparc_SUBDIRS= ac sysctrl
 i386_SUBDIRS= sata
diff --git a/usr/src/lib/cfgadm_plugins/sbd/Makefile.com b/usr/src/lib/cfgadm_plugins/sbd/Makefile.com
index fb3046ba42..e28db75c53 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/Makefile.com
+++ b/usr/src/lib/cfgadm_plugins/sbd/Makefile.com
@@ -23,7 +23,6 @@
 # Use is subject to license terms.
 #
 
-PLATFORM= sun4u
 LIBRARY= sbd.a
 VERS= .1
@@ -33,7 +32,7 @@ OBJECTS= ap.o ap_msg.o ap_rcm.o ap_sbd.o ap_seq.o cfga.o ap_err.o
 include ../../../Makefile.lib
 
 USR_PLAT_DIR = $(ROOT)/usr/platform
-USR_PSM_DIR = $(USR_PLAT_DIR)/sun4u
+USR_PSM_DIR = $(USR_PLAT_DIR)/$(PLATFORM)
 USR_PSM_LIB_DIR = $(USR_PSM_DIR)/lib
 USR_PSM_LIB_CFG_DIR = $(USR_PSM_LIB_DIR)/cfgadm
 USR_PSM_LIB_CFG_DIR_64 = $(USR_PSM_LIB_CFG_DIR)/$(MACH64)
@@ -54,7 +53,7 @@ CPPFLAGS += -I$(ROOT)/usr/platform/$(PLATFORM)/include -DSBD_DEBUG
 #
 GENERRDIR= $(SRC)/lib/cfgadm_plugins/sbd
 GENERR= $(GENERRDIR)/sbdgenerr
-ERRSRC= $(ROOT)/usr/platform/sun4u/include/sys/sbd_ioctl.h
+ERRSRC= $(ROOT)/usr/platform/$(PLATFORM)/include/sys/sbd_ioctl.h
 
 .KEEP_STATE:
diff --git a/usr/src/lib/cfgadm_plugins/sbd/amd64/Makefile b/usr/src/lib/cfgadm_plugins/sbd/amd64/Makefile
new file mode 100644
index 0000000000..9fd69cde68
--- /dev/null
+++ b/usr/src/lib/cfgadm_plugins/sbd/amd64/Makefile
@@ -0,0 +1,40 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# Copyright (c) 2010, Intel Corporation.
+# All rights reserved.
+#
+
+PLATFORM = i86pc
+
+include ../Makefile.com
+include ../../../Makefile.lib.64
+
+CFLAGS += -D_POSIX_PTHREAD_SEMANTICS
+LINTFLAGS += -D_POSIX_PTHREAD_SEMANTICS
+LDLIBS += -lscf
+
+.KEEP_STATE:
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/cfgadm_plugins/sbd/common/ap.c b/usr/src/lib/cfgadm_plugins/sbd/common/ap.c
index c7c3e96f69..502d192014 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/common/ap.c
+++ b/usr/src/lib/cfgadm_plugins/sbd/common/ap.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
@@ -40,6 +37,7 @@
 #include <config_admin.h>
 #include "ap.h"
 
+/*ARGSUSED0*/
 int
 ap_symid(apd_t *a, char *apid, char *symid, size_t bufsize)
 {
@@ -78,13 +76,13 @@ ap_symid(apd_t *a, char *apid, char *symid, size_t bufsize)
 		cp--;	/* Get the '/' */
 
 		if (strcmp(cp, apid) == 0) {
-			snprintf(symid, bufsize, "%s", dp->d_name);
+			(void) snprintf(symid, bufsize, "%s", dp->d_name);
 			rc = 0;
 			break;
 		}
 	}
 
-	closedir(dirp);
+	(void) closedir(dirp);
 	return (rc);
 }
 
@@ -293,7 +291,7 @@ done:
 	DBG("path=<%s> ", a->path ? a->path : "");
 	DBG("drv=<%s> inst=%d minor=<%s> ",
-	    a->drv ? a->drv : "", a->inst, a->minor ? a->minor : "");
+	    a->drv ? a->drv : "", a->inst, a->minor ? a->minor : "");
 	DBG("target=<%s>\n", a->target ? a->target : "");
 	DBG("cid=<%s> ", a->cid ? a->cid : "");
 	DBG("cname=<%s> ", a->cname ? a->cname : "");
@@ -612,7 +610,7 @@ ap_opt_parse(apd_t *a, ap_cmd_t *acp, const char *options)
 	 * during sequencing.
 	 */
 	for (p = strtok(value, ":"); p != NULL;
-	    p = strtok(NULL, ":")) {
+	    p = strtok(NULL, ":")) {
 		if ((i = ap_cmd(p)) == CMD_NONE) {
 			ap_err(a, ERR_CMD_INVAL, p);
 			return (-1);
@@ -711,8 +709,7 @@ ap_cmd_parse(apd_t *a, const char *f, const char *options, int *cmd)
 	target = mask(tgt);
 
 	DBG("cmd=%s(%d) tmask=0x%x cmask=0x%x omask=0x%x\n",
-	    ap_cmd_name(c), c, target,
-	    acp->cmask, acp->omask[tgt]);
+	    ap_cmd_name(c), c, target, acp->cmask, acp->omask[tgt]);
 
 	if ((acp->cmask & target) == 0)
 		ap_err(a, ERR_CMD_NOTSUPP, c);
diff --git a/usr/src/lib/cfgadm_plugins/sbd/common/ap_msg.c b/usr/src/lib/cfgadm_plugins/sbd/common/ap_msg.c
index d2049bdc1d..1005dab325 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/common/ap_msg.c
+++ b/usr/src/lib/cfgadm_plugins/sbd/common/ap_msg.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -63,8 +61,7 @@ debugging(void)
 		if ((debug_fp = fopen(ep, "a")) == NULL)
 			return (0);
 	}
-	(void) fprintf(debug_fp,
-	    "\nDebug started, pid=%d\n", (int)getpid());
+	(void) fprintf(debug_fp, "\nDebug started, pid=%d\n", (int)getpid());
 	return (1);
 }
 
@@ -110,7 +107,7 @@ ap_err_fmts[] = {
 	NULL
 };
 
-#define	ap_err_fmt(i)	ap_err_fmts[min((i), ERR_NONE)]
+#define	ap_err_fmt(i)	ap_err_fmts[min((uint_t)(i), ERR_NONE)]
 
 static char *
 ap_msg_fmts[] = {
@@ -125,7 +122,7 @@ ap_msg_fmts[] = {
 	NULL
 };
 
-#define	ap_msg_fmt(i)	ap_msg_fmts[min((i), MSG_NONE)]
+#define	ap_msg_fmt(i)	ap_msg_fmts[min((uint_t)(i), MSG_NONE)]
 
 #define	STR_BD		"board"
 #define	STR_SEP		": "
@@ -366,7 +363,7 @@ ap_err(apd_t *a, ...)
 	switch (err) {
 	case ERR_CMD_FAIL:
 		(void) snprintf(p, len, fmt, cmd, target,
-		    syserr, rsep, sysrsrc);
+		    syserr, rsep, sysrsrc);
 		break;
 	case ERR_CMD_ABORT:
 	case ERR_CMD_NACK:
diff --git a/usr/src/lib/cfgadm_plugins/sbd/common/ap_rcm.c b/usr/src/lib/cfgadm_plugins/sbd/common/ap_rcm.c
index f917bb5ae4..770784f16f 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/common/ap_rcm.c
+++ b/usr/src/lib/cfgadm_plugins/sbd/common/ap_rcm.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -145,6 +143,8 @@ ap_rcm_ops[] = {
 
 #ifdef __sparcv9
 #define	RCMLIB	"/lib/sparcv9/librcm.so";
+#elif defined(__amd64)
+#define	RCMLIB	"/lib/amd64/librcm.so";
 #else
 #define	RCMLIB	"/lib/librcm.so";
 #endif
@@ -323,22 +323,21 @@ ap_rcm_init(apd_t *a)
 		switch (i) {
 		case ALLOC_HANDLE:
 			rcm->alloc_handle = (int(*)
-			    (char *, uint_t, void *, rcm_handle_t **))sym;
+			    (char *, uint_t, void *, rcm_handle_t **))sym;
 			break;
 		case FREE_HANDLE:
 			rcm->free_handle = (void (*)(rcm_handle_t *))sym;
 			break;
 		case GET_INFO:
 			rcm->get_info = (int (*)
-			    (rcm_handle_t *, char *, uint_t,
-			    rcm_info_t **))sym;
+			    (rcm_handle_t *, char *, uint_t, rcm_info_t **))sym;
 			break;
 		case FREE_INFO:
 			rcm->free_info = (void (*)(rcm_info_t *))sym;
 			break;
 		case INFO_TUPLE_NEXT:
 			rcm->info_next = (rcm_info_tuple_t *(*)
-			    (rcm_info_t *, rcm_info_tuple_t *))sym;
+			    (rcm_info_t *, rcm_info_tuple_t *))sym;
 			break;
 		case INFO_TUPLE_STATE:
 			rcm->info_state = (int (*)(rcm_info_tuple_t *))sym;
@@ -348,50 +347,50 @@ ap_rcm_init(apd_t *a)
 			break;
 		case INFO_TUPLE_ERROR:
 			rcm->info_error = (const char *(*)
-			    (rcm_info_tuple_t *))sym;
+			    (rcm_info_tuple_t *))sym;
 			break;
 		case INFO_TUPLE_INFO:
 			rcm->info_info = (const char *(*)
-			    (rcm_info_tuple_t *))sym;
+			    (rcm_info_tuple_t *))sym;
 			break;
 		case INFO_TUPLE_RSRC:
 			rcm->info_rsrc = (const char *(*)
-			    (rcm_info_tuple_t *))sym;
+			    (rcm_info_tuple_t *))sym;
 			break;
 		case REQUEST_OFFLINE:
 			rcm->request_offline_list = (int (*)
-			    (rcm_handle_t *, char **, uint_t,
-			    rcm_info_t **))sym;
+			    (rcm_handle_t *, char **, uint_t,
+			    rcm_info_t **))sym;
 			break;
 		case NOTIFY_ONLINE:
 			rcm->notify_online_list = (int (*)
-			    (rcm_handle_t *, char **, uint_t,
-			    rcm_info_t **))sym;
+			    (rcm_handle_t *, char **, uint_t,
+			    rcm_info_t **))sym;
 			break;
 		case REQUEST_SUSPEND:
 			rcm->request_suspend = (int (*)
-			    (rcm_handle_t *, char *, uint_t,
-			    timespec_t *, rcm_info_t **))sym;
+			    (rcm_handle_t *, char *, uint_t,
+			    timespec_t *, rcm_info_t **))sym;
 			break;
 		case NOTIFY_RESUME:
 			rcm->notify_resume = (int (*)
-			    (rcm_handle_t *, char *, uint_t,
-			    rcm_info_t **))sym;
+			    (rcm_handle_t *, char *, uint_t,
+			    rcm_info_t **))sym;
 			break;
 		case NOTIFY_REMOVE:
 			rcm->notify_remove_list = (int (*)
-			    (rcm_handle_t *, char **, uint_t,
-			    rcm_info_t **))sym;
+			    (rcm_handle_t *, char **, uint_t,
+			    rcm_info_t **))sym;
 			break;
 		case REQUEST_CAP_CHANGE:
 			rcm->request_capacity_change = (int (*)
-			    (rcm_handle_t *, char *, uint_t,
-			    nvlist_t *, rcm_info_t **))sym;
+			    (rcm_handle_t *, char *, uint_t,
+			    nvlist_t *, rcm_info_t **))sym;
 			break;
 		case NOTIFY_CAP_CHANGE:
 			rcm->notify_capacity_change = (int (*)
-			    (rcm_handle_t *, char *, uint_t,
-			    nvlist_t *, rcm_info_t **))sym;
+			    (rcm_handle_t *, char *, uint_t,
+			    nvlist_t *, rcm_info_t **))sym;
 			break;
 		default:
 			break;
@@ -400,7 +399,7 @@ ap_rcm_init(apd_t *a)
 
 	if (rcm->alloc_handle == NULL ||
 	    (*rcm->alloc_handle)(NULL, RCM_NOPID, NULL, &rcm->hd)
-	    != RCM_SUCCESS) {
+	    != RCM_SUCCESS) {
 		ap_err(a, ERR_RCM_HANDLE);
 		return (CFGA_LIB_ERROR);
 	}
@@ -441,7 +440,7 @@ ap_rcm_init(apd_t *a)
 	rcm->ncpus = ncpu;
 
 	if ((rcm->cpuids = (cpuid_t *)calloc(ncpu, sizeof (cpuid_t)))
-	    == NULL) {
+	    == NULL) {
 		ap_err(a, ERR_NOMEM);
 		return (CFGA_LIB_ERROR);
 	}
@@ -781,20 +780,21 @@ ap_rcm_cap_cpu(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd, uint_t flags,
 	    nvlist_add_int32(nvl, "old_total", oldncpuids) != 0 ||
 	    nvlist_add_int32(nvl, "new_total", newncpuids) != 0 ||
 	    nvlist_add_int32_array(nvl, "old_cpu_list", oldcpuids,
-	    oldncpuids) != 0 ||
+	    oldncpuids) != 0 ||
 	    nvlist_add_int32_array(nvl, "new_cpu_list", newcpuids,
-	    newncpuids) != 0)
+	    newncpuids) != 0)
 		goto done;
 
-	snprintf(buf, sizeof (buf), fmt, change);
+	(void) snprintf(buf, sizeof (buf), fmt, change);
 	ap_msg(a, MSG_ISSUE, cmd, buf);
 
-	if (cmd == CMD_RCM_CAP_DEL)
+	if (cmd == CMD_RCM_CAP_DEL) {
 		rv = (*rcm->request_capacity_change)(hd, "SUNW_cpu",
-		    flags, nvl, rinfo);
-	else
+		    flags, nvl, rinfo);
+	} else {
 		rv = (*rcm->notify_capacity_change)(hd, "SUNW_cpu",
-		    flags & ~RCM_FORCE, nvl, rinfo);
+		    flags & ~RCM_FORCE, nvl, rinfo);
+	}
 
 done:
 	if (nvl)
@@ -865,12 +865,13 @@ ap_rcm_cap_mem(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd, uint_t flags,
 	(void) snprintf(buf, sizeof (buf), "(%ld pages)", change);
 	ap_msg(a, MSG_ISSUE, cmd, buf);
 
-	if (cmd == CMD_RCM_CAP_DEL)
+	if (cmd == CMD_RCM_CAP_DEL) {
 		rv = (*rcm->request_capacity_change)(hd, "SUNW_memory",
-		    flags, nvl, rinfo);
-	else
+		    flags, nvl, rinfo);
+	} else {
 		rv = (*rcm->notify_capacity_change)(hd, "SUNW_memory",
-		    flags & ~RCM_FORCE, nvl, rinfo);
+		    flags & ~RCM_FORCE, nvl, rinfo);
+	}
 
 	nvlist_free(nvl);
 
@@ -923,11 +924,13 @@ ap_rcm_request_cap(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd,
 	}
 
 	if (ncpus && ((*rv = ap_rcm_cap_cpu(a, rcm, hd, flags, rinfo,
-	    CMD_RCM_CAP_DEL, ncpus)) != RCM_SUCCESS))
+	    CMD_RCM_CAP_DEL, ncpus)) != RCM_SUCCESS)) {
 		return (CFGA_LIB_ERROR);
+	}
 	if (npages && ((*rv = ap_rcm_cap_mem(a, rcm, hd, flags, rinfo,
-	    CMD_RCM_CAP_DEL, npages)) != RCM_SUCCESS))
+	    CMD_RCM_CAP_DEL, npages)) != RCM_SUCCESS)) {
 		return (CFGA_LIB_ERROR);
+	}
 
 	return (CFGA_OK);
 }
@@ -945,8 +948,8 @@ ap_rcm_add_cap(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd,
 	DBG("ap_rcm_add_cap(%p)\n", (void *)a);
 
 	/* Get the new capacity info to figure out what has changed */
-	if ((rc = ap_capinfo(a, rcm->firstcm, rcm->lastcm, &capinfo))
-	    != CFGA_OK)
+	if ((rc = ap_capinfo(a, rcm->firstcm, rcm->lastcm, &capinfo)) !=
+	    CFGA_OK)
 		return (rc);
 
 	if (capinfo == NULL) {
@@ -981,7 +984,7 @@ ap_rcm_add_cap(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd,
 		type = ap_cm_type(a, cm);
 
 		DBG("cm=%d valid=%d type=%d, prevos=%d os=%d\n",
-		    cm, prevvalidity, type, prevos, os);
+		    cm, prevvalidity, type, prevos, os);
 
 		/*
 		 * We are interested only in those components
@@ -1008,11 +1011,13 @@ ap_rcm_add_cap(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd,
 	free(capinfo);
 
 	if (ncpus && ((*rv = ap_rcm_cap_cpu(a, rcm, hd, flags, rinfo,
-	    CMD_RCM_CAP_ADD, ncpus)) != RCM_SUCCESS))
+	    CMD_RCM_CAP_ADD, ncpus)) != RCM_SUCCESS)) {
 		return (CFGA_LIB_ERROR);
+	}
 	if (npages && ((*rv = ap_rcm_cap_mem(a, rcm, hd, flags, rinfo,
-	    CMD_RCM_CAP_ADD, npages)) != RCM_SUCCESS))
+	    CMD_RCM_CAP_ADD, npages)) != RCM_SUCCESS)) {
 		return (CFGA_LIB_ERROR);
+	}
 
 	return (CFGA_OK);
 }
@@ -1045,8 +1050,8 @@ ap_rcm_notify_cap(apd_t *a, rcmd_t *rcm, rcm_handle_t *hd,
 	DBG("ap_rcm_notify_cap(%p)\n", (void *)a);
 
 	/* Get the new capacity info to figure out what has changed */
-	if ((rc = ap_capinfo(a, rcm->firstcm, rcm->lastcm, &capinfo))
-	    != CFGA_OK)
+	if ((rc = ap_capinfo(a, rcm->firstcm, rcm->lastcm, &capinfo)) !=
+	    CFGA_OK)
 		return (rc);
 
 	if (capinfo == NULL) {
@@ -1183,8 +1188,7 @@ ap_rcm_ctl(apd_t *a, int cmd)
 		if (rcm->capinfo == NULL)
 			noop++;
 		else
-			rc = ap_rcm_request_cap(a, rcm, hd, &rv,
-			    flags, &rinfo);
+			rc = ap_rcm_request_cap(a, rcm, hd, &rv, flags, &rinfo);
 		break;
 	case CMD_RCM_CAP_ADD:
 		rc = ap_rcm_add_cap(a, rcm, hd, &rv, flags, &rinfo);
@@ -1266,13 +1270,13 @@ ap_rcm_ctl(apd_t *a, int cmd)
 			ap_msg(a, MSG_ISSUE, cmd, rlist[nrsrc]);
 		if (cmd == CMD_RCM_OFFLINE)
 			rv = (*rcm->request_offline_list)(hd, rlist, flags,
-			    &rinfo);
+			    &rinfo);
 		else if (cmd == CMD_RCM_ONLINE)
 			rv = (*rcm->notify_online_list)(hd, rlist,
-			    flags & ~RCM_FORCE, &rinfo);
+			    flags & ~RCM_FORCE, &rinfo);
 		else
 			rv = (*rcm->notify_remove_list)(hd, rlist,
-			    flags & ~RCM_FORCE, &rinfo);
+			    flags & ~RCM_FORCE, &rinfo);
 		break;
 	}
 	case CMD_RCM_SUSPEND: {
@@ -1440,7 +1444,7 @@ ap_rcm_info(apd_t *a, char **msg)
 		if ((infostr = (*rcm->info_info)(tuple)) != NULL) {
 			(void) strcat(*msg, "\n");
 			(void) sprintf(&((*msg)[strlen(*msg)]), format,
-			    (*rcm->info_rsrc)(tuple), infostr);
+			    (*rcm->info_rsrc)(tuple), infostr);
 		}
 	}
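/*
 * The reflowed ap_rcm_cap_cpu() hunk above hands RCM an nvlist that
 * describes the CPU capacity change.  Built in isolation with the same
 * property names, the construction looks like the sketch below; the
 * CPU IDs and totals are made-up sample values.
 */
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *nvl;
	int32_t old_ids[] = { 0, 1 };
	int32_t new_ids[] = { 0, 1, 2, 3 };

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	/* Same property names the capacity-change hunk uses. */
	if (nvlist_add_int32(nvl, "old_total", 2) != 0 ||
	    nvlist_add_int32(nvl, "new_total", 4) != 0 ||
	    nvlist_add_int32_array(nvl, "old_cpu_list", old_ids, 2) != 0 ||
	    nvlist_add_int32_array(nvl, "new_cpu_list", new_ids, 4) != 0) {
		nvlist_free(nvl);
		return (1);
	}

	nvlist_print(stdout, nvl);	/* dump for inspection */
	nvlist_free(nvl);
	return (0);
}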
diff --git a/usr/src/lib/cfgadm_plugins/sbd/common/ap_sbd.c b/usr/src/lib/cfgadm_plugins/sbd/common/ap_sbd.c
index 70c3fe7dee..7c3b34b1f0 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/common/ap_sbd.c
+++ b/usr/src/lib/cfgadm_plugins/sbd/common/ap_sbd.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -175,11 +172,11 @@ ap_stat(apd_t *a, int all)
 		 */
 		ctl->ic_type = SBD_COMP_UNKNOWN;
 		ctl->ic_unit = a->cnum;
-		strcpy(ctl->ic_name, a->cname);
+		(void) strcpy(ctl->ic_name, a->cname);
 	}
 
 	DBG("ioctl(%d SBD_CMD_STATUS, sc=0x%p sz=%d flags=%d",
-	    fd, (void *)sc->s_statp, sc->s_nbytes, ctl->i_flags);
+	    fd, (void *)sc->s_statp, sc->s_nbytes, ctl->i_flags);
 	if (select)
 		DBG(" cname=<%s> cnum=%d", a->cname, a->cnum);
 	DBG(")\n");
@@ -282,7 +279,7 @@ apd_init(apd_t *a, int all)
 	for (dst = st->s_stat, i = 0; i < st->s_nstat; i++, dst++) {
 		DBG("ds_name,ds_unit,ds_type=<%s,%d,%d> ",
-		    dst->ds_name, dst->ds_unit, dst->ds_type);
+		    dst->ds_name, dst->ds_unit, dst->ds_type);
 
 		if (dst->ds_unit != a->cnum)
 			continue;
@@ -331,7 +328,7 @@ apd_free(apd_t *a)
 	ap_rcm_fini(a);
 
 	if (a->fd != -1)
-		close(a->fd);
+		(void) close(a->fd);
 
 	s_free(a->options);
 	s_free(a->path);
@@ -405,7 +402,7 @@ ap_init(apd_t *a, cfga_list_data_t *ap)
 	st = (sbd_stat_t *)a->stat;
 
 	DBG("ap_init bd=%d rs=%d os=%d type=<%s>\n",
-	    a->bnum, st->s_rstate, st->s_ostate, st->s_type);
+	    a->bnum, st->s_rstate, st->s_ostate, st->s_type);
 
 	parsable_strncpy(ap->ap_type, st->s_type, sizeof (ap->ap_type));
 	ap->ap_r_state = (cfga_stat_t)st->s_rstate;
@@ -555,7 +552,7 @@ ap_ioctl(apd_t *a, int cmd)
 	else {
 		ctl->ic_type = SBD_COMP_UNKNOWN;
 		ctl->ic_unit = a->cnum;
-		strcpy(ctl->ic_name, a->cname);
+		(void) strcpy(ctl->ic_name, a->cname);
 	}
 
 	if (!(ioc = ap_ioc(cmd))) {
@@ -638,7 +635,7 @@ mod_estr(int code)
 	if (i == sbd_etab_len) {
 		char buf[32];
 
-		snprintf(buf, sizeof (buf), "error %d", code);
+		(void) snprintf(buf, sizeof (buf), "error %d", code);
 		s = strdup(buf);
 	}
 
@@ -760,7 +757,7 @@ ap_help(struct cfga_msg *msgp, const char *options, cfga_flags_t flags)
 			continue;
 		if ((q = (char *)calloc(len + 1, 1)) == NULL)
 			continue;
-		strcpy(q, *p);
+		(void) strcpy(q, *p);
 		(*msgp->message_routine)(msgp->appdata_ptr, q);
 		free(q);
 	}
@@ -823,7 +820,7 @@ ap_cm_devpath(apd_t *a, int seq)
 	 * Assume an io component so that we can get
 	 * the path if the component is indeed of type io.
 	 */
-	if (seq == -1)
+	if (seq == CM_DFLT)
 		dst = (sbd_io_stat_t *)a->cmstat;
 	else {
 		sbd_stat_t *st;
@@ -1002,7 +999,7 @@ ap_cm_init(apd_t *a, cfga_list_data_t *ap, int seq)
 	a->cmstat = (void *)dst;
 
 	DBG("ap_cm_init bd=%d rs=%d os=%d type=<%s> seq=%d\n",
-	    a->bnum, st->s_rstate, dst->ds_ostate, type, seq);
+	    a->bnum, st->s_rstate, dst->ds_ostate, type, seq);
 
 	(void) strncpy(ap->ap_type, type, sizeof (ap->ap_type));
 	ap->ap_r_state = (cfga_stat_t)st->s_rstate;
diff --git a/usr/src/lib/cfgadm_plugins/sbd/common/ap_seq.c b/usr/src/lib/cfgadm_plugins/sbd/common/ap_seq.c
index 929dc0251c..0536b838f4 100644
--- a/usr/src/lib/cfgadm_plugins/sbd/common/ap_seq.c
+++ b/usr/src/lib/cfgadm_plugins/sbd/common/ap_seq.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -24,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
@@ -40,6 +37,12 @@
 #include <config_admin.h>
 #include "ap.h"
 
+#ifdef	__x86
+#include <libscf_priv.h>
+
+static int fastreboot_disabled;
+#endif	/* __x86 */
+
 static cfga_err_t
 ap_suspend_check(apd_t *a, int cmd, int first, int last, int *suspend)
 {
@@ -202,7 +205,7 @@ ap_seq_get(apd_t *a, int cmd, int *first, int *last)
 	*last = l;
 
 	DBG("ap_seq(%d, %d, %d, %p, %p) = (%d, %d)\n",
-	    rs, os, cmd, (void *)first, (void *)last, f, l);
+	    rs, os, cmd, (void *)first, (void *)last, f, l);
 
 	return (AP_SEQ_OK);
 }
@@ -268,7 +271,7 @@ ap_seq_exec(apd_t *a, int cmd, int first, int last)
 			 * interpose on the suspend operation.
 			 */
 			rc = ap_suspend_check(a, cmd,
-			    first + 1, last, &suspend);
+			    first + 1, last, &suspend);
 			break;
 		case CMD_RCM_SUSPEND:
 			if (suspend && ((rc = ap_rcm_ctl(a, c)) == CFGA_OK)) {
@@ -297,6 +300,41 @@ ap_seq_exec(apd_t *a, int cmd, int first, int last)
 		case CMD_RCM_CAP_NOTIFY:
 			(void) ap_rcm_ctl(a, c);
 			break;
+
+#ifdef	__x86
+		/*
+		 * Disable fast reboot if a CPU/MEM/IOH hotplug event happens.
+		 * Note: this is a temporary solution and will be revised when
+		 * fast reboot can support CPU/MEM/IOH DR operations in the
+		 * future.
+		 *
+		 * ACPI BIOS generates some static ACPI tables, such as MADT,
+		 * SRAT and SLIT, to describe the system hardware configuration
+		 * on power-on. When a CPU/MEM/IOH hotplug event happens, those
+		 * static tables won't be updated and will become stale.
+		 *
+		 * If we reset the system by fast reboot, BIOS will have no
+		 * chance to regenerate those staled static tables. Fast reboot
+		 * can't tolerate such inconsistency between staled ACPI tables
+		 * and real hardware configuration yet.
+		 *
+		 * A temporary solution is introduced to disable fast reboot if
+		 * CPU/MEM/IOH hotplug event happens. This solution should be
+		 * revised when fast reboot is enhanced to support CPU/MEM/IOH
+		 * DR operations.
+		 */
+		case CMD_ASSIGN:
+		case CMD_POWERON:
+		case CMD_POWEROFF:
+		case CMD_UNASSIGN:
+			if (!fastreboot_disabled &&
+			    scf_fastreboot_default_set_transient(B_FALSE) ==
+			    SCF_SUCCESS) {
+				fastreboot_disabled = 1;
+			}
+			/* FALLTHROUGH */
+#endif	/* __x86 */
+
 		default:
 			rc = ap_ioctl(a, c);
 			break;
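/*
 * The new x86 cases in ap_seq_exec() latch fastreboot_disabled so the
 * libscf round trip happens at most once per plugin instance.  Pulled
 * out of the sequencer, the guard is just the pattern below; the
 * wrapper name is illustrative, while
 * scf_fastreboot_default_set_transient() is the private libscf
 * interface the patch itself calls via libscf_priv.h.
 */
#ifdef	__x86
#include <sys/types.h>
#include <libscf_priv.h>

static int fastreboot_disabled;

static void
dr_disable_fastreboot(void)
{
	/* Transient setting: it does not persist across reboot. */
	if (!fastreboot_disabled &&
	    scf_fastreboot_default_set_transient(B_FALSE) == SCF_SUCCESS)
		fastreboot_disabled = 1;
}
#endif	/* __x86 */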
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <ctype.h> #include <stdio.h> #include <stdlib.h> @@ -243,14 +240,14 @@ cfga_list_ext( ap_cm_id(a, i, dyn, sizeof (dyn)); (void) snprintf(ap->ap_log_id, szl, "%s::%s", - a->target, dyn); + a->target, dyn); (void) snprintf(ap->ap_phys_id, szp, "%s::%s", - a->path, dyn); + a->path, dyn); ap_cm_init(a, ap, i); DBG("ap_phys_id=%s ap_log_id=%s\n", - ap->ap_phys_id, ap->ap_log_id); + ap->ap_phys_id, ap->ap_log_id); } } else diff --git a/usr/src/lib/cfgadm_plugins/sbd/i386/Makefile b/usr/src/lib/cfgadm_plugins/sbd/i386/Makefile new file mode 100644 index 0000000000..7a382d291f --- /dev/null +++ b/usr/src/lib/cfgadm_plugins/sbd/i386/Makefile @@ -0,0 +1,39 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2010, Intel Corporation. +# All rights reserved. +# + +PLATFORM = i86pc + +include ../Makefile.com + +CFLAGS += -D_POSIX_PTHREAD_SEMANTICS +LINTFLAGS += -D_POSIX_PTHREAD_SEMANTICS +LDLIBS += -lscf + +.KEEP_STATE: + +install: all $(ROOTLIBS) $(ROOTLINKS) diff --git a/usr/src/lib/cfgadm_plugins/sbd/sparc/Makefile b/usr/src/lib/cfgadm_plugins/sbd/sparc/Makefile index dcaf7b8438..39bd47a725 100644 --- a/usr/src/lib/cfgadm_plugins/sbd/sparc/Makefile +++ b/usr/src/lib/cfgadm_plugins/sbd/sparc/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -23,10 +22,10 @@ # Copyright (c) 2000 by Sun Microsystems, Inc. # All rights reserved. # -#ident "%Z%%M% %I% %E% SMI" -# # lib/sbd/sparc/Makefile +PLATFORM = sun4u + include ../Makefile.com .KEEP_STATE: diff --git a/usr/src/lib/cfgadm_plugins/sbd/sparcv9/Makefile b/usr/src/lib/cfgadm_plugins/sbd/sparcv9/Makefile index 4ac3f6310c..1d9c67c4e1 100644 --- a/usr/src/lib/cfgadm_plugins/sbd/sparcv9/Makefile +++ b/usr/src/lib/cfgadm_plugins/sbd/sparcv9/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
# # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -23,10 +22,10 @@ # Copyright (c) 2000 by Sun Microsystems, Inc. # All rights reserved. # -#ident "%Z%%M% %I% %E% SMI" -# # lib/sbd/sparcv9/Makefile +PLATFORM = sun4u + include ../Makefile.com include ../../../Makefile.lib.64 diff --git a/usr/src/pkg/license_files/cr_Intel.hotplug b/usr/src/pkg/license_files/cr_Intel.hotplug new file mode 100644 index 0000000000..bf0648aabc --- /dev/null +++ b/usr/src/pkg/license_files/cr_Intel.hotplug @@ -0,0 +1 @@ + * Copyright(c) 2009 - 2010 Intel Corporation. All rights reserved. diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index 113817866b..b95dd46026 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -1660,6 +1660,7 @@ $(i386_ONLY)file path=usr/platform/i86pc/include/sys/psm_defs.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/psm_modctl.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/psm_types.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/rm_platter.h +$(i386_ONLY)file path=usr/platform/i86pc/include/sys/sbd_ioctl.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/smp_impldefs.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/vm_machparam.h $(i386_ONLY)file path=usr/platform/i86pc/include/sys/x_call.h diff --git a/usr/src/pkg/manifests/system-kernel-dynamic-reconfiguration-i86pc.mf b/usr/src/pkg/manifests/system-kernel-dynamic-reconfiguration-i86pc.mf new file mode 100644 index 0000000000..28ddc833bb --- /dev/null +++ b/usr/src/pkg/manifests/system-kernel-dynamic-reconfiguration-i86pc.mf @@ -0,0 +1,83 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2010 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# This package will install successfully into any zone, global or +# non-global. The files, directories, links, and hardlinks, however, +# will only be installed into the global zone. +# +<include hollow_zone_pkg> +set name=pkg.fmri \ + value=pkg:/system/kernel/dynamic-reconfiguration/i86pc@$(PKGVERS) +set name=pkg.description \ + value="Dynamic Reconfiguration Modules for i86pc required to support \ + hotplug on i86pc based systems. Includes the ACPI Hotplug Daemon, SMF \ + manifests/methods and drivers for hotplug operations." 
+set name=pkg.summary value="Dynamic Reconfiguration Modules for i86pc" +set name=info.classification \ + value="org.opensolaris.category.2008:System/Administration and Configuration" \ + value="org.opensolaris.category.2008:System/Enterprise Management" +set name=variant.arch value=i386 +set name=variant.opensolaris.zone value=global value=nonglobal +dir path=platform group=sys +dir path=platform/i86pc group=sys +dir path=platform/i86pc/kernel group=sys +dir path=platform/i86pc/kernel/drv group=sys +dir path=platform/i86pc/kernel/drv/$(ARCH64) group=sys +dir path=platform/i86pc/kernel/misc group=sys +dir path=platform/i86pc/kernel/misc/$(ARCH64) group=sys +file path=platform/i86pc/kernel/misc/$(ARCH64)/drmach_acpi group=sys \ + mode=0755 +dir path=usr group=sys +dir path=usr/platform group=sys +dir path=usr/platform/i86pc group=sys +dir path=usr/platform/i86pc/lib group=sys +dir path=var group=sys +dir path=var/svc group=sys +dir path=var/svc/manifest group=sys +dir path=var/svc/manifest/platform group=sys +dir path=var/svc/manifest/platform/i86pc group=sys +file path=platform/i86pc/kernel/drv/$(ARCH64)/dr group=sys +driver name=dr alias=acpidr_sbd +file path=platform/i86pc/kernel/drv/dr.conf group=sys mode=0644 +file path=usr/platform/i86pc/lib/acpihpd group=sys mode=0755 +file path=var/svc/manifest/platform/i86pc/acpihpd.xml group=sys mode=0444 +dir path=lib group=sys +dir path=lib/svc group=sys +dir path=lib/svc/method group=sys +file path=lib/svc/method/svc-acpihpd group=sys mode=0555 +dir path=usr/platform/i86pc/lib/cfgadm +dir path=usr/platform/i86pc/lib/cfgadm/$(ARCH64) +file path=usr/platform/i86pc/lib/cfgadm/sbd.so.1 +link path=usr/platform/i86pc/lib/cfgadm/sbd.so \ + target=./sbd.so.1 +file path=usr/platform/i86pc/lib/cfgadm/$(ARCH64)/sbd.so.1 +link path=usr/platform/i86pc/lib/cfgadm/$(ARCH64)/sbd.so \ + target=./sbd.so.1 +license cr_Intel.hotplug license=cr_Intel +license cr_Sun license=cr_Sun +license lic_CDDL license=lic_CDDL +depend fmri=pkg:/system/kernel/power type=require diff --git a/usr/src/uts/common/os/cpu_event.c b/usr/src/uts/common/os/cpu_event.c index 11162ccfc9..b3ed323f92 100644 --- a/usr/src/uts/common/os/cpu_event.c +++ b/usr/src/uts/common/os/cpu_event.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. */ @@ -152,8 +152,13 @@ typedef struct cpu_idle_cb_item { /* Per-CPU state aligned to CPU_CACHE_COHERENCE_SIZE to avoid false sharing. */ typedef union cpu_idle_cb_state { struct { + /* Index of already-invoked callbacks. */ int index; + /* Invoke registered callbacks if true. */ boolean_t enabled; + /* Property values are valid if true. */ boolean_t ready; + /* Pointers to per-CPU properties. */ cpu_idle_prop_value_t *idle_state; cpu_idle_prop_value_t *enter_ts; cpu_idle_prop_value_t *exit_ts; @@ -182,6 +187,17 @@ static int cpu_idle_cb_max = 0; static cpu_idle_cb_state_t *cpu_idle_cb_state; +#ifdef __x86 +/* + * cpuset used to intercept CPUs before powering them off. + * The control CPU sets the bit corresponding to the target CPU and waits + * until the bit is cleared. + * The target CPU disables interrupts before clearing the corresponding bit + * and then loops forever. + */ +static cpuset_t cpu_idle_intercept_set; +#endif + static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum, cpu_idle_prop_value_t *valp); @@ -326,17 +342,26 @@ cpu_event_init(void) #endif } +/* + * This function is called to initialize per-CPU state when starting CPUs.
+ */ void cpu_event_init_cpu(cpu_t *cp) { ASSERT(cp->cpu_seqid < max_ncpus); + cpu_idle_cb_state[cp->cpu_seqid].v.index = 0; cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE; + cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_TRUE; } +/* + * This function is called to clean up per CPU state when stopping CPUs. + */ void cpu_event_fini_cpu(cpu_t *cp) { ASSERT(cp->cpu_seqid < max_ncpus); + cpu_idle_cb_state[cp->cpu_seqid].v.enabled = B_FALSE; cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE; } @@ -618,6 +643,21 @@ cpu_idle_enter(int state, int flag, ASSERT(CPU->cpu_seqid < max_ncpus); sp = &cpu_idle_cb_state[CPU->cpu_seqid]; ASSERT(sp->v.index == 0); + if (sp->v.enabled == B_FALSE) { +#if defined(__x86) + /* Intercept CPU at a safe point before powering off it. */ + if (CPU_IN_SET(cpu_idle_intercept_set, CPU->cpu_id)) { + iflags = intr_clear(); + CPUSET_ATOMIC_DEL(cpu_idle_intercept_set, CPU->cpu_id); + /*CONSTCOND*/ + while (1) { + SMT_PAUSE(); + } + } +#endif + + return (0); + } /* * On x86, cpu_idle_enter can be called from idle thread with either @@ -1091,3 +1131,26 @@ cpu_idle_get_cpu_state(cpu_t *cp) cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, CPU_IDLE_GET_CTX(cp))); } + +#if defined(__x86) +/* + * Intercept CPU at a safe point in idle() before powering it off. + */ +void +cpu_idle_intercept_cpu(cpu_t *cp) +{ + ASSERT(cp->cpu_seqid < max_ncpus); + ASSERT(cpu_idle_cb_state[cp->cpu_seqid].v.enabled == B_FALSE); + + /* Set flag to intercept CPU. */ + CPUSET_ATOMIC_ADD(cpu_idle_intercept_set, cp->cpu_id); + /* Wake up CPU from possible sleep state. */ + poke_cpu(cp->cpu_id); + while (CPU_IN_SET(cpu_idle_intercept_set, cp->cpu_id)) { + DELAY(1); + } + /* + * Now target CPU is spinning in a pause loop with interrupts disabled. + */ +} +#endif diff --git a/usr/src/uts/common/os/cpu_pm.c b/usr/src/uts/common/os/cpu_pm.c index 324e4168d7..b885e421b8 100644 --- a/usr/src/uts/common/os/cpu_pm.c +++ b/usr/src/uts/common/os/cpu_pm.c @@ -677,9 +677,13 @@ cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level) cpupm_state_t *new_state = NULL; did = cpupm_domain_id(cp, type); - mutex_enter(&cpu_lock); - dom = cpupm_domain_find(did, type); - mutex_exit(&cpu_lock); + if (MUTEX_HELD(&cpu_lock)) { + dom = cpupm_domain_find(did, type); + } else { + mutex_enter(&cpu_lock); + dom = cpupm_domain_find(did, type); + mutex_exit(&cpu_lock); + } /* * Can use a lock to avoid changing the power state of the cpu when diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index be67866592..20a89b0611 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -59,9 +59,6 @@ extern struct memlist *phys_avail; -extern void mem_node_add(pfn_t, pfn_t); -extern void mem_node_del(pfn_t, pfn_t); - extern uint_t page_ctrs_adjust(int); void page_ctrs_cleanup(void); static void kphysm_setup_post_add(pgcnt_t); @@ -107,6 +104,13 @@ static void memseg_cpu_vm_flush(void); int meta_alloc_enable; +#ifdef DEBUG +static int memseg_debug; +#define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) +#else +#define MEMSEG_DEBUG(...) +#endif + /* * Add a chunk of memory to the system. * base: starting PAGESIZE page of new memory. @@ -117,13 +121,6 @@ int meta_alloc_enable; * dynamically most likely means more hash misses, since the tables will * be smaller than they otherwise would be. */ -#ifdef DEBUG -static int memseg_debug; -#define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) -#else -#define MEMSEG_DEBUG(...) 
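A note on the intercept mechanism added to cpu_event.c above: cpu_idle_intercept_cpu() and the check at the top of cpu_idle_enter() form a two-sided handshake over a single cpuset bit. A minimal user-level sketch of the same protocol, using C11 atomics as stand-ins for CPUSET_ATOMIC_ADD/DEL, poke_cpu() and DELAY() (the names below are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

/* Stand-in for the target CPU's bit in cpu_idle_intercept_set. */
static atomic_bool intercept_requested;

/* Control-CPU side, mirroring cpu_idle_intercept_cpu(). */
void
control_cpu(void)
{
	atomic_store(&intercept_requested, true);	/* CPUSET_ATOMIC_ADD */
	/* poke_cpu() would wake the target from a sleep state here. */
	while (atomic_load(&intercept_requested))
		;					/* DELAY(1) */
	/* The target is now parked with interrupts disabled. */
}

/* Target-CPU side, mirroring the top of cpu_idle_enter(). */
void
target_cpu(void)
{
	if (atomic_load(&intercept_requested)) {
		/* intr_clear() would disable interrupts first. */
		atomic_store(&intercept_requested, false); /* CPUSET_ATOMIC_DEL */
		for (;;)
			;				/* SMT_PAUSE() loop */
	}
}

The ordering is the point: the target clears the bit only after interrupts are off, so once the control CPU sees the bit drop it knows the target can no longer take interrupts or migrate work.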
-#endif - int kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) { @@ -341,6 +338,9 @@ mapalloc: * this may change with COD or in larger SSM systems with * nested latency groups, so we must not assume that the * node does not yet exist. + * + * Note that there may be multiple memory nodes associated with + * a single lgrp node on x86 systems. */ pnum = pt_base + tpgs - 1; mem_node_add_range(pt_base, pnum); @@ -569,8 +569,8 @@ mapalloc: (uint64_t)(tpgs) << PAGESHIFT); delspan_unreserve(pt_base, tpgs); - return (KPHYSM_OK); /* Successfully added system memory */ + return (KPHYSM_OK); /* Successfully added system memory */ } /* @@ -2566,7 +2566,7 @@ remap_to_dummy(caddr_t va, pgcnt_t metapgs) { int phase; - ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE)); + ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE)); /* * We may start remapping at a non-zero page offset @@ -2574,7 +2574,8 @@ remap_to_dummy(caddr_t va, pgcnt_t metapgs) * of the outgoing pp's could be shared by other * memsegs (see memseg_remap_meta). */ - phase = btop((uint64_t)va) % pp_dummy_npages; + phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages; + /*CONSTCOND*/ ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); while (metapgs != 0) { diff --git a/usr/src/uts/common/os/mem_config_stubs.c b/usr/src/uts/common/os/mem_config_stubs.c index d3226ff633..1a265173f4 100644 --- a/usr/src/uts/common/os/mem_config_stubs.c +++ b/usr/src/uts/common/os/mem_config_stubs.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/cmn_err.h> #include <sys/errno.h> @@ -44,6 +42,7 @@ pgcnt_t kcage_desfree; pgcnt_t kcage_needfree; pgcnt_t kcage_lotsfree = 1; +#if !defined(__x86) || defined(__xpv) /*ARGSUSED*/ int kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) @@ -56,6 +55,7 @@ void kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) { } +#endif /*ARGSUSED*/ int diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 91e2b72dc3..33cc970aea 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -85,6 +85,10 @@ #include <sys/zone.h> #include <sys/clock_impl.h> #include <sys/ddi.h> +#include <sys/modhash.h> +#include <sys/sunldi_impl.h> +#include <sys/fs/dv_node.h> +#include <sys/fs/snode.h> extern pri_t minclsyspri; @@ -9369,3 +9373,938 @@ ddi_cb_unregister(ddi_cb_handle_t hdl) return (DDI_SUCCESS); } + +/* + * Platform independent DR routines + */ + +static int +ndi2errno(int n) +{ + int err = 0; + + switch (n) { + case NDI_NOMEM: + err = ENOMEM; + break; + case NDI_BUSY: + err = EBUSY; + break; + case NDI_FAULT: + err = EFAULT; + break; + case NDI_FAILURE: + err = EIO; + break; + case NDI_SUCCESS: + break; + case NDI_BADHANDLE: + default: + err = EINVAL; + break; + } + return (err); +} + +/* + * Prom tree node list + */ +struct ptnode { + pnode_t nodeid; + struct ptnode *next; +}; + +/* + * Prom tree walk arg + */ +struct pta { + dev_info_t *pdip; + devi_branch_t *bp; + uint_t flags; + dev_info_t *fdip; + struct ptnode *head; +}; + +static void +visit_node(pnode_t nodeid, struct pta *ap) +{ + struct ptnode **nextp; + int (*select)(pnode_t, void *, uint_t); + + ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE); + + select = ap->bp->create.prom_branch_select; + + ASSERT(select); + + if (select(nodeid, ap->bp->arg, 0) == 
DDI_SUCCESS) { + + for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next) + ; + + *nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP); + + (*nextp)->nodeid = nodeid; + } + + if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD) + return; + + nodeid = prom_childnode(nodeid); + while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { + visit_node(nodeid, ap); + nodeid = prom_nextnode(nodeid); + } +} + +/* + * NOTE: The caller of this function must check for device contracts + * or LDI callbacks against this dip before setting the dip offline. + */ +static int +set_infant_dip_offline(dev_info_t *dip, void *arg) +{ + char *path = (char *)arg; + + ASSERT(dip); + ASSERT(arg); + + if (i_ddi_node_state(dip) >= DS_ATTACHED) { + (void) ddi_pathname(dip, path); + cmn_err(CE_WARN, "Attempt to set offline flag on attached " + "node: %s", path); + return (DDI_FAILURE); + } + + mutex_enter(&(DEVI(dip)->devi_lock)); + if (!DEVI_IS_DEVICE_OFFLINE(dip)) + DEVI_SET_DEVICE_OFFLINE(dip); + mutex_exit(&(DEVI(dip)->devi_lock)); + + return (DDI_SUCCESS); +} + +typedef struct result { + char *path; + int result; +} result_t; + +static int +dip_set_offline(dev_info_t *dip, void *arg) +{ + int end; + result_t *resp = (result_t *)arg; + + ASSERT(dip); + ASSERT(resp); + + /* + * We stop the walk if e_ddi_offline_notify() returns + * failure, because this implies that one or more consumers + * (either LDI or contract based) have blocked the offline. + * So there is no point in continuing the walk. + */ + if (e_ddi_offline_notify(dip) == DDI_FAILURE) { + resp->result = DDI_FAILURE; + return (DDI_WALK_TERMINATE); + } + + /* + * If set_infant_dip_offline() returns failure, it implies + * that we failed to set a particular dip offline. This + * does not imply that the offline as a whole should fail. + * We want to do the best we can, so we continue the walk. + */ + if (set_infant_dip_offline(dip, resp->path) == DDI_SUCCESS) + end = DDI_SUCCESS; + else + end = DDI_FAILURE; + + e_ddi_offline_finalize(dip, end); + + return (DDI_WALK_CONTINUE); +} + +/* + * The call to e_ddi_offline_notify() exists for the + * unlikely error case that a branch we are trying to + * create already exists and has device contracts or LDI + * event callbacks against it. + * + * We allow create to succeed for such branches only if + * no constraints block the offline.
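dip_set_offline() above is a two-phase pattern: every consumer is asked first, a veto terminates the walk, and e_ddi_offline_finalize() is always paired with a completed notify. A condensed sketch of the control flow, where notify_consumers(), try_offline() and finalize() are hypothetical stand-ins for e_ddi_offline_notify(), set_infant_dip_offline() and e_ddi_offline_finalize():

enum { WALK_CONTINUE, WALK_TERMINATE };
enum { OK = 0, FAILED = -1 };

/* Hypothetical stand-ins for the DDI offline protocol calls. */
static int notify_consumers(void *node) { (void)node; return (OK); }
static int try_offline(void *node) { (void)node; return (OK); }
static void finalize(void *node, int result) { (void)node; (void)result; }

int
offline_one_node(void *node)
{
	/* Phase 1: any LDI or contract consumer may veto the offline. */
	if (notify_consumers(node) == FAILED)
		return (WALK_TERMINATE);	/* one veto stops the walk */

	/* Phase 2: best effort; a failure here does not stop the walk. */
	finalize(node, try_offline(node));
	return (WALK_CONTINUE);
}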
+ */ +static int +branch_set_offline(dev_info_t *dip, char *path) +{ + int circ; + int end; + result_t res; + + + if (e_ddi_offline_notify(dip) == DDI_FAILURE) { + return (DDI_FAILURE); + } + + if (set_infant_dip_offline(dip, path) == DDI_SUCCESS) + end = DDI_SUCCESS; + else + end = DDI_FAILURE; + + e_ddi_offline_finalize(dip, end); + + if (end == DDI_FAILURE) + return (DDI_FAILURE); + + res.result = DDI_SUCCESS; + res.path = path; + + ndi_devi_enter(dip, &circ); + ddi_walk_devs(ddi_get_child(dip), dip_set_offline, &res); + ndi_devi_exit(dip, circ); + + return (res.result); +} + +/*ARGSUSED*/ +static int +create_prom_branch(void *arg, int has_changed) +{ + int circ; + int exists, rv; + pnode_t nodeid; + struct ptnode *tnp; + dev_info_t *dip; + struct pta *ap = arg; + devi_branch_t *bp; + char *path; + + ASSERT(ap); + ASSERT(ap->fdip == NULL); + ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip)); + + bp = ap->bp; + + nodeid = ddi_get_nodeid(ap->pdip); + if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) { + cmn_err(CE_WARN, "create_prom_branch: invalid " + "nodeid: 0x%x", nodeid); + return (EINVAL); + } + + ap->head = NULL; + + nodeid = prom_childnode(nodeid); + while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { + visit_node(nodeid, ap); + nodeid = prom_nextnode(nodeid); + } + + if (ap->head == NULL) + return (ENODEV); + + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + rv = 0; + while ((tnp = ap->head) != NULL) { + ap->head = tnp->next; + + ndi_devi_enter(ap->pdip, &circ); + + /* + * Check if the branch already exists. + */ + exists = 0; + dip = e_ddi_nodeid_to_dip(tnp->nodeid); + if (dip != NULL) { + exists = 1; + + /* Parent is held busy, so release hold */ + ndi_rele_devi(dip); +#ifdef DEBUG + cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists" + " for nodeid 0x%x", (void *)dip, tnp->nodeid); +#endif + } else { + dip = i_ddi_create_branch(ap->pdip, tnp->nodeid); + } + + kmem_free(tnp, sizeof (struct ptnode)); + + /* + * Hold the branch if it is not already held + */ + if (dip && !exists) { + e_ddi_branch_hold(dip); + } + + ASSERT(dip == NULL || e_ddi_branch_held(dip)); + + /* + * Set all dips in the newly created branch offline so that + * only a "configure" operation can attach + * the branch + */ + if (dip == NULL || branch_set_offline(dip, path) + == DDI_FAILURE) { + ndi_devi_exit(ap->pdip, circ); + rv = EIO; + continue; + } + + ASSERT(ddi_get_parent(dip) == ap->pdip); + + ndi_devi_exit(ap->pdip, circ); + + if (ap->flags & DEVI_BRANCH_CONFIGURE) { + int error = e_ddi_branch_configure(dip, &ap->fdip, 0); + if (error && rv == 0) + rv = error; + } + + /* + * Invoke devi_branch_callback() (if it exists) only for + * newly created branches + */ + if (bp->devi_branch_callback && !exists) + bp->devi_branch_callback(dip, bp->arg, 0); + } + + kmem_free(path, MAXPATHLEN); + + return (rv); +} + +static int +sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp) +{ + int rv, circ, len; + int i, flags, ret; + dev_info_t *dip; + char *nbuf; + char *path; + static const char *noname = "<none>"; + + ASSERT(pdip); + ASSERT(DEVI_BUSY_OWNED(pdip)); + + flags = 0; + + /* + * Creating the root of a branch ? 
+ */ + if (rdipp) { + *rdipp = NULL; + flags = DEVI_BRANCH_ROOT; + } + + ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip); + rv = bp->create.sid_branch_create(dip, bp->arg, flags); + + nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP); + + if (rv == DDI_WALK_ERROR) { + cmn_err(CE_WARN, "e_ddi_branch_create: Error setting" + " properties on devinfo node %p", (void *)dip); + goto fail; + } + + len = OBP_MAXDRVNAME; + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len) + != DDI_PROP_SUCCESS) { + cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has " + "no name property", (void *)dip); + goto fail; + } + + ASSERT(i_ddi_node_state(dip) == DS_PROTO); + if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) { + cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)" + " for devinfo node %p", nbuf, (void *)dip); + goto fail; + } + + kmem_free(nbuf, OBP_MAXDRVNAME); + + /* + * Ignore bind failures just like boot does + */ + (void) ndi_devi_bind_driver(dip, 0); + + switch (rv) { + case DDI_WALK_CONTINUE: + case DDI_WALK_PRUNESIB: + ndi_devi_enter(dip, &circ); + + i = DDI_WALK_CONTINUE; + for (; i == DDI_WALK_CONTINUE; ) { + i = sid_node_create(dip, bp, NULL); + } + + ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB); + if (i == DDI_WALK_ERROR) + rv = i; + /* + * If PRUNESIB, stop creating siblings + * of dip's child. Subsequent walk behavior + * is determined by rv returned by dip. + */ + + ndi_devi_exit(dip, circ); + break; + case DDI_WALK_TERMINATE: + /* + * Don't create children and ask our parent + * not to create siblings either. + */ + rv = DDI_WALK_PRUNESIB; + break; + case DDI_WALK_PRUNECHILD: + /* + * Don't create children, but ask parent to continue + * with siblings. + */ + rv = DDI_WALK_CONTINUE; + break; + default: + ASSERT(0); + break; + } + + if (rdipp) + *rdipp = dip; + + /* + * Set device offline - only the "configure" op should cause an attach. + * Note that it is safe to set the dip offline without checking + * for either device contract or layered driver (LDI) based constraints + * since there cannot be any contracts or LDI opens of this device. + * This is because this node is a newly created dip with the parent busy + * held, so no other thread can come in and attach this dip. A dip that + * has never been attached cannot have contracts since by definition + * a device contract (an agreement between a process and a device minor + * node) can only be created against a device that has minor nodes + * i.e. is attached. Similarly an LDI open will only succeed if the + * dip is attached. We assert below that the dip is not attached.
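The switch in sid_node_create() above decides what walk code a node hands back to its parent based on what its callback returned for the node itself. The mapping in isolation, as a sketch (the enum is a stand-in for the DDI_WALK_* constants):

enum walk_code { WALK_CONTINUE, WALK_PRUNESIB, WALK_PRUNECHILD,
    WALK_TERMINATE, WALK_ERROR };

static enum walk_code
translate_walk_code(enum walk_code rv)
{
	switch (rv) {
	case WALK_TERMINATE:
		return (WALK_PRUNESIB);		/* no children, no siblings */
	case WALK_PRUNECHILD:
		return (WALK_CONTINUE);		/* no children, siblings OK */
	default:
		/*
		 * CONTINUE/PRUNESIB: children are created first; the
		 * original rv (or WALK_ERROR) then propagates upward.
		 */
		return (rv);
	}
}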
+ */ + ASSERT(i_ddi_node_state(dip) < DS_ATTACHED); + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + ret = set_infant_dip_offline(dip, path); + ASSERT(ret == DDI_SUCCESS); + kmem_free(path, MAXPATHLEN); + + return (rv); +fail: + (void) ndi_devi_free(dip); + kmem_free(nbuf, OBP_MAXDRVNAME); + return (DDI_WALK_ERROR); +} + +static int +create_sid_branch( + dev_info_t *pdip, + devi_branch_t *bp, + dev_info_t **dipp, + uint_t flags) +{ + int rv = 0, state = DDI_WALK_CONTINUE; + dev_info_t *rdip; + + while (state == DDI_WALK_CONTINUE) { + int circ; + + ndi_devi_enter(pdip, &circ); + + state = sid_node_create(pdip, bp, &rdip); + if (rdip == NULL) { + ndi_devi_exit(pdip, circ); + ASSERT(state == DDI_WALK_ERROR); + break; + } + + e_ddi_branch_hold(rdip); + + ndi_devi_exit(pdip, circ); + + if (flags & DEVI_BRANCH_CONFIGURE) { + int error = e_ddi_branch_configure(rdip, dipp, 0); + if (error && rv == 0) + rv = error; + } + + /* + * devi_branch_callback() is optional + */ + if (bp->devi_branch_callback) + bp->devi_branch_callback(rdip, bp->arg, 0); + } + + ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB); + + return (state == DDI_WALK_ERROR ? EIO : rv); +} + +int +e_ddi_branch_create( + dev_info_t *pdip, + devi_branch_t *bp, + dev_info_t **dipp, + uint_t flags) +{ + int prom_devi, sid_devi, error; + + if (pdip == NULL || bp == NULL || bp->type == 0) + return (EINVAL); + + prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0; + sid_devi = (bp->type == DEVI_BRANCH_SID) ? 1 : 0; + + if (prom_devi && bp->create.prom_branch_select == NULL) + return (EINVAL); + else if (sid_devi && bp->create.sid_branch_create == NULL) + return (EINVAL); + else if (!prom_devi && !sid_devi) + return (EINVAL); + + if (flags & DEVI_BRANCH_EVENT) + return (EINVAL); + + if (prom_devi) { + struct pta pta = {0}; + + pta.pdip = pdip; + pta.bp = bp; + pta.flags = flags; + + error = prom_tree_access(create_prom_branch, &pta, NULL); + + if (dipp) + *dipp = pta.fdip; + else if (pta.fdip) + ndi_rele_devi(pta.fdip); + } else { + error = create_sid_branch(pdip, bp, dipp, flags); + } + + return (error); +} + +int +e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags) +{ + int rv; + char *devnm; + dev_info_t *pdip; + + if (dipp) + *dipp = NULL; + + if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT)) + return (EINVAL); + + pdip = ddi_get_parent(rdip); + + ndi_hold_devi(pdip); + + if (!e_ddi_branch_held(rdip)) { + ndi_rele_devi(pdip); + cmn_err(CE_WARN, "e_ddi_branch_configure: " + "dip(%p) not held", (void *)rdip); + return (EINVAL); + } + + if (i_ddi_node_state(rdip) < DS_INITIALIZED) { + /* + * First attempt to bind a driver. 
If we fail, return + * success (On some platforms, dips for some device + * types (CPUs) may not have a driver) + */ + if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) { + ndi_rele_devi(pdip); + return (0); + } + + if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) { + rv = NDI_FAILURE; + goto out; + } + } + + ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED); + + devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); + + (void) ddi_deviname(rdip, devnm); + + if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip, + NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) { + /* release hold from ndi_devi_config_one() */ + ndi_rele_devi(rdip); + } + + kmem_free(devnm, MAXNAMELEN + 1); +out: + if (rv != NDI_SUCCESS && dipp && rdip) { + ndi_hold_devi(rdip); + *dipp = rdip; + } + ndi_rele_devi(pdip); + return (ndi2errno(rv)); +} + +void +e_ddi_branch_hold(dev_info_t *rdip) +{ + if (e_ddi_branch_held(rdip)) { + cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held"); + return; + } + + mutex_enter(&DEVI(rdip)->devi_lock); + if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) { + DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD; + DEVI(rdip)->devi_ref++; + } + ASSERT(DEVI(rdip)->devi_ref > 0); + mutex_exit(&DEVI(rdip)->devi_lock); +} + +int +e_ddi_branch_held(dev_info_t *rdip) +{ + int rv = 0; + + mutex_enter(&DEVI(rdip)->devi_lock); + if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) && + DEVI(rdip)->devi_ref > 0) { + rv = 1; + } + mutex_exit(&DEVI(rdip)->devi_lock); + + return (rv); +} + +void +e_ddi_branch_rele(dev_info_t *rdip) +{ + mutex_enter(&DEVI(rdip)->devi_lock); + DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD; + DEVI(rdip)->devi_ref--; + mutex_exit(&DEVI(rdip)->devi_lock); +} + +int +e_ddi_branch_unconfigure( + dev_info_t *rdip, + dev_info_t **dipp, + uint_t flags) +{ + int circ, rv; + int destroy; + char *devnm; + uint_t nflags; + dev_info_t *pdip; + + if (dipp) + *dipp = NULL; + + if (rdip == NULL) + return (EINVAL); + + pdip = ddi_get_parent(rdip); + + ASSERT(pdip); + + /* + * Check if caller holds pdip busy - can cause deadlocks during + * devfs_clean() + */ + if (DEVI_BUSY_OWNED(pdip)) { + cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent" + " devinfo node(%p) is busy held", (void *)pdip); + return (EINVAL); + } + + destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0; + + devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); + + ndi_devi_enter(pdip, &circ); + (void) ddi_deviname(rdip, devnm); + ndi_devi_exit(pdip, circ); + + /* + * ddi_deviname() returns a component name with / prepended. + */ + (void) devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE); + + ndi_devi_enter(pdip, &circ); + + /* + * Recreate device name as it may have changed state (init/uninit) + * when parent busy lock was dropped for devfs_clean() + */ + (void) ddi_deviname(rdip, devnm); + + if (!e_ddi_branch_held(rdip)) { + kmem_free(devnm, MAXNAMELEN + 1); + ndi_devi_exit(pdip, circ); + cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held", + destroy ? "destroy" : "unconfigure", (void *)rdip); + return (EINVAL); + } + + /* + * Release hold on the branch. This is ok since we are holding the + * parent busy. If rdip is not removed, we must do a hold on the + * branch before returning. 
+ */ + e_ddi_branch_rele(rdip); + + nflags = NDI_DEVI_OFFLINE; + if (destroy || (flags & DEVI_BRANCH_DESTROY)) { + nflags |= NDI_DEVI_REMOVE; + destroy = 1; + } else { + nflags |= NDI_UNCONFIG; /* uninit but don't remove */ + } + + if (flags & DEVI_BRANCH_EVENT) + nflags |= NDI_POST_EVENT; + + if (i_ddi_devi_attached(pdip) && + (i_ddi_node_state(rdip) >= DS_INITIALIZED)) { + rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags); + } else { + rv = e_ddi_devi_unconfig(rdip, dipp, nflags); + if (rv == NDI_SUCCESS) { + ASSERT(!destroy || ddi_get_child(rdip) == NULL); + rv = ndi_devi_offline(rdip, nflags); + } + } + + if (!destroy || rv != NDI_SUCCESS) { + /* The dip still exists, so do a hold */ + e_ddi_branch_hold(rdip); + } +out: + kmem_free(devnm, MAXNAMELEN + 1); + ndi_devi_exit(pdip, circ); + return (ndi2errno(rv)); +} + +int +e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag) +{ + return (e_ddi_branch_unconfigure(rdip, dipp, + flag|DEVI_BRANCH_DESTROY)); +} + +/* + * Number of chains for hash table + */ +#define NUMCHAINS 17 + +/* + * Devinfo busy arg + */ +struct devi_busy { + int dv_total; + int s_total; + mod_hash_t *dv_hash; + mod_hash_t *s_hash; + int (*callback)(dev_info_t *, void *, uint_t); + void *arg; +}; + +static int +visit_dip(dev_info_t *dip, void *arg) +{ + uintptr_t sbusy, dvbusy, ref; + struct devi_busy *bsp = arg; + + ASSERT(bsp->callback); + + /* + * A dip cannot be busy if its reference count is 0 + */ + if ((ref = e_ddi_devi_holdcnt(dip)) == 0) { + return (bsp->callback(dip, bsp->arg, 0)); + } + + if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy)) + dvbusy = 0; + + /* + * To catch device opens currently maintained on specfs common snodes. + */ + if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) + sbusy = 0; + +#ifdef DEBUG + if (ref < sbusy || ref < dvbusy) { + cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu " + "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref); + } +#endif + + dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy; + + return (bsp->callback(dip, bsp->arg, dvbusy)); +} + +static int +visit_snode(struct snode *sp, void *arg) +{ + uintptr_t sbusy; + dev_info_t *dip; + int count; + struct devi_busy *bsp = arg; + + ASSERT(sp); + + /* + * The stable lock is held. This prevents + * the snode and its associated dip from + * going away. 
+ */ + dip = NULL; + count = spec_devi_open_count(sp, &dip); + + if (count <= 0) + return (DDI_WALK_CONTINUE); + + ASSERT(dip); + + if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) + sbusy = count; + else + sbusy += count; + + if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) { + cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, " + "sbusy = %lu", "e_ddi_branch_referenced", + (void *)dip, sbusy); + } + + bsp->s_total += count; + + return (DDI_WALK_CONTINUE); +} + +static void +visit_dvnode(struct dv_node *dv, void *arg) +{ + uintptr_t dvbusy; + uint_t count; + struct vnode *vp; + struct devi_busy *bsp = arg; + + ASSERT(dv && dv->dv_devi); + + vp = DVTOV(dv); + + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + + if (!count) + return; + + if (mod_hash_remove(bsp->dv_hash, dv->dv_devi, + (mod_hash_val_t *)&dvbusy)) + dvbusy = count; + else + dvbusy += count; + + if (mod_hash_insert(bsp->dv_hash, dv->dv_devi, + (mod_hash_val_t)dvbusy)) { + cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, " + "dvbusy=%lu", "e_ddi_branch_referenced", + (void *)dv->dv_devi, dvbusy); + } + + bsp->dv_total += count; +} + +/* + * Returns reference count on success or -1 on failure. + */ +int +e_ddi_branch_referenced( + dev_info_t *rdip, + int (*callback)(dev_info_t *dip, void *arg, uint_t ref), + void *arg) +{ + int circ; + char *path; + dev_info_t *pdip; + struct devi_busy bsa = {0}; + + ASSERT(rdip); + + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + ndi_hold_devi(rdip); + + pdip = ddi_get_parent(rdip); + + ASSERT(pdip); + + /* + * Check if caller holds pdip busy - can cause deadlocks during + * devfs_walk() + */ + if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) { + cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: " + "devinfo branch(%p) not held or parent busy held", + (void *)rdip); + ndi_rele_devi(rdip); + kmem_free(path, MAXPATHLEN); + return (-1); + } + + ndi_devi_enter(pdip, &circ); + (void) ddi_pathname(rdip, path); + ndi_devi_exit(pdip, circ); + + bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS, + mod_hash_null_valdtor, sizeof (struct dev_info)); + + bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS, + mod_hash_null_valdtor, sizeof (struct snode)); + + if (devfs_walk(path, visit_dvnode, &bsa)) { + cmn_err(CE_WARN, "e_ddi_branch_referenced: " + "devfs walk failed for: %s", path); + kmem_free(path, MAXPATHLEN); + bsa.s_total = bsa.dv_total = -1; + goto out; + } + + kmem_free(path, MAXPATHLEN); + + /* + * Walk the snode table to detect device opens, which are currently + * maintained on specfs common snodes. + */ + spec_snode_walk(visit_snode, &bsa); + + if (callback == NULL) + goto out; + + bsa.callback = callback; + bsa.arg = arg; + + if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) { + ndi_devi_enter(rdip, &circ); + ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa); + ndi_devi_exit(rdip, circ); + } + +out: + ndi_rele_devi(rdip); + mod_hash_destroy_ptrhash(bsa.s_hash); + mod_hash_destroy_ptrhash(bsa.dv_hash); + return (bsa.s_total > bsa.dv_total ? bsa.s_total : bsa.dv_total); +} diff --git a/usr/src/uts/common/sys/esunddi.h b/usr/src/uts/common/sys/esunddi.h index 3cde0b2b14..e2f860d520 100644 --- a/usr/src/uts/common/sys/esunddi.h +++ b/usr/src/uts/common/sys/esunddi.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -26,9 +25,6 @@ #ifndef _SYS_ESUNDDI_H #define _SYS_ESUNDDI_H - -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 */ - #include <sys/sunddi.h> #include <sys/proc.h> #include <sys/autoconf.h> @@ -214,6 +210,41 @@ dev_info_t * e_ddi_nodeid_to_dip(pnode_t nodeid); /* + * Defines for DR interfaces + */ +#define DEVI_BRANCH_CHILD 0x01 /* Walk immediate children of root */ +#define DEVI_BRANCH_CONFIGURE 0x02 /* Configure branch after create */ +#define DEVI_BRANCH_DESTROY 0x04 /* Destroy branch after unconfigure */ +#define DEVI_BRANCH_EVENT 0x08 /* Post NDI event */ +#define DEVI_BRANCH_PROM 0x10 /* Branches derived from PROM nodes */ +#define DEVI_BRANCH_SID 0x20 /* SID node branches */ +#define DEVI_BRANCH_ROOT 0x40 /* Node is the root of a branch */ + +typedef struct devi_branch { + void *arg; + void (*devi_branch_callback)(dev_info_t *, void *, uint_t); + int type; + union { + int (*prom_branch_select)(pnode_t, void *, uint_t); + int (*sid_branch_create)(dev_info_t *, void *, uint_t); + } create; +} devi_branch_t; + +extern int e_ddi_branch_create(dev_info_t *pdip, devi_branch_t *bp, + dev_info_t **dipp, uint_t flags); +extern int e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, + uint_t flags); +extern int e_ddi_branch_unconfigure(dev_info_t *rdip, dev_info_t **dipp, + uint_t flags); +extern int e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, + uint_t flags); +extern void e_ddi_branch_hold(dev_info_t *rdip); +extern void e_ddi_branch_rele(dev_info_t *rdip); +extern int e_ddi_branch_held(dev_info_t *rdip); +extern int e_ddi_branch_referenced(dev_info_t *rdip, + int (*cb)(dev_info_t *dip, void *, uint_t), void *arg); + +/* * Obsolete interfaces, no longer used, to be removed. * Retained only for driver compatibility. 
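Taken together, the declarations above define a branch lifecycle for platform DR code: create the branch held and offline, configure it to attach, and unconfigure or destroy it to remove. A sketch of creating and attaching a one-node SID branch; my_sbd_create() and attach_board_branch() are hypothetical, and the flag choice is illustrative:

#include <sys/note.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>

/* Hypothetical callback: decorate the freshly allocated SID node. */
static int
my_sbd_create(dev_info_t *dip, void *arg, uint_t flags)
{
	_NOTE(ARGUNUSED(arg, flags));

	/* Set the "name" property and any bus-specific properties on dip. */
	return (DDI_WALK_TERMINATE);	/* one node: no children, stop */
}

static int
attach_board_branch(dev_info_t *pdip)
{
	devi_branch_t br;
	dev_info_t *dip = NULL;

	br.arg = NULL;
	br.devi_branch_callback = NULL;
	br.type = DEVI_BRANCH_SID;
	br.create.sid_branch_create = my_sbd_create;

	/* The branch is created held and offline, then attached. */
	return (e_ddi_branch_create(pdip, &br, &dip, DEVI_BRANCH_CONFIGURE));
}

Teardown is the mirror image: e_ddi_branch_referenced() to look for outstanding opens, then e_ddi_branch_destroy(), which is e_ddi_branch_unconfigure() with DEVI_BRANCH_DESTROY set.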
*/ diff --git a/usr/src/uts/common/sys/pci_impl.h b/usr/src/uts/common/sys/pci_impl.h index d485800b96..429d6860bd 100644 --- a/usr/src/uts/common/sys/pci_impl.h +++ b/usr/src/uts/common/sys/pci_impl.h @@ -116,6 +116,9 @@ extern struct pci_bus_resource *pci_bus_res; /* * For now, x86-only to avoid conflicts with <sys/memlist_impl.h> */ +#define memlist_find memlist_find_pci +#define memlist_insert memlist_insert_pci + extern struct memlist *memlist_alloc(void); extern void memlist_free(struct memlist *); extern void memlist_free_all(struct memlist **); diff --git a/usr/src/uts/common/sys/processor.h b/usr/src/uts/common/sys/processor.h index 3a76c8c9b4..c0fe6e21b8 100644 --- a/usr/src/uts/common/sys/processor.h +++ b/usr/src/uts/common/sys/processor.h @@ -32,8 +32,6 @@ #ifndef _SYS_PROCESSOR_H #define _SYS_PROCESSOR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/procset.h> @@ -140,6 +138,7 @@ extern lgrpid_t gethomelgroup(); * Internal interface prototypes */ extern int p_online_internal(processorid_t, int, int *); +extern int p_online_internal_locked(processorid_t, int, int *); #endif /* !_KERNEL */ diff --git a/usr/src/uts/common/syscall/p_online.c b/usr/src/uts/common/syscall/p_online.c index 004627569c..88a0340bf7 100644 --- a/usr/src/uts/common/syscall/p_online.c +++ b/usr/src/uts/common/syscall/p_online.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/var.h> @@ -49,7 +46,7 @@ * P_FAULTED */ int -p_online_internal(processorid_t cpun, int new_status, int *old_status) +p_online_internal_locked(processorid_t cpun, int new_status, int *old_status) { cpu_t *cp; int status; @@ -59,7 +56,7 @@ p_online_internal(processorid_t cpun, int new_status, int *old_status) /* * Try to get a pointer to the requested CPU structure. */ - mutex_enter(&cpu_lock); /* protects CPU states */ + ASSERT(MUTEX_HELD(&cpu_lock)); if ((cp = cpu_get(cpun)) == NULL) { error = EINVAL; goto out; @@ -219,10 +216,21 @@ p_online_internal(processorid_t cpun, int new_status, int *old_status) break; } out: - mutex_exit(&cpu_lock); return (error); } +int +p_online_internal(processorid_t cpun, int new_status, int *old_status) +{ + int rc; + + mutex_enter(&cpu_lock); /* protects CPU states */ + rc = p_online_internal_locked(cpun, new_status, old_status); + mutex_exit(&cpu_lock); + + return (rc); +} + /* * p_online(2) - get/change processor operational status. 
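The p_online_internal() change above is the standard locked-variant split: DR code paths that already hold cpu_lock call p_online_internal_locked() directly, while the historical entry point keeps its name and becomes a thin wrapper that takes the lock. The shape of the refactoring, sketched with a POSIX mutex (do_transition() stands in for the original function body):

#include <pthread.h>

static pthread_mutex_t cpu_lock = PTHREAD_MUTEX_INITIALIZER;

static int
do_transition(int cpun, int new_status)
{
	(void) cpun;
	return (new_status);	/* stand-in for the real state machine */
}

/* Callers that already hold cpu_lock use this variant directly. */
static int
p_online_locked(int cpun, int new_status)
{
	/* the kernel version asserts MUTEX_HELD(&cpu_lock) here */
	return (do_transition(cpun, new_status));
}

/* The historical entry point becomes a thin locking wrapper. */
int
p_online(int cpun, int new_status)
{
	int rc;

	(void) pthread_mutex_lock(&cpu_lock);
	rc = p_online_locked(cpun, new_status);
	(void) pthread_mutex_unlock(&cpu_lock);
	return (rc);
}

The cpupm_redefine_max_activepwr_state() hunk further up takes the other common route for the same problem, testing MUTEX_HELD() at run time; the explicit _locked variant is usually preferred because the caller's locking contract stays visible in the name.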
* diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index 8739e8a366..298b8d4415 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -1085,8 +1085,8 @@ struct memseg { page_t *pages, *epages; /* [from, to] in page array */ pfn_t pages_base, pages_end; /* [from, to] in page numbers */ struct memseg *next; /* next segment in list */ -#if defined(__sparc) struct memseg *lnext; /* next segment in deleted list */ +#if defined(__sparc) uint64_t pagespa, epagespa; /* [from, to] page array physical */ uint64_t nextpa; /* physical next pointer */ pfn_t kpm_pbase; /* start of kpm range */ @@ -1096,8 +1096,8 @@ struct memseg { kpm_spage_t *kpm_spgs; /* ptr to kpm_spage array */ } mseg_un; uint64_t kpm_pagespa; /* physical ptr to kpm (s)pages array */ - uint_t msegflags; /* memseg flags */ #endif /* __sparc */ + uint_t msegflags; /* memseg flags */ }; /* memseg union aliases */ diff --git a/usr/src/uts/common/vm/seg_kpm.h b/usr/src/uts/common/vm/seg_kpm.h index 0b766bbaf4..b61980df01 100644 --- a/usr/src/uts/common/vm/seg_kpm.h +++ b/usr/src/uts/common/vm/seg_kpm.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -27,8 +26,6 @@ #ifndef _VM_SEG_KPM_H #define _VM_SEG_KPM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -85,8 +82,13 @@ extern int segmap_kpm; #define IS_KPM_ADDR(addr) \ ((addr) >= segkpm->s_base && (addr) < (segkpm->s_base + segkpm->s_size)) +#ifdef __x86 +/* x86 systems use neither kpm_page_t nor kpm_spage_t when supporting kpm. */ +#define KPMPAGE_T_SZ (0) +#else /* __x86 */ #define KPMPAGE_T_SZ \ ((kpm_smallpages == 0) ? 
sizeof (kpm_page_t) : sizeof (kpm_spage_t)) +#endif /* __x86 */ #else /* SEGKPM_SUPPORT */ diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 8a1cf4b2b7..eeaaa379ab 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -80,7 +80,10 @@ CORE_OBJS += \ mach_kdi.o \ mach_sysconfig.o \ machdep.o \ + mem_config.o \ mem_config_stubs.o \ + mem_config_arch.o \ + memlist_new.o \ memnode.o \ microcode.o \ microfind.o \ @@ -187,7 +190,7 @@ PCPLUSMP_OBJS += apic.o apic_regops.o psm_common.o apic_introp.o \ mp_platform_common.o hpet_acpi.o ACPI_DRV_OBJS += acpi_drv.o acpi_video.o -ACPINEX_OBJS += acpinex_drv.o +ACPINEX_OBJS += acpinex_drv.o acpinex_event.o CPUDRV_OBJS += \ cpudrv.o \ @@ -200,10 +203,16 @@ ACPIDEV_OBJS += acpidev_drv.o \ acpidev_scope.o acpidev_device.o \ acpidev_container.o \ acpidev_cpu.o \ + acpidev_dr.o \ acpidev_memory.o \ + acpidev_pci.o \ acpidev_resource.o \ acpidev_util.o +DRMACH_ACPI_OBJS += drmach_acpi.o dr_util.o drmach_err.o + +DR_OBJS += dr.o dr_cpu.o dr_err.o dr_io.o dr_mem_acpi.o dr_quiesce.o dr_util.o + ROOTNEX_OBJS += rootnex.o immu.o immu_dmar.o immu_dvma.o \ immu_intrmap.o immu_qinv.o immu_regs.o diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared index 89b47413f3..a8fbb83fdf 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc.shared +++ b/usr/src/uts/i86pc/Makefile.i86pc.shared @@ -61,6 +61,9 @@ LINTS_DIR = $(OBJS_DIR) LINT_LIB_DIR = $(UTSBASE)/$(PLATFORM)/lint-libs/$(OBJS_DIR) GEN_LINT_LIB_DIR = $(UTSBASE)/intel/lint-libs/$(OBJS_DIR) +LINT32_DIRS = $(LINT32_BUILDS:%=$(UTSBASE)/$(PLATFORM)/lint-libs/%) +LINT32_FILES = $(LINT32_DIRS:%=%/llib-l$(MODULE).ln) + DTRACESTUBS_O = $(OBJS_DIR)/dtracestubs.o DTRACESTUBS = $(OBJS_DIR)/libdtracestubs.so @@ -256,6 +259,7 @@ DRV_KMODS += tzmon DRV_KMODS += acpi_drv DRV_KMODS += acpinex DRV_KMODS += amd_iommu +DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe @@ -311,6 +315,7 @@ SYS_KMODS += # MISC_KMODS += gfx_private pcie MISC_KMODS += acpidev +MISC_KMODS += drmach_acpi # # 'Dacf' modules (/kernel/dacf) diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index ab82a3537f..592250db83 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -79,10 +79,42 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/acpi/acpinex/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +SBD_IOCTL = $(UTSBASE)/i86pc/sys/sbd_ioctl.h +DRMACH_IO = $(UTSBASE)/i86pc/io/acpi/drmach_acpi +DRMACH_GENERR = $(DRMACH_IO)/sbdgenerr +DR_IO = $(UTSBASE)/i86pc/io/dr +DR_GENERR = $(DR_IO)/sbdgenerr + +$(DRMACH_GENERR): $(DR_IO)/sbdgenerr.pl + $(RM) $@ + $(CAT) $(DR_IO)/sbdgenerr.pl > $@ + $(CHMOD) +x $@ + +$(DRMACH_IO)/drmach_err.c: $(DRMACH_GENERR) $(SBD_IOCTL) + $(RM) $@ + $(DRMACH_GENERR) EX86 < $(SBD_IOCTL) > $(DRMACH_IO)/drmach_err.c + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/acpi/drmach_acpi/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/amd_iommu/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(DR_GENERR): $(DR_IO)/sbdgenerr.pl + $(RM) $@ + $(CAT) $(DR_IO)/sbdgenerr.pl > $@ + $(CHMOD) +x $@ + +$(DR_IO)/dr_err.c: $(DR_GENERR) $(SBD_IOCTL) + $(RM) $@ + $(DR_GENERR) ESBD < $(SBD_IOCTL) > $(DR_IO)/dr_err.c + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/dr/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -307,9 +339,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi/acpidev/%.c $(LINTS_DIR)/%.ln: 
$(UTSBASE)/i86pc/io/acpi/acpinex/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi/drmach_acpi/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/amd_iommu/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/dr/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu.h b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu.h index f5d852b01a..c520e06be7 100644 --- a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu.h +++ b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu.h @@ -189,6 +189,7 @@ struct gcpu_chipshared { kmutex_t gcpus_cfglock; /* serial MCA config from chip cores */ kmutex_t gcpus_poll_lock; /* serialize pollers on the same chip */ uint32_t gcpus_actv_banks; /* MCA bank numbers active on chip */ + volatile uint32_t gcpus_actv_cnt; /* active cpu count in this chip */ }; struct gcpu_data { @@ -205,11 +206,13 @@ struct regs; * CMI implementation */ extern int gcpu_init(cmi_hdl_t, void **); +extern void gcpu_fini(cmi_hdl_t); extern void gcpu_post_startup(cmi_hdl_t); extern void gcpu_post_mpstartup(cmi_hdl_t); extern void gcpu_faulted_enter(cmi_hdl_t); extern void gcpu_faulted_exit(cmi_hdl_t); extern void gcpu_mca_init(cmi_hdl_t); +extern void gcpu_mca_fini(cmi_hdl_t hdl); extern cmi_errno_t gcpu_msrinject(cmi_hdl_t, cmi_mca_regs_t *, uint_t, int); #ifndef __xpv extern uint64_t gcpu_mca_trap(cmi_hdl_t, struct regs *); @@ -228,6 +231,7 @@ extern int cmi_enable_cmci; * Local functions */ extern void gcpu_mca_poll_init(cmi_hdl_t); +extern void gcpu_mca_poll_fini(cmi_hdl_t); extern void gcpu_mca_poll_start(cmi_hdl_t); extern void gcpu_poll_trace_init(gcpu_poll_trace_ctl_t *); extern void gcpu_poll_trace(gcpu_poll_trace_ctl_t *, uint8_t, uint8_t); diff --git a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_main.c b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_main.c index db4a695b7d..761d942f27 100644 --- a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_main.c +++ b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_main.c @@ -23,6 +23,10 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * Generic x86 CPU Module @@ -78,22 +82,52 @@ gcpu_init(cmi_hdl_t hdl, void **datap) */ if ((sp = gcpu_shared[chipid]) == NULL) { sp = kmem_zalloc(sizeof (struct gcpu_chipshared), KM_SLEEP); + mutex_init(&sp->gcpus_poll_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&sp->gcpus_cfglock, NULL, MUTEX_DRIVER, NULL); osp = atomic_cas_ptr(&gcpu_shared[chipid], NULL, sp); - if (osp == NULL) { - mutex_init(&sp->gcpus_poll_lock, NULL, MUTEX_DRIVER, - NULL); - mutex_init(&sp->gcpus_cfglock, NULL, MUTEX_DRIVER, - NULL); - } else { + if (osp != NULL) { + mutex_destroy(&sp->gcpus_cfglock); + mutex_destroy(&sp->gcpus_poll_lock); kmem_free(sp, sizeof (struct gcpu_chipshared)); sp = osp; } } + + atomic_inc_32(&sp->gcpus_actv_cnt); gcpu->gcpu_shared = sp; return (0); } +/* + * deconfigure gcpu_init() + */ +void +gcpu_fini(cmi_hdl_t hdl) +{ + uint_t chipid = cmi_hdl_chipid(hdl); + gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); + struct gcpu_chipshared *sp; + + if (gcpu_disable || chipid >= GCPU_MAX_CHIPID) + return; + + gcpu_mca_fini(hdl); + + /* + * Keep shared data in cache for reuse. 
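gcpu_init() above now initializes the per-chip mutexes before publishing the structure with atomic_cas_ptr(), and destroys them again if another CPU won the race. That ordering closes the window in the old code where a CPU that lost the race could start using the winner's structure before its mutexes were initialized, which matters once CPUs can be configured in at run time. The same publish pattern in a self-contained C11 sketch (names are illustrative; allocation-failure handling elided):

#include <stdatomic.h>
#include <stdlib.h>

struct chipshared { int nlocks; /* stand-in for the kmutex_t pair */ };

static _Atomic(struct chipshared *) chip_slot;	/* one slot per chip id */

struct chipshared *
get_chipshared(void)
{
	struct chipshared *sp = atomic_load(&chip_slot);
	struct chipshared *expected = NULL;

	if (sp != NULL)
		return (sp);

	/* Construct the candidate fully before it becomes visible. */
	sp = calloc(1, sizeof (*sp));
	sp->nlocks = 2;				/* mutex_init() x 2 */
	if (!atomic_compare_exchange_strong(&chip_slot, &expected, sp)) {
		sp->nlocks = 0;			/* mutex_destroy() x 2 */
		free(sp);			/* we lost the race */
		sp = expected;			/* use the winner's copy */
	}
	return (sp);
}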
+ */ + sp = gcpu_shared[chipid]; + ASSERT(sp != NULL); + atomic_dec_32(&sp->gcpus_actv_cnt); + + if (gcpu != NULL) + kmem_free(gcpu, sizeof (gcpu_data_t)); + + /* Release reference count held in gcpu_init(). */ + cmi_hdl_rele(hdl); +} + void gcpu_post_startup(cmi_hdl_t hdl) { @@ -150,7 +184,7 @@ const cmi_ops_t _cmi_ops = { GCPU_OP(gcpu_cmci_trap, NULL), /* cmi_cmci_trap */ gcpu_msrinject, /* cmi_msrinject */ GCPU_OP(gcpu_hdl_poke, NULL), /* cmi_hdl_poke */ - NULL, /* cmi_fini */ + gcpu_fini, /* cmi_fini */ GCPU_OP(NULL, gcpu_xpv_panic_callback), /* cmi_panic_callback */ }; diff --git a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c index 911a7c6af7..730411ebf1 100644 --- a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c +++ b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/mca_x86.h> #include <sys/cpu_module_impl.h> @@ -1064,7 +1068,7 @@ gcpu_mca_init(cmi_hdl_t hdl) /* * CPU startup code only calls cmi_mca_init if x86_feature indicates * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier - * processors, which have their own * more primitive way of doing + * processors, which have their own more primitive way of doing * machine checks, will not have cmi_mca_init called since their * CPUID information will not indicate both MCA and MCE features. */ @@ -2002,3 +2006,50 @@ gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); } + +/* deconfigure gcpu_mca_init() */ +void +gcpu_mca_fini(cmi_hdl_t hdl) +{ + gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); + gcpu_mca_t *mca = &gcpu->gcpu_mca; + int i; + + /* + * CPU startup code only calls cmi_mca_init if x86_feature indicates + * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier + * processors, which have their own more primitive way of doing + * machine checks, will not have cmi_mca_init called since their + * CPUID information will not indicate both MCA and MCE features. + */ + if ((x86_feature & X86_MCA) == 0) + return; +#ifndef __xpv + /* + * disable machine check in CR4 + */ + cmi_ntv_hwdisable_mce(hdl); +#endif + mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock); + gcpu_mca_poll_fini(hdl); + mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); + + /* + * free resources allocated during init + */ + if (mca->gcpu_bank_cmci != NULL) { + kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) * + mca->gcpu_mca_nbanks); + } + + for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) { + if (mca->gcpu_mca_logout[i] != NULL) { + kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz); + } + } + + if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) { + kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg, + sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks); + } +} diff --git a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_poll_ntv.c b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_poll_ntv.c index f68f96259c..361d147b56 100644 --- a/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_poll_ntv.c +++ b/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_poll_ntv.c @@ -23,6 +23,10 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * Native MCA polling. 
We establish an omnipresent cyclic to fire on all @@ -31,6 +35,7 @@ */ #include <sys/types.h> +#include <sys/atomic.h> #include <sys/cyclic.h> #include <sys/x86_archext.h> #include <sys/mca_x86.h> @@ -39,7 +44,9 @@ hrtime_t gcpu_mca_poll_interval = NANOSEC * 10ULL; /* tuneable */ static cyclic_id_t gcpu_mca_poll_cycid; -static int gcpu_mca_poll_inits; +static volatile uint_t gcpu_mca_poll_inits; +extern int gcpu_poll_trace_always; +extern uint_t gcpu_poll_trace_nent; /* * Return nonzero if the given handle should poll the MCH. We stick with @@ -280,7 +287,24 @@ gcpu_mca_poll_init(cmi_hdl_t hdl) gcpu_poll_trace_init(ptc); - gcpu_mca_poll_inits++; + atomic_inc_uint(&gcpu_mca_poll_inits); +} + +/* deconfigure gcpu_mca_poll_init() */ +void +gcpu_mca_poll_fini(cmi_hdl_t hdl) +{ + gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); + gcpu_poll_trace_ctl_t *ptc = &gcpu->gcpu_mca.gcpu_polltrace; + + ASSERT(cmi_hdl_class(hdl) == CMI_HDL_NATIVE); + + if (gcpu_poll_trace_always && (ptc->mptc_tbufs != NULL)) { + kmem_free(ptc->mptc_tbufs, sizeof (gcpu_poll_trace_t) * + gcpu_poll_trace_nent); + } + + atomic_dec_uint(&gcpu_mca_poll_inits); } void diff --git a/usr/src/uts/i86pc/dr/Makefile b/usr/src/uts/i86pc/dr/Makefile new file mode 100644 index 0000000000..03aa5f615e --- /dev/null +++ b/usr/src/uts/i86pc/dr/Makefile @@ -0,0 +1,104 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2010, Intel Corporation. +# All rights reserved. +# +# This makefile drives the production of the dr driver module. +# +# i86pc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. +PLATFORM = i86pc + +# +# Define the module and object file sets. +# +MODULE = dr +OBJECTS = $(DR_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DR_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/dr + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Overrides +# +DEF_BUILDS = $(DEF_BUILDS64) +ALL_BUILDS = $(ALL_BUILDS64) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# module dependencies +# +LDFLAGS += -dy -Nmisc/drmach_acpi + +CLEANFILES += $(DR_GENERR) +CLEANFILES += $(DR_IO)/dr_err.c + +# +# Default build targets. 
+# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) lint32 + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) $(CONF_INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ + diff --git a/usr/src/uts/i86pc/drmach_acpi/Makefile b/usr/src/uts/i86pc/drmach_acpi/Makefile new file mode 100644 index 0000000000..24f9197b32 --- /dev/null +++ b/usr/src/uts/i86pc/drmach_acpi/Makefile @@ -0,0 +1,101 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2010, Intel Corporation. +# All rights reserved. +# +# This makefile drives the production of the drmach_acpi loadable module. +# +# i86pc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = drmach_acpi +OBJECTS = $(DRMACH_ACPI_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DRMACH_ACPI_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides +# +DEF_BUILDS = $(DEF_BUILDS64) +ALL_BUILDS = $(ALL_BUILDS64) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# module dependencies +# +LDFLAGS += -dy -Nmisc/acpica -Nmisc/acpidev + +CLEANFILES += $(DRMACH_GENERR) +CLEANFILES += $(DRMACH_IO)/drmach_err.c + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) lint32 + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_container.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_container.c index 33de316493..5c58266634 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_container.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_container.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -54,11 +54,13 @@ #include <sys/types.h> #include <sys/atomic.h> +#include <sys/note.h> #include <sys/sunddi.h> #include <sys/sunndi.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> #include <sys/acpidev.h> +#include <sys/acpidev_dr.h> #include <sys/acpidev_impl.h> static ACPI_STATUS acpidev_container_probe(acpidev_walk_info_t *infop); @@ -124,7 +126,7 @@ static acpidev_filter_rule_t acpidev_container_filters[] = { static ACPI_STATUS acpidev_container_probe(acpidev_walk_info_t *infop) { - ACPI_STATUS rc; + ACPI_STATUS rc = AE_OK; int flags; ASSERT(infop != NULL); @@ -137,19 +139,35 @@ acpidev_container_probe(acpidev_walk_info_t *infop) return (AE_OK); } - if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u in " + flags = ACPIDEV_PROCESS_FLAG_SCAN; + switch (infop->awi_op_type) { + case ACPIDEV_OP_BOOT_PROBE: + if (acpica_get_devcfg_feature(ACPI_DEVCFG_CONTAINER)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE; + acpidev_dr_check(infop); + } + break; + + case ACPIDEV_OP_BOOT_REPROBE: + break; + + case ACPIDEV_OP_HOTPLUG_PROBE: + if (acpica_get_devcfg_feature(ACPI_DEVCFG_CONTAINER)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE | + ACPIDEV_PROCESS_FLAG_SYNCSTATUS | + ACPIDEV_PROCESS_FLAG_HOLDBRANCH; + } + break; + + default: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u in " "acpidev_container_probe().", infop->awi_op_type); rc = AE_BAD_PARAMETER; + break; + } + + if (rc == AE_OK) { + rc = acpidev_process_object(infop, flags); } if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) { cmn_err(CE_WARN, @@ -162,11 +180,12 @@ acpidev_container_probe(acpidev_walk_info_t *infop) return (rc); } -/*ARGSUSED*/ static ACPI_STATUS acpidev_container_search_dev(ACPI_HANDLE hdl, UINT32 lvl, void *ctx, void **retval) { + _NOTE(ARGUNUSED(hdl, retval)); + int *fp = (int *)ctx; *fp = lvl; @@ -221,11 +240,11 @@ acpidev_container_filter_func(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, } if (cpu_lvl == 1) { /* CPU as child, most likely a physical CPU. */ - (void) strncpy(devname, ACPIDEV_NODE_NAME_MODULE_CPU, + (void) strlcpy(devname, ACPIDEV_NODE_NAME_MODULE_CPU, devnamelen); } else if (cpu_lvl == 2 && module_lvl == 1) { /* CPU as grandchild, most likely a system board. */ - (void) strncpy(devname, ACPIDEV_NODE_NAME_MODULE_SBD, + (void) strlcpy(devname, ACPIDEV_NODE_NAME_MODULE_SBD, devnamelen); } else if (ACPI_FAILURE(AcpiGetName(infop->awi_hdl, ACPI_SINGLE_NAME, &buf))) { @@ -233,7 +252,7 @@ acpidev_container_filter_func(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, * Failed to get ACPI object name; use ACPI object name * as the default name. */ - (void) strncpy(devname, ACPIDEV_NODE_NAME_CONTAINER, + (void) strlcpy(devname, ACPIDEV_NODE_NAME_CONTAINER, devnamelen); } diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_cpu.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_cpu.c index 0576591147..fa93871fb4 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_cpu.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_cpu.c @@ -24,7 +24,7 @@ * Use is subject to license terms. 
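Several hunks in this file replace strncpy() with strlcpy(): strncpy() leaves the destination unterminated when the source is at least as long as the buffer, while strlcpy() always NUL-terminates, truncating if necessary. A minimal illustration (strlcpy() is available in Solaris libc via <string.h>):

#include <string.h>

void
copy_names(void)
{
	char buf[4];

	/* strncpy(): "sbd0" exactly fills buf, leaving no terminating NUL. */
	(void) strncpy(buf, "sbd0", sizeof (buf));

	/* strlcpy(): buf is always terminated; here it holds "sbd". */
	(void) strlcpy(buf, "sbd0", sizeof (buf));
}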
*/ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. */ @@ -43,6 +43,7 @@ #include <sys/bootconf.h> #include <sys/cpuvar.h> #include <sys/machsystm.h> +#include <sys/note.h> #include <sys/psm_types.h> #include <sys/x86_archext.h> #include <sys/sunddi.h> @@ -70,11 +71,14 @@ static ACPI_STATUS acpidev_cpu_probe(acpidev_walk_info_t *infop); static acpidev_filter_result_t acpidev_cpu_filter(acpidev_walk_info_t *infop, char *devname, int maxlen); static ACPI_STATUS acpidev_cpu_init(acpidev_walk_info_t *infop); +static void acpidev_cpu_fini(ACPI_HANDLE hdl, acpidev_data_handle_t dhdl, + acpidev_class_t *clsp); static acpidev_filter_result_t acpidev_cpu_filter_func( acpidev_walk_info_t *infop, ACPI_HANDLE hdl, acpidev_filter_rule_t *afrp, char *devname, int len); -static int acpidev_cpu_query_dip(cpu_t *, dev_info_t **); +static int acpidev_cpu_create_dip(cpu_t *, dev_info_t **); +static int acpidev_cpu_get_dip(cpu_t *, dev_info_t **); /* * Default class driver for ACPI processor/CPU objects. @@ -91,7 +95,7 @@ acpidev_class_t acpidev_class_cpu = { acpidev_cpu_probe, /* adc_probe */ acpidev_cpu_filter, /* adc_filter */ acpidev_cpu_init, /* adc_init */ - NULL, /* adc_fini */ + acpidev_cpu_fini, /* adc_fini */ }; /* @@ -133,12 +137,14 @@ static char *acpidev_cpu_uid_formats[] = { "SCK%x-CPU%x", }; -static ACPI_HANDLE acpidev_cpu_map_hdl = NULL; -static uint32_t acpidev_cpu_map_count = 0; -static struct acpidev_cpu_map_item *acpidev_cpu_map = NULL; +static ACPI_HANDLE acpidev_cpu_map_hdl; +static uint32_t acpidev_cpu_map_count; +static struct acpidev_cpu_map_item *acpidev_cpu_map; extern int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **); -static int (*psm_cpu_create_devinfo_old)(cpu_t *, dev_info_t **) = NULL; +static int (*psm_cpu_create_devinfo_old)(cpu_t *, dev_info_t **); +extern int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **); +static int (*psm_cpu_get_devinfo_old)(cpu_t *, dev_info_t **); /* Count how many enabled CPUs are in the MADT table. */ static ACPI_STATUS @@ -200,7 +206,7 @@ acpidev_cpu_parse_MADT(ACPI_SUBTABLE_HEADER *ap, void *context) /* See comment at beginning about 255 limitation. */ if (mpx2a->LocalApicId < 255) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: encountered CPU with X2APIC Id < 255."); + "!acpidev: encountered CPU with X2APIC Id < 255."); } else if (mpx2a->LapicFlags & ACPI_MADT_ENABLED) { ASSERT(*cntp < acpidev_cpu_map_count); acpidev_cpu_map[*cntp].proc_id = mpx2a->Uid; @@ -270,7 +276,7 @@ acpidev_cpu_query_MAT(ACPI_SUBTABLE_HEADER *ap, void *context) } return (AE_CTRL_TERMINATE); } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: encountered CPU " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: encountered CPU " "with X2APIC Id < 255 in _MAT."); } break; @@ -292,7 +298,7 @@ acpidev_cpu_query_MAT(ACPI_SUBTABLE_HEADER *ap, void *context) * x2APIC and x2APIC NMI Structure. 
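 * For reference, _MAT evaluates to a buffer of MADT-style subtables;
 * the walker above scans ACPI_SUBTABLE_HEADER entries and matches on
 * Type (Local APIC or Local x2APIC) to recover the CPU's APIC id and
 * enabled flag, terminating the walk once a match is found.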
*/ ACPIDEV_DEBUG(CE_NOTE, - "acpidev: unknown APIC entry type %u in _MAT.", ap->Type); + "!acpidev: unknown APIC entry type %u in _MAT.", ap->Type); break; } @@ -313,7 +319,7 @@ acpidev_cpu_get_procid(acpidev_walk_info_t *infop, uint32_t *idp) if (infop->awi_info->Type != ACPI_TYPE_PROCESSOR && infop->awi_info->Type != ACPI_TYPE_DEVICE) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: object %s is not PROCESSOR or DEVICE.", + "!acpidev: object %s is not PROCESSOR or DEVICE.", infop->awi_name); return (AE_BAD_PARAMETER); } @@ -345,7 +351,7 @@ acpidev_cpu_get_procid(acpidev_walk_info_t *infop, uint32_t *idp) return (AE_OK); } else { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to evaluate ACPI object %s.", + "!acpidev: failed to evaluate ACPI object %s.", infop->awi_name); } } @@ -365,6 +371,59 @@ acpidev_cpu_get_procid(acpidev_walk_info_t *infop, uint32_t *idp) } static ACPI_STATUS +acpidev_cpu_get_proximity_id(ACPI_HANDLE hdl, uint32_t apicid, uint32_t *pxmidp) +{ + int len, off; + ACPI_SUBTABLE_HEADER *sp; + ACPI_SRAT_CPU_AFFINITY *xp; + ACPI_SRAT_X2APIC_CPU_AFFINITY *x2p; + + ASSERT(hdl != NULL); + ASSERT(pxmidp != NULL); + *pxmidp = UINT32_MAX; + + if (ACPI_SUCCESS(acpidev_eval_pxm(hdl, pxmidp))) { + return (AE_OK); + } + if (acpidev_srat_tbl_ptr == NULL) { + return (AE_NOT_FOUND); + } + + /* Search the static ACPI SRAT table for proximity domain id. */ + sp = (ACPI_SUBTABLE_HEADER *)(acpidev_srat_tbl_ptr + 1); + len = acpidev_srat_tbl_ptr->Header.Length; + off = sizeof (*acpidev_srat_tbl_ptr); + while (off < len) { + switch (sp->Type) { + case ACPI_SRAT_TYPE_CPU_AFFINITY: + xp = (ACPI_SRAT_CPU_AFFINITY *)sp; + if ((xp->Flags & ACPI_SRAT_CPU_ENABLED) && + xp->ApicId == apicid) { + *pxmidp = xp->ProximityDomainLo; + *pxmidp |= xp->ProximityDomainHi[0] << 8; + *pxmidp |= xp->ProximityDomainHi[1] << 16; + *pxmidp |= xp->ProximityDomainHi[2] << 24; + return (AE_OK); + } + break; + + case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: + x2p = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)sp; + if ((x2p->Flags & ACPI_SRAT_CPU_ENABLED) && + x2p->ApicId == apicid) { + *pxmidp = x2p->ProximityDomain; + return (AE_OK); + } + break; + } + off += sp->Length; + sp = (ACPI_SUBTABLE_HEADER *)(((char *)sp) + sp->Length); + } + + return (AE_NOT_FOUND); +} + +static ACPI_STATUS acpidev_cpu_pre_probe(acpidev_walk_info_t *infop) { uint32_t count = 0; @@ -373,6 +432,7 @@ acpidev_cpu_pre_probe(acpidev_walk_info_t *infop) ASSERT(infop != NULL); if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE && acpidev_cpu_map_hdl == NULL) { + /* Parse CPU relative information in the ACPI MADT table. */ (void) acpidev_walk_apic(NULL, NULL, NULL, acpidev_cpu_count_MADT, &acpidev_cpu_map_count); acpidev_cpu_map = kmem_zalloc(sizeof (acpidev_cpu_map[0]) @@ -381,6 +441,12 @@ acpidev_cpu_pre_probe(acpidev_walk_info_t *infop) acpidev_cpu_parse_MADT, &count); ASSERT(count == acpidev_cpu_map_count); acpidev_cpu_map_hdl = infop->awi_hdl; + + /* Cache pointer to the ACPI SRAT table. */ + if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_SRAT, 1, + (ACPI_TABLE_HEADER **)&acpidev_srat_tbl_ptr))) { + acpidev_srat_tbl_ptr = NULL; + } } return (AE_OK); @@ -404,7 +470,9 @@ acpidev_cpu_post_probe(acpidev_walk_info_t *infop) /* replace psm_cpu_create_devinfo with local implementation. 
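 * This is the usual hook-interposition pattern: save the current
 * handler, install a local one, and have the local handler fall back
 * to the saved pointer when no ACPI dip matches the CPU (see
 * acpidev_cpu_create_dip() and acpidev_cpu_get_dip()).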
*/ psm_cpu_create_devinfo_old = psm_cpu_create_devinfo; - psm_cpu_create_devinfo = acpidev_cpu_query_dip; + psm_cpu_create_devinfo = acpidev_cpu_create_dip; + psm_cpu_get_devinfo_old = psm_cpu_get_devinfo; + psm_cpu_get_devinfo = acpidev_cpu_get_dip; } return (AE_OK); @@ -427,27 +495,40 @@ acpidev_cpu_probe(acpidev_walk_info_t *infop) return (AE_OK); } - /* - * Mark device as offline. It will be changed to online state - * when the corresponding CPU starts up. - */ - if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | - ACPIDEV_PROCESS_FLAG_CREATE | - ACPIDEV_PROCESS_FLAG_OFFLINE; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | - ACPIDEV_PROCESS_FLAG_CREATE | - ACPIDEV_PROCESS_FLAG_OFFLINE; - rc = acpidev_process_object(infop, flags); - } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u in " + flags = ACPIDEV_PROCESS_FLAG_SCAN; + switch (infop->awi_op_type) { + case ACPIDEV_OP_BOOT_PROBE: + /* + * Mark device as offline. It will be changed to online state + * when the corresponding CPU starts up. + */ + if (acpica_get_devcfg_feature(ACPI_DEVCFG_CPU)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE | + ACPIDEV_PROCESS_FLAG_OFFLINE; + } + break; + + case ACPIDEV_OP_BOOT_REPROBE: + break; + + case ACPIDEV_OP_HOTPLUG_PROBE: + if (acpica_get_devcfg_feature(ACPI_DEVCFG_CPU)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE | + ACPIDEV_PROCESS_FLAG_OFFLINE | + ACPIDEV_PROCESS_FLAG_SYNCSTATUS | + ACPIDEV_PROCESS_FLAG_HOLDBRANCH; + } + break; + + default: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u in " "acpidev_cpu_probe().", infop->awi_op_type); rc = AE_BAD_PARAMETER; + break; + } + + if (rc == AE_OK) { + rc = acpidev_process_object(infop, flags); } if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) { cmn_err(CE_WARN, @@ -474,12 +555,12 @@ acpidev_cpu_filter_func(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, if (acpidev_cpu_get_procid(infop, &procid) != 0) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to query processor id for %s.", + "!acpidev: failed to query processor id for %s.", infop->awi_name); return (ACPIDEV_FILTER_SKIP); } else if (acpidev_cpu_get_apicid(procid, &apicid) != 0) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to query apic id for %s.", + "!acpidev: failed to query apic id for %s.", infop->awi_name); return (ACPIDEV_FILTER_SKIP); } @@ -499,7 +580,7 @@ acpidev_cpu_filter_func(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, return (ACPIDEV_FILTER_SKIP); } else if (!mat.enabled) { ACPIDEV_DEBUG(CE_NOTE, - "acpidev: CPU %s has been disabled.", + "!acpidev: CPU %s has been disabled.", infop->awi_name); return (ACPIDEV_FILTER_SKIP); } @@ -536,6 +617,7 @@ static ACPI_STATUS acpidev_cpu_init(acpidev_walk_info_t *infop) { int count; + uint32_t pxmid; dev_info_t *dip; ACPI_HANDLE hdl; char unitaddr[64]; @@ -550,7 +632,7 @@ acpidev_cpu_init(acpidev_walk_info_t *infop) dip = infop->awi_dip; hdl = infop->awi_hdl; - /* Create "apic_id" and "processor_id" properties. */ + /* Create "apic_id", "processor_id" and "proximity_id" properties. 
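+ * As an illustration (not part of this change), a consumer could read
+ * one back with ddi_prop_get_int(9F):
+ *	id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ *	    "apic_id", -1);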
*/ if (ndi_prop_update_int(DDI_DEV_T_NONE, dip, ACPIDEV_PROP_NAME_PROCESSOR_ID, infop->awi_scratchpad[0]) != NDI_SUCCESS) { @@ -567,6 +649,15 @@ acpidev_cpu_init(acpidev_walk_info_t *infop) infop->awi_name); return (AE_ERROR); } + if (ACPI_SUCCESS(acpidev_cpu_get_proximity_id(infop->awi_hdl, + infop->awi_scratchpad[1], &pxmid))) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, dip, + ACPIDEV_PROP_NAME_PROXIMITY_ID, pxmid) != NDI_SUCCESS) { + cmn_err(CE_WARN, "!acpidev: failed to set proximity id " + "property for %s.", infop->awi_name); + return (AE_ERROR); + } + } /* Set "compatible" property for CPU dip */ count = sizeof (compatible) / sizeof (compatible[0]); @@ -616,7 +707,7 @@ acpidev_cpu_init(acpidev_walk_info_t *infop) } } else { ACPIDEV_DEBUG(CE_WARN, - "acpidev: unknown operation type %u in acpidev_cpu_init.", + "!acpidev: unknown operation type %u in acpidev_cpu_init.", infop->awi_op_type); return (AE_BAD_PARAMETER); } @@ -624,18 +715,38 @@ acpidev_cpu_init(acpidev_walk_info_t *infop) return (AE_OK); } +static void +acpidev_cpu_fini(ACPI_HANDLE hdl, acpidev_data_handle_t dhdl, + acpidev_class_t *clsp) +{ + _NOTE(ARGUNUSED(clsp, dhdl)); + + int rc; + uint32_t procid; + + rc = acpica_get_procid_by_object(hdl, &procid); + ASSERT(ACPI_SUCCESS(rc)); + if (ACPI_SUCCESS(rc)) { + rc = acpica_remove_processor_from_map(procid); + ASSERT(ACPI_SUCCESS(rc)); + if (ACPI_FAILURE(rc)) { + cmn_err(CE_WARN, "!acpidev: failed to remove " + "processor from ACPICA."); + } + } +} + +/* + * Lookup the dip for a CPU if ACPI CPU autoconfig is enabled. + */ static int -acpidev_cpu_query_dip(cpu_t *cp, dev_info_t **dipp) +acpidev_cpu_lookup_dip(cpu_t *cp, dev_info_t **dipp) { uint32_t apicid; ACPI_HANDLE hdl; dev_info_t *dip = NULL; *dipp = NULL; - /* - * Try to get the dip associated with the CPU if ACPI_DEVCFG_CPU is - * enabled. - */ if (acpica_get_devcfg_feature(ACPI_DEVCFG_CPU)) { apicid = cpuid_get_apicid(cp); if (acpica_get_cpu_object_by_cpuid(cp->cpu_id, &hdl) == 0 || @@ -644,18 +755,41 @@ acpidev_cpu_query_dip(cpu_t *cp, dev_info_t **dipp) ASSERT(hdl != NULL); if (ACPI_SUCCESS(acpica_get_devinfo(hdl, &dip))) { ASSERT(dip != NULL); - ndi_hold_devi(dip); *dipp = dip; return (PSM_SUCCESS); } } + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to lookup dip for cpu %d(%p).", + cp->cpu_id, (void *)cp); } - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to get dip for cpu %d(%p).", - cp->cpu_id, (void *)cp); + return (PSM_FAILURE); +} + +static int +acpidev_cpu_create_dip(cpu_t *cp, dev_info_t **dipp) +{ + if (acpidev_cpu_lookup_dip(cp, dipp) == PSM_SUCCESS) { + ndi_hold_devi(*dipp); + return (PSM_SUCCESS); + } if (psm_cpu_create_devinfo_old != NULL) { return (psm_cpu_create_devinfo_old(cp, dipp)); } else { return (PSM_FAILURE); } } + +static int +acpidev_cpu_get_dip(cpu_t *cp, dev_info_t **dipp) +{ + if (acpidev_cpu_lookup_dip(cp, dipp) == PSM_SUCCESS) { + return (PSM_SUCCESS); + } + if (psm_cpu_get_devinfo_old != NULL) { + return (psm_cpu_get_devinfo_old(cp, dipp)); + } else { + return (PSM_FAILURE); + } +} diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_device.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_device.c index fbf6e944b4..cf65cf4084 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_device.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_device.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -103,7 +103,7 @@ static acpidev_filter_rule_t acpidev_device_filters[] = { static ACPI_STATUS acpidev_device_probe(acpidev_walk_info_t *infop) { - ACPI_STATUS rc; + ACPI_STATUS rc = AE_OK; int flags; ASSERT(infop != NULL); @@ -114,20 +114,31 @@ acpidev_device_probe(acpidev_walk_info_t *infop) return (AE_OK); } - if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else { + flags = ACPIDEV_PROCESS_FLAG_SCAN; + switch (infop->awi_op_type) { + case ACPIDEV_OP_BOOT_PROBE: + flags |= ACPIDEV_PROCESS_FLAG_CREATE; + break; + + case ACPIDEV_OP_BOOT_REPROBE: + break; + + case ACPIDEV_OP_HOTPLUG_PROBE: + flags |= ACPIDEV_PROCESS_FLAG_CREATE | + ACPIDEV_PROCESS_FLAG_SYNCSTATUS | + ACPIDEV_PROCESS_FLAG_HOLDBRANCH; + break; + + default: ACPIDEV_DEBUG(CE_WARN, - "acpidev: unknown operation type %u in " + "!acpidev: unknown operation type %u in " "acpi_device_probe().", infop->awi_op_type); rc = AE_BAD_PARAMETER; + break; + } + + if (rc == AE_OK) { + rc = acpidev_process_object(infop, flags); } if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) { cmn_err(CE_WARN, @@ -153,7 +164,7 @@ acpidev_device_filter(acpidev_walk_info_t *infop, char *devname, int maxlen) ACPIDEV_ARRAY_PARAM(acpidev_device_filters), devname, maxlen); } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u " "in acpidev_device_filter().", infop->awi_op_type); res = ACPIDEV_FILTER_FAILED; } @@ -161,7 +172,6 @@ acpidev_device_filter(acpidev_walk_info_t *infop, char *devname, int maxlen) return (res); } -/*ARGSUSED*/ static ACPI_STATUS acpidev_device_init(acpidev_walk_info_t *infop) { diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_dr.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_dr.c new file mode 100644 index 0000000000..e346c65cfb --- /dev/null +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_dr.c @@ -0,0 +1,2743 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/memlist.h> +#include <sys/memlist_impl.h> +#include <sys/note.h> +#include <sys/obpdefs.h> +#include <sys/synch.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/x86_archext.h> +#include <sys/machsystm.h> +#include <sys/memnode.h> /* for lgrp_plat_node_cnt */ +#include <sys/psm_types.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/acpidev.h> +#include <sys/acpidev_rsc.h> +#include <sys/acpidev_dr.h> +#include <sys/acpidev_impl.h> + +struct acpidev_dr_set_prop_arg { + uint32_t level; + uint32_t bdnum; + uint32_t cpu_id; + uint32_t mem_id; + uint32_t io_id; + uint32_t mod_id; +}; + +struct acpidev_dr_device_remove_arg { + uint32_t level; +}; + +extern int acpidev_options; + +/* User configurable option to enable/disable ACPI based DR operations. */ +int acpidev_dr_enable = 1; +int acpidev_dr_hierarchy_name = 1; +uint32_t acpidev_dr_max_segs_per_mem_device = ACPIDEV_DR_SEGS_PER_MEM_DEV; +uint32_t acpidev_dr_max_memlists_per_seg = ACPIDEV_DR_MEMLISTS_PER_SEG; + +ACPI_TABLE_SRAT *acpidev_srat_tbl_ptr; +ACPI_TABLE_SLIT *acpidev_slit_tbl_ptr; + +/* ACPI based DR operations are unsupported if zero. */ +static int acpidev_dr_supported = -1; + +/* Failed to initialize support of DR operations if non-zero. */ +static int acpidev_dr_failed; + +static volatile uint32_t acpidev_dr_boards; +static volatile uint32_t acpidev_dr_board_index; +static uint32_t acpidev_dr_max_cmp_per_board; +static uint32_t acpidev_dr_max_memory_per_board; +static uint32_t acpidev_dr_max_io_per_board; +static uint32_t acpidev_dr_memory_device_cnt; + +static ACPI_HANDLE *acpidev_dr_board_handles[ACPIDEV_DR_MAX_BOARDS]; + +/* Lock to protect/block DR operations at runtime. */ +static kmutex_t acpidev_dr_lock; + +static acpidev_dr_capacity_t acpidev_dr_capacities[] = { + { /* Nehalem-EX */ + X86_VENDOR_Intel, 0x6, 0x2e, 0x2e, 0, UINT_MAX, + B_TRUE, /* Hotplug capable */ + 1ULL << 30, /* Align on 1GB boundary */ + }, + { /* the last item is used to mark end of the table */ + UINT_MAX, UINT_MAX, UINT_MAX, 0, UINT_MAX, 0, + B_FALSE, + 0, + }, +}; + +static ACPI_STATUS acpidev_dr_scan_topo(ACPI_HANDLE hdl, UINT32 lvl, void *arg, + void **retval); + +static acpidev_dr_capacity_t * +acpidev_dr_get_capacity(void) +{ + acpidev_dr_capacity_t *cp, *cp1; + uint_t vendor, family, model, step; + static acpidev_dr_capacity_t *acpidev_dr_capacity_curr = NULL; + + if (acpidev_dr_capacity_curr != NULL) { + return (acpidev_dr_capacity_curr); + } + + kpreempt_disable(); + vendor = cpuid_getvendor(CPU); + family = cpuid_getfamily(CPU); + model = cpuid_getmodel(CPU); + step = cpuid_getstep(CPU); + kpreempt_enable(); + + for (cp = acpidev_dr_capacities; ; cp++) { + ASSERT(cp < acpidev_dr_capacities + + sizeof (acpidev_dr_capacities) / sizeof (*cp)); + + /* Check whether it reaches the last item of the table. */ + if (cp->cpu_vendor == UINT_MAX && cp->cpu_family == UINT_MAX && + cp->cpu_model_min == UINT_MAX && cp->cpu_model_max == 0 && + cp->cpu_step_min == UINT_MAX && cp->cpu_step_max == 0) { + break; + } + if (cp->cpu_vendor == vendor && cp->cpu_family == family && + model >= cp->cpu_model_min && model <= cp->cpu_model_max && + step >= cp->cpu_step_min && step <= cp->cpu_step_max) { + break; + } + } + + /* Assume all CPUs in system are homogeneous. 
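+ * The lookup is memoized lock-free: the first caller publishes its
+ * matching table entry with atomic_cas_ptr() and later callers reuse
+ * it; a racer that observes a different cached entry returns NULL.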
*/ + cp1 = atomic_cas_ptr(&acpidev_dr_capacity_curr, NULL, cp); + ASSERT(cp1 == NULL || cp1 == cp); + if (cp1 != NULL && cp1 != cp) { + return (NULL); + } + + return (cp); +} + +int +acpidev_dr_capable(void) +{ + uint64_t flags1, flags2; + acpidev_dr_capacity_t *cp; + + /* + * Disable support of DR operations if: + * 1) acpidev fails to initialize DR interfaces. + * 2) ACPI based DR has been disabled by user. + * 3) No DR capable devices have been detected. + * 4) The system doesn't support DR operations. + * 5) Some acpidev features have been disabled by user. + */ + if (acpidev_dr_failed != 0 || acpidev_dr_enable == 0 || + acpidev_dr_supported == 0) { + return (0); + } + + flags1 = ACPI_FEATURE_DEVCFG | ACPI_FEATURE_OSI_MODULE; + flags2 = ACPI_DEVCFG_CPU | ACPI_DEVCFG_MEMORY | + ACPI_DEVCFG_CONTAINER | ACPI_DEVCFG_PCI; + if (acpica_get_core_feature(flags1) != flags1 || + acpica_get_devcfg_feature(flags2) != flags2) { + cmn_err(CE_CONT, + "?acpidev: disable support of ACPI based DR because " + "some acpidev features have been disabled by user.\n"); + acpidev_dr_supported = 0; + return (0); + } + + cp = acpidev_dr_get_capacity(); + if (cp == NULL || cp->hotplug_supported == B_FALSE) { + return (0); + } + + return (1); +} + +uint32_t +acpidev_dr_max_boards(void) +{ + return (acpidev_dr_boards); +} + +uint32_t +acpidev_dr_max_io_units_per_board(void) +{ + return (acpidev_dr_max_io_per_board); +} + +uint32_t +acpidev_dr_max_mem_units_per_board(void) +{ + return (acpidev_dr_max_memory_per_board); +} + +uint32_t +acpidev_dr_max_cmp_units_per_board(void) +{ + return (acpidev_dr_max_cmp_per_board); +} + +uint32_t +acpidev_dr_max_cpu_units_per_cmp(void) +{ + static int max_cnt; + + if (max_cnt == 0) { + kpreempt_disable(); + max_cnt = cpuid_get_ncpu_per_chip(CPU); + kpreempt_enable(); + } + + return (max_cnt); +} + +uint32_t +acpidev_dr_max_segments_per_mem_device(void) +{ + if (acpidev_dr_max_segs_per_mem_device < 1) { + return (ACPIDEV_DR_SEGS_PER_MEM_DEV); + } else { + return (acpidev_dr_max_segs_per_mem_device); + } +} + +uint32_t +acpidev_dr_max_memlists_per_segment(void) +{ + if (acpidev_dr_max_memlists_per_seg < ACPIDEV_DR_MEMLISTS_PER_SEG) { + return (ACPIDEV_DR_MEMLISTS_PER_SEG); + } else { + return (acpidev_dr_max_memlists_per_seg); + } +} + +void +acpidev_dr_init(void) +{ + mutex_init(&acpidev_dr_lock, NULL, MUTEX_DRIVER, NULL); +} + +static void +acpidev_dr_check_board_type(acpidev_data_handle_t dhdl, + struct acpidev_dr_set_prop_arg *ap, char *objname) +{ + if (dhdl->aod_class_id == ACPIDEV_CLASS_ID_MEMORY) { + /* Memory board should have only one memory device. */ + ASSERT(ap->cpu_id == 0); + ASSERT(ap->mem_id == 1); + ASSERT(ap->io_id == 0); + ASSERT(ap->mod_id == 0); + dhdl->aod_bdtype = ACPIDEV_MEMORY_BOARD; + } else if (dhdl->aod_class_id == ACPIDEV_CLASS_ID_PCI || + dhdl->aod_class_id == ACPIDEV_CLASS_ID_PCIEX) { + /* IO board should have only one IO device. */ + ASSERT(ap->cpu_id == 0); + ASSERT(ap->mem_id == 0); + ASSERT(ap->io_id == 1); + ASSERT(ap->mod_id == 0); + dhdl->aod_bdtype = ACPIDEV_IO_BOARD; + } else if (dhdl->aod_class_id == ACPIDEV_CLASS_ID_CONTAINER) { + if (ap->mod_id == 1 && ap->mem_id == 0) { + dhdl->aod_bdtype = ACPIDEV_CPU_BOARD; + } else { + dhdl->aod_bdtype = ACPIDEV_SYSTEM_BOARD; + } + } else { + cmn_err(CE_WARN, + "!acpidev: unknown type of hotplug capable board %s.", + objname); + ASSERT(0); + } +} + +/* + * Check for hotplug capable boards and create environment to support + * ACPI based DR operations. 
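+ * For each board found this assigns a board number and board index,
+ * records the board type, and sets DR properties on all descendants
+ * via acpidev_dr_scan_topo().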
No need to acquire lock here, it's called + * from single-threaded context during boot. + */ +void +acpidev_dr_check(acpidev_walk_info_t *infop) +{ + uint_t cmp; + boolean_t found = B_FALSE; + ACPI_HANDLE phdl; + acpidev_data_handle_t dhdl, pdhdl; + struct acpidev_dr_set_prop_arg arg; + + if (infop == NULL || + infop->awi_op_type != ACPIDEV_OP_BOOT_PROBE) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: invalid parameter to acpidev_dr_check()."); + return; + } + + if (acpidev_dr_capable() == 0) { + return; + } + + dhdl = infop->awi_data; + ASSERT(dhdl != NULL); + + /* This device has already been handled before. */ + if (ACPIDEV_DR_IS_PROCESSED(dhdl)) { + return; + } + + /* + * It implies that the device is hotplug capable if ACPI _EJ0 method + * is available. + */ + if (!ACPIDEV_DR_IS_BOARD(dhdl) && + acpidev_dr_device_hotplug_capable(infop->awi_hdl)) { + ACPIDEV_DR_SET_BOARD(dhdl); + } + + /* All things are done if the device isn't hotplug capable. */ + if (!ACPIDEV_DR_IS_BOARD(dhdl)) { + return; + } + + /* Check whether hardware topology is supported or not. */ + if (ACPI_FAILURE(acpidev_dr_scan_topo(infop->awi_hdl, 0, NULL, + NULL))) { + ACPIDEV_DR_SET_FAILED(dhdl); + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: hardware topology under %s " + "is unsupported for DR operations.", infop->awi_name); + return; + } + + /* Generate board/index/port number for the hotplug capable board. */ + dhdl->aod_bdnum = atomic_inc_32_nv(&acpidev_dr_boards) - 1; + dhdl->aod_portid = 0; + phdl = infop->awi_hdl; + while (ACPI_SUCCESS(AcpiGetParent(phdl, &phdl)) && + phdl != ACPI_ROOT_OBJECT) { + pdhdl = acpidev_data_get_handle(phdl); + if (pdhdl != NULL && ACPIDEV_DR_IS_BOARD(pdhdl)) { + dhdl->aod_bdidx = atomic_inc_32_nv(&pdhdl->aod_chidx); + found = B_TRUE; + break; + } + } + if (found == B_FALSE) { + dhdl->aod_bdidx = atomic_inc_32_nv(&acpidev_dr_board_index); + } + dhdl->aod_bdidx -= 1; + + /* Found too many hotplug capable boards. */ + if (dhdl->aod_bdnum >= ACPIDEV_DR_MAX_BOARDS) { + ACPIDEV_DR_SET_FAILED(dhdl); + cmn_err(CE_WARN, "!acpidev: too many hotplug capable boards, " + "max %d, found %d.", + ACPIDEV_DR_MAX_BOARDS, dhdl->aod_bdnum + 1); + return; + } + + /* Scan all descendant devices to prepare info for DR operations. */ + bzero(&arg, sizeof (arg)); + arg.bdnum = dhdl->aod_bdnum; + arg.level = infop->awi_level; + if (ACPI_FAILURE(acpidev_dr_scan_topo(infop->awi_hdl, 0, &arg, + NULL))) { + ACPIDEV_DR_SET_FAILED(dhdl); + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: failed to set DR properties " + "for descendants of %s.", infop->awi_name); + return; + } + + /* Get type of the hotplug capable board. */ + acpidev_dr_check_board_type(dhdl, &arg, infop->awi_name); + + /* + * Save ACPI handle of the hotplug capable board to speed up lookup + * board handle if caching is enabled. + */ + if ((acpidev_options & ACPIDEV_OUSER_NO_CACHE) == 0) { + acpidev_dr_board_handles[dhdl->aod_bdnum] = infop->awi_hdl; + } + + /* Update system maximum DR capabilities. 
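+ * The CMP count is a ceiling division of enumerated CPU devices by
+ * threads per chip; e.g. (numbers illustrative) 16 CPU objects with
+ * 8 threads per CMP gives (16 + 8 - 1) / 8 = 2 CMP units per board.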
*/ + cmp = (arg.cpu_id + acpidev_dr_max_cpu_units_per_cmp() - 1); + cmp /= acpidev_dr_max_cpu_units_per_cmp(); + if (cmp > acpidev_dr_max_cmp_per_board) { + acpidev_dr_max_cmp_per_board = cmp; + } + if (arg.mem_id > acpidev_dr_max_memory_per_board) { + acpidev_dr_max_memory_per_board = arg.mem_id; + } + if (arg.io_id > acpidev_dr_max_io_per_board) { + acpidev_dr_max_io_per_board = arg.io_id; + } +} + +static void +acpidev_dr_initialize_memory_hotplug(void) +{ + caddr_t buf; + uint32_t cnt; + acpidev_dr_capacity_t *cp; + + /* + * We have already checked that the platform supports DR operations. + */ + cp = acpidev_dr_get_capacity(); + ASSERT(cp != NULL && cp->hotplug_supported); + ASSERT(ISP2(cp->memory_alignment)); + ASSERT(cp->memory_alignment > MMU_PAGESIZE); + mem_node_physalign = cp->memory_alignment; + + /* Pre-populate memlist cache. */ + cnt = acpidev_dr_memory_device_cnt; + cnt *= acpidev_dr_max_segments_per_mem_device(); + cnt *= acpidev_dr_max_memlists_per_segment(); + if (cnt > ACPIDEV_DR_MAX_MEMLIST_ENTRIES) { + cmn_err(CE_WARN, "!acpidev: attempted to reserve too many " + "memlist entries (%u), max %u. Falling back to %u and " + "some memory hot add operations may fail.", + cnt, ACPIDEV_DR_MAX_MEMLIST_ENTRIES, + ACPIDEV_DR_MAX_MEMLIST_ENTRIES); + cnt = ACPIDEV_DR_MAX_MEMLIST_ENTRIES; + } + cnt *= sizeof (struct memlist); + buf = kmem_zalloc(cnt, KM_SLEEP); + memlist_free_block(buf, cnt); +} + +/* + * Create pseudo DR control device node if the system is hotplug capable. + * No need to acquire lock, it's called from single-threaded context + * during boot. pdip has been held by the caller. + */ +static ACPI_STATUS +acpidev_dr_create_node(dev_info_t *pdip) +{ + dev_info_t *dip; + char unit[32]; + char *path; + char *comps[] = { + "acpidr_sbd", + }; + + /* + * Disable support of DR operations if no hotplug capable board has + * been detected. + */ + if (acpidev_dr_boards == 0) { + acpidev_dr_supported = 0; + } else { + acpidev_dr_supported = 1; + } + + /* + * Don't create control device node if the system isn't hotplug capable. + */ + if (acpidev_dr_capable() == 0) { + return (AE_SUPPORT); + } + + /* Cache pointer to the ACPI SLIT table. */ + if (ACPI_FAILURE(AcpiGetTable(ACPI_SIG_SLIT, 1, + (ACPI_TABLE_HEADER **)&acpidev_slit_tbl_ptr))) { + acpidev_slit_tbl_ptr = NULL; + } + if (acpidev_srat_tbl_ptr == NULL || acpidev_slit_tbl_ptr == NULL) { + if (lgrp_plat_node_cnt != 1) { + /* + * Disable support of CPU/memory DR operations if lgrp + * is enabled but failed to cache SRAT/SLIT table + * pointers. + */ + cmn_err(CE_WARN, + "!acpidev: failed to get ACPI SRAT/SLIT table."); + plat_dr_disable_cpu(); + plat_dr_disable_memory(); + } + } + + ndi_devi_alloc_sleep(pdip, ACPIDEV_NODE_NAME_ACPIDR, + (pnode_t)DEVI_PSEUDO_NODEID, &dip); + + /* Set "unit-address" device property. */ + (void) snprintf(unit, sizeof (unit), "%u", 0); + if (ndi_prop_update_string(DDI_DEV_T_NONE, dip, + ACPIDEV_PROP_NAME_UNIT_ADDR, unit) != NDI_SUCCESS) { + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + cmn_err(CE_CONT, + "?acpidev: failed to set unit-address property for %s.\n", + ddi_pathname(dip, path)); + kmem_free(path, MAXPATHLEN); + (void) ddi_remove_child(dip, 0); + acpidev_dr_failed = 1; + return (AE_ERROR); + } + + /* Set "compatible" device property. 
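+ * The single "acpidr_sbd" entry is what allows ndi_devi_bind_driver()
+ * below to match this pseudo node to the sbd-based DR driver.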
*/ + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, dip, OBP_COMPATIBLE, + comps, sizeof (comps) / sizeof (comps[0])) != NDI_SUCCESS) { + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + cmn_err(CE_CONT, "?acpidev: failed to set compatible " + "property for %s.\n", ddi_pathname(dip, path)); + kmem_free(path, MAXPATHLEN); + (void) ddi_remove_child(dip, 0); + acpidev_dr_failed = 1; + return (AE_ERROR); + } + + (void) ndi_devi_bind_driver(dip, 0); + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_initialize(dev_info_t *pdip) +{ + ACPI_STATUS rc; + + rc = acpidev_dr_create_node(pdip); + if (ACPI_FAILURE(rc)) { + return (rc); + } + + /* Initialize support of memory DR operations. */ + if (plat_dr_support_memory()) { + acpidev_dr_initialize_memory_hotplug(); + } + + /* Mark the DR subsystem is ready for use. */ + plat_dr_enable(); + + return (AE_OK); +} + +static ACPI_STATUS +acpidev_dr_find_board(ACPI_HANDLE hdl, uint_t lvl, void *ctx, void **retval) +{ + _NOTE(ARGUNUSED(lvl)); + + acpidev_data_handle_t dhdl; + + ASSERT(hdl != NULL); + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + /* No data handle available, not ready for DR operations. */ + return (AE_CTRL_DEPTH); + } else if (ACPIDEV_DR_IS_BOARD(dhdl) && ACPIDEV_DR_IS_WORKING(dhdl) && + dhdl->aod_bdnum == (intptr_t)ctx) { + ASSERT(retval != NULL); + *(ACPI_HANDLE *)retval = hdl; + return (AE_CTRL_TERMINATE); + } + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_get_board_handle(uint_t board, ACPI_HANDLE *hdlp) +{ + ACPI_STATUS rc = AE_OK; + ACPI_HANDLE hdl; + + ASSERT(hdlp != NULL); + if (hdlp == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_board_handle()."); + return (AE_BAD_PARAMETER); + } + + if (board >= acpidev_dr_boards) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: board number %d is out of range, max %d.", + board, acpidev_dr_boards); + return (AE_NOT_FOUND); + } + + /* Use cached handles if caching is enabled. */ + if ((acpidev_options & ACPIDEV_OUSER_NO_CACHE) == 0) { + if (acpidev_dr_board_handles[board] != NULL) { + hdl = acpidev_dr_board_handles[board]; + if (ACPI_FAILURE(acpidev_dr_find_board(hdl, 1, + (void *)(intptr_t)board, (void **)hdlp)) && + *hdlp != NULL) { + return (AE_OK); + } + } + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: board %d doesn't exist.", board); + *hdlp = NULL; + return (AE_NOT_FOUND); + } + + /* All hotplug capable boards should exist under \_SB_. 
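+ * When handle caching is disabled, the board is instead located by
+ * walking the namespace below \_SB_ with acpidev_dr_find_board() as
+ * the match callback, comparing each device's recorded board number.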
*/ + if (ACPI_FAILURE(AcpiGetHandle(ACPI_ROOT_OBJECT, + ACPIDEV_OBJECT_NAME_SB, &hdl))) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get handle of %s.", + ACPIDEV_OBJECT_NAME_SB); + return (AE_ERROR); + } + + *hdlp = NULL; + if (ACPI_FAILURE(AcpiWalkNamespace(ACPI_TYPE_DEVICE, hdl, + ACPIDEV_MAX_ENUM_LEVELS - 1, acpidev_dr_find_board, NULL, + (void *)(intptr_t)board, (void **)hdlp))) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to find ACPI handle " + "for board %d.", board); + rc = AE_NOT_FOUND; + } else if (*hdlp == NULL) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: board %d doesn't exist.", board); + rc = AE_NOT_FOUND; + } + + return (rc); +} + +acpidev_board_type_t +acpidev_dr_get_board_type(ACPI_HANDLE hdl) +{ + acpidev_data_handle_t dhdl; + acpidev_board_type_t type = ACPIDEV_INVALID_BOARD; + + if (hdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_board_type()."); + return (type); + } + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get data associated with %p.", hdl); + } else { + type = dhdl->aod_bdtype; + } + + return (type); +} + +ACPI_STATUS +acpidev_dr_get_board_number(ACPI_HANDLE hdl, uint32_t *bnump) +{ + acpidev_data_handle_t dhdl; + + if (hdl == NULL || bnump == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_board_number()."); + return (AE_BAD_PARAMETER); + } + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get data associated with %p.", hdl); + return (AE_ERROR); + } + *bnump = dhdl->aod_bdnum; + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_get_board_name(ACPI_HANDLE hdl, char *buf, size_t len) +{ + char *fmt; + int count = 0; + size_t rlen = 0; + ACPI_HANDLE thdl; + acpidev_data_handle_t dhdl; + acpidev_data_handle_t dhdls[ACPIDEV_MAX_ENUM_LEVELS]; + + if (hdl == NULL || buf == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_board_name()."); + return (AE_BAD_PARAMETER); + } + + /* Find ancestors of the device which are hotplug capable. */ + for (thdl = hdl; thdl != NULL; ) { + dhdl = acpidev_data_get_handle(thdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get data " + "associated with %p.", thdl); + return (AE_ERROR); + } + + if (!ACPIDEV_DR_IS_BOARD(dhdl)) { + /* The board itself should be hotplug capable. */ + if (count == 0) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %p is " + "not hotplug capable.", thdl); + return (AE_ERROR); + } + } else { + if (ACPIDEV_DR_IS_FAILED(dhdl)) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %p is " + "in the FAILED state.", thdl); + } + + if (count >= ACPIDEV_MAX_ENUM_LEVELS) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: recursive level for hotplug " + "capable board is too deep."); + return (AE_ERROR); + } + + dhdls[count] = dhdl; + count++; + } + + if (acpidev_dr_hierarchy_name == 0) { + thdl = NULL; + } else if (ACPI_FAILURE(AcpiGetParent(thdl, &thdl))) { + thdl = NULL; + } + } + + /* Generate hierarchy board name for the board. 
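+ * Components are emitted outermost board first and joined with '.',
+ * so a CPU board nested under a system board could yield a name of
+ * the form SB0.CPU1 (illustrative; the component text comes from the
+ * ACPIDEV_DR_*_BD_FMT format strings and the per-board index).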
*/ + ASSERT(count > 0); + for (count--; count >= 0 && rlen < len; count--) { + dhdl = dhdls[count]; + switch (dhdl->aod_bdtype) { + case ACPIDEV_CPU_BOARD: + fmt = ACPIDEV_DR_CPU_BD_FMT; + break; + case ACPIDEV_MEMORY_BOARD: + fmt = ACPIDEV_DR_MEMORY_BD_FMT; + break; + case ACPIDEV_IO_BOARD: + fmt = ACPIDEV_DR_IO_BD_FMT; + break; + case ACPIDEV_SYSTEM_BOARD: + fmt = ACPIDEV_DR_SYSTEM_BD_FMT; + break; + case ACPIDEV_INVALID_BOARD: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid board type."); + return (AE_ERROR); + default: + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: unknown board type %u.", + dhdl->aod_bdtype); + return (AE_ERROR); + } + + /* Add "." before component name except first item. */ + if (rlen != 0) { + rlen += snprintf(buf + rlen, len - rlen, "."); + } + if (rlen < len) { + rlen += snprintf(buf + rlen, len - rlen, fmt, + dhdl->aod_bdidx); + } + } + + /* Check whether the buffer is sufficient. */ + if (rlen >= len) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: buffer length to " + "acpidev_dr_get_board_name() is too small."); + return (AE_NO_MEMORY); + } + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_get_attachment_point(ACPI_HANDLE hdl, char *buf, size_t len) +{ + size_t rlen; + + if (hdl == NULL || buf == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_attachment_point()."); + return (AE_BAD_PARAMETER); + } + + rlen = snprintf(buf, len, "/devices/%s/%s@%u:", + ACPIDEV_NODE_NAME_ROOT, ACPIDEV_NODE_NAME_ACPIDR, 0); + if (rlen >= len) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: buffer to " + "acpidev_dr_get_attachment_point() is too small."); + return (AE_NO_MEMORY); + } + + return (acpidev_dr_get_board_name(hdl, buf + rlen, len - rlen)); +} + +/* + * Existence of ACPI _EJ0 method implies that the device is hotplug capable. + */ +int +acpidev_dr_device_hotplug_capable(ACPI_HANDLE hdl) +{ + ACPI_HANDLE ej0; + + ASSERT(hdl != NULL); + if (ACPI_FAILURE(AcpiGetHandle(hdl, ACPIDEV_METHOD_NAME_EJ0, &ej0))) { + return (0); + } + + return (1); +} + +int +acpidev_dr_device_has_edl(ACPI_HANDLE hdl) +{ + ACPI_HANDLE edl; + + ASSERT(hdl != NULL); + if (ACPI_FAILURE(AcpiGetHandle(hdl, ACPIDEV_METHOD_NAME_EDL, &edl))) { + return (0); + } + + return (1); +} + +int +acpidev_dr_device_is_present(ACPI_HANDLE hdl) +{ + int status; + + ASSERT(hdl != NULL); + + status = acpidev_query_device_status(hdl); + if (acpidev_check_device_present(status)) { + return (1); + } + + return (0); +} + +int +acpidev_dr_device_is_powered(ACPI_HANDLE hdl) +{ + int status; + + ASSERT(hdl != NULL); + + /* + * Check device status returned by ACPI _STA method. + * It implies that the device is powered if status is both PRESENT + * and ENABLED. + */ + status = acpidev_query_device_status(hdl); + if (acpidev_check_device_enabled(status)) { + return (1); + } + + return (0); +} + +ACPI_STATUS +acpidev_dr_get_mem_alignment(ACPI_HANDLE hdl, uint64_t *ap) +{ + acpidev_dr_capacity_t *cp; + + ASSERT(hdl != NULL); + ASSERT(ap != NULL); + if (ap == NULL || hdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_get_mem_alignment()."); + return (AE_BAD_PARAMETER); + } + + cp = acpidev_dr_get_capacity(); + if (cp == NULL || cp->hotplug_supported == B_FALSE) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get memory alignment."); + return (AE_SUPPORT); + } + *ap = cp->memory_alignment; + + return (AE_OK); +} + +/* + * Get the device property for the given name and store it into buf. + * Returns the amount of data copied to buf if len is large enough to + * hold all of the data. 
If len is not large enough, then the required + * len would be returned and buf would not be modified. On any errors, + * -1 is returned and buf is not modified. + */ +ACPI_STATUS +acpidev_dr_device_get_regspec(ACPI_HANDLE hdl, boolean_t assigned, + acpidev_regspec_t **regpp, uint_t *cntp) +{ + int *valp; + uint_t count; + char *propname; + dev_info_t *dip; + acpidev_data_handle_t dhdl; + + ASSERT(hdl != NULL); + ASSERT(regpp != NULL && cntp != NULL); + if (hdl == NULL || regpp == NULL || cntp == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters to " + "acpidev_dr_device_get_regspec()."); + return (AE_BAD_PARAMETER); + } + + /* Set default return value. */ + *regpp = NULL; + *cntp = 0; + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get data associated with %p.", hdl); + return (AE_ERROR); + } else if ((dip = acpidev_data_get_devinfo(dhdl)) == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get dip associated with %p.", hdl); + return (AE_NOT_FOUND); + } + + propname = assigned ? "assigned-addresses" : "reg"; + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + propname, &valp, &count) != DDI_PROP_SUCCESS) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to lookup device property %s.", propname); + return (AE_NOT_FOUND); + } + + if (count % (sizeof (**regpp) / sizeof (int)) != 0) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: device property %s is invalid.", propname); + ddi_prop_free(valp); + return (AE_ERROR); + } + + *regpp = (acpidev_regspec_t *)valp; + *cntp = count / (sizeof (**regpp) / sizeof (int)); + + return (AE_OK); +} + +void +acpidev_dr_device_free_regspec(acpidev_regspec_t *regp, uint_t count) +{ + _NOTE(ARGUNUSED(count)); + + if (regp != NULL) { + ddi_prop_free(regp); + } +} + +/* + * Return values + * . negative values on error + * . size of data copied to buffer if it's bigger enough + * . size of buffer needed if buffer is too small + */ +int +acpidev_dr_device_getprop(ACPI_HANDLE hdl, char *name, caddr_t buf, size_t len) +{ + int rlen = -1; + acpidev_data_handle_t dhdl; + + if (hdl == NULL) { + return (-1); + } + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + return (-1); + } else if (!ACPIDEV_DR_IS_WORKING(dhdl)) { + return (-1); + } + + if (strcmp(name, ACPIDEV_DR_PROP_PORTID) == 0) { + if (len >= sizeof (uint32_t)) { + *(uint32_t *)(void *)buf = dhdl->aod_portid; + } + rlen = sizeof (uint32_t); + } else if (strcmp(name, ACPIDEV_DR_PROP_BOARDNUM) == 0) { + if (len >= sizeof (uint32_t)) { + *(uint32_t *)(void *)buf = dhdl->aod_bdnum; + } + rlen = sizeof (uint32_t); + } else if (strcmp(name, ACPIDEV_DR_PROP_DEVNAME) == 0) { + switch (dhdl->aod_class_id) { + case ACPIDEV_CLASS_ID_CPU: + if (len >= sizeof (ACPIDEV_NODE_NAME_CPU)) { + (void) strlcpy((char *)buf, + ACPIDEV_NODE_NAME_CPU, len); + } + rlen = sizeof (ACPIDEV_NODE_NAME_CPU); + break; + + case ACPIDEV_CLASS_ID_MEMORY: + if (len >= sizeof (ACPIDEV_NODE_NAME_MEMORY)) { + (void) strlcpy((char *)buf, + ACPIDEV_NODE_NAME_MEMORY, len); + } + rlen = sizeof (ACPIDEV_NODE_NAME_MEMORY); + break; + + case ACPIDEV_CLASS_ID_PCI: + case ACPIDEV_CLASS_ID_PCIEX: + if (len >= sizeof (ACPIDEV_NODE_NAME_PCI)) { + (void) strlcpy((char *)buf, + ACPIDEV_NODE_NAME_PCI, len); + } + rlen = sizeof (ACPIDEV_NODE_NAME_PCI); + break; + + default: + break; + } + } + + return (rlen); +} + +/* + * Figure out device class of the device. + * It only supports device classes which may be involved in DR operations. 
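+ * Classification is two-stage: ACPI_TYPE_PROCESSOR objects are CPUs
+ * outright, while ACPI_TYPE_DEVICE objects are matched against the
+ * _HID/_CID tables below (e.g. the memory class keys off
+ * ACPIDEV_HID_MEMORY, conventionally the PNP0C80 memory-device id).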
+ */ +acpidev_class_id_t +acpidev_dr_device_get_class(ACPI_HANDLE hdl) +{ + ACPI_OBJECT_TYPE type; + ACPI_DEVICE_INFO *infop; + acpidev_class_id_t id = ACPIDEV_CLASS_ID_INVALID; + + static char *acpidev_id_cpu[] = { + ACPIDEV_HID_CPU, + }; + static char *acpidev_id_mem[] = { + ACPIDEV_HID_MEMORY, + }; + static char *acpidev_id_mod[] = { + ACPIDEV_HID_MODULE, + }; + static char *acpidev_id_pci[] = { + ACPIDEV_HID_PCI_HOSTBRIDGE, + }; + static char *acpidev_id_pciex[] = { + ACPIDEV_HID_PCIEX_HOSTBRIDGE, + }; + + /* Figure out device type by checking ACPI object type. */ + if (ACPI_FAILURE(AcpiGetType(hdl, &type))) { + return (ACPIDEV_CLASS_ID_INVALID); + } else if (type == ACPI_TYPE_PROCESSOR) { + return (ACPIDEV_CLASS_ID_CPU); + } else if (type != ACPI_TYPE_DEVICE) { + return (ACPIDEV_CLASS_ID_INVALID); + } + + if (ACPI_FAILURE(AcpiGetObjectInfo(hdl, &infop))) { + return (ACPIDEV_CLASS_ID_INVALID); + } + + /* Figure out device type by checking _HID and _CID. */ + if (acpidev_match_device_id(infop, + ACPIDEV_ARRAY_PARAM(acpidev_id_cpu))) { + id = ACPIDEV_CLASS_ID_CPU; + } else if (acpidev_match_device_id(infop, + ACPIDEV_ARRAY_PARAM(acpidev_id_mem))) { + id = ACPIDEV_CLASS_ID_MEMORY; + } else if (acpidev_match_device_id(infop, + ACPIDEV_ARRAY_PARAM(acpidev_id_mod))) { + id = ACPIDEV_CLASS_ID_CONTAINER; + } else if (acpidev_match_device_id(infop, + ACPIDEV_ARRAY_PARAM(acpidev_id_pciex))) { + id = ACPIDEV_CLASS_ID_PCIEX; + } else if (acpidev_match_device_id(infop, + ACPIDEV_ARRAY_PARAM(acpidev_id_pci))) { + id = ACPIDEV_CLASS_ID_PCI; + } + + AcpiOsFree(infop); + + return (id); +} + +ACPI_STATUS +acpidev_dr_device_get_memory_index(ACPI_HANDLE hdl, uint32_t *idxp) +{ + acpidev_data_handle_t dhdl; + + ASSERT(idxp != NULL); + ASSERT(hdl != NULL); + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get data handle for %p.", hdl); + return (AE_ERROR); + } else if (dhdl->aod_class_id != ACPIDEV_CLASS_ID_MEMORY) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: object %p is not a memory device.", hdl); + return (AE_ERROR); + } else { + *idxp = dhdl->aod_memidx; + } + + return (AE_OK); +} + +int +acpidev_dr_device_is_board(ACPI_HANDLE hdl) +{ + acpidev_data_handle_t dhdl; + + ASSERT(hdl != NULL); + if (hdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_is_board()."); + return (0); + } + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + return (0); + } else if (!ACPIDEV_DR_IS_BOARD(dhdl)) { + return (0); + } + + return (1); +} + +ACPI_STATUS +acpidev_dr_device_walk_edl(ACPI_HANDLE hdl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval) +{ + ACPI_STATUS rc = AE_OK; + int i; + char *objname; + ACPI_OBJECT *obj; + ACPI_BUFFER buf; + char *method = ACPIDEV_METHOD_NAME_EDL; + + ASSERT(hdl != NULL); + ASSERT(cb != NULL); + if (hdl == NULL || cb == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_device_walk_edl()."); + return (AE_BAD_PARAMETER); + } + + objname = acpidev_get_object_name(hdl); + buf.Length = ACPI_ALLOCATE_BUFFER; + rc = AcpiEvaluateObjectTyped(hdl, method, NULL, &buf, + ACPI_TYPE_PACKAGE); + if (rc == AE_NOT_FOUND) { + acpidev_free_object_name(objname); + return (AE_OK); + } else if (ACPI_FAILURE(rc)) { + cmn_err(CE_WARN, + "!acpidev: failed to evaluate method %s under %s.", + method, objname); + acpidev_free_object_name(objname); + return (AE_ERROR); + } + + /* Validate the package structure. 
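+ * _EDL is expected to evaluate to a package of device references,
+ * roughly (ASL sketch): Name (_EDL, Package () { \_SB.PCI0 }).
+ * Every element must be a local reference resolving to a device;
+ * anything else fails the walk below.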
*/ + obj = buf.Pointer; + for (i = 0; i < obj->Package.Count; i++) { + if (obj->Package.Elements[i].Type != + ACPI_TYPE_LOCAL_REFERENCE) { + cmn_err(CE_WARN, "!acpidev: element %d in package " + "returned by %s of %s is not local reference.", + i, method, objname); + AcpiOsFree(buf.Pointer); + acpidev_free_object_name(objname); + return (AE_ERROR); + } else if (obj->Package.Elements[i].Reference.ActualType != + ACPI_TYPE_DEVICE) { + cmn_err(CE_WARN, "!acpidev: element %d in package " + "returned by %s of %s doesn't refer to device.", + i, method, objname); + AcpiOsFree(buf.Pointer); + acpidev_free_object_name(objname); + return (AE_ERROR); + } + } + + for (i = 0; i < obj->Package.Count; i++) { + if (obj->Package.Elements[i].Reference.Handle == NULL) { + cmn_err(CE_WARN, "!acpidev: handle of element %d in " + "package returned by %s of %s is NULL.", + i, method, objname); + continue; + } + rc = (*cb)(obj->Package.Elements[i].Reference.Handle, + UINT32_MAX, arg, retval); + if (rc == AE_CTRL_DEPTH || rc == AE_CTRL_TERMINATE) { + rc = AE_OK; + } + if (ACPI_FAILURE(rc)) { + break; + } + } + + AcpiOsFree(buf.Pointer); + acpidev_free_object_name(objname); + + return (rc); +} + +ACPI_STATUS +acpidev_dr_device_walk_ejd(ACPI_HANDLE hdl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval) +{ + ACPI_STATUS rc = AE_OK; + char *objname; + ACPI_OBJECT *obj; + ACPI_BUFFER buf; + ACPI_HANDLE chdl; + char *method = ACPIDEV_METHOD_NAME_EJD; + + ASSERT(hdl != NULL); + ASSERT(cb != NULL); + if (hdl == NULL || cb == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_device_walk_ejd()."); + return (AE_BAD_PARAMETER); + } + + objname = acpidev_get_object_name(hdl); + buf.Length = ACPI_ALLOCATE_BUFFER; + rc = AcpiEvaluateObjectTyped(hdl, method, NULL, &buf, + ACPI_TYPE_STRING); + if (rc == AE_NOT_FOUND) { + acpidev_free_object_name(objname); + return (AE_OK); + } else if (ACPI_FAILURE(rc)) { + cmn_err(CE_WARN, + "!acpidev: failed to evaluate method %s under %s.", + method, objname); + acpidev_free_object_name(objname); + return (AE_ERROR); + } + + obj = buf.Pointer; + ASSERT(obj->String.Pointer); + if (ACPI_FAILURE(AcpiGetHandle(NULL, obj->String.Pointer, &chdl))) { + cmn_err(CE_WARN, "!acpidev: failed to get handle for %s.", + obj->String.Pointer); + rc = AE_ERROR; + } else { + rc = (*cb)(chdl, UINT32_MAX, arg, retval); + if (rc == AE_CTRL_DEPTH || rc == AE_CTRL_TERMINATE) { + rc = AE_OK; + } + } + + AcpiOsFree(buf.Pointer); + acpidev_free_object_name(objname); + + return (rc); +} + +/* + * Walk all child devices and special devices in the eject device list. + */ +static ACPI_STATUS +acpidev_dr_device_walk_child(ACPI_HANDLE hdl, boolean_t init, uint_t max_lvl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval) +{ + ACPI_STATUS rc = AE_OK; + + ASSERT(hdl != NULL); + ASSERT(cb != NULL); + if (hdl == NULL || cb == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_device_walk_child()."); + return (AE_BAD_PARAMETER); + } + + /* + * Walk the eject device list first when destroying. + * According to ACPI spec, devices in _EDL list must be handled first + * when the ejecting device. + */ + if (init == B_FALSE) { + rc = acpidev_dr_device_walk_edl(hdl, cb, arg, retval); + if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: failed to walk eject device list in " + "acpidev_dr_device_walk_child()."); + } + } + + /* Walk all child ACPI DEVICE objects. 
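+ * Ordering matters: on teardown (init == B_FALSE) the _EDL devices
+ * were handled first above, matching the ACPI requirement that
+ * dependents be ejected before the ejecting device; on initialization
+ * _EDL is walked last, after the DEVICE and PROCESSOR children.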
*/ + if (ACPI_SUCCESS(rc)) { + rc = AcpiWalkNamespace(ACPI_TYPE_DEVICE, hdl, + max_lvl, cb, NULL, arg, retval); + if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: failed to walk DEVICE objects in " + "acpidev_dr_device_walk_child()."); + } + } + + /* Walk all child ACPI PROCESSOR objects. */ + if (ACPI_SUCCESS(rc)) { + rc = AcpiWalkNamespace(ACPI_TYPE_PROCESSOR, hdl, + max_lvl, cb, NULL, arg, retval); + if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: failed to walk PROCESSOR objects in " + "acpidev_dr_device_walk_child()."); + } + } + + /* + * Walk the eject device list last when initializing. + */ + if (init == B_TRUE && ACPI_SUCCESS(rc)) { + rc = acpidev_dr_device_walk_edl(hdl, cb, arg, retval); + if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: failed to walk eject device list in " + "acpidev_dr_device_walk_child()."); + } + } + + return (rc); +} + +ACPI_STATUS +acpidev_dr_device_walk_device(ACPI_HANDLE hdl, uint_t max_lvl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval) +{ + ACPI_STATUS rc = AE_OK; + char *objname; + + ASSERT(hdl != NULL); + ASSERT(cb != NULL); + if (hdl == NULL || cb == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter to " + "acpidev_dr_walk_device()."); + return (AE_BAD_PARAMETER); + } + + /* Walk the top object itself first. */ + rc = (*cb)(hdl, 0, arg, retval); + if (rc == AE_CTRL_DEPTH || rc == AE_CTRL_TERMINATE) { + rc = AE_OK; + } else if (ACPI_FAILURE(rc)) { + objname = acpidev_get_object_name(hdl); + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to handle top node %s " + "in acpidev_dr_walk_device().", objname); + acpidev_free_object_name(objname); + } else { + rc = acpidev_dr_device_walk_child(hdl, B_TRUE, max_lvl, + cb, arg, retval); + if (ACPI_FAILURE(rc)) { + objname = acpidev_get_object_name(hdl); + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to handle descendant nodes of %s " + "in acpidev_dr_walk_device().", objname); + acpidev_free_object_name(objname); + } + } + + return (rc); +} + +static ACPI_STATUS +acpidev_dr_no_support(ACPI_HANDLE hdl, UINT32 lvl, void *arg, void **retval) +{ + _NOTE(ARGUNUSED(arg, retval)); + + char *objname; + + ASSERT(hdl != NULL); + + objname = acpidev_get_object_name(hdl); + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: device %s at level 0x%x is unsupported.", + objname, lvl); + acpidev_free_object_name(objname); + + return (AE_SUPPORT); +} + +static ACPI_STATUS +acpidev_dr_set_prop(ACPI_HANDLE hdl, char *objname, + struct acpidev_dr_set_prop_arg *ap, uint32_t lvl, + acpidev_class_id_t clsid, uint_t *devid) +{ + acpidev_data_handle_t dhdl; + + /* Create data handle first if it doesn't exist yet. */ + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + uint32_t rlvl; + ACPI_HANDLE phdl; + + /* + * Compute level by walking ACPI namespace if it's a device + * from the eject device list. + */ + if (lvl == UINT32_MAX) { + /* + * AcpiGetParent() fails when it tries to get + * the parent of the ACPI namespace root node. 
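+ * The loop below therefore counts AcpiGetParent() hops until it sees
+ * ACPI_ROOT_OBJECT, which yields the object's depth in the namespace;
+ * an rlvl of 0 afterwards is rejected as an error.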
+ */ + for (rlvl = 0, phdl = hdl; + ACPI_SUCCESS(AcpiGetParent(phdl, &phdl)); + rlvl++) { + if (phdl == ACPI_ROOT_OBJECT) { + break; + } + } + if (rlvl == 0) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get level of %s.", + objname); + return (AE_BAD_PARAMETER); + } + } else { + rlvl = ap->level; + } + if (rlvl >= ACPIDEV_MAX_ENUM_LEVELS) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: recursive level of %s is too deep.", + objname); + return (AE_SUPPORT); + } + + dhdl = acpidev_data_create_handle(hdl); + if (dhdl != NULL) { + dhdl->aod_hdl = hdl; + dhdl->aod_level = rlvl; + } + } + + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to create data handle " + "for device %s.", objname); + return (AE_NO_MEMORY); + } + + if (ACPIDEV_DR_IS_READY(dhdl)) { + /* + * The same device may be enumerated twice at most. Once as + * child devices, another time from the eject device list. + */ + if (dhdl->aod_bdnum == ap->bdnum) { + return (AE_OK); + } else { + /* + * A device has been enumerated more than once from + * different paths. It's dangerous to support such + * a topology. Disable support of DR operations. + */ + ACPIDEV_DEBUG(CE_WARN, "!acpidev: device %s has been " + "enumerated more than once for DR.", objname); + acpidev_dr_failed = 1; + return (AE_SUPPORT); + } + } + + /* Set properties for DR operations. */ + dhdl->aod_class_id = clsid; + dhdl->aod_bdnum = ap->bdnum; + dhdl->aod_portid = atomic_inc_32_nv(devid) - 1; + if (clsid == ACPIDEV_CLASS_ID_MEMORY) { + dhdl->aod_memidx = acpidev_dr_memory_device_cnt; + ASSERT(dhdl->aod_memidx < ACPI_MEMNODE_DEVID_BOOT); + } + ACPIDEV_DR_SET_READY(dhdl); + + return (AE_OK); +} + +/* + * Verify whether the hardware topology is supported by the DR driver. + * The ACPI specification is so flexible that for safety reasons, only + * a few well defined topologies are supported. + * Possible values of parameter lvl: + * 0: the device is the board itself. + * UINT32_MAX: the device is from the _EDL list of the board. + * other: the device is a descendant of the board. + * Return values: + * AE_OK: the topology is supported + * AE_SUPPORT: the topology is unsupported + * AE_ERROR: other errors + */ +static ACPI_STATUS +acpidev_dr_scan_topo(ACPI_HANDLE hdl, UINT32 lvl, void *arg, void **retval) +{ + _NOTE(ARGUNUSED(retval)); + + ACPI_STATUS rc = AE_OK; + char *objname; + acpidev_class_id_t cid; + struct acpidev_dr_set_prop_arg *ap = arg; + + ASSERT(hdl != NULL); + ASSERT(lvl == 0 || lvl == 1 || lvl == UINT32_MAX); + objname = acpidev_get_object_name(hdl); + + /* + * Validate descendants of the hotplug capable board. + * lvl is zero if it's the hotplug capable board itself, otherwise + * non-zero for descendants. + */ + if (lvl != 0) { + /* + * Skip subtree if the device is hotplug capable. + * It will be treated as another hotplug capable board. + */ + if (acpidev_dr_device_hotplug_capable(hdl)) { + acpidev_free_object_name(objname); + return (AE_CTRL_DEPTH); + } + + /* + * Don't support the _EDL list of a non-hotplug-capable device. + */ + if (acpidev_dr_device_has_edl(hdl)) { + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: non-hotplug-capable " + "object %s has _EDL method.", objname); + acpidev_free_object_name(objname); + return (AE_SUPPORT); + } + } + + cid = acpidev_dr_device_get_class(hdl); + switch (cid) { + case ACPIDEV_CLASS_ID_CPU: + /* Don't support logical CPUs in the _EDL list. 
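+ * (lvl == UINT32_MAX marks objects reached through an _EDL list
+ * rather than as children; CPUs are only supported as direct
+ * descendants of the board, so such topologies get AE_SUPPORT.)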
*/ + if (lvl == UINT32_MAX) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: logical CPU %s in " + "_EDL is unsupported.", objname); + rc = AE_SUPPORT; + break; + } + + /* Don't support logical CPUs with children. */ + ap->level++; + rc = acpidev_dr_device_walk_child(hdl, B_TRUE, 1, + acpidev_dr_no_support, arg, NULL); + ap->level--; + if (rc == AE_SUPPORT) { + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: logical CPU %s has " + "child or dependent devices.", objname); + break; + } else if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to scan " + "children of logical CPU %s.", objname); + rc = AE_ERROR; + break; + } else if (ap != NULL) { + rc = acpidev_dr_set_prop(hdl, objname, ap, lvl, + ACPIDEV_CLASS_ID_CPU, &ap->cpu_id); + } + break; + + case ACPIDEV_CLASS_ID_MEMORY: + /* Don't support memory devices with children. */ + ap->level++; + rc = acpidev_dr_device_walk_child(hdl, B_TRUE, 1, + acpidev_dr_no_support, arg, NULL); + ap->level--; + if (rc == AE_SUPPORT) { + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: memory device %s has child or " + "dependent devices.", objname); + } else if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to scan children of " + "memory device %s.", objname); + rc = AE_ERROR; + } else if (ap != NULL) { + acpidev_dr_memory_device_cnt++; + rc = acpidev_dr_set_prop(hdl, objname, ap, lvl, + ACPIDEV_CLASS_ID_MEMORY, &ap->mem_id); + } + break; + + case ACPIDEV_CLASS_ID_PCI: + case ACPIDEV_CLASS_ID_PCIEX: + /* Don't scan child/descendant devices of PCI/PCIex devices. */ + if (ap != NULL) { + rc = acpidev_dr_set_prop(hdl, objname, ap, lvl, + cid, &ap->io_id); + } + break; + + case ACPIDEV_CLASS_ID_CONTAINER: + /* Don't support module devices in the _EDL list. */ + if (lvl == UINT32_MAX) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: module device %s in " + "_EDL is unsupported.", objname); + rc = AE_SUPPORT; + break; + } + + /* Don't support recurrence of module devices. */ + if (lvl > 0) { + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: recursion level of " + "module device %s is too deep.", objname); + rc = AE_SUPPORT; + break; + } + + ap->level++; + rc = acpidev_dr_device_walk_child(hdl, B_TRUE, 1, + acpidev_dr_scan_topo, arg, NULL); + ap->level--; + if (ACPI_SUCCESS(rc) && ap != NULL) { + rc = acpidev_dr_set_prop(hdl, objname, ap, lvl, + ACPIDEV_CLASS_ID_CONTAINER, &ap->mod_id); + } + break; + + case ACPIDEV_CLASS_ID_INVALID: + /*FALLTHROUGH*/ + default: + ACPIDEV_DEBUG(CE_NOTE, + "!acpidev: device %s is unsupported.", objname); + rc = AE_SUPPORT; + break; + } + + acpidev_free_object_name(objname); + + return (rc); +} + +/* Create walk information structures. 
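+ * Two acpidev_walk_info_t structures are built for a hotplug probe:
+ * one for the parent, whose dip is found by walking up the namespace
+ * if necessary, and one for the device itself chained to the parent.
+ * The caller releases both with acpidev_free_walk_info().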
+ */
+static ACPI_STATUS
+acpidev_dr_create_walk_info(ACPI_HANDLE hdl, acpidev_data_handle_t dhdl,
+    char *objname, acpidev_walk_info_t **infopp, acpidev_walk_info_t **cinfopp)
+{
+	ACPI_HANDLE phdl = NULL;
+	dev_info_t *pdip = NULL;
+	acpidev_data_handle_t pdhdl, tdhdl;
+	acpidev_walk_info_t *infop = NULL, *cinfop = NULL;
+
+	ASSERT(hdl != NULL);
+	ASSERT(dhdl != NULL);
+	ASSERT(dhdl->aod_class_list != NULL);
+	ASSERT(objname != NULL);
+	ASSERT(infopp != NULL);
+	ASSERT(cinfopp != NULL);
+
+	if (ACPI_FAILURE(AcpiGetParent(hdl, &phdl))) {
+		cmn_err(CE_WARN,
+		    "!acpidev: failed to get parent object of %s.", objname);
+		return (AE_ERROR);
+	}
+
+	pdhdl = acpidev_data_get_handle(phdl);
+	if (pdhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get data "
+		    "associated with parent of %s.", objname);
+		return (AE_ERROR);
+	}
+	if (pdhdl->aod_level >= ACPIDEV_MAX_ENUM_LEVELS - 1) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: recursion level (%d) of %s is too deep.",
+		    pdhdl->aod_level, objname);
+		return (AE_ERROR);
+	}
+	ASSERT(pdhdl->aod_class_list != NULL);
+	if (pdhdl->aod_class_list == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: class list for parent of %s is NULL.", objname);
+		return (AE_ERROR);
+	}
+
+	/* Allocate a walk info structure for its parent. */
+	infop = acpidev_alloc_walk_info(ACPIDEV_OP_HOTPLUG_PROBE,
+	    pdhdl->aod_level, phdl, dhdl->aod_class_list, NULL);
+	if (infop == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to allocate walk info "
+		    "structure for parent of %s.", objname);
+		return (AE_ERROR);
+	}
+
+	/* Get the parent dip if it's not ready yet. */
+	while (infop->awi_dip == NULL) {
+		if (ACPI_FAILURE(AcpiGetParent(phdl, &phdl))) {
+			ACPIDEV_DEBUG(CE_WARN,
+			    "!acpidev: failed to get parent of object %p.",
+			    phdl);
+			break;
+		}
+		tdhdl = acpidev_data_get_handle(phdl);
+		if (tdhdl == NULL) {
+			ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get data "
+			    "associated with object %p.", phdl);
+			break;
+		}
+		pdip = acpidev_data_get_devinfo(tdhdl);
+		if (pdip != NULL) {
+			infop->awi_dip = pdip;
+			break;
+		}
+		/* Give up if we reach the ACPI namespace root node. */
+		if (phdl == ACPI_ROOT_OBJECT) {
+			break;
+		}
+	}
+	if (infop->awi_dip == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get parent dip of %s.", objname);
+		acpidev_free_walk_info(infop);
+		return (AE_ERROR);
+	}
+
+	/* Allocate a walk info for the child. */
+	cinfop = acpidev_alloc_walk_info(ACPIDEV_OP_HOTPLUG_PROBE,
+	    infop->awi_level + 1, hdl, NULL, infop);
+	if (cinfop == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to allocate walk info "
+		    "structure for %s.", objname);
+		acpidev_free_walk_info(infop);
+		return (AE_ERROR);
+	}
+
+	*infopp = infop;
+	*cinfopp = cinfop;
+
+	return (AE_OK);
+}
+
+static ACPI_STATUS
+acpidev_dr_probe_object(ACPI_HANDLE hdl, acpidev_data_handle_t dhdl)
+{
+	ACPI_STATUS rc = AE_OK;
+	int circ;
+	char *objname;
+	dev_info_t *pdip;
+	ACPI_STATUS res;
+	ACPI_OBJECT_TYPE type;
+	acpidev_class_list_t *it;
+	acpidev_walk_info_t *infop, *cinfop;
+
+	ASSERT(hdl != NULL);
+	ASSERT(dhdl != NULL);
+	if (hdl == NULL || dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: hdl or dhdl is NULL in "
+		    "acpidev_dr_probe_object().");
+		return (AE_BAD_PARAMETER);
+	}
+	objname = acpidev_get_object_name(hdl);
+
+	/* Check whether the device is of interest.
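+	 * Only ACPI object types set in acpidev_object_type_mask take part
+	 * in enumeration; everything else is rejected up front.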
+	 */
+	if (ACPI_FAILURE(AcpiGetType(hdl, &type)) ||
+	    type > ACPI_TYPE_NS_NODE_MAX ||
+	    BT_TEST(acpidev_object_type_mask, type) == 0) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: ACPI object %s is unsupported.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_SUPPORT);
+	}
+
+	if (dhdl->aod_class_list == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: class list is NULL in data associated with %s.",
+		    objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	pdip = NULL;
+	infop = NULL;
+	cinfop = NULL;
+	rc = acpidev_dr_create_walk_info(hdl, dhdl, objname, &infop, &cinfop);
+	if (ACPI_FAILURE(rc)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to create walk info structures for %s.",
+		    objname);
+		acpidev_free_object_name(objname);
+		return (rc);
+	}
+	ASSERT(infop != NULL);
+	ASSERT(infop->awi_dip != NULL);
+	ASSERT(infop->awi_class_list != NULL);
+	ASSERT(cinfop != NULL);
+	ASSERT(cinfop->awi_data == dhdl);
+
+	/* Lock the parent dip before touching children. */
+	pdip = infop->awi_dip;
+	ndi_devi_enter(pdip, &circ);
+	rw_enter(&acpidev_class_lock, RW_READER);
+
+	/* Call pre-probe callback functions to prepare for probing. */
+	for (it = *(infop->awi_class_list); it != NULL; it = it->acl_next) {
+		if (it->acl_class->adc_pre_probe == NULL) {
+			continue;
+		}
+		infop->awi_class_curr = it->acl_class;
+		if (ACPI_FAILURE(it->acl_class->adc_pre_probe(infop))) {
+			ACPIDEV_DEBUG(CE_NOTE, "!acpidev: failed to pre-probe "
+			    "device of type %s under %s.",
+			    it->acl_class->adc_class_name, infop->awi_name);
+		}
+	}
+
+	/* Call registered probe callback functions to probe devices. */
+	for (it = *(infop->awi_class_list); it != NULL; it = it->acl_next) {
+		if (it->acl_class->adc_probe == NULL) {
+			continue;
+		}
+		cinfop->awi_class_curr = it->acl_class;
+		res = it->acl_class->adc_probe(cinfop);
+		if (ACPI_FAILURE(res)) {
+			rc = res;
+			ACPIDEV_DEBUG(CE_NOTE,
+			    "!acpidev: failed to process object %s under %s.",
+			    objname, infop->awi_name);
+		}
+	}
+
+	/* Call post-probe callback functions to clean up. */
+	for (it = *(infop->awi_class_list); it != NULL; it = it->acl_next) {
+		if (it->acl_class->adc_post_probe == NULL) {
+			continue;
+		}
+		infop->awi_class_curr = it->acl_class;
+		if (ACPI_FAILURE(it->acl_class->adc_post_probe(infop))) {
+			ACPIDEV_DEBUG(CE_NOTE, "!acpidev: failed to post-probe "
+			    "device of type %s under %s.",
+			    it->acl_class->adc_class_name, infop->awi_name);
+		}
+	}
+
+	rw_exit(&acpidev_class_lock);
+	ndi_devi_exit(pdip, circ);
+
+	acpidev_free_walk_info(cinfop);
+	acpidev_free_walk_info(infop);
+	acpidev_free_object_name(objname);
+
+	return (rc);
+}
+
+/*
+ * Some PCI/PCIex buses embedded in physical processors may be presented in
+ * the eject device list instead of being presented as child devices.
+ * This function identifies such devices and creates device nodes for them.
+ */
+static ACPI_STATUS
+acpidev_dr_probe_dependent(ACPI_HANDLE hdl, UINT32 lvl, void *ctx,
+    void **retval)
+{
+	_NOTE(ARGUNUSED(retval));
+
+	ACPI_STATUS rc = AE_OK;
+	int status;
+	char *objname;
+	ACPI_HANDLE phdl, thdl;
+	acpidev_data_handle_t dhdl;
+
+	ASSERT(lvl == UINT32_MAX);
+	ASSERT(hdl != NULL);
+	ASSERT(ctx != NULL);
+	phdl = ctx;
+	objname = acpidev_get_object_name(hdl);
+
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data associated with %s.",
+		    objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	/*
+	 * It should be treated as another board if the device is hotplug
+	 * capable.
+	 */
+	if (ACPIDEV_DR_IS_BOARD(dhdl)) {
+		acpidev_free_object_name(objname);
+		return (AE_OK);
+	} else if (!ACPIDEV_DR_IS_WORKING(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: %s is unusable for DR operations.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_SUPPORT);
+	}
+
+	/*
+	 * Skip hdl if it's a descendant of phdl because it should have
+	 * already been handled when handling phdl itself.
+	 */
+	for (thdl = hdl; ACPI_SUCCESS(AcpiGetParent(thdl, &thdl)); ) {
+		/* Return when we reach phdl. */
+		if (thdl == phdl) {
+			acpidev_free_object_name(objname);
+			return (AE_OK);
+		}
+		/* Break out when we reach the ACPI namespace root node. */
+		if (thdl == ACPI_ROOT_OBJECT) {
+			break;
+		}
+	}
+
+	/*
+	 * No support of enumerating PCI/PCIex Host Bridge devices yet.
+	 * It will be enabled when PCI/PCIex Host Bridge hotplug is ready.
+	 */
+	if (dhdl->aod_class_id == ACPIDEV_CLASS_ID_PCI ||
+	    dhdl->aod_class_id == ACPIDEV_CLASS_ID_PCIEX) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: PCI/PCIEX host bridge %s is "
+		    "unsupported, skip it.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_OK);
+	}
+
+	/* Check whether the device exists and has been enabled. */
+	status = acpidev_query_device_status(hdl);
+	if (!acpidev_check_device_enabled(status)) {
+		ACPIDEV_DEBUG(CE_NOTE, "!acpidev: object %s is disabled/absent "
+		    "when trying to connect it.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_OK);
+	}
+
+	/* Probe the device and its children. */
+	rc = acpidev_dr_probe_object(hdl, dhdl);
+	if (ACPI_FAILURE(rc)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to probe object %s in eject device list.",
+		    objname);
+		return (rc);
+	}
+
+	return (AE_OK);
+}
+
+ACPI_STATUS
+acpidev_dr_device_insert(ACPI_HANDLE hdl)
+{
+	ACPI_STATUS rc = AE_OK;
+	int status, circ;
+	char *objname;
+	dev_info_t *dip;
+	acpidev_data_handle_t dhdl;
+
+	ASSERT(acpidev_root_node() != NULL);
+	ASSERT(hdl != NULL);
+	if (hdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: parameter hdl to "
+		    "acpidev_dr_device_insert() is NULL.");
+		return (AE_BAD_PARAMETER);
+	}
+
+	objname = acpidev_get_object_name(hdl);
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data handle associated with %s.",
+		    objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	/* Validate that the object is hotplug capable. */
+	if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: object %s is not hotplug capable.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_SUPPORT);
+	} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %s is in the FAILED "
+		    "state, unusable for DR.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	/* Check whether the device exists and has been enabled. */
+	status = acpidev_query_device_status(hdl);
+	if (!acpidev_check_device_enabled(status)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %s is disabled/absent "
+		    "when trying to connect it.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_NOT_EXIST);
+	}
+
+	/* Check that there's no device node created for the object yet.
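+	 * An existing dip means the board is already connected, so report
+	 * AE_ALREADY_EXISTS instead of probing it twice.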
+	 */
+	dip = acpidev_data_get_devinfo(dhdl);
+	if (dip != NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: device node for object %s "
+		    "already exists when trying to connect it.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_ALREADY_EXISTS);
+	}
+
+	/*
+	 * Solaris has a limitation that all device nodes for PCI/PCIex host
+	 * bridges must exist directly under /devices.
+	 * Special care is needed here to deal with hot-adding PCI/PCIex host
+	 * bridges to avoid deadlock caused by ndi_devi_enter().
+	 * Here the lock on ddi_root_node() is held first, which breaks the
+	 * potential deadlock cycle.
+	 */
+	ndi_devi_enter(ddi_root_node(), &circ);
+
+	rc = acpidev_dr_probe_object(hdl, dhdl);
+	if (ACPI_SUCCESS(rc)) {
+		rc = acpidev_dr_device_walk_edl(hdl,
+		    &acpidev_dr_probe_dependent, hdl, NULL);
+	}
+	if (ACPI_FAILURE(rc)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to create device "
+		    "nodes for children of %s.", objname);
+		cmn_err(CE_WARN, "!acpidev: disable DR support for object %s "
+		    "due to failure when creating device nodes for it.",
+		    objname);
+		ACPIDEV_DR_SET_FAILED(dhdl);
+	}
+
+	ndi_devi_exit(ddi_root_node(), circ);
+	acpidev_free_object_name(objname);
+
+	return (rc);
+}
+
+static ACPI_STATUS
+acpidev_dr_device_remove_cb(ACPI_HANDLE hdl, UINT32 lvl, void *ctx,
+    void **retval)
+{
+	_NOTE(ARGUNUSED(lvl));
+
+	ACPI_STATUS rc = AE_OK;
+	int status;
+	char *objname;
+	dev_info_t *dip;
+	acpidev_data_handle_t dhdl;
+	struct acpidev_dr_device_remove_arg *argp;
+
+	ASSERT(hdl != NULL && ctx != NULL);
+	if (hdl == NULL || ctx == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: parameter to "
+		    "acpidev_dr_device_remove_cb() is NULL.");
+		return (AE_BAD_PARAMETER);
+	}
+
+	argp = (struct acpidev_dr_device_remove_arg *)ctx;
+	objname = acpidev_get_object_name(hdl);
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data handle associated with %s.",
+		    objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	/* Validate that the object is hotplug capable. */
+	/* It's the hotplug capable board itself if level is zero. */
+	if (argp->level == 0) {
+		if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+			ACPIDEV_DEBUG(CE_WARN,
+			    "!acpidev: object %s is not hotplug capable.",
+			    objname);
+			acpidev_free_object_name(objname);
+			return (AE_SUPPORT);
+		} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+			ACPIDEV_DEBUG(CE_WARN,
+			    "!acpidev: object %s is unusable for DR.", objname);
+			acpidev_free_object_name(objname);
+			return (AE_SUPPORT);
+		}
+	} else {
+		/* It's a device under the hotplug capable board. */
+		/*
+		 * Skip it if the device itself is hotplug capable.
+		 * It will be treated as another hotplug capable board.
+		 */
+		if (ACPIDEV_DR_IS_BOARD(dhdl)) {
+			acpidev_free_object_name(objname);
+			return (AE_OK);
+		}
+
+		if (!ACPIDEV_DR_IS_READY(dhdl)) {
+			ACPIDEV_DEBUG(CE_WARN,
+			    "!acpidev: object %s is not hotplug capable.",
+			    objname);
+			acpidev_free_object_name(objname);
+			return (AE_SUPPORT);
+		} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+			ACPIDEV_DEBUG(CE_WARN,
+			    "!acpidev: object %s is unusable for DR.", objname);
+			acpidev_free_object_name(objname);
+			return (AE_SUPPORT);
+		}
+	}
+
+	/* Skip the device if it hasn't been enabled at all.
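+	 * A device that was never enabled never had a device node created
+	 * for it, so there is nothing to tear down.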
*/ + status = acpidev_data_get_status(dhdl); + if (!acpidev_check_device_enabled(status)) { + acpidev_free_object_name(objname); + return (AE_OK); + } + + dip = acpidev_data_get_devinfo(dhdl); + if (dip == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get dev_info associated with %s.", + objname); + acpidev_free_object_name(objname); + return (AE_SUPPORT); + } + + /* For safety, only handle supported device types when unconfiguring. */ + switch (dhdl->aod_class_id) { + case ACPIDEV_CLASS_ID_CONTAINER: + /*FALLTHROUGH*/ + case ACPIDEV_CLASS_ID_CPU: + /*FALLTHROUGH*/ + case ACPIDEV_CLASS_ID_MEMORY: + break; + + default: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %s (type %d) doesn't " + "support unconfiguration.", objname, dhdl->aod_class_id); + acpidev_free_object_name(objname); + return (AE_SUPPORT); + } + + /* Destroy descendants first. */ + argp->level++; + rc = acpidev_dr_device_walk_child(hdl, B_FALSE, 1, + acpidev_dr_device_remove_cb, ctx, retval); + argp->level--; + if (ACPI_FAILURE(rc)) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to destroy descendants of %s.", objname); + acpidev_free_object_name(objname); + return (rc); + } + + /* Untag dip and ACPI object before destroying the dip. */ + if ((dhdl->aod_iflag & ACPIDEV_ODF_DEVINFO_TAGGED) && + ACPI_FAILURE(acpica_untag_devinfo(dip, hdl))) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to untag object %s.", objname); + /* Mark the node as unusable. */ + ACPIDEV_DR_SET_FAILED(dhdl); + acpidev_free_object_name(objname); + return (AE_ERROR); + } + + /* Destroy the node itself. */ + if (e_ddi_branch_destroy(dip, NULL, 0) != 0) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((dhdl->aod_iflag & ACPIDEV_ODF_DEVINFO_TAGGED) && + ACPI_FAILURE(acpica_tag_devinfo(dip, hdl))) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to retag object %s.", objname); + } + + /* Mark the node as unusable. */ + ACPIDEV_DR_SET_FAILED(dhdl); + + (void) ddi_pathname(dip, path); + cmn_err(CE_WARN, + "acpidev: failed to remove node %s (%s).", path, objname); + kmem_free(path, MAXPATHLEN); + acpidev_free_object_name(objname); + + return (AE_ERROR); + } + + /* Update status and information associated with the device. */ + dhdl->aod_dip = NULL; + dhdl->aod_iflag &= ~ACPIDEV_ODF_DEVINFO_CREATED; + dhdl->aod_iflag &= ~ACPIDEV_ODF_DEVINFO_TAGGED; + if (dhdl->aod_class != NULL) { + if (dhdl->aod_class->adc_fini != NULL) { + (*(dhdl->aod_class->adc_fini))(hdl, dhdl, + dhdl->aod_class); + } + atomic_dec_32(&(dhdl->aod_class->adc_refcnt)); + dhdl->aod_class = NULL; + } + dhdl->aod_iflag &= ~ACPIDEV_ODF_STATUS_VALID; + dhdl->aod_status = 0; + + acpidev_free_object_name(objname); + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_device_remove(ACPI_HANDLE hdl) +{ + ACPI_STATUS rc = AE_OK; + int circ; + char *objname; + acpidev_data_handle_t dhdl; + struct acpidev_dr_device_remove_arg arg; + + ASSERT(acpidev_root_node() != NULL); + ASSERT(hdl != NULL); + if (hdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: parameter hdl to " + "acpidev_dr_device_remove() is NULL."); + return (AE_BAD_PARAMETER); + } + + objname = acpidev_get_object_name(hdl); + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get data handle associated with %s.", + objname); + acpidev_free_object_name(objname); + return (AE_ERROR); + } + + /* Validate that the device is hotplug capable. 
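+	 * Only boards marked ready by the boot-time scan may be removed;
+	 * boards in the FAILED state are left untouched.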
+	 */
+	if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: object %s is not hotplug capable.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_SUPPORT);
+	} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %s is in the FAILED "
+		    "state, unusable for DR.", objname);
+		acpidev_free_object_name(objname);
+		return (AE_ERROR);
+	}
+
+	/*
+	 * Recursively destroy descendants under the top node.
+	 * No need to undo what has been done if an error occurs; the DR
+	 * driver will handle the cleanup.
+	 */
+	/*
+	 * Lock ddi_root_node() to avoid deadlock.
+	 */
+	ndi_devi_enter(ddi_root_node(), &circ);
+
+	arg.level = 0;
+	rc = acpidev_dr_device_remove_cb(hdl, 0, &arg, NULL);
+	ASSERT(arg.level == 0);
+	if (ACPI_FAILURE(rc)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to destroy device "
+		    "nodes for children of %s.", objname);
+		cmn_err(CE_WARN, "!acpidev: disable DR support for object %s "
+		    "due to failure when destroying device nodes for it.",
+		    objname);
+		ACPIDEV_DR_SET_FAILED(dhdl);
+	}
+
+	ndi_devi_exit(ddi_root_node(), circ);
+	acpidev_free_object_name(objname);
+
+	return (rc);
+}
+
+ACPI_STATUS
+acpidev_dr_device_poweron(ACPI_HANDLE hdl)
+{
+	acpidev_data_handle_t dhdl;
+
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data handle associated with %p.",
+		    hdl);
+		return (AE_ERROR);
+	}
+
+	/* Check whether the device is hotplug capable. */
+	if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: object %p is not hotplug capable.", hdl);
+		return (AE_SUPPORT);
+	} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %p is in the FAILED "
+		    "state, unusable for DR.", hdl);
+		return (AE_ERROR);
+	}
+
+	return (AE_OK);
+}
+
+ACPI_STATUS
+acpidev_dr_device_poweroff(ACPI_HANDLE hdl)
+{
+	ACPI_STATUS rc;
+	acpidev_data_handle_t dhdl;
+
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data handle associated with %p.",
+		    hdl);
+		return (AE_ERROR);
+	}
+
+	/* Check whether the device is hotplug capable. */
+	if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: object %p is not hotplug capable.", hdl);
+		return (AE_SUPPORT);
+	} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %p is in the FAILED "
+		    "state, unusable for DR.", hdl);
+		return (AE_ERROR);
+	}
+
+	rc = acpidev_eval_ej0(hdl);
+	if (ACPI_FAILURE(rc)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to evaluate _EJ0 for object %p.", hdl);
+	}
+
+	return (rc);
+}
+
+ACPI_STATUS
+acpidev_dr_device_check_status(ACPI_HANDLE hdl)
+{
+	acpidev_data_handle_t dhdl;
+
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data handle associated with %p.",
+		    hdl);
+		return (AE_ERROR);
+	}
+
+	/* Check whether the device is hotplug capable.
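+	 * This mirrors the gate used by poweron/poweroff above. As an
+	 * illustrative (not prescriptive) usage, a DR driver would check
+	 * the board state before attempting to power it on:
+	 *
+	 *	if (ACPI_SUCCESS(acpidev_dr_device_check_status(hdl)))
+	 *		(void) acpidev_dr_device_poweron(hdl);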
+	 */
+	if (!ACPIDEV_DR_BOARD_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: object %p is not hotplug capable.", hdl);
+		return (AE_SUPPORT);
+	} else if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: object %p is in the FAILED "
+		    "state, unusable for DR.", hdl);
+		return (AE_ERROR);
+	}
+
+	return (AE_OK);
+}
+
+void
+acpidev_dr_lock_all(void)
+{
+	mutex_enter(&acpidev_dr_lock);
+}
+
+void
+acpidev_dr_unlock_all(void)
+{
+	mutex_exit(&acpidev_dr_lock);
+}
+
+ACPI_STATUS
+acpidev_dr_allocate_cpuid(ACPI_HANDLE hdl, processorid_t *idp)
+{
+	int rv;
+	processorid_t cpuid;
+	uint32_t procid, apicid;
+	mach_cpu_add_arg_t arg;
+	acpidev_data_handle_t dhdl;
+	dev_info_t *dip = NULL;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(hdl != NULL);
+	if (hdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: parameter hdl to "
+		    "acpidev_dr_allocate_cpuid() is NULL.");
+		return (AE_BAD_PARAMETER);
+	}
+
+	/* Validate that the device is ready for hotplug. */
+	if (ACPI_FAILURE(acpica_get_devinfo(hdl, &dip))) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get devinfo for object %p.", hdl);
+		return (AE_ERROR);
+	}
+	ASSERT(dip != NULL);
+	dhdl = acpidev_data_get_handle(hdl);
+	if (dhdl == NULL) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to get data associated with object %p.",
+		    hdl);
+		return (AE_SUPPORT);
+	}
+	if (!ACPIDEV_DR_IS_READY(dhdl)) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: dip %p is not hotplug ready.", (void *)dip);
+		return (AE_SUPPORT);
+	}
+	if (ACPIDEV_DR_IS_FAILED(dhdl)) {
+		ACPIDEV_DEBUG(CE_NOTE,
+		    "!acpidev: dip %p is in the FAILED state.", (void *)dip);
+		return (AE_SUPPORT);
+	}
+
+	/* Query CPU-related information. */
+	apicid = (uint32_t)ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, ACPIDEV_PROP_NAME_LOCALAPIC_ID, UINT32_MAX);
+	procid = (uint32_t)ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, ACPIDEV_PROP_NAME_PROCESSOR_ID, UINT32_MAX);
+	if (procid == UINT32_MAX || apicid == UINT32_MAX || apicid == 255) {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: dip %p is malformed, "
+		    "procid(0x%x) or apicid(0x%x) is invalid.",
+		    (void *)dip, procid, apicid);
+		return (AE_ERROR);
+	}
+
+	/* Check whether the CPU device is in the offline state. */
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	if (!DEVI_IS_DEVICE_OFFLINE(dip)) {
+		mutex_exit(&DEVI(dip)->devi_lock);
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: dip %p isn't in the offline state.",
+		    (void *)dip);
+		return (AE_ERROR);
+	}
+	mutex_exit(&DEVI(dip)->devi_lock);
+
+	/* Check whether the CPU already exists.
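+	 * acpica keeps an ACPI-object-to-cpuid mapping for CPUs that have
+	 * already been added, so a successful lookup means there is
+	 * nothing left to allocate.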
*/ + if (ACPI_SUCCESS(acpica_get_cpu_id_by_object(hdl, &cpuid))) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: dip %p already has CPU id(%d) assigned.", + (void *)dip, cpuid); + return (AE_ALREADY_EXISTS); + } + + /* Allocate cpuid for the CPU */ + arg.arg.apic.apic_id = apicid; + arg.arg.apic.proc_id = procid; + if (apicid >= 255) { + arg.type = MACH_CPU_ARG_LOCAL_X2APIC; + } else { + arg.type = MACH_CPU_ARG_LOCAL_APIC; + } + rv = mach_cpu_add(&arg, &cpuid); + if (rv != PSM_SUCCESS) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to allocate cpu id for dip %p.", + (void *)dip); + return (AE_NOT_EXIST); + } + + ASSERT(cpuid >= 0 && cpuid < NCPU && cpuid < max_ncpus); + if (idp != NULL) { + *idp = cpuid; + } + + return (AE_OK); +} + +ACPI_STATUS +acpidev_dr_free_cpuid(ACPI_HANDLE hdl) +{ + ACPI_STATUS rv = AE_OK; + processorid_t cpuid; + + ASSERT(MUTEX_HELD(&cpu_lock)); + ASSERT(hdl != NULL); + if (hdl == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: parameter hdl to " + "acpidev_dr_free_cpuid() is NULL."); + return (AE_BAD_PARAMETER); + } + + if (ACPI_FAILURE(acpica_get_cpu_id_by_object(hdl, &cpuid))) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get cpuid for object %p.", hdl); + rv = AE_NOT_EXIST; + } else if (cpuid < 0 || cpuid > max_ncpus) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: cpuid(%d) of object %p is invalid.", + cpuid, hdl); + rv = AE_ERROR; + } else if (mach_cpu_remove(cpuid) != PSM_SUCCESS) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to free cpuid(%d) for object %p.", + cpuid, hdl); + rv = AE_ERROR; + } + + return (rv); +} + +static ACPI_STATUS +acpidev_dr_get_latency(ACPI_HANDLE hdl, void **hdlpp, + uint32_t pxmid, uint32_t *slicntp, uchar_t **slipp) +{ + ACPI_STATUS rc; + ACPI_BUFFER buf; + uint32_t i, pxmcnt; + uchar_t *valp, *sp, *ep; + + /* Evaluate the ACPI _SLI method under the object. */ + buf.Length = ACPI_ALLOCATE_BUFFER; + rc = AcpiEvaluateObjectTyped(hdl, ACPIDEV_METHOD_NAME_SLI, NULL, &buf, + ACPI_TYPE_BUFFER); + if (ACPI_SUCCESS(rc)) { + valp = (uchar_t *)buf.Pointer; + if (acpidev_slit_tbl_ptr->LocalityCount > pxmid) { + pxmcnt = acpidev_slit_tbl_ptr->LocalityCount; + } else { + pxmcnt = pxmid + 1; + } + + /* + * Validate data returned by the ACPI _SLI method. + * Please refer to 6.2.14 "_SLI (System Locality Information)" + * in ACPI4.0 for data format returned by _SLI method. + */ + if (buf.Length != pxmcnt * 2 * sizeof (uchar_t)) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: buffer length returned by _SLI method " + "under %p is invalid.", hdl); + AcpiOsFree(buf.Pointer); + } else if (valp[pxmid] != ACPI_SLIT_SELF_LATENCY || + valp[pxmid + pxmcnt] != ACPI_SLIT_SELF_LATENCY) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: local latency returned by _SLI method " + "under %p is not %u.", hdl, ACPI_SLIT_SELF_LATENCY); + AcpiOsFree(buf.Pointer); + } else { + *slicntp = pxmcnt; + *slipp = (uchar_t *)buf.Pointer; + *hdlpp = buf.Pointer; + return (AE_OK); + } + } else if (rc != AE_NOT_FOUND) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to evaluate " + "_SLI method under object %p.", hdl); + } + + /* Return data from the ACPI SLIT table. 
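+	 * The SLIT is a pxmcnt x pxmcnt byte matrix in which
+	 * Entry[pxmcnt * i + j] holds the latency from domain i to
+	 * domain j. The buffer assembled below packs row pxmid (outgoing
+	 * latencies) followed by column pxmid (incoming latencies), the
+	 * same layout the _SLI method returns.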
*/ + ASSERT(acpidev_slit_tbl_ptr != NULL); + pxmcnt = acpidev_slit_tbl_ptr->LocalityCount; + if (pxmid >= pxmcnt) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: proximity domain id " + "(%u) is too big, max %u.", pxmid, pxmcnt - 1); + *slicntp = 0; + *slipp = NULL; + return (AE_ERROR); + } else { + sp = AcpiOsAllocate(pxmcnt * 2 * sizeof (uchar_t)); + ep = acpidev_slit_tbl_ptr->Entry; + for (i = 0; i < pxmcnt; i++) { + sp[i] = ep[pxmcnt * pxmid + i]; + sp[i + pxmcnt] = ep[pxmcnt * i + pxmid]; + } + *slicntp = pxmcnt; + *slipp = sp; + *hdlpp = sp; + return (AE_OK); + } +} + +/* + * Query NUMA information for the CPU device. + * It returns APIC id, Proximity id and latency information of the CPU device. + */ +int +acpidev_dr_get_cpu_numa_info(cpu_t *cp, void **hdlpp, uint32_t *apicidp, + uint32_t *pxmidp, uint32_t *slicntp, uchar_t **slipp) +{ + dev_info_t *dip = NULL; + ACPI_HANDLE hdl = NULL; + + ASSERT(cp != NULL); + ASSERT(hdlpp != NULL); + ASSERT(apicidp != NULL); + ASSERT(pxmidp != NULL); + if (cp == NULL || hdlpp == NULL || apicidp == NULL || pxmidp == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters to " + "acpidev_dr_get_cpu_numa_info()."); + return (-1); + } + + *hdlpp = NULL; + *apicidp = UINT32_MAX; + *pxmidp = UINT32_MAX; + if (lgrp_plat_node_cnt == 1) { + return (-1); + } + ASSERT(acpidev_slit_tbl_ptr != NULL); + + /* Query APIC id and Proximity id from device properties. */ + if (ACPI_FAILURE(acpica_get_cpu_object_by_cpuid(cp->cpu_id, &hdl))) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get ACPI object " + "for CPU(%d).", cp->cpu_id); + return (-1); + } + if (ACPI_FAILURE(acpica_get_devinfo(hdl, &dip))) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get device node " + "for CPU(%d).", cp->cpu_id); + return (-1); + } + *apicidp = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + ACPIDEV_PROP_NAME_LOCALAPIC_ID, UINT32_MAX); + *pxmidp = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + ACPIDEV_PROP_NAME_PROXIMITY_ID, UINT32_MAX); + if (*apicidp == UINT32_MAX || *pxmidp == UINT32_MAX) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get local APIC id " + "or proximity id for CPU(%d).", cp->cpu_id); + return (-1); + } + + ASSERT((slicntp && slipp) || (!slicntp && !slipp)); + if (slicntp != NULL && slipp != NULL) { + if (ACPI_FAILURE(acpidev_dr_get_latency(hdl, hdlpp, *pxmidp, + slicntp, slipp))) { + return (-1); + } + } + + return (0); +} + +void +acpidev_dr_free_cpu_numa_info(void *hdlp) +{ + if (hdlp != NULL) { + AcpiOsFree(hdlp); + } +} + +static ACPI_STATUS +acpidev_dr_mem_search_srat(struct memlist *ml, uint32_t *pxmidp) +{ + int len, off; + uint64_t start, end; + boolean_t found = B_FALSE; + ACPI_SUBTABLE_HEADER *sp; + ACPI_SRAT_MEM_AFFINITY *mp; + + ASSERT(ml != NULL); + ASSERT(pxmidp != NULL); + ASSERT(acpidev_srat_tbl_ptr != NULL); + + /* Search the static ACPI SRAT table for proximity domain. 
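+	 * Walk the variable-length subtables that follow the fixed SRAT
+	 * header and take the first enabled, hot-pluggable memory affinity
+	 * entry whose address range covers the start of the memory list.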
*/ + sp = (ACPI_SUBTABLE_HEADER *)(acpidev_srat_tbl_ptr + 1); + len = acpidev_srat_tbl_ptr->Header.Length; + off = sizeof (*acpidev_srat_tbl_ptr); + while (off < len) { + if (sp->Type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) { + mp = (ACPI_SRAT_MEM_AFFINITY *)sp; + if ((mp->Flags & ACPI_SRAT_MEM_ENABLED) && + (mp->Flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && + ml->ml_address >= mp->BaseAddress && + ml->ml_address <= mp->BaseAddress + mp->Length) { + found = B_TRUE; + break; + } + } + off += sp->Length; + sp = (ACPI_SUBTABLE_HEADER *)(((char *)sp) + sp->Length); + } + if (!found) + return (AE_NOT_FOUND); + + /* + * Verify that all memory regions in the list belong to the same domain. + */ + start = mp->BaseAddress; + end = mp->BaseAddress + mp->Length; + while (ml) { + if (ml->ml_address < start || + ml->ml_address + ml->ml_size > end) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: memory for hot-adding doesn't belong " + "to the same proximity domain."); + return (AE_ERROR); + } + ml = ml->ml_next; + } + + return (AE_OK); +} + +/* + * Query lgrp information for a memory device. + * It returns proximity domain id and latency information of the memory device. + */ +ACPI_STATUS +acpidev_dr_get_mem_numa_info(ACPI_HANDLE hdl, struct memlist *ml, + void **hdlpp, uint32_t *pxmidp, uint32_t *slicntp, uchar_t **slipp) +{ + ASSERT(ml != NULL); + ASSERT(hdlpp != NULL); + ASSERT(pxmidp != NULL); + if (ml == NULL || hdlpp == NULL || pxmidp == NULL) { + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters to " + "acpidev_dr_get_mem_numa_info()."); + return (AE_BAD_PARAMETER); + } + + *pxmidp = UINT32_MAX; + if (lgrp_plat_node_cnt == 1) { + return (AE_SUPPORT); + } + + if (ACPI_FAILURE(acpidev_eval_pxm(hdl, pxmidp))) { + /* + * Try to get proximity domain id from SRAT table if failed to + * evaluate ACPI _PXM method for memory device. + */ + if (ACPI_FAILURE(acpidev_dr_mem_search_srat(ml, pxmidp))) { + ACPIDEV_DEBUG(CE_WARN, + "!acpidev: failed to get proximity domain id for " + "memory device %p.", hdl); + return (AE_ERROR); + } + } + + ASSERT((slicntp && slipp) || (!slicntp && !slipp)); + if (slicntp != NULL && slipp != NULL) { + if (ACPI_FAILURE(acpidev_dr_get_latency(hdl, hdlpp, *pxmidp, + slicntp, slipp))) { + return (AE_ERROR); + } + } + + return (AE_OK); +} + +void +acpidev_dr_free_mem_numa_info(void *hdlp) +{ + if (hdlp != NULL) { + AcpiOsFree(hdlp); + } +} diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_drv.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_drv.c index eee5d84715..2c0dab936b 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_drv.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_drv.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -60,19 +60,23 @@ #include <sys/errno.h> #include <sys/modctl.h> #include <sys/mutex.h> +#include <sys/note.h> #include <sys/obpdefs.h> #include <sys/sunddi.h> #include <sys/sunndi.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> #include <sys/acpidev.h> +#include <sys/acpidev_dr.h> #include <sys/acpidev_impl.h> /* Patchable through /etc/system */ int acpidev_options = 0; int acpidev_debug = 0; +krwlock_t acpidev_class_lock; acpidev_class_list_t *acpidev_class_list_root = NULL; +ulong_t acpidev_object_type_mask[BT_BITOUL(ACPI_TYPE_NS_NODE_MAX + 1)]; /* ACPI device autoconfig global status */ typedef enum acpidev_status { @@ -86,9 +90,7 @@ typedef enum acpidev_status { static acpidev_status_t acpidev_status = ACPIDEV_STATUS_UNKNOWN; static kmutex_t acpidev_drv_lock; -static krwlock_t acpidev_class_lock; static dev_info_t *acpidev_root_dip = NULL; -static ulong_t acpidev_object_type_mask[BT_BITOUL(ACPI_TYPE_NS_NODE_MAX + 1)]; /* Boot time ACPI device enumerator. */ static void acpidev_boot_probe(int type); @@ -117,6 +119,7 @@ _init(void) sizeof (acpidev_object_type_mask)); mutex_init(&acpidev_drv_lock, NULL, MUTEX_DRIVER, NULL); rw_init(&acpidev_class_lock, NULL, RW_DEFAULT, NULL); + acpidev_dr_init(); impl_bus_add_probe(acpidev_boot_probe); } else { cmn_err(CE_WARN, "!acpidev: failed to install driver."); @@ -157,6 +160,13 @@ acpidev_class_list_fini(void) { acpidev_unload_plat_modules(); + if ((acpidev_options & ACPIDEV_OUSER_NO_PCI) == 0) { + (void) acpidev_unregister_class(&acpidev_class_list_scope, + &acpidev_class_pci); + (void) acpidev_unregister_class(&acpidev_class_list_device, + &acpidev_class_pci); + } + if ((acpidev_options & ACPIDEV_OUSER_NO_MEM) == 0) { (void) acpidev_unregister_class(&acpidev_class_list_device, &acpidev_class_memory); @@ -252,7 +262,7 @@ acpidev_class_list_init(uint64_t *fp) *fp |= ACPI_DEVCFG_CPU; } - /* Check support for ACPI memory device. */ + /* Check support of ACPI memory devices. */ if ((acpidev_options & ACPIDEV_OUSER_NO_MEM) == 0) { /* * Register the ACPI memory class driver onto the @@ -267,10 +277,36 @@ acpidev_class_list_init(uint64_t *fp) *fp |= ACPI_DEVCFG_MEMORY; } + /* Check support of PCI/PCIex Host Bridge devices. */ + if ((acpidev_options & ACPIDEV_OUSER_NO_PCI) == 0) { + /* + * Register pci/pciex class drivers onto + * the acpidev_class_list_device class list because ACPI + * module class driver uses that list. + */ + if (ACPI_FAILURE(acpidev_register_class( + &acpidev_class_list_device, &acpidev_class_pci, + B_FALSE))) { + goto error_device_pci; + } + + /* + * Register pci/pciex class drivers onto the + * acpidev_class_list_scope class list. + */ + if (ACPI_FAILURE(acpidev_register_class( + &acpidev_class_list_scope, &acpidev_class_pci, + B_FALSE))) { + goto error_scope_pci; + } + + *fp |= ACPI_DEVCFG_PCI; + } + /* Check blacklist and load platform specific modules. 
 */
 	rc = acpidev_load_plat_modules();
 	if (ACPI_FAILURE(rc)) {
-		ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to check blacklist "
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to check blacklist "
 		    "or load platform modules.");
 		goto error_plat;
 	}
@@ -278,6 +314,16 @@
 	return (AE_OK);
 
 error_plat:
+	if ((acpidev_options & ACPIDEV_OUSER_NO_PCI) == 0) {
+		(void) acpidev_unregister_class(&acpidev_class_list_scope,
+		    &acpidev_class_pci);
+	}
+error_scope_pci:
+	if ((acpidev_options & ACPIDEV_OUSER_NO_PCI) == 0) {
+		(void) acpidev_unregister_class(&acpidev_class_list_device,
+		    &acpidev_class_pci);
+	}
+error_device_pci:
 	if ((acpidev_options & ACPIDEV_OUSER_NO_MEM) == 0) {
 		(void) acpidev_unregister_class(&acpidev_class_list_device,
 		    &acpidev_class_memory);
@@ -313,7 +359,7 @@ error_root_device:
 	    &acpidev_class_scope);
 error_out:
 	ACPIDEV_DEBUG(CE_WARN,
-	    "acpidev: failed to register built-in class drivers.");
+	    "!acpidev: failed to register built-in class drivers.");
 	*fp = 0;
 
 	return (AE_ERROR);
@@ -353,7 +399,7 @@
 	    (pnode_t)DEVI_SID_NODEID, &dip);
 	if (rv != NDI_SUCCESS) {
 		ndi_devi_exit(ddi_root_node(), circ);
-		ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to create device node "
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to create device node "
 		    "for ACPI root with errcode %d.", rv);
 		return (AE_ERROR);
 	}
@@ -362,7 +408,7 @@
 	if (ACPI_FAILURE(acpica_tag_devinfo(dip, ACPI_ROOT_OBJECT))) {
 		(void) ddi_remove_child(dip, 0);
 		ndi_devi_exit(ddi_root_node(), circ);
-		ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to tag object %s.",
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to tag object %s.",
 		    ACPIDEV_OBJECT_NAME_SB);
 		return (AE_ERROR);
 	}
@@ -376,7 +422,7 @@
 	}
 	if (rv != DDI_SUCCESS) {
 		ACPIDEV_DEBUG(CE_WARN,
-		    "acpidev: failed to set device property for /devices/%s.",
+		    "!acpidev: failed to set device property for /devices/%s.",
 		    ACPIDEV_NODE_NAME_ROOT);
 		goto error_out;
 	}
@@ -384,7 +430,7 @@
 	/* Manually create an object handle for the root node */
 	objhdl = acpidev_data_create_handle(ACPI_ROOT_OBJECT);
 	if (objhdl == NULL) {
-		ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to create object "
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to create object "
 		    "handle for the root node.");
 		goto error_out;
 	}
@@ -425,7 +471,7 @@
 		return;
 	} else if (acpidev_status != ACPIDEV_STATUS_UNKNOWN) {
 		ACPIDEV_DEBUG(CE_NOTE,
-		    "acpidev: initialization called more than once.");
+		    "!acpidev: initialization called more than once.");
 		return;
 	}
@@ -476,6 +522,18 @@
 	acpidev_options = ddi_prop_get_int(DDI_DEV_T_ANY, ddi_root_node(),
 	    DDI_PROP_DONTPASS, "acpidev-options", acpidev_options);
 
+	/* Check whether ACPI based DR has been disabled by the user. */
+	rc = ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
+	    DDI_PROP_DONTPASS, "acpidev-dr", &str);
+	if (rc == DDI_SUCCESS) {
+		if (strcasecmp(str, "off") == 0 || strcasecmp(str, "no") == 0) {
+			cmn_err(CE_CONT, "?acpidev: ACPI based DR has been "
+			    "disabled by user.\n");
+			acpidev_dr_enable = 0;
+		}
+		ddi_prop_free(str);
+	}
+
 	/* Register all device class drivers.
*/ if (ACPI_FAILURE(acpidev_class_list_init(&features))) { cmn_err(CE_WARN, @@ -518,7 +576,7 @@ acpidev_boot_probe_device(acpidev_op_type_t op_type) infop = acpidev_alloc_walk_info(op_type, 0, ACPI_ROOT_OBJECT, &acpidev_class_list_root, NULL); if (infop == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to allocate walk info " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to allocate walk info " "object in acpi_boot_probe_device()."); return (AE_ERROR); } @@ -558,9 +616,18 @@ acpidev_boot_probe(int type) if (type == 0 && acpidev_status == ACPIDEV_STATUS_INITIALIZED) { rc = acpidev_boot_probe_device(ACPIDEV_OP_BOOT_PROBE); if (ACPI_SUCCESS(rc)) { + /* + * Support of DR operations will be disabled + * if failed to initialize DR subsystem. + */ + rc = acpidev_dr_initialize(acpidev_root_dip); + if (ACPI_FAILURE(rc) && rc != AE_SUPPORT) { + cmn_err(CE_CONT, "?acpidev: failed to " + "initialize DR subsystem."); + } acpidev_status = ACPIDEV_STATUS_FIRST_PASS; } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to probe ACPI " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to probe ACPI " "devices during boot."); acpidev_status = ACPIDEV_STATUS_FAILED; } @@ -569,7 +636,7 @@ acpidev_boot_probe(int type) if (ACPI_SUCCESS(rc)) { acpidev_status = ACPIDEV_STATUS_READY; } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to reprobe " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to reprobe " "ACPI devices during boot."); acpidev_status = ACPIDEV_STATUS_FAILED; } @@ -598,12 +665,12 @@ acpidev_probe_child(acpidev_walk_info_t *infop) ASSERT(infop != NULL); if (infop == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: infop is NULL in acpidev_probe_child()."); + "!acpidev: infop is NULL in acpidev_probe_child()."); return (AE_BAD_PARAMETER); } ASSERT(infop->awi_level < ACPIDEV_MAX_ENUM_LEVELS - 1); if (infop->awi_level >= ACPIDEV_MAX_ENUM_LEVELS - 1) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: recursive level is too deep " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: recursive level is too deep " "in acpidev_probe_child()."); return (AE_BAD_PARAMETER); } @@ -615,14 +682,14 @@ acpidev_probe_child(acpidev_walk_info_t *infop) if (infop->awi_class_list == NULL || infop->awi_hdl == NULL || infop->awi_info == NULL || infop->awi_name == NULL || infop->awi_data == NULL) { - ACPIDEV_DEBUG(CE_WARN, - "acpidev: infop has NULL fields in acpidev_probe_child()."); + ACPIDEV_DEBUG(CE_WARN, "!acpidev: infop has NULL fields in " + "acpidev_probe_child()."); return (AE_BAD_PARAMETER); } pdip = acpidev_walk_info_get_pdip(infop); if (pdip == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: pdip is NULL in acpidev_probe_child()."); + "!acpidev: pdip is NULL in acpidev_probe_child()."); return (AE_BAD_PARAMETER); } @@ -636,7 +703,7 @@ acpidev_probe_child(acpidev_walk_info_t *infop) } infop->awi_class_curr = it->acl_class; if (ACPI_FAILURE(it->acl_class->adc_pre_probe(infop))) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to pre-probe " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to pre-probe " "device of type %s under %s.", it->acl_class->adc_class_name, infop->awi_name); } @@ -653,11 +720,17 @@ acpidev_probe_child(acpidev_walk_info_t *infop) continue; } + /* It's another hotplug-capable board, skip it. */ + if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE && + acpidev_dr_device_is_board(child)) { + continue; + } + /* Allocate the walk info structure. 
*/ cinfop = acpidev_alloc_walk_info(infop->awi_op_type, infop->awi_level + 1, child, NULL, infop); if (cinfop == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to allocate " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to allocate " "walk info child object of %s.", infop->awi_name); /* Mark error and continue to handle next child. */ @@ -673,12 +746,6 @@ acpidev_probe_child(acpidev_walk_info_t *infop) datap = cinfop->awi_data; if (cinfop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { datap->aod_class_list = infop->awi_class_list; - } else if (datap->aod_class_list != infop->awi_class_list) { - ACPIDEV_DEBUG(CE_WARN, - "acpidev: class list for %s has been changed", - infop->awi_name); - acpidev_free_walk_info(cinfop); - continue; } /* Call registered process callbacks. */ @@ -691,7 +758,7 @@ acpidev_probe_child(acpidev_walk_info_t *infop) res = it->acl_class->adc_probe(cinfop); if (ACPI_FAILURE(res)) { rc = res; - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "process object of type %s under %s.", it->acl_class->adc_class_name, infop->awi_name); @@ -709,7 +776,7 @@ acpidev_probe_child(acpidev_walk_info_t *infop) } infop->awi_class_curr = it->acl_class; if (ACPI_FAILURE(it->acl_class->adc_post_probe(infop))) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to post-probe " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to post-probe " "device of type %s under %s.", it->acl_class->adc_class_name, infop->awi_name); } @@ -737,7 +804,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) ASSERT(infop != NULL); if (infop == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: infop is NULL in acpidev_process_object()."); + "!acpidev: infop is NULL in acpidev_process_object()."); return (AE_BAD_PARAMETER); } ASSERT(infop->awi_hdl != NULL); @@ -751,13 +818,13 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) clsp = infop->awi_class_curr; if (hdl == NULL || datap == NULL || adip == NULL || clsp == NULL || clsp->adc_filter == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: infop has NULL pointer in " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: infop has NULL pointer in " "acpidev_process_object()."); return (AE_BAD_PARAMETER); } pdip = acpidev_walk_info_get_pdip(infop); if (pdip == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to get pdip for %s " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to get pdip for %s " "in acpidev_process_object().", infop->awi_name); return (AE_BAD_PARAMETER); } @@ -782,7 +849,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) (infop->awi_flags & ACPIDEV_WI_DEVICE_CREATED)) { ASSERT(infop->awi_dip != NULL); ACPIDEV_DEBUG(CE_NOTE, - "acpidev: device has already been created for object %s.", + "!acpidev: device has already been created for object %s.", infop->awi_name); return (AE_ALREADY_EXISTS); } @@ -793,9 +860,9 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) * 6.3.1 and 6.5.1. 
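+ * (_STA bits: bit 0 = present, bit 1 = enabled, bit 3 = functioning.)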
* present functioning enabled Action * 0 0 x Do nothing - * 1 x 0 Create node in OFFLINE and scan child + * 1 x 0 Do nothing * 1 x 1 Create node and scan child - * x 1 0 Create node in OFFLINE and scan child + * x 1 0 Do nothing * x 1 1 Create node and scan child */ if ((datap->aod_iflag & ACPIDEV_ODF_STATUS_VALID) == 0 || @@ -807,31 +874,34 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) } datap->aod_iflag |= ACPIDEV_ODF_STATUS_VALID; } - if (!acpidev_check_device_present(datap->aod_status)) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: object %s doesn't exist.", + if (!acpidev_check_device_enabled(datap->aod_status)) { + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: object %s doesn't exist.", infop->awi_name); - return (AE_NOT_EXIST); + /* + * Need to scan for hotplug-capable boards even if object + * doesn't exist or has been disabled during the first pass. + * So just disable creating device node and keep on scanning. + */ + if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { + flags &= ~ACPIDEV_PROCESS_FLAG_CREATE; + } else { + return (AE_NOT_EXIST); + } } ASSERT(infop->awi_data != NULL); ASSERT(infop->awi_parent != NULL); ASSERT(infop->awi_parent->awi_data != NULL); - /* Put device into offline state if parent is in offline state. */ - if (infop->awi_parent->awi_data->aod_iflag & - ACPIDEV_ODF_DEVINFO_OFFLINE) { - flags |= ACPIDEV_PROCESS_FLAG_OFFLINE; - /* Put device into offline state if it's disabled. */ - } else if (!acpidev_check_device_enabled(datap->aod_status)) { - flags |= ACPIDEV_PROCESS_FLAG_OFFLINE; - } - /* - * Mark current node status as OFFLINE even if a device node will not - * be created for it. This is needed to handle the case when the current - * node is SKIPPED (no device node will be created for it), so that all - * descedants of current nodes could be correctly marked as OFFLINE. - */ - if (flags & ACPIDEV_PROCESS_FLAG_OFFLINE) { - infop->awi_data->aod_iflag |= ACPIDEV_ODF_DEVINFO_OFFLINE; + if (flags & ACPIDEV_PROCESS_FLAG_CREATE) { + mutex_enter(&(DEVI(pdip)->devi_lock)); + /* + * Put the device into offline state if its parent is in + * offline state. + */ + if (DEVI_IS_DEVICE_OFFLINE(pdip)) { + flags |= ACPIDEV_PROCESS_FLAG_OFFLINE; + } + mutex_exit(&(DEVI(pdip)->devi_lock)); } /* Evaluate filtering rules and generate device name. */ @@ -862,7 +932,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) OBP_DEVICETYPE, clsp->adc_dev_type); if (ret != NDI_SUCCESS) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to set device property for %s.", + "!acpidev: failed to set device property for %s.", infop->awi_name); (void) ddi_remove_child(dip, 0); infop->awi_dip = NULL; @@ -886,7 +956,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) if (clsp->adc_init != NULL && ACPI_FAILURE(clsp->adc_init(infop))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to initialize device %s.", + "!acpidev: failed to initialize device %s.", infop->awi_name); if ((flags & ACPIDEV_PROCESS_FLAG_NOTAG) == 0) { (void) acpica_untag_devinfo(dip, hdl); @@ -923,6 +993,11 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) } else { (void) ndi_devi_bind_driver(dip, 0); } + + /* Hold reference on branch when hot-adding devices. 
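+		 * The hold keeps the freshly created branch from being
+		 * detached until DR processing of the board has completed.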
*/ + if (flags & ACPIDEV_PROCESS_FLAG_HOLDBRANCH) { + e_ddi_branch_hold(dip); + } } /* Free resources */ @@ -942,7 +1017,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) rc = acpidev_probe_child(infop); if (ACPI_FAILURE(rc)) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to probe subtree of %s.", + "!acpidev: failed to probe subtree of %s.", infop->awi_name); rc = AE_ERROR; } @@ -960,7 +1035,7 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) case ACPIDEV_FILTER_FAILED: ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to probe device for %s.", + "!acpidev: failed to probe device for %s.", infop->awi_name); rc = AE_ERROR; break; @@ -975,11 +1050,12 @@ acpidev_process_object(acpidev_walk_info_t *infop, int flags) return (rc); } -/*ARGSUSED*/ acpidev_filter_result_t acpidev_filter_default(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, acpidev_filter_rule_t *afrp, char *devname, int len) { + _NOTE(ARGUNUSED(hdl)); + ASSERT(afrp != NULL); ASSERT(devname == NULL || len >= ACPIDEV_MAX_NAMELEN); if (infop->awi_level < afrp->adf_minlvl || @@ -992,8 +1068,7 @@ acpidev_filter_default(acpidev_walk_info_t *infop, ACPI_HANDLE hdl, return (ACPIDEV_FILTER_CONTINUE); } if (afrp->adf_replace != NULL && devname != NULL) { - (void) strncpy(devname, afrp->adf_replace, len - 1); - devname[len - 1] = 0; + (void) strlcpy(devname, afrp->adf_replace, len); } return (afrp->adf_retcode); @@ -1042,7 +1117,7 @@ acpidev_register_class(acpidev_class_list_t **listpp, acpidev_class_t *clsp, ASSERT(listpp != NULL); if (listpp == NULL || clsp == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: invalid parameter in acpidev_register_class()."); + "!acpidev: invalid parameter in acpidev_register_class()."); return (AE_BAD_PARAMETER); } else if (clsp->adc_version != ACPIDEV_CLASS_REV) { cmn_err(CE_WARN, @@ -1092,7 +1167,7 @@ acpidev_unregister_class(acpidev_class_list_t **listpp, ASSERT(clsp != NULL); ASSERT(listpp != NULL); if (listpp == NULL || clsp == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid parameter " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter " "in acpidev_unregister_class()."); return (AE_BAD_PARAMETER); } @@ -1106,12 +1181,12 @@ acpidev_unregister_class(acpidev_class_list_t **listpp, } } if (temp == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: class %p(%s) doesn't exist " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: class %p(%s) doesn't exist " "in acpidev_unregister_class().", (void *)clsp, clsp->adc_class_name); rc = AE_NOT_FOUND; } else if (temp->acl_class->adc_refcnt != 0) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: class %p(%s) is still in use " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: class %p(%s) is still in use " "in acpidev_unregister_class()..", (void *)clsp, clsp->adc_class_name); rc = AE_ERROR; diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_memory.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_memory.c index e4dcbcdd67..4185817dd9 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_memory.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_memory.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -31,6 +31,7 @@ #include <sys/acpica.h> #include <sys/acpidev.h> #include <sys/acpidev_rsc.h> +#include <sys/acpidev_dr.h> #include <sys/acpidev_impl.h> static ACPI_STATUS acpidev_memory_probe(acpidev_walk_info_t *infop); @@ -97,7 +98,7 @@ static acpidev_filter_rule_t acpidev_memory_filters[] = { static ACPI_STATUS acpidev_memory_probe(acpidev_walk_info_t *infop) { - ACPI_STATUS rc; + ACPI_STATUS rc = AE_OK; int flags; ASSERT(infop != NULL); @@ -109,19 +110,35 @@ acpidev_memory_probe(acpidev_walk_info_t *infop) return (AE_OK); } - if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u " + flags = ACPIDEV_PROCESS_FLAG_SCAN; + switch (infop->awi_op_type) { + case ACPIDEV_OP_BOOT_PROBE: + if (acpica_get_devcfg_feature(ACPI_DEVCFG_MEMORY)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE; + acpidev_dr_check(infop); + } + break; + + case ACPIDEV_OP_BOOT_REPROBE: + break; + + case ACPIDEV_OP_HOTPLUG_PROBE: + if (acpica_get_devcfg_feature(ACPI_DEVCFG_MEMORY)) { + flags |= ACPIDEV_PROCESS_FLAG_CREATE | + ACPIDEV_PROCESS_FLAG_SYNCSTATUS | + ACPIDEV_PROCESS_FLAG_HOLDBRANCH; + } + break; + + default: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u " "in acpidev_memory_probe.", infop->awi_op_type); rc = AE_BAD_PARAMETER; + break; + } + + if (rc == AE_OK) { + rc = acpidev_process_object(infop, flags); } if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) { cmn_err(CE_WARN, @@ -153,7 +170,6 @@ acpidev_memory_filter(acpidev_walk_info_t *infop, char *devname, int maxlen) return (res); } -/*ARGSUSED*/ static ACPI_STATUS acpidev_memory_init(acpidev_walk_info_t *infop) { diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_pci.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_pci.c new file mode 100644 index 0000000000..5f0c6c6c13 --- /dev/null +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_pci.c @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */
+
+#include <sys/types.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/acpidev.h>
+#include <sys/acpidev_rsc.h>
+#include <sys/acpidev_dr.h>
+#include <sys/acpidev_impl.h>
+
+static ACPI_STATUS acpidev_pci_probe(acpidev_walk_info_t *infop);
+
+/*
+ * Default class driver for PCI/PCIEX Host Bridge devices.
+ */
+acpidev_class_t acpidev_class_pci = {
+	0,				/* adc_refcnt */
+	ACPIDEV_CLASS_REV1,		/* adc_version */
+	ACPIDEV_CLASS_ID_PCI,		/* adc_class_id */
+	"PCI/PCIex Host Bridge",	/* adc_class_name */
+	ACPIDEV_TYPE_PCI,		/* adc_dev_type */
+	NULL,				/* adc_private */
+	NULL,				/* adc_pre_probe */
+	NULL,				/* adc_post_probe */
+	acpidev_pci_probe,		/* adc_probe */
+	NULL,				/* adc_filter */
+	NULL,				/* adc_init */
+	NULL,				/* adc_fini */
+};
+
+static char *acpidev_pci_device_ids[] = {
+	ACPIDEV_HID_PCIEX_HOSTBRIDGE,
+	ACPIDEV_HID_PCI_HOSTBRIDGE,
+};
+
+static char *acpidev_pciex_device_ids[] = {
+	ACPIDEV_HID_PCIEX_HOSTBRIDGE,
+};
+
+static void
+acpidev_pci_update_status(acpidev_walk_info_t *infop)
+{
+	int status;
+	dev_info_t *dip = NULL;
+	acpidev_data_handle_t dhdl;
+
+	dhdl = infop->awi_data;
+	ASSERT((dhdl->aod_iflag & ACPIDEV_ODF_DEVINFO_CREATED) == 0);
+	ASSERT((dhdl->aod_iflag & ACPIDEV_ODF_DEVINFO_TAGGED) == 0);
+	if ((dhdl->aod_iflag & ACPIDEV_ODF_STATUS_VALID) == 0) {
+		status = acpidev_query_device_status(infop->awi_hdl);
+		dhdl->aod_status = status;
+		dhdl->aod_iflag |= ACPIDEV_ODF_STATUS_VALID;
+	} else {
+		status = dhdl->aod_status;
+	}
+	dhdl->aod_level = infop->awi_level;
+	dhdl->aod_hdl = infop->awi_hdl;
+	dhdl->aod_class = NULL;
+	dhdl->aod_class_list = NULL;
+	if (acpidev_match_device_id(infop->awi_info,
+	    ACPIDEV_ARRAY_PARAM(acpidev_pciex_device_ids))) {
+		dhdl->aod_class_id = ACPIDEV_CLASS_ID_PCIEX;
+	} else {
+		dhdl->aod_class_id = ACPIDEV_CLASS_ID_PCI;
+	}
+
+	if (ACPI_FAILURE(acpica_get_devinfo(infop->awi_hdl, &dip))) {
+		dip = NULL;
+	} else {
+		ASSERT(dip != NULL);
+	}
+	if (acpidev_check_device_enabled(status)) {
+		/*
+		 * Mark the device as DISABLED if no device node was created.
+		 * BIOS may hide some special PCI/PCIex buses from the OS.
+		 */
+		if (dip == NULL) {
+			dhdl->aod_dip = NULL;
+			dhdl->aod_status &= ~ACPI_STA_DEVICE_ENABLED;
+		} else {
+			dhdl->aod_dip = dip;
+			dhdl->aod_iflag |= ACPIDEV_ODF_DEVINFO_CREATED;
+		}
+	} else {
+		ASSERT(dip == NULL);
+		dhdl->aod_dip = NULL;
+		dhdl->aod_status &= ~ACPI_STA_DEVICE_ENABLED;
+	}
+}
+
+static ACPI_STATUS
+acpidev_pci_probe(acpidev_walk_info_t *infop)
+{
+	ACPI_STATUS rc = AE_OK;
+
+	ASSERT(infop != NULL);
+	ASSERT(infop->awi_hdl != NULL);
+	ASSERT(infop->awi_info != NULL);
+	if (infop->awi_info->Type != ACPI_TYPE_DEVICE ||
+	    acpidev_match_device_id(infop->awi_info,
+	    ACPIDEV_ARRAY_PARAM(acpidev_pci_device_ids)) == B_FALSE) {
+		return (AE_OK);
+	}
+
+	if (acpica_get_devcfg_feature(ACPI_DEVCFG_PCI) == 0) {
+		return (AE_OK);
+	}
+
+	if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) {
+		/*
+		 * Check hotplug capability on the first pass.
+		 */
+		acpidev_dr_check(infop);
+	} else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) {
+		/*
+		 * Check whether the PCI device enumerator has created device
+		 * nodes for PCI/PCIEX host bridges.
+		 */
+		acpidev_pci_update_status(infop);
+	} else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) {
+		/*
+		 * No support of PCI/PCIEX host bridge hotplug yet.
+		 * It will come in the next phase.
+		 */
+		cmn_err(CE_WARN,
+		    "!acpidev: no support of PCI/PCIEX host bridge hotplug.");
+		/*
+		 * Don't block the hot-adding process, just skip it.
+		 */
+		rc = AE_OK;
+	} else {
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u "
+		    "in acpidev_pci_probe().", infop->awi_op_type);
+		rc = AE_BAD_PARAMETER;
+	}
+	if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) {
+		cmn_err(CE_CONT, "?acpidev: failed to process PCI/PCIEX host "
+		    "bridge object %s.\n", infop->awi_name);
+	} else {
+		rc = AE_OK;
+	}
+
+	return (rc);
+}
diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_resource.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_resource.c
index 17dd9c774b..e9664c7614 100644
--- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_resource.c
+++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_resource.c
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2009-2010, Intel Corporation.
  * All rights reserved.
 */
@@ -331,7 +331,7 @@ acpidev_resource_insert_dma(acpidev_resource_handle_t rhdl, int dma)
 	ASSERT(rhdl != NULL);
 	if (rhdl->acpidev_dma_count >= ACPIDEV_RES_DMA_MAX) {
 		ACPIDEV_DEBUG(CE_WARN,
-		    "acpidev: too many DMA resources, max %u.",
+		    "!acpidev: too many DMA resources, max %u.",
 		    ACPIDEV_RES_DMA_MAX);
 		return (AE_LIMIT);
 	}
@@ -380,7 +380,7 @@ acpidev_resource_insert_irq(acpidev_resource_handle_t rhdl, int irq)
 	ASSERT(rhdl != NULL);
 	if (rhdl->acpidev_irq_count >= ACPIDEV_RES_IRQ_MAX) {
 		ACPIDEV_DEBUG(CE_WARN,
-		    "acpidev: too many IRQ resources, max %u.",
+		    "!acpidev: too many IRQ resources, max %u.",
 		    ACPIDEV_RES_IRQ_MAX);
 		return (AE_LIMIT);
 	}
@@ -456,7 +456,7 @@ acpidev_resource_address64(acpidev_resource_handle_t rhdl,
 				high |= ACPIDEV_REG_MEM_COHERENT_PF;
 			} else {
 				ACPIDEV_DEBUG(CE_WARN,
-				    "acpidev: unknown memory caching type %u.",
+				    "!acpidev: unknown memory caching type %u.",
 				    addrp->Info.Mem.Caching);
 				rc = AE_ERROR;
 				break;
 			}
@@ -477,7 +477,7 @@
 			reg.size_low = addrp->AddressLength & 0xFFFFFFFF;
 			rc = acpidev_resource_insert_reg(rhdl, &reg);
 			if (ACPI_FAILURE(rc)) {
-				ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to "
+				ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to "
 				    "insert regspec into resource handle.");
 			}
 		/* Generate 'ranges' for producer. */
@@ -502,7 +502,7 @@
 			range.size_low = addrp->AddressLength & 0xFFFFFFFF;
 			rc = acpidev_resource_insert_range(rhdl, &range);
 			if (ACPI_FAILURE(rc)) {
-				ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to "
+				ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to "
 				    "insert range into resource handle.");
 			}
 		}
@@ -524,7 +524,7 @@
 				high |= ACPIDEV_REG_IO_RANGE_FULL;
 			} else {
 				ACPIDEV_DEBUG(CE_WARN,
-				    "acpidev: unknown IO range type %u.",
+				    "!acpidev: unknown IO range type %u.",
 				    addrp->Info.Io.RangeType);
 				rc = AE_ERROR;
 				break;
 			}
@@ -545,7 +545,7 @@
 			reg.size_low = addrp->AddressLength & 0xFFFFFFFF;
 			rc = acpidev_resource_insert_reg(rhdl, &reg);
 			if (ACPI_FAILURE(rc)) {
-				ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to "
+				ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to "
 				    "insert regspec into resource handle.");
 			}
 			/* Generate 'ranges' for producer.
*/ @@ -570,7 +570,7 @@ acpidev_resource_address64(acpidev_resource_handle_t rhdl, range.size_low = addrp->AddressLength & 0xFFFFFFFF; rc = acpidev_resource_insert_range(rhdl, &range); if (ACPI_FAILURE(rc)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "insert range into resource handle."); } } @@ -585,7 +585,7 @@ acpidev_resource_address64(acpidev_resource_handle_t rhdl, end = addrp->Minimum + addrp->AddressLength; if (end < addrp->Minimum || end > UINT_MAX) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: bus range " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: bus range " "in ADDRESS64 is invalid."); rc = AE_ERROR; break; @@ -595,7 +595,7 @@ acpidev_resource_address64(acpidev_resource_handle_t rhdl, ASSERT(bus.bus_start <= bus.bus_end); rc = acpidev_resource_insert_bus(rhdl, &bus); if (ACPI_FAILURE(rc)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "insert bus range into resource handle."); } } @@ -603,7 +603,7 @@ acpidev_resource_address64(acpidev_resource_handle_t rhdl, default: ACPIDEV_DEBUG(CE_WARN, - "acpidev: unknown resource type %u in ADDRESS64.", + "!acpidev: unknown resource type %u in ADDRESS64.", addrp->ResourceType); rc = AE_BAD_PARAMETER; } @@ -632,7 +632,7 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) case ACPI_RESOURCE_TYPE_GENERIC_REGISTER: case ACPI_RESOURCE_TYPE_VENDOR: ACPIDEV_DEBUG(CE_NOTE, - "acpidev: unsupported producer resource type %u, ignored.", + "!acpidev: unsupported producer resource type %u, ignored.", rscp->Type); break; @@ -651,13 +651,13 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) range.size_hi = 0; range.size_low = rscp->Data.Io.AddressLength; if ((uint64_t)range.child_low + range.size_low > UINT16_MAX) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid IO record, " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid IO record, " "IO max is out of range."); rc = AE_ERROR; } else if (range.size_low != 0) { rc = acpidev_resource_insert_range(rhdl, &range); if (ACPI_FAILURE(rc)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "insert range into resource handle."); } } @@ -671,16 +671,16 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) ACPI_RESOURCE_ADDRESS64 addr64; if (rscp->Data.Address.ProducerConsumer != ACPI_PRODUCER) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: producer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: producer encountered " "a CONSUMER resource, ignored."); } else if (ACPI_FAILURE(AcpiResourceToAddress64(rscp, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to convert resource to ADDR64."); + "!acpidev: failed to convert resource to ADDR64."); } else if (ACPI_FAILURE(rc = acpidev_resource_address64(rhdl, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to handle ADDRESS resource."); + "!acpidev: failed to handle ADDRESS resource."); } break; } @@ -690,7 +690,7 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) ACPI_RESOURCE_ADDRESS64 addr64; if (rscp->Data.ExtAddress64.ProducerConsumer != ACPI_PRODUCER) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: producer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: producer encountered " "a CONSUMER resource, ignored."); break; } @@ -705,14 +705,14 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) if (ACPI_FAILURE(rc = acpidev_resource_address64(rhdl, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to handle EXTADDRESS resource."); + "!acpidev: failed to handle EXTADDRESS 
resource."); } break; } case ACPI_RESOURCE_TYPE_START_DEPENDENT: case ACPI_RESOURCE_TYPE_END_DEPENDENT: - ACPIDEV_DEBUG(CE_NOTE, "acpidev: producer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: producer encountered " "START_DEPENDENT or END_DEPENDENT tag, ignored."); break; @@ -723,7 +723,7 @@ acpidev_resource_walk_producer(ACPI_RESOURCE *rscp, void *ctxp) default: ACPIDEV_DEBUG(CE_NOTE, - "acpidev: unknown ACPI resource type %u, ignored.", + "!acpidev: unknown ACPI resource type %u, ignored.", rscp->Type); break; } @@ -746,7 +746,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) case ACPI_RESOURCE_TYPE_GENERIC_REGISTER: case ACPI_RESOURCE_TYPE_VENDOR: ACPIDEV_DEBUG(CE_NOTE, - "acpidev: unsupported consumer resource type %u, ignored.", + "!acpidev: unsupported consumer resource type %u, ignored.", rscp->Type); break; @@ -755,7 +755,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) int i; if (rscp->Data.ExtendedIrq.ProducerConsumer != ACPI_CONSUMER) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: consumer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: consumer encountered " "a PRODUCER resource, ignored."); break; } @@ -764,7 +764,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) rscp->Data.ExtendedIrq.Interrupts[i]))) { continue; } - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to insert" + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to insert" "Extended IRQ into resource handle."); rc = AE_ERROR; break; @@ -781,7 +781,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) rscp->Data.Irq.Interrupts[i]))) { continue; } - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to insert" + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to insert" "IRQ into resource handle."); rc = AE_ERROR; break; @@ -798,7 +798,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) rscp->Data.Dma.Channels[i]))) { continue; } - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to insert" + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to insert" "dma into resource handle."); rc = AE_ERROR; break; @@ -827,13 +827,13 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) reg.phys_mid = 0; reg.size_hi = 0; if ((uint64_t)reg.phys_low + reg.size_low > UINT16_MAX) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid IO/FIXEDIO " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid IO/FIXEDIO " "record, IO max is out of range."); rc = AE_ERROR; } else if (reg.size_low != 0) { rc = acpidev_resource_insert_reg(rhdl, ®); if (ACPI_FAILURE(rc)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "insert reg into resource handle."); } } @@ -866,13 +866,13 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) reg.size_hi = 0; if ((uint64_t)reg.phys_low + reg.size_low > UINT32_MAX) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: invalid MEMORY32/FIXEDMEMORY32 record, " + "!acpidev: invalid MEMORY32/FIXEDMEMORY32 record, " "memory max is out of range."); rc = AE_ERROR; } else if (reg.size_low != 0) { rc = acpidev_resource_insert_reg(rhdl, ®); if (ACPI_FAILURE(rc)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to " "insert reg into resource handle."); } } @@ -886,16 +886,16 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) ACPI_RESOURCE_ADDRESS64 addr64; if (rscp->Data.Address.ProducerConsumer != ACPI_CONSUMER) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: consumer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: consumer encountered " "a PRODUCER resource, ignored."); } else if 
(ACPI_FAILURE(AcpiResourceToAddress64(rscp, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to convert resource to ADDR64."); + "!acpidev: failed to convert resource to ADDR64."); } else if (ACPI_FAILURE(rc = acpidev_resource_address64(rhdl, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to handle ADDRESS resource."); + "!acpidev: failed to handle ADDRESS resource."); } break; } @@ -905,7 +905,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) ACPI_RESOURCE_ADDRESS64 addr64; if (rscp->Data.ExtAddress64.ProducerConsumer != ACPI_CONSUMER) { - ACPIDEV_DEBUG(CE_NOTE, "acpidev: consumer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: consumer encountered " "a PRODUCER resource, ignored."); break; } @@ -920,14 +920,14 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) if (ACPI_FAILURE(rc = acpidev_resource_address64(rhdl, &addr64))) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to handle EXTADDRESS resource."); + "!acpidev: failed to handle EXTADDRESS resource."); } break; } case ACPI_RESOURCE_TYPE_START_DEPENDENT: case ACPI_RESOURCE_TYPE_END_DEPENDENT: - ACPIDEV_DEBUG(CE_NOTE, "acpidev: consumer encountered " + ACPIDEV_DEBUG(CE_NOTE, "!acpidev: consumer encountered " "START_DEPENDENT or END_DEPENDENT tag, ignored."); break; @@ -938,7 +938,7 @@ acpidev_resource_walk_consumer(ACPI_RESOURCE *rscp, void *ctxp) default: ACPIDEV_DEBUG(CE_NOTE, - "acpidev: unknown ACPI resource type %u, ignored.", + "!acpidev: unknown ACPI resource type %u, ignored.", rscp->Type); break; } @@ -959,14 +959,14 @@ acpidev_resource_walk(ACPI_HANDLE hdl, char *method, ASSERT(rhdlp != NULL); if (hdl == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: hdl is NULL in acpidev_resource_walk()."); + "!acpidev: hdl is NULL in acpidev_resource_walk()."); return (AE_BAD_PARAMETER); } else if (method == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: method is NULL in acpidev_resource_walk()."); + "!acpidev: method is NULL in acpidev_resource_walk()."); return (AE_BAD_PARAMETER); } else if (rhdlp == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: resource handle ptr is NULL " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: resource handle ptr is NULL " "in acpidev_resource_walk()."); return (AE_BAD_PARAMETER); } @@ -975,7 +975,7 @@ acpidev_resource_walk(ACPI_HANDLE hdl, char *method, if (ACPI_FAILURE(AcpiGetHandle(hdl, method, &mhdl))) { char *objname = acpidev_get_object_name(hdl); ACPIDEV_DEBUG(CE_NOTE, - "acpidev: method %s doesn't exist under %s", + "!acpidev: method %s doesn't exist under %s", method, objname); acpidev_free_object_name(objname); return (AE_NOT_FOUND); @@ -997,11 +997,9 @@ acpidev_resource_walk(ACPI_HANDLE hdl, char *method, } if (ACPI_FAILURE(rc)) { char *objname = acpidev_get_object_name(hdl); - ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to walk resource from method %s under %s.", - method, objname); + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to walk resource from " + "method %s under %s.", method, objname); acpidev_free_object_name(objname); - } return (rc); @@ -1016,7 +1014,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) ASSERT(infop != NULL); if (infop == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid parameter " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameter " "in acpidev_resource_process()."); return (AE_BAD_PARAMETER); } @@ -1027,7 +1025,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) consumer, &rhdl); if (ACPI_FAILURE(rc)) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: failed to walk ACPI resources of %s(%s).", + "!acpidev: 
failed to walk ACPI resources of %s(%s).", path, infop->awi_name); return (rc); } @@ -1041,7 +1039,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) "reg", (int *)rhdl->acpidev_regp, rhdl->acpidev_reg_count * sizeof (acpidev_regspec_t) / sizeof (int)) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'reg' property for %s.", path); rc = AE_ERROR; goto out; @@ -1051,7 +1049,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) "assigned-addresses", (int *)rhdl->acpidev_regp, rhdl->acpidev_reg_count * sizeof (acpidev_regspec_t) / sizeof (int)) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'assigned-addresses' property for %s.", path); rc = AE_ERROR; goto out; @@ -1062,7 +1060,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) ndi_prop_update_int_array(DDI_DEV_T_NONE, infop->awi_dip, "interrupts", (int *)rhdl->acpidev_irqp, rhdl->acpidev_irq_count) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'interrupts' property for %s.", path); rc = AE_ERROR; goto out; @@ -1073,7 +1071,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) ndi_prop_update_int_array(DDI_DEV_T_NONE, infop->awi_dip, "dma-channels", (int *)rhdl->acpidev_dmap, rhdl->acpidev_dma_count) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'dma-channels' property for %s.", path); rc = AE_ERROR; goto out; @@ -1088,7 +1086,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) "ranges", (int *)rhdl->acpidev_rangep, rhdl->acpidev_range_count * sizeof (acpidev_ranges_t) / sizeof (int)) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'ranges' property for %s.", path); rc = AE_ERROR; goto out; @@ -1100,7 +1098,7 @@ acpidev_resource_process(acpidev_walk_info_t *infop, boolean_t consumer) "bus-range", (int *)rhdl->acpidev_busp, rhdl->acpidev_bus_count * sizeof (acpidev_bus_range_t) / sizeof (int)) != NDI_SUCCESS) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to set " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to set " "'bus-range' property for %s.", path); rc = AE_ERROR; goto out; diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_scope.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_scope.c index 144cec2dbc..e19a6e92e4 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_scope.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_scope.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -102,7 +102,7 @@ static acpidev_filter_rule_t acpidev_scope_filters[] = { static ACPI_STATUS acpidev_scope_probe(acpidev_walk_info_t *infop) { - ACPI_STATUS rc; + ACPI_STATUS rc = AE_OK; int flags; ASSERT(infop != NULL); @@ -112,19 +112,29 @@ acpidev_scope_probe(acpidev_walk_info_t *infop) return (AE_OK); } - if (infop->awi_op_type == ACPIDEV_OP_BOOT_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN | ACPIDEV_PROCESS_FLAG_CREATE; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_BOOT_REPROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else if (infop->awi_op_type == ACPIDEV_OP_HOTPLUG_PROBE) { - flags = ACPIDEV_PROCESS_FLAG_SCAN; - rc = acpidev_process_object(infop, flags); - } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u " + flags = ACPIDEV_PROCESS_FLAG_SCAN; + switch (infop->awi_op_type) { + case ACPIDEV_OP_BOOT_PROBE: + flags |= ACPIDEV_PROCESS_FLAG_CREATE; + break; + + case ACPIDEV_OP_BOOT_REPROBE: + break; + + case ACPIDEV_OP_HOTPLUG_PROBE: + flags |= ACPIDEV_PROCESS_FLAG_SYNCSTATUS | + ACPIDEV_PROCESS_FLAG_HOLDBRANCH; + break; + + default: + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u " "in acpidev_scope_probe().", infop->awi_op_type); rc = AE_BAD_PARAMETER; + break; + } + + if (rc == AE_OK) { + rc = acpidev_process_object(infop, flags); } if (ACPI_FAILURE(rc) && rc != AE_NOT_EXIST && rc != AE_ALREADY_EXISTS) { cmn_err(CE_WARN, @@ -150,7 +160,7 @@ acpidev_scope_filter(acpidev_walk_info_t *infop, char *devname, int maxlen) ACPIDEV_ARRAY_PARAM(acpidev_scope_filters), devname, maxlen); } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: unknown operation type %u " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: unknown operation type %u " "in acpidev_scope_filter().", infop->awi_op_type); res = ACPIDEV_FILTER_FAILED; } @@ -158,7 +168,6 @@ acpidev_scope_filter(acpidev_walk_info_t *infop, char *devname, int maxlen) return (res); } -/*ARGSUSED*/ static ACPI_STATUS acpidev_scope_init(acpidev_walk_info_t *infop) { diff --git a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_util.c b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_util.c index 869a935f0d..1b1a0cf3ad 100644 --- a/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_util.c +++ b/usr/src/uts/i86pc/io/acpi/acpidev/acpidev_util.c @@ -24,12 +24,13 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ #include <sys/types.h> #include <sys/cmn_err.h> +#include <sys/note.h> #include <sys/sysmacros.h> #include <sys/sunddi.h> #include <sys/sunndi.h> @@ -55,7 +56,7 @@ acpidev_query_device_status(ACPI_HANDLE hdl) ASSERT(hdl != NULL); if (hdl == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: hdl is NULL in acpidev_query_device_status()."); + "!acpidev: hdl is NULL in acpidev_query_device_status()."); return (0); } @@ -114,7 +115,7 @@ acpidev_match_device_id(ACPI_DEVICE_INFO *infop, char **ids, int count) if (count == 0) { return (B_TRUE); } else if (infop == NULL || ids == NULL) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid parameters in " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters in " "acpidev_match_device_id()."); return (B_FALSE); } @@ -206,7 +207,7 @@ acpidev_get_device_by_id(ACPI_HANDLE hdl, char **ids, int count, ASSERT(userfunc != NULL); if (hdl == NULL || userfunc == NULL || (ids == NULL && count != 0)) { - ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid parameters " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters " "in acpidev_get_device_by_id()."); return (AE_BAD_PARAMETER); } @@ -237,7 +238,7 @@ acpidev_walk_apic(ACPI_BUFFER *bufp, ACPI_HANDLE hdl, char *method, ASSERT(func != NULL); if (func == NULL) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: invalid parameters for acpidev_walk_apic()."); + "!acpidev: invalid parameters for acpidev_walk_apic()."); return (AE_BAD_PARAMETER); } @@ -361,7 +362,7 @@ acpidev_alloc_walk_info(acpidev_op_type_t op_type, int lvl, ACPI_HANDLE hdl, datap->aod_level = lvl; datap->aod_hdl = hdl; } else { - ACPIDEV_DEBUG(CE_WARN, "acpidev: failed to create object " + ACPIDEV_DEBUG(CE_WARN, "!acpidev: failed to create object " "handle for %s in acpidev_alloc_walk_info().", infop->awi_name); AcpiOsFree(infop->awi_info); @@ -416,12 +417,17 @@ acpidev_walk_info_get_pdip(acpidev_walk_info_t *infop) * Called to release resources when the corresponding object is going * to be destroyed. 
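+ * The handler is attached with AcpiAttachData() when the handle data is
+ * created below, so ACPICA only invokes it if the underlying ACPI object
+ * itself is deleted; the normal teardown path is
+ * acpidev_data_destroy_handle(), which detaches the data first and then
+ * frees it directly.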
*/ -/*ARGSUSED*/ static void -acpidev_get_object_handler(ACPI_HANDLE hdl, void *data) +acpidev_free_object_handler(ACPI_HANDLE hdl, void *data) { + _NOTE(ARGUNUSED(hdl)); + acpidev_data_handle_t objhdl = data; + if (objhdl->aod_class != NULL) { + atomic_dec_32(&objhdl->aod_class->adc_refcnt); + objhdl->aod_class = NULL; + } kmem_free(objhdl, sizeof (acpidev_data_handle_t)); } @@ -431,7 +437,7 @@ acpidev_data_get_handle(ACPI_HANDLE hdl) void *ptr; acpidev_data_handle_t objhdl = NULL; - if (ACPI_SUCCESS(AcpiGetData(hdl, acpidev_get_object_handler, &ptr))) { + if (ACPI_SUCCESS(AcpiGetData(hdl, acpidev_free_object_handler, &ptr))) { objhdl = (acpidev_data_handle_t)ptr; } @@ -444,7 +450,12 @@ acpidev_data_create_handle(ACPI_HANDLE hdl) acpidev_data_handle_t objhdl; objhdl = kmem_zalloc(sizeof (*objhdl), KM_SLEEP); - if (ACPI_FAILURE(AcpiAttachData(hdl, acpidev_get_object_handler, + objhdl->aod_bdtype = ACPIDEV_INVALID_BOARD; + objhdl->aod_bdnum = UINT32_MAX; + objhdl->aod_portid = UINT32_MAX; + objhdl->aod_class_id = ACPIDEV_CLASS_ID_INVALID; + + if (ACPI_FAILURE(AcpiAttachData(hdl, acpidev_free_object_handler, (void *)objhdl))) { cmn_err(CE_WARN, "!acpidev: failed to attach handle data to object."); @@ -459,9 +470,15 @@ void acpidev_data_destroy_handle(ACPI_HANDLE hdl) { void *ptr; + acpidev_data_handle_t objhdl = NULL; - if (ACPI_SUCCESS(AcpiGetData(hdl, acpidev_get_object_handler, &ptr)) && - ACPI_SUCCESS(AcpiDetachData(hdl, acpidev_get_object_handler))) { + if (ACPI_SUCCESS(AcpiGetData(hdl, acpidev_free_object_handler, &ptr)) && + ACPI_SUCCESS(AcpiDetachData(hdl, acpidev_free_object_handler))) { + objhdl = ptr; + if (objhdl->aod_class != NULL) { + atomic_dec_32(&objhdl->aod_class->adc_refcnt); + objhdl->aod_class = NULL; + } kmem_free(ptr, sizeof (acpidev_data_handle_t)); } } @@ -502,14 +519,14 @@ void acpidev_data_set_flag(acpidev_data_handle_t hdl, uint32_t flag) { ASSERT(hdl != NULL); - hdl->aod_eflag |= flag; + atomic_or_32(&hdl->aod_eflag, flag); } void acpidev_data_clear_flag(acpidev_data_handle_t hdl, uint32_t flag) { ASSERT(hdl != NULL); - hdl->aod_eflag &= ~flag; + atomic_and_32(&hdl->aod_eflag, ~flag); } uint32_t @@ -519,6 +536,27 @@ acpidev_data_get_flag(acpidev_data_handle_t hdl, uint32_t flag) return (hdl->aod_eflag & flag); } +boolean_t +acpidev_data_dr_capable(acpidev_data_handle_t hdl) +{ + ASSERT(hdl != NULL); + return (hdl->aod_iflag & ACPIDEV_ODF_HOTPLUG_CAPABLE); +} + +boolean_t +acpidev_data_dr_ready(acpidev_data_handle_t hdl) +{ + ASSERT(hdl != NULL); + return (hdl->aod_iflag & ACPIDEV_ODF_HOTPLUG_READY); +} + +boolean_t +acpidev_data_dr_failed(acpidev_data_handle_t hdl) +{ + ASSERT(hdl != NULL); + return (hdl->aod_iflag & ACPIDEV_ODF_HOTPLUG_FAILED); +} + static char * acpidev_generate_pseudo_unitaddr(char *uid, acpidev_class_id_t cid, char *buf, size_t len) @@ -591,7 +629,7 @@ acpidev_gen_unitaddr(char *uid, char *fmt, char *buf, size_t len) } if (cnt != 1 && cnt != 2) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: invalid uid format string '%s'.", fmt); + "!acpidev: invalid uid format string '%s'.", fmt); return (NULL); } @@ -605,11 +643,11 @@ acpidev_gen_unitaddr(char *uid, char *fmt, char *buf, size_t len) */ if (cnt == 2 && snprintf(buf, len, "%u,%u", id2, id1) >= len) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: generated unitaddr is too long."); + "!acpidev: generated unitaddr is too long."); return (NULL); } else if (cnt == 1 && snprintf(buf, len, "%u", id1) >= len) { ACPIDEV_DEBUG(CE_WARN, - "acpidev: generated unitaddr is too long."); + "!acpidev: generated unitaddr is too 
long.");
 		return (NULL);
 	}
 
@@ -681,7 +719,7 @@ acpidev_set_unitaddr(acpidev_walk_info_t *infop, char **fmts, size_t nfmt,
 	if (infop == NULL || infop->awi_dip == NULL ||
 	    infop->awi_info == NULL) {
 		ACPIDEV_DEBUG(CE_WARN,
-		    "acpidev: invalid parameters in acpidev_set_unitaddr().");
+		    "!acpidev: invalid parameters in acpidev_set_unitaddr().");
 		return (AE_BAD_PARAMETER);
 	}
 
@@ -758,7 +796,7 @@ acpidev_set_compatible(acpidev_walk_info_t *infop, char **compat, int acount)
 	ASSERT(compat != NULL || acount == 0);
 	if (infop == NULL || infop->awi_dip == NULL || infop->awi_info == NULL ||
 	    (compat == NULL && acount != 0)) {
-		ACPIDEV_DEBUG(CE_WARN, "acpidev: invalid parameters "
+		ACPIDEV_DEBUG(CE_WARN, "!acpidev: invalid parameters "
 		    "in acpidev_set_compatible().");
 		return (AE_BAD_PARAMETER);
 	}
 
@@ -802,3 +840,90 @@ acpidev_set_compatible(acpidev_walk_info_t *infop, char **compat, int acount)
 
 	return (AE_OK);
 }
+
+/* Evaluate _OST method under object, which is used to support hotplug events. */
+ACPI_STATUS
+acpidev_eval_ost(ACPI_HANDLE hdl, uint32_t code, uint32_t status,
+    char *bufp, size_t len)
+{
+	ACPI_STATUS rc;
+	ACPI_OBJECT args[3];
+	ACPI_OBJECT_LIST arglist;
+
+	args[0].Type = ACPI_TYPE_INTEGER;
+	args[0].Integer.Value = code;
+	args[1].Type = ACPI_TYPE_INTEGER;
+	args[1].Integer.Value = status;
+	args[2].Type = ACPI_TYPE_BUFFER;
+	args[2].Buffer.Pointer = (UINT8 *)bufp;
+	args[2].Buffer.Length = (UINT32)len;
+	if (bufp == NULL || len == 0) {
+		arglist.Count = 2;
+	} else {
+		arglist.Count = 3;
+	}
+	arglist.Pointer = args;
+	rc = AcpiEvaluateObject(hdl, ACPIDEV_METHOD_NAME_OST, &arglist, NULL);
+	if (rc != AE_OK && rc != AE_NOT_FOUND) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to evaluate _OST method, code 0x%x.", rc);
+	}
+
+	return (rc);
+}
+
+ACPI_STATUS
+acpidev_eval_ej0(ACPI_HANDLE hdl)
+{
+	ACPI_STATUS rc;
+	ACPI_OBJECT args[1];
+	ACPI_OBJECT_LIST arglist;
+
+	/*
+	 * Quotation from ACPI spec 4.0 section 6.3.3.
+	 * Arg0 An Integer containing a device ejection control
+	 * 0 Cancel a mark for ejection request (EJ0 will never be called
+	 *   with this value)
+	 * 1 Hot eject or mark for ejection
+	 */
+	args[0].Type = ACPI_TYPE_INTEGER;
+	args[0].Integer.Value = 1;
+	arglist.Count = 1;
+	arglist.Pointer = args;
+	rc = AcpiEvaluateObject(hdl, ACPIDEV_METHOD_NAME_EJ0, &arglist, NULL);
+	if (rc != AE_OK) {
+		ACPIDEV_DEBUG(CE_WARN,
+		    "!acpidev: failed to evaluate _EJ0 method, code 0x%x.", rc);
+	}
+
+	return (rc);
+}
+
+ACPI_STATUS
+acpidev_eval_pxm(ACPI_HANDLE hdl, uint32_t *idp)
+{
+	int pxmid;
+
+	ASSERT(idp != NULL);
+
+	/*
+	 * Try to evaluate ACPI _PXM method to get proximity domain id.
+	 * Quotation from ACPI 4.0:
+	 * If the Local APIC ID / Local SAPIC ID / Local x2APIC ID of a
+	 * dynamically added processor is not present in the System Resource
+	 * Affinity Table (SRAT), a _PXM object must exist for the processor's
+	 * device or one of its ancestors in the ACPI Namespace.
+	 */
+	while (hdl != NULL) {
+		if (ACPI_SUCCESS(acpica_eval_int(hdl,
+		    ACPIDEV_METHOD_NAME_PXM, &pxmid))) {
+			*idp = (uint32_t)pxmid;
+			return (AE_OK);
+		}
+		if (ACPI_FAILURE(AcpiGetParent(hdl, &hdl))) {
+			break;
+		}
+	}
+
+	return (AE_NOT_FOUND);
+}
diff --git a/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_drv.c b/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_drv.c
index d9908aec59..35fb4b6af5 100644
--- a/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_drv.c
+++ b/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_drv.c
@@ -24,7 +24,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. */ /* @@ -40,6 +40,7 @@ #include <sys/ddi.h> #include <sys/ddi_impldefs.h> #include <sys/ddifm.h> +#include <sys/note.h> #include <sys/ndifm.h> #include <sys/sunddi.h> #include <sys/sunndi.h> @@ -175,6 +176,9 @@ _init(void) return (error); } + /* Initialize event subsystem. */ + acpinex_event_init(); + /* Install the module. */ if ((error = mod_install(&modlinkage)) != 0) { cmn_err(CE_WARN, "acpinex: failed to install module."); @@ -197,6 +201,9 @@ _fini(void) return (error); } + /* Shut down event subsystem. */ + acpinex_event_fini(); + /* Free the soft state info. */ ddi_soft_state_fini(&acpinex_softstates); @@ -211,10 +218,11 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -/* ARGSUSED */ static int acpinex_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + _NOTE(ARGUNUSED(dip)); + dev_t dev; int instance; @@ -268,16 +276,24 @@ acpinex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) (void) ddi_pathname(devi, softsp->ans_path); if (ACPI_FAILURE(acpica_get_handle(devi, &softsp->ans_hdl))) { ACPINEX_DEBUG(CE_WARN, - "acpinex: failed to get ACPI handle for %s.", + "!acpinex: failed to get ACPI handle for %s.", softsp->ans_path); ddi_soft_state_free(acpinex_softstates, instance); return (DDI_FAILURE); } mutex_init(&softsp->ans_lock, NULL, MUTEX_DRIVER, NULL); + /* Install event handler for child/descendant objects. */ + if (acpinex_event_scan(softsp, B_TRUE) != DDI_SUCCESS) { + cmn_err(CE_WARN, "!acpinex: failed to install event handler " + "for children of %s.", softsp->ans_path); + } + /* nothing to suspend/resume here */ (void) ddi_prop_update_string(DDI_DEV_T_NONE, devi, "pm-hardware-state", "no-suspend-resume"); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, + DDI_NO_AUTODETACH, 1); acpinex_fm_init(softsp); ddi_report_dev(devi); @@ -301,17 +317,24 @@ acpinex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) softsp = ddi_get_soft_state(acpinex_softstates, instance); if (softsp == NULL) { - ACPINEX_DEBUG(CE_WARN, "acpinex: failed to get soft state " + ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to get soft state " "object for instance %d in acpinex_detach()", instance); return (DDI_FAILURE); } switch (cmd) { case DDI_DETACH: + if (acpinex_event_scan(softsp, B_FALSE) != DDI_SUCCESS) { + cmn_err(CE_WARN, "!acpinex: failed to uninstall event " + "handler for children of %s.", softsp->ans_path); + return (DDI_FAILURE); + } ddi_remove_minor_node(devi, NULL); acpinex_fm_fini(softsp); mutex_destroy(&softsp->ans_lock); ddi_soft_state_free(acpinex_softstates, instance); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, + DDI_NO_AUTODETACH, 0); return (DDI_SUCCESS); case DDI_SUSPEND: @@ -332,13 +355,11 @@ name_child(dev_info_t *child, char *name, int namelen) name[0] = '\0'; if (ddi_prop_lookup_string(DDI_DEV_T_ANY, child, DDI_PROP_DONTPASS, ACPIDEV_PROP_NAME_UNIT_ADDR, &unitaddr) == DDI_SUCCESS) { - (void) strncpy(name, unitaddr, namelen - 1); - name[namelen - 1] = '\0'; + (void) strlcpy(name, unitaddr, namelen); ddi_prop_free(unitaddr); } else { - ACPINEX_DEBUG(CE_NOTE, - "acpinex: failed to lookup child unit-address prop for %p.", - (void *)child); + ACPINEX_DEBUG(CE_NOTE, "!acpinex: failed to lookup child " + "unit-address prop for %p.", (void *)child); } return (DDI_SUCCESS); @@ -406,21 +427,22 @@ acpinex_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, off_t offset, off_t len, caddr_t *vaddrp) { ACPINEX_DEBUG(CE_WARN, - "acpinex: acpinex_bus_map called and 
it's unimplemented."); + "!acpinex: acpinex_bus_map called and it's unimplemented."); return (DDI_ME_UNIMPLEMENTED); } -/* ARGSUSED */ static int acpinex_open(dev_t *devi, int flags, int otyp, cred_t *credp) { + _NOTE(ARGUNUSED(flags, otyp, credp)); + minor_t minor, instance; acpinex_softstate_t *softsp; minor = getminor(*devi); instance = ACPINEX_GET_INSTANCE(minor); if (instance >= ACPINEX_INSTANCE_MAX) { - ACPINEX_DEBUG(CE_WARN, "acpinex: instance number %d out of " + ACPINEX_DEBUG(CE_WARN, "!acpinex: instance number %d out of " "range in acpinex_open, max %d.", instance, ACPINEX_INSTANCE_MAX - 1); return (EINVAL); @@ -428,7 +450,7 @@ acpinex_open(dev_t *devi, int flags, int otyp, cred_t *credp) softsp = ddi_get_soft_state(acpinex_softstates, instance); if (softsp == NULL) { - ACPINEX_DEBUG(CE_WARN, "acpinex: failed to get soft state " + ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to get soft state " "object for instance %d in acpinex_open().", instance); return (EINVAL); } @@ -437,23 +459,24 @@ acpinex_open(dev_t *devi, int flags, int otyp, cred_t *credp) return (0); } else { ACPINEX_DEBUG(CE_WARN, - "acpinex: invalid minor number %d in acpinex_open().", + "!acpinex: invalid minor number %d in acpinex_open().", minor); return (EINVAL); } } -/* ARGSUSED */ static int acpinex_close(dev_t dev, int flags, int otyp, cred_t *credp) { + _NOTE(ARGUNUSED(flags, otyp, credp)); + minor_t minor, instance; acpinex_softstate_t *softsp; minor = getminor(dev); instance = ACPINEX_GET_INSTANCE(minor); if (instance >= ACPINEX_INSTANCE_MAX) { - ACPINEX_DEBUG(CE_WARN, "acpinex: instance number %d out of " + ACPINEX_DEBUG(CE_WARN, "!acpinex: instance number %d out of " "range in acpinex_close(), max %d.", instance, ACPINEX_INSTANCE_MAX - 1); return (EINVAL); @@ -461,7 +484,7 @@ acpinex_close(dev_t dev, int flags, int otyp, cred_t *credp) softsp = ddi_get_soft_state(acpinex_softstates, instance); if (softsp == NULL) { - ACPINEX_DEBUG(CE_WARN, "acpinex: failed to get soft state " + ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to get soft state " "object for instance %d in acpinex_close().", instance); return (EINVAL); } @@ -470,17 +493,18 @@ acpinex_close(dev_t dev, int flags, int otyp, cred_t *credp) return (0); } else { ACPINEX_DEBUG(CE_WARN, - "acpinex: invalid minor number %d in acpinex_close().", + "!acpinex: invalid minor number %d in acpinex_close().", minor); return (EINVAL); } } -/* ARGSUSED */ static int acpinex_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { + _NOTE(ARGUNUSED(cmd, arg, mode, credp, rvalp)); + int rv = 0; minor_t minor, instance; acpinex_softstate_t *softsp; @@ -488,21 +512,21 @@ acpinex_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, minor = getminor(dev); instance = ACPINEX_GET_INSTANCE(minor); if (instance >= ACPINEX_INSTANCE_MAX) { - ACPINEX_DEBUG(CE_NOTE, "acpinex: instance number %d out of " + ACPINEX_DEBUG(CE_NOTE, "!acpinex: instance number %d out of " "range in acpinex_ioctl(), max %d.", instance, ACPINEX_INSTANCE_MAX - 1); return (EINVAL); } softsp = ddi_get_soft_state(acpinex_softstates, instance); if (softsp == NULL) { - ACPINEX_DEBUG(CE_WARN, "acpinex: failed to get soft state " + ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to get soft state " "object for instance %d in acpinex_ioctl().", instance); return (EINVAL); } rv = ENOTSUP; ACPINEX_DEBUG(CE_WARN, - "acpinex: invalid minor number %d in acpinex_ioctl().", minor); + "!acpinex: invalid minor number %d in acpinex_ioctl().", minor); return (rv); } @@ -512,11 +536,12 @@ 
acpinex_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, * Register error handling callback with our parent. We will just call * our children's error callbacks and return their status. */ -/*ARGSUSED*/ static int acpinex_err_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data) { + _NOTE(ARGUNUSED(impl_data)); + /* Call our childrens error handlers */ return (ndi_fm_handler_dispatch(dip, NULL, derr)); } @@ -560,11 +585,12 @@ acpinex_fm_fini(acpinex_softstate_t *softsp) * Initialize FMA resources for child devices. * Called when child calls ddi_fm_init(). */ -/*ARGSUSED*/ static int acpinex_fm_init_child(dev_info_t *dip, dev_info_t *tdip, int cap, ddi_iblock_cookie_t *ibc) { + _NOTE(ARGUNUSED(tdip, cap)); + acpinex_softstate_t *softsp = ddi_get_soft_state(acpinex_softstates, ddi_get_instance(dip)); diff --git a/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_event.c b/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_event.c new file mode 100644 index 0000000000..633c99b442 --- /dev/null +++ b/usr/src/uts/i86pc/io/acpi/acpinex/acpinex_event.c @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +#include <sys/types.h> +#include <sys/atomic.h> +#include <sys/bitmap.h> +#include <sys/cmn_err.h> +#include <sys/note.h> +#include <sys/sunndi.h> +#include <sys/fastboot_impl.h> +#include <sys/sysevent.h> +#include <sys/sysevent/dr.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/acpidev.h> +#include <sys/acpidev_dr.h> +#include <sys/acpinex.h> + +int acpinex_event_support_remove = 0; + +static volatile uint_t acpinex_dr_event_cnt = 0; +static ulong_t acpinex_object_type_mask[BT_BITOUL(ACPI_TYPE_NS_NODE_MAX + 1)]; + +/* + * Generate DR_REQ event to syseventd. + * Please refer to sys/sysevent/dr.h for message definition. + */ +static int +acpinex_event_generate_event(dev_info_t *dip, ACPI_HANDLE hdl, int req, + int event, char *objname) +{ + int rv = 0; + sysevent_id_t eid; + sysevent_value_t evnt_val; + sysevent_attr_list_t *evnt_attr_list = NULL; + char *attach_pnt; + char event_type[32]; + + /* Add "attachment point" attribute. 
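+	 * The finished sysevent carries class EC_DR and subclass ESC_DR_REQ
+	 * with three string attributes: DR_AP_ID (built here), DR_REQ_TYPE
+	 * and ACPIDEV_EVENT_TYPE_ATTR_NAME (both added below); these are
+	 * what userland DR consumers match on.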
*/ + attach_pnt = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + if (ACPI_FAILURE(acpidev_dr_get_attachment_point(hdl, + attach_pnt, MAXPATHLEN))) { + cmn_err(CE_WARN, + "!acpinex: failed to generate AP name for %s.", objname); + kmem_free(attach_pnt, MAXPATHLEN); + return (-1); + } + ASSERT(attach_pnt[0] != '\0'); + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = attach_pnt; + rv = sysevent_add_attr(&evnt_attr_list, DR_AP_ID, &evnt_val, KM_SLEEP); + if (rv != 0) { + cmn_err(CE_WARN, + "!acpinex: failed to add attr [%s] for %s event.", + DR_AP_ID, EC_DR); + kmem_free(attach_pnt, MAXPATHLEN); + return (rv); + } + + /* Add "request type" attribute. */ + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = SE_REQ2STR(req); + rv = sysevent_add_attr(&evnt_attr_list, DR_REQ_TYPE, &evnt_val, + KM_SLEEP); + if (rv != 0) { + cmn_err(CE_WARN, + "!acpinex: failed to add attr [%s] for %s event.", + DR_REQ_TYPE, EC_DR); + sysevent_free_attr(evnt_attr_list); + kmem_free(attach_pnt, MAXPATHLEN); + return (rv); + } + + /* Add "acpi-event-type" attribute. */ + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + (void) snprintf(event_type, sizeof (event_type), + ACPIDEV_EVENT_TYPE_BUS_CHECK); + break; + case ACPI_NOTIFY_DEVICE_CHECK: + (void) snprintf(event_type, sizeof (event_type), + ACPIDEV_EVENT_TYPE_DEVICE_CHECK); + break; + case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: + (void) snprintf(event_type, sizeof (event_type), + ACPIDEV_EVENT_TYPE_DEVICE_CHECK_LIGHT); + break; + case ACPI_NOTIFY_EJECT_REQUEST: + (void) snprintf(event_type, sizeof (event_type), + ACPIDEV_EVENT_TYPE_EJECT_REQUEST); + break; + default: + cmn_err(CE_WARN, + "!acpinex: unknown ACPI event type %d.", event); + sysevent_free_attr(evnt_attr_list); + kmem_free(attach_pnt, MAXPATHLEN); + return (-1); + } + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = event_type; + rv = sysevent_add_attr(&evnt_attr_list, ACPIDEV_EVENT_TYPE_ATTR_NAME, + &evnt_val, KM_SLEEP); + if (rv != 0) { + cmn_err(CE_WARN, + "!acpinex: failed to add attr [%s] for %s event.", + ACPIDEV_EVENT_TYPE_ATTR_NAME, EC_DR); + sysevent_free_attr(evnt_attr_list); + kmem_free(attach_pnt, MAXPATHLEN); + return (rv); + } + + rv = ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DR, ESC_DR_REQ, + evnt_attr_list, &eid, KM_SLEEP); + if (rv != DDI_SUCCESS) { + cmn_err(CE_WARN, + "!acpinex: failed to log DR_REQ event for %s.", objname); + rv = -1; + } + + nvlist_free(evnt_attr_list); + kmem_free(attach_pnt, MAXPATHLEN); + + return (rv); +} + +/* + * Event handler for ACPI EJECT_REQUEST notifications. + * EJECT_REQUEST notifications should be generated on the device to be ejected, + * so no need to scan subtree of it. + * It also invokes ACPI _OST method to update event status if call_ost is true. 
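+ *
+ * End to end, the eject handshake built from the helpers this change
+ * introduces is roughly the following (an illustrative sketch only; the
+ * _EJ0 evaluation is driven later by the DR framework, not from this
+ * handler):
+ *
+ *	(void) acpidev_eval_ost(hdl, ACPI_NOTIFY_EJECT_REQUEST,
+ *	    ACPI_OST_STA_EJECT_IN_PROGRESS, NULL, 0);
+ *	... offline and release the affected devices ...
+ *	(void) acpidev_eval_ej0(hdl);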
+ */ +static void +acpinex_event_handle_eject_request(ACPI_HANDLE hdl, acpinex_softstate_t *sp, + boolean_t call_ost) +{ + int code; + char *objname; + + ASSERT(hdl != NULL); + objname = acpidev_get_object_name(hdl); + + ASSERT(sp != NULL); + ASSERT(sp->ans_dip != NULL && sp->ans_hdl != NULL); + if (sp == NULL || sp->ans_dip == NULL || sp->ans_hdl == NULL) { + if (call_ost) { + (void) acpidev_eval_ost(hdl, ACPI_NOTIFY_EJECT_REQUEST, + ACPI_OST_STA_FAILURE, NULL, 0); + } + ACPINEX_DEBUG(CE_WARN, + "!acpinex: softstate data structure is invalid."); + cmn_err(CE_WARN, + "!acpinex: failed to handle EJECT_REQUEST event from %s.", + objname); + acpidev_free_object_name(objname); + return; + } + + if (acpinex_event_support_remove == 0) { + cmn_err(CE_WARN, + "!acpinex: hot-removing of device %s is unsupported.", + objname); + code = ACPI_OST_STA_EJECT_NOT_SUPPORT; + } else if (acpinex_event_generate_event(sp->ans_dip, hdl, + SE_OUTGOING_RES, ACPI_NOTIFY_EJECT_REQUEST, objname) != 0) { + cmn_err(CE_WARN, "!acpinex: failed to generate ESC_DR_REQ " + "event for device eject request from %s.", objname); + code = ACPI_OST_STA_FAILURE; + } else { + cmn_err(CE_NOTE, "!acpinex: generate ESC_DR_REQ event for " + "device eject request from %s.", objname); + code = ACPI_OST_STA_EJECT_IN_PROGRESS; + } + if (call_ost) { + (void) acpidev_eval_ost(hdl, ACPI_NOTIFY_EJECT_REQUEST, + code, NULL, 0); + } + + acpidev_free_object_name(objname); +} + +struct acpinex_event_check_arg { + acpinex_softstate_t *softstatep; + int event_type; + uint32_t device_insert; + uint32_t device_remove; + uint32_t device_fail; +}; + +static ACPI_STATUS +acpinex_event_handle_check_one(ACPI_HANDLE hdl, UINT32 lvl, void *ctx, + void **retval) +{ + _NOTE(ARGUNUSED(lvl, retval)); + + char *objname; + int status, psta, csta; + acpidev_data_handle_t dhdl; + struct acpinex_event_check_arg *argp; + + ASSERT(hdl != NULL); + ASSERT(ctx != NULL); + argp = (struct acpinex_event_check_arg *)ctx; + + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + /* Skip subtree if failed to get the data handle. */ + ACPINEX_DEBUG(CE_NOTE, + "!acpinex: failed to get data associated with %p.", hdl); + return (AE_CTRL_DEPTH); + } else if (!acpidev_data_dr_capable(dhdl)) { + return (AE_OK); + } + + objname = acpidev_get_object_name(hdl); + + status = 0; + /* Query previous device status. */ + psta = acpidev_data_get_status(dhdl); + if (acpidev_check_device_enabled(psta)) { + status |= 0x1; + } + /* Query current device status. */ + csta = acpidev_query_device_status(hdl); + if (acpidev_check_device_enabled(csta)) { + status |= 0x2; + } + + switch (status) { + case 0x0: + /*FALLTHROUGH*/ + case 0x3: + /* No status changes, keep on walking. */ + acpidev_free_object_name(objname); + return (AE_OK); + + case 0x1: + /* Surprising removal. */ + cmn_err(CE_WARN, + "!acpinex: device %s has been surprisingly removed.", + objname); + if (argp->event_type == ACPI_NOTIFY_BUS_CHECK) { + /* + * According to ACPI spec, BUS_CHECK notification + * should be triggered for hot-adding events only. + */ + ACPINEX_DEBUG(CE_WARN, + "!acpinex: device %s has been surprisingly removed " + "when handling BUS_CHECK event.", objname); + } + acpidev_free_object_name(objname); + argp->device_remove++; + return (AE_CTRL_DEPTH); + + case 0x2: + /* Hot-adding. 
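+		 * (status is a two-bit vector: bit 0 records whether the
+		 * device was enabled on the previous query, bit 1 whether it
+		 * is enabled now. 0x2 therefore means newly enabled, i.e.
+		 * hot-added, just as 0x1 above means it has vanished and
+		 * 0x0/0x3 mean no change.)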
*/ + ACPINEX_DEBUG(CE_NOTE, + "!acpinex: device %s has been inserted.", objname); + argp->device_insert++; + if (acpinex_event_generate_event(argp->softstatep->ans_dip, hdl, + SE_INCOMING_RES, argp->event_type, objname) != 0) { + cmn_err(CE_WARN, + "!acpinex: failed to generate ESC_DR_REQ event for " + "device insert request from %s.", objname); + argp->device_fail++; + } else { + cmn_err(CE_NOTE, "!acpinex: generate ESC_DR_REQ event " + "for device insert request from %s.", objname); + } + acpidev_free_object_name(objname); + return (AE_OK); + + default: + ASSERT(0); + break; + } + + return (AE_ERROR); +} + +/* + * Event handler for BUS_CHECK/DEVICE_CHECK/DEVICE_CHECK_LIGHT notifications. + * These events may be signaled on parent/ancestor of devices to be hot-added, + * so need to scan ACPI namespace to figure out devices in question. + * It also invokes ACPI _OST method to update event status if call_ost is true. + */ +static void +acpinex_event_handle_check_request(int event, ACPI_HANDLE hdl, + acpinex_softstate_t *sp, boolean_t call_ost) +{ + ACPI_STATUS rv; + int code; + char *objname; + struct acpinex_event_check_arg arg; + + ASSERT(hdl != NULL); + objname = acpidev_get_object_name(hdl); + + ASSERT(sp != NULL); + ASSERT(sp->ans_dip != NULL && sp->ans_hdl != NULL); + if (sp == NULL || sp->ans_dip == NULL || sp->ans_hdl == NULL) { + if (call_ost) { + (void) acpidev_eval_ost(hdl, event, + ACPI_OST_STA_FAILURE, NULL, 0); + } + ACPINEX_DEBUG(CE_WARN, + "!acpinex: softstate data structure is invalid."); + cmn_err(CE_WARN, "!acpinex: failed to handle " + "BUS/DEVICE_CHECK event from %s.", objname); + acpidev_free_object_name(objname); + return; + } + + bzero(&arg, sizeof (arg)); + arg.event_type = event; + arg.softstatep = sp; + rv = acpinex_event_handle_check_one(hdl, 0, &arg, NULL); + if (ACPI_SUCCESS(rv)) { + rv = AcpiWalkNamespace(ACPI_TYPE_DEVICE, hdl, + ACPIDEV_MAX_ENUM_LEVELS, + &acpinex_event_handle_check_one, NULL, &arg, NULL); + } + + if (ACPI_FAILURE(rv)) { + /* Failed to scan the ACPI namespace. */ + cmn_err(CE_WARN, "!acpinex: failed to handle event %d from %s.", + event, objname); + code = ACPI_OST_STA_FAILURE; + } else if (arg.device_remove != 0) { + /* Surprising removal happened. */ + ACPINEX_DEBUG(CE_WARN, + "!acpinex: some devices have been surprisingly removed."); + code = ACPI_OST_STA_NOT_SUPPORT; + } else if (arg.device_fail != 0) { + /* Failed to handle some devices. */ + ACPINEX_DEBUG(CE_WARN, + "!acpinex: failed to check status of some devices."); + code = ACPI_OST_STA_FAILURE; + } else if (arg.device_insert == 0) { + /* No hot-added devices found. */ + cmn_err(CE_WARN, + "!acpinex: no hot-added devices under %s found.", objname); + code = ACPI_OST_STA_FAILURE; + } else { + code = ACPI_OST_STA_INSERT_IN_PROGRESS; + } + if (call_ost) { + (void) acpidev_eval_ost(hdl, event, code, NULL, 0); + } + + acpidev_free_object_name(objname); +} + +static void +acpinex_event_system_handler(ACPI_HANDLE hdl, UINT32 type, void *arg) +{ + acpinex_softstate_t *sp; + + ASSERT(hdl != NULL); + ASSERT(arg != NULL); + sp = (acpinex_softstate_t *)arg; + + acpidev_dr_lock_all(); + mutex_enter(&sp->ans_lock); + + switch (type) { + case ACPI_NOTIFY_BUS_CHECK: + /* + * Bus Check. This notification is performed on a device object + * to indicate to OSPM that it needs to perform the Plug and + * Play re-enumeration operation on the device tree starting + * from the point where it has been notified. OSPM will only + * perform this operation at boot, and when notified. 
It is
+	 * the responsibility of the ACPI AML code to notify OSPM at
+	 * any other times that this operation is required. The more
+	 * accurately and closer to the actual device tree change the
+	 * notification can be done, the more efficient the operating
+	 * system response will be; however, it can also be an issue
+	 * when a device change cannot be confirmed. For example, if
+	 * the hardware cannot notice a device change for a particular
+	 * location during a system sleeping state, it issues a Bus
+	 * Check notification on wake to inform OSPM that it needs to
+	 * check the configuration for a device change.
+	 */
+	/*FALLTHROUGH*/
+	case ACPI_NOTIFY_DEVICE_CHECK:
+		/*
+		 * Device Check. Used to notify OSPM that the device either
+		 * appeared or disappeared. If the device has appeared, OSPM
+		 * will re-enumerate from the parent. If the device has
+		 * disappeared, OSPM will invalidate the state of the device.
+		 * OSPM may optimize out re-enumeration. If _DCK is present,
+		 * then Notify(object,1) is assumed to indicate an undock
+		 * request.
+		 */
+		/*FALLTHROUGH*/
+	case ACPI_NOTIFY_DEVICE_CHECK_LIGHT:
+		/*
+		 * Device Check Light. Used to notify OSPM that the device
+		 * either appeared or disappeared. If the device has appeared,
+		 * OSPM will re-enumerate from the device itself, not the
+		 * parent. If the device has disappeared, OSPM will invalidate
+		 * the state of the device.
+		 */
+		atomic_inc_uint(&acpinex_dr_event_cnt);
+		acpinex_event_handle_check_request(type, hdl, sp, B_TRUE);
+		break;
+
+	case ACPI_NOTIFY_EJECT_REQUEST:
+		/*
+		 * Eject Request. Used to notify OSPM that the device should
+		 * be ejected, and that OSPM needs to perform the Plug and Play
+		 * ejection operation. OSPM will run the _EJx method.
+		 */
+		atomic_inc_uint(&acpinex_dr_event_cnt);
+		acpinex_event_handle_eject_request(hdl, sp, B_TRUE);
+		break;
+
+	default:
+		ACPINEX_DEBUG(CE_NOTE,
+		    "!acpinex: unhandled event (%d) on hdl %p under %s.",
+		    type, hdl, sp->ans_path);
+		(void) acpidev_eval_ost(hdl, type, ACPI_OST_STA_NOT_SUPPORT,
+		    NULL, 0);
+		break;
+	}
+
+	if (acpinex_dr_event_cnt != 0) {
+		/*
+		 * Disable fast reboot if a CPU/MEM/IOH hotplug event happens.
+		 * Note: this is a temporary solution and will be revised when
+		 * fast reboot can support CPU/MEM/IOH DR operations in the
+		 * future.
+		 *
+		 * ACPI BIOS generates some static ACPI tables, such as MADT,
+		 * SRAT and SLIT, to describe the system hardware configuration
+		 * on power-on. When a CPU/MEM/IOH hotplug event happens, those
+		 * static tables won't be updated and will become stale.
+		 *
+		 * If we reset the system by fast reboot, BIOS will have no
+		 * chance to regenerate those stale static tables. Fast reboot
+		 * can't tolerate such inconsistency between stale ACPI tables
+		 * and the real hardware configuration yet, so it must stay
+		 * disabled until it is enhanced to support CPU/MEM/IOH DR
+		 * operations.
+		 */
+		fastreboot_disable(FBNS_HOTPLUG);
+	}
+
+	mutex_exit(&sp->ans_lock);
+	acpidev_dr_unlock_all();
+}
+
+/*
+ * Install event handler for ACPI system events.
+ * The acpinex driver handles ACPI system events for its children;
+ * device-specific events will be handled by the device drivers themselves.
+ * Return DDI_SUCCESS on success, and DDI_FAILURE on failure.
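+ * ACPI_SYSTEM_NOTIFY registers for the system-defined notify values
+ * (0x00-0x7f, which cover the BUS_CHECK/DEVICE_CHECK/EJECT_REQUEST codes
+ * handled above); device-specific values (0x80-0xff) would require
+ * ACPI_DEVICE_NOTIFY or ACPI_ALL_NOTIFY instead.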
+ */ +static int +acpinex_event_install_handler(ACPI_HANDLE hdl, void *arg, + ACPI_DEVICE_INFO *infop, acpidev_data_handle_t dhdl) +{ + int rc = DDI_SUCCESS; + + ASSERT(hdl != NULL); + ASSERT(dhdl != NULL); + ASSERT(infop != NULL); + + /* + * Check whether the event handler has already been installed on the + * device object. With the introduction of ACPI Alias objects, which are + * similar to symlinks in file systems, there may be multiple name + * objects in the ACPI namespace pointing to the same underlying device + * object. Those Alias objects need to be filtered out, otherwise + * it will attempt to install the event handler multiple times on the + * same device object which will fail. + */ + if (acpidev_data_get_flag(dhdl, ACPIDEV_DATA_HANDLER_READY)) { + return (DDI_SUCCESS); + } + if (ACPI_SUCCESS(AcpiInstallNotifyHandler(hdl, ACPI_SYSTEM_NOTIFY, + acpinex_event_system_handler, arg))) { + acpidev_data_set_flag(dhdl, ACPIDEV_DATA_HANDLER_READY); + } else { + char *objname; + + objname = acpidev_get_object_name(hdl); + cmn_err(CE_WARN, + "!acpinex: failed to install system event handler for %s.", + objname); + acpidev_free_object_name(objname); + rc = DDI_FAILURE; + } + + return (rc); +} + +/* + * Uninstall event handler for ACPI system events. + * Return DDI_SUCCESS on success, and DDI_FAILURE on failure. + */ +static int +acpinex_event_uninstall_handler(ACPI_HANDLE hdl, ACPI_DEVICE_INFO *infop, + acpidev_data_handle_t dhdl) +{ + ASSERT(hdl != NULL); + ASSERT(dhdl != NULL); + ASSERT(infop != NULL); + + if (!acpidev_data_get_flag(dhdl, ACPIDEV_DATA_HANDLER_READY)) { + return (DDI_SUCCESS); + } + if (ACPI_SUCCESS(AcpiRemoveNotifyHandler(hdl, ACPI_SYSTEM_NOTIFY, + acpinex_event_system_handler))) { + acpidev_data_clear_flag(dhdl, ACPIDEV_DATA_HANDLER_READY); + } else { + char *objname; + + objname = acpidev_get_object_name(hdl); + cmn_err(CE_WARN, "!acpinex: failed to uninstall system event " + "handler for %s.", objname); + acpidev_free_object_name(objname); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * Install/uninstall ACPI system event handler for child objects of hdl. + * Return DDI_SUCCESS on success, and DDI_FAILURE on failure. + */ +static int +acpinex_event_walk(boolean_t init, acpinex_softstate_t *sp, ACPI_HANDLE hdl) +{ + int rc; + int retval = DDI_SUCCESS; + dev_info_t *dip; + ACPI_HANDLE child = NULL; + ACPI_OBJECT_TYPE type; + ACPI_DEVICE_INFO *infop; + acpidev_data_handle_t dhdl; + + /* Walk all child objects. */ + ASSERT(hdl != NULL); + while (ACPI_SUCCESS(AcpiGetNextObject(ACPI_TYPE_ANY, hdl, child, + &child))) { + /* Skip unwanted object types. */ + if (ACPI_FAILURE(AcpiGetType(child, &type)) || + type > ACPI_TYPE_NS_NODE_MAX || + BT_TEST(acpinex_object_type_mask, type) == 0) { + continue; + } + + /* Get data associated with the object. Skip it if fails. */ + dhdl = acpidev_data_get_handle(child); + if (dhdl == NULL) { + ACPINEX_DEBUG(CE_NOTE, "!acpinex: failed to get data " + "associated with %p, skip.", child); + continue; + } + + /* Query ACPI object info for the object. 
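+		 * AcpiGetObjectInfo() allocates the ACPI_DEVICE_INFO buffer
+		 * on the caller's behalf; every path below must release it
+		 * with AcpiOsFree(), as done at the bottom of this loop.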
*/
+		if (ACPI_FAILURE(AcpiGetObjectInfo(child, &infop))) {
+			cmn_err(CE_WARN,
+			    "!acpinex: failed to get object info for %p.",
+			    child);
+			continue;
+		}
+
+		if (init) {
+			rc = acpinex_event_install_handler(child, sp, infop,
+			    dhdl);
+			if (rc != DDI_SUCCESS) {
+				ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to "
+				    "install handler for child %p of %s.",
+				    child, sp->ans_path);
+				retval = DDI_FAILURE;
+			/*
+			 * Try to handle descendants if both of the
+			 * following two conditions are true:
+			 * 1) Device corresponding to the current object is
+			 *    enabled. If the device is absent/disabled,
+			 *    no notification should be generated from
+			 *    descendant objects of it.
+			 * 2) No Solaris device node has been created for the
+			 *    current object yet. If the device node has been
+			 *    created for the current object, notification
+			 *    events from child objects should be handled by
+			 *    the corresponding driver.
+			 */
+			} else if (acpidev_check_device_enabled(
+			    acpidev_data_get_status(dhdl)) &&
+			    ACPI_FAILURE(acpica_get_devinfo(child, &dip))) {
+				rc = acpinex_event_walk(B_TRUE, sp, child);
+				if (rc != DDI_SUCCESS) {
+					ACPINEX_DEBUG(CE_WARN,
+					    "!acpinex: failed to install "
+					    "handler for descendants of %s.",
+					    sp->ans_path);
+					retval = DDI_FAILURE;
+				}
+			}
+		} else {
+			rc = DDI_SUCCESS;
+			/* Uninstall handler for descendants if needed. */
+			if (ACPI_FAILURE(acpica_get_devinfo(child, &dip))) {
+				rc = acpinex_event_walk(B_FALSE, sp, child);
+			}
+			if (rc == DDI_SUCCESS) {
+				rc = acpinex_event_uninstall_handler(child,
+				    infop, dhdl);
+			}
+			/* Undo will be done by caller in case of failure. */
+			if (rc != DDI_SUCCESS) {
+				ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to "
+				    "uninstall handler for descendants of %s.",
+				    sp->ans_path);
+				AcpiOsFree(infop);
+				retval = DDI_FAILURE;
+				break;
+			}
+		}
+
+		/* Release cached resources. */
+		AcpiOsFree(infop);
+	}
+
+	return (retval);
+}
+
+int
+acpinex_event_scan(acpinex_softstate_t *sp, boolean_t init)
+{
+	int rc;
+
+	ASSERT(sp != NULL);
+	ASSERT(sp->ans_hdl != NULL);
+	ASSERT(sp->ans_dip != NULL);
+	if (sp == NULL || sp->ans_hdl == NULL || sp->ans_dip == NULL) {
+		ACPINEX_DEBUG(CE_WARN,
+		    "!acpinex: invalid parameter to acpinex_event_scan().");
+		return (DDI_FAILURE);
+	}
+
+	/* Lock current device node and walk all child device nodes of it. */
+	mutex_enter(&sp->ans_lock);
+
+	rc = acpinex_event_walk(init, sp, sp->ans_hdl);
+	if (rc != DDI_SUCCESS) {
+		if (init) {
+			ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to "
+			    "configure child objects of %s.", sp->ans_path);
+			rc = DDI_FAILURE;
+		} else {
+			ACPINEX_DEBUG(CE_WARN, "!acpinex: failed to "
+			    "unconfigure child objects of %s.", sp->ans_path);
+			/* Undo in case of errors */
+			(void) acpinex_event_walk(B_TRUE, sp, sp->ans_hdl);
+			rc = DDI_FAILURE;
+		}
+	}
+
+	mutex_exit(&sp->ans_lock);
+
+	return (rc);
+}
+
+void
+acpinex_event_init(void)
+{
+	/*
+	 * According to the ACPI specification, notification is only
+	 * supported on Device, Processor and ThermalZone objects.
+	 * Currently we only need to handle Device and Processor objects.
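+	 *
+	 * Should ThermalZone notifications ever be needed, the mask would
+	 * simply grow one more bit (a hypothetical extension, not part of
+	 * this change):
+	 *
+	 *	BT_SET(acpinex_object_type_mask, ACPI_TYPE_THERMAL);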
+ */ + BT_SET(acpinex_object_type_mask, ACPI_TYPE_PROCESSOR); + BT_SET(acpinex_object_type_mask, ACPI_TYPE_DEVICE); +} + +void +acpinex_event_fini(void) +{ + bzero(acpinex_object_type_mask, sizeof (acpinex_object_type_mask)); +} diff --git a/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.c b/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.c new file mode 100644 index 0000000000..35ad4353aa --- /dev/null +++ b/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.c @@ -0,0 +1,2927 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/note.h> +#include <sys/dditypes.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ddi_impldefs.h> +#include <sys/ndi_impldefs.h> +#include <sys/varargs.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cpuvar.h> +#include <sys/cpupart.h> +#include <sys/mem_config.h> +#include <sys/mem_cage.h> +#include <sys/memnode.h> +#include <sys/callb.h> +#include <sys/ontrap.h> +#include <sys/obpdefs.h> +#include <sys/promif.h> +#include <sys/synch.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/x_call.h> +#include <sys/x86_archext.h> +#include <sys/fastboot_impl.h> +#include <sys/sysevent.h> +#include <sys/sysevent/dr.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/acpidev.h> +#include <sys/acpidev_rsc.h> +#include <sys/acpidev_dr.h> +#include <sys/dr.h> +#include <sys/dr_util.h> +#include <sys/drmach.h> +#include "drmach_acpi.h" + +/* utility */ +#define MBYTE (1048576ull) +#define _ptob64(p) ((uint64_t)(p) << PAGESHIFT) +#define _b64top(b) ((pgcnt_t)((b) >> PAGESHIFT)) + +static int drmach_init(void); +static void drmach_fini(void); +static int drmach_name2type_idx(char *); +static sbd_error_t *drmach_mem_update_lgrp(drmachid_t); + +#ifdef DEBUG +int drmach_debug = 1; /* set to non-zero to enable debug messages */ +#endif /* DEBUG */ + +drmach_domain_info_t drmach_domain; + +static char *drmach_ie_fmt = "drmach_acpi.c %d"; +static drmach_array_t *drmach_boards; + +/* rwlock to protect drmach_boards. */ +static krwlock_t drmach_boards_rwlock; + +/* rwlock to block out CPR thread. */ +static krwlock_t drmach_cpr_rwlock; + +/* CPR callb id. 
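+ * Registration is expected to follow the usual kernel callb pattern; an
+ * illustrative sketch with assumed names (the callback body itself lives
+ * elsewhere in this file):
+ *
+ *	drmach_cpr_cid = callb_add(drmach_cpr_callb, NULL,
+ *	    CB_CL_CPR_PM, "drmach");
+ *	...
+ *	(void) callb_delete(drmach_cpr_cid);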
*/ +static callb_id_t drmach_cpr_cid; + +static struct { + const char *name; + const char *type; + sbd_error_t *(*new)(drmach_device_t *, drmachid_t *); +} drmach_name2type[] = { + { ACPIDEV_NODE_NAME_CPU, DRMACH_DEVTYPE_CPU, drmach_cpu_new }, + { ACPIDEV_NODE_NAME_MEMORY, DRMACH_DEVTYPE_MEM, drmach_mem_new }, + { ACPIDEV_NODE_NAME_PCI, DRMACH_DEVTYPE_PCI, drmach_io_new }, +}; + +/* + * drmach autoconfiguration data structures and interfaces + */ +static struct modlmisc modlmisc = { + &mod_miscops, + "ACPI based DR v1.0" +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlmisc, + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = drmach_init()) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) { + drmach_fini(); + } + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) == 0) { + drmach_fini(); + } + + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Internal support functions. + */ +static DRMACH_HANDLE +drmach_node_acpi_get_dnode(drmach_node_t *np) +{ + return ((DRMACH_HANDLE)(uintptr_t)np->here); +} + +static dev_info_t * +drmach_node_acpi_get_dip(drmach_node_t *np) +{ + dev_info_t *dip = NULL; + + if (ACPI_FAILURE(acpica_get_devinfo((DRMACH_HANDLE)(np->here), &dip))) { + return (NULL); + } + + return (dip); +} + +static int +drmach_node_acpi_get_prop(drmach_node_t *np, char *name, void *buf, int len) +{ + int rv = 0; + DRMACH_HANDLE hdl; + + hdl = np->get_dnode(np); + if (hdl == NULL) { + DRMACH_PR("!drmach_node_acpi_get_prop: NULL handle"); + rv = -1; + } else { + rv = acpidev_dr_device_getprop(hdl, name, buf, len); + if (rv >= 0) { + ASSERT(rv <= len); + rv = 0; + } + } + + return (rv); +} + +static int +drmach_node_acpi_get_proplen(drmach_node_t *np, char *name, int *len) +{ + int rv = 0; + DRMACH_HANDLE hdl; + + hdl = np->get_dnode(np); + if (hdl == NULL) { + DRMACH_PR("!drmach_node_acpi_get_proplen: NULL handle"); + rv = -1; + } else { + rv = acpidev_dr_device_getprop(hdl, name, NULL, 0); + if (rv >= 0) { + *len = rv; + return (0); + } + } + + return (-1); +} + +static ACPI_STATUS +drmach_node_acpi_callback(ACPI_HANDLE hdl, uint_t lvl, void *ctx, void **retval) +{ + _NOTE(ARGUNUSED(lvl)); + + int rv; + dev_info_t *dip; + drmach_node_walk_args_t *argp = ctx; + int (*cb)(drmach_node_walk_args_t *args); + acpidev_class_id_t clsid; + + ASSERT(hdl != NULL); + ASSERT(ctx != NULL); + ASSERT(retval != NULL); + + /* Skip subtree if the device is not powered. */ + if (!acpidev_dr_device_is_powered(hdl)) { + return (AE_CTRL_DEPTH); + } + + /* + * Keep scanning subtree if it fails to lookup device node. + * There may be some ACPI objects without device nodes created. + */ + if (ACPI_FAILURE(acpica_get_devinfo(hdl, &dip))) { + return (AE_OK); + } + + argp->node->here = hdl; + cb = (int (*)(drmach_node_walk_args_t *args))argp->func; + rv = (*cb)(argp); + argp->node->here = NULL; + if (rv) { + *(int *)retval = rv; + return (AE_CTRL_TERMINATE); + } + + /* + * Skip descendants of PCI/PCIex host bridges. + * PCI/PCIex devices will be handled by pcihp. 
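+ * Returning AE_CTRL_DEPTH from the callback (assuming + * acpidev_dr_device_walk_device() follows the usual ACPICA walk + * conventions) prunes the walk below this object while continuing with + * its siblings; AE_CTRL_TERMINATE, used above when the per-node callback + * fails, stops the whole walk instead.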
*/ + clsid = acpidev_dr_device_get_class(hdl); + if (clsid == ACPIDEV_CLASS_ID_PCI || clsid == ACPIDEV_CLASS_ID_PCIEX) { + return (AE_CTRL_DEPTH); + } + + return (AE_OK); +} + +static int +drmach_node_acpi_walk(drmach_node_t *np, void *data, + int (*cb)(drmach_node_walk_args_t *args)) +{ + DRMACH_HANDLE hdl; + int rv = 0; + drmach_node_walk_args_t args; + + /* initialize the args structure for the callback */ + args.node = np; + args.data = data; + args.func = (void *)cb; + + /* save the handle; it will be modified when walking the tree. */ + hdl = np->get_dnode(np); + if (hdl == NULL) { + DRMACH_PR("!drmach_node_acpi_walk: failed to get device node."); + return (EX86_INAPPROP); + } + + if (ACPI_FAILURE(acpidev_dr_device_walk_device(hdl, + ACPIDEV_MAX_ENUM_LEVELS, drmach_node_acpi_callback, + &args, (void *)&rv))) { + /* + * If acpidev_dr_device_walk_device() itself fails, rv won't + * be set to a suitable error code. Set it here. + */ + if (rv == 0) { + cmn_err(CE_WARN, "!drmach_node_acpi_walk: failed to " + "walk ACPI namespace."); + rv = EX86_ACPIWALK; + } + } + + /* restore the handle to its original value after walking the tree. */ + np->here = (void *)hdl; + + return ((int)rv); +} + +static drmach_node_t * +drmach_node_new(void) +{ + drmach_node_t *np; + + np = kmem_zalloc(sizeof (drmach_node_t), KM_SLEEP); + + np->get_dnode = drmach_node_acpi_get_dnode; + np->getdip = drmach_node_acpi_get_dip; + np->getproplen = drmach_node_acpi_get_proplen; + np->getprop = drmach_node_acpi_get_prop; + np->walk = drmach_node_acpi_walk; + + return (np); +} + +static drmachid_t +drmach_node_dup(drmach_node_t *np) +{ + drmach_node_t *dup; + + dup = drmach_node_new(); + dup->here = np->here; + dup->get_dnode = np->get_dnode; + dup->getdip = np->getdip; + dup->getproplen = np->getproplen; + dup->getprop = np->getprop; + dup->walk = np->walk; + + return (dup); +} + +static void +drmach_node_dispose(drmach_node_t *np) +{ + kmem_free(np, sizeof (*np)); +} + +static int +drmach_node_walk(drmach_node_t *np, void *param, + int (*cb)(drmach_node_walk_args_t *args)) +{ + return (np->walk(np, param, cb)); +} + +static DRMACH_HANDLE +drmach_node_get_dnode(drmach_node_t *np) +{ + return (np->get_dnode(np)); +} + +/* + * drmach_array provides convenient array construction, access, + * bounds checking and array destruction logic. + */ +static drmach_array_t * +drmach_array_new(uint_t min_index, uint_t max_index) +{ + drmach_array_t *arr; + + arr = kmem_zalloc(sizeof (drmach_array_t), KM_SLEEP); + + arr->arr_sz = (max_index - min_index + 1) * sizeof (void *); + if (arr->arr_sz > 0) { + arr->min_index = min_index; + arr->max_index = max_index; + + arr->arr = kmem_zalloc(arr->arr_sz, KM_SLEEP); + return (arr); + } else { + kmem_free(arr, sizeof (*arr)); + return (0); + } +} + +static int +drmach_array_set(drmach_array_t *arr, uint_t idx, drmachid_t val) +{ + if (idx < arr->min_index || idx > arr->max_index) + return (-1); + arr->arr[idx - arr->min_index] = val; + return (0); +} + +/* + * Get the item with index idx. + * Return 0 with the value stored in val if it succeeds, otherwise return -1.
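+ * + * A typical iteration over a drmach_array (illustrative sketch only; + * process() stands for arbitrary per-element work): + * + * uint_t idx; + * drmachid_t val; + * int rv; + * + * for (rv = drmach_array_first(arr, &idx, &val); rv == 0; + * rv = drmach_array_next(arr, &idx, &val)) + * process(val); + * + * Both iterators skip NULL slots and return -1 once idx passes + * max_index, which terminates the loop.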
*/ +static int +drmach_array_get(drmach_array_t *arr, uint_t idx, drmachid_t *val) +{ + if (idx < arr->min_index || idx > arr->max_index) + return (-1); + *val = arr->arr[idx - arr->min_index]; + return (0); +} + +static int +drmach_array_first(drmach_array_t *arr, uint_t *idx, drmachid_t *val) +{ + int rv; + + *idx = arr->min_index; + while ((rv = drmach_array_get(arr, *idx, val)) == 0 && *val == NULL) + *idx += 1; + + return (rv); +} + +static int +drmach_array_next(drmach_array_t *arr, uint_t *idx, drmachid_t *val) +{ + int rv; + + *idx += 1; + while ((rv = drmach_array_get(arr, *idx, val)) == 0 && *val == NULL) + *idx += 1; + + return (rv); +} + +static void +drmach_array_dispose(drmach_array_t *arr, void (*disposer)(drmachid_t)) +{ + drmachid_t val; + uint_t idx; + int rv; + + rv = drmach_array_first(arr, &idx, &val); + while (rv == 0) { + (*disposer)(val); + rv = drmach_array_next(arr, &idx, &val); + } + + kmem_free(arr->arr, arr->arr_sz); + kmem_free(arr, sizeof (*arr)); +} + +static drmach_board_t * +drmach_get_board_by_bnum(uint_t bnum) +{ + drmachid_t id; + + if (drmach_array_get(drmach_boards, bnum, &id) == 0) + return ((drmach_board_t *)id); + else + return (NULL); +} + +sbd_error_t * +drmach_device_new(drmach_node_t *node, + drmach_board_t *bp, int portid, drmachid_t *idp) +{ + int i; + int rv; + drmach_device_t proto; + sbd_error_t *err; + char name[OBP_MAXDRVNAME]; + + rv = node->getprop(node, ACPIDEV_DR_PROP_DEVNAME, name, OBP_MAXDRVNAME); + if (rv) { + /* every node is expected to have a name */ + err = drerr_new(1, EX86_GETPROP, "device node %s: property %s", + ddi_node_name(node->getdip(node)), + ACPIDEV_DR_PROP_DEVNAME); + return (err); + } + + /* + * The node currently being examined is not listed in the name2type[] + * array. In this case, the node is of no interest to drmach. Both + * *idp and the returned error are initialized here to yield nothing + * (no device or error structure) for this case.
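+ * + * For example: a node whose ACPIDEV_DR_PROP_DEVNAME matches + * ACPIDEV_NODE_NAME_CPU hits the first drmach_name2type[] entry and is + * handed to drmach_cpu_new(), while an unrecognized name simply yields + * *idp == NULL together with a NULL error return.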
+ */ + i = drmach_name2type_idx(name); + if (i < 0) { + *idp = (drmachid_t)0; + return (NULL); + } + + /* device specific new function will set unum */ + bzero(&proto, sizeof (proto)); + proto.type = drmach_name2type[i].type; + proto.bp = bp; + proto.node = node; + proto.portid = portid; + + return (drmach_name2type[i].new(&proto, idp)); +} + +static void +drmach_device_dispose(drmachid_t id) +{ + drmach_device_t *self = id; + + self->cm.dispose(id); +} + +static sbd_error_t * +drmach_device_status(drmachid_t id, drmach_status_t *stat) +{ + drmach_common_t *cp; + + if (!DRMACH_IS_ID(id)) + return (drerr_new(0, EX86_NOTID, NULL)); + cp = id; + + return (cp->status(id, stat)); +} + +drmach_board_t * +drmach_board_new(uint_t bnum, int boot_board) +{ + static void drmach_board_dispose(drmachid_t id); + static sbd_error_t *drmach_board_release(drmachid_t); + static sbd_error_t *drmach_board_status(drmachid_t, drmach_status_t *); + + sbd_error_t *err; + drmach_board_t *bp; + dev_info_t *dip = NULL; + + bp = kmem_zalloc(sizeof (drmach_board_t), KM_SLEEP); + bp->cm.isa = (void *)drmach_board_new; + bp->cm.release = drmach_board_release; + bp->cm.status = drmach_board_status; + + bp->bnum = bnum; + bp->devices = NULL; + bp->tree = drmach_node_new(); + + acpidev_dr_lock_all(); + if (ACPI_FAILURE(acpidev_dr_get_board_handle(bnum, &bp->tree->here))) { + acpidev_dr_unlock_all(); + drmach_board_dispose(bp); + return (NULL); + } + acpidev_dr_unlock_all(); + ASSERT(bp->tree->here != NULL); + + err = drmach_board_name(bnum, bp->cm.name, sizeof (bp->cm.name)); + if (err != NULL) { + sbd_err_clear(&err); + drmach_board_dispose(bp); + return (NULL); + } + + if (acpidev_dr_device_is_powered(bp->tree->here)) { + bp->boot_board = boot_board; + bp->powered = 1; + } else { + bp->boot_board = 0; + bp->powered = 0; + } + bp->assigned = boot_board; + if (ACPI_SUCCESS(acpica_get_devinfo(bp->tree->here, &dip))) { + bp->connected = 1; + } else { + bp->connected = 0; + } + + (void) drmach_array_set(drmach_boards, bnum, bp); + + return (bp); +} + +static void +drmach_board_dispose(drmachid_t id) +{ + drmach_board_t *bp; + + ASSERT(DRMACH_IS_BOARD_ID(id)); + bp = id; + + if (bp->tree) + drmach_node_dispose(bp->tree); + + if (bp->devices) + drmach_array_dispose(bp->devices, drmach_device_dispose); + + kmem_free(bp, sizeof (drmach_board_t)); +} + +static sbd_error_t * +drmach_board_release(drmachid_t id) +{ + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +static int +drmach_board_check_power(drmach_board_t *bp) +{ + DRMACH_HANDLE hdl; + + hdl = drmach_node_get_dnode(bp->tree); + + return (acpidev_dr_device_is_powered(hdl)); +} + +struct drmach_board_list_dep_arg { + int count; + size_t len; + ssize_t off; + char *buf; + char temp[MAXPATHLEN]; +}; + +static ACPI_STATUS +drmach_board_generate_name(ACPI_HANDLE hdl, UINT32 lvl, void *ctx, + void **retval) +{ + _NOTE(ARGUNUSED(retval)); + + struct drmach_board_list_dep_arg *argp = ctx; + + ASSERT(hdl != NULL); + ASSERT(lvl == UINT32_MAX); + ASSERT(ctx != NULL); + + /* Skip non-board devices. */ + if (!acpidev_dr_device_is_board(hdl)) { + return (AE_OK); + } + + if (ACPI_FAILURE(acpidev_dr_get_board_name(hdl, argp->temp, + sizeof (argp->temp)))) { + DRMACH_PR("!drmach_board_generate_name: failed to " + "generate board name for handle %p.", hdl); + /* Keep on walking. 
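(a name-generation failure for one board must not abort the whole EDL/EJD walk.)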
*/ + return (AE_OK); + } + argp->count++; + argp->off += snprintf(argp->buf + argp->off, argp->len - argp->off, + " %s", argp->temp); + if (argp->off >= argp->len) { + return (AE_CTRL_TERMINATE); + } + + return (AE_OK); +} + +static ssize_t +drmach_board_list_dependency(ACPI_HANDLE hdl, boolean_t edl, char *prefix, + char *buf, size_t len) +{ + ACPI_STATUS rc; + ssize_t off; + struct drmach_board_list_dep_arg *ap; + + ASSERT(buf != NULL && len != 0); + if (buf == NULL || len == 0) { + return (-1); + } + + ap = kmem_zalloc(sizeof (*ap), KM_SLEEP); + ap->buf = buf; + ap->len = len; + ap->off = snprintf(buf, len, "%s", prefix); + if (ap->off >= len) { + *buf = '\0'; + kmem_free(ap, sizeof (*ap)); + return (-1); + } + + /* Generate the device dependency list. */ + if (edl) { + rc = acpidev_dr_device_walk_edl(hdl, + drmach_board_generate_name, ap, NULL); + } else { + rc = acpidev_dr_device_walk_ejd(hdl, + drmach_board_generate_name, ap, NULL); + } + if (ACPI_FAILURE(rc)) { + *buf = '\0'; + ap->off = -1; + /* No device has dependency on this board. */ + } else if (ap->count == 0) { + *buf = '\0'; + ap->off = 0; + } + + off = ap->off; + kmem_free(ap, sizeof (*ap)); + + return (off); +} + +static sbd_error_t * +drmach_board_status(drmachid_t id, drmach_status_t *stat) +{ + sbd_error_t *err = NULL; + drmach_board_t *bp; + DRMACH_HANDLE hdl; + size_t off; + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = id; + + if (bp->tree == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + stat->busy = 0; /* assume not busy */ + stat->configured = 0; /* assume not configured */ + stat->assigned = bp->assigned; + stat->powered = bp->powered = acpidev_dr_device_is_powered(hdl); + stat->empty = !acpidev_dr_device_is_present(hdl); + if (ACPI_SUCCESS(acpidev_dr_device_check_status(hdl))) { + stat->cond = bp->cond = SBD_COND_OK; + } else { + stat->cond = bp->cond = SBD_COND_FAILED; + } + stat->info[0] = '\0'; + + /* Generate the eject device list. */ + if (drmach_board_list_dependency(hdl, B_TRUE, "EDL:", + stat->info, sizeof (stat->info)) < 0) { + DRMACH_PR("!drmach_board_status: failed to generate " + "eject device list for board %d.", bp->bnum); + stat->info[0] = '\0'; + } + off = strlen(stat->info); + if (off < sizeof (stat->info)) { + if (drmach_board_list_dependency(hdl, B_FALSE, + off ? 
", EJD:" : "EJD:", + stat->info + off, sizeof (stat->info) - off) < 0) { + DRMACH_PR("!drmach_board_status: failed to generate " + "eject dependent device for board %d.", bp->bnum); + stat->info[off] = '\0'; + } + } + + switch (acpidev_dr_get_board_type(bp->tree->get_dnode(bp->tree))) { + case ACPIDEV_CPU_BOARD: + (void) strlcpy(stat->type, "CPU Board", sizeof (stat->type)); + break; + case ACPIDEV_MEMORY_BOARD: + (void) strlcpy(stat->type, "MemoryBoard", sizeof (stat->type)); + break; + case ACPIDEV_IO_BOARD: + (void) strlcpy(stat->type, "IO Board", sizeof (stat->type)); + break; + case ACPIDEV_SYSTEM_BOARD: + /*FALLTHROUGH*/ + default: + (void) strlcpy(stat->type, "SystemBoard", sizeof (stat->type)); + break; + } + + if (bp->devices) { + int rv; + uint_t d_idx; + drmachid_t d_id; + + rv = drmach_array_first(bp->devices, &d_idx, &d_id); + while (rv == 0) { + drmach_status_t d_stat; + + err = drmach_device_status(d_id, &d_stat); + if (err) + break; + + stat->busy |= d_stat.busy; + stat->configured |= d_stat.configured; + + rv = drmach_array_next(bp->devices, &d_idx, &d_id); + } + } + + return (err); +} + +/* + * When DR is initialized, we walk the device tree and acquire a hold on + * all the nodes that are interesting to DR. This is so that the corresponding + * branches cannot be deleted. + */ +static int +drmach_hold_rele_devtree(dev_info_t *rdip, void *arg) +{ + int *holdp = (int *)arg; + ACPI_HANDLE hdl = NULL; + acpidev_data_handle_t dhdl; + + /* Skip nodes and subtrees which are not created by acpidev. */ + if (ACPI_FAILURE(acpica_get_handle(rdip, &hdl))) { + return (DDI_WALK_PRUNECHILD); + } + ASSERT(hdl != NULL); + dhdl = acpidev_data_get_handle(hdl); + if (dhdl == NULL) { + return (DDI_WALK_PRUNECHILD); + } + + /* Hold/release devices which are interesting to DR operations. */ + if (acpidev_data_dr_ready(dhdl)) { + if (*holdp) { + ASSERT(!e_ddi_branch_held(rdip)); + e_ddi_branch_hold(rdip); + } else { + ASSERT(e_ddi_branch_held(rdip)); + e_ddi_branch_rele(rdip); + } + } + + return (DDI_WALK_CONTINUE); +} + +static void +drmach_hold_devtree(void) +{ + dev_info_t *dip; + int circ; + int hold = 1; + + dip = ddi_root_node(); + ndi_devi_enter(dip, &circ); + ddi_walk_devs(ddi_get_child(dip), drmach_hold_rele_devtree, &hold); + ndi_devi_exit(dip, circ); +} + +static void +drmach_release_devtree(void) +{ + dev_info_t *dip; + int circ; + int hold = 0; + + dip = ddi_root_node(); + ndi_devi_enter(dip, &circ); + ddi_walk_devs(ddi_get_child(dip), drmach_hold_rele_devtree, &hold); + ndi_devi_exit(dip, circ); +} + +static boolean_t +drmach_cpr_callb(void *arg, int code) +{ + _NOTE(ARGUNUSED(arg)); + + if (code == CB_CODE_CPR_CHKPT) { + /* + * Temporarily block CPR operations if there are DR operations + * ongoing. 
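+ * + * The locking protocol, as a sketch: drmach_pre_op() takes + * drmach_cpr_rwlock as READER for every DR command except status and + * getncm, and drmach_post_op() drops it. Taking the lock as WRITER at + * CB_CODE_CPR_CHKPT time therefore waits for in-flight DR operations to + * drain and stalls new ones until the resume callback below releases + * the lock.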
*/ + rw_enter(&drmach_cpr_rwlock, RW_WRITER); + } else { + rw_exit(&drmach_cpr_rwlock); + } + + return (B_TRUE); +} + +static int +drmach_init(void) +{ + DRMACH_HANDLE hdl; + drmachid_t id; + uint_t bnum; + + if (MAX_BOARDS > SHRT_MAX) { + cmn_err(CE_WARN, "!drmach_init: system has too many (%d) " + "hotplug-capable boards.", MAX_BOARDS); + return (ENXIO); + } else if (MAX_CMP_UNITS_PER_BOARD > 1) { + cmn_err(CE_WARN, "!drmach_init: DR doesn't support multiple " + "(%d) physical processors on one board.", + MAX_CMP_UNITS_PER_BOARD); + return (ENXIO); + } else if (MAX_CORES_PER_CMP & (MAX_CORES_PER_CMP - 1)) { + cmn_err(CE_WARN, "!drmach_init: number of logical CPUs (%d) in " + "a physical processor is not a power of 2.", + MAX_CORES_PER_CMP); + return (ENXIO); + } else if (MAX_CPU_UNITS_PER_BOARD > DEVSET_CPU_NUMBER || + MAX_MEM_UNITS_PER_BOARD > DEVSET_MEM_NUMBER || + MAX_IO_UNITS_PER_BOARD > DEVSET_IO_NUMBER) { + cmn_err(CE_WARN, "!drmach_init: system has more CPU/memory/IO " + "units than the DR driver can handle."); + return (ENXIO); + } + + rw_init(&drmach_cpr_rwlock, NULL, RW_DEFAULT, NULL); + drmach_cpr_cid = callb_add(drmach_cpr_callb, NULL, + CB_CL_CPR_PM, "drmach"); + + rw_init(&drmach_boards_rwlock, NULL, RW_DEFAULT, NULL); + drmach_boards = drmach_array_new(0, MAX_BOARDS - 1); + drmach_domain.allow_dr = acpidev_dr_capable(); + + for (bnum = 0; bnum < MAX_BOARDS; bnum++) { + hdl = NULL; + if (ACPI_FAILURE(acpidev_dr_get_board_handle(bnum, &hdl)) || + hdl == NULL) { + cmn_err(CE_WARN, "!drmach_init: failed to lookup ACPI " + "handle for board %d.", bnum); + continue; + } + if (drmach_array_get(drmach_boards, bnum, &id) == -1) { + DRMACH_PR("!drmach_init: failed to get handle " + "for board %d.", bnum); + ASSERT(0); + goto error; + } else if (id == NULL) { + (void) drmach_board_new(bnum, 1); + } + } + + /* + * Walk descendants of the devinfo root node and hold + * all devinfo branches of interest.
*/ + drmach_hold_devtree(); + + return (0); + +error: + drmach_array_dispose(drmach_boards, drmach_board_dispose); + rw_destroy(&drmach_boards_rwlock); + rw_destroy(&drmach_cpr_rwlock); + return (ENXIO); +} + +static void +drmach_fini(void) +{ + rw_enter(&drmach_boards_rwlock, RW_WRITER); + if (drmach_boards != NULL) { + drmach_array_dispose(drmach_boards, drmach_board_dispose); + drmach_boards = NULL; + } + rw_exit(&drmach_boards_rwlock); + + /* + * Walk descendants of the root devinfo node and + * release holds acquired on branches in drmach_init(). + */ + drmach_release_devtree(); + + (void) callb_delete(drmach_cpr_cid); + rw_destroy(&drmach_cpr_rwlock); + rw_destroy(&drmach_boards_rwlock); +} + +sbd_error_t * +drmach_io_new(drmach_device_t *proto, drmachid_t *idp) +{ + static void drmach_io_dispose(drmachid_t); + static sbd_error_t *drmach_io_release(drmachid_t); + static sbd_error_t *drmach_io_status(drmachid_t, drmach_status_t *); + + drmach_io_t *ip; + int portid; + + portid = proto->portid; + ASSERT(portid != -1); + proto->unum = portid; + + ip = kmem_zalloc(sizeof (drmach_io_t), KM_SLEEP); + bcopy(proto, &ip->dev, sizeof (ip->dev)); + ip->dev.node = drmach_node_dup(proto->node); + ip->dev.cm.isa = (void *)drmach_io_new; + ip->dev.cm.dispose = drmach_io_dispose; + ip->dev.cm.release = drmach_io_release; + ip->dev.cm.status = drmach_io_status; + (void) snprintf(ip->dev.cm.name, sizeof (ip->dev.cm.name), "%s%d", + ip->dev.type, ip->dev.unum); + + *idp = (drmachid_t)ip; + + return (NULL); +} + +static void +drmach_io_dispose(drmachid_t id) +{ + drmach_io_t *self; + + ASSERT(DRMACH_IS_IO_ID(id)); + + self = id; + if (self->dev.node) + drmach_node_dispose(self->dev.node); + + kmem_free(self, sizeof (*self)); +} + +static sbd_error_t * +drmach_io_release(drmachid_t id) +{ + if (!DRMACH_IS_IO_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +static sbd_error_t * +drmach_io_status(drmachid_t id, drmach_status_t *stat) +{ + drmach_device_t *dp; + sbd_error_t *err; + int configured; + + ASSERT(DRMACH_IS_IO_ID(id)); + dp = id; + + err = drmach_io_is_attached(id, &configured); + if (err) + return (err); + + stat->assigned = dp->bp->assigned; + stat->powered = dp->bp->powered; + stat->configured = (configured != 0); + stat->busy = dp->busy; + (void) strlcpy(stat->type, dp->type, sizeof (stat->type)); + stat->info[0] = '\0'; + + return (NULL); +} + +sbd_error_t * +drmach_cpu_new(drmach_device_t *proto, drmachid_t *idp) +{ + static void drmach_cpu_dispose(drmachid_t); + static sbd_error_t *drmach_cpu_release(drmachid_t); + static sbd_error_t *drmach_cpu_status(drmachid_t, drmach_status_t *); + + int portid; + processorid_t cpuid; + drmach_cpu_t *cp = NULL; + + /* the portid is the APIC ID of the node */ + portid = proto->portid; + ASSERT(portid != -1); + + /* + * Assume all CPUs are homogeneous and have the same number of + * cores/threads.
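+ * + * For example (hypothetical numbers): with MAX_CPU_UNITS_PER_BOARD + * equal to 32, a CPU whose APIC ID (portid) is 37 becomes unit 5 + * (37 % 32) of its board.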
+ */ + proto->unum = portid % MAX_CPU_UNITS_PER_BOARD; + + cp = kmem_zalloc(sizeof (drmach_cpu_t), KM_SLEEP); + bcopy(proto, &cp->dev, sizeof (cp->dev)); + cp->dev.node = drmach_node_dup(proto->node); + cp->dev.cm.isa = (void *)drmach_cpu_new; + cp->dev.cm.dispose = drmach_cpu_dispose; + cp->dev.cm.release = drmach_cpu_release; + cp->dev.cm.status = drmach_cpu_status; + (void) snprintf(cp->dev.cm.name, sizeof (cp->dev.cm.name), "%s%d", + cp->dev.type, cp->dev.unum); + + cp->apicid = portid; + if (ACPI_SUCCESS(acpica_get_cpu_id_by_object( + drmach_node_get_dnode(proto->node), &cpuid))) { + cp->cpuid = cpuid; + } else { + cp->cpuid = -1; + } + + /* Mark CPU0 as busy, many other components have dependency on it. */ + if (cp->cpuid == 0) { + cp->dev.busy = 1; + } + + *idp = (drmachid_t)cp; + + return (NULL); +} + +static void +drmach_cpu_dispose(drmachid_t id) +{ + drmach_cpu_t *self; + + ASSERT(DRMACH_IS_CPU_ID(id)); + + self = id; + if (self->dev.node) + drmach_node_dispose(self->dev.node); + + kmem_free(self, sizeof (*self)); +} + +static sbd_error_t * +drmach_cpu_release(drmachid_t id) +{ + if (!DRMACH_IS_CPU_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +static sbd_error_t * +drmach_cpu_status(drmachid_t id, drmach_status_t *stat) +{ + drmach_cpu_t *cp; + drmach_device_t *dp; + + ASSERT(DRMACH_IS_CPU_ID(id)); + cp = (drmach_cpu_t *)id; + dp = &cp->dev; + + stat->assigned = dp->bp->assigned; + stat->powered = dp->bp->powered; + mutex_enter(&cpu_lock); + stat->configured = (cpu_get(cp->cpuid) != NULL); + mutex_exit(&cpu_lock); + stat->busy = dp->busy; + (void) strlcpy(stat->type, dp->type, sizeof (stat->type)); + stat->info[0] = '\0'; + + return (NULL); +} + +static int +drmach_setup_mc_info(DRMACH_HANDLE hdl, drmach_mem_t *mp) +{ + uint_t i, j, count; + struct memlist *ml = NULL, *ml2 = NULL; + acpidev_regspec_t *regp; + uint64_t align, addr_min, addr_max, total_size, skipped_size; + + if (hdl == NULL) { + return (-1); + } else if (ACPI_FAILURE(acpidev_dr_get_mem_alignment(hdl, &align))) { + return (-1); + } else { + ASSERT((align & (align - 1)) == 0); + mp->mem_alignment = align; + } + + addr_min = UINT64_MAX; + addr_max = 0; + total_size = 0; + skipped_size = 0; + /* + * There's a memory hole just below 4G on x86, which needs special + * handling. All other addresses assigned to a specific memory device + * should be contiguous. 
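+ * + * For example (hypothetical layout): an 8GB memory board based at 2GB + * may be described by two regspecs, [2GB, 4GB) and [4GB, 10GB), because + * of the hole; the loop below then yields slice_base = 2GB, + * slice_top = 10GB and slice_size (total_size) = 8GB.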
*/ + if (ACPI_FAILURE(acpidev_dr_device_get_regspec(hdl, TRUE, &regp, + &count))) { + return (-1); + } + for (i = 0, j = 0; i < count; i++) { + uint64_t addr, size; + + addr = (uint64_t)regp[i].phys_mid << 32; + addr |= (uint64_t)regp[i].phys_low; + size = (uint64_t)regp[i].size_hi << 32; + size |= (uint64_t)regp[i].size_low; + if (size == 0) + continue; + else + j++; + + total_size += size; + if (addr < addr_min) + addr_min = addr; + if (addr + size > addr_max) + addr_max = addr + size; + if (mp->dev.bp->boot_board || + j <= acpidev_dr_max_segments_per_mem_device()) { + ml = memlist_add_span(ml, addr, size); + } else { + skipped_size += size; + } + } + acpidev_dr_device_free_regspec(regp, count); + + if (skipped_size != 0) { + cmn_err(CE_WARN, "!drmach: too many (%d) segments on memory " + "device, max (%d) segments supported, 0x%" PRIx64 " bytes " + "of memory skipped.", + j, acpidev_dr_max_segments_per_mem_device(), skipped_size); + } + + mp->slice_base = addr_min; + mp->slice_top = addr_max; + mp->slice_size = total_size; + + if (mp->dev.bp->boot_board) { + uint64_t endpa = _ptob64(physmax + 1); + + /* + * we intersect phys_install to get base_pa. + * This only works at boot-up time. + */ + memlist_read_lock(); + ml2 = memlist_dup(phys_install); + memlist_read_unlock(); + + ml2 = memlist_del_span(ml2, 0ull, mp->slice_base); + if (ml2 && endpa > addr_max) { + ml2 = memlist_del_span(ml2, addr_max, endpa - addr_max); + } + } + + /* + * Create a memlist for the memory board. + * The created memlist only contains configured memory if there's + * configured memory on the board, otherwise it contains all memory + * on the board. + */ + if (ml2) { + uint64_t nbytes = 0; + struct memlist *p; + + for (p = ml2; p; p = p->ml_next) { + nbytes += p->ml_size; + } + if (nbytes == 0) { + memlist_delete(ml2); + ml2 = NULL; + } else { + /* Node has configured memory at boot time. */ + mp->base_pa = ml2->ml_address; + mp->nbytes = nbytes; + mp->memlist = ml2; + if (ml) + memlist_delete(ml); + } + } + if (ml2 == NULL) { + /* Not configured at boot time.
*/ + mp->base_pa = UINT64_MAX; + mp->nbytes = 0; + mp->memlist = ml; + } + + return (0); +} + +sbd_error_t * +drmach_mem_new(drmach_device_t *proto, drmachid_t *idp) +{ + static void drmach_mem_dispose(drmachid_t); + static sbd_error_t *drmach_mem_release(drmachid_t); + static sbd_error_t *drmach_mem_status(drmachid_t, drmach_status_t *); + + DRMACH_HANDLE hdl; + drmach_mem_t *mp; + int portid; + + mp = kmem_zalloc(sizeof (drmach_mem_t), KM_SLEEP); + portid = proto->portid; + ASSERT(portid != -1); + proto->unum = portid; + + bcopy(proto, &mp->dev, sizeof (mp->dev)); + mp->dev.node = drmach_node_dup(proto->node); + mp->dev.cm.isa = (void *)drmach_mem_new; + mp->dev.cm.dispose = drmach_mem_dispose; + mp->dev.cm.release = drmach_mem_release; + mp->dev.cm.status = drmach_mem_status; + + (void) snprintf(mp->dev.cm.name, sizeof (mp->dev.cm.name), "%s%d", + mp->dev.type, proto->unum); + hdl = mp->dev.node->get_dnode(mp->dev.node); + ASSERT(hdl != NULL); + if (drmach_setup_mc_info(hdl, mp) != 0) { + kmem_free(mp, sizeof (drmach_mem_t)); + *idp = (drmachid_t)NULL; + return (drerr_new(1, EX86_MC_SETUP, NULL)); + } + + /* make sure we do not create memoryless nodes */ + if (mp->nbytes == 0 && mp->slice_size == 0) { + kmem_free(mp, sizeof (drmach_mem_t)); + *idp = (drmachid_t)NULL; + } else + *idp = (drmachid_t)mp; + + return (NULL); +} + +static void +drmach_mem_dispose(drmachid_t id) +{ + drmach_mem_t *mp; + + ASSERT(DRMACH_IS_MEM_ID(id)); + + mp = id; + + if (mp->dev.node) + drmach_node_dispose(mp->dev.node); + + if (mp->memlist) { + memlist_delete(mp->memlist); + mp->memlist = NULL; + } + + kmem_free(mp, sizeof (*mp)); +} + +static sbd_error_t * +drmach_mem_release(drmachid_t id) +{ + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +static sbd_error_t * +drmach_mem_status(drmachid_t id, drmach_status_t *stat) +{ + uint64_t pa; + drmach_mem_t *dp; + struct memlist *ml = NULL; + + ASSERT(DRMACH_IS_MEM_ID(id)); + dp = id; + + /* get starting physical address of target memory */ + pa = dp->base_pa; + /* round down to slice boundary */ + pa &= ~(dp->mem_alignment - 1); + + /* stop at first span that is in slice */ + memlist_read_lock(); + for (ml = phys_install; ml; ml = ml->ml_next) + if (ml->ml_address >= pa && ml->ml_address < dp->slice_top) + break; + memlist_read_unlock(); + + stat->assigned = dp->dev.bp->assigned; + stat->powered = dp->dev.bp->powered; + stat->configured = (ml != NULL); + stat->busy = dp->dev.busy; + (void) strlcpy(stat->type, dp->dev.type, sizeof (stat->type)); + stat->info[0] = '\0'; + + return (NULL); +} + +/* + * Public interfaces exported to support platform independent dr driver. 
*/ +uint_t +drmach_max_boards(void) +{ + return (acpidev_dr_max_boards()); +} + +uint_t +drmach_max_io_units_per_board(void) +{ + return (acpidev_dr_max_io_units_per_board()); +} + +uint_t +drmach_max_cmp_units_per_board(void) +{ + return (acpidev_dr_max_cmp_units_per_board()); +} + +uint_t +drmach_max_mem_units_per_board(void) +{ + return (acpidev_dr_max_mem_units_per_board()); +} + +uint_t +drmach_max_core_per_cmp(void) +{ + return (acpidev_dr_max_cpu_units_per_cmp()); +} + +sbd_error_t * +drmach_pre_op(int cmd, drmachid_t id, drmach_opts_t *opts, void *argp) +{ + drmach_board_t *bp = (drmach_board_t *)id; + sbd_error_t *err = NULL; + + /* allow status and ncm operations to always succeed */ + if ((cmd == SBD_CMD_STATUS) || (cmd == SBD_CMD_GETNCM)) { + return (NULL); + } + + switch (cmd) { + case SBD_CMD_POWERON: + case SBD_CMD_POWEROFF: + /* + * Disable fast reboot if a CPU/MEM/IOH hotplug event happens. + * Note: this is a temporary solution and will be revised when + * fast reboot can support CPU/MEM/IOH DR operations in the + * future. + * + * The ACPI BIOS generates some static ACPI tables, such as + * MADT, SRAT and SLIT, to describe the system hardware + * configuration on power-on. When a CPU/MEM/IOH hotplug event + * happens, those static tables won't be updated and will + * become stale. + * + * If we reset the system by fast reboot, BIOS will have no + * chance to regenerate those stale static tables. Fast reboot + * can't tolerate such inconsistency between stale ACPI tables + * and the real hardware configuration yet. + * + * A temporary solution is introduced to disable fast reboot if + * a CPU/MEM/IOH hotplug event happens. This solution should be + * revised when fast reboot is enhanced to support CPU/MEM/IOH + * DR operations. + */ + fastreboot_disable(FBNS_HOTPLUG); + /*FALLTHROUGH*/ + + default: + /* Block out the CPR thread. */ + rw_enter(&drmach_cpr_rwlock, RW_READER); + break; + } + + /* check all other commands for the required option string */ + if ((opts->size > 0) && (opts->copts != NULL)) { + if (strstr(opts->copts, ACPIDEV_CMD_OST_PREFIX) == NULL) { + err = drerr_new(1, EX86_SUPPORT, NULL); + } + } else { + err = drerr_new(1, EX86_SUPPORT, NULL); + } + + if (!err && id && DRMACH_IS_BOARD_ID(id)) { + switch (cmd) { + case SBD_CMD_TEST: + break; + case SBD_CMD_CONNECT: + if (bp->connected) + err = drerr_new(0, ESBD_STATE, NULL); + else if (!drmach_domain.allow_dr) + err = drerr_new(1, EX86_SUPPORT, NULL); + break; + case SBD_CMD_DISCONNECT: + if (!bp->connected) + err = drerr_new(0, ESBD_STATE, NULL); + else if (!drmach_domain.allow_dr) + err = drerr_new(1, EX86_SUPPORT, NULL); + break; + default: + if (!drmach_domain.allow_dr) + err = drerr_new(1, EX86_SUPPORT, NULL); + break; + + } + } + + /* + * CPU/memory/IO DR operations will be supported in stages on x86. + * In early versions, some operations must be blocked here. This + * temporary hook will be removed when all CPU/memory/IO DR + * operations are supported on x86 systems. + * + * We only need to filter unsupported device types for + * SBD_CMD_CONFIGURE/SBD_CMD_UNCONFIGURE commands; all other + * commands are supported by all device types.
+ */ + if (!err && (cmd == SBD_CMD_CONFIGURE || cmd == SBD_CMD_UNCONFIGURE)) { + int i; + dr_devset_t *devsetp = (dr_devset_t *)argp; + dr_devset_t devset = *devsetp; + + switch (cmd) { + case SBD_CMD_CONFIGURE: + if (!plat_dr_support_cpu()) { + DEVSET_DEL(devset, SBD_COMP_CPU, + DEVSET_ANYUNIT); + } else { + for (i = MAX_CPU_UNITS_PER_BOARD; + i < DEVSET_CPU_NUMBER; i++) { + DEVSET_DEL(devset, SBD_COMP_CPU, i); + } + } + + if (!plat_dr_support_memory()) { + DEVSET_DEL(devset, SBD_COMP_MEM, + DEVSET_ANYUNIT); + } else { + for (i = MAX_MEM_UNITS_PER_BOARD; + i < DEVSET_MEM_NUMBER; i++) { + DEVSET_DEL(devset, SBD_COMP_MEM, i); + } + } + + /* No support of configuring IOH devices yet. */ + DEVSET_DEL(devset, SBD_COMP_IO, DEVSET_ANYUNIT); + break; + + case SBD_CMD_UNCONFIGURE: + if (!plat_dr_support_cpu()) { + DEVSET_DEL(devset, SBD_COMP_CPU, + DEVSET_ANYUNIT); + } else { + for (i = MAX_CPU_UNITS_PER_BOARD; + i < DEVSET_CPU_NUMBER; i++) { + DEVSET_DEL(devset, SBD_COMP_CPU, i); + } + } + + /* No support of unconfiguring MEM/IOH devices yet. */ + DEVSET_DEL(devset, SBD_COMP_MEM, DEVSET_ANYUNIT); + DEVSET_DEL(devset, SBD_COMP_IO, DEVSET_ANYUNIT); + break; + } + + *devsetp = devset; + if (DEVSET_IS_NULL(devset)) { + err = drerr_new(1, EX86_SUPPORT, NULL); + } + } + + return (err); +} + +sbd_error_t * +drmach_post_op(int cmd, drmachid_t id, drmach_opts_t *opts, int rv) +{ + _NOTE(ARGUNUSED(id, opts, rv)); + + switch (cmd) { + case SBD_CMD_STATUS: + case SBD_CMD_GETNCM: + break; + + default: + rw_exit(&drmach_cpr_rwlock); + break; + } + + return (NULL); +} + +sbd_error_t * +drmach_configure(drmachid_t id, int flags) +{ + _NOTE(ARGUNUSED(flags)); + + drmach_device_t *dp; + sbd_error_t *err = NULL; + dev_info_t *rdip; + dev_info_t *fdip = NULL; + + if (!DRMACH_IS_DEVICE_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + dp = id; + + rdip = dp->node->getdip(dp->node); + ASSERT(rdip); + ASSERT(e_ddi_branch_held(rdip)); + + /* allocate cpu id for the CPU device. */ + if (DRMACH_IS_CPU_ID(id)) { + DRMACH_HANDLE hdl = drmach_node_get_dnode(dp->node); + ASSERT(hdl != NULL); + if (ACPI_FAILURE(acpidev_dr_allocate_cpuid(hdl, NULL))) { + err = drerr_new(1, EX86_ALLOC_CPUID, NULL); + } + return (err); + } + + if (DRMACH_IS_MEM_ID(id)) { + err = drmach_mem_update_lgrp(id); + if (err) + return (err); + } + + if (e_ddi_branch_configure(rdip, &fdip, 0) != 0) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + dev_info_t *dip = (fdip != NULL) ? fdip : rdip; + + (void) ddi_pathname(dip, path); + err = drerr_new(1, EX86_DRVFAIL, path); + kmem_free(path, MAXPATHLEN); + + /* If non-NULL, fdip is returned held and must be released */ + if (fdip != NULL) + ddi_release_devi(fdip); + } + + return (err); +} + +sbd_error_t * +drmach_unconfigure(drmachid_t id, int flags) +{ + _NOTE(ARGUNUSED(flags)); + + drmach_device_t *dp; + sbd_error_t *err = NULL; + dev_info_t *rdip, *fdip = NULL; + + if (!DRMACH_IS_DEVICE_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + dp = id; + + rdip = dp->node->getdip(dp->node); + ASSERT(rdip); + ASSERT(e_ddi_branch_held(rdip)); + + if (DRMACH_IS_CPU_ID(id)) { + DRMACH_HANDLE hdl = drmach_node_get_dnode(dp->node); + ASSERT(hdl != NULL); + if (ACPI_FAILURE(acpidev_dr_free_cpuid(hdl))) { + err = drerr_new(1, EX86_FREE_CPUID, NULL); + } + return (err); + } + + /* + * Note: FORCE flag is no longer necessary under devfs + */ + if (e_ddi_branch_unconfigure(rdip, &fdip, 0)) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * If non-NULL, fdip is returned held and must be released. 
+ */ + if (fdip != NULL) { + (void) ddi_pathname(fdip, path); + ndi_rele_devi(fdip); + } else { + (void) ddi_pathname(rdip, path); + } + + err = drerr_new(1, EX86_DRVFAIL, path); + + kmem_free(path, MAXPATHLEN); + } + + return (err); +} + +sbd_error_t * +drmach_get_dip(drmachid_t id, dev_info_t **dip) +{ + drmach_device_t *dp; + + if (!DRMACH_IS_DEVICE_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + dp = id; + + *dip = dp->node->getdip(dp->node); + + return (NULL); +} + +sbd_error_t * +drmach_release(drmachid_t id) +{ + drmach_common_t *cp; + + if (!DRMACH_IS_DEVICE_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + cp = id; + + return (cp->release(id)); +} + +sbd_error_t * +drmach_status(drmachid_t id, drmach_status_t *stat) +{ + drmach_common_t *cp; + sbd_error_t *err; + + rw_enter(&drmach_boards_rwlock, RW_READER); + if (!DRMACH_IS_ID(id)) { + rw_exit(&drmach_boards_rwlock); + return (drerr_new(0, EX86_NOTID, NULL)); + } + cp = (drmach_common_t *)id; + err = cp->status(id, stat); + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +static sbd_error_t * +drmach_update_acpi_status(drmachid_t id, drmach_opts_t *opts) +{ + char *copts; + drmach_board_t *bp; + DRMACH_HANDLE hdl; + int event, code; + boolean_t inprogress = B_FALSE; + + if (DRMACH_NULL_ID(id) || !DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = (drmach_board_t *)id; + hdl = drmach_node_get_dnode(bp->tree); + ASSERT(hdl != NULL); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + /* Get the status code. */ + copts = opts->copts; + if (strncmp(copts, ACPIDEV_CMD_OST_INPROGRESS, + strlen(ACPIDEV_CMD_OST_INPROGRESS)) == 0) { + inprogress = B_TRUE; + code = ACPI_OST_STA_INSERT_IN_PROGRESS; + copts += strlen(ACPIDEV_CMD_OST_INPROGRESS); + } else if (strncmp(copts, ACPIDEV_CMD_OST_SUCCESS, + strlen(ACPIDEV_CMD_OST_SUCCESS)) == 0) { + code = ACPI_OST_STA_SUCCESS; + copts += strlen(ACPIDEV_CMD_OST_SUCCESS); + } else if (strncmp(copts, ACPIDEV_CMD_OST_FAILURE, + strlen(ACPIDEV_CMD_OST_FAILURE)) == 0) { + code = ACPI_OST_STA_FAILURE; + copts += strlen(ACPIDEV_CMD_OST_FAILURE); + } else if (strncmp(copts, ACPIDEV_CMD_OST_NOOP, + strlen(ACPIDEV_CMD_OST_NOOP)) == 0) { + return (NULL); + } else { + return (drerr_new(0, EX86_UNKPTCMD, opts->copts)); + } + + /* Get the event type. 
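(assuming the acpidev string macros expand as their names suggest, a complete copts string would look something like "acpi-update-status=success" followed by "acpi-event-type=bus_check"; the exact spellings are defined by ACPIDEV_CMD_OST_* and ACPIDEV_EVENT_TYPE_*.)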
*/ + copts = strstr(copts, ACPIDEV_EVENT_TYPE_ATTR_NAME); + if (copts == NULL) { + return (drerr_new(0, EX86_UNKPTCMD, opts->copts)); + } + copts += strlen(ACPIDEV_EVENT_TYPE_ATTR_NAME); + if (copts[0] != '=') { + return (drerr_new(0, EX86_UNKPTCMD, opts->copts)); + } + copts += strlen("="); + if (strncmp(copts, ACPIDEV_EVENT_TYPE_BUS_CHECK, + strlen(ACPIDEV_EVENT_TYPE_BUS_CHECK)) == 0) { + event = ACPI_NOTIFY_BUS_CHECK; + } else if (strncmp(copts, ACPIDEV_EVENT_TYPE_DEVICE_CHECK, + strlen(ACPIDEV_EVENT_TYPE_DEVICE_CHECK)) == 0) { + event = ACPI_NOTIFY_DEVICE_CHECK; + } else if (strncmp(copts, ACPIDEV_EVENT_TYPE_DEVICE_CHECK_LIGHT, + strlen(ACPIDEV_EVENT_TYPE_DEVICE_CHECK_LIGHT)) == 0) { + event = ACPI_NOTIFY_DEVICE_CHECK_LIGHT; + } else if (strncmp(copts, ACPIDEV_EVENT_TYPE_EJECT_REQUEST, + strlen(ACPIDEV_EVENT_TYPE_EJECT_REQUEST)) == 0) { + event = ACPI_NOTIFY_EJECT_REQUEST; + if (inprogress) { + code = ACPI_OST_STA_EJECT_IN_PROGRESS; + } + } else { + return (drerr_new(0, EX86_UNKPTCMD, opts->copts)); + } + + (void) acpidev_eval_ost(hdl, event, code, NULL, 0); + + return (NULL); +} + +static struct { + const char *name; + sbd_error_t *(*handler)(drmachid_t id, drmach_opts_t *opts); +} drmach_pt_arr[] = { + { ACPIDEV_CMD_OST_PREFIX, &drmach_update_acpi_status }, + /* the following line must always be last */ + { NULL, NULL } +}; + +sbd_error_t * +drmach_passthru(drmachid_t id, drmach_opts_t *opts) +{ + int i; + sbd_error_t *err; + + i = 0; + while (drmach_pt_arr[i].name != NULL) { + int len = strlen(drmach_pt_arr[i].name); + + if (strncmp(drmach_pt_arr[i].name, opts->copts, len) == 0) + break; + + i += 1; + } + + if (drmach_pt_arr[i].name == NULL) + err = drerr_new(0, EX86_UNKPTCMD, opts->copts); + else + err = (*drmach_pt_arr[i].handler)(id, opts); + + return (err); +} + +/* + * Board specific interfaces to support dr driver + */ +static int +drmach_get_portid(drmach_node_t *np) +{ + uint32_t portid; + + if (np->getprop(np, ACPIDEV_DR_PROP_PORTID, + &portid, sizeof (portid)) == 0) { + /* + * acpidev returns portid as a uint32_t; validate it. + */ + if (portid > INT_MAX) { + return (-1); + } else { + return (portid); + } + } + + return (-1); +} + +/* + * This is a helper function to determine if a given + * node should be considered for a dr operation according + * to predefined dr type nodes and the node's name. + * Formal Parameter: The name of a device node. + * Return Value: -1, name does not map to a valid dr type. + * A value greater than or equal to 0, name is a valid dr type. + */ +static int +drmach_name2type_idx(char *name) +{ + int index, ntypes; + + if (name == NULL) + return (-1); + + /* + * Determine how many possible types are currently supported + * for dr. + */ + ntypes = sizeof (drmach_name2type) / sizeof (drmach_name2type[0]); + + /* Determine if the node's name corresponds to a predefined type. */ + for (index = 0; index < ntypes; index++) { + if (strcmp(drmach_name2type[index].name, name) == 0) + /* The node is an allowed type for dr. */ + return (index); + } + + /* + * If the name of the node does not map to any of the + * types in the array drmach_name2type, then the node is not of + * interest to dr.
+ */ + return (-1); +} + +static int +drmach_board_find_devices_cb(drmach_node_walk_args_t *args) +{ + drmach_node_t *node = args->node; + drmach_board_cb_data_t *data = args->data; + drmach_board_t *obj = data->obj; + + int rv, portid; + uint32_t bnum; + drmachid_t id; + drmach_device_t *device; + char name[OBP_MAXDRVNAME]; + + portid = drmach_get_portid(node); + rv = node->getprop(node, ACPIDEV_DR_PROP_DEVNAME, + name, OBP_MAXDRVNAME); + if (rv) + return (0); + + rv = node->getprop(node, ACPIDEV_DR_PROP_BOARDNUM, + &bnum, sizeof (bnum)); + if (rv) { + return (0); + } + if (bnum > INT_MAX) { + return (0); + } + + if (bnum != obj->bnum) + return (0); + + if (drmach_name2type_idx(name) < 0) { + return (0); + } + + /* + * Create a device data structure from this node data. + * The call may yield nothing if the node is not of interest + * to drmach. + */ + data->err = drmach_device_new(node, obj, portid, &id); + if (data->err) + return (-1); + else if (!id) { + /* + * drmach_device_new examined the node we passed in + * and determined that it was one not of interest to + * drmach. So, it is skipped. + */ + return (0); + } + + rv = drmach_array_set(obj->devices, data->ndevs++, id); + if (rv) { + data->err = DRMACH_INTERNAL_ERROR(); + return (-1); + } + device = id; + + data->err = (*data->found)(data->a, device->type, device->unum, id); + + return (data->err == NULL ? 0 : -1); +} + +sbd_error_t * +drmach_board_find_devices(drmachid_t id, void *a, + sbd_error_t *(*found)(void *a, const char *, int, drmachid_t)) +{ + drmach_board_t *bp = (drmach_board_t *)id; + sbd_error_t *err; + int max_devices; + int rv; + drmach_board_cb_data_t data; + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + max_devices = MAX_CPU_UNITS_PER_BOARD; + max_devices += MAX_MEM_UNITS_PER_BOARD; + max_devices += MAX_IO_UNITS_PER_BOARD; + + if (bp->devices == NULL) + bp->devices = drmach_array_new(0, max_devices); + ASSERT(bp->tree != NULL); + + data.obj = bp; + data.ndevs = 0; + data.found = found; + data.a = a; + data.err = NULL; + + acpidev_dr_lock_all(); + rv = drmach_node_walk(bp->tree, &data, drmach_board_find_devices_cb); + acpidev_dr_unlock_all(); + if (rv == 0) { + err = NULL; + } else { + drmach_array_dispose(bp->devices, drmach_device_dispose); + bp->devices = NULL; + + if (data.err) + err = data.err; + else + err = DRMACH_INTERNAL_ERROR(); + } + + return (err); +} + +int +drmach_board_lookup(int bnum, drmachid_t *id) +{ + int rv = 0; + + if (bnum < 0) { + *id = 0; + return (-1); + } + + rw_enter(&drmach_boards_rwlock, RW_READER); + if (drmach_array_get(drmach_boards, (uint_t)bnum, id)) { + *id = 0; + rv = -1; + } + rw_exit(&drmach_boards_rwlock); + + return (rv); +} + +sbd_error_t * +drmach_board_name(int bnum, char *buf, int buflen) +{ + ACPI_HANDLE hdl; + sbd_error_t *err = NULL; + + if (bnum < 0) { + return (drerr_new(1, EX86_BNUM, "%d", bnum)); + } + + acpidev_dr_lock_all(); + if (ACPI_FAILURE(acpidev_dr_get_board_handle(bnum, &hdl))) { + DRMACH_PR("!drmach_board_name: failed to lookup ACPI handle " + "for board %d.", bnum); + err = drerr_new(1, EX86_BNUM, "%d", bnum); + } else if (ACPI_FAILURE(acpidev_dr_get_board_name(hdl, buf, buflen))) { + DRMACH_PR("!drmach_board_name: failed to generate board name " + "for board %d.", bnum); + err = drerr_new(0, EX86_INVALID_ARG, + ": buffer is too small for board name."); + } + acpidev_dr_unlock_all(); + + return (err); +} + +int +drmach_board_is_floating(drmachid_t id) +{ + drmach_board_t *bp; + + if (!DRMACH_IS_BOARD_ID(id)) + return 
(0); + + bp = (drmach_board_t *)id; + + return ((drmach_domain.floating & (1ULL << bp->bnum)) ? 1 : 0); +} + +static ACPI_STATUS +drmach_board_check_dependent_cb(ACPI_HANDLE hdl, UINT32 lvl, void *ctx, + void **retval) +{ + uint32_t bdnum; + drmach_board_t *bp; + ACPI_STATUS rc = AE_OK; + int cmd = (int)(intptr_t)ctx; + + ASSERT(hdl != NULL); + ASSERT(lvl == UINT32_MAX); + ASSERT(retval != NULL); + + /* Skip non-board devices. */ + if (!acpidev_dr_device_is_board(hdl)) { + return (AE_OK); + } else if (ACPI_FAILURE(acpidev_dr_get_board_number(hdl, &bdnum))) { + DRMACH_PR("!drmach_board_check_dependent_cb: failed to get " + "board number for object %p.\n", hdl); + return (AE_ERROR); + } else if (bdnum > MAX_BOARDS) { + DRMACH_PR("!drmach_board_check_dependent_cb: board number %u " + "is too big, max %u.", bdnum, MAX_BOARDS); + return (AE_ERROR); + } + + bp = drmach_get_board_by_bnum(bdnum); + switch (cmd) { + case SBD_CMD_CONNECT: + /* + * Its parent board should be present, assigned, powered and + * connected when connecting the child board. + */ + if (bp == NULL) { + *retval = hdl; + rc = AE_ERROR; + } else { + bp->powered = acpidev_dr_device_is_powered(hdl); + if (!bp->connected || !bp->powered || !bp->assigned) { + *retval = hdl; + rc = AE_ERROR; + } + } + break; + + case SBD_CMD_POWERON: + /* + * Its parent board should be present, assigned and powered when + * powering on the child board. + */ + if (bp == NULL) { + *retval = hdl; + rc = AE_ERROR; + } else { + bp->powered = acpidev_dr_device_is_powered(hdl); + if (!bp->powered || !bp->assigned) { + *retval = hdl; + rc = AE_ERROR; + } + } + break; + + case SBD_CMD_ASSIGN: + /* + * Its parent board should be present and assigned when + * assigning the child board. + */ + if (bp == NULL) { + *retval = hdl; + rc = AE_ERROR; + } else if (!bp->assigned) { + *retval = hdl; + rc = AE_ERROR; + } + break; + + case SBD_CMD_DISCONNECT: + /* + * The child board should be disconnected if present when + * disconnecting its parent board. + */ + if (bp != NULL && bp->connected) { + *retval = hdl; + rc = AE_ERROR; + } + break; + + case SBD_CMD_POWEROFF: + /* + * The child board should be disconnected and powered off if + * present when powering off its parent board. + */ + if (bp != NULL) { + bp->powered = acpidev_dr_device_is_powered(hdl); + if (bp->connected || bp->powered) { + *retval = hdl; + rc = AE_ERROR; + } + } + break; + + case SBD_CMD_UNASSIGN: + /* + * The child board should be disconnected, powered off and + * unassigned if present when unassigning its parent board. + */ + if (bp != NULL) { + bp->powered = acpidev_dr_device_is_powered(hdl); + if (bp->connected || bp->powered || bp->assigned) { + *retval = hdl; + rc = AE_ERROR; + } + } + break; + + default: + /* Return success for all other commands. 
*/ + break; + } + + return (rc); +} + +sbd_error_t * +drmach_board_check_dependent(int cmd, drmach_board_t *bp) +{ + int reverse; + char *name; + sbd_error_t *err = NULL; + DRMACH_HANDLE hdl; + DRMACH_HANDLE dp = NULL; + + ASSERT(bp != NULL); + ASSERT(DRMACH_IS_BOARD_ID(bp)); + ASSERT(RW_LOCK_HELD(&drmach_boards_rwlock)); + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + switch (cmd) { + case SBD_CMD_ASSIGN: + case SBD_CMD_POWERON: + case SBD_CMD_CONNECT: + if (ACPI_SUCCESS(acpidev_dr_device_walk_ejd(hdl, + &drmach_board_check_dependent_cb, + (void *)(intptr_t)cmd, &dp))) { + return (NULL); + } + reverse = 0; + break; + + case SBD_CMD_UNASSIGN: + case SBD_CMD_POWEROFF: + case SBD_CMD_DISCONNECT: + if (ACPI_SUCCESS(acpidev_dr_device_walk_edl(hdl, + &drmach_board_check_dependent_cb, + (void *)(intptr_t)cmd, &dp))) { + return (NULL); + } + reverse = 1; + break; + + default: + return (drerr_new(0, EX86_INAPPROP, NULL)); + } + + if (dp == NULL) { + return (drerr_new(1, EX86_WALK_DEPENDENCY, "%s", bp->cm.name)); + } + name = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + if (ACPI_FAILURE(acpidev_dr_get_board_name(dp, name, MAXPATHLEN))) { + err = drerr_new(1, EX86_WALK_DEPENDENCY, "%s", bp->cm.name); + } else if (reverse == 0) { + err = drerr_new(1, EX86_WALK_DEPENDENCY, + "%s, depends on board %s", bp->cm.name, name); + } else { + err = drerr_new(1, EX86_WALK_DEPENDENCY, + "board %s depends on %s", name, bp->cm.name); + } + kmem_free(name, MAXPATHLEN); + + return (err); +} + +sbd_error_t * +drmach_board_assign(int bnum, drmachid_t *id) +{ + sbd_error_t *err = NULL; + + if (bnum < 0) { + return (drerr_new(1, EX86_BNUM, "%d", bnum)); + } + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + + if (drmach_array_get(drmach_boards, bnum, id) == -1) { + err = drerr_new(1, EX86_BNUM, "%d", bnum); + } else { + drmach_board_t *bp; + + /* + * Board has already been created, downgrade to reader. 
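+ * + * The write lock is only needed when a new board object may be + * inserted into drmach_boards; for an existing board the lock is + * downgraded so that concurrent readers are not blocked for the rest + * of the call.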
+ */ + if (*id) + rw_downgrade(&drmach_boards_rwlock); + + bp = *id; + if (!(*id)) + bp = *id = + (drmachid_t)drmach_board_new(bnum, 0); + + if (bp == NULL) { + DRMACH_PR("!drmach_board_assign: failed to create " + "object for board %d.", bnum); + err = drerr_new(1, EX86_BNUM, "%d", bnum); + } else { + err = drmach_board_check_dependent(SBD_CMD_ASSIGN, bp); + if (err == NULL) + bp->assigned = 1; + } + } + + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_unassign(drmachid_t id) +{ + drmach_board_t *bp; + sbd_error_t *err; + drmach_status_t stat; + + if (DRMACH_NULL_ID(id)) + return (NULL); + + if (!DRMACH_IS_BOARD_ID(id)) { + return (drerr_new(0, EX86_INAPPROP, NULL)); + } + bp = id; + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + + err = drmach_board_status(id, &stat); + if (err) { + rw_exit(&drmach_boards_rwlock); + return (err); + } + + if (stat.configured || stat.busy) { + err = drerr_new(0, EX86_CONFIGBUSY, bp->cm.name); + } else if (bp->connected) { + err = drerr_new(0, EX86_CONNECTBUSY, bp->cm.name); + } else if (stat.powered) { + err = drerr_new(0, EX86_POWERBUSY, bp->cm.name); + } else { + err = drmach_board_check_dependent(SBD_CMD_UNASSIGN, bp); + if (err == NULL) { + if (drmach_array_set(drmach_boards, bp->bnum, 0) != 0) + err = DRMACH_INTERNAL_ERROR(); + else + drmach_board_dispose(bp); + } + } + + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_poweron(drmachid_t id) +{ + drmach_board_t *bp; + sbd_error_t *err = NULL; + DRMACH_HANDLE hdl; + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = id; + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + bp->powered = drmach_board_check_power(bp); + if (bp->powered) { + return (NULL); + } + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + err = drmach_board_check_dependent(SBD_CMD_POWERON, bp); + if (err == NULL) { + acpidev_dr_lock_all(); + if (ACPI_FAILURE(acpidev_dr_device_poweron(hdl))) + err = drerr_new(0, EX86_POWERON, NULL); + acpidev_dr_unlock_all(); + + /* Check whether the board is powered on. */ + bp->powered = drmach_board_check_power(bp); + if (err == NULL && bp->powered == 0) + err = drerr_new(0, EX86_POWERON, NULL); + } + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_poweroff(drmachid_t id) +{ + sbd_error_t *err = NULL; + drmach_board_t *bp; + drmach_status_t stat; + DRMACH_HANDLE hdl; + + if (DRMACH_NULL_ID(id)) + return (NULL); + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = id; + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + /* Check whether the board is busy, configured or connected. 
*/ + err = drmach_board_status(id, &stat); + if (err != NULL) + return (err); + if (stat.configured || stat.busy) { + return (drerr_new(0, EX86_CONFIGBUSY, bp->cm.name)); + } else if (bp->connected) { + return (drerr_new(0, EX86_CONNECTBUSY, bp->cm.name)); + } + + bp->powered = drmach_board_check_power(bp); + if (bp->powered == 0) { + return (NULL); + } + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + err = drmach_board_check_dependent(SBD_CMD_POWEROFF, bp); + if (err == NULL) { + acpidev_dr_lock_all(); + if (ACPI_FAILURE(acpidev_dr_device_poweroff(hdl))) + err = drerr_new(0, EX86_POWEROFF, NULL); + acpidev_dr_unlock_all(); + + bp->powered = drmach_board_check_power(bp); + if (err == NULL && bp->powered != 0) + err = drerr_new(0, EX86_POWEROFF, NULL); + } + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_test(drmachid_t id, drmach_opts_t *opts, int force) +{ + _NOTE(ARGUNUSED(opts, force)); + + drmach_board_t *bp; + DRMACH_HANDLE hdl; + + if (DRMACH_NULL_ID(id)) + return (NULL); + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = id; + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + if (ACPI_FAILURE(acpidev_dr_device_check_status(hdl))) + return (drerr_new(0, EX86_IN_FAILURE, NULL)); + + return (NULL); +} + +sbd_error_t * +drmach_board_connect(drmachid_t id, drmach_opts_t *opts) +{ + _NOTE(ARGUNUSED(opts)); + + sbd_error_t *err = NULL; + drmach_board_t *bp = (drmach_board_t *)id; + DRMACH_HANDLE hdl; + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = (drmach_board_t *)id; + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + err = drmach_board_check_dependent(SBD_CMD_CONNECT, bp); + if (err == NULL) { + acpidev_dr_lock_all(); + if (ACPI_FAILURE(acpidev_dr_device_insert(hdl))) { + (void) acpidev_dr_device_remove(hdl); + err = drerr_new(1, EX86_PROBE, NULL); + } else { + bp->connected = 1; + } + acpidev_dr_unlock_all(); + } + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_disconnect(drmachid_t id, drmach_opts_t *opts) +{ + _NOTE(ARGUNUSED(opts)); + + DRMACH_HANDLE hdl; + drmach_board_t *bp; + drmach_status_t stat; + sbd_error_t *err = NULL; + + if (DRMACH_NULL_ID(id)) + return (NULL); + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = (drmach_board_t *)id; + + hdl = drmach_node_get_dnode(bp->tree); + if (hdl == NULL) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + /* Check whether the board is busy or configured. 
*/ + err = drmach_board_status(id, &stat); + if (err != NULL) + return (err); + if (stat.configured || stat.busy) + return (drerr_new(0, EX86_CONFIGBUSY, bp->cm.name)); + + rw_enter(&drmach_boards_rwlock, RW_WRITER); + err = drmach_board_check_dependent(SBD_CMD_DISCONNECT, bp); + if (err == NULL) { + acpidev_dr_lock_all(); + if (ACPI_SUCCESS(acpidev_dr_device_remove(hdl))) { + bp->connected = 0; + } else { + err = drerr_new(1, EX86_DEPROBE, bp->cm.name); + } + acpidev_dr_unlock_all(); + } + rw_exit(&drmach_boards_rwlock); + + return (err); +} + +sbd_error_t * +drmach_board_deprobe(drmachid_t id) +{ + drmach_board_t *bp; + + if (!DRMACH_IS_BOARD_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + bp = id; + + cmn_err(CE_CONT, "DR: detach board %d\n", bp->bnum); + + if (bp->devices) { + drmach_array_dispose(bp->devices, drmach_device_dispose); + bp->devices = NULL; + } + + bp->boot_board = 0; + + return (NULL); +} + +/* + * CPU specific interfaces to support dr driver + */ +sbd_error_t * +drmach_cpu_disconnect(drmachid_t id) +{ + if (!DRMACH_IS_CPU_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +sbd_error_t * +drmach_cpu_get_id(drmachid_t id, processorid_t *cpuid) +{ + drmach_cpu_t *cpu; + + if (!DRMACH_IS_CPU_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + cpu = (drmach_cpu_t *)id; + + if (cpu->cpuid == -1) { + if (ACPI_SUCCESS(acpica_get_cpu_id_by_object( + drmach_node_get_dnode(cpu->dev.node), cpuid))) { + cpu->cpuid = *cpuid; + } else { + *cpuid = -1; + } + } else { + *cpuid = cpu->cpuid; + } + + return (NULL); +} + +sbd_error_t * +drmach_cpu_get_impl(drmachid_t id, int *ip) +{ + if (!DRMACH_IS_CPU_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + /* Assume all CPUs in the system are homogeneous. */ + *ip = X86_CPU_IMPL_UNKNOWN; + + kpreempt_disable(); + if (cpuid_getvendor(CPU) == X86_VENDOR_Intel) { + /* NHM-EX CPU */ + if (cpuid_getfamily(CPU) == 0x6 && + cpuid_getmodel(CPU) == 0x2e) { + *ip = X86_CPU_IMPL_NEHALEM_EX; + } + } + kpreempt_enable(); + + return (NULL); +} + +/* + * Memory specific interfaces to support dr driver + */ + +/* + * When drmach_mem_new() is called, the mp->base_pa field is set to the base + * address of configured memory if there's configured memory on the board, + * otherwise set to UINT64_MAX. For a hot-added memory board, there's no + * configured memory when drmach_mem_new() is called, so mp->base_pa is set + * to UINT64_MAX and we need to set a correct value for it after memory + * hot-add operations. + * A hot-added memory board may contain multiple memory segments; + * drmach_mem_add_span() will be called once for each segment, so we can't + * rely on the basepa argument. It's also possible that only part of a + * memory segment is added into the OS, so we need to intersect with the + * phys_install list to get the real base address of configured memory on + * the board. + */ +sbd_error_t * +drmach_mem_add_span(drmachid_t id, uint64_t basepa, uint64_t size) +{ + _NOTE(ARGUNUSED(basepa)); + + uint64_t nbytes = 0; + uint64_t endpa; + drmach_mem_t *mp; + struct memlist *ml2; + struct memlist *p; + + ASSERT(size != 0); + + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + mp = (drmach_mem_t *)id; + + /* Compute basepa and size of installed memory.
*/ + endpa = _ptob64(physmax + 1); + memlist_read_lock(); + ml2 = memlist_dup(phys_install); + memlist_read_unlock(); + ml2 = memlist_del_span(ml2, 0ull, mp->slice_base); + if (ml2 && endpa > mp->slice_top) { + ml2 = memlist_del_span(ml2, mp->slice_top, + endpa - mp->slice_top); + } + + ASSERT(ml2); + if (ml2) { + for (p = ml2; p; p = p->ml_next) { + nbytes += p->ml_size; + if (mp->base_pa > p->ml_address) + mp->base_pa = p->ml_address; + } + ASSERT(nbytes > 0); + mp->nbytes += nbytes; + memlist_delete(ml2); + } + + return (NULL); +} + +static sbd_error_t * +drmach_mem_update_lgrp(drmachid_t id) +{ + ACPI_STATUS rc; + DRMACH_HANDLE hdl; + void *hdlp; + drmach_mem_t *mp; + update_membounds_t umb; + + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + mp = (drmach_mem_t *)id; + /* No need to update lgrp if memory is already installed. */ + if (mp->nbytes != 0) + return (NULL); + /* No need to update lgrp if lgrp is disabled. */ + if (max_mem_nodes == 1) + return (NULL); + + /* Add memory to lgroup */ + hdl = mp->dev.node->get_dnode(mp->dev.node); + rc = acpidev_dr_device_get_memory_index(hdl, &umb.u_device_id); + ASSERT(ACPI_SUCCESS(rc)); + if (ACPI_FAILURE(rc)) { + cmn_err(CE_WARN, "drmach: failed to get device id of memory, " + "can't update lgrp information."); + return (drerr_new(0, EX86_INTERNAL, NULL)); + } + rc = acpidev_dr_get_mem_numa_info(hdl, mp->memlist, &hdlp, + &umb.u_domain, &umb.u_sli_cnt, &umb.u_sli_ptr); + ASSERT(ACPI_SUCCESS(rc)); + if (ACPI_FAILURE(rc)) { + cmn_err(CE_WARN, "drmach: failed to get lgrp info of memory, " + "can't update lgrp information."); + return (drerr_new(0, EX86_INTERNAL, NULL)); + } + umb.u_base = (uint64_t)mp->slice_base; + umb.u_length = (uint64_t)(mp->slice_top - mp->slice_base); + lgrp_plat_config(LGRP_CONFIG_MEM_ADD, (uintptr_t)&umb); + acpidev_dr_free_mem_numa_info(hdlp); + + return (NULL); +} + +sbd_error_t * +drmach_mem_enable(drmachid_t id) +{ + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + else + return (NULL); +} + +sbd_error_t * +drmach_mem_get_info(drmachid_t id, drmach_mem_info_t *mem) +{ + drmach_mem_t *mp; + + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + mp = (drmach_mem_t *)id; + + /* + * This is only used by dr to round up/down the memory + * for copying. + */ + mem->mi_alignment_mask = mp->mem_alignment - 1; + mem->mi_basepa = mp->base_pa; + mem->mi_size = mp->nbytes; + mem->mi_slice_base = mp->slice_base; + mem->mi_slice_top = mp->slice_top; + mem->mi_slice_size = mp->slice_size; + + return (NULL); +} + +sbd_error_t * +drmach_mem_get_slice_info(drmachid_t id, + uint64_t *bp, uint64_t *ep, uint64_t *sp) +{ + drmach_mem_t *mp; + + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + mp = (drmach_mem_t *)id; + + if (bp) + *bp = mp->slice_base; + if (ep) + *ep = mp->slice_top; + if (sp) + *sp = mp->slice_size; + + return (NULL); +} + +sbd_error_t * +drmach_mem_get_memlist(drmachid_t id, struct memlist **ml) +{ +#ifdef DEBUG + int rv; +#endif + drmach_mem_t *mem; + struct memlist *mlist; + + if (!DRMACH_IS_MEM_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + mem = (drmach_mem_t *)id; + + mlist = memlist_dup(mem->memlist); + *ml = mlist; + +#ifdef DEBUG + /* + * Make sure the incoming memlist doesn't already + * intersect with what's present in the system (phys_install). 
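+ * Any overlap means the span is already configured, so the derived
+ * list is dumped for inspection, freed, and an internal error is
+ * returned instead.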
+ */ + memlist_read_lock(); + rv = memlist_intersect(phys_install, mlist); + memlist_read_unlock(); + if (rv) { + DRMACH_PR("Derived memlist intersects with phys_install\n"); + memlist_dump(mlist); + + DRMACH_PR("phys_install memlist:\n"); + memlist_dump(phys_install); + + memlist_delete(mlist); + return (DRMACH_INTERNAL_ERROR()); + } + + DRMACH_PR("Derived memlist:"); + memlist_dump(mlist); +#endif + + return (NULL); +} + +processorid_t +drmach_mem_cpu_affinity(drmachid_t id) +{ + _NOTE(ARGUNUSED(id)); + + return (CPU_CURRENT); +} + +int +drmach_copy_rename_need_suspend(drmachid_t id) +{ + _NOTE(ARGUNUSED(id)); + + return (0); +} + +/* + * IO specific interfaces to support dr driver + */ +sbd_error_t * +drmach_io_pre_release(drmachid_t id) +{ + if (!DRMACH_IS_IO_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +sbd_error_t * +drmach_io_unrelease(drmachid_t id) +{ + if (!DRMACH_IS_IO_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +sbd_error_t * +drmach_io_post_release(drmachid_t id) +{ + _NOTE(ARGUNUSED(id)); + + return (NULL); +} + +sbd_error_t * +drmach_io_post_attach(drmachid_t id) +{ + if (!DRMACH_IS_IO_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + + return (NULL); +} + +sbd_error_t * +drmach_io_is_attached(drmachid_t id, int *yes) +{ + drmach_device_t *dp; + dev_info_t *dip; + int state; + + if (!DRMACH_IS_IO_ID(id)) + return (drerr_new(0, EX86_INAPPROP, NULL)); + dp = id; + + dip = dp->node->getdip(dp->node); + if (dip == NULL) { + *yes = 0; + return (NULL); + } + + state = ddi_get_devstate(dip); + *yes = ((i_ddi_node_state(dip) >= DS_ATTACHED) || + (state == DDI_DEVSTATE_UP)); + + return (NULL); +} + +/* + * Miscellaneous interfaces to support dr driver + */ +int +drmach_verify_sr(dev_info_t *dip, int sflag) +{ + _NOTE(ARGUNUSED(dip, sflag)); + + return (0); +} + +void +drmach_suspend_last(void) +{ +} + +void +drmach_resume_first(void) +{ +} + +/* + * Log a DR sysevent. + * Return value: 0 success, non-zero failure. + */ +int +drmach_log_sysevent(int board, char *hint, int flag, int verbose) +{ + sysevent_t *ev = NULL; + sysevent_id_t eid; + int rv, km_flag; + sysevent_value_t evnt_val; + sysevent_attr_list_t *evnt_attr_list = NULL; + sbd_error_t *err; + char attach_pnt[MAXNAMELEN]; + + km_flag = (flag == SE_SLEEP) ? KM_SLEEP : KM_NOSLEEP; + attach_pnt[0] = '\0'; + err = drmach_board_name(board, attach_pnt, MAXNAMELEN); + if (err != NULL) { + sbd_err_clear(&err); + rv = -1; + goto logexit; + } + if (verbose) { + DRMACH_PR("drmach_log_sysevent: %s %s, flag: %d, verbose: %d\n", + attach_pnt, hint, flag, verbose); + } + + if ((ev = sysevent_alloc(EC_DR, ESC_DR_AP_STATE_CHANGE, + SUNW_KERN_PUB"dr", km_flag)) == NULL) { + rv = -2; + goto logexit; + } + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = attach_pnt; + if ((rv = sysevent_add_attr(&evnt_attr_list, DR_AP_ID, &evnt_val, + km_flag)) != 0) + goto logexit; + + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = hint; + if ((rv = sysevent_add_attr(&evnt_attr_list, DR_HINT, &evnt_val, + km_flag)) != 0) { + sysevent_free_attr(evnt_attr_list); + goto logexit; + } + + (void) sysevent_attach_attributes(ev, evnt_attr_list); + + /* + * Log the event but do not sleep waiting for its + * delivery. This provides insulation from syseventd. 
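+ * If the event cannot be queued, the failure is passed back through
+ * the return value; callers that cannot act on it simply ignore it.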
+ */ + rv = log_sysevent(ev, SE_NOSLEEP, &eid); + +logexit: + if (ev) + sysevent_free(ev); + if ((rv != 0) && verbose) + cmn_err(CE_WARN, "!drmach_log_sysevent failed (rv %d) for %s " + " %s\n", rv, attach_pnt, hint); + + return (rv); +} diff --git a/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.h b/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.h new file mode 100644 index 0000000000..abbc264ef0 --- /dev/null +++ b/usr/src/uts/i86pc/io/acpi/drmach_acpi/drmach_acpi.h @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +#ifndef _SYS_DRMACH_ACPI_H +#define _SYS_DRMACH_ACPI_H +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/param.h> +#include <sys/sunddi.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/acpidev.h> +#include <sys/drmach.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* Use ACPI handle as DRMACH handle on x86 systems. */ +#define DRMACH_HANDLE ACPI_HANDLE + +/* Macros to deal with object type. 
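+ * Every drmach object starts with a drmach_common_t whose isa field
+ * points at the constructor that created it, so an id is classified
+ * by comparing isa against the known constructors.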
*/ +#define DRMACH_OBJ(id) ((drmach_common_t *)id) + +#define DRMACH_NULL_ID(id) ((id) == 0) + +#define DRMACH_IS_BOARD_ID(id) \ + ((id != 0) && (DRMACH_OBJ(id)->isa == (void *)drmach_board_new)) + +#define DRMACH_IS_CPU_ID(id) \ + ((id != 0) && (DRMACH_OBJ(id)->isa == (void *)drmach_cpu_new)) + +#define DRMACH_IS_MEM_ID(id) \ + ((id != 0) && (DRMACH_OBJ(id)->isa == (void *)drmach_mem_new)) + +#define DRMACH_IS_IO_ID(id) \ + ((id != 0) && (DRMACH_OBJ(id)->isa == (void *)drmach_io_new)) + +#define DRMACH_IS_DEVICE_ID(id) \ + ((id != 0) && \ + (DRMACH_OBJ(id)->isa == (void *)drmach_cpu_new || \ + DRMACH_OBJ(id)->isa == (void *)drmach_mem_new || \ + DRMACH_OBJ(id)->isa == (void *)drmach_io_new)) + +#define DRMACH_IS_ID(id) \ + ((id != 0) && \ + (DRMACH_OBJ(id)->isa == (void *)drmach_board_new || \ + DRMACH_OBJ(id)->isa == (void *)drmach_cpu_new || \ + DRMACH_OBJ(id)->isa == (void *)drmach_mem_new || \ + DRMACH_OBJ(id)->isa == (void *)drmach_io_new)) + +#define DRMACH_INTERNAL_ERROR() \ + drerr_new(1, EX86_INTERNAL, drmach_ie_fmt, __LINE__) + +#ifdef DEBUG +extern int drmach_debug; + +#define DRMACH_PR if (drmach_debug) printf +#else +#define DRMACH_PR _NOTE(CONSTANTCONDITION) if (0) printf +#endif /* DEBUG */ + +typedef struct { + struct drmach_node *node; + void *data; + void *func; +} drmach_node_walk_args_t; + +typedef struct drmach_node { + void *here; + + DRMACH_HANDLE (*get_dnode)(struct drmach_node *node); + dev_info_t *(*getdip)(struct drmach_node *node); + int (*getproplen)(struct drmach_node *node, char *name, + int *len); + int (*getprop)(struct drmach_node *node, char *name, + void *buf, int len); + int (*walk)(struct drmach_node *node, void *data, + int (*cb)(drmach_node_walk_args_t *args)); +} drmach_node_t; + +typedef struct { + int min_index; + int max_index; + int arr_sz; + drmachid_t *arr; +} drmach_array_t; + +typedef struct { + void *isa; + + void (*dispose)(drmachid_t); + sbd_error_t *(*release)(drmachid_t); + sbd_error_t *(*status)(drmachid_t, drmach_status_t *); + + char name[MAXNAMELEN]; +} drmach_common_t; + +typedef struct { + drmach_common_t cm; + uint_t bnum; + int assigned; + int powered; + int connected; + int cond; + drmach_node_t *tree; + drmach_array_t *devices; + int boot_board; /* if board exists on bootup */ +} drmach_board_t; + +typedef struct { + drmach_common_t cm; + drmach_board_t *bp; + int unum; + uint_t portid; + int busy; + int powered; + const char *type; + drmach_node_t *node; +} drmach_device_t; + +typedef struct drmach_cpu { + drmach_device_t dev; + processorid_t cpuid; + uint32_t apicid; +} drmach_cpu_t; + +typedef struct drmach_mem { + drmach_device_t dev; + uint64_t mem_alignment; + uint64_t slice_base; + uint64_t slice_top; + uint64_t slice_size; + uint64_t base_pa; /* lowest installed memory base */ + uint64_t nbytes; /* size of installed memory */ + struct memlist *memlist; +} drmach_mem_t; + +typedef struct drmach_io { + drmach_device_t dev; +} drmach_io_t; + +typedef struct drmach_domain_info { + uint64_t floating; + int allow_dr; +} drmach_domain_info_t; + +typedef struct { + drmach_board_t *obj; + int ndevs; + void *a; + sbd_error_t *(*found)(void *a, const char *, int, drmachid_t); + sbd_error_t *err; +} drmach_board_cb_data_t; + +extern drmach_domain_info_t drmach_domain; + +extern drmach_board_t *drmach_board_new(uint_t, int); +extern sbd_error_t *drmach_device_new(drmach_node_t *, + drmach_board_t *, int, drmachid_t *); +extern sbd_error_t *drmach_cpu_new(drmach_device_t *, drmachid_t *); +extern sbd_error_t 
*drmach_mem_new(drmach_device_t *, drmachid_t *); +extern sbd_error_t *drmach_io_new(drmach_device_t *, drmachid_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DRMACH_ACPI_H */ diff --git a/usr/src/uts/i86pc/io/acpi_drv/acpi_drv.c b/usr/src/uts/i86pc/io/acpi_drv/acpi_drv.c index 3a237509ba..cece3f77f2 100644 --- a/usr/src/uts/i86pc/io/acpi_drv/acpi_drv.c +++ b/usr/src/uts/i86pc/io/acpi_drv/acpi_drv.c @@ -919,7 +919,7 @@ acpi_drv_obj_copy(ACPI_OBJECT *op, char *bp, struct obj_desc *dp) ep->Type == dp->type) { ACPI_DRV_DBG(CE_NOTE, NULL, "\t%s: \"%s\"", dp->name, ep->String.Pointer); - (void) strncpy(fp, ep->String.Pointer, dp->size); + (void) strlcpy(fp, ep->String.Pointer, dp->size); } else if (dp->type == ACPI_TYPE_STRING && ep->Type == ACPI_TYPE_BUFFER) { #ifdef DEBUG @@ -1559,9 +1559,9 @@ acpi_drv_obj_init(struct acpi_drv_dev *p) if ((info->Valid & ACPI_VALID_HID) == 0) { ACPI_DRV_DBG(CE_WARN, NULL, "AcpiGetObjectInfo(): _HID not available"); - (void) strncpy(p->hid, "\0", ID_LEN); + p->hid[0] = 0; } else { - (void) strncpy(p->hid, info->HardwareId.String, ID_LEN); + (void) strlcpy(p->hid, info->HardwareId.String, ID_LEN); } /* @@ -1572,9 +1572,9 @@ acpi_drv_obj_init(struct acpi_drv_dev *p) ACPI_DRV_DBG(CE_WARN, NULL, "AcpiGetObjectInfo(): _UID not available"); /* Use 0 as the default _UID */ - (void) strncpy(p->uid, "\0", ID_LEN); + p->uid[0] = 0; } else { - (void) strncpy(p->uid, info->UniqueId.String, ID_LEN); + (void) strlcpy(p->uid, info->UniqueId.String, ID_LEN); } AcpiOsFree(info); @@ -2064,9 +2064,9 @@ acpi_drv_dev_init(struct acpi_drv_dev *p) if ((info->Valid & ACPI_VALID_HID) == 0) { ACPI_DRV_DBG(CE_WARN, NULL, "!AcpiGetObjectInfo(): _HID not available"); - (void) strncpy(p->hid, "\0", ID_LEN); + p->hid[0] = 0; } else { - (void) strncpy(p->hid, info->HardwareId.String, ID_LEN); + (void) strlcpy(p->hid, info->HardwareId.String, ID_LEN); } /* @@ -2077,9 +2077,9 @@ acpi_drv_dev_init(struct acpi_drv_dev *p) ACPI_DRV_DBG(CE_WARN, NULL, "!AcpiGetObjectInfo(): _UID not available"); /* Use 0 as the default _UID */ - (void) strncpy(p->uid, "\0", ID_LEN); + p->uid[0] = 0; } else { - (void) strncpy(p->uid, info->UniqueId.String, ID_LEN); + (void) strlcpy(p->uid, info->UniqueId.String, ID_LEN); } if (info->Valid & ACPI_VALID_ADR) { diff --git a/usr/src/uts/i86pc/io/cbe.c b/usr/src/uts/i86pc/io/cbe.c index d5e08c63bc..87ec248717 100644 --- a/usr/src/uts/i86pc/io/cbe.c +++ b/usr/src/uts/i86pc/io/cbe.c @@ -40,6 +40,7 @@ #include <sys/ddi_impldefs.h> #include <sys/ddi_intr.h> #include <sys/avintr.h> +#include <sys/note.h> static int cbe_vector; static int cbe_ticks = 0; @@ -207,6 +208,13 @@ cbe_configure(cpu_t *cpu) return (cpu); } +void +cbe_unconfigure(void *arg) +{ + _NOTE(ARGUNUSED(arg)); + ASSERT(!CPU_IN_SET(cbe_enabled, ((cpu_t *)arg)->cpu_id)); +} + #ifndef __xpv /* * declarations needed for time adjustment @@ -318,7 +326,7 @@ cbe_init(void) { cyc_backend_t cbe = { cbe_configure, /* cyb_configure */ - NULL, /* cyb_unconfigure */ + cbe_unconfigure, /* cyb_unconfigure */ cbe_enable, /* cyb_enable */ cbe_disable, /* cyb_disable */ cbe_reprogram, /* cyb_reprogram */ diff --git a/usr/src/uts/i86pc/io/dr/dr.c b/usr/src/uts/i86pc/io/dr/dr.c new file mode 100644 index 0000000000..90f3a80adf --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr.c @@ -0,0 +1,3329 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * PIM-DR layer of DR driver. Provides interface between user + * level applications and the PSM-DR layer. + */ + +#include <sys/note.h> +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/dditypes.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/poll.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/processor.h> +#include <sys/cpuvar.h> +#include <sys/mem_config.h> + +#include <sys/autoconf.h> +#include <sys/cmn_err.h> + +#include <sys/ddi_impldefs.h> +#include <sys/promif.h> +#include <sys/machsystm.h> + +#include <sys/dr.h> +#include <sys/drmach.h> +#include <sys/dr_util.h> + +extern int nulldev(); +extern int nodev(); +extern struct memlist *phys_install; + +#ifdef DEBUG +uint_t dr_debug = 0; /* dr.h for bit values */ +#endif /* DEBUG */ + +/* + * NOTE: state_str, nt_str and SBD_CMD_STR are only used in a debug + * kernel. They are, however, referenced during both debug and non-debug + * compiles. + */ + +static char *state_str[] = { + "EMPTY", "OCCUPIED", "CONNECTED", "UNCONFIGURED", + "PARTIAL", "CONFIGURED", "RELEASE", "UNREFERENCED", + "FATAL" +}; + +#define SBD_CMD_STR(c) \ + (((c) == SBD_CMD_ASSIGN) ? "ASSIGN" : \ + ((c) == SBD_CMD_UNASSIGN) ? "UNASSIGN" : \ + ((c) == SBD_CMD_POWERON) ? "POWERON" : \ + ((c) == SBD_CMD_POWEROFF) ? "POWEROFF" : \ + ((c) == SBD_CMD_TEST) ? "TEST" : \ + ((c) == SBD_CMD_CONNECT) ? "CONNECT" : \ + ((c) == SBD_CMD_DISCONNECT) ? "DISCONNECT" : \ + ((c) == SBD_CMD_CONFIGURE) ? "CONFIGURE" : \ + ((c) == SBD_CMD_UNCONFIGURE) ? "UNCONFIGURE" : \ + ((c) == SBD_CMD_GETNCM) ? "GETNCM" : \ + ((c) == SBD_CMD_PASSTHRU) ? "PASSTHRU" : \ + ((c) == SBD_CMD_STATUS) ? "STATUS" : "unknown") + +#define DR_GET_BOARD_DEVUNIT(sb, ut, un) (&((sb)->b_dev[DEVSET_NIX(ut)][un])) + +#define DR_MAKE_MINOR(i, b) (((i) << 16) | (b)) +#define DR_MINOR2INST(m) (((m) >> 16) & 0xffff) +#define DR_MINOR2BNUM(m) ((m) & 0xffff) + +/* for the DR*INTERNAL_ERROR macros. see sys/dr.h. 
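+ * The format string records the line number at which an internal
+ * error was raised.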
*/ +static char *dr_ie_fmt = "dr.c %d"; + +/* struct for drmach device name to sbd_comp_type_t mapping */ +typedef struct { + char *s_devtype; + sbd_comp_type_t s_nodetype; +} dr_devname_t; + +/* struct to map starfire device attributes - name:sbd_comp_type_t */ +static dr_devname_t dr_devattr[] = { + { DRMACH_DEVTYPE_MEM, SBD_COMP_MEM }, + { DRMACH_DEVTYPE_CPU, SBD_COMP_CPU }, + { DRMACH_DEVTYPE_PCI, SBD_COMP_IO }, +#if defined(DRMACH_DEVTYPE_SBUS) + { DRMACH_DEVTYPE_SBUS, SBD_COMP_IO }, +#endif +#if defined(DRMACH_DEVTYPE_WCI) + { DRMACH_DEVTYPE_WCI, SBD_COMP_IO }, +#endif + /* last s_devtype must be NULL, s_nodetype must be SBD_COMP_UNKNOWN */ + { NULL, SBD_COMP_UNKNOWN } +}; + +/* + * Per instance soft-state structure. + */ +typedef struct dr_softstate { + dev_info_t *dip; + dr_board_t *boards; + kmutex_t i_lock; + int dr_initialized; +} dr_softstate_t; + +/* + * dr Global data elements + */ +struct dr_global { + dr_softstate_t *softsp; /* pointer to initialize soft state */ + kmutex_t lock; +} dr_g; + +dr_unsafe_devs_t dr_unsafe_devs; + +/* + * Table of known passthru commands. + */ +struct { + char *pt_name; + int (*pt_func)(dr_handle_t *); +} pt_arr[] = { + "quiesce", dr_pt_test_suspend, +}; + +int dr_modunload_okay = 0; /* set to non-zero to allow unload */ + +/* + * State transition table. States valid transitions for "board" state. + * Recall that non-zero return value terminates operation, however + * the herrno value is what really indicates an error , if any. + */ +static int +_cmd2index(int c) +{ + /* + * Translate DR CMD to index into dr_state_transition. + */ + switch (c) { + case SBD_CMD_CONNECT: return (0); + case SBD_CMD_DISCONNECT: return (1); + case SBD_CMD_CONFIGURE: return (2); + case SBD_CMD_UNCONFIGURE: return (3); + case SBD_CMD_ASSIGN: return (4); + case SBD_CMD_UNASSIGN: return (5); + case SBD_CMD_POWERON: return (6); + case SBD_CMD_POWEROFF: return (7); + case SBD_CMD_TEST: return (8); + default: return (-1); + } +} + +#define CMD2INDEX(c) _cmd2index(c) + +static struct dr_state_trans { + int x_cmd; + struct { + int x_rv; /* return value of pre_op */ + int x_err; /* error, if any */ + } x_op[DR_STATE_MAX]; +} dr_state_transition[] = { + { SBD_CMD_CONNECT, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_DISCONNECT, + { + { -1, ESBD_STATE }, /* empty */ + { 0, 0 }, /* occupied */ + { 0, 0 }, /* connected */ + { 0, 0 }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_CONFIGURE, + { + { -1, ESBD_STATE }, /* empty */ + { -1, ESBD_STATE }, /* occupied */ + { 0, 0 }, /* connected */ + { 0, 0 }, /* unconfigured */ + { 0, 0 }, /* partial */ + { 0, 0 }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_UNCONFIGURE, + { + { -1, ESBD_STATE }, /* empty */ + { -1, ESBD_STATE }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { 0, 0 }, /* partial */ + { 0, 0 }, /* configured */ + { 0, 0 }, /* release */ + { 0, 0 }, /* unreferenced */ + { -1, 
ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_ASSIGN, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_UNASSIGN, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_POWERON, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_POWEROFF, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, + { SBD_CMD_TEST, + { + { 0, 0 }, /* empty */ + { 0, 0 }, /* occupied */ + { -1, ESBD_STATE }, /* connected */ + { -1, ESBD_STATE }, /* unconfigured */ + { -1, ESBD_STATE }, /* partial */ + { -1, ESBD_STATE }, /* configured */ + { -1, ESBD_STATE }, /* release */ + { -1, ESBD_STATE }, /* unreferenced */ + { -1, ESBD_FATAL_STATE }, /* fatal */ + } + }, +}; + +/* + * Global R/W lock to synchronize access across + * multiple boards. Users wanting multi-board access + * must grab WRITE lock, others must grab READ lock. + */ +krwlock_t dr_grwlock; + +/* + * Head of the boardlist used as a reference point for + * locating board structs. + * TODO: eliminate dr_boardlist + */ +dr_board_t *dr_boardlist; + +/* + * DR support functions. + */ +static dr_devset_t dr_dev2devset(sbd_comp_id_t *cid); +static int dr_check_transition(dr_board_t *bp, + dr_devset_t *devsetp, + struct dr_state_trans *transp, + int cmd); +static int dr_check_unit_attached(dr_common_unit_t *dp); +static sbd_error_t *dr_init_devlists(dr_board_t *bp); +static void dr_board_discovery(dr_board_t *bp); +static int dr_board_init(dr_board_t *bp, dev_info_t *dip, int bd); +static void dr_board_destroy(dr_board_t *bp); +static void dr_board_transition(dr_board_t *bp, dr_state_t st); + +/* + * DR driver (DDI) entry points. + */ +static int dr_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, + void *arg, void **result); +static int dr_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int dr_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int dr_probe(dev_info_t *dip); +static int dr_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *cred_p, int *rval_p); +static int dr_close(dev_t dev, int flag, int otyp, cred_t *cred_p); +static int dr_open(dev_t *dev, int flag, int otyp, cred_t *cred_p); + +/* + * DR command processing operations. 
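+ * These handle the individual SBD_CMD_* requests; dr_exec_op()
+ * dispatches to them once dr_pre_op() has validated the requested
+ * state transition.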
+ */ +static int dr_copyin_iocmd(dr_handle_t *hp); +static int dr_copyout_iocmd(dr_handle_t *hp); +static int dr_copyout_errs(dr_handle_t *hp); +static int dr_pre_op(dr_handle_t *hp); +static int dr_post_op(dr_handle_t *hp, int rv); +static int dr_exec_op(dr_handle_t *hp); +static void dr_assign_board(dr_handle_t *hp); +static void dr_unassign_board(dr_handle_t *hp); +static void dr_connect(dr_handle_t *hp); +static int dr_disconnect(dr_handle_t *hp); +static void dr_dev_configure(dr_handle_t *hp); +static void dr_dev_release(dr_handle_t *hp); +static int dr_dev_unconfigure(dr_handle_t *hp); +static void dr_dev_cancel(dr_handle_t *hp); +static int dr_dev_status(dr_handle_t *hp); +static int dr_get_ncm(dr_handle_t *hp); +static int dr_pt_ioctl(dr_handle_t *hp); +static void dr_poweron_board(dr_handle_t *hp); +static void dr_poweroff_board(dr_handle_t *hp); +static void dr_test_board(dr_handle_t *hp); + +/* + * Autoconfiguration data structures + */ +struct cb_ops dr_cb_ops = { + dr_open, /* open */ + dr_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + dr_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* struct streamtab */ + D_NEW | D_MP | D_MTSAFE, /* compatibility flags */ + CB_REV, /* Rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +struct dev_ops dr_dev_ops = { + DEVO_REV, /* build version */ + 0, /* dev ref count */ + dr_getinfo, /* getinfo */ + nulldev, /* identify */ + dr_probe, /* probe */ + dr_attach, /* attach */ + dr_detach, /* detach */ + nodev, /* reset */ + &dr_cb_ops, /* cb_ops */ + (struct bus_ops *)NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +extern struct mod_ops mod_driverops; + +static struct modldrv modldrv = { + &mod_driverops, + "Dynamic Reconfiguration", + &dr_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +/* + * Driver entry points. + */ +int +_init(void) +{ + int err; + + /* + * If you need to support multiple nodes (instances), then + * whatever the maximum number of supported nodes is would + * need to passed as the third parameter to ddi_soft_state_init(). + * Alternative would be to dynamically fini and re-init the + * soft state structure each time a node is attached. + */ + err = ddi_soft_state_init((void **)&dr_g.softsp, + sizeof (dr_softstate_t), 1); + if (err) + return (err); + + mutex_init(&dr_g.lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&dr_grwlock, NULL, RW_DEFAULT, NULL); + + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) + return (err); + + mutex_destroy(&dr_g.lock); + rw_destroy(&dr_grwlock); + + ddi_soft_state_fini((void **)&dr_g.softsp); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/*ARGSUSED1*/ +static int +dr_open(dev_t *dev, int flag, int otyp, cred_t *cred_p) +{ + int instance; + dr_softstate_t *softsp; + dr_board_t *bp; + + /* + * Don't open unless we've attached. 
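+ * The minor number encodes both the driver instance and the board
+ * number (see DR_MAKE_MINOR): for example, instance 1, board 3 maps
+ * to minor (1 << 16) | 3 == 0x10003.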
+ */ + instance = DR_MINOR2INST(getminor(*dev)); + softsp = ddi_get_soft_state(dr_g.softsp, instance); + if (softsp == NULL) + return (ENXIO); + + mutex_enter(&softsp->i_lock); + if (!softsp->dr_initialized) { + int bd; + int rv = 0; + + bp = softsp->boards; + + /* initialize each array element */ + for (bd = 0; bd < MAX_BOARDS; bd++, bp++) { + rv = dr_board_init(bp, softsp->dip, bd); + if (rv) + break; + } + + if (rv == 0) { + softsp->dr_initialized = 1; + } else { + /* destroy elements initialized thus far */ + while (--bp >= softsp->boards) + dr_board_destroy(bp); + + /* TODO: should this be another errno val ? */ + mutex_exit(&softsp->i_lock); + return (ENXIO); + } + } + mutex_exit(&softsp->i_lock); + + bp = &softsp->boards[DR_MINOR2BNUM(getminor(*dev))]; + + /* + * prevent opening of a dyn-ap for a board + * that does not exist + */ + if (!bp->b_assigned) { + if (drmach_board_lookup(bp->b_num, &bp->b_id) != 0) + return (ENODEV); + } + + return (0); +} + +/*ARGSUSED*/ +static int +dr_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} + +/* + * Enable/disable DR features. + */ +int dr_enable = 1; + +/*ARGSUSED3*/ +static int +dr_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *cred_p, int *rval_p) +{ + static int dr_dev_type_to_nt(char *); + + int rv = 0; + int instance; + int bd; + dr_handle_t *hp; + dr_softstate_t *softsp; + static fn_t f = "dr_ioctl"; + + PR_ALL("%s...\n", f); + + instance = DR_MINOR2INST(getminor(dev)); + softsp = ddi_get_soft_state(dr_g.softsp, instance); + if (softsp == NULL) { + cmn_err(CE_WARN, "dr%d: module not yet attached", instance); + return (ENXIO); + } + + if (!dr_enable) { + switch (cmd) { + case SBD_CMD_STATUS: + case SBD_CMD_GETNCM: + case SBD_CMD_PASSTHRU: + break; + default: + return (ENOTSUP); + } + } + + bd = DR_MINOR2BNUM(getminor(dev)); + if (bd >= MAX_BOARDS) + return (ENXIO); + + /* get and initialize storage for new handle */ + hp = GETSTRUCT(dr_handle_t, 1); + hp->h_bd = &softsp->boards[bd]; + hp->h_err = NULL; + hp->h_dev = getminor(dev); + hp->h_cmd = cmd; + hp->h_mode = mode; + hp->h_iap = (sbd_ioctl_arg_t *)arg; + + /* copy sbd command into handle */ + rv = dr_copyin_iocmd(hp); + if (rv) { + FREESTRUCT(hp, dr_handle_t, 1); + return (EINVAL); + } + + /* translate canonical name to component type */ + if (hp->h_sbdcmd.cmd_cm.c_id.c_name[0] != '\0') { + hp->h_sbdcmd.cmd_cm.c_id.c_type = + dr_dev_type_to_nt(hp->h_sbdcmd.cmd_cm.c_id.c_name); + + PR_ALL("%s: c_name = %s, c_type = %d\n", + f, + hp->h_sbdcmd.cmd_cm.c_id.c_name, + hp->h_sbdcmd.cmd_cm.c_id.c_type); + } else { + /*EMPTY*/ + PR_ALL("%s: c_name is NULL\n", f); + } + + /* determine scope of operation */ + hp->h_devset = dr_dev2devset(&hp->h_sbdcmd.cmd_cm.c_id); + + switch (hp->h_cmd) { + case SBD_CMD_STATUS: + case SBD_CMD_GETNCM: + /* no locks needed for these commands */ + break; + + default: + rw_enter(&dr_grwlock, RW_WRITER); + mutex_enter(&hp->h_bd->b_lock); + + /* + * If we're dealing with memory at all, then we have + * to keep the "exclusive" global lock held. This is + * necessary since we will probably need to look at + * multiple board structs. Otherwise, we only have + * to deal with the board in question and so can drop + * the global lock to "shared". 
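+ * The devset is checked below; only when no memory unit is involved
+ * is the write lock downgraded to shared.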
+ */ + rv = DEVSET_IN_SET(hp->h_devset, SBD_COMP_MEM, DEVSET_ANYUNIT); + if (rv == 0) + rw_downgrade(&dr_grwlock); + break; + } + rv = 0; + + if (rv == 0) + rv = dr_pre_op(hp); + if (rv == 0) { + rv = dr_exec_op(hp); + rv = dr_post_op(hp, rv); + } + + if (rv == -1) + rv = EIO; + + if (hp->h_err != NULL) + if (!(rv = dr_copyout_errs(hp))) + rv = EIO; + + /* undo locking, if any, done before dr_pre_op */ + switch (hp->h_cmd) { + case SBD_CMD_STATUS: + case SBD_CMD_GETNCM: + break; + + case SBD_CMD_ASSIGN: + case SBD_CMD_UNASSIGN: + case SBD_CMD_POWERON: + case SBD_CMD_POWEROFF: + case SBD_CMD_CONNECT: + case SBD_CMD_CONFIGURE: + case SBD_CMD_UNCONFIGURE: + case SBD_CMD_DISCONNECT: + /* Board changed state. Log a sysevent. */ + if (rv == 0) + (void) drmach_log_sysevent(hp->h_bd->b_num, "", + SE_SLEEP, 0); + /* Fall through */ + + default: + mutex_exit(&hp->h_bd->b_lock); + rw_exit(&dr_grwlock); + } + + if (hp->h_opts.size != 0) + FREESTRUCT(hp->h_opts.copts, char, hp->h_opts.size); + + FREESTRUCT(hp, dr_handle_t, 1); + + return (rv); +} + +/*ARGSUSED*/ +static int +dr_probe(dev_info_t *dip) +{ + return (DDI_PROBE_SUCCESS); +} + +static int +dr_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int rv, rv2; + int bd; + int instance; + sbd_error_t *err; + dr_softstate_t *softsp; + + instance = ddi_get_instance(dip); + + switch (cmd) { + case DDI_ATTACH: + rw_enter(&dr_grwlock, RW_WRITER); + + rv = ddi_soft_state_zalloc(dr_g.softsp, instance); + if (rv != DDI_SUCCESS) { + cmn_err(CE_WARN, "dr%d: failed to alloc soft-state", + instance); + return (DDI_FAILURE); + } + + /* initialize softstate structure */ + softsp = ddi_get_soft_state(dr_g.softsp, instance); + softsp->dip = dip; + + mutex_init(&softsp->i_lock, NULL, MUTEX_DRIVER, NULL); + + /* allocate board array (aka boardlist) */ + softsp->boards = GETSTRUCT(dr_board_t, MAX_BOARDS); + + /* TODO: eliminate dr_boardlist */ + dr_boardlist = softsp->boards; + + /* initialize each array element */ + rv = DDI_SUCCESS; + for (bd = 0; bd < MAX_BOARDS; bd++) { + dr_board_t *bp = &softsp->boards[bd]; + char *p, *name; + int l, minor_num; + + /* + * initialized board attachment point path + * (relative to pseudo) in a form immediately + * reusable as an cfgadm command argument. + * TODO: clean this up + */ + p = bp->b_path; + l = sizeof (bp->b_path); + (void) snprintf(p, l, "dr@%d:", instance); + while (*p != '\0') { + l--; + p++; + } + + name = p; + err = drmach_board_name(bd, p, l); + if (err) { + sbd_err_clear(&err); + rv = DDI_FAILURE; + break; + } + + minor_num = DR_MAKE_MINOR(instance, bd); + rv = ddi_create_minor_node(dip, name, S_IFCHR, + minor_num, DDI_NT_SBD_ATTACHMENT_POINT, NULL); + if (rv != DDI_SUCCESS) + rv = DDI_FAILURE; + } + + if (rv == DDI_SUCCESS) { + /* + * Announce the node's presence. + */ + ddi_report_dev(dip); + } else { + ddi_remove_minor_node(dip, NULL); + } + /* + * Init registered unsafe devs. 
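+ * The optional "unsupported-io-drivers" property names I/O drivers
+ * that DR must treat as unsafe; if the property is absent the list
+ * is simply left empty.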
+ */ + dr_unsafe_devs.devnames = NULL; + rv2 = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + "unsupported-io-drivers", &dr_unsafe_devs.devnames, + &dr_unsafe_devs.ndevs); + + if (rv2 != DDI_PROP_SUCCESS) + dr_unsafe_devs.ndevs = 0; + + rw_exit(&dr_grwlock); + return (rv); + + default: + return (DDI_FAILURE); + } + + /*NOTREACHED*/ +} + +static int +dr_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance; + dr_softstate_t *softsp; + + switch (cmd) { + case DDI_DETACH: + if (!dr_modunload_okay) + return (DDI_FAILURE); + + rw_enter(&dr_grwlock, RW_WRITER); + + instance = ddi_get_instance(dip); + softsp = ddi_get_soft_state(dr_g.softsp, instance); + + /* TODO: eliminate dr_boardlist */ + ASSERT(softsp->boards == dr_boardlist); + + /* remove all minor nodes */ + ddi_remove_minor_node(dip, NULL); + + if (softsp->dr_initialized) { + int bd; + + for (bd = 0; bd < MAX_BOARDS; bd++) + dr_board_destroy(&softsp->boards[bd]); + } + + FREESTRUCT(softsp->boards, dr_board_t, MAX_BOARDS); + mutex_destroy(&softsp->i_lock); + ddi_soft_state_free(dr_g.softsp, instance); + + rw_exit(&dr_grwlock); + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + /*NOTREACHED*/ +} + +static int +dr_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + _NOTE(ARGUNUSED(dip)) + + dev_t dev = (dev_t)arg; + int instance, error; + dr_softstate_t *softsp; + + *result = NULL; + error = DDI_SUCCESS; + instance = DR_MINOR2INST(getminor(dev)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + softsp = ddi_get_soft_state(dr_g.softsp, instance); + if (softsp == NULL) + return (DDI_FAILURE); + *result = (void *)softsp->dip; + break; + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + break; + + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +/* + * DR operations. 
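+ * dr_copyin_iocmd() and dr_copyout_iocmd() below translate the
+ * user-level sbd_cmd_t across data models before and after command
+ * execution.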
+ */ + +static int +dr_copyin_iocmd(dr_handle_t *hp) +{ + static fn_t f = "dr_copyin_iocmd"; + sbd_cmd_t *scp = &hp->h_sbdcmd; + + if (hp->h_iap == NULL) + return (EINVAL); + + bzero((caddr_t)scp, sizeof (sbd_cmd_t)); + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(hp->h_mode & FMODELS) == DDI_MODEL_ILP32) { + sbd_cmd32_t scmd32; + + bzero((caddr_t)&scmd32, sizeof (sbd_cmd32_t)); + + if (ddi_copyin((void *)hp->h_iap, (void *)&scmd32, + sizeof (sbd_cmd32_t), hp->h_mode)) { + cmn_err(CE_WARN, + "%s: (32bit) failed to copyin " + "sbdcmd-struct", f); + return (EFAULT); + } + scp->cmd_cm.c_id.c_type = scmd32.cmd_cm.c_id.c_type; + scp->cmd_cm.c_id.c_unit = scmd32.cmd_cm.c_id.c_unit; + bcopy(&scmd32.cmd_cm.c_id.c_name[0], + &scp->cmd_cm.c_id.c_name[0], OBP_MAXPROPNAME); + scp->cmd_cm.c_flags = scmd32.cmd_cm.c_flags; + scp->cmd_cm.c_len = scmd32.cmd_cm.c_len; + scp->cmd_cm.c_opts = (caddr_t)(uintptr_t)scmd32.cmd_cm.c_opts; + + switch (hp->h_cmd) { + case SBD_CMD_STATUS: + scp->cmd_stat.s_nbytes = scmd32.cmd_stat.s_nbytes; + scp->cmd_stat.s_statp = + (caddr_t)(uintptr_t)scmd32.cmd_stat.s_statp; + break; + default: + break; + + } + } else +#endif /* _MULTI_DATAMODEL */ + if (ddi_copyin((void *)hp->h_iap, (void *)scp, + sizeof (sbd_cmd_t), hp->h_mode) != 0) { + cmn_err(CE_WARN, + "%s: failed to copyin sbdcmd-struct", f); + return (EFAULT); + } + + if ((hp->h_opts.size = scp->cmd_cm.c_len) != 0) { + hp->h_opts.copts = GETSTRUCT(char, scp->cmd_cm.c_len + 1); + ++hp->h_opts.size; + if (ddi_copyin((void *)scp->cmd_cm.c_opts, + (void *)hp->h_opts.copts, + scp->cmd_cm.c_len, hp->h_mode) != 0) { + cmn_err(CE_WARN, "%s: failed to copyin options", f); + return (EFAULT); + } + } + + return (0); +} + +static int +dr_copyout_iocmd(dr_handle_t *hp) +{ + static fn_t f = "dr_copyout_iocmd"; + sbd_cmd_t *scp = &hp->h_sbdcmd; + + if (hp->h_iap == NULL) + return (EINVAL); + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(hp->h_mode & FMODELS) == DDI_MODEL_ILP32) { + sbd_cmd32_t scmd32; + + scmd32.cmd_cm.c_id.c_type = scp->cmd_cm.c_id.c_type; + scmd32.cmd_cm.c_id.c_unit = scp->cmd_cm.c_id.c_unit; + bcopy(&scp->cmd_cm.c_id.c_name[0], + &scmd32.cmd_cm.c_id.c_name[0], OBP_MAXPROPNAME); + + scmd32.cmd_cm.c_flags = scp->cmd_cm.c_flags; + scmd32.cmd_cm.c_len = scp->cmd_cm.c_len; + scmd32.cmd_cm.c_opts = (caddr32_t)(uintptr_t)scp->cmd_cm.c_opts; + + switch (hp->h_cmd) { + case SBD_CMD_GETNCM: + scmd32.cmd_getncm.g_ncm = scp->cmd_getncm.g_ncm; + break; + default: + break; + } + + if (ddi_copyout((void *)&scmd32, (void *)hp->h_iap, + sizeof (sbd_cmd32_t), hp->h_mode)) { + cmn_err(CE_WARN, + "%s: (32bit) failed to copyout " + "sbdcmd-struct", f); + return (EFAULT); + } + } else +#endif /* _MULTI_DATAMODEL */ + if (ddi_copyout((void *)scp, (void *)hp->h_iap, + sizeof (sbd_cmd_t), hp->h_mode) != 0) { + cmn_err(CE_WARN, + "%s: failed to copyout sbdcmd-struct", f); + return (EFAULT); + } + + return (0); +} + +static int +dr_copyout_errs(dr_handle_t *hp) +{ + static fn_t f = "dr_copyout_errs"; + + if (hp->h_err == NULL) + return (0); + + if (hp->h_err->e_code) { + PR_ALL("%s: error %d %s", + f, hp->h_err->e_code, hp->h_err->e_rsc); + } + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(hp->h_mode & FMODELS) == DDI_MODEL_ILP32) { + sbd_error32_t *serr32p; + + serr32p = GETSTRUCT(sbd_error32_t, 1); + + serr32p->e_code = hp->h_err->e_code; + bcopy(&hp->h_err->e_rsc[0], &serr32p->e_rsc[0], + MAXPATHLEN); + if (ddi_copyout((void *)serr32p, + (void *)&((sbd_ioctl_arg32_t *)hp->h_iap)->i_err, + sizeof (sbd_error32_t), 
hp->h_mode)) { + cmn_err(CE_WARN, + "%s: (32bit) failed to copyout", f); + return (EFAULT); + } + FREESTRUCT(serr32p, sbd_error32_t, 1); + } else +#endif /* _MULTI_DATAMODEL */ + if (ddi_copyout((void *)hp->h_err, + (void *)&hp->h_iap->i_err, + sizeof (sbd_error_t), hp->h_mode)) { + cmn_err(CE_WARN, + "%s: failed to copyout", f); + return (EFAULT); + } + + sbd_err_clear(&hp->h_err); + + return (0); + +} + +/* + * pre-op entry point must sbd_err_set_c(), if needed. + * Return value of non-zero indicates failure. + */ +static int +dr_pre_op(dr_handle_t *hp) +{ + int rv = 0, t; + int cmd, serr = 0; + dr_devset_t devset; + dr_board_t *bp = hp->h_bd; + dr_handle_t *shp = hp; + static fn_t f = "dr_pre_op"; + + cmd = hp->h_cmd; + devset = shp->h_devset; + + PR_ALL("%s (cmd = %s)...\n", f, SBD_CMD_STR(cmd)); + + devset = DEVSET_AND(devset, DR_DEVS_PRESENT(bp)); + hp->h_err = drmach_pre_op(cmd, bp->b_id, &hp->h_opts, &devset); + if (hp->h_err != NULL) { + PR_ALL("drmach_pre_op failed for cmd %s(%d)\n", + SBD_CMD_STR(cmd), cmd); + return (-1); + } + + /* + * Check for valid state transitions. + */ + if ((t = CMD2INDEX(cmd)) != -1) { + struct dr_state_trans *transp; + int state_err; + + transp = &dr_state_transition[t]; + ASSERT(transp->x_cmd == cmd); + + state_err = dr_check_transition(bp, &devset, transp, cmd); + + if (state_err < 0) { + /* + * Invalidate device. + */ + dr_op_err(CE_IGNORE, hp, ESBD_INVAL, NULL); + serr = -1; + PR_ALL("%s: invalid devset (0x%x)\n", + f, (uint_t)devset); + } else if (state_err != 0) { + /* + * State transition is not a valid one. + */ + dr_op_err(CE_IGNORE, hp, + transp->x_op[state_err].x_err, NULL); + + serr = transp->x_op[state_err].x_rv; + + PR_ALL("%s: invalid state %s(%d) for cmd %s(%d)\n", + f, state_str[state_err], state_err, + SBD_CMD_STR(cmd), cmd); + } else { + shp->h_devset = devset; + } + } + + if (serr) { + rv = -1; + } + + return (rv); +} + +static int +dr_post_op(dr_handle_t *hp, int rv) +{ + int cmd; + sbd_error_t *err; + dr_board_t *bp = hp->h_bd; + static fn_t f = "dr_post_op"; + + cmd = hp->h_cmd; + + PR_ALL("%s (cmd = %s)...\n", f, SBD_CMD_STR(cmd)); + + err = drmach_post_op(cmd, bp->b_id, &hp->h_opts, rv); + if (err != NULL) { + PR_ALL("drmach_post_op failed for cmd %s(%d)\n", + SBD_CMD_STR(cmd), cmd); + if (rv == 0) { + ASSERT(hp->h_err == NULL); + hp->h_err = err; + rv = -1; + } else if (hp->h_err == NULL) { + hp->h_err = err; + } else { + sbd_err_clear(&err); + } + } + + return (rv); +} + +static int +dr_exec_op(dr_handle_t *hp) +{ + int rv = 0; + static fn_t f = "dr_exec_op"; + + /* errors should have been caught by now */ + ASSERT(hp->h_err == NULL); + + switch (hp->h_cmd) { + case SBD_CMD_ASSIGN: + dr_assign_board(hp); + break; + + case SBD_CMD_UNASSIGN: + dr_unassign_board(hp); + break; + + case SBD_CMD_POWEROFF: + dr_poweroff_board(hp); + break; + + case SBD_CMD_POWERON: + dr_poweron_board(hp); + break; + + case SBD_CMD_TEST: + dr_test_board(hp); + break; + + case SBD_CMD_CONNECT: + dr_connect(hp); + break; + + case SBD_CMD_CONFIGURE: + dr_dev_configure(hp); + break; + + case SBD_CMD_UNCONFIGURE: + dr_dev_release(hp); + if (hp->h_err == NULL) + rv = dr_dev_unconfigure(hp); + else + dr_dev_cancel(hp); + break; + + case SBD_CMD_DISCONNECT: + rv = dr_disconnect(hp); + break; + + case SBD_CMD_STATUS: + rv = dr_dev_status(hp); + break; + + case SBD_CMD_GETNCM: + hp->h_sbdcmd.cmd_getncm.g_ncm = dr_get_ncm(hp); + rv = dr_copyout_iocmd(hp); + break; + + case SBD_CMD_PASSTHRU: + rv = dr_pt_ioctl(hp); + break; + + default: + cmn_err(CE_WARN, + 
"%s: unknown command (%d)", + f, hp->h_cmd); + break; + } + + if (hp->h_err != NULL) { + rv = -1; + } + + return (rv); +} + +static void +dr_assign_board(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + + hp->h_err = drmach_board_assign(bp->b_num, &bp->b_id); + if (hp->h_err == NULL) { + bp->b_assigned = 1; + } +} + +static void +dr_unassign_board(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + + /* + * Block out status during unassign. + * Not doing cv_wait_sig here as starfire SSP software + * ignores unassign failure and removes board from + * domain mask causing system panic. + * TODO: Change cv_wait to cv_wait_sig when SSP software + * handles unassign failure. + */ + dr_lock_status(bp); + + hp->h_err = drmach_board_unassign(bp->b_id); + if (hp->h_err == NULL) { + /* + * clear drmachid_t handle; not valid after board unassign + */ + bp->b_id = 0; + bp->b_assigned = 0; + } + + dr_unlock_status(bp); +} + +static void +dr_poweron_board(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + + hp->h_err = drmach_board_poweron(bp->b_id); +} + +static void +dr_poweroff_board(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + + hp->h_err = drmach_board_poweroff(bp->b_id); +} + +static void +dr_test_board(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + hp->h_err = drmach_board_test(bp->b_id, &hp->h_opts, + dr_cmd_flags(hp) & SBD_FLAG_FORCE); +} + +/* + * Create and populate the component nodes for a board. Assumes that the + * devlists for the board have been initialized. + */ +static void +dr_make_comp_nodes(dr_board_t *bp) +{ + int i; + + /* + * Make nodes for the individual components on the board. + * First we need to initialize memory unit data structures of board + * structure. + */ + for (i = 0; i < MAX_MEM_UNITS_PER_BOARD; i++) { + dr_mem_unit_t *mp; + + mp = dr_get_mem_unit(bp, i); + dr_init_mem_unit(mp); + } + + /* + * Initialize cpu unit data structures. + */ + for (i = 0; i < MAX_CPU_UNITS_PER_BOARD; i++) { + dr_cpu_unit_t *cp; + + cp = dr_get_cpu_unit(bp, i); + dr_init_cpu_unit(cp); + } + + /* + * Initialize io unit data structures. + */ + for (i = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + dr_io_unit_t *ip; + + ip = dr_get_io_unit(bp, i); + dr_init_io_unit(ip); + } + + dr_board_transition(bp, DR_STATE_CONNECTED); + + bp->b_rstate = SBD_STAT_CONNECTED; + bp->b_ostate = SBD_STAT_UNCONFIGURED; + bp->b_cond = SBD_COND_OK; + (void) drv_getparm(TIME, (void *)&bp->b_time); + +} + +/* + * Only do work if called to operate on an entire board + * which doesn't already have components present. + */ +static void +dr_connect(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + static fn_t f = "dr_connect"; + + PR_ALL("%s...\n", f); + + if (DR_DEVS_PRESENT(bp)) { + /* + * Board already has devices present. + */ + PR_ALL("%s: devices already present (" DEVSET_FMT_STR ")\n", + f, DEVSET_FMT_ARG(DR_DEVS_PRESENT(bp))); + return; + } + + hp->h_err = drmach_board_connect(bp->b_id, &hp->h_opts); + if (hp->h_err) + return; + + hp->h_err = dr_init_devlists(bp); + if (hp->h_err) + return; + else if (bp->b_ndev == 0) { + dr_op_err(CE_WARN, hp, ESBD_EMPTY_BD, bp->b_path); + return; + } else { + dr_make_comp_nodes(bp); + return; + } + /*NOTREACHED*/ +} + +static int +dr_disconnect(dr_handle_t *hp) +{ + int i; + dr_devset_t devset; + dr_board_t *bp = hp->h_bd; + static fn_t f = "dr_disconnect"; + + PR_ALL("%s...\n", f); + + /* + * Only devices which are present, but + * unattached can be disconnected. 
+ */ + devset = hp->h_devset & DR_DEVS_PRESENT(bp) & + DR_DEVS_UNATTACHED(bp); + + if ((devset == 0) && DR_DEVS_PRESENT(bp)) { + dr_op_err(CE_IGNORE, hp, ESBD_EMPTY_BD, bp->b_path); + return (0); + } + + /* + * Block out status during disconnect. + */ + mutex_enter(&bp->b_slock); + while (bp->b_sflags & DR_BSLOCK) { + if (cv_wait_sig(&bp->b_scv, &bp->b_slock) == 0) { + mutex_exit(&bp->b_slock); + return (EINTR); + } + } + bp->b_sflags |= DR_BSLOCK; + mutex_exit(&bp->b_slock); + + hp->h_err = drmach_board_disconnect(bp->b_id, &hp->h_opts); + if (hp->h_err && hp->h_err->e_code == EX86_WALK_DEPENDENCY) { + /* + * Other boards have dependency on this board. No device nodes + * have been destroyed so keep current board status. + */ + goto disconnect_done; + } + + DR_DEVS_DISCONNECT(bp, devset); + + ASSERT((DR_DEVS_ATTACHED(bp) & devset) == 0); + + /* + * Update per-device state transitions. + */ + for (i = 0; i < MAX_CPU_UNITS_PER_BOARD; i++) { + dr_cpu_unit_t *cp; + + if (!DEVSET_IN_SET(devset, SBD_COMP_CPU, i)) + continue; + + cp = dr_get_cpu_unit(bp, i); + if (dr_disconnect_cpu(cp) == 0) + dr_device_transition(&cp->sbc_cm, DR_STATE_EMPTY); + else if (cp->sbc_cm.sbdev_error != NULL) + DRERR_SET_C(&hp->h_err, &cp->sbc_cm.sbdev_error); + + ASSERT(cp->sbc_cm.sbdev_error == NULL); + } + + for (i = 0; i < MAX_MEM_UNITS_PER_BOARD; i++) { + dr_mem_unit_t *mp; + + if (!DEVSET_IN_SET(devset, SBD_COMP_MEM, i)) + continue; + + mp = dr_get_mem_unit(bp, i); + if (dr_disconnect_mem(mp) == 0) + dr_device_transition(&mp->sbm_cm, DR_STATE_EMPTY); + else if (mp->sbm_cm.sbdev_error != NULL) + DRERR_SET_C(&hp->h_err, &mp->sbm_cm.sbdev_error); + + ASSERT(mp->sbm_cm.sbdev_error == NULL); + } + + for (i = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + dr_io_unit_t *ip; + + if (!DEVSET_IN_SET(devset, SBD_COMP_IO, i)) + continue; + + ip = dr_get_io_unit(bp, i); + if (dr_disconnect_io(ip) == 0) + dr_device_transition(&ip->sbi_cm, DR_STATE_EMPTY); + else if (ip->sbi_cm.sbdev_error != NULL) + DRERR_SET_C(&hp->h_err, &ip->sbi_cm.sbdev_error); + + ASSERT(ip->sbi_cm.sbdev_error == NULL); + } + + if (hp->h_err) { + /* + * For certain errors, drmach_board_disconnect will mark + * the board as unusable; in these cases the devtree must + * be purged so that status calls will succeed. + * XXX + * This implementation checks for discrete error codes - + * someday, the i/f to drmach_board_disconnect should be + * changed to avoid the e_code testing. + */ + if (hp->h_err->e_code == EX86_DEPROBE) { + bp->b_ostate = SBD_STAT_UNCONFIGURED; + bp->b_busy = 0; + (void) drv_getparm(TIME, (void *)&bp->b_time); + + if (drmach_board_deprobe(bp->b_id)) + goto disconnect_done; + else + bp->b_ndev = 0; + } + } + + /* + * Once all the components on a board have been disconnect + * the board's state can transition to disconnected and + * we can allow the deprobe to take place. + */ + if (hp->h_err == NULL && DR_DEVS_PRESENT(bp) == 0) { + dr_board_transition(bp, DR_STATE_OCCUPIED); + bp->b_rstate = SBD_STAT_DISCONNECTED; + bp->b_ostate = SBD_STAT_UNCONFIGURED; + bp->b_busy = 0; + (void) drv_getparm(TIME, (void *)&bp->b_time); + + hp->h_err = drmach_board_deprobe(bp->b_id); + + if (hp->h_err == NULL) { + bp->b_ndev = 0; + dr_board_transition(bp, DR_STATE_EMPTY); + bp->b_rstate = SBD_STAT_EMPTY; + (void) drv_getparm(TIME, (void *)&bp->b_time); + } + } + +disconnect_done: + dr_unlock_status(bp); + + return (0); +} + +/* + * Check if a particular device is a valid target of the current + * operation. Return 1 if it is a valid target, and 0 otherwise. 
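+ * With present_only set, a target must be present but not attached
+ * (the configure direction); otherwise it must be both present and
+ * attached (the unconfigure direction).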
+ */ +static int +dr_dev_is_target(dr_dev_unit_t *dp, int present_only, uint_t uset) +{ + dr_common_unit_t *cp; + int is_present; + int is_attached; + + cp = &dp->du_common; + + /* check if the user requested this device */ + if ((uset & (1 << cp->sbdev_unum)) == 0) { + return (0); + } + + is_present = DR_DEV_IS_PRESENT(cp) ? 1 : 0; + is_attached = DR_DEV_IS_ATTACHED(cp) ? 1 : 0; + + /* + * If the present_only flag is set, a valid target + * must be present but not attached. Otherwise, it + * must be both present and attached. + */ + if (is_present && (present_only ^ is_attached)) { + /* sanity check */ + ASSERT(cp->sbdev_id != (drmachid_t)0); + + return (1); + } + + return (0); +} + +static void +dr_dev_make_list(dr_handle_t *hp, sbd_comp_type_t type, int present_only, + dr_common_unit_t ***devlist, int *devnum) +{ + dr_board_t *bp = hp->h_bd; + int unum; + int nunits; + uint_t uset; + int len; + dr_common_unit_t **list, **wp; + + switch (type) { + case SBD_COMP_CPU: + nunits = MAX_CPU_UNITS_PER_BOARD; + break; + case SBD_COMP_MEM: + nunits = MAX_MEM_UNITS_PER_BOARD; + break; + case SBD_COMP_IO: + nunits = MAX_IO_UNITS_PER_BOARD; + break; + default: + /* catch this in debug kernels */ + ASSERT(0); + break; + } + + /* allocate list storage. */ + len = sizeof (dr_common_unit_t *) * (nunits + 1); + list = kmem_zalloc(len, KM_SLEEP); + + /* record length of storage in first element */ + *list++ = (dr_common_unit_t *)(uintptr_t)len; + + /* get bit array signifying which units are to be involved */ + uset = DEVSET_GET_UNITSET(hp->h_devset, type); + + /* + * Adjust the loop count for CPU devices since all cores + * in a CMP will be examined in a single iteration. + */ + if (type == SBD_COMP_CPU) { + nunits = MAX_CMP_UNITS_PER_BOARD; + } + + /* populate list */ + for (wp = list, unum = 0; unum < nunits; unum++) { + dr_dev_unit_t *dp; + int core; + int cunum; + + dp = DR_GET_BOARD_DEVUNIT(bp, type, unum); + if (dr_dev_is_target(dp, present_only, uset)) { + *wp++ = &dp->du_common; + } + + /* further processing is only required for CPUs */ + if (type != SBD_COMP_CPU) { + continue; + } + + /* + * Add any additional cores from the current CPU + * device. This is to ensure that all the cores + * are grouped together in the device list, and + * consequently sequenced together during the actual + * operation. + */ + for (core = 1; core < MAX_CORES_PER_CMP; core++) { + cunum = DR_CMP_CORE_UNUM(unum, core); + dp = DR_GET_BOARD_DEVUNIT(bp, type, cunum); + + if (dr_dev_is_target(dp, present_only, uset)) { + *wp++ = &dp->du_common; + } + } + } + + /* calculate number of units in list, return result and list pointer */ + *devnum = wp - list; + *devlist = list; +} + +static void +dr_dev_clean_up(dr_handle_t *hp, dr_common_unit_t **list, int devnum) +{ + int len; + int n = 0; + dr_common_unit_t *cp, **rp = list; + + /* + * move first encountered unit error to handle if handle + * does not yet have a recorded error. 
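+ * The remaining unit errors are then cleared, so only the first
+ * failure encountered is reported to the caller.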
+ */ + if (hp->h_err == NULL) { + while (n++ < devnum) { + cp = *rp++; + if (cp->sbdev_error != NULL) { + hp->h_err = cp->sbdev_error; + cp->sbdev_error = NULL; + break; + } + } + } + + /* free remaining unit errors */ + while (n++ < devnum) { + cp = *rp++; + if (cp->sbdev_error != NULL) { + sbd_err_clear(&cp->sbdev_error); + cp->sbdev_error = NULL; + } + } + + /* free list */ + list -= 1; + len = (int)(uintptr_t)list[0]; + kmem_free(list, len); +} + +static int +dr_dev_walk(dr_handle_t *hp, sbd_comp_type_t type, int present_only, + int (*pre_op)(dr_handle_t *, dr_common_unit_t **, int), + void (*op)(dr_handle_t *, dr_common_unit_t *), + int (*post_op)(dr_handle_t *, dr_common_unit_t **, int), + void (*board_op)(dr_handle_t *, dr_common_unit_t **, int)) +{ + int devnum, rv; + dr_common_unit_t **devlist; + + dr_dev_make_list(hp, type, present_only, &devlist, &devnum); + + rv = 0; + if (devnum > 0) { + rv = (*pre_op)(hp, devlist, devnum); + if (rv == 0) { + int n; + + for (n = 0; n < devnum; n++) + (*op)(hp, devlist[n]); + + rv = (*post_op)(hp, devlist, devnum); + + (*board_op)(hp, devlist, devnum); + } + } + + dr_dev_clean_up(hp, devlist, devnum); + return (rv); +} + +/*ARGSUSED*/ +static int +dr_dev_noop(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + return (0); +} + +static void +dr_attach_update_state(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + dr_board_t *bp = hp->h_bd; + int i; + dr_devset_t devs_unattached, devs_present; + static fn_t f = "dr_attach_update_state"; + + for (i = 0; i < devnum; i++) { + dr_common_unit_t *cp = devlist[i]; + + if (dr_check_unit_attached(cp) == -1) { + PR_ALL("%s: ERROR %s not attached\n", + f, cp->sbdev_path); + continue; + } + + DR_DEV_SET_ATTACHED(cp); + + dr_device_transition(cp, DR_STATE_CONFIGURED); + cp->sbdev_cond = SBD_COND_OK; + } + + devs_present = DR_DEVS_PRESENT(bp); + devs_unattached = DR_DEVS_UNATTACHED(bp); + + switch (bp->b_state) { + case DR_STATE_CONNECTED: + case DR_STATE_UNCONFIGURED: + ASSERT(devs_present); + + if (devs_unattached == 0) { + /* + * All devices finally attached. + */ + dr_board_transition(bp, DR_STATE_CONFIGURED); + hp->h_bd->b_ostate = SBD_STAT_CONFIGURED; + hp->h_bd->b_rstate = SBD_STAT_CONNECTED; + hp->h_bd->b_cond = SBD_COND_OK; + hp->h_bd->b_busy = 0; + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } else if (devs_present != devs_unattached) { + /* + * Only some devices are fully attached. + */ + dr_board_transition(bp, DR_STATE_PARTIAL); + hp->h_bd->b_rstate = SBD_STAT_CONNECTED; + hp->h_bd->b_ostate = SBD_STAT_CONFIGURED; + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } + break; + + case DR_STATE_PARTIAL: + ASSERT(devs_present); + /* + * All devices finally attached. 
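+ * A PARTIAL board becomes CONFIGURED only once no unattached
+ * devices remain.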
+ */ + if (devs_unattached == 0) { + dr_board_transition(bp, DR_STATE_CONFIGURED); + hp->h_bd->b_rstate = SBD_STAT_CONNECTED; + hp->h_bd->b_ostate = SBD_STAT_CONFIGURED; + hp->h_bd->b_cond = SBD_COND_OK; + hp->h_bd->b_busy = 0; + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } + break; + + default: + break; + } +} + +static void +dr_dev_configure(dr_handle_t *hp) +{ + int rv; + + rv = dr_dev_walk(hp, SBD_COMP_CPU, 1, + dr_pre_attach_cpu, + dr_attach_cpu, + dr_post_attach_cpu, + dr_attach_update_state); + + if (rv >= 0) { + rv = dr_dev_walk(hp, SBD_COMP_MEM, 1, + dr_pre_attach_mem, + dr_attach_mem, + dr_post_attach_mem, + dr_attach_update_state); + } + + if (rv >= 0) { + (void) dr_dev_walk(hp, SBD_COMP_IO, 1, + dr_pre_attach_io, + dr_attach_io, + dr_post_attach_io, + dr_attach_update_state); + } +} + +static void +dr_release_update_state(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + _NOTE(ARGUNUSED(devlist)) + _NOTE(ARGUNUSED(devnum)) + + dr_board_t *bp = hp->h_bd; + + /* + * If the entire board was released and all components + * unreferenced then transfer it to the UNREFERENCED state. + */ + if ((bp->b_state != DR_STATE_RELEASE) && + (DR_DEVS_RELEASED(bp) == DR_DEVS_ATTACHED(bp))) { + dr_board_transition(bp, DR_STATE_RELEASE); + hp->h_bd->b_busy = 1; + } +} + +/* called by dr_release_done [below] and dr_release_mem_done [dr_mem.c] */ +int +dr_release_dev_done(dr_common_unit_t *cp) +{ + if (cp->sbdev_state == DR_STATE_RELEASE) { + ASSERT(DR_DEV_IS_RELEASED(cp)); + + DR_DEV_SET_UNREFERENCED(cp); + + dr_device_transition(cp, DR_STATE_UNREFERENCED); + + return (0); + } else { + return (-1); + } +} + +static void +dr_release_done(dr_handle_t *hp, dr_common_unit_t *cp) +{ + _NOTE(ARGUNUSED(hp)) + + dr_board_t *bp; + static fn_t f = "dr_release_done"; + + PR_ALL("%s...\n", f); + + /* get board pointer & sanity check */ + bp = cp->sbdev_bp; + ASSERT(bp == hp->h_bd); + + /* + * Transfer the device which just completed its release + * to the UNREFERENCED state. + */ + switch (cp->sbdev_type) { + case SBD_COMP_MEM: + dr_release_mem_done(cp); + break; + + default: + DR_DEV_SET_RELEASED(cp); + + dr_device_transition(cp, DR_STATE_RELEASE); + + (void) dr_release_dev_done(cp); + break; + } + + /* + * If we're not already in the RELEASE state for this + * board and we now have released all that were previously + * attached, then transfer the board to the RELEASE state. 
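+ * Once the board is already in the RELEASE state and every released
+ * device has become unreferenced, the board itself advances to the
+ * UNREFERENCED state.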
+ */ + if ((bp->b_state == DR_STATE_RELEASE) && + (DR_DEVS_RELEASED(bp) == DR_DEVS_UNREFERENCED(bp))) { + dr_board_transition(bp, DR_STATE_UNREFERENCED); + bp->b_busy = 1; + (void) drv_getparm(TIME, (void *)&bp->b_time); + } +} + +static void +dr_dev_release_mem(dr_handle_t *hp, dr_common_unit_t *dv) +{ + dr_release_mem(dv); + dr_release_done(hp, dv); +} + +static void +dr_dev_release(dr_handle_t *hp) +{ + int rv; + + hp->h_bd->b_busy = 1; + + rv = dr_dev_walk(hp, SBD_COMP_CPU, 0, + dr_pre_release_cpu, + dr_release_done, + dr_dev_noop, + dr_release_update_state); + + if (rv >= 0) { + rv = dr_dev_walk(hp, SBD_COMP_MEM, 0, + dr_pre_release_mem, + dr_dev_release_mem, + dr_dev_noop, + dr_release_update_state); + } + + if (rv >= 0) { + rv = dr_dev_walk(hp, SBD_COMP_IO, 0, + dr_pre_release_io, + dr_release_done, + dr_dev_noop, + dr_release_update_state); + + } + + if (rv < 0) + hp->h_bd->b_busy = 0; + /* else, b_busy will be cleared in dr_detach_update_state() */ +} + +static void +dr_detach_update_state(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + dr_board_t *bp = hp->h_bd; + int i; + dr_state_t bstate; + static fn_t f = "dr_detach_update_state"; + + for (i = 0; i < devnum; i++) { + dr_common_unit_t *cp = devlist[i]; + + if (dr_check_unit_attached(cp) >= 0) { + /* + * Device is still attached probably due + * to an error. Need to keep track of it. + */ + PR_ALL("%s: ERROR %s not detached\n", + f, cp->sbdev_path); + + continue; + } + + DR_DEV_CLR_ATTACHED(cp); + DR_DEV_CLR_RELEASED(cp); + DR_DEV_CLR_UNREFERENCED(cp); + dr_device_transition(cp, DR_STATE_UNCONFIGURED); + } + + bstate = bp->b_state; + if (bstate != DR_STATE_UNCONFIGURED) { + if (DR_DEVS_PRESENT(bp) == DR_DEVS_UNATTACHED(bp)) { + /* + * All devices are finally detached. + */ + dr_board_transition(bp, DR_STATE_UNCONFIGURED); + hp->h_bd->b_ostate = SBD_STAT_UNCONFIGURED; + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } else if ((bp->b_state != DR_STATE_PARTIAL) && + (DR_DEVS_ATTACHED(bp) != + DR_DEVS_PRESENT(bp))) { + /* + * Some devices remain attached. + */ + dr_board_transition(bp, DR_STATE_PARTIAL); + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } + + if ((hp->h_devset & DR_DEVS_UNATTACHED(bp)) == hp->h_devset) + hp->h_bd->b_busy = 0; + } +} + +static int +dr_dev_unconfigure(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + + /* + * Block out status during IO unconfig. + */ + mutex_enter(&bp->b_slock); + while (bp->b_sflags & DR_BSLOCK) { + if (cv_wait_sig(&bp->b_scv, &bp->b_slock) == 0) { + mutex_exit(&bp->b_slock); + return (EINTR); + } + } + bp->b_sflags |= DR_BSLOCK; + mutex_exit(&bp->b_slock); + + (void) dr_dev_walk(hp, SBD_COMP_IO, 0, + dr_pre_detach_io, + dr_detach_io, + dr_post_detach_io, + dr_detach_update_state); + + dr_unlock_status(bp); + + (void) dr_dev_walk(hp, SBD_COMP_CPU, 0, + dr_pre_detach_cpu, + dr_detach_cpu, + dr_post_detach_cpu, + dr_detach_update_state); + + (void) dr_dev_walk(hp, SBD_COMP_MEM, 0, + dr_pre_detach_mem, + dr_detach_mem, + dr_post_detach_mem, + dr_detach_update_state); + + return (0); +} + +static void +dr_dev_cancel(dr_handle_t *hp) +{ + int i; + dr_devset_t devset; + dr_board_t *bp = hp->h_bd; + static fn_t f = "dr_dev_cancel"; + + PR_ALL("%s...\n", f); + + /* + * Only devices which have been "released" are + * subject to cancellation. + */ + devset = hp->h_devset & DR_DEVS_RELEASED(bp); + + /* + * Nothing to do for CPUs or IO other than change back + * their state. 
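+ * (For CPUs, dr_cancel_cpu() below additionally powers on and + * re-onlines any cpu that was taken offline during the release; + * see dr_cancel_cpu() in dr_cpu.c.)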
+ */ + for (i = 0; i < MAX_CPU_UNITS_PER_BOARD; i++) { + dr_cpu_unit_t *cp; + dr_state_t nstate; + + if (!DEVSET_IN_SET(devset, SBD_COMP_CPU, i)) + continue; + + cp = dr_get_cpu_unit(bp, i); + if (dr_cancel_cpu(cp) == 0) + nstate = DR_STATE_CONFIGURED; + else + nstate = DR_STATE_FATAL; + + dr_device_transition(&cp->sbc_cm, nstate); + } + + for (i = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + dr_io_unit_t *ip; + + if (!DEVSET_IN_SET(devset, SBD_COMP_IO, i)) + continue; + ip = dr_get_io_unit(bp, i); + dr_device_transition(&ip->sbi_cm, DR_STATE_CONFIGURED); + } + for (i = 0; i < MAX_MEM_UNITS_PER_BOARD; i++) { + dr_mem_unit_t *mp; + dr_state_t nstate; + + if (!DEVSET_IN_SET(devset, SBD_COMP_MEM, i)) + continue; + + mp = dr_get_mem_unit(bp, i); + if (dr_cancel_mem(mp) == 0) + nstate = DR_STATE_CONFIGURED; + else + nstate = DR_STATE_FATAL; + + dr_device_transition(&mp->sbm_cm, nstate); + } + + PR_ALL("%s: unreleasing devset (0x%x)\n", f, (uint_t)devset); + + DR_DEVS_CANCEL(bp, devset); + + if (DR_DEVS_RELEASED(bp) == 0) { + dr_state_t new_state; + /* + * If the board no longer has any released devices + * then transfer it back to the CONFIG/PARTIAL state. + */ + if (DR_DEVS_ATTACHED(bp) == DR_DEVS_PRESENT(bp)) + new_state = DR_STATE_CONFIGURED; + else + new_state = DR_STATE_PARTIAL; + if (bp->b_state != new_state) { + dr_board_transition(bp, new_state); + } + hp->h_bd->b_ostate = SBD_STAT_CONFIGURED; + hp->h_bd->b_busy = 0; + (void) drv_getparm(TIME, (void *)&hp->h_bd->b_time); + } +} + +static int +dr_dev_status(dr_handle_t *hp) +{ + int nstat, mode, ncm, sz, pbsz, pnstat; + dr_handle_t *shp; + dr_devset_t devset = 0; + sbd_stat_t *dstatp = NULL; + sbd_dev_stat_t *devstatp; + dr_board_t *bp; + drmach_status_t pstat; + int rv = 0; + +#ifdef _MULTI_DATAMODEL + int sz32 = 0; +#endif /* _MULTI_DATAMODEL */ + + static fn_t f = "dr_dev_status"; + + PR_ALL("%s...\n", f); + + mode = hp->h_mode; + shp = hp; + devset = shp->h_devset; + bp = hp->h_bd; + + /* + * Block out disconnect, unassign, IO unconfigure and + * devinfo branch creation during status. + */ + mutex_enter(&bp->b_slock); + while (bp->b_sflags & DR_BSLOCK) { + if (cv_wait_sig(&bp->b_scv, &bp->b_slock) == 0) { + mutex_exit(&bp->b_slock); + return (EINTR); + } + } + bp->b_sflags |= DR_BSLOCK; + mutex_exit(&bp->b_slock); + + ncm = 1; + if (hp->h_sbdcmd.cmd_cm.c_id.c_type == SBD_COMP_NONE) { + if (dr_cmd_flags(hp) & SBD_FLAG_ALLCMP) { + /* + * Calculate the maximum number of components possible + * for a board. This number will be used to size the + * status scratch buffer used by board and component + * status functions. + * This buffer may differ in size from what is provided + * by the plugin, since the known component set on the + * board may change between the plugin's GETNCM call and + * the status call. Sizing will be adjusted to the plugin's + * receptacle buffer at copyout time. + */ + ncm = MAX_CPU_UNITS_PER_BOARD + + MAX_MEM_UNITS_PER_BOARD + + MAX_IO_UNITS_PER_BOARD; + + } else { + /* + * In the case of c_type == SBD_COMP_NONE, and + * SBD_FLAG_ALLCMP not specified, only the board + * info is to be returned, no components.
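+ * For example, a status call naming SBD_COMP_NONE without + * SBD_FLAG_ALLCMP sizes the scratch buffer below for a bare + * sbd_stat_t, since ncm is forced to 0 and no extra + * sbd_dev_stat_t entries are needed.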
+ */ + ncm = 0; + devset = 0; + } + } + + sz = sizeof (sbd_stat_t); + if (ncm > 1) + sz += sizeof (sbd_dev_stat_t) * (ncm - 1); + + + pbsz = (int)hp->h_sbdcmd.cmd_stat.s_nbytes; + pnstat = (pbsz - sizeof (sbd_stat_t)) / sizeof (sbd_dev_stat_t); + + /* + * s_nbytes describes the size of the preallocated user + * buffer into which the application is expecting to + * receive the sbd_stat_t and sbd_dev_stat_t structures. + */ + +#ifdef _MULTI_DATAMODEL + + /* + * More buffer space is required for the 64bit to 32bit + * conversion of data structures. + */ + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + sz32 = sizeof (sbd_stat32_t); + if (ncm > 1) + sz32 += sizeof (sbd_dev_stat32_t) * (ncm - 1); + pnstat = (pbsz - sizeof (sbd_stat32_t))/ + sizeof (sbd_dev_stat32_t); + } + + sz += sz32; +#endif + /* + * Since one sbd_dev_stat_t is included in the sbd_stat_t, + * increment the plugin's nstat count. + */ + ++pnstat; + + if (bp->b_id == 0) { + bzero(&pstat, sizeof (pstat)); + } else { + sbd_error_t *err; + + err = drmach_status(bp->b_id, &pstat); + if (err) { + DRERR_SET_C(&hp->h_err, &err); + rv = EIO; + goto status_done; + } + } + + dstatp = (sbd_stat_t *)(void *)GETSTRUCT(char, sz); + + devstatp = &dstatp->s_stat[0]; + + dstatp->s_board = bp->b_num; + + /* + * Detect transitions between empty and disconnected. + */ + if (!pstat.empty && (bp->b_rstate == SBD_STAT_EMPTY)) + bp->b_rstate = SBD_STAT_DISCONNECTED; + else if (pstat.empty && (bp->b_rstate == SBD_STAT_DISCONNECTED)) + bp->b_rstate = SBD_STAT_EMPTY; + + dstatp->s_rstate = bp->b_rstate; + dstatp->s_ostate = bp->b_ostate; + dstatp->s_cond = bp->b_cond = pstat.cond; + dstatp->s_busy = bp->b_busy | pstat.busy; + dstatp->s_time = bp->b_time; + dstatp->s_power = pstat.powered; + dstatp->s_assigned = bp->b_assigned = pstat.assigned; + dstatp->s_nstat = nstat = 0; + bcopy(&pstat.type[0], &dstatp->s_type[0], SBD_TYPE_LEN); + bcopy(&pstat.info[0], &dstatp->s_info[0], SBD_MAX_INFO); + + devset &= DR_DEVS_PRESENT(bp); + if (devset == 0) { + /* + * No device chosen. + */ + PR_ALL("%s: no device present\n", f); + } + + if (DEVSET_IN_SET(devset, SBD_COMP_CPU, DEVSET_ANYUNIT)) + if ((nstat = dr_cpu_status(hp, devset, devstatp)) > 0) { + dstatp->s_nstat += nstat; + devstatp += nstat; + } + + if (DEVSET_IN_SET(devset, SBD_COMP_MEM, DEVSET_ANYUNIT)) + if ((nstat = dr_mem_status(hp, devset, devstatp)) > 0) { + dstatp->s_nstat += nstat; + devstatp += nstat; + } + + if (DEVSET_IN_SET(devset, SBD_COMP_IO, DEVSET_ANYUNIT)) + if ((nstat = dr_io_status(hp, devset, devstatp)) > 0) { + dstatp->s_nstat += nstat; + devstatp += nstat; + } + + /* + * Due to a possible change in number of components between + * the time of the plugin's GETNCM call and now, there may be + * more or fewer components than the plugin's buffer can + * hold. Adjust s_nstat accordingly. + */ + + dstatp->s_nstat = dstatp->s_nstat > pnstat ?
pnstat : dstatp->s_nstat; + +#ifdef _MULTI_DATAMODEL + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + int i, j; + sbd_stat32_t *dstat32p; + + dstat32p = (sbd_stat32_t *)devstatp; + + /* Alignment Paranoia */ + if ((ulong_t)dstat32p & 0x1) { + PR_ALL("%s: alignment: sz=0x%lx dstat32p=0x%p\n", + f, sizeof (sbd_stat32_t), (void *)dstat32p); + DR_OP_INTERNAL_ERROR(hp); + rv = EINVAL; + goto status_done; + } + + /* paranoia: detect buffer overrun */ + if ((caddr_t)&dstat32p->s_stat[dstatp->s_nstat] > + ((caddr_t)dstatp) + sz) { + DR_OP_INTERNAL_ERROR(hp); + rv = EINVAL; + goto status_done; + } + + /* copy sbd_stat_t structure members */ +#define _SBD_STAT(t, m) dstat32p->m = (t)dstatp->m + _SBD_STAT(int32_t, s_board); + _SBD_STAT(int32_t, s_rstate); + _SBD_STAT(int32_t, s_ostate); + _SBD_STAT(int32_t, s_cond); + _SBD_STAT(int32_t, s_busy); + _SBD_STAT(time32_t, s_time); + _SBD_STAT(uint32_t, s_power); + _SBD_STAT(uint32_t, s_assigned); + _SBD_STAT(int32_t, s_nstat); + bcopy(&dstatp->s_type[0], &dstat32p->s_type[0], + SBD_TYPE_LEN); + bcopy(&dstatp->s_info[0], &dstat32p->s_info[0], + SBD_MAX_INFO); +#undef _SBD_STAT + + for (i = 0; i < dstatp->s_nstat; i++) { + sbd_dev_stat_t *dsp = &dstatp->s_stat[i]; + sbd_dev_stat32_t *ds32p = &dstat32p->s_stat[i]; +#define _SBD_DEV_STAT(t, m) ds32p->m = (t)dsp->m + + /* copy sbd_cm_stat_t structure members */ + _SBD_DEV_STAT(int32_t, ds_type); + _SBD_DEV_STAT(int32_t, ds_unit); + _SBD_DEV_STAT(int32_t, ds_ostate); + _SBD_DEV_STAT(int32_t, ds_cond); + _SBD_DEV_STAT(int32_t, ds_busy); + _SBD_DEV_STAT(int32_t, ds_suspend); + _SBD_DEV_STAT(time32_t, ds_time); + bcopy(&dsp->ds_name[0], &ds32p->ds_name[0], + OBP_MAXPROPNAME); + + switch (dsp->ds_type) { + case SBD_COMP_CPU: + /* copy sbd_cpu_stat_t structure members */ + _SBD_DEV_STAT(int32_t, d_cpu.cs_isbootproc); + _SBD_DEV_STAT(int32_t, d_cpu.cs_cpuid); + _SBD_DEV_STAT(int32_t, d_cpu.cs_speed); + _SBD_DEV_STAT(int32_t, d_cpu.cs_ecache); + break; + + case SBD_COMP_MEM: + /* copy sbd_mem_stat_t structure members */ + _SBD_DEV_STAT(int32_t, d_mem.ms_interleave); + _SBD_DEV_STAT(uint32_t, d_mem.ms_basepfn); + _SBD_DEV_STAT(uint32_t, d_mem.ms_totpages); + _SBD_DEV_STAT(uint32_t, d_mem.ms_detpages); + _SBD_DEV_STAT(int32_t, d_mem.ms_pageslost); + _SBD_DEV_STAT(uint32_t, d_mem.ms_managed_pages); + _SBD_DEV_STAT(uint32_t, d_mem.ms_noreloc_pages); + _SBD_DEV_STAT(uint32_t, d_mem.ms_noreloc_first); + _SBD_DEV_STAT(uint32_t, d_mem.ms_noreloc_last); + _SBD_DEV_STAT(int32_t, d_mem.ms_cage_enabled); + _SBD_DEV_STAT(int32_t, d_mem.ms_peer_is_target); + bcopy(&dsp->d_mem.ms_peer_ap_id[0], + &ds32p->d_mem.ms_peer_ap_id[0], + sizeof (ds32p->d_mem.ms_peer_ap_id)); + break; + + case SBD_COMP_IO: + /* copy sbd_io_stat_t structure members */ + _SBD_DEV_STAT(int32_t, d_io.is_referenced); + _SBD_DEV_STAT(int32_t, d_io.is_unsafe_count); + + for (j = 0; j < SBD_MAX_UNSAFE; j++) + _SBD_DEV_STAT(int32_t, + d_io.is_unsafe_list[j]); + + bcopy(&dsp->d_io.is_pathname[0], + &ds32p->d_io.is_pathname[0], MAXPATHLEN); + break; + + case SBD_COMP_CMP: + /* copy sbd_cmp_stat_t structure members */ + bcopy(&dsp->d_cmp.ps_cpuid[0], + &ds32p->d_cmp.ps_cpuid[0], + sizeof (ds32p->d_cmp.ps_cpuid)); + _SBD_DEV_STAT(int32_t, d_cmp.ps_ncores); + _SBD_DEV_STAT(int32_t, d_cmp.ps_speed); + _SBD_DEV_STAT(int32_t, d_cmp.ps_ecache); + break; + + default: + cmn_err(CE_WARN, "%s: unknown dev type (%d)", + f, (int)dsp->ds_type); + rv = EFAULT; + goto status_done; + } +#undef _SBD_DEV_STAT + } + + + if (ddi_copyout((void *)dstat32p, + 
hp->h_sbdcmd.cmd_stat.s_statp, pbsz, mode) != 0) { + cmn_err(CE_WARN, + "%s: failed to copyout status " + "for board %d", f, bp->b_num); + rv = EFAULT; + goto status_done; + } + } else +#endif /* _MULTI_DATAMODEL */ + + if (ddi_copyout((void *)dstatp, hp->h_sbdcmd.cmd_stat.s_statp, + pbsz, mode) != 0) { + cmn_err(CE_WARN, + "%s: failed to copyout status for board %d", + f, bp->b_num); + rv = EFAULT; + goto status_done; + } + +status_done: + if (dstatp != NULL) + FREESTRUCT(dstatp, char, sz); + + dr_unlock_status(bp); + + return (rv); +} + +static int +dr_get_ncm(dr_handle_t *hp) +{ + int i; + int ncm = 0; + dr_devset_t devset; + + devset = DR_DEVS_PRESENT(hp->h_bd); + if (hp->h_sbdcmd.cmd_cm.c_id.c_type != SBD_COMP_NONE) + devset &= DEVSET(hp->h_sbdcmd.cmd_cm.c_id.c_type, + DEVSET_ANYUNIT); + + /* + * Handle CPUs first to deal with possible CMP + * devices. If the CPU is a CMP, we need to only + * increment ncm once even if there are multiple + * cores for that CMP present in the devset. + */ + for (i = 0; i < MAX_CMP_UNITS_PER_BOARD; i++) { + if (devset & DEVSET(SBD_COMP_CMP, i)) { + ncm++; + } + } + + /* eliminate the CPU information from the devset */ + devset &= ~(DEVSET(SBD_COMP_CMP, DEVSET_ANYUNIT)); + + for (i = 0; i < (sizeof (dr_devset_t) * 8); i++) { + ncm += devset & 0x1; + devset >>= 1; + } + + return (ncm); +} + +/* used by dr_mem.c */ +/* TODO: eliminate dr_boardlist */ +dr_board_t * +dr_lookup_board(int board_num) +{ + dr_board_t *bp; + + ASSERT(board_num >= 0 && board_num < MAX_BOARDS); + + bp = &dr_boardlist[board_num]; + ASSERT(bp->b_num == board_num); + + return (bp); +} + +static dr_dev_unit_t * +dr_get_dev_unit(dr_board_t *bp, sbd_comp_type_t nt, int unit_num) +{ + dr_dev_unit_t *dp; + + dp = DR_GET_BOARD_DEVUNIT(bp, nt, unit_num); + ASSERT(dp->du_common.sbdev_bp == bp); + ASSERT(dp->du_common.sbdev_unum == unit_num); + ASSERT(dp->du_common.sbdev_type == nt); + + return (dp); +} + +dr_cpu_unit_t * +dr_get_cpu_unit(dr_board_t *bp, int unit_num) +{ + dr_dev_unit_t *dp; + + ASSERT(unit_num >= 0 && unit_num < MAX_CPU_UNITS_PER_BOARD); + + dp = dr_get_dev_unit(bp, SBD_COMP_CPU, unit_num); + return (&dp->du_cpu); +} + +dr_mem_unit_t * +dr_get_mem_unit(dr_board_t *bp, int unit_num) +{ + dr_dev_unit_t *dp; + + ASSERT(unit_num >= 0 && unit_num < MAX_MEM_UNITS_PER_BOARD); + + dp = dr_get_dev_unit(bp, SBD_COMP_MEM, unit_num); + return (&dp->du_mem); +} + +dr_io_unit_t * +dr_get_io_unit(dr_board_t *bp, int unit_num) +{ + dr_dev_unit_t *dp; + + ASSERT(unit_num >= 0 && unit_num < MAX_IO_UNITS_PER_BOARD); + + dp = dr_get_dev_unit(bp, SBD_COMP_IO, unit_num); + return (&dp->du_io); +} + +dr_common_unit_t * +dr_get_common_unit(dr_board_t *bp, sbd_comp_type_t nt, int unum) +{ + dr_dev_unit_t *dp; + + dp = dr_get_dev_unit(bp, nt, unum); + return (&dp->du_common); +} + +static dr_devset_t +dr_dev2devset(sbd_comp_id_t *cid) +{ + static fn_t f = "dr_dev2devset"; + + dr_devset_t devset; + int unit = cid->c_unit; + + switch (cid->c_type) { + case SBD_COMP_NONE: + devset = DEVSET(SBD_COMP_CPU, DEVSET_ANYUNIT); + devset |= DEVSET(SBD_COMP_MEM, DEVSET_ANYUNIT); + devset |= DEVSET(SBD_COMP_IO, DEVSET_ANYUNIT); + PR_ALL("%s: COMP_NONE devset = " DEVSET_FMT_STR "\n", + f, DEVSET_FMT_ARG(devset)); + break; + + case SBD_COMP_CPU: + if ((unit > MAX_CPU_UNITS_PER_BOARD) || (unit < 0)) { + cmn_err(CE_WARN, + "%s: invalid cpu unit# = %d", + f, unit); + devset = 0; + } else { + /* + * Generate a devset that includes all the + * cores of a CMP device. 
If this is not a + * CMP, the extra cores will be eliminated + * later since they are not present. This is + * also true for CMP devices that do not have + * all cores active. + */ + devset = DEVSET(SBD_COMP_CMP, unit); + } + + PR_ALL("%s: CPU devset = " DEVSET_FMT_STR "\n", + f, DEVSET_FMT_ARG(devset)); + break; + + case SBD_COMP_MEM: + if (unit == SBD_NULL_UNIT) { + unit = 0; + cid->c_unit = 0; + } + + if ((unit > MAX_MEM_UNITS_PER_BOARD) || (unit < 0)) { + cmn_err(CE_WARN, + "%s: invalid mem unit# = %d", + f, unit); + devset = 0; + } else + devset = DEVSET(cid->c_type, unit); + + PR_ALL("%s: MEM devset = " DEVSET_FMT_STR "\n", + f, DEVSET_FMT_ARG(devset)); + break; + + case SBD_COMP_IO: + if ((unit > MAX_IO_UNITS_PER_BOARD) || (unit < 0)) { + cmn_err(CE_WARN, + "%s: invalid io unit# = %d", + f, unit); + devset = 0; + } else + devset = DEVSET(cid->c_type, unit); + + PR_ALL("%s: IO devset = " DEVSET_FMT_STR "\n", + f, DEVSET_FMT_ARG(devset)); + break; + + default: + case SBD_COMP_UNKNOWN: + devset = 0; + break; + } + + return (devset); +} + +/* + * Converts a dynamic attachment point name to an SBD_COMP_* type. + * Returns SBD_COMP_UNKNOWN if the name is not recognized. + */ +static int +dr_dev_type_to_nt(char *type) +{ + int i; + + for (i = 0; dr_devattr[i].s_nodetype != SBD_COMP_UNKNOWN; i++) + if (strcmp(dr_devattr[i].s_devtype, type) == 0) + break; + + return (dr_devattr[i].s_nodetype); +} + +/* + * Converts an SBD_COMP_* type to a dynamic attachment point name. + * Returns NULL if the SBD_COMP_* type is not recognized. + */ +char * +dr_nt_to_dev_type(int nt) +{ + int i; + + for (i = 0; dr_devattr[i].s_nodetype != SBD_COMP_UNKNOWN; i++) + if (dr_devattr[i].s_nodetype == nt) + break; + + return (dr_devattr[i].s_devtype); +} + +/* + * The state transition policy is that if there is some component for which + * the state transition is valid, then let it through. The exception is + * SBD_CMD_DISCONNECT. On disconnect, the state transition must be valid + * for ALL components. + * Returns the state that is in error, if any.
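 * + * For instance, a configure request against a board whose cpus are + * already CONFIGURED but whose memory is still CONNECTED drops the + * cpus from the devset and proceeds on the memory alone; the same + * situation under SBD_CMD_DISCONNECT would instead fail with the + * offending state.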
+ */ +static int +dr_check_transition(dr_board_t *bp, dr_devset_t *devsetp, + struct dr_state_trans *transp, int cmd) +{ + int s, ut; + int state_err = 0; + dr_devset_t devset; + dr_common_unit_t *cp; + static fn_t f = "dr_check_transition"; + + devset = *devsetp; + + if (DEVSET_IN_SET(devset, SBD_COMP_CPU, DEVSET_ANYUNIT)) { + for (ut = 0; ut < MAX_CPU_UNITS_PER_BOARD; ut++) { + if (DEVSET_IN_SET(devset, SBD_COMP_CPU, ut) == 0) + continue; + + cp = dr_get_common_unit(bp, SBD_COMP_CPU, ut); + s = (int)cp->sbdev_state; + if (!DR_DEV_IS_PRESENT(cp)) { + DEVSET_DEL(devset, SBD_COMP_CPU, ut); + } else { + if (transp->x_op[s].x_rv) { + if (!state_err) + state_err = s; + DEVSET_DEL(devset, SBD_COMP_CPU, ut); + } + } + } + } + if (DEVSET_IN_SET(devset, SBD_COMP_MEM, DEVSET_ANYUNIT)) { + for (ut = 0; ut < MAX_MEM_UNITS_PER_BOARD; ut++) { + if (DEVSET_IN_SET(devset, SBD_COMP_MEM, ut) == 0) + continue; + + cp = dr_get_common_unit(bp, SBD_COMP_MEM, ut); + s = (int)cp->sbdev_state; + if (!DR_DEV_IS_PRESENT(cp)) { + DEVSET_DEL(devset, SBD_COMP_MEM, ut); + } else { + if (transp->x_op[s].x_rv) { + if (!state_err) + state_err = s; + DEVSET_DEL(devset, SBD_COMP_MEM, ut); + } + } + } + } + if (DEVSET_IN_SET(devset, SBD_COMP_IO, DEVSET_ANYUNIT)) { + for (ut = 0; ut < MAX_IO_UNITS_PER_BOARD; ut++) { + if (DEVSET_IN_SET(devset, SBD_COMP_IO, ut) == 0) + continue; + + cp = dr_get_common_unit(bp, SBD_COMP_IO, ut); + s = (int)cp->sbdev_state; + if (!DR_DEV_IS_PRESENT(cp)) { + DEVSET_DEL(devset, SBD_COMP_IO, ut); + } else { + if (transp->x_op[s].x_rv) { + if (!state_err) + state_err = s; + DEVSET_DEL(devset, SBD_COMP_IO, ut); + } + } + } + } + + PR_ALL("%s: requested devset = 0x%x, final devset = 0x%x\n", + f, (uint_t)*devsetp, (uint_t)devset); + + *devsetp = devset; + /* + * If there are some remaining components for which + * this state transition is valid, then allow them + * through, otherwise if none are left then return + * the state error. The exception is SBD_CMD_DISCONNECT. + * On disconnect, the state transition must be valid for ALL + * components. + */ + if (cmd == SBD_CMD_DISCONNECT) + return (state_err); + return (devset ? 0 : state_err); +} + +void +dr_device_transition(dr_common_unit_t *cp, dr_state_t st) +{ + PR_STATE("%s STATE %s(%d) -> %s(%d)\n", + cp->sbdev_path, + state_str[cp->sbdev_state], cp->sbdev_state, + state_str[st], st); + + cp->sbdev_state = st; + if (st == DR_STATE_CONFIGURED) { + cp->sbdev_ostate = SBD_STAT_CONFIGURED; + if (cp->sbdev_bp->b_ostate != SBD_STAT_CONFIGURED) { + cp->sbdev_bp->b_ostate = SBD_STAT_CONFIGURED; + (void) drv_getparm(TIME, + (void *) &cp->sbdev_bp->b_time); + } + } else + cp->sbdev_ostate = SBD_STAT_UNCONFIGURED; + + (void) drv_getparm(TIME, (void *) &cp->sbdev_time); +} + +static void +dr_board_transition(dr_board_t *bp, dr_state_t st) +{ + PR_STATE("BOARD %d STATE: %s(%d) -> %s(%d)\n", + bp->b_num, + state_str[bp->b_state], bp->b_state, + state_str[st], st); + + bp->b_state = st; +} + +void +dr_op_err(int ce, dr_handle_t *hp, int code, char *fmt, ...) +{ + sbd_error_t *err; + va_list args; + + va_start(args, fmt); + err = drerr_new_v(code, fmt, args); + va_end(args); + + if (ce != CE_IGNORE) + sbd_err_log(err, ce); + + DRERR_SET_C(&hp->h_err, &err); +} + +void +dr_dev_err(int ce, dr_common_unit_t *cp, int code) +{ + sbd_error_t *err; + + err = drerr_new(0, code, cp->sbdev_path, NULL); + + if (ce != CE_IGNORE) + sbd_err_log(err, ce); + + DRERR_SET_C(&cp->sbdev_error, &err); +} + +/* + * A callback routine. 
Called from the drmach layer as a result of a + * call to drmach_board_find_devices from dr_init_devlists. + */ +static sbd_error_t * +dr_dev_found(void *data, const char *name, int unum, drmachid_t id) +{ + dr_board_t *bp = data; + dr_dev_unit_t *dp; + int nt; + static fn_t f = "dr_dev_found"; + + PR_ALL("%s (board = %d, name = %s, unum = %d, id = %p)...\n", + f, bp->b_num, name, unum, id); + + nt = dr_dev_type_to_nt((char *)name); + if (nt == SBD_COMP_UNKNOWN) { + /* + * This should not happen. When it does, it indicates + * a mismatch in devices supported by the drmach layer + * vs devices supported by this layer. + */ + return (DR_INTERNAL_ERROR()); + } + + dp = DR_GET_BOARD_DEVUNIT(bp, nt, unum); + + /* sanity check */ + ASSERT(dp->du_common.sbdev_bp == bp); + ASSERT(dp->du_common.sbdev_unum == unum); + ASSERT(dp->du_common.sbdev_type == nt); + + /* render dynamic attachment point path of this unit */ + (void) snprintf(dp->du_common.sbdev_path, + sizeof (dp->du_common.sbdev_path), "%s::%s%d", + bp->b_path, name, DR_UNUM2SBD_UNUM(unum, nt)); + + dp->du_common.sbdev_id = id; + DR_DEV_SET_PRESENT(&dp->du_common); + + bp->b_ndev++; + + return (NULL); +} + +static sbd_error_t * +dr_init_devlists(dr_board_t *bp) +{ + int i; + sbd_error_t *err; + dr_dev_unit_t *dp; + static fn_t f = "dr_init_devlists"; + + PR_ALL("%s (%s)...\n", f, bp->b_path); + + /* sanity check */ + ASSERT(bp->b_ndev == 0); + + DR_DEVS_DISCONNECT(bp, (uint_t)-1); + + /* + * This routine builds the board's devlist and initializes + * the common portion of the unit data structures. + * Note: because the common portion is considered + * uninitialized, the dr_get_*_unit() routines cannot + * be used. + */ + + /* + * Clear out old entries, if any. + */ + for (i = 0; i < MAX_CPU_UNITS_PER_BOARD; i++) { + dp = DR_GET_BOARD_DEVUNIT(bp, SBD_COMP_CPU, i); + + bzero(dp, sizeof (*dp)); + dp->du_common.sbdev_bp = bp; + dp->du_common.sbdev_unum = i; + dp->du_common.sbdev_type = SBD_COMP_CPU; + } + + for (i = 0; i < MAX_MEM_UNITS_PER_BOARD; i++) { + dp = DR_GET_BOARD_DEVUNIT(bp, SBD_COMP_MEM, i); + + bzero(dp, sizeof (*dp)); + dp->du_common.sbdev_bp = bp; + dp->du_common.sbdev_unum = i; + dp->du_common.sbdev_type = SBD_COMP_MEM; + } + + for (i = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + dp = DR_GET_BOARD_DEVUNIT(bp, SBD_COMP_IO, i); + + bzero(dp, sizeof (*dp)); + dp->du_common.sbdev_bp = bp; + dp->du_common.sbdev_unum = i; + dp->du_common.sbdev_type = SBD_COMP_IO; + } + + err = NULL; + if (bp->b_id) { + /* find devices on this board */ + err = drmach_board_find_devices( + bp->b_id, bp, dr_dev_found); + } + + return (err); +} + +/* + * Determine whether the unit corresponding to the given drmachid + * is attached. Returns 0 if attached, -1 if not (or on error). + */ +static int +dr_check_unit_attached(dr_common_unit_t *cp) +{ + int rv = 0; + processorid_t cpuid; + uint64_t basepa, endpa; + struct memlist *ml; + extern struct memlist *phys_install; + sbd_error_t *err; + int yes; + static fn_t f = "dr_check_unit_attached"; + + switch (cp->sbdev_type) { + case SBD_COMP_CPU: + err = drmach_cpu_get_id(cp->sbdev_id, &cpuid); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + rv = -1; + break; + } + mutex_enter(&cpu_lock); + if (cpu_get(cpuid) == NULL) + rv = -1; + mutex_exit(&cpu_lock); + break; + + case SBD_COMP_MEM: + err = drmach_mem_get_slice_info(cp->sbdev_id, + &basepa, &endpa, NULL); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + rv = -1; + break; + } + + /* + * Check if the base address is in phys_install.
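 * The loop below skips entries that do not overlap + * [basepa, endpa) and breaks at the first one that does; if it + * falls off the end of the list (ml == NULL), the slice is not + * installed and the unit is considered unattached.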
+ */ + memlist_read_lock(); + for (ml = phys_install; ml; ml = ml->ml_next) + if ((endpa <= ml->ml_address) || + (basepa >= (ml->ml_address + ml->ml_size))) + continue; + else + break; + memlist_read_unlock(); + if (ml == NULL) + rv = -1; + break; + + case SBD_COMP_IO: + err = drmach_io_is_attached(cp->sbdev_id, &yes); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + rv = -1; + break; + } else if (!yes) + rv = -1; + break; + + default: + PR_ALL("%s: unexpected nodetype(%d) for id 0x%p\n", + f, cp->sbdev_type, cp->sbdev_id); + rv = -1; + break; + } + + return (rv); +} + +/* + * See if drmach recognizes the passthru command. DRMACH expects the + * id to identify the thing to which the command is being applied. Using + * nonsense SBD terms, that information has been perversely encoded in the + * c_id member of the sbd_cmd_t structure. This logic reads those tea + * leaves, finds the associated drmach id, then calls drmach to process + * the passthru command. + */ +static int +dr_pt_try_drmach(dr_handle_t *hp) +{ + dr_board_t *bp = hp->h_bd; + sbd_comp_id_t *comp_id = &hp->h_sbdcmd.cmd_cm.c_id; + drmachid_t id; + + if (comp_id->c_type == SBD_COMP_NONE) { + id = bp->b_id; + } else { + sbd_comp_type_t nt; + + nt = dr_dev_type_to_nt(comp_id->c_name); + if (nt == SBD_COMP_UNKNOWN) { + dr_op_err(CE_IGNORE, hp, ESBD_INVAL, comp_id->c_name); + id = 0; + } else { + /* pt command applied to dynamic attachment point */ + dr_common_unit_t *cp; + cp = dr_get_common_unit(bp, nt, comp_id->c_unit); + id = cp->sbdev_id; + } + } + + if (hp->h_err == NULL) + hp->h_err = drmach_passthru(id, &hp->h_opts); + + return (hp->h_err == NULL ? 0 : -1); +} + +static int +dr_pt_ioctl(dr_handle_t *hp) +{ + int cmd, rv, len; + int32_t sz; + int found; + char *copts; + static fn_t f = "dr_pt_ioctl"; + + PR_ALL("%s...\n", f); + + sz = hp->h_opts.size; + copts = hp->h_opts.copts; + + if (sz == 0 || copts == (char *)NULL) { + cmn_err(CE_WARN, "%s: invalid passthru args", f); + return (EINVAL); + } + + found = 0; + for (cmd = 0; cmd < (sizeof (pt_arr) / sizeof (pt_arr[0])); cmd++) { + len = strlen(pt_arr[cmd].pt_name); + found = (strncmp(pt_arr[cmd].pt_name, copts, len) == 0); + if (found) + break; + } + + if (found) + rv = (*pt_arr[cmd].pt_func)(hp); + else + rv = dr_pt_try_drmach(hp); + + return (rv); +} + +/* + * Called at driver load time to determine the state and condition + * of an existing board in the system. + */ +static void +dr_board_discovery(dr_board_t *bp) +{ + int i; + dr_devset_t devs_lost, devs_attached = 0; + dr_cpu_unit_t *cp; + dr_mem_unit_t *mp; + dr_io_unit_t *ip; + static fn_t f = "dr_board_discovery"; + + if (DR_DEVS_PRESENT(bp) == 0) { + PR_ALL("%s: board %d has no devices present\n", + f, bp->b_num); + return; + } + + /* + * Check for existence of cpus. + */ + for (i = 0; i < MAX_CPU_UNITS_PER_BOARD; i++) { + cp = dr_get_cpu_unit(bp, i); + + if (!DR_DEV_IS_PRESENT(&cp->sbc_cm)) + continue; + + if (dr_check_unit_attached(&cp->sbc_cm) >= 0) { + DR_DEV_SET_ATTACHED(&cp->sbc_cm); + DEVSET_ADD(devs_attached, SBD_COMP_CPU, i); + PR_ALL("%s: board %d, cpu-unit %d - attached\n", + f, bp->b_num, i); + } + dr_init_cpu_unit(cp); + } + + /* + * Check for existence of memory. 
+ */ + for (i = 0; i < MAX_MEM_UNITS_PER_BOARD; i++) { + mp = dr_get_mem_unit(bp, i); + + if (!DR_DEV_IS_PRESENT(&mp->sbm_cm)) + continue; + + if (dr_check_unit_attached(&mp->sbm_cm) >= 0) { + DR_DEV_SET_ATTACHED(&mp->sbm_cm); + DEVSET_ADD(devs_attached, SBD_COMP_MEM, i); + PR_ALL("%s: board %d, mem-unit %d - attached\n", + f, bp->b_num, i); + } + dr_init_mem_unit(mp); + } + + /* + * Check for i/o state. + */ + for (i = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + ip = dr_get_io_unit(bp, i); + + if (!DR_DEV_IS_PRESENT(&ip->sbi_cm)) + continue; + + if (dr_check_unit_attached(&ip->sbi_cm) >= 0) { + /* + * Found it! + */ + DR_DEV_SET_ATTACHED(&ip->sbi_cm); + DEVSET_ADD(devs_attached, SBD_COMP_IO, i); + PR_ALL("%s: board %d, io-unit %d - attached\n", + f, bp->b_num, i); + } + dr_init_io_unit(ip); + } + + DR_DEVS_CONFIGURE(bp, devs_attached); + if (devs_attached && ((devs_lost = DR_DEVS_UNATTACHED(bp)) != 0)) { + int ut; + + /* + * It is not legal on board discovery to have a + * board that is only partially attached. A board + * is either all attached or all connected. If a + * board has at least one attached device, then + * the remaining devices, if any, must have + * been lost or disconnected. These devices can + * only be recovered by a full attach from scratch. + * Note that devices previously in the unreferenced + * state are subsequently lost until the next full + * attach. This is necessary since the driver unload + * that must have occurred would have wiped out the + * information necessary to re-configure the device + * back online, e.g. memlist. + */ + PR_ALL("%s: some devices LOST (" DEVSET_FMT_STR ")...\n", + f, DEVSET_FMT_ARG(devs_lost)); + + for (ut = 0; ut < MAX_CPU_UNITS_PER_BOARD; ut++) { + if (!DEVSET_IN_SET(devs_lost, SBD_COMP_CPU, ut)) + continue; + + cp = dr_get_cpu_unit(bp, ut); + dr_device_transition(&cp->sbc_cm, DR_STATE_EMPTY); + } + + for (ut = 0; ut < MAX_MEM_UNITS_PER_BOARD; ut++) { + if (!DEVSET_IN_SET(devs_lost, SBD_COMP_MEM, ut)) + continue; + + mp = dr_get_mem_unit(bp, ut); + dr_device_transition(&mp->sbm_cm, DR_STATE_EMPTY); + } + + for (ut = 0; ut < MAX_IO_UNITS_PER_BOARD; ut++) { + if (!DEVSET_IN_SET(devs_lost, SBD_COMP_IO, ut)) + continue; + + ip = dr_get_io_unit(bp, ut); + dr_device_transition(&ip->sbi_cm, DR_STATE_EMPTY); + } + + DR_DEVS_DISCONNECT(bp, devs_lost); + } +} + +static int +dr_board_init(dr_board_t *bp, dev_info_t *dip, int bd) +{ + sbd_error_t *err; + + mutex_init(&bp->b_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&bp->b_slock, NULL, MUTEX_DRIVER, NULL); + cv_init(&bp->b_scv, NULL, CV_DRIVER, NULL); + bp->b_rstate = SBD_STAT_EMPTY; + bp->b_ostate = SBD_STAT_UNCONFIGURED; + bp->b_cond = SBD_COND_UNKNOWN; + (void) drv_getparm(TIME, (void *)&bp->b_time); + + (void) drmach_board_lookup(bd, &bp->b_id); + bp->b_num = bd; + bp->b_dip = dip; + + bp->b_dev[DEVSET_NIX(SBD_COMP_CPU)] = GETSTRUCT(dr_dev_unit_t, + MAX_CPU_UNITS_PER_BOARD); + + bp->b_dev[DEVSET_NIX(SBD_COMP_MEM)] = GETSTRUCT(dr_dev_unit_t, + MAX_MEM_UNITS_PER_BOARD); + + bp->b_dev[DEVSET_NIX(SBD_COMP_IO)] = GETSTRUCT(dr_dev_unit_t, + MAX_IO_UNITS_PER_BOARD); + + /* + * Initialize the devlists + */ + err = dr_init_devlists(bp); + if (err) { + sbd_err_clear(&err); + dr_board_destroy(bp); + return (-1); + } else if (bp->b_ndev == 0) { + dr_board_transition(bp, DR_STATE_EMPTY); + } else { + /* + * Couldn't have made it down here without + * having found at least one device. + */ + ASSERT(DR_DEVS_PRESENT(bp) != 0); + /* + * Check the state of any possible devices on the + * board.
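+ * dr_board_discovery() above-defined classifies each present + * unit as attached or lost, and disconnects the lost ones.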
+ */ + dr_board_discovery(bp); + + bp->b_assigned = 1; + + if (DR_DEVS_UNATTACHED(bp) == 0) { + /* + * The board has no unattached devices, therefore + * by reason of insanity it must be configured! + */ + dr_board_transition(bp, DR_STATE_CONFIGURED); + bp->b_ostate = SBD_STAT_CONFIGURED; + bp->b_rstate = SBD_STAT_CONNECTED; + bp->b_cond = SBD_COND_OK; + (void) drv_getparm(TIME, (void *)&bp->b_time); + } else if (DR_DEVS_ATTACHED(bp)) { + dr_board_transition(bp, DR_STATE_PARTIAL); + bp->b_ostate = SBD_STAT_CONFIGURED; + bp->b_rstate = SBD_STAT_CONNECTED; + bp->b_cond = SBD_COND_OK; + (void) drv_getparm(TIME, (void *)&bp->b_time); + } else { + dr_board_transition(bp, DR_STATE_CONNECTED); + bp->b_rstate = SBD_STAT_CONNECTED; + (void) drv_getparm(TIME, (void *)&bp->b_time); + } + } + + return (0); +} + +static void +dr_board_destroy(dr_board_t *bp) +{ + PR_ALL("dr_board_destroy: num %d, path %s\n", + bp->b_num, bp->b_path); + + dr_board_transition(bp, DR_STATE_EMPTY); + bp->b_rstate = SBD_STAT_EMPTY; + (void) drv_getparm(TIME, (void *)&bp->b_time); + + /* + * Free up MEM unit structs. + */ + FREESTRUCT(bp->b_dev[DEVSET_NIX(SBD_COMP_MEM)], + dr_dev_unit_t, MAX_MEM_UNITS_PER_BOARD); + bp->b_dev[DEVSET_NIX(SBD_COMP_MEM)] = NULL; + /* + * Free up CPU unit structs. + */ + FREESTRUCT(bp->b_dev[DEVSET_NIX(SBD_COMP_CPU)], + dr_dev_unit_t, MAX_CPU_UNITS_PER_BOARD); + bp->b_dev[DEVSET_NIX(SBD_COMP_CPU)] = NULL; + /* + * Free up IO unit structs. + */ + FREESTRUCT(bp->b_dev[DEVSET_NIX(SBD_COMP_IO)], + dr_dev_unit_t, MAX_IO_UNITS_PER_BOARD); + bp->b_dev[DEVSET_NIX(SBD_COMP_IO)] = NULL; + + mutex_destroy(&bp->b_lock); + mutex_destroy(&bp->b_slock); + cv_destroy(&bp->b_scv); + + /* + * Reset the board structure to its initial state, otherwise it will + * cause trouble on the next call to dr_board_init() for the same board. + * dr_board_init() may be called multiple times for the same board + * if DR driver fails to initialize some boards. + */ + bzero(bp, sizeof (*bp)); +} + +void +dr_lock_status(dr_board_t *bp) +{ + mutex_enter(&bp->b_slock); + while (bp->b_sflags & DR_BSLOCK) + cv_wait(&bp->b_scv, &bp->b_slock); + bp->b_sflags |= DR_BSLOCK; + mutex_exit(&bp->b_slock); +} + +void +dr_unlock_status(dr_board_t *bp) +{ + mutex_enter(&bp->b_slock); + bp->b_sflags &= ~DR_BSLOCK; + cv_signal(&bp->b_scv); + mutex_exit(&bp->b_slock); +} + +/* + * Extract flags passed via ioctl. + */ +int +dr_cmd_flags(dr_handle_t *hp) +{ + return (hp->h_sbdcmd.cmd_cm.c_flags); +} diff --git a/usr/src/uts/i86pc/io/dr/dr.conf b/usr/src/uts/i86pc/io/dr/dr.conf new file mode 100644 index 0000000000..c694f5573f --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2009, Intel Corporation. 
+# All rights reserved. +# + +# Force attach driver to enable DR at boot time. +ddi-forceattach=1; diff --git a/usr/src/uts/i86pc/io/dr/dr_cpu.c b/usr/src/uts/i86pc/io/dr/dr_cpu.c new file mode 100644 index 0000000000..a893310977 --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr_cpu.c @@ -0,0 +1,885 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * CPU support routines for DR + */ + +#include <sys/note.h> +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/dditypes.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ndi_impldefs.h> +#include <sys/kmem.h> +#include <sys/processor.h> +#include <sys/cpuvar.h> +#include <sys/promif.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/cpu_module.h> +#include <sys/cmn_err.h> + +#include <sys/dr.h> +#include <sys/dr_util.h> + +/* for the DR*INTERNAL_ERROR macros. see sys/dr.h. */ +static char *dr_ie_fmt = "dr_cpu.c %d"; + +int +dr_cpu_unit_is_sane(dr_board_t *bp, dr_cpu_unit_t *cp) +{ +#ifdef DEBUG + ASSERT(cp->sbc_cm.sbdev_bp == bp); + ASSERT(cp->sbc_cm.sbdev_type == SBD_COMP_CPU); +#else + _NOTE(ARGUNUSED(bp)) + _NOTE(ARGUNUSED(cp)) +#endif + + return (1); +} + +static int +dr_errno2ecode(int error) +{ + int rv; + + switch (error) { + case EBUSY: + rv = ESBD_BUSY; + break; + case EINVAL: + rv = ESBD_INVAL; + break; + case EALREADY: + rv = ESBD_ALREADY; + break; + case ENODEV: + rv = ESBD_NODEV; + break; + case ENOMEM: + rv = ESBD_NOMEM; + break; + default: + rv = ESBD_INVAL; + } + + return (rv); +} + +/* + * On x86, the "clock-frequency" and cache size device properties may be + * unavailable before the CPU starts. If they are unavailable, just set + * them to zero. + */ +static void +dr_cpu_set_prop(dr_cpu_unit_t *cp) +{ + sbd_error_t *err; + dev_info_t *dip; + uint64_t clock_freq; + int ecache_size = 0; + char *cache_str = NULL; + + err = drmach_get_dip(cp->sbc_cm.sbdev_id, &dip); + if (err) { + DRERR_SET_C(&cp->sbc_cm.sbdev_error, &err); + return; + } + + if (dip == NULL) { + DR_DEV_INTERNAL_ERROR(&cp->sbc_cm); + return; + } + + /* read in the CPU speed */ + clock_freq = (unsigned int)ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "clock-frequency", 0); + + /* + * The ecache property string is not the same + * for all CPU implementations.
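+ * For example, Nehalem-EX publishes it as "l3-cache-size" (see + * the switch below); supporting a new implementation means + * adding its property name there.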
+ */ + switch (cp->sbc_cpu_impl) { + case X86_CPU_IMPL_NEHALEM_EX: + cache_str = "l3-cache-size"; + break; + default: + cmn_err(CE_WARN, "Unknown cpu implementation=0x%x", + cp->sbc_cpu_impl); + break; + } + + if (cache_str != NULL) { + /* read in the ecache size */ + /* + * If the property is not found in the CPU node, + * it has to be kept in the core or cmp node so + * we just keep looking. + */ + + ecache_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, + cache_str, 0); + } + + /* convert to the proper units */ + cp->sbc_speed = (clock_freq + 500000) / 1000000; + cp->sbc_ecache = ecache_size / (1024 * 1024); +} + +void +dr_init_cpu_unit(dr_cpu_unit_t *cp) +{ + sbd_error_t *err; + dr_state_t new_state; + int cpuid; + int impl; + + if (DR_DEV_IS_ATTACHED(&cp->sbc_cm)) { + new_state = DR_STATE_CONFIGURED; + cp->sbc_cm.sbdev_cond = SBD_COND_OK; + } else if (DR_DEV_IS_PRESENT(&cp->sbc_cm)) { + new_state = DR_STATE_CONNECTED; + cp->sbc_cm.sbdev_cond = SBD_COND_OK; + } else { + new_state = DR_STATE_EMPTY; + cp->sbc_cm.sbdev_cond = SBD_COND_UNKNOWN; + } + + if (DR_DEV_IS_PRESENT(&cp->sbc_cm)) { + err = drmach_cpu_get_id(cp->sbc_cm.sbdev_id, &cpuid); + if (err) { + DRERR_SET_C(&cp->sbc_cm.sbdev_error, &err); + new_state = DR_STATE_FATAL; + goto done; + } + + err = drmach_cpu_get_impl(cp->sbc_cm.sbdev_id, &impl); + if (err) { + DRERR_SET_C(&cp->sbc_cm.sbdev_error, &err); + new_state = DR_STATE_FATAL; + goto done; + } + } else { + cp->sbc_cpu_id = -1; + cp->sbc_cpu_impl = -1; + goto done; + } + + cp->sbc_cpu_id = cpuid; + cp->sbc_cpu_impl = impl; + + /* if true at init time, it must always be true */ + ASSERT(dr_cpu_unit_is_sane(cp->sbc_cm.sbdev_bp, cp)); + + mutex_enter(&cpu_lock); + if ((cpuid >= 0) && cpu[cpuid]) + cp->sbc_cpu_flags = cpu[cpuid]->cpu_flags; + else + cp->sbc_cpu_flags = P_OFFLINE | P_POWEROFF; + mutex_exit(&cpu_lock); + + dr_cpu_set_prop(cp); + +done: + /* delay transition until fully initialized */ + dr_device_transition(&cp->sbc_cm, new_state); +} + +int +dr_pre_attach_cpu(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + int i; + static fn_t f = "dr_pre_attach_cpu"; + + PR_CPU("%s...\n", f); + + for (i = 0; i < devnum; i++) { + dr_cpu_unit_t *up = (dr_cpu_unit_t *)devlist[i]; + + ASSERT(dr_cpu_unit_is_sane(hp->h_bd, up)); + + /* + * Print a console message for each attachment + * point. For CMP devices, this means that only + * one message should be printed, no matter how + * many cores are actually present. 
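+ * E.g., if MAX_CORES_PER_CMP were 4, only units 0, 4, 8, ... + * (the first core of each CMP) would print the message below.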
+ */ + if ((up->sbc_cm.sbdev_unum % MAX_CORES_PER_CMP) == 0) { + cmn_err(CE_CONT, "OS configure %s", + up->sbc_cm.sbdev_path); + } + } + + /* + * Block out status threads while creating + * devinfo tree branches + */ + dr_lock_status(hp->h_bd); + ndi_devi_enter(ddi_root_node(), (int *)(&hp->h_ndi)); + mutex_enter(&cpu_lock); + + return (0); +} + +/*ARGSUSED*/ +void +dr_attach_cpu(dr_handle_t *hp, dr_common_unit_t *cp) +{ + sbd_error_t *err; + processorid_t cpuid; + int rv; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + err = drmach_configure(cp->sbdev_id, 0); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + return; + } + + err = drmach_cpu_get_id(cp->sbdev_id, &cpuid); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + + err = drmach_unconfigure(cp->sbdev_id, DEVI_BRANCH_DESTROY); + if (err) + sbd_err_clear(&err); + } else if ((rv = cpu_configure(cpuid)) != 0) { + dr_dev_err(CE_WARN, cp, dr_errno2ecode(rv)); + err = drmach_unconfigure(cp->sbdev_id, DEVI_BRANCH_DESTROY); + if (err) + sbd_err_clear(&err); + } else { + dr_cpu_unit_t *up = (dr_cpu_unit_t *)cp; + up->sbc_cpu_id = cpuid; + } +} + +/* + * dr_post_attach_cpu + * + * sbd error policy: Does not stop on error. Processes all units in list. + */ +int +dr_post_attach_cpu(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + int i; + int errflag = 0; + static fn_t f = "dr_post_attach_cpu"; + + PR_CPU("%s...\n", f); + + /* Startup and online newly-attached CPUs */ + for (i = 0; i < devnum; i++) { + dr_cpu_unit_t *up = (dr_cpu_unit_t *)devlist[i]; + struct cpu *cp; + + ASSERT(dr_cpu_unit_is_sane(hp->h_bd, up)); + + cp = cpu_get(up->sbc_cpu_id); + if (cp == NULL) { + cmn_err(CE_WARN, "%s: cpu_get failed for cpu %d", + f, up->sbc_cpu_id); + continue; + } + + if (cpu_is_poweredoff(cp)) { + if (cpu_poweron(cp) != 0) { + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_CPUSTART); + errflag = 1; + } + PR_CPU("%s: cpu %d powered ON\n", f, up->sbc_cpu_id); + } + + if (cpu_is_offline(cp)) { + PR_CPU("%s: onlining cpu %d...\n", f, up->sbc_cpu_id); + + if (cpu_online(cp) != 0) { + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_ONLINE); + errflag = 1; + } + } + + } + + mutex_exit(&cpu_lock); + ndi_devi_exit(ddi_root_node(), hp->h_ndi); + dr_unlock_status(hp->h_bd); + + if (errflag) + return (-1); + else + return (0); +} + +/* + * dr_pre_release_cpu + * + * sbd error policy: Stops on first error. + */ +int +dr_pre_release_cpu(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + int c, cix, i, lastoffline = -1, rv = 0; + processorid_t cpuid; + struct cpu *cp; + dr_cpu_unit_t *up; + dr_devset_t devset; + sbd_dev_stat_t *ds; + static fn_t f = "dr_pre_release_cpu"; + int cpu_flags = 0; + + devset = DR_DEVS_PRESENT(hp->h_bd); + + /* allocate status struct storage. */ + ds = (sbd_dev_stat_t *) kmem_zalloc(sizeof (sbd_dev_stat_t) * + MAX_CPU_UNITS_PER_BOARD, KM_SLEEP); + + cix = dr_cpu_status(hp, devset, ds); + + mutex_enter(&cpu_lock); + + for (i = 0; i < devnum; i++) { + up = (dr_cpu_unit_t *)devlist[i]; + if (!DR_DEV_IS_ATTACHED(&up->sbc_cm)) { + continue; + } + ASSERT(dr_cpu_unit_is_sane(hp->h_bd, up)); + + /* + * On x86 systems, some CPUs can't be unconfigured. + * For example, CPU0 can't be unconfigured because many other + * components have a dependency on it. + * This check determines if a CPU is currently in use and + * returns a "Device busy" error if so. 
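+ * The busy indication comes from the dr_cpu_status() snapshot + * taken above, whose cs_busy field reflects both the unit's + * sbdev_busy flag and the platform's drmach_status() report.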
+ */ + for (c = 0; c < cix; c++) { + if (ds[c].d_cpu.cs_unit == up->sbc_cm.sbdev_unum) { + if (ds[c].d_cpu.cs_busy) { + dr_dev_err(CE_WARN, &up->sbc_cm, + ESBD_BUSY); + rv = -1; + break; + } + } + } + if (c < cix) + break; + + cpuid = up->sbc_cpu_id; + if ((cp = cpu_get(cpuid)) == NULL) { + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_OFFLINE); + rv = -1; + break; + } + + /* used by dr_cancel_cpu during error flow */ + up->sbc_cpu_flags = cp->cpu_flags; + + if (CPU_ACTIVE(cp)) { + if (dr_cmd_flags(hp) & SBD_FLAG_FORCE) + cpu_flags = CPU_FORCED; + + PR_CPU("%s: offlining cpu %d\n", f, cpuid); + if (cpu_offline(cp, cpu_flags)) { + PR_CPU("%s: failed to offline cpu %d\n", f, + cpuid); + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_OFFLINE); + if (disp_bound_threads(cp, 0)) { + cmn_err(CE_WARN, "%s: thread(s) bound " + "to cpu %d", f, cp->cpu_id); + } + rv = -1; + break; + } else + lastoffline = i; + } + + if (!rv) { + sbd_error_t *err; + + err = drmach_release(up->sbc_cm.sbdev_id); + if (err) { + DRERR_SET_C(&up->sbc_cm.sbdev_error, &err); + rv = -1; + break; + } + } + } + + mutex_exit(&cpu_lock); + + if (rv) { + /* + * Need to unwind others since at this level (pre-release) + * the device state has not yet transitioned and failures + * will prevent us from reaching the "post" release + * function where states are normally transitioned. + */ + for (i = lastoffline; i >= 0; i--) { + up = (dr_cpu_unit_t *)devlist[i]; + (void) dr_cancel_cpu(up); + } + } + + kmem_free(ds, sizeof (sbd_dev_stat_t) * MAX_CPU_UNITS_PER_BOARD); + return (rv); +} + +/* + * dr_pre_detach_cpu + * + * sbd error policy: Stops on first error. + */ +int +dr_pre_detach_cpu(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + _NOTE(ARGUNUSED(hp)) + + int i; + int cpu_flags = 0; + static fn_t f = "dr_pre_detach_cpu"; + + PR_CPU("%s...\n", f); + + /* + * Block out status threads while destroying devinfo tree + * branches + */ + dr_lock_status(hp->h_bd); + mutex_enter(&cpu_lock); + + for (i = 0; i < devnum; i++) { + dr_cpu_unit_t *up = (dr_cpu_unit_t *)devlist[i]; + struct cpu *cp; + + if (!DR_DEV_IS_ATTACHED(&up->sbc_cm)) { + continue; + } + + ASSERT(dr_cpu_unit_is_sane(hp->h_bd, up)); + + cp = cpu_get(up->sbc_cpu_id); + if (cp == NULL) + continue; + + /* + * Print a console message for each attachment + * point. For CMP devices, this means that only + * one message should be printed, no matter how + * many cores are actually present. + */ + if ((up->sbc_cm.sbdev_unum % MAX_CORES_PER_CMP) == 0) { + cmn_err(CE_CONT, "OS unconfigure %s\n", + up->sbc_cm.sbdev_path); + } + + /* + * CPUs were offlined during Release. + */ + if (cpu_is_poweredoff(cp)) { + PR_CPU("%s: cpu %d already powered OFF\n", + f, up->sbc_cpu_id); + continue; + } + + if (!cpu_is_offline(cp)) { + if (dr_cmd_flags(hp) & SBD_FLAG_FORCE) + cpu_flags = CPU_FORCED; + /* cpu was onlined after release. 
Offline it again */ + PR_CPU("%s: offlining cpu %d\n", f, up->sbc_cpu_id); + if (cpu_offline(cp, cpu_flags)) { + PR_CPU("%s: failed to offline cpu %d\n", + f, up->sbc_cpu_id); + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_OFFLINE); + if (disp_bound_threads(cp, 0)) { + cmn_err(CE_WARN, "%s: thread(s) bound " + "to cpu %d", f, cp->cpu_id); + } + goto err; + } + } + if (cpu_poweroff(cp) != 0) { + dr_dev_err(CE_WARN, &up->sbc_cm, ESBD_CPUSTOP); + goto err; + } else { + PR_CPU("%s: cpu %d powered OFF\n", f, up->sbc_cpu_id); + } + } + + return (0); + +err: + mutex_exit(&cpu_lock); + dr_unlock_status(hp->h_bd); + return (-1); +} + +/*ARGSUSED*/ +void +dr_detach_cpu(dr_handle_t *hp, dr_common_unit_t *cp) +{ + sbd_error_t *err; + processorid_t cpuid; + int rv; + dr_cpu_unit_t *up = (dr_cpu_unit_t *)cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!DR_DEV_IS_ATTACHED(&up->sbc_cm)) { + return; + } + + err = drmach_cpu_get_id(cp->sbdev_id, &cpuid); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + } else if ((rv = cpu_unconfigure(cpuid)) != 0) { + dr_dev_err(CE_IGNORE, cp, dr_errno2ecode(rv)); + } else { + err = drmach_unconfigure(cp->sbdev_id, DEVI_BRANCH_DESTROY); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + } else { + up->sbc_cpu_id = -1; + } + } +} + +/*ARGSUSED1*/ +int +dr_post_detach_cpu(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + static fn_t f = "dr_post_detach_cpu"; + + PR_CPU("%s...\n", f); + hp->h_ndi = 0; + + mutex_exit(&cpu_lock); + dr_unlock_status(hp->h_bd); + + return (0); +} + +static void +dr_fill_cpu_stat(dr_cpu_unit_t *cp, drmach_status_t *pstat, sbd_cpu_stat_t *csp) +{ + ASSERT(cp && pstat && csp); + + /* Fill in the common status information */ + bzero((caddr_t)csp, sizeof (*csp)); + csp->cs_type = cp->sbc_cm.sbdev_type; + csp->cs_unit = cp->sbc_cm.sbdev_unum; + (void) strlcpy(csp->cs_name, pstat->type, sizeof (csp->cs_name)); + csp->cs_cond = cp->sbc_cm.sbdev_cond; + csp->cs_busy = cp->sbc_cm.sbdev_busy | pstat->busy; + csp->cs_time = cp->sbc_cm.sbdev_time; + csp->cs_ostate = cp->sbc_cm.sbdev_ostate; + csp->cs_suspend = 0; + + /* CPU specific status data */ + csp->cs_cpuid = cp->sbc_cpu_id; + + /* + * If the speed and ecache properties have not been + * cached yet, read them in from the device tree. + */ + if ((cp->sbc_speed == 0) || (cp->sbc_ecache == 0)) + dr_cpu_set_prop(cp); + + /* use the cached speed and ecache values */ + csp->cs_speed = cp->sbc_speed; + csp->cs_ecache = cp->sbc_ecache; + + mutex_enter(&cpu_lock); + if (!cpu_get(csp->cs_cpuid)) { + /* ostate must be UNCONFIGURED */ + csp->cs_cm.c_ostate = SBD_STAT_UNCONFIGURED; + } + mutex_exit(&cpu_lock); +} + +/*ARGSUSED2*/ +static void +dr_fill_cmp_stat(sbd_cpu_stat_t *csp, int ncores, int impl, sbd_cmp_stat_t *psp) +{ + int core; + + ASSERT(csp && psp && (ncores >= 1)); + + bzero((caddr_t)psp, sizeof (*psp)); + + /* + * Fill in the common status information based + * on the data for the first core. + */ + psp->ps_type = SBD_COMP_CMP; + psp->ps_unit = DR_UNUM2SBD_UNUM(csp->cs_unit, SBD_COMP_CMP); + (void) strlcpy(psp->ps_name, csp->cs_name, sizeof (psp->ps_name)); + psp->ps_cond = csp->cs_cond; + psp->ps_busy = csp->cs_busy; + psp->ps_time = csp->cs_time; + psp->ps_ostate = csp->cs_ostate; + psp->ps_suspend = csp->cs_suspend; + + /* CMP specific status data */ + *psp->ps_cpuid = csp->cs_cpuid; + psp->ps_ncores = 1; + psp->ps_speed = csp->cs_speed; + psp->ps_ecache = csp->cs_ecache; + + /* + * Walk through the data for the remaining cores. 
+ * Make any adjustments to the common status data, + * or the shared CMP specific data if necessary. + */ + for (core = 1; core < ncores; core++) { + /* + * The following properties should be the same + * for all the cores of the CMP. + */ + ASSERT(psp->ps_unit == DR_UNUM2SBD_UNUM(csp[core].cs_unit, + SBD_COMP_CMP)); + + if (csp[core].cs_speed > psp->ps_speed) + psp->ps_speed = csp[core].cs_speed; + if (csp[core].cs_ecache > psp->ps_ecache) + psp->ps_ecache = csp[core].cs_ecache; + + psp->ps_cpuid[core] = csp[core].cs_cpuid; + psp->ps_ncores++; + + /* adjust time if necessary */ + if (csp[core].cs_time > psp->ps_time) { + psp->ps_time = csp[core].cs_time; + } + + psp->ps_busy |= csp[core].cs_busy; + + /* + * If any of the cores are configured, the + * entire CMP is marked as configured. + */ + if (csp[core].cs_ostate == SBD_STAT_CONFIGURED) { + psp->ps_ostate = csp[core].cs_ostate; + } + } +} + +int +dr_cpu_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp) +{ + int cmp; + int core; + int ncpu; + dr_board_t *bp; + sbd_cpu_stat_t *cstat; + int impl; + + bp = hp->h_bd; + ncpu = 0; + + devset &= DR_DEVS_PRESENT(bp); + cstat = kmem_zalloc(sizeof (sbd_cpu_stat_t) * MAX_CORES_PER_CMP, + KM_SLEEP); + + /* + * Treat every CPU as a CMP. In the case where the + * device is not a CMP, treat it as a CMP with only + * one core. + */ + for (cmp = 0; cmp < MAX_CMP_UNITS_PER_BOARD; cmp++) { + int ncores; + dr_cpu_unit_t *cp; + drmach_status_t pstat; + sbd_error_t *err; + sbd_cmp_stat_t *psp; + + if ((devset & DEVSET(SBD_COMP_CMP, cmp)) == 0) { + continue; + } + + ncores = 0; + + for (core = 0; core < MAX_CORES_PER_CMP; core++) { + + cp = dr_get_cpu_unit(bp, DR_CMP_CORE_UNUM(cmp, core)); + + if (cp->sbc_cm.sbdev_state == DR_STATE_EMPTY) { + /* present, but not fully initialized */ + continue; + } + + ASSERT(dr_cpu_unit_is_sane(hp->h_bd, cp)); + + /* skip if not present */ + if (cp->sbc_cm.sbdev_id == (drmachid_t)0) { + continue; + } + + /* fetch platform status */ + err = drmach_status(cp->sbc_cm.sbdev_id, &pstat); + if (err) { + DRERR_SET_C(&cp->sbc_cm.sbdev_error, &err); + continue; + } + + dr_fill_cpu_stat(cp, &pstat, &cstat[ncores++]); + /* + * We should set impl here because the last core + * found might be EMPTY or not present. + */ + impl = cp->sbc_cpu_impl; + } + + if (ncores == 0) { + continue; + } + + /* + * Store the data to the outgoing array. If the + * device is a CMP, combine all the data for the + * cores into a single stat structure. + * + * The check for a CMP device uses the last core + * found, assuming that all cores will have the + * same implementation. + */ + if (CPU_IMPL_IS_CMP(impl)) { + psp = (sbd_cmp_stat_t *)dsp; + dr_fill_cmp_stat(cstat, ncores, impl, psp); + } else { + ASSERT(ncores == 1); + bcopy(cstat, dsp, sizeof (sbd_cpu_stat_t)); + } + + dsp++; + ncpu++; + } + + kmem_free(cstat, sizeof (sbd_cpu_stat_t) * MAX_CORES_PER_CMP); + + return (ncpu); +} + +/* + * Cancel a previous release operation for a cpu. + * For cpus this means simply bringing back online + * any cpus that were taken offline. Note that they + * had to have been online at the time they were + * released. + */ +int +dr_cancel_cpu(dr_cpu_unit_t *up) +{ + int rv = 0; + static fn_t f = "dr_cancel_cpu"; + + ASSERT(dr_cpu_unit_is_sane(up->sbc_cm.sbdev_bp, up)); + + if (cpu_flagged_active(up->sbc_cpu_flags)) { + struct cpu *cp; + + /* + * The CPU had been online; go ahead + * and bring it back online.
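+ * (sbc_cpu_flags was snapshotted by dr_pre_release_cpu() + * before the cpu was taken offline, so it reflects the + * cpu's pre-release state.)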
+ */ + PR_CPU("%s: bringing cpu %d back ONLINE\n", f, up->sbc_cpu_id); + + mutex_enter(&cpu_lock); + cp = cpu[up->sbc_cpu_id]; + + if (cpu_is_poweredoff(cp)) { + if (cpu_poweron(cp)) { + cmn_err(CE_WARN, "%s: failed to power-on " + "cpu %d", f, up->sbc_cpu_id); + rv = -1; + } + } + + if (rv == 0 && cpu_is_offline(cp)) { + if (cpu_online(cp)) { + cmn_err(CE_WARN, "%s: failed to online cpu %d", + f, up->sbc_cpu_id); + rv = -1; + } + } + + if (rv == 0 && cpu_is_online(cp)) { + if (cpu_flagged_nointr(up->sbc_cpu_flags)) { + if (cpu_intr_disable(cp) != 0) { + cmn_err(CE_WARN, "%s: failed to " + "disable interrupts on cpu %d", f, + up->sbc_cpu_id); + } + } + } + + mutex_exit(&cpu_lock); + } + + return (rv); +} + +int +dr_disconnect_cpu(dr_cpu_unit_t *up) +{ + sbd_error_t *err; + static fn_t f = "dr_disconnect_cpu"; + + PR_CPU("%s...\n", f); + + ASSERT((up->sbc_cm.sbdev_state == DR_STATE_CONNECTED) || + (up->sbc_cm.sbdev_state == DR_STATE_UNCONFIGURED)); + + ASSERT(dr_cpu_unit_is_sane(up->sbc_cm.sbdev_bp, up)); + + if (up->sbc_cm.sbdev_state == DR_STATE_CONNECTED) { + /* + * Cpus were never brought in and so are still + * effectively disconnected, so nothing to do here. + */ + PR_CPU("%s: cpu %d never brought in\n", f, up->sbc_cpu_id); + return (0); + } + + err = drmach_cpu_disconnect(up->sbc_cm.sbdev_id); + if (err == NULL) + return (0); + else { + DRERR_SET_C(&up->sbc_cm.sbdev_error, &err); + return (-1); + } + /*NOTREACHED*/ +} diff --git a/usr/src/uts/i86pc/io/dr/dr_io.c b/usr/src/uts/i86pc/io/dr/dr_io.c new file mode 100644 index 0000000000..96e0e458cb --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr_io.c @@ -0,0 +1,389 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * I/O support routines for DR + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/dditypes.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ndi_impldefs.h> +#include <sys/kmem.h> +#include <sys/promif.h> +#include <sys/sysmacros.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> + +#include <sys/dr.h> +#include <sys/dr_util.h> +#include <sys/drmach.h> + +void +dr_init_io_unit(dr_io_unit_t *ip) +{ + dr_state_t new_state; + + if (DR_DEV_IS_ATTACHED(&ip->sbi_cm)) { + new_state = DR_STATE_CONFIGURED; + ip->sbi_cm.sbdev_cond = SBD_COND_OK; + } else if (DR_DEV_IS_PRESENT(&ip->sbi_cm)) { + new_state = DR_STATE_CONNECTED; + ip->sbi_cm.sbdev_cond = SBD_COND_OK; + } else { + new_state = DR_STATE_EMPTY; + } + dr_device_transition(&ip->sbi_cm, new_state); +} + +/*ARGSUSED*/ +void +dr_attach_io(dr_handle_t *hp, dr_common_unit_t *cp) +{ + sbd_error_t *err; + + dr_lock_status(hp->h_bd); + err = drmach_configure(cp->sbdev_id, 0); + dr_unlock_status(hp->h_bd); + + if (!err) + err = drmach_io_post_attach(cp->sbdev_id); + + if (err) + DRERR_SET_C(&cp->sbdev_error, &err); +} + +/* + * remove device nodes for the branch indicated by cp + */ +/*ARGSUSED*/ +void +dr_detach_io(dr_handle_t *hp, dr_common_unit_t *cp) +{ + sbd_error_t *err; + + err = drmach_unconfigure(cp->sbdev_id, 0); + + if (!err) + err = drmach_unconfigure(cp->sbdev_id, DEVI_BRANCH_DESTROY); + + if (!err) + err = drmach_io_post_release(cp->sbdev_id); + + if (err) { + dr_device_transition(cp, DR_STATE_CONFIGURED); + DRERR_SET_C(&cp->sbdev_error, &err); + } +} + +/*ARGSUSED*/ +int +dr_disconnect_io(dr_io_unit_t *ip) +{ + return (0); +} + +/*ARGSUSED*/ +int +dr_pre_attach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + int d; + + for (d = 0; d < devnum; d++) { + dr_common_unit_t *cp = devlist[d]; + + cmn_err(CE_CONT, "OS configure %s", cp->sbdev_path); + } + + return (0); +} + +/*ARGSUSED*/ +int +dr_post_attach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + return (0); +} + +static int +dr_check_io_refs(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + register int i, reftotal = 0; + static fn_t f = "dr_check_io_refs"; + + for (i = 0; i < devnum; i++) { + dr_io_unit_t *ip = (dr_io_unit_t *)devlist[i]; + dev_info_t *dip; + int ref; + int refcount_non_gldv3; + sbd_error_t *err; + + err = drmach_get_dip(ip->sbi_cm.sbdev_id, &dip); + if (err) + DRERR_SET_C(&ip->sbi_cm.sbdev_error, &err); + else if (dip != NULL) { + ref = 0; + refcount_non_gldv3 = 0; + ASSERT(e_ddi_branch_held(dip)); + dr_check_devices(dip, &ref, hp, NULL, NULL, + 0, &refcount_non_gldv3); + ASSERT(refcount_non_gldv3 >= 0); + ASSERT(ref >= refcount_non_gldv3); + /* + * Ignore reference counts of non-gldv3 network devices + * as Crossbow creates reference counts for non-active + * (unplumbed) instances. Reference count check in + * detach() known to prevent device from detaching + * as necessary. 
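+ * Those counts are therefore subtracted from the total before + * the busy check below.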
+ */ + ref -= refcount_non_gldv3; + hp->h_err = NULL; + if (ref) { + dr_dev_err(CE_WARN, &ip->sbi_cm, ESBD_BUSY); + } + PR_IO("%s: dip(%s) ref = %d\n", + f, ddi_get_name(dip), ref); + reftotal += ref; + } else { + PR_IO("%s: NO dip for id (0x%x)\n", + f, (uint_t)(uintptr_t)ip->sbi_cm.sbdev_id); + } + } + + return (reftotal); +} + +int +dr_pre_release_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + static fn_t f = "dr_pre_release_io"; + int d; + + ASSERT(devnum > 0); + + /* fail if any I/O device pre-release fails */ + for (d = 0; d < devnum; d++) { + dr_io_unit_t *ip = (dr_io_unit_t *)devlist[d]; + + if ((hp->h_err = drmach_io_pre_release( + ip->sbi_cm.sbdev_id)) != 0) { + return (-1); + } + } + + for (d = 0; d < devnum; d++) { + dr_io_unit_t *ip = (dr_io_unit_t *)devlist[d]; + sbd_error_t *err; + + err = drmach_release(ip->sbi_cm.sbdev_id); + if (err) { + DRERR_SET_C(&ip->sbi_cm.sbdev_error, + &err); + return (-1); + } + } + + /* fail if any I/O devices are still referenced */ + if (dr_check_io_refs(hp, devlist, devnum) > 0) { + PR_IO("%s: failed - I/O devices ref'd\n", f); + + /* recover before return error */ + for (d = 0; d < devnum; d++) { + dr_io_unit_t *ip = (dr_io_unit_t *)devlist[d]; + sbd_error_t *err; + err = drmach_io_unrelease(ip->sbi_cm.sbdev_id); + if (err) { + DRERR_SET_C(&ip->sbi_cm.sbdev_error, &err); + return (-1); + } + } + return (-1); + } + return (0); +} + +/*ARGSUSED*/ +int +dr_pre_detach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum) +{ + int d; + + ASSERT(devnum > 0); + + for (d = 0; d < devnum; d++) { + dr_common_unit_t *cp = devlist[d]; + + cmn_err(CE_CONT, "OS unconfigure %s", cp->sbdev_path); + } + + return (0); +} + +/*ARGSUSED*/ +int +dr_post_detach_io(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + register int i; + int rv = 0; + static fn_t f = "dr_post_detach_io"; + + ASSERT(devnum > 0); + for (i = 0; i < devnum; i++) { + dr_common_unit_t *cp = devlist[i]; + if (cp->sbdev_error != NULL) { + PR_IO("%s: Failed\n", f); + rv = -1; + break; + } + } + return (rv); +} + +static void +dr_get_comp_cond(dr_io_unit_t *ip, dev_info_t *dip) +{ + if (dip == NULL) { + ip->sbi_cm.sbdev_cond = SBD_COND_UNKNOWN; + return; + } + + if (DEVI(dip)->devi_flags & DEVI_RETIRED) { + ip->sbi_cm.sbdev_cond = SBD_COND_FAILED; + return; + } + + if (DR_DEV_IS_ATTACHED(&ip->sbi_cm)) { + ip->sbi_cm.sbdev_cond = SBD_COND_OK; + } else if (DR_DEV_IS_PRESENT(&ip->sbi_cm)) { + ip->sbi_cm.sbdev_cond = SBD_COND_OK; + } +} + +int +dr_io_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp) +{ + int i, ix; + dr_board_t *bp; + sbd_io_stat_t *isp; + dr_io_unit_t *ip; + + bp = hp->h_bd; + + /* + * Only look for requested devices that are actually present. 
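+ * (devset is intersected with the board's present-device mask + * before the units are walked).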
+ */ + devset &= DR_DEVS_PRESENT(bp); + + for (i = ix = 0; i < MAX_IO_UNITS_PER_BOARD; i++) { + drmachid_t id; + dev_info_t *dip; + sbd_error_t *err; + drmach_status_t pstat; + + if (DEVSET_IN_SET(devset, SBD_COMP_IO, i) == 0) + continue; + + ip = dr_get_io_unit(bp, i); + + if (ip->sbi_cm.sbdev_state == DR_STATE_EMPTY) { + /* present, but not fully initialized */ + continue; + } + + id = ip->sbi_cm.sbdev_id; + if (id == (drmachid_t)0) + continue; + + err = drmach_status(ip->sbi_cm.sbdev_id, &pstat); + if (err) { + DRERR_SET_C(&ip->sbi_cm.sbdev_error, &err); + return (-1); + } + + dip = NULL; + err = drmach_get_dip(id, &dip); + if (err) { + /* catch this in debug kernels */ + ASSERT(0); + + sbd_err_clear(&err); + continue; + } + + isp = &dsp->d_io; + bzero((caddr_t)isp, sizeof (*isp)); + + isp->is_cm.c_id.c_type = ip->sbi_cm.sbdev_type; + isp->is_cm.c_id.c_unit = ip->sbi_cm.sbdev_unum; + (void) strlcpy(isp->is_cm.c_id.c_name, pstat.type, + sizeof (isp->is_cm.c_id.c_name)); + + dr_get_comp_cond(ip, dip); + isp->is_cm.c_cond = ip->sbi_cm.sbdev_cond; + isp->is_cm.c_busy = ip->sbi_cm.sbdev_busy | pstat.busy; + isp->is_cm.c_time = ip->sbi_cm.sbdev_time; + isp->is_cm.c_ostate = ip->sbi_cm.sbdev_ostate; + isp->is_cm.c_sflags = 0; + + if (dip == NULL) { + isp->is_pathname[0] = '\0'; + isp->is_referenced = 0; + isp->is_unsafe_count = 0; + } else { + int refcount = 0, idx = 0; + uint64_t unsafe_devs[SBD_MAX_UNSAFE]; + + ASSERT(e_ddi_branch_held(dip)); + (void) ddi_pathname(dip, isp->is_pathname); + + /* check reference and unsafe counts on devices */ + isp->is_unsafe_count = 0; + dr_check_devices(dip, &refcount, hp, unsafe_devs, + &idx, SBD_MAX_UNSAFE, NULL); + while (idx > 0) { + isp->is_unsafe_list[idx-1] = unsafe_devs[idx-1]; + --idx; + } + + isp->is_referenced = (refcount == 0) ? 0 : 1; + + hp->h_err = NULL; + } + ix++; + dsp++; + } + + return (ix); +} diff --git a/usr/src/uts/i86pc/io/dr/dr_mem_acpi.c b/usr/src/uts/i86pc/io/dr/dr_mem_acpi.c new file mode 100644 index 0000000000..90ceb84044 --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr_mem_acpi.c @@ -0,0 +1,639 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* + * DR memory support routines. 
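+ * This is the ACPI (i86pc) flavor: the attach and status paths + * are implemented, while the detach, release, and cancel paths + * below are stubs that do nothing or simply fail.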
+ */ + +#include <sys/note.h> +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/kobj.h> +#include <sys/conf.h> +#include <sys/dditypes.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/ddi_impldefs.h> +#include <sys/ndi_impldefs.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/promif.h> +#include <sys/lgrp.h> +#include <sys/mem_config.h> +#include <vm/seg_kmem.h> +#include <vm/page.h> + +#include <sys/dr.h> +#include <sys/dr_util.h> +#include <sys/drmach.h> + +extern struct memlist *phys_install; + +/* TODO: push this reference below drmach line */ +extern int kcage_on; + +/* for the DR*INTERNAL_ERROR macros. see sys/dr.h. */ +static char *dr_ie_fmt = "dr_mem_acpi.c %d"; + +static void dr_init_mem_unit_data(dr_mem_unit_t *mp); + +/* + * dr_mem_unit_t.sbm_flags + */ +#define DR_MFLAG_RESERVED 0x01 /* mem unit reserved for delete */ +#define DR_MFLAG_SOURCE 0x02 /* source brd of copy/rename op */ +#define DR_MFLAG_TARGET 0x04 /* target brd of copy/rename op */ +#define DR_MFLAG_RELOWNER 0x20 /* memory release (delete) owner */ +#define DR_MFLAG_RELDONE 0x40 /* memory release (delete) done */ + +/* helper macros */ +#define _ptob64(p) ((uint64_t)(p) << PAGESHIFT) +#define _b64top(b) ((pgcnt_t)((b) >> PAGESHIFT)) + +static struct memlist * +dr_get_memlist(dr_mem_unit_t *mp) +{ + struct memlist *mlist = NULL; + sbd_error_t *err; + static fn_t f = "dr_get_memlist"; + + PR_MEM("%s for %s...\n", f, mp->sbm_cm.sbdev_path); + + /* + * Return cached memlist, if present. + * This memlist will be present following an + * unconfigure (a.k.a. detach) of this memunit. + * It should only be used in the case where a configure + * is bringing this memunit back in without going + * through the disconnect and connect states.
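+ * (The cached copy is left behind by dr_post_detach_mem and is + * destroyed by dr_post_attach_mem once it has been reused.)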
+ */ + if (mp->sbm_mlist) { + PR_MEM("%s: found cached memlist\n", f); + + mlist = memlist_dup(mp->sbm_mlist); + } else { + uint64_t basepa = _ptob64(mp->sbm_basepfn); + + /* attempt to construct a memlist using phys_install */ + + /* round down to slice base address */ + basepa &= ~mp->sbm_alignment_mask; + + /* get a copy of phys_install to edit */ + memlist_read_lock(); + mlist = memlist_dup(phys_install); + memlist_read_unlock(); + + /* trim lower irrelevant span */ + if (mlist) + mlist = memlist_del_span(mlist, 0ull, basepa); + + /* trim upper irrelevant span */ + if (mlist) { + uint64_t endpa, toppa; + + toppa = mp->sbm_slice_top; + endpa = _ptob64(physmax + 1); + if (endpa > toppa) + mlist = memlist_del_span( + mlist, toppa, + endpa - toppa); + } + + if (mlist) { + /* successfully built a memlist */ + PR_MEM("%s: derived memlist from phys_install\n", f); + } + + /* if no mlist yet, try platform layer */ + if (!mlist) { + err = drmach_mem_get_memlist( + mp->sbm_cm.sbdev_id, &mlist); + if (err) { + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + mlist = NULL; /* paranoia */ + } + } + } + + PR_MEM("%s: memlist for %s\n", f, mp->sbm_cm.sbdev_path); + PR_MEMLIST_DUMP(mlist); + + return (mlist); +} + +/*ARGSUSED*/ +void +dr_release_mem(dr_common_unit_t *cp) +{ +} + +void +dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp) +{ + dr_mem_unit_t *mp = (dr_mem_unit_t *)cp; + struct memlist *ml, *mc; + sbd_error_t *err; + static fn_t f = "dr_attach_mem"; + uint64_t dr_physmax; + + PR_MEM("%s...\n", f); + + dr_lock_status(hp->h_bd); + err = drmach_configure(cp->sbdev_id, 0); + dr_unlock_status(hp->h_bd); + if (err) { + DRERR_SET_C(&cp->sbdev_error, &err); + return; + } + + ml = dr_get_memlist(mp); + + /* Skip memory with address above plat_dr_physmax or kpm_size */ + dr_physmax = plat_dr_physmax ? ptob(plat_dr_physmax) : UINT64_MAX; + if (kpm_size < dr_physmax) + dr_physmax = kpm_size; + ml = memlist_del_span(ml, dr_physmax, UINT64_MAX - dr_physmax); + + for (mc = ml; mc; mc = mc->ml_next) { + int rv; + sbd_error_t *err; + + rv = kphysm_add_memory_dynamic( + (pfn_t)btop(mc->ml_address), + (pgcnt_t)btop(mc->ml_size)); + if (rv != KPHYSM_OK) { + /* + * translate kphysm error and + * store in devlist error + */ + switch (rv) { + case KPHYSM_ERESOURCE: + rv = ESBD_NOMEM; + break; + + case KPHYSM_EFAULT: + rv = ESBD_FAULT; + break; + + default: + rv = ESBD_INTERNAL; + break; + } + + if (rv == ESBD_INTERNAL) { + DR_DEV_INTERNAL_ERROR(&mp->sbm_cm); + } else + dr_dev_err(CE_WARN, &mp->sbm_cm, rv); + break; + } + + err = drmach_mem_add_span( + mp->sbm_cm.sbdev_id, mc->ml_address, mc->ml_size); + if (err) { + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + break; + } + } + + memlist_delete(ml); + dr_init_mem_unit_data(mp); + + /* back out if configure failed */ + if (mp->sbm_cm.sbdev_error != NULL) { + dr_lock_status(hp->h_bd); + err = drmach_unconfigure(cp->sbdev_id, 0); + if (err) + sbd_err_clear(&err); + dr_unlock_status(hp->h_bd); + } +} + +/*ARGSUSED*/ +void +dr_detach_mem(dr_handle_t *hp, dr_common_unit_t *cp) +{ +} + +/* + * This routine acts as a wrapper for kphysm_del_span_query in order to + * support potential memory holes in a board's physical address space. + * It calls kphysm_del_span_query for each node in a memlist and accumulates + * the results in *mp. 
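+ * In this ACPI implementation the per-node query is not wired + * up yet; the routine below only zeroes the result fields.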
+ */ +static int +dr_del_mlist_query(struct memlist *mlist, memquery_t *mp) +{ + int rv = 0; + + if (mlist == NULL) + cmn_err(CE_WARN, "dr_del_mlist_query: mlist=NULL\n"); + + mp->phys_pages = 0; + mp->managed = 0; + mp->nonrelocatable = 0; + mp->first_nonrelocatable = 0; + mp->last_nonrelocatable = 0; + + return (rv); +} + +/* + * NOTE: This routine is only partially smart about multiple + * mem-units. Need to make mem-status structure smart + * about them also. + */ +int +dr_mem_status(dr_handle_t *hp, dr_devset_t devset, sbd_dev_stat_t *dsp) +{ + int m, mix; + memquery_t mq; + dr_board_t *bp; + dr_mem_unit_t *mp; + sbd_mem_stat_t *msp; + static fn_t f = "dr_mem_status"; + + bp = hp->h_bd; + devset &= DR_DEVS_PRESENT(bp); + + for (m = mix = 0; m < MAX_MEM_UNITS_PER_BOARD; m++) { + int rv; + sbd_error_t *err; + drmach_status_t pstat; + dr_mem_unit_t *p_mp; + + if (DEVSET_IN_SET(devset, SBD_COMP_MEM, m) == 0) + continue; + + mp = dr_get_mem_unit(bp, m); + + if (mp->sbm_cm.sbdev_state == DR_STATE_EMPTY) { + /* present, but not fully initialized */ + continue; + } + + if (mp->sbm_cm.sbdev_id == (drmachid_t)0) + continue; + + /* fetch platform status */ + err = drmach_status(mp->sbm_cm.sbdev_id, &pstat); + if (err) { + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + continue; + } + + msp = &dsp->d_mem; + bzero((caddr_t)msp, sizeof (*msp)); + + (void) strlcpy(msp->ms_cm.c_id.c_name, pstat.type, + sizeof (msp->ms_cm.c_id.c_name)); + msp->ms_cm.c_id.c_type = mp->sbm_cm.sbdev_type; + msp->ms_cm.c_id.c_unit = mp->sbm_cm.sbdev_unum; + msp->ms_cm.c_cond = mp->sbm_cm.sbdev_cond; + msp->ms_cm.c_busy = mp->sbm_cm.sbdev_busy | pstat.busy; + msp->ms_cm.c_time = mp->sbm_cm.sbdev_time; + msp->ms_cm.c_ostate = mp->sbm_cm.sbdev_ostate; + + msp->ms_totpages = mp->sbm_npages; + msp->ms_basepfn = mp->sbm_basepfn; + msp->ms_pageslost = mp->sbm_pageslost; + msp->ms_cage_enabled = kcage_on; + + if (mp->sbm_flags & DR_MFLAG_RESERVED) + p_mp = mp->sbm_peer; + else + p_mp = NULL; + + if (p_mp == NULL) { + msp->ms_peer_is_target = 0; + msp->ms_peer_ap_id[0] = '\0'; + } else if (p_mp->sbm_flags & DR_MFLAG_RESERVED) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + char *minor; + + /* + * b_dip doesn't have to be held for ddi_pathname() + * because the board struct (dr_board_t) will be + * destroyed before b_dip detaches. + */ + (void) ddi_pathname(bp->b_dip, path); + minor = strchr(p_mp->sbm_cm.sbdev_path, ':'); + + (void) snprintf(msp->ms_peer_ap_id, + sizeof (msp->ms_peer_ap_id), "%s%s", + path, (minor == NULL) ? "" : minor); + + kmem_free(path, MAXPATHLEN); + + if (p_mp->sbm_flags & DR_MFLAG_TARGET) + msp->ms_peer_is_target = 1; + } + + /* + * kphysm_del_span_query can report non-reloc pages = total + * pages for memory that is not yet configured + */ + if (mp->sbm_cm.sbdev_state != DR_STATE_UNCONFIGURED) { + struct memlist *ml; + + ml = dr_get_memlist(mp); + rv = ml ? 
dr_del_mlist_query(ml, &mq) : -1; + memlist_delete(ml); + + if (rv == KPHYSM_OK) { + msp->ms_managed_pages = mq.managed; + msp->ms_noreloc_pages = mq.nonrelocatable; + msp->ms_noreloc_first = + mq.first_nonrelocatable; + msp->ms_noreloc_last = + mq.last_nonrelocatable; + msp->ms_cm.c_sflags = 0; + if (mq.nonrelocatable && + drmach_copy_rename_need_suspend( + mp->sbm_cm.sbdev_id)) { + SBD_SET_SUSPEND(SBD_CMD_UNCONFIGURE, + msp->ms_cm.c_sflags); + } + } else { + PR_MEM("%s: kphysm_del_span_query() = %d\n", + f, rv); + } + } + + /* + * Check source unit state during copy-rename + */ + if ((mp->sbm_flags & DR_MFLAG_SOURCE) && + (mp->sbm_cm.sbdev_state == DR_STATE_UNREFERENCED || + mp->sbm_cm.sbdev_state == DR_STATE_RELEASE)) + msp->ms_cm.c_ostate = SBD_STAT_CONFIGURED; + + mix++; + dsp++; + } + + return (mix); +} + +/*ARGSUSED*/ +int +dr_pre_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + int err_flag = 0; + int d; + sbd_error_t *err; + static fn_t f = "dr_pre_attach_mem"; + + PR_MEM("%s...\n", f); + + for (d = 0; d < devnum; d++) { + dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d]; + dr_state_t state; + + cmn_err(CE_CONT, "OS configure %s", mp->sbm_cm.sbdev_path); + + state = mp->sbm_cm.sbdev_state; + switch (state) { + case DR_STATE_UNCONFIGURED: + PR_MEM("%s: recovering from UNCONFIG for %s\n", + f, mp->sbm_cm.sbdev_path); + + /* use memlist cached by dr_post_detach_mem_unit */ + ASSERT(mp->sbm_mlist != NULL); + PR_MEM("%s: re-configuring cached memlist for %s:\n", + f, mp->sbm_cm.sbdev_path); + PR_MEMLIST_DUMP(mp->sbm_mlist); + + /* kphysm del handle should have been freed */ + ASSERT((mp->sbm_flags & DR_MFLAG_RELOWNER) == 0); + + /*FALLTHROUGH*/ + + case DR_STATE_CONNECTED: + PR_MEM("%s: reprogramming mem hardware on %s\n", + f, mp->sbm_cm.sbdev_bp->b_path); + + PR_MEM("%s: enabling %s\n", + f, mp->sbm_cm.sbdev_path); + + err = drmach_mem_enable(mp->sbm_cm.sbdev_id); + if (err) { + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + err_flag = 1; + } + break; + + default: + dr_dev_err(CE_WARN, &mp->sbm_cm, ESBD_STATE); + err_flag = 1; + break; + } + + /* exit the loop if an error was encountered */ + if (err_flag) + break; + } + + return (err_flag ? -1 : 0); +} + +/*ARGSUSED*/ +int +dr_post_attach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + int d; + static fn_t f = "dr_post_attach_mem"; + + PR_MEM("%s...\n", f); + + for (d = 0; d < devnum; d++) { + dr_mem_unit_t *mp = (dr_mem_unit_t *)devlist[d]; + struct memlist *mlist, *ml; + + mlist = dr_get_memlist(mp); + + /* + * Verify the memory really did successfully attach + * by checking for its existence in phys_install. + */ + memlist_read_lock(); + if (memlist_intersect(phys_install, mlist) == 0) { + memlist_read_unlock(); + + DR_DEV_INTERNAL_ERROR(&mp->sbm_cm); + + PR_MEM("%s: %s memlist not in phys_install", + f, mp->sbm_cm.sbdev_path); + + memlist_delete(mlist); + continue; + } + memlist_read_unlock(); + + for (ml = mlist; ml != NULL; ml = ml->ml_next) { + sbd_error_t *err; + + err = drmach_mem_add_span( + mp->sbm_cm.sbdev_id, + ml->ml_address, + ml->ml_size); + if (err) + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + } + + memlist_delete(mlist); + + /* + * Destroy cached memlist, if any. + * There will be a cached memlist in sbm_mlist if + * this board is being configured directly after + * an unconfigure. + * To support this transition, dr_post_detach_mem + * left a copy of the last known memlist in sbm_mlist.
+ * This memlist could differ from any derived from + * hardware if while this memunit was last configured + * the system detected and deleted bad pages from + * phys_install. The location of those bad pages + * will be reflected in the cached memlist. + */ + if (mp->sbm_mlist) { + memlist_delete(mp->sbm_mlist); + mp->sbm_mlist = NULL; + } + } + + return (0); +} + +/*ARGSUSED*/ +int +dr_pre_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + return (-1); +} + +/*ARGSUSED*/ +int +dr_post_detach_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + return (-1); +} + +/* + * Successful return from this function will have the memory + * handle in bp->b_dev[..mem-unit...].sbm_memhandle allocated + * and waiting. This routine's job is to select the memory that + * actually has to be released (detached) which may not necessarily + * be the same memory node that came in in devlist[], + * i.e. a copy-rename is needed. + */ +/*ARGSUSED*/ +int +dr_pre_release_mem(dr_handle_t *hp, dr_common_unit_t **devlist, int devnum) +{ + return (-1); +} + +/*ARGSUSED*/ +void +dr_release_mem_done(dr_common_unit_t *cp) +{ +} + +/*ARGSUSED*/ +int +dr_disconnect_mem(dr_mem_unit_t *mp) +{ + return (-1); +} + +/*ARGSUSED*/ +int +dr_cancel_mem(dr_mem_unit_t *s_mp) +{ + return (-1); +} + +void +dr_init_mem_unit(dr_mem_unit_t *mp) +{ + dr_state_t new_state; + + if (DR_DEV_IS_ATTACHED(&mp->sbm_cm)) { + new_state = DR_STATE_CONFIGURED; + mp->sbm_cm.sbdev_cond = SBD_COND_OK; + } else if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) { + new_state = DR_STATE_CONNECTED; + mp->sbm_cm.sbdev_cond = SBD_COND_OK; + } else if (mp->sbm_cm.sbdev_id != (drmachid_t)0) { + new_state = DR_STATE_OCCUPIED; + } else { + new_state = DR_STATE_EMPTY; + } + + if (DR_DEV_IS_PRESENT(&mp->sbm_cm)) + dr_init_mem_unit_data(mp); + + /* delay transition until fully initialized */ + dr_device_transition(&mp->sbm_cm, new_state); +} + +static void +dr_init_mem_unit_data(dr_mem_unit_t *mp) +{ + drmachid_t id = mp->sbm_cm.sbdev_id; + drmach_mem_info_t minfo; + sbd_error_t *err; + static fn_t f = "dr_init_mem_unit_data"; + + PR_MEM("%s...\n", f); + + /* a little sanity checking */ + ASSERT(mp->sbm_peer == NULL); + ASSERT(mp->sbm_flags == 0); + + if (err = drmach_mem_get_info(id, &minfo)) { + DRERR_SET_C(&mp->sbm_cm.sbdev_error, &err); + return; + } + mp->sbm_basepfn = _b64top(minfo.mi_basepa); + mp->sbm_npages = _b64top(minfo.mi_size); + mp->sbm_alignment_mask = minfo.mi_alignment_mask; + mp->sbm_slice_base = minfo.mi_slice_base; + mp->sbm_slice_top = minfo.mi_slice_top; + mp->sbm_slice_size = minfo.mi_slice_size; + + PR_MEM("%s: %s (basepfn = 0x%lx, npgs = %ld)\n", + f, mp->sbm_cm.sbdev_path, mp->sbm_basepfn, mp->sbm_npages); +} diff --git a/usr/src/uts/i86pc/io/dr/dr_quiesce.c b/usr/src/uts/i86pc/io/dr/dr_quiesce.c new file mode 100644 index 0000000000..f3467f6eb1 --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr_quiesce.c @@ -0,0 +1,1050 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * A CPR derivative specifically for starfire/starcat. + * X86 doesn't make use of the quiesce interfaces; it's kept for simplicity. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/machparam.h> +#include <sys/machsystm.h> +#include <sys/ddi.h> +#define SUNDDI_IMPL +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/devctl.h> +#include <sys/time.h> +#include <sys/kmem.h> +#include <nfs/lm.h> +#include <sys/ddi_impldefs.h> +#include <sys/ndi_impldefs.h> +#include <sys/obpdefs.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/callb.h> +#include <sys/clock.h> +#include <sys/x_call.h> +#include <sys/cpuvar.h> +#include <sys/epm.h> +#include <sys/vfs.h> +#include <sys/promif.h> +#include <sys/conf.h> +#include <sys/cyclic.h> + +#include <sys/dr.h> +#include <sys/dr_util.h> + +extern void e_ddi_enter_driver_list(struct devnames *dnp, int *listcnt); +extern void e_ddi_exit_driver_list(struct devnames *dnp, int listcnt); +extern int is_pseudo_device(dev_info_t *dip); + +extern kmutex_t cpu_lock; +extern dr_unsafe_devs_t dr_unsafe_devs; + +static int dr_is_real_device(dev_info_t *dip); +static int dr_is_unsafe_major(major_t major); +static int dr_bypass_device(char *dname); +static int dr_check_dip(dev_info_t *dip, void *arg, uint_t ref); +static int dr_resolve_devname(dev_info_t *dip, char *buffer, + char *alias); +static sbd_error_t *drerr_int(int e_code, uint64_t *arr, int idx, + int majors); +static int dr_add_int(uint64_t *arr, int idx, int len, + uint64_t val); + +int dr_pt_test_suspend(dr_handle_t *hp); + +/* + * dr_quiesce.c interface + * NOTE: states used internally by dr_suspend and dr_resume + */ +typedef enum dr_suspend_state { + DR_SRSTATE_BEGIN = 0, + DR_SRSTATE_USER, + DR_SRSTATE_DRIVER, + DR_SRSTATE_FULL +} suspend_state_t; + +struct dr_sr_handle { + dr_handle_t *sr_dr_handlep; + dev_info_t *sr_failed_dip; + suspend_state_t sr_suspend_state; + uint_t sr_flags; + uint64_t sr_err_ints[DR_MAX_ERR_INT]; + int sr_err_idx; +}; + +#define SR_FLAG_WATCHDOG 0x1 + +/* + * XXX + * This hack will go away before RTI. Just for testing. + * List of drivers to bypass when performing a suspend. + */ +static char *dr_bypass_list[] = { + "" +}; + + +#define SKIP_SYNC /* bypass sync ops in dr_suspend */ + +/* + * dr_skip_user_threads is used to control if user threads should + * be suspended. If dr_skip_user_threads is true, the rest of the + * flags are not used; if it is false, dr_check_user_stop_result + * will be used to control whether or not we need to check suspend + * result, and dr_allow_blocked_threads will be used to control + * whether or not we allow suspend to continue if there are blocked + * threads. We allow all combinations of dr_check_user_stop_result + * and dr_allow_blocked_threads, even though it might not make much + * sense to not allow blocked threads when we don't even check stop + * result.
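+ * For example, the defaults below (skip = 0, check = 1, + * allow-blocked = 1) stop user threads, fail the suspend if a + * thread refuses to stop, but tolerate threads that are merely + * blocked in the kernel (see DR_VSTOPPED).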
+ */ +static int dr_skip_user_threads = 0; /* default to FALSE */ +static int dr_check_user_stop_result = 1; /* default to TRUE */ +static int dr_allow_blocked_threads = 1; /* default to TRUE */ + +#define DR_CPU_LOOP_MSEC 1000 + +static void +dr_stop_intr(void) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + kpreempt_disable(); + cyclic_suspend(); +} + +static void +dr_enable_intr(void) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + cyclic_resume(); + kpreempt_enable(); +} + +dr_sr_handle_t * +dr_get_sr_handle(dr_handle_t *hp) +{ + dr_sr_handle_t *srh; + + srh = GETSTRUCT(dr_sr_handle_t, 1); + srh->sr_dr_handlep = hp; + + return (srh); +} + +void +dr_release_sr_handle(dr_sr_handle_t *srh) +{ + ASSERT(srh->sr_failed_dip == NULL); + FREESTRUCT(srh, dr_sr_handle_t, 1); +} + +static int +dr_is_real_device(dev_info_t *dip) +{ + struct regspec *regbuf = NULL; + int length = 0; + int rc; + + if (ddi_get_driver(dip) == NULL) + return (0); + + if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR)) + return (1); + if (DEVI(dip)->devi_pm_flags & PMC_NO_SR) + return (0); + + /* + * now the general case + */ + rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", + (caddr_t)&regbuf, &length); + ASSERT(rc != DDI_PROP_NO_MEMORY); + if (rc != DDI_PROP_SUCCESS) { + return (0); + } else { + if ((length > 0) && (regbuf != NULL)) + kmem_free(regbuf, length); + return (1); + } +} + +static int +dr_is_unsafe_major(major_t major) +{ + char *dname, **cpp; + int i, ndevs; + + if ((dname = ddi_major_to_name(major)) == NULL) { + PR_QR("dr_is_unsafe_major: invalid major # %d\n", major); + return (0); + } + + ndevs = dr_unsafe_devs.ndevs; + for (i = 0, cpp = dr_unsafe_devs.devnames; i < ndevs; i++) { + if (strcmp(dname, *cpp++) == 0) + return (1); + } + return (0); +} + +static int +dr_bypass_device(char *dname) +{ + int i; + char **lname; + /* check the bypass list */ + for (i = 0, lname = &dr_bypass_list[i]; **lname != '\0'; lname++) { + if (strcmp(dname, dr_bypass_list[i++]) == 0) + return (1); + } + return (0); +} + +static int +dr_resolve_devname(dev_info_t *dip, char *buffer, char *alias) +{ + major_t devmajor; + char *aka, *name; + + *buffer = *alias = 0; + + if (dip == NULL) + return (-1); + + if ((name = ddi_get_name(dip)) == NULL) + name = "<null name>"; + + aka = name; + + if ((devmajor = ddi_name_to_major(aka)) != DDI_MAJOR_T_NONE) + aka = ddi_major_to_name(devmajor); + + (void) strcpy(buffer, name); + + if (strcmp(name, aka)) + (void) strcpy(alias, aka); + else + *alias = 0; + + return (0); +} + +struct dr_ref { + int *refcount; + int *refcount_non_gldv3; + uint64_t *arr; + int *idx; + int len; +}; + +/* ARGSUSED */ +static int +dr_check_dip(dev_info_t *dip, void *arg, uint_t ref) +{ + major_t major; + char *dname; + struct dr_ref *rp = (struct dr_ref *)arg; + + if (dip == NULL) + return (DDI_WALK_CONTINUE); + + if (!dr_is_real_device(dip)) + return (DDI_WALK_CONTINUE); + + dname = ddi_binding_name(dip); + + if (dr_bypass_device(dname)) + return (DDI_WALK_CONTINUE); + + if (dname && ((major = ddi_name_to_major(dname)) != (major_t)-1)) { + if (ref && rp->refcount) { + *rp->refcount += ref; + PR_QR("\n %s (major# %d) is referenced(%u)\n", dname, + major, ref); + } + if (ref && rp->refcount_non_gldv3) { + if (NETWORK_PHYSDRV(major) && !GLDV3_DRV(major)) + *rp->refcount_non_gldv3 += ref; + } + if (dr_is_unsafe_major(major) && i_ddi_devi_attached(dip)) { + PR_QR("\n %s (major# %d) not hotpluggable\n", dname, + major); + if (rp->arr != NULL && rp->idx != NULL) + *rp->idx = dr_add_int(rp->arr, *rp->idx, +
rp->len, (uint64_t)major); + } + } + return (DDI_WALK_CONTINUE); +} + +static int +dr_check_unsafe_major(dev_info_t *dip, void *arg) +{ + return (dr_check_dip(dip, arg, 0)); +} + + +/*ARGSUSED*/ +void +dr_check_devices(dev_info_t *dip, int *refcount, dr_handle_t *handle, + uint64_t *arr, int *idx, int len, int *refcount_non_gldv3) +{ + struct dr_ref bref = {0}; + + if (dip == NULL) + return; + + bref.refcount = refcount; + bref.refcount_non_gldv3 = refcount_non_gldv3; + bref.arr = arr; + bref.idx = idx; + bref.len = len; + + ASSERT(e_ddi_branch_held(dip)); + (void) e_ddi_branch_referenced(dip, dr_check_dip, &bref); +} + +/* + * The "dip" argument's parent (if it exists) must be held busy. + */ +static int +dr_suspend_devices(dev_info_t *dip, dr_sr_handle_t *srh) +{ + dr_handle_t *handle; + major_t major; + char *dname; + int circ; + + /* + * If dip is the root node, it has no siblings and it is + * always held. If dip is not the root node, dr_suspend_devices() + * will be invoked with the parent held busy. + */ + for (; dip != NULL; dip = ddi_get_next_sibling(dip)) { + char d_name[40], d_alias[40], *d_info; + + ndi_devi_enter(dip, &circ); + if (dr_suspend_devices(ddi_get_child(dip), srh)) { + ndi_devi_exit(dip, circ); + return (ENXIO); + } + ndi_devi_exit(dip, circ); + + if (!dr_is_real_device(dip)) + continue; + + major = (major_t)-1; + if ((dname = ddi_binding_name(dip)) != NULL) + major = ddi_name_to_major(dname); + + if (dr_bypass_device(dname)) { + PR_QR(" bypassed suspend of %s (major# %d)\n", dname, + major); + continue; + } + + if (drmach_verify_sr(dip, 1)) { + PR_QR(" bypassed suspend of %s (major# %d)\n", dname, + major); + continue; + } + + if ((d_info = ddi_get_name_addr(dip)) == NULL) + d_info = "<null>"; + + d_name[0] = 0; + if (dr_resolve_devname(dip, d_name, d_alias) == 0) { + if (d_alias[0] != 0) { + prom_printf("\tsuspending %s@%s (aka %s)\n", + d_name, d_info, d_alias); + } else { + prom_printf("\tsuspending %s@%s\n", d_name, + d_info); + } + } else { + prom_printf("\tsuspending %s@%s\n", dname, d_info); + } + + if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) { + prom_printf("\tFAILED to suspend %s@%s\n", + d_name[0] ? d_name : dname, d_info); + + srh->sr_err_idx = dr_add_int(srh->sr_err_ints, + srh->sr_err_idx, DR_MAX_ERR_INT, (uint64_t)major); + + ndi_hold_devi(dip); + srh->sr_failed_dip = dip; + + handle = srh->sr_dr_handlep; + dr_op_err(CE_IGNORE, handle, ESBD_SUSPEND, "%s@%s", + d_name[0] ? 
d_name : dname, d_info); + + return (DDI_FAILURE); + } + } + + return (DDI_SUCCESS); +} + +static void +dr_resume_devices(dev_info_t *start, dr_sr_handle_t *srh) +{ + dr_handle_t *handle; + dev_info_t *dip, *next, *last = NULL; + major_t major; + char *bn; + int circ; + + major = (major_t)-1; + + /* attach in reverse device tree order */ + while (last != start) { + dip = start; + next = ddi_get_next_sibling(dip); + while (next != last && dip != srh->sr_failed_dip) { + dip = next; + next = ddi_get_next_sibling(dip); + } + if (dip == srh->sr_failed_dip) { + /* release hold acquired in dr_suspend_devices() */ + srh->sr_failed_dip = NULL; + ndi_rele_devi(dip); + } else if (dr_is_real_device(dip) && + srh->sr_failed_dip == NULL) { + + if ((bn = ddi_binding_name(dip)) != NULL) { + major = ddi_name_to_major(bn); + } else { + bn = "<null>"; + } + if (!dr_bypass_device(bn) && + !drmach_verify_sr(dip, 0)) { + char d_name[40], d_alias[40], *d_info; + + d_name[0] = 0; + d_info = ddi_get_name_addr(dip); + if (d_info == NULL) + d_info = "<null>"; + + if (!dr_resolve_devname(dip, d_name, d_alias)) { + if (d_alias[0] != 0) { + prom_printf("\tresuming " + "%s@%s (aka %s)\n", d_name, + d_info, d_alias); + } else { + prom_printf("\tresuming " + "%s@%s\n", d_name, d_info); + } + } else { + prom_printf("\tresuming %s@%s\n", bn, + d_info); + } + + if (devi_attach(dip, DDI_RESUME) != + DDI_SUCCESS) { + /* + * Print a console warning, + * set an e_code of ESBD_RESUME, + * and save the driver major + * number in the e_rsc. + */ + prom_printf("\tFAILED to resume %s@%s", + d_name[0] ? d_name : bn, d_info); + + srh->sr_err_idx = + dr_add_int(srh->sr_err_ints, + srh->sr_err_idx, DR_MAX_ERR_INT, + (uint64_t)major); + + handle = srh->sr_dr_handlep; + + dr_op_err(CE_IGNORE, handle, + ESBD_RESUME, "%s@%s", + d_name[0] ? d_name : bn, d_info); + } + } + } + + /* Hold parent busy while walking its children */ + ndi_devi_enter(dip, &circ); + dr_resume_devices(ddi_get_child(dip), srh); + ndi_devi_exit(dip, circ); + last = dip; + } +} + +/* + * True if thread is virtually stopped. Similar to CPR_VSTOPPED + * but from DR point of view. These user threads are waiting in + * the kernel. Once they complete in the kernel, they will process + * the stop signal and stop. + */ +#define DR_VSTOPPED(t) \ + ((t)->t_state == TS_SLEEP && \ + (t)->t_wchan != NULL && \ + (t)->t_astflag && \ + ((t)->t_proc_flag & TP_CHKPT)) + +/* ARGSUSED */ +static int +dr_stop_user_threads(dr_sr_handle_t *srh) +{ + int count; + int bailout; + dr_handle_t *handle = srh->sr_dr_handlep; + static fn_t f = "dr_stop_user_threads"; + kthread_id_t tp; + + extern void add_one_utstop(); + extern void utstop_timedwait(clock_t); + extern void utstop_init(void); + +#define DR_UTSTOP_RETRY 4 +#define DR_UTSTOP_WAIT hz + + if (dr_skip_user_threads) + return (DDI_SUCCESS); + + utstop_init(); + + /* we need to try a few times to get past fork, etc. 
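+ * Each retry waits progressively longer, + * count * count * DR_UTSTOP_WAIT ticks.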
*/ + srh->sr_err_idx = 0; + for (count = 0; count < DR_UTSTOP_RETRY; count++) { + /* walk the entire threadlist */ + mutex_enter(&pidlock); + for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { + proc_t *p = ttoproc(tp); + + /* handle kernel threads separately */ + if (p->p_as == &kas || p->p_stat == SZOMB) + continue; + + mutex_enter(&p->p_lock); + thread_lock(tp); + + if (tp->t_state == TS_STOPPED) { + /* add another reason to stop this thread */ + tp->t_schedflag &= ~TS_RESUME; + } else { + tp->t_proc_flag |= TP_CHKPT; + + thread_unlock(tp); + mutex_exit(&p->p_lock); + add_one_utstop(); + mutex_enter(&p->p_lock); + thread_lock(tp); + + aston(tp); + + if (ISWAKEABLE(tp) || ISWAITING(tp)) { + setrun_locked(tp); + } + + } + + /* grab thread if needed */ + if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU) + poke_cpu(tp->t_cpu->cpu_id); + + + thread_unlock(tp); + mutex_exit(&p->p_lock); + } + mutex_exit(&pidlock); + + + /* let everything catch up */ + utstop_timedwait(count * count * DR_UTSTOP_WAIT); + + + /* now, walk the threadlist again to see if we are done */ + mutex_enter(&pidlock); + for (tp = curthread->t_next, bailout = 0; + tp != curthread; tp = tp->t_next) { + proc_t *p = ttoproc(tp); + + /* handle kernel threads separately */ + if (p->p_as == &kas || p->p_stat == SZOMB) + continue; + + /* + * If this thread didn't stop, and we don't allow + * unstopped blocked threads, bail. + */ + thread_lock(tp); + if (!CPR_ISTOPPED(tp) && + !(dr_allow_blocked_threads && + DR_VSTOPPED(tp))) { + bailout = 1; + if (count == DR_UTSTOP_RETRY - 1) { + /* + * save the pid for later reporting + */ + srh->sr_err_idx = + dr_add_int(srh->sr_err_ints, + srh->sr_err_idx, DR_MAX_ERR_INT, + (uint64_t)p->p_pid); + + cmn_err(CE_WARN, "%s: " + "failed to stop thread: " + "process=%s, pid=%d", + f, p->p_user.u_psargs, p->p_pid); + + PR_QR("%s: failed to stop thread: " + "process=%s, pid=%d, t_id=0x%p, " + "t_state=0x%x, t_proc_flag=0x%x, " + "t_schedflag=0x%x\n", + f, p->p_user.u_psargs, p->p_pid, + (void *)tp, tp->t_state, + tp->t_proc_flag, tp->t_schedflag); + } + + } + thread_unlock(tp); + } + mutex_exit(&pidlock); + + /* were all the threads stopped? */ + if (!bailout) + break; + } + + /* were we unable to stop all threads after a few tries? 
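+ * If so, fail the suspend with ESBD_UTHREAD and report the + * PIDs collected above.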
*/ + if (bailout) { + handle->h_err = drerr_int(ESBD_UTHREAD, srh->sr_err_ints, + srh->sr_err_idx, 0); + return (ESRCH); + } + + return (DDI_SUCCESS); +} + +static void +dr_start_user_threads(void) +{ + kthread_id_t tp; + + mutex_enter(&pidlock); + + /* walk all threads and release them */ + for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { + proc_t *p = ttoproc(tp); + + /* skip kernel threads */ + if (ttoproc(tp)->p_as == &kas) + continue; + + mutex_enter(&p->p_lock); + tp->t_proc_flag &= ~TP_CHKPT; + mutex_exit(&p->p_lock); + + thread_lock(tp); + if (CPR_ISTOPPED(tp)) { + /* back on the runq */ + tp->t_schedflag |= TS_RESUME; + setrun_locked(tp); + } + thread_unlock(tp); + } + + mutex_exit(&pidlock); +} + +static void +dr_signal_user(int sig) +{ + struct proc *p; + + mutex_enter(&pidlock); + + for (p = practive; p != NULL; p = p->p_next) { + /* only user threads */ + if (p->p_exec == NULL || p->p_stat == SZOMB || + p == proc_init || p == ttoproc(curthread)) + continue; + + mutex_enter(&p->p_lock); + sigtoproc(p, NULL, sig); + mutex_exit(&p->p_lock); + } + + mutex_exit(&pidlock); + + /* add a bit of delay */ + delay(hz); +} + +void +dr_resume(dr_sr_handle_t *srh) +{ + dr_handle_t *handle; + + handle = srh->sr_dr_handlep; + + switch (srh->sr_suspend_state) { + case DR_SRSTATE_FULL: + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Prevent false alarm in tod_validate() due to tod + * value change between suspend and resume + */ + mutex_enter(&tod_lock); + tod_status_set(TOD_DR_RESUME_DONE); + mutex_exit(&tod_lock); + + dr_enable_intr(); /* enable intr & clock */ + + start_cpus(); + mutex_exit(&cpu_lock); + + /* + * This should only be called if drmach_suspend_last() + * was called and state transitioned to DR_SRSTATE_FULL + * to prevent resume attempts on device instances that + * were not previously suspended. 
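+ * The cases below intentionally fall through, so resume unwinds + * in the reverse order of dr_suspend().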
+ */ + drmach_resume_first(); + + /* FALLTHROUGH */ + + case DR_SRSTATE_DRIVER: + /* + * resume drivers + */ + srh->sr_err_idx = 0; + + /* no parent dip to hold busy */ + dr_resume_devices(ddi_root_node(), srh); + + if (srh->sr_err_idx && srh->sr_dr_handlep) { + (srh->sr_dr_handlep)->h_err = drerr_int(ESBD_RESUME, + srh->sr_err_ints, srh->sr_err_idx, 1); + } + + /* + * resume the lock manager + */ + lm_cprresume(); + + /* FALLTHROUGH */ + + case DR_SRSTATE_USER: + /* + * finally, resume user threads + */ + if (!dr_skip_user_threads) { + prom_printf("DR: resuming user threads...\n"); + dr_start_user_threads(); + } + /* FALLTHROUGH */ + + case DR_SRSTATE_BEGIN: + default: + /* + * let those who care know that we've just resumed + */ + PR_QR("sending SIGTHAW...\n"); + dr_signal_user(SIGTHAW); + break; + } + + i_ndi_allow_device_tree_changes(handle->h_ndi); + + prom_printf("DR: resume COMPLETED\n"); +} + +int +dr_suspend(dr_sr_handle_t *srh) +{ + dr_handle_t *handle; + int force; + int dev_errs_idx; + uint64_t dev_errs[DR_MAX_ERR_INT]; + int rc = DDI_SUCCESS; + + handle = srh->sr_dr_handlep; + + force = dr_cmd_flags(handle) & SBD_FLAG_FORCE; + + i_ndi_block_device_tree_changes(&handle->h_ndi); + + prom_printf("\nDR: suspending user threads...\n"); + srh->sr_suspend_state = DR_SRSTATE_USER; + if (((rc = dr_stop_user_threads(srh)) != DDI_SUCCESS) && + dr_check_user_stop_result) { + dr_resume(srh); + return (rc); + } + + if (!force) { + struct dr_ref drc = {0}; + + prom_printf("\nDR: checking devices...\n"); + dev_errs_idx = 0; + + drc.arr = dev_errs; + drc.idx = &dev_errs_idx; + drc.len = DR_MAX_ERR_INT; + + /* + * Since the root node can never go away, it + * doesn't have to be held. + */ + ddi_walk_devs(ddi_root_node(), dr_check_unsafe_major, &drc); + if (dev_errs_idx) { + handle->h_err = drerr_int(ESBD_UNSAFE, dev_errs, + dev_errs_idx, 1); + dr_resume(srh); + return (DDI_FAILURE); + } + PR_QR("done\n"); + } else { + prom_printf("\nDR: dr_suspend invoked with force flag\n"); + } + +#ifndef SKIP_SYNC + /* + * This sync swaps out all user pages + */ + vfs_sync(SYNC_ALL); +#endif + + /* + * special treatment for lock manager + */ + lm_cprsuspend(); + +#ifndef SKIP_SYNC + /* + * sync the file system in case we never make it back + */ + sync(); +#endif + + /* + * now suspend drivers + */ + prom_printf("DR: suspending drivers...\n"); + srh->sr_suspend_state = DR_SRSTATE_DRIVER; + srh->sr_err_idx = 0; + /* No parent to hold busy */ + if ((rc = dr_suspend_devices(ddi_root_node(), srh)) != DDI_SUCCESS) { + if (srh->sr_err_idx && srh->sr_dr_handlep) { + (srh->sr_dr_handlep)->h_err = drerr_int(ESBD_SUSPEND, + srh->sr_err_ints, srh->sr_err_idx, 1); + } + dr_resume(srh); + return (rc); + } + + drmach_suspend_last(); + + /* + * finally, grab all cpus + */ + srh->sr_suspend_state = DR_SRSTATE_FULL; + + mutex_enter(&cpu_lock); + pause_cpus(NULL); + dr_stop_intr(); + + return (rc); +} + +int +dr_pt_test_suspend(dr_handle_t *hp) +{ + dr_sr_handle_t *srh; + int err; + uint_t psmerr; + static fn_t f = "dr_pt_test_suspend"; + + PR_QR("%s...\n", f); + + srh = dr_get_sr_handle(hp); + if ((err = dr_suspend(srh)) == DDI_SUCCESS) { + dr_resume(srh); + if ((hp->h_err) && ((psmerr = hp->h_err->e_code) != 0)) { + PR_QR("%s: error on dr_resume()", f); + switch (psmerr) { + case ESBD_RESUME: + PR_QR("Couldn't resume devices: %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_KTHREAD: + PR_ALL("psmerr is ESBD_KTHREAD\n"); + break; + default: + PR_ALL("Resume error unknown = %d\n", psmerr); + break; + } + } + } else { +
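/* dr_suspend() itself failed; decode the error code saved in h_err */ +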
PR_ALL("%s: dr_suspend() failed, err = 0x%x\n", f, err); + psmerr = hp->h_err ? hp->h_err->e_code : ESBD_NOERROR; + switch (psmerr) { + case ESBD_UNSAFE: + PR_ALL("Unsafe devices (major #): %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_RTTHREAD: + PR_ALL("RT threads (PIDs): %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_UTHREAD: + PR_ALL("User threads (PIDs): %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_SUSPEND: + PR_ALL("Non-suspendable devices (major #): %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_RESUME: + PR_ALL("Could not resume devices (major #): %s\n", + DR_GET_E_RSC(hp->h_err)); + break; + + case ESBD_KTHREAD: + PR_ALL("psmerr is ESBD_KTHREAD\n"); + break; + + case ESBD_NOERROR: + PR_ALL("sbd_error_t error code not set\n"); + break; + + default: + PR_ALL("Unknown error psmerr = %d\n", psmerr); + break; + } + } + dr_release_sr_handle(srh); + + return (0); +} + +/* + * Add a new integer value to the end of an array. Don't allow duplicates to + * appear in the array, and don't allow the array to overflow. Return the new + * total number of entries in the array. + */ +static int +dr_add_int(uint64_t *arr, int idx, int len, uint64_t val) +{ + int i; + + if (arr == NULL) + return (0); + + if (idx >= len) + return (idx); + + for (i = 0; i < idx; i++) { + if (arr[i] == val) + return (idx); + } + + arr[idx++] = val; + + return (idx); +} + +/* + * Construct an sbd_error_t featuring a string representation of an array of + * integers as its e_rsc. + */ +static sbd_error_t * +drerr_int(int e_code, uint64_t *arr, int idx, int majors) +{ + int i, n, buf_len, buf_idx, buf_avail; + char *dname; + char *buf; + sbd_error_t *new_sbd_err; + static char s_ellipsis[] = "..."; + + if (arr == NULL || idx <= 0) + return (NULL); + + /* MAXPATHLEN is the size of the e_rsc field in sbd_error_t. */ + buf = (char *)kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + /* + * This is the total working area of the buffer. It must be computed + * as the size of 'buf', minus reserved space for the null terminator + * and the ellipsis string. + */ + buf_len = MAXPATHLEN - (strlen(s_ellipsis) + 1); + + /* Construct a string representation of the array values */ + for (buf_idx = 0, i = 0; i < idx; i++) { + buf_avail = buf_len - buf_idx; + if (majors) { + dname = ddi_major_to_name(arr[i]); + if (dname) { + n = snprintf(&buf[buf_idx], buf_avail, "%s, ", + dname); + } else { + n = snprintf(&buf[buf_idx], buf_avail, + "major %" PRIu64 ", ", arr[i]); + } + } else { + n = snprintf(&buf[buf_idx], buf_avail, "%" PRIu64 ", ", + arr[i]); + } + + /* An ellipsis gets appended when no more values fit */ + if (n >= buf_avail) { + (void) strcpy(&buf[buf_idx], s_ellipsis); + break; + } + + buf_idx += n; + } + + /* If all the contents fit, remove the trailing comma */ + if (n < buf_avail) { + buf[--buf_idx] = '\0'; + buf[--buf_idx] = '\0'; + } + + /* Return an sbd_error_t with the buffer and e_code */ + new_sbd_err = drerr_new(1, e_code, buf); + kmem_free(buf, MAXPATHLEN); + return (new_sbd_err); +} diff --git a/usr/src/uts/i86pc/io/dr/dr_util.c b/usr/src/uts/i86pc/io/dr/dr_util.c new file mode 100644 index 0000000000..d60dc7f7b8 --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/dr_util.c @@ -0,0 +1,457 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/param.h> /* for NULL */ +#include <sys/sbd_ioctl.h> +#include <sys/dr_util.h> +#include <sys/varargs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> + +/* sbd_etab[] and sbd_etab_len provided by sbdgenerr.pl */ +extern sbd_etab_t sbd_etab[]; +extern int sbd_etab_len; + +sbd_error_t * +sbd_err_new(int e_code, char *fmt, va_list args) +{ + sbd_error_t *new; + + new = GETSTRUCT(sbd_error_t, 1); + new->e_code = e_code; + + if (fmt) + (void) vsnprintf(new->e_rsc, sizeof (new->e_rsc), fmt, args); + + return (new); +} + +void +sbd_err_log(sbd_error_t *ep, int ce) +{ + char buf[32]; + char *fmt; + char *txt; + int i; + sbd_etab_t *tp; + + if (!ep) + return; + + if (ep->e_rsc[0] == '\0') + fmt = "%s"; + else + fmt = "%s: %s"; + + for (tp = sbd_etab, i = 0; i < sbd_etab_len; i++, tp++) + if (ep->e_code >= tp->t_base && ep->e_code <= tp->t_bnd) + break; + + if (i < sbd_etab_len) + txt = tp->t_text[ep->e_code - tp->t_base]; + else { + (void) snprintf(buf, sizeof (buf), "error %d", ep->e_code); + txt = buf; + } + + cmn_err(ce, fmt, txt, ep->e_rsc); +} + +void +sbd_err_clear(sbd_error_t **ep) +{ + FREESTRUCT(*ep, sbd_error_t, 1); + *ep = NULL; +} + +void +sbd_err_set_c(sbd_error_t **ep, int ce, int e_code, char *fmt, ...) +{ + sbd_error_t *tmp; + va_list args; + + va_start(args, fmt); + + tmp = sbd_err_new(e_code, fmt, args); + + sbd_err_log(tmp, ce); + + if (*ep == NULL) + *ep = tmp; + else + sbd_err_clear(&tmp); + + va_end(args); +} + +void +sbd_err_set(sbd_error_t **ep, int ce, int e_code, char *fmt, ...) +{ + sbd_error_t *tmp; + va_list args; + + va_start(args, fmt); + + tmp = sbd_err_new(e_code, fmt, args); + + sbd_err_log(tmp, ce); + + *ep = tmp; + + va_end(args); +} + +sbd_error_t * +drerr_new_v(int e_code, char *fmt, va_list args) +{ + return (sbd_err_new(e_code, fmt, args)); +} + +sbd_error_t * +drerr_new(int log, int e_code, char *fmt, ...) +{ + sbd_error_t *ep; + va_list args; + + va_start(args, fmt); + ep = sbd_err_new(e_code, fmt, args); + va_end(args); + + if (log) + sbd_err_log(ep, CE_WARN); + + return (ep); +} + +void +drerr_set_c(int log, sbd_error_t **ep, int e_code, char *fmt, ...) +{ + sbd_error_t *err; + va_list args; + + va_start(args, fmt); + err = sbd_err_new(e_code, fmt, args); + va_end(args); + + if (log) + sbd_err_log(err, CE_WARN); + + if (*ep == NULL) + *ep = err; + else + sbd_err_clear(&err); +} + + +/* + * Memlist support. 
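+ * Entries are allocated and freed with GETSTRUCT/FREESTRUCT, and + * both the ml_next and ml_prev links are maintained.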
+ */ +void +dr_memlist_delete(struct memlist *mlist) +{ + register struct memlist *ml; + + for (ml = mlist; ml; ml = mlist) { + mlist = ml->ml_next; + FREESTRUCT(ml, struct memlist, 1); + } +} + +int +dr_memlist_intersect(struct memlist *al, struct memlist *bl) +{ + uint64_t astart, aend, bstart, bend; + + if ((al == NULL) || (bl == NULL)) + return (0); + + aend = al->ml_address + al->ml_size; + bstart = bl->ml_address; + bend = bl->ml_address + bl->ml_size; + + while (al && bl) { + while (al && (aend <= bstart)) + if ((al = al->ml_next) != NULL) + aend = al->ml_address + al->ml_size; + if (al == NULL) + return (0); + + if ((astart = al->ml_address) <= bstart) + return (1); + + while (bl && (bend <= astart)) + if ((bl = bl->ml_next) != NULL) + bend = bl->ml_address + bl->ml_size; + if (bl == NULL) + return (0); + + if ((bstart = bl->ml_address) <= astart) + return (1); + } + + return (0); +} + +void +dr_memlist_coalesce(struct memlist *mlist) +{ + uint64_t end, nend; + + if ((mlist == NULL) || (mlist->ml_next == NULL)) + return; + + while (mlist->ml_next) { + end = mlist->ml_address + mlist->ml_size; + if (mlist->ml_next->ml_address <= end) { + struct memlist *nl; + + nend = mlist->ml_next->ml_address + + mlist->ml_next->ml_size; + if (nend > end) + mlist->ml_size += (nend - end); + nl = mlist->ml_next; + mlist->ml_next = mlist->ml_next->ml_next; + if (nl) { + FREESTRUCT(nl, struct memlist, 1); + } + if (mlist->ml_next) + mlist->ml_next->ml_prev = mlist; + } else { + mlist = mlist->ml_next; + } + } +} + +#ifdef DEBUG +void +memlist_dump(struct memlist *mlist) +{ + register struct memlist *ml; + + if (mlist == NULL) + printf("memlist> EMPTY\n"); + else for (ml = mlist; ml; ml = ml->ml_next) + printf("memlist> 0x%" PRIx64 ", 0x%" PRIx64 "\n", + ml->ml_address, ml->ml_size); +} +#endif + +struct memlist * +dr_memlist_dup(struct memlist *mlist) +{ + struct memlist *hl = NULL, *tl, **mlp; + + if (mlist == NULL) + return (NULL); + + mlp = &hl; + tl = *mlp; + for (; mlist; mlist = mlist->ml_next) { + *mlp = GETSTRUCT(struct memlist, 1); + (*mlp)->ml_address = mlist->ml_address; + (*mlp)->ml_size = mlist->ml_size; + (*mlp)->ml_prev = tl; + tl = *mlp; + mlp = &((*mlp)->ml_next); + } + *mlp = NULL; + + return (hl); +} + +struct memlist * +dr_memlist_add_span(struct memlist *mlist, uint64_t base, uint64_t len) +{ + struct memlist *ml, *tl, *nl; + + if (len == 0ull) + return (NULL); + + if (mlist == NULL) { + mlist = GETSTRUCT(struct memlist, 1); + mlist->ml_address = base; + mlist->ml_size = len; + mlist->ml_next = mlist->ml_prev = NULL; + + return (mlist); + } + + for (tl = ml = mlist; ml; tl = ml, ml = ml->ml_next) { + if (base < ml->ml_address) { + if ((base + len) < ml->ml_address) { + nl = GETSTRUCT(struct memlist, 1); + nl->ml_address = base; + nl->ml_size = len; + nl->ml_next = ml; + if ((nl->ml_prev = ml->ml_prev) != NULL) + nl->ml_prev->ml_next = nl; + ml->ml_prev = nl; + if (mlist == ml) + mlist = nl; + } else { + ml->ml_size = MAX((base + len), + (ml->ml_address + ml->ml_size)) - base; + ml->ml_address = base; + } + break; + + } else if (base <= (ml->ml_address + ml->ml_size)) { + ml->ml_size = MAX((base + len), + (ml->ml_address + ml->ml_size)) - + MIN(ml->ml_address, base); + ml->ml_address = MIN(ml->ml_address, base); + break; + } + } + if (ml == NULL) { + nl = GETSTRUCT(struct memlist, 1); + nl->ml_address = base; + nl->ml_size = len; + nl->ml_next = NULL; + nl->ml_prev = tl; + tl->ml_next = nl; + } + + dr_memlist_coalesce(mlist); + + return (mlist); +} + +struct memlist * 
+dr_memlist_del_span(struct memlist *mlist, uint64_t base, uint64_t len) +{ + uint64_t end; + struct memlist *ml, *tl, *nlp; + + if (mlist == NULL) + return (NULL); + + end = base + len; + if ((end <= mlist->ml_address) || (base == end)) + return (mlist); + + for (tl = ml = mlist; ml; tl = ml, ml = nlp) { + uint64_t mend; + + nlp = ml->ml_next; + + if (end <= ml->ml_address) + break; + + mend = ml->ml_address + ml->ml_size; + if (base < mend) { + if (base <= ml->ml_address) { + ml->ml_address = end; + if (end >= mend) + ml->ml_size = 0ull; + else + ml->ml_size = mend - ml->ml_address; + } else { + ml->ml_size = base - ml->ml_address; + if (end < mend) { + struct memlist *nl; + /* + * splitting a memlist entry. + */ + nl = GETSTRUCT(struct memlist, 1); + nl->ml_address = end; + nl->ml_size = mend - nl->ml_address; + if ((nl->ml_next = nlp) != NULL) + nlp->ml_prev = nl; + nl->ml_prev = ml; + ml->ml_next = nl; + nlp = nl; + } + } + if (ml->ml_size == 0ull) { + if (ml == mlist) { + if ((mlist = nlp) != NULL) + nlp->ml_prev = NULL; + FREESTRUCT(ml, struct memlist, 1); + if (mlist == NULL) + break; + ml = nlp; + } else { + if ((tl->ml_next = nlp) != NULL) + nlp->ml_prev = tl; + FREESTRUCT(ml, struct memlist, 1); + ml = tl; + } + } + } + } + + return (mlist); +} + +/* + * add span without merging + */ +struct memlist * +dr_memlist_cat_span(struct memlist *mlist, uint64_t base, uint64_t len) +{ + struct memlist *ml, *tl, *nl; + + if (len == 0ull) + return (NULL); + + if (mlist == NULL) { + mlist = GETSTRUCT(struct memlist, 1); + mlist->ml_address = base; + mlist->ml_size = len; + mlist->ml_next = mlist->ml_prev = NULL; + + return (mlist); + } + + for (tl = ml = mlist; ml; tl = ml, ml = ml->ml_next) { + if (base < ml->ml_address) { + nl = GETSTRUCT(struct memlist, 1); + nl->ml_address = base; + nl->ml_size = len; + nl->ml_next = ml; + if ((nl->ml_prev = ml->ml_prev) != NULL) + nl->ml_prev->ml_next = nl; + ml->ml_prev = nl; + if (mlist == ml) + mlist = nl; + break; + } + } + + if (ml == NULL) { + nl = GETSTRUCT(struct memlist, 1); + nl->ml_address = base; + nl->ml_size = len; + nl->ml_next = NULL; + nl->ml_prev = tl; + tl->ml_next = nl; + } + + return (mlist); +} diff --git a/usr/src/uts/i86pc/io/dr/sbdgenerr.pl b/usr/src/uts/i86pc/io/dr/sbdgenerr.pl new file mode 100644 index 0000000000..9980c96459 --- /dev/null +++ b/usr/src/uts/i86pc/io/dr/sbdgenerr.pl @@ -0,0 +1,90 @@ +#!/usr/bin/perl +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2000 by Sun Microsystems, Inc. +# All rights reserved.
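+# +# Reads "#define E..." error definitions from stdin and generates the +# sbd_etab[] tables referenced by sbd_err_log(). A typical invocation +# (assumed) would be: sbdgenerr.pl ESBD < sbd_ioctl.h > sbd_etab.c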
+# + +sub trim { + my ($line) = @_; + $line =~ s#/\*|\*/##g; + $line =~ s#^\s+|\s+$##g; + return $line; +} + +my $filter = 0; +my %prefix; +while ($#ARGV >= 0) { + $prefix{$ARGV[0]} = 0; + shift @ARGV; + $filter = 1; +} + +my $base; +my $bnd; +my @text; +my @sets; +while (<STDIN>) { + my $n = m@^#define\s(E\w\w\w)\w+\s+(\d+)(.*)@; + next unless ($n > 0); + next unless ($filter == 0 || defined $prefix{$1}); + my $txt = trim($3); + if (length($txt) == 0) { + my $l = <STDIN>; + $txt = trim($l); + } + + $base = $2 if (!defined $base); + if (defined $bnd && $2 != $bnd + 1) { + push(@sets, { base => $base, bnd => $bnd }); + $base = $2; + } + $bnd = $2; + push(@text, $txt); +} + +push(@sets, { base => $base, bnd => $bnd }); + +printf "#include <sys/sbd_ioctl.h>\n"; + +my $i = 0; +my $s = 0; +do { + my $set = $sets[$s]; + + printf "static char *sbd_t%d[] = {\n", $set->{base}; + my $n = $set->{bnd} - $set->{base} + 1; + while ($n--) { + printf "\t\"%s\",\n", $text[$i++]; + } + printf "};\n"; +} while (++$s <= $#sets); + +printf "sbd_etab_t sbd_etab[] = {\n"; +$s = 0; +do { + my $set = $sets[$s]; + printf "\t{ %d, %d, sbd_t%d },\n", + $set->{base}, $set->{bnd}, $set->{base}; +} while (++$s <= $#sets); +printf "};\n"; +printf "int sbd_etab_len = %d;\n", $s; diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index 319193bcc0..a3afe84d07 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -22,6 +22,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * PSMI 1.1 extensions are supported only in 2.6 and later versions. @@ -29,8 +33,9 @@ * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. * PSMI 1.5 extensions are supported in Solaris Nevada. * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. */ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/processor.h> #include <sys/time.h> @@ -93,6 +98,7 @@ static int apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp, int type); static void apic_set_pwroff_method_from_mpcnfhdr(struct apic_mp_cnf_hdr *hdrp); +static void apic_free_apic_cpus(void); static void apic_try_deferred_reprogram(int ipl, int vect); static void delete_defer_repro_ent(int which_irq); static void apic_ioapic_wait_pending_clear(int ioapicindex, @@ -194,6 +200,11 @@ int apic_redist_cpu_skip = 0; int apic_num_imbalance = 0; int apic_num_rebind = 0; +/* + * Maximum number of APIC CPUs in the system, -1 indicates that dynamic + * allocation of CPU ids is disabled. 
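+ *
+ * Editor's note (illustration, not part of the original change): when
+ * CPU DR is supported, acpi_probe() below sets apic_max_nproc to
+ * max_ncpus, so a platform booting with 2 of 8 possible CPUs ends up
+ * with apic_nproc == 2 and apic_max_nproc == 8; without CPU DR the
+ * value stays -1 and apic_nproc alone bounds the cpu array.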
+ */
+int apic_max_nproc = -1;
 int apic_nproc = 0;
 size_t apic_cpus_size = 0;
 int apic_defconf = 0;
@@ -597,6 +608,16 @@ apic_set_pwroff_method_from_mpcnfhdr(struct apic_mp_cnf_hdr *hdrp)
     }
 }

+static void
+apic_free_apic_cpus(void)
+{
+    if (apic_cpus != NULL) {
+        kmem_free(apic_cpus, apic_cpus_size);
+        apic_cpus = NULL;
+        apic_cpus_size = 0;
+    }
+}
+
 static int
 acpi_probe(char *modname)
 {
@@ -639,7 +660,6 @@ acpi_probe(char *modname)
     id = apic_reg_ops->apic_read(APIC_LID_REG);
     local_ids[0] = (uchar_t)(id >> 24);
     apic_nproc = index = 1;
-    CPUSET_ONLY(apic_cpumask, 0);
     apic_io_max = 0;

     ap = (ACPI_SUBTABLE_HEADER *) (acpi_mapic_dtp + 1);
@@ -652,16 +672,12 @@ acpi_probe(char *modname)
             mpa = (ACPI_MADT_LOCAL_APIC *) ap;
             if (mpa->LapicFlags & ACPI_MADT_ENABLED) {
                 if (mpa->Id == local_ids[0]) {
+                    ASSERT(index == 1);
                     proc_ids[0] = mpa->ProcessorId;
-                    (void) acpica_map_cpu(0,
-                        mpa->ProcessorId);
                 } else if (apic_nproc < NCPU && use_mp &&
                     apic_nproc < boot_ncpus) {
                     local_ids[index] = mpa->Id;
                     proc_ids[index] = mpa->ProcessorId;
-                    CPUSET_ADD(apic_cpumask, index);
-                    (void) acpica_map_cpu(index,
-                        mpa->ProcessorId);
                     index++;
                     apic_nproc++;
                 } else if (apic_nproc == NCPU && !warned) {
@@ -770,9 +786,7 @@ acpi_probe(char *modname)
                 if (apic_nproc < NCPU && use_mp &&
                     apic_nproc < boot_ncpus) {
                     local_ids[index] = mpx2a->LocalApicId;
-                    CPUSET_ADD(apic_cpumask, index);
-                    (void) acpica_map_cpu(index,
-                        mpx2a->Uid);
+                    proc_ids[index] = mpx2a->Uid;
                     index++;
                     apic_nproc++;
                 } else if (apic_nproc == NCPU && !warned) {
@@ -813,7 +827,14 @@ acpi_probe(char *modname)
         ap = (ACPI_SUBTABLE_HEADER *)(((char *)ap) + ap->Length);
     }

-    apic_cpus_size = apic_nproc * sizeof (*apic_cpus);
+    /*
+     * Allocate enough space for possible hot-adding of CPUs.
+     * max_ncpus may be less than apic_nproc if it's set by the user.
+     */
+    if (plat_dr_support_cpu()) {
+        apic_max_nproc = max_ncpus;
+    }
+    apic_cpus_size = max(apic_nproc, max_ncpus) * sizeof (*apic_cpus);
     if ((apic_cpus = kmem_zalloc(apic_cpus_size, KM_NOSLEEP)) == NULL)
         goto cleanup;

@@ -825,6 +846,47 @@ acpi_probe(char *modname)
     for (i = 0; i < apic_nproc; i++) {
         apic_cpus[i].aci_local_id = local_ids[i];
         apic_cpus[i].aci_local_ver = (uchar_t)(ver & 0xFF);
+        apic_cpus[i].aci_processor_id = proc_ids[i];
+        /* Only build mapping info for CPUs present at boot. */
+        if (i < boot_ncpus)
+            (void) acpica_map_cpu(i, proc_ids[i]);
+    }
+
+    /*
+     * To support CPU dynamic reconfiguration, the apic CPU info structure
+     * for each possible CPU will be pre-allocated at boot time.
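+     * (Editor's illustration of the rules below, example values only:
+     * with boot_ncpus = 2, apic_nproc = 4 and max_ncpus = 8, slots 0-1
+     * boot configured, slots 2-3 start as APIC_CPU_FREE | APIC_CPU_DIRTY,
+     * and slots 4-7 start as APIC_CPU_FREE.)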
+ * The state for each apic CPU info structure will be assigned according + * to the following rules: + * Rule 1: + * Slot index range: [0, min(apic_nproc, boot_ncpus)) + * State flags: 0 + * Note: cpu exists and will be configured/enabled at boot time + * Rule 2: + * Slot index range: [boot_ncpus, apic_nproc) + * State flags: APIC_CPU_FREE | APIC_CPU_DIRTY + * Note: cpu exists but won't be configured/enabled at boot time + * Rule 3: + * Slot index range: [apic_nproc, boot_ncpus) + * State flags: APIC_CPU_FREE + * Note: cpu doesn't exist at boot time + * Rule 4: + * Slot index range: [max(apic_nproc, boot_ncpus), max_ncpus) + * State flags: APIC_CPU_FREE + * Note: cpu doesn't exist at boot time + */ + CPUSET_ZERO(apic_cpumask); + for (i = 0; i < min(boot_ncpus, apic_nproc); i++) { + CPUSET_ADD(apic_cpumask, i); + apic_cpus[i].aci_status = 0; + } + for (i = boot_ncpus; i < apic_nproc; i++) { + apic_cpus[i].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY; + } + for (i = apic_nproc; i < boot_ncpus; i++) { + apic_cpus[i].aci_status = APIC_CPU_FREE; + } + for (i = max(boot_ncpus, apic_nproc); i < max_ncpus; i++) { + apic_cpus[i].aci_status = APIC_CPU_FREE; } for (i = 0; i < apic_io_max; i++) { @@ -926,10 +988,12 @@ acpi_probe(char *modname) /* if setting APIC mode failed above, we fall through to cleanup */ cleanup: + apic_free_apic_cpus(); if (apicadr != NULL) { mapout_apic((caddr_t)apicadr, APIC_LOCAL_MEMLEN); apicadr = NULL; } + apic_max_nproc = -1; apic_nproc = 0; for (i = 0; i < apic_io_max; i++) { mapout_ioapic((caddr_t)apicioadr[i], APIC_IO_MEMLEN); @@ -954,6 +1018,11 @@ apic_handle_defconf() { uint_t lid; + /* Failed to probe ACPI MADT tables, disable CPU DR. */ + apic_max_nproc = -1; + apic_free_apic_cpus(); + plat_dr_disable_cpu(); + /*LINTED: pointer cast may result in improper alignment */ apicioadr[0] = mapin_ioapic(APIC_IO_ADDR, APIC_IO_MEMLEN, PROT_READ | PROT_WRITE); @@ -998,8 +1067,7 @@ apic_handle_defconf() return (PSM_SUCCESS); apic_handle_defconf_fail: - if (apic_cpus) - kmem_free(apic_cpus, apic_cpus_size); + apic_free_apic_cpus(); if (apicadr) mapout_apic((caddr_t)apicadr, APIC_LOCAL_MEMLEN); if (apicioadr[0]) @@ -1187,7 +1255,51 @@ apic_parse_mpct(caddr_t mpct, int bypass_cpus_and_ioapics) boolean_t apic_cpu_in_range(int cpu) { - return ((cpu & ~IRQ_USER_BOUND) < apic_nproc); + cpu &= ~IRQ_USER_BOUND; + /* Check whether cpu id is in valid range. */ + if (cpu < 0 || cpu >= apic_nproc) { + return (B_FALSE); + } else if (apic_max_nproc != -1 && cpu >= apic_max_nproc) { + /* + * Check whether cpuid is in valid range if CPU DR is enabled. + */ + return (B_FALSE); + } else if (!CPU_IN_SET(apic_cpumask, cpu)) { + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Must be called with interrupts disabled and the apic_ioapic_lock held. + */ +processorid_t +apic_find_next_cpu_intr(void) +{ + int i, count; + processorid_t cpuid = 0; + + ASSERT(LOCK_HELD(&apic_ioapic_lock)); + + /* + * Find next CPU with INTR_ENABLE flag set. + * Assume that there is at least one CPU with interrupts enabled. 
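+ *
+ * Editor's sketch (illustrative): the loop below is a bounded
+ * round-robin scan. apic_next_bind_cpu wraps to 0 at apic_nproc, so
+ * each call probes every slot at most once; e.g. with
+ * apic_next_bind_cpu == 3 and apic_nproc == 4 it probes 3, 0, 1, 2
+ * and stops at the first in-range slot with APIC_CPU_INTR_ENABLE set.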
+ */ + for (count = 0; count < apic_nproc; count++) { + if (apic_next_bind_cpu >= apic_nproc) { + apic_next_bind_cpu = 0; + } + i = apic_next_bind_cpu++; + if (apic_cpu_in_range(i) && + (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) { + cpuid = i; + break; + } + } + ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_INTR_ENABLE) != 0); + + return (cpuid); } uint16_t @@ -1770,7 +1882,7 @@ apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl) bind_cpu = irqptr->airq_temp_cpu; if (((uint32_t)bind_cpu != IRQ_UNBOUND) && ((uint32_t)bind_cpu != IRQ_UNINIT)) { - ASSERT((bind_cpu & ~IRQ_USER_BOUND) < apic_nproc); + ASSERT(apic_cpu_in_range(bind_cpu)); if (bind_cpu & IRQ_USER_BOUND) { /* If hardbound, temp_cpu == cpu */ bind_cpu &= ~IRQ_USER_BOUND; @@ -2392,6 +2504,7 @@ apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) major_t major; char *name, *drv_name, *prop_val, *cptr; char prop_name[32]; + ulong_t iflag; if (apic_intr_policy == INTR_LOWEST_PRIORITY) @@ -2475,26 +2588,28 @@ apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin) i++; bind_cpu = stoi(&cptr); kmem_free(prop_val, prop_len); - /* if specific cpu is bogus, then default to cpu 0 */ - if (bind_cpu >= apic_nproc) { + /* if specific CPU is bogus, then default to next cpu */ + if (!apic_cpu_in_range(bind_cpu)) { cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present", psm_name, prop_name, prop_val, bind_cpu); - bind_cpu = 0; + rc = DDI_PROP_NOT_FOUND; } else { /* indicate that we are bound at user request */ bind_cpu |= IRQ_USER_BOUND; } /* - * no need to check apic_cpus[].aci_status, if specific cpu is + * no need to check apic_cpus[].aci_status, if specific CPU is * not up, then post_cpu_start will handle it. */ - } else { - bind_cpu = apic_next_bind_cpu++; - if (bind_cpu >= apic_nproc) { - apic_next_bind_cpu = 1; - bind_cpu = 0; - } } + if (rc != DDI_PROP_SUCCESS) { + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + bind_cpu = apic_find_next_cpu_intr(); + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + } + if (drv_name != NULL) cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x " "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n", @@ -2925,7 +3040,7 @@ apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, /* Mask off high bit so it can be used as array index */ airq_temp_cpu &= ~IRQ_USER_BOUND; - ASSERT(airq_temp_cpu < apic_nproc); + ASSERT(apic_cpu_in_range(airq_temp_cpu)); } /* @@ -2999,7 +3114,7 @@ apic_rebind(apic_irq_t *irq_ptr, int bind_cpu, } else { cpu_infop->aci_temp_bound++; } - ASSERT((bind_cpu & ~IRQ_USER_BOUND) < apic_nproc); + ASSERT(apic_cpu_in_range(bind_cpu)); if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) { apic_cpus[airq_temp_cpu].aci_temp_bound--; @@ -3477,7 +3592,8 @@ apic_intr_redistribute() * we are consistent. 
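 *
 * (Editor's note: with CPU DR the apic_cpus array may now contain
 * free or dirty slots, which is why apic_cpu_in_range() checks are
 * added to the redistribution loops below.)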
*/ for (i = 0; i < apic_nproc; i++) { - if (!(apic_redist_cpu_skip & (1 << i)) && + if (apic_cpu_in_range(i) && + !(apic_redist_cpu_skip & (1 << i)) && (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) { cpu_infop = &apic_cpus[i]; @@ -3639,7 +3755,9 @@ apic_intr_redistribute() apic_redist_cpu_skip = 0; } for (i = 0; i < apic_nproc; i++) { - apic_cpus[i].aci_busy = 0; + if (apic_cpu_in_range(i)) { + apic_cpus[i].aci_busy = 0; + } } } @@ -3650,7 +3768,9 @@ apic_cleanup_busy() apic_irq_t *irq_ptr; for (i = 0; i < apic_nproc; i++) { - apic_cpus[i].aci_busy = 0; + if (apic_cpu_in_range(i)) { + apic_cpus[i].aci_busy = 0; + } } for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) { diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic.c b/usr/src/uts/i86pc/io/pcplusmp/apic.c index 939ac73dab..13ceaef674 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * PSMI 1.1 extensions are supported only in 2.6 and later versions. @@ -30,8 +34,9 @@ * PSMI 1.3 and 1.4 extensions are supported in Solaris 10. * PSMI 1.5 extensions are supported in Solaris Nevada. * PSMI 1.6 extensions are supported in Solaris Nevada. + * PSMI 1.7 extensions are supported in Solaris Nevada. */ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/processor.h> #include <sys/time.h> @@ -88,7 +93,11 @@ static hrtime_t apic_gettime(); static hrtime_t apic_gethrtime(); static void apic_init(); static void apic_picinit(void); -static int apic_cpu_start(processorid_t, caddr_t); +static int apic_cpu_start(processorid_t cpuid, caddr_t ctx); +static int apic_cpu_stop(processorid_t cpuid, caddr_t ctx); +static int apic_cpu_add(psm_cpu_request_t *reqp); +static int apic_cpu_remove(psm_cpu_request_t *reqp); +static int apic_cpu_ops(psm_cpu_request_t *reqp); static int apic_post_cpu_start(void); static void apic_send_ipi(int cpun, int ipl); static void apic_set_idlecpu(processorid_t cpun); @@ -96,6 +105,7 @@ static void apic_unset_idlecpu(processorid_t cpun); static int apic_intr_enter(int ipl, int *vect); static void apic_setspl(int ipl); static void x2apic_setspl(int ipl); +static void apic_switch_ipi_callback(boolean_t enter); static int apic_addspl(int ipl, int vector, int min_ipl, int max_ipl); static int apic_delspl(int ipl, int vector, int min_ipl, int max_ipl); static void apic_shutdown(int cmd, int fcn); @@ -137,6 +147,9 @@ int apic_error_display_delay = 100; int apic_cpcovf_vect; int apic_enable_cpcovf_intr = 1; +/* maximum loop count when sending Start IPIs. */ +int apic_sipi_max_loop_count = 0x1000; + /* vector at which CMCI interrupts come in */ int apic_cmci_vect; extern int cmi_enable_cmci; @@ -145,6 +158,10 @@ extern void cmi_cmci_trap(void); static kmutex_t cmci_cpu_setup_lock; /* protects cmci_cpu_setup_registered */ static int cmci_cpu_setup_registered; +/* number of CPUs in power-on transition state */ +static int apic_poweron_cnt = 0; +static lock_t apic_mode_switch_lock; + /* * The following vector assignments influence the value of ipltopri and * vectortoipl. Note that vectors 0 - 0x1f are not used. 
We can program
@@ -209,6 +226,7 @@ int apic_verbose = 0;
 /* minimum number of timer ticks to program to */
 int apic_min_timer_ticks = 1;
+
 /*
  * Local static data
  */
@@ -252,11 +270,12 @@ static struct psm_ops apic_ops = {
     apic_preshutdown,
     apic_intr_ops, /* Advanced DDI Interrupt framework */
     apic_state, /* save, restore apic state for S3 */
+    apic_cpu_ops, /* CPU control interface. */
 };

 static struct psm_info apic_psm_info = {
-    PSM_INFO_VER01_6, /* version */
+    PSM_INFO_VER01_7, /* version */
     PSM_OWN_EXCLUSIVE, /* ownership */
     (struct psm_ops *)&apic_ops, /* operation */
     APIC_PCPLUSMP_NAME, /* machine name */
@@ -786,6 +805,7 @@ apic_picinit(void)
     LOCK_INIT_CLEAR(&apic_gethrtime_lock);
     LOCK_INIT_CLEAR(&apic_ioapic_lock);
     LOCK_INIT_CLEAR(&apic_error_lock);
+    LOCK_INIT_CLEAR(&apic_mode_switch_lock);

     picsetup(); /* initialise the 8259 */
@@ -814,29 +834,27 @@ apic_picinit(void)
     ioapic_init_intr(IOAPIC_MASK);
 }
-
-/*ARGSUSED1*/
-static int
-apic_cpu_start(processorid_t cpun, caddr_t arg)
+static void
+apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
 {
     int loop_count;
     uint32_t vector;
-    uint_t cpu_id;
+    uint_t apicid;
     ulong_t iflag;

-    cpu_id = apic_cpus[cpun].aci_local_id;
-
-    apic_cmos_ssb_set = 1;
+    apicid = apic_cpus[cpun].aci_local_id;

     /*
-     * Interrupts on BSP cpu will be disabled during these startup
+     * Interrupts on the current CPU will be disabled during these
      * steps in order to avoid unwanted side effects from
      * executing interrupt handlers on a problematic BIOS.
      */
-    iflag = intr_clear();
-    outb(CMOS_ADDR, SSB);
-    outb(CMOS_DATA, BIOS_SHUTDOWN);
+
+    iflag = intr_clear();
+
+    if (start) {
+        outb(CMOS_ADDR, SSB);
+        outb(CMOS_DATA, BIOS_SHUTDOWN);
+    }

     /*
      * According to X2APIC specification in section '2.3.5.1' of
@@ -860,10 +878,10 @@ apic_cpu_start(processorid_t cpun, caddr_t arg)
     /* for integrated - make sure there is one INIT IPI in buffer */
     /* for external - it will wake up the cpu */
-    apic_reg_ops->apic_write_int_cmd(cpu_id, AV_ASSERT | AV_RESET);
+    apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

     /* If only 1 CPU is installed, PENDING bit will not go low */
-    for (loop_count = 0x1000; loop_count; loop_count--) {
+    for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
         if (apic_mode == LOCAL_APIC &&
             apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
             apic_ret();
@@ -871,8 +889,7 @@
         break;
     }

-    apic_reg_ops->apic_write_int_cmd(cpu_id, AV_DEASSERT | AV_RESET);
-
+    apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
     drv_usecwait(20000); /* 20 milli sec */

     if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
@@ -882,18 +899,111 @@
         (APIC_VECTOR_MASK | APIC_IPL_MASK);

         /* to offset the INIT IPI queue up in the buffer */
-        apic_reg_ops->apic_write_int_cmd(cpu_id, vector | AV_STARTUP);
-
+        apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
         drv_usecwait(200); /* 20 micro sec */

-        apic_reg_ops->apic_write_int_cmd(cpu_id, vector | AV_STARTUP);
-
-        drv_usecwait(200); /* 20 micro sec */
+        /*
+         * Send the second SIPI (Startup IPI) as recommended by the
+         * Intel Software Developer's Manual.
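+         *
+         * Editor's note (background, not from the original change):
+         * together with the INIT IPIs issued above, this forms the
+         * canonical INIT-SIPI-SIPI wakeup sequence -- INIT, the 20ms
+         * wait above, SIPI, a 200us wait, SIPI -- where the SIPI
+         * vector encodes the real-mode startup page.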
+ */ + apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP); + drv_usecwait(200); /* 20 micro sec */ } + intr_restore(iflag); +} + +/*ARGSUSED1*/ +static int +apic_cpu_start(processorid_t cpun, caddr_t arg) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!apic_cpu_in_range(cpun)) { + return (EINVAL); + } + + /* + * Switch to apic_common_send_ipi for safety during starting other CPUs. + */ + if (apic_mode == LOCAL_X2APIC) { + apic_switch_ipi_callback(B_TRUE); + } + + apic_cmos_ssb_set = 1; + apic_cpu_send_SIPI(cpun, B_TRUE); + return (0); } +/* + * Put CPU into halted state with interrupts disabled. + */ +/*ARGSUSED1*/ +static int +apic_cpu_stop(processorid_t cpun, caddr_t arg) +{ + int rc; + cpu_t *cp; + extern cpuset_t cpu_ready_set; + extern void cpu_idle_intercept_cpu(cpu_t *cp); + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!apic_cpu_in_range(cpun)) { + return (EINVAL); + } + if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) { + return (ENOTSUP); + } + + cp = cpu_get(cpun); + ASSERT(cp != NULL); + ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0); + ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0); + ASSERT((cp->cpu_flags & CPU_ENABLE) == 0); + + /* Clear CPU_READY flag to disable cross calls. */ + cp->cpu_flags &= ~CPU_READY; + CPUSET_ATOMIC_DEL(cpu_ready_set, cpun); + rc = xc_flush_cpu(cp); + if (rc != 0) { + CPUSET_ATOMIC_ADD(cpu_ready_set, cpun); + cp->cpu_flags |= CPU_READY; + return (rc); + } + + /* Intercept target CPU at a safe point before powering it off. */ + cpu_idle_intercept_cpu(cp); + + apic_cpu_send_SIPI(cpun, B_FALSE); + cp->cpu_flags &= ~CPU_RUNNING; + + return (0); +} + +static int +apic_cpu_ops(psm_cpu_request_t *reqp) +{ + if (reqp == NULL) { + return (EINVAL); + } + + switch (reqp->pcr_cmd) { + case PSM_CPU_ADD: + return (apic_cpu_add(reqp)); + + case PSM_CPU_REMOVE: + return (apic_cpu_remove(reqp)); + + case PSM_CPU_STOP: + return (apic_cpu_stop(reqp->req.cpu_stop.cpuid, + reqp->req.cpu_stop.ctx)); + + default: + return (ENOTSUP); + } +} #ifdef DEBUG int apic_break_on_cpu = 9; @@ -1377,7 +1487,6 @@ apic_post_cpu_start() { int cpun; static int cpus_started = 1; - struct psm_ops *pops = &apic_ops; /* We know this CPU + BSP started successfully. */ cpus_started++; @@ -1395,23 +1504,11 @@ apic_post_cpu_start() } /* - * We change psm_send_ipi and send_dirintf only if Solaris - * is booted in kmdb & the current CPU is the last CPU being - * brought up. We don't need to do anything if Solaris is running - * in MMIO mode (xAPIC). + * Switch back to x2apic IPI sending method for performance when target + * CPU has entered x2apic mode. */ - if ((boothowto & RB_DEBUG) && - (cpus_started == boot_ncpus || cpus_started == apic_nproc) && - apic_mode == LOCAL_X2APIC) { - /* - * We no longer need help from apic_common_send_ipi() - * since we will not start any more CPUs. - * - * We will need to revisit this if we start supporting - * hot-plugging of CPUs. 
- */ - pops->psm_send_ipi = x2apic_send_ipi; - send_dirintf = pops->psm_send_ipi; + if (apic_mode == LOCAL_X2APIC) { + apic_switch_ipi_callback(B_FALSE); } splx(ipltospl(LOCK_LEVEL)); @@ -1452,13 +1549,230 @@ apic_get_next_processorid(processorid_t cpu_id) return ((processorid_t)0); for (i = cpu_id + 1; i < NCPU; i++) { - if (CPU_IN_SET(apic_cpumask, i)) + if (apic_cpu_in_range(i)) return (i); } return ((processorid_t)-1); } +static int +apic_cpu_add(psm_cpu_request_t *reqp) +{ + int i, rv = 0; + ulong_t iflag; + boolean_t first = B_TRUE; + uchar_t localver; + uint32_t localid, procid; + processorid_t cpuid = (processorid_t)-1; + mach_cpu_add_arg_t *ap; + + ASSERT(reqp != NULL); + reqp->req.cpu_add.cpuid = (processorid_t)-1; + + /* Check whether CPU hotplug is supported. */ + if (!plat_dr_support_cpu() || apic_max_nproc == -1) { + return (ENOTSUP); + } + + ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp; + switch (ap->type) { + case MACH_CPU_ARG_LOCAL_APIC: + localid = ap->arg.apic.apic_id; + procid = ap->arg.apic.proc_id; + if (localid >= 255 || procid > 255) { + cmn_err(CE_WARN, + "!apic: apicid(%u) or procid(%u) is invalid.", + localid, procid); + return (EINVAL); + } + break; + + case MACH_CPU_ARG_LOCAL_X2APIC: + localid = ap->arg.apic.apic_id; + procid = ap->arg.apic.proc_id; + if (localid >= UINT32_MAX) { + cmn_err(CE_WARN, + "!apic: x2apicid(%u) is invalid.", localid); + return (EINVAL); + } else if (localid >= 255 && apic_mode == LOCAL_APIC) { + cmn_err(CE_WARN, "!apic: system is in APIC mode, " + "can't support x2APIC processor."); + return (ENOTSUP); + } + break; + + default: + cmn_err(CE_WARN, + "!apic: unknown argument type %d to apic_cpu_add().", + ap->type); + return (EINVAL); + } + + /* Use apic_ioapic_lock to sync with apic_find_next_cpu_intr. */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + /* Check whether local APIC id already exists. */ + for (i = 0; i < apic_nproc; i++) { + if (!CPU_IN_SET(apic_cpumask, i)) + continue; + if (apic_cpus[i].aci_local_id == localid) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: local apic id %u already exists.", + localid); + return (EEXIST); + } else if (apic_cpus[i].aci_processor_id == procid) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: processor id %u already exists.", + (int)procid); + return (EEXIST); + } + + /* + * There's no local APIC version number available in MADT table, + * so assume that all CPUs are homogeneous and use local APIC + * version number of the first existing CPU. + */ + if (first) { + first = B_FALSE; + localver = apic_cpus[i].aci_local_ver; + } + } + ASSERT(first == B_FALSE); + + /* + * Try to assign the same cpuid if APIC id exists in the dirty cache. + */ + for (i = 0; i < apic_max_nproc; i++) { + if (CPU_IN_SET(apic_cpumask, i)) { + ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0); + continue; + } + ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE); + if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) && + apic_cpus[i].aci_local_id == localid && + apic_cpus[i].aci_processor_id == procid) { + cpuid = i; + break; + } + } + + /* Avoid the dirty cache and allocate fresh slot if possible. */ + if (cpuid == (processorid_t)-1) { + for (i = 0; i < apic_max_nproc; i++) { + if ((apic_cpus[i].aci_status & APIC_CPU_FREE) && + (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) { + cpuid = i; + break; + } + } + } + + /* Try to find any free slot as last resort. 
*/ + if (cpuid == (processorid_t)-1) { + for (i = 0; i < apic_max_nproc; i++) { + if (apic_cpus[i].aci_status & APIC_CPU_FREE) { + cpuid = i; + break; + } + } + } + + if (cpuid == (processorid_t)-1) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_NOTE, + "!apic: failed to allocate cpu id for processor %u.", + procid); + rv = EAGAIN; + } else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_NOTE, + "!apic: failed to build mapping for processor %u.", + procid); + rv = EBUSY; + } else { + ASSERT(cpuid >= 0 && cpuid < NCPU); + ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus); + bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0])); + apic_cpus[cpuid].aci_processor_id = procid; + apic_cpus[cpuid].aci_local_id = localid; + apic_cpus[cpuid].aci_local_ver = localver; + CPUSET_ATOMIC_ADD(apic_cpumask, cpuid); + if (cpuid >= apic_nproc) { + apic_nproc = cpuid + 1; + } + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + reqp->req.cpu_add.cpuid = cpuid; + } + + return (rv); +} + +static int +apic_cpu_remove(psm_cpu_request_t *reqp) +{ + int i; + ulong_t iflag; + processorid_t cpuid; + + /* Check whether CPU hotplug is supported. */ + if (!plat_dr_support_cpu() || apic_max_nproc == -1) { + return (ENOTSUP); + } + + cpuid = reqp->req.cpu_remove.cpuid; + + /* Use apic_ioapic_lock to sync with apic_find_next_cpu_intr. */ + iflag = intr_clear(); + lock_set(&apic_ioapic_lock); + + if (!apic_cpu_in_range(cpuid)) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + cmn_err(CE_WARN, + "!apic: cpuid %d doesn't exist in apic_cpus array.", + cpuid); + return (ENODEV); + } + ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0); + + if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) { + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + return (ENOENT); + } + + if (cpuid == apic_nproc - 1) { + /* + * We are removing the highest numbered cpuid so we need to + * find the next highest cpuid as the new value for apic_nproc. + */ + for (i = apic_nproc; i > 0; i--) { + if (CPU_IN_SET(apic_cpumask, i - 1)) { + apic_nproc = i; + break; + } + } + /* at least one CPU left */ + ASSERT(i > 0); + } + CPUSET_ATOMIC_DEL(apic_cpumask, cpuid); + /* mark slot as free and keep it in the dirty cache */ + apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY; + + lock_clear(&apic_ioapic_lock); + intr_restore(iflag); + + return (0); +} /* * type == -1 indicates it is an internal request. 
Do not change
@@ -1898,8 +2212,7 @@ apic_disable_intr(processorid_t cpun)
     if ((irq_ptr = apic_irq_table[i]) != NULL) {
         ASSERT((irq_ptr->airq_temp_cpu == IRQ_UNBOUND) ||
             (irq_ptr->airq_temp_cpu == IRQ_UNINIT) ||
-            ((irq_ptr->airq_temp_cpu & ~IRQ_USER_BOUND) <
-            apic_nproc));
+            (apic_cpu_in_range(irq_ptr->airq_temp_cpu)));

         if (irq_ptr->airq_temp_cpu == (cpun | IRQ_USER_BOUND)) {
             hardbound = 1;
@@ -1908,12 +2221,7 @@
         if (irq_ptr->airq_temp_cpu == cpun) {
             do {
-                bind_cpu = apic_next_bind_cpu++;
-                if (bind_cpu >= apic_nproc) {
-                    apic_next_bind_cpu = 1;
-                    bind_cpu = 0;
-
-                }
+                bind_cpu = apic_find_next_cpu_intr();
             } while (apic_rebind_all(irq_ptr, bind_cpu));
         }
     }
@@ -2141,6 +2449,8 @@ apic_redistribute_compute(void)
     }
     max_busy = 0;
     for (i = 0; i < apic_nproc; i++) {
+        if (!apic_cpu_in_range(i))
+            continue;

         /*
          * Check if curipl is non zero & if ISR is in
@@ -2481,24 +2791,6 @@ ioapic_write_eoi(int ioapic_ix, uint32_t value)
     ioapic[APIC_IO_EOI] = value;
 }

-static processorid_t
-apic_find_cpu(int flag)
-{
-    processorid_t acid = 0;
-    int i;
-
-    /* Find the first CPU with the passed-in flag set */
-    for (i = 0; i < apic_nproc; i++) {
-        if (apic_cpus[i].aci_status & flag) {
-            acid = i;
-            break;
-        }
-    }
-
-    ASSERT((apic_cpus[acid].aci_status & flag) != 0);
-    return (acid);
-}
-
 /*
  * Call rebind to do the actual programming.
  * Must be called with interrupts disabled and apic_ioapic_lock held
@@ -2508,8 +2800,8 @@ apic_find_cpu(int flag)
  * p is of the type 'apic_irq_t *'.
  *
  * apic_ioapic_lock must be held across this call, as it protects apic_rebind
- * and it protects apic_find_cpu() from a race in which a CPU can be taken
- * offline after a cpu is selected, but before apic_rebind is called to
+ * and it protects apic_find_next_cpu_intr() from a race in which a CPU can be
+ * taken offline after a cpu is selected, but before apic_rebind is called to
  * bind interrupts to it.
  */
 int
@@ -2534,8 +2826,7 @@ apic_setup_io_intr(void *p, int irq, boolean_t deferred)
          * CPU is not up or interrupts are disabled. Fall back to
          * the first available CPU
          */
-        rv = apic_rebind(irqptr, apic_find_cpu(APIC_CPU_INTR_ENABLE),
-            drep);
+        rv = apic_rebind(irqptr, apic_find_next_cpu_intr(), drep);
     }

     return (rv);
@@ -2555,6 +2846,45 @@ apic_get_apic_type()
     return (apic_psm_info.p_mach_idstring);
 }

+/*
+ * Switch between the safe and x2APIC IPI sending methods.
+ * CPU may power on in xapic mode or x2apic mode. If a CPU needs to send
+ * IPIs to other CPUs before entering x2APIC mode, it still needs to use
+ * the xAPIC method.
+ * Before sending a Startup IPI to the target CPU, psm_send_ipi will be
+ * changed to apic_common_send_ipi, which detects the current local APIC
+ * mode and uses the right method to send the IPI. If some CPUs fail to
+ * start up, apic_poweron_cnt won't return to zero, so
+ * apic_common_send_ipi will always be used.
+ * psm_send_ipi can't be simply changed back to x2apic_send_ipi if some
+ * CPUs failed to start up because those failed CPUs may recover
+ * themselves later at an
+ */ +static void +apic_switch_ipi_callback(boolean_t enter) +{ + ulong_t iflag; + struct psm_ops *pops = &apic_ops; + + iflag = intr_clear(); + lock_set(&apic_mode_switch_lock); + if (enter) { + ASSERT(apic_poweron_cnt >= 0); + if (apic_poweron_cnt == 0) { + pops->psm_send_ipi = apic_common_send_ipi; + send_dirintf = pops->psm_send_ipi; + } + apic_poweron_cnt++; + } else { + ASSERT(apic_poweron_cnt > 0); + apic_poweron_cnt--; + if (apic_poweron_cnt == 0) { + pops->psm_send_ipi = x2apic_send_ipi; + send_dirintf = pops->psm_send_ipi; + } + } + lock_clear(&apic_mode_switch_lock); + intr_restore(iflag); +} + void x2apic_update_psm() { @@ -2562,28 +2892,9 @@ x2apic_update_psm() ASSERT(pops != NULL); - /* - * We don't need to do any magic if one of the following - * conditions is true : - * - Not being run under kernel debugger. - * - MP is not set. - * - Booted with one CPU only. - * - One CPU configured. - * - * We set apic_common_send_ipi() since kernel debuggers - * attempt to send IPIs to other slave CPUs during - * entry (exit) from (to) debugger. - */ - if (!(boothowto & RB_DEBUG) || use_mp == 0 || - apic_nproc == 1 || boot_ncpus == 1) { - pops->psm_send_ipi = x2apic_send_ipi; - } else { - pops->psm_send_ipi = apic_common_send_ipi; - } - pops->psm_intr_exit = x2apic_intr_exit; pops->psm_setspl = x2apic_setspl; - + pops->psm_send_ipi = x2apic_send_ipi; send_dirintf = pops->psm_send_ipi; apic_mode = LOCAL_X2APIC; diff --git a/usr/src/uts/i86pc/io/psm/uppc.c b/usr/src/uts/i86pc/io/psm/uppc.c index 0b6faee267..ad4d378f76 100644 --- a/usr/src/uts/i86pc/io/psm/uppc.c +++ b/usr/src/uts/i86pc/io/psm/uppc.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/mutex.h> #include <sys/types.h> @@ -173,12 +173,13 @@ static struct psm_ops uppc_ops = { (int (*)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t, int *))NULL, /* psm_intr_ops */ - uppc_state /* psm_state */ + uppc_state, /* psm_state */ + (int (*)(psm_cpu_request_t *))NULL /* psm_cpu_ops */ }; static struct psm_info uppc_info = { - PSM_INFO_VER01_6, /* version */ + PSM_INFO_VER01_7, /* version */ PSM_OWN_SYS_DEFAULT, /* ownership */ (struct psm_ops *)&uppc_ops, /* operation */ "uppc", /* machine name */ diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s index 721761f01d..25a943870e 100644 --- a/usr/src/uts/i86pc/ml/mpcore.s +++ b/usr/src/uts/i86pc/ml/mpcore.s @@ -22,9 +22,11 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/asm_linkage.h> #include <sys/asm_misc.h> #include <sys/regset.h> @@ -44,6 +46,7 @@ * - The GDT, IDT, ktss and page directory has been built for us * * Our actions: + * Start CPU: * - We start using our GDT by loading correct values in the * selector registers (cs=KCS_SEL, ds=es=ss=KDS_SEL, fs=KFS_SEL, * gs=KGS_SEL). @@ -51,20 +54,30 @@ * - We load the default LDT into the hardware LDT register. * - We load the default TSS into the hardware task register. 
* - call mp_startup(void) indirectly through the T_PC + * Stop CPU: + * - Put CPU into halted state with interrupts disabled * */ #if defined(__lint) void -real_mode_start(void) +real_mode_start_cpu(void) +{} + +void +real_mode_stop_cpu_stage1(void) +{} + +void +real_mode_stop_cpu_stage2(void) {} #else /* __lint */ #if defined(__amd64) - ENTRY_NP(real_mode_start) + ENTRY_NP(real_mode_start_cpu) #if !defined(__GNUC_AS__) @@ -229,8 +242,8 @@ long_mode_64: pushq $KCS_SEL pushq $kernel_cs_code lretq - .globl real_mode_end -real_mode_end: + .globl real_mode_start_cpu_end +real_mode_start_cpu_end: nop kernel_cs_code: @@ -484,8 +497,8 @@ long_mode_64: pushq $KCS_SEL pushq $kernel_cs_code lretq - .globl real_mode_end -real_mode_end: + .globl real_mode_start_cpu_end +real_mode_start_cpu_end: nop kernel_cs_code: @@ -575,11 +588,11 @@ kernel_cs_code: int $20 /* whoops, returned somehow! */ #endif /* !__GNUC_AS__ */ - SET_SIZE(real_mode_start) + SET_SIZE(real_mode_start_cpu) #elif defined(__i386) - ENTRY_NP(real_mode_start) + ENTRY_NP(real_mode_start_cpu) #if !defined(__GNUC_AS__) @@ -621,8 +634,8 @@ pestart: D16 pushl $KCS_SEL D16 pushl $kernel_cs_code D16 lret - .globl real_mode_end -real_mode_end: + .globl real_mode_start_cpu_end +real_mode_start_cpu_end: nop .globl kernel_cs_code @@ -714,8 +727,8 @@ pestart: D16 pushl $KCS_SEL D16 pushl $kernel_cs_code D16 lret - .globl real_mode_end -real_mode_end: + .globl real_mode_start_cpu_end +real_mode_start_cpu_end: nop .globl kernel_cs_code kernel_cs_code: @@ -767,7 +780,120 @@ kernel_cs_code: int $20 /* whoops, returned somehow! */ #endif - SET_SIZE(real_mode_start) + SET_SIZE(real_mode_start_cpu) #endif /* __amd64 */ + +#if defined(__amd64) + + ENTRY_NP(real_mode_stop_cpu_stage1) + +#if !defined(__GNUC_AS__) + + /* + * For vulcan as we need to do a .code32 and mentally invert the + * meaning of the addr16 and data16 prefixes to get 32-bit access when + * generating code to be executed in 16-bit mode (sigh...) + */ + .code32 + cli + movw %cs, %ax + movw %ax, %ds /* load cs into ds */ + movw %ax, %ss /* and into ss */ + + /* + * Jump to the stage 2 code in the rm_platter_va->rm_cpu_halt_code + */ + movw $CPUHALTCODEOFF, %ax + .byte 0xff, 0xe0 /* jmp *%ax */ + +#else /* __GNUC_AS__ */ + + /* + * NOTE: The GNU assembler automatically does the right thing to + * generate data size operand prefixes based on the code size + * generation mode (e.g. .code16, .code32, .code64) and as such + * prefixes need not be used on instructions EXCEPT in the case + * of address prefixes for code for which the reference is not + * automatically of the default operand size. 
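+ *
+ * Editor's note (illustrative): this is why the GNU as variant below
+ * can write the 16-bit indirect jump as a plain "jmp *%ax", while the
+ * Sun assembler variant above has to spell the same instruction as
+ * raw bytes (.byte 0xff, 0xe0).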
+ */ + .code16 + cli + movw %cs, %ax + movw %ax, %ds /* load cs into ds */ + movw %ax, %ss /* and into ss */ + + /* + * Jump to the stage 2 code in the rm_platter_va->rm_cpu_halt_code + */ + movw $CPUHALTCODEOFF, %ax + jmp *%ax + +#endif /* !__GNUC_AS__ */ + + .globl real_mode_stop_cpu_stage1_end +real_mode_stop_cpu_stage1_end: + nop + + SET_SIZE(real_mode_stop_cpu_stage1) + +#elif defined(__i386) + + ENTRY_NP(real_mode_stop_cpu_stage1) + +#if !defined(__GNUC_AS__) + + cli + D16 movw %cs, %eax + movw %eax, %ds /* load cs into ds */ + movw %eax, %ss /* and into ss */ + + /* + * Jump to the stage 2 code in the rm_platter_va->rm_cpu_halt_code + */ + movw $CPUHALTCODEOFF, %ax + .byte 0xff, 0xe0 /* jmp *%ax */ + +#else /* __GNUC_AS__ */ + + cli + mov %cs, %ax + mov %eax, %ds /* load cs into ds */ + mov %eax, %ss /* and into ss */ + + /* + * Jump to the stage 2 code in the rm_platter_va->rm_cpu_halt_code + */ + movw $CPUHALTCODEOFF, %ax + jmp *%ax + +#endif /* !__GNUC_AS__ */ + + .globl real_mode_stop_cpu_stage1_end +real_mode_stop_cpu_stage1_end: + nop + + SET_SIZE(real_mode_stop_cpu_stage1) + +#endif /* __amd64 */ + + ENTRY_NP(real_mode_stop_cpu_stage2) + + movw $0xdead, %ax + movw %ax, CPUHALTEDOFF + +real_mode_stop_cpu_loop: + /* + * Put CPU into halted state. + * Only INIT, SMI, NMI could break the loop. + */ + hlt + jmp real_mode_stop_cpu_loop + + .globl real_mode_stop_cpu_stage2_end +real_mode_stop_cpu_stage2_end: + nop + + SET_SIZE(real_mode_stop_cpu_stage2) + #endif /* __lint */ diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index f857617beb..029193b35e 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -295,6 +295,8 @@ rm_platter rm_cpu CPUNOFF rm_cr4 CR4OFF rm_x86feature X86FEATURE + rm_cpu_halt_code CPUHALTCODEOFF + rm_cpu_halted CPUHALTEDOFF ddi_acc_impl ahi_acc_attr ACC_ATTR diff --git a/usr/src/uts/i86pc/os/acpi_fw.h b/usr/src/uts/i86pc/os/acpi_fw.h index 032e231f3f..54a34ef591 100644 --- a/usr/src/uts/i86pc/os/acpi_fw.h +++ b/usr/src/uts/i86pc/os/acpi_fw.h @@ -22,6 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #ifndef _ACPI_FW_H #define _ACPI_FW_H @@ -161,6 +165,28 @@ struct slit { */ extern struct slit *slit_ptr; +struct msct_proximity_domain { + uint8_t revision; + uint8_t length; + uint32_t domain_min; + uint32_t domain_max; + uint32_t processor_max; + uint64_t memory_max; +}; + +struct msct { + struct table_header hdr; + uint32_t proximity_domain_offset; + uint32_t maximum_proximity_domains; + uint32_t maximum_power_domains; + uint64_t maximum_physical_address; +}; + +/* + * Pointer to Maximum System Capability Table (MSCT) + */ +extern struct msct *msct_ptr; + struct cfg_base_addr_alloc { uint64_t base_addr; uint16_t segment; diff --git a/usr/src/uts/i86pc/os/cmi.c b/usr/src/uts/i86pc/os/cmi.c index f9d722f031..4d27eb61d3 100644 --- a/usr/src/uts/i86pc/os/cmi.c +++ b/usr/src/uts/i86pc/os/cmi.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -100,6 +100,8 @@ typedef struct cmi { } cmi_t; static cmi_t *cmi_list; +static const cmi_mc_ops_t *cmi_mc_global_ops; +static void *cmi_mc_global_data; static kmutex_t cmi_load_lock; /* @@ -107,6 +109,7 @@ static kmutex_t cmi_load_lock; * interface. 
*/ extern cmi_hdl_t cmi_hdl_create(enum cmi_hdl_class, uint_t, uint_t, uint_t); +extern void cmi_hdl_destroy(cmi_hdl_t ophdl); extern void cmi_hdl_setcmi(cmi_hdl_t, void *, void *); extern void *cmi_hdl_getcmi(cmi_hdl_t); extern void cmi_hdl_setmc(cmi_hdl_t, const struct cmi_mc_ops *, void *); @@ -484,9 +487,8 @@ cmi_init(enum cmi_hdl_class class, uint_t chipid, uint_t coreid, } /* - * cmi_fini is not called at the moment. It is intended to be called - * on DR deconfigure of a cpu resource. It should not be called at - * simple offline of a cpu. + * cmi_fini is called on DR deconfigure of a cpu resource. + * It should not be called at simple offline of a cpu. */ void cmi_fini(cmi_hdl_t hdl) @@ -499,7 +501,7 @@ cmi_fini(cmi_hdl_t hdl) if (CMI_OP_PRESENT(cmi, cmi_fini)) CMI_OPS(cmi)->cmi_fini(hdl); - cmi_hdl_rele(hdl); /* release hold obtained in cmi_hdl_create */ + cmi_hdl_destroy(hdl); } /* @@ -806,6 +808,21 @@ cmi_mc_register(cmi_hdl_t hdl, const cmi_mc_ops_t *mcops, void *mcdata) cmi_hdl_setmc(hdl, mcops, mcdata); } +cmi_errno_t +cmi_mc_register_global(const cmi_mc_ops_t *mcops, void *mcdata) +{ + if (!cmi_no_mca_init) { + if (cmi_mc_global_ops != NULL || cmi_mc_global_data != NULL || + mcops == NULL || mcops->cmi_mc_patounum == NULL || + mcops->cmi_mc_unumtopa == NULL) { + return (CMIERR_UNKNOWN); + } + cmi_mc_global_data = mcdata; + cmi_mc_global_ops = mcops; + } + return (CMI_SUCCESS); +} + void cmi_mc_sw_memscrub_disable(void) { @@ -820,8 +837,17 @@ cmi_mc_patounum(uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, uint32_t synd, cmi_hdl_t hdl; cmi_errno_t rv; - if (cmi_no_mca_init || - (hdl = cmi_hdl_any()) == NULL) /* short-term hold */ + if (cmi_no_mca_init) + return (CMIERR_MC_ABSENT); + + if (cmi_mc_global_ops != NULL) { + if (cmi_mc_global_ops->cmi_mc_patounum == NULL) + return (CMIERR_MC_NOTSUP); + return (cmi_mc_global_ops->cmi_mc_patounum(cmi_mc_global_data, + pa, valid_hi, valid_lo, synd, syndtype, up)); + } + + if ((hdl = cmi_hdl_any()) == NULL) /* short-term hold */ return (CMIERR_MC_ABSENT); if ((mcops = cmi_hdl_getmcops(hdl)) == NULL || @@ -849,8 +875,17 @@ cmi_mc_unumtopa(mc_unum_t *up, nvlist_t *nvl, uint64_t *pap) if (up != NULL && nvl != NULL) return (CMIERR_API); /* convert from just one form */ - if (cmi_no_mca_init || - (hdl = cmi_hdl_any()) == NULL) /* short-term hold */ + if (cmi_no_mca_init) + return (CMIERR_MC_ABSENT); + + if (cmi_mc_global_ops != NULL) { + if (cmi_mc_global_ops->cmi_mc_unumtopa == NULL) + return (CMIERR_MC_NOTSUP); + return (cmi_mc_global_ops->cmi_mc_unumtopa(cmi_mc_global_data, + up, nvl, pap)); + } + + if ((hdl = cmi_hdl_any()) == NULL) /* short-term hold */ return (CMIERR_MC_ABSENT); if ((mcops = cmi_hdl_getmcops(hdl)) == NULL || @@ -865,7 +900,7 @@ cmi_mc_unumtopa(mc_unum_t *up, nvlist_t *nvl, uint64_t *pap) pap) == 0)) { return (CMIERR_MC_PARTIALUNUMTOPA); } else { - return (mcops && mcops->cmi_mc_unumtopa ? + return (mcops && mcops->cmi_mc_unumtopa == NULL ? 
CMIERR_MC_NOTSUP : CMIERR_MC_ABSENT); } } @@ -882,10 +917,15 @@ cmi_mc_logout(cmi_hdl_t hdl, boolean_t ismc, boolean_t sync) { const struct cmi_mc_ops *mcops; - if (cmi_no_mca_init || (mcops = cmi_hdl_getmcops(hdl)) == NULL) + if (cmi_no_mca_init) return; - if (mcops->cmi_mc_logout != NULL) + if (cmi_mc_global_ops != NULL) + mcops = cmi_mc_global_ops; + else + mcops = cmi_hdl_getmcops(hdl); + + if (mcops != NULL && mcops->cmi_mc_logout != NULL) mcops->cmi_mc_logout(hdl, ismc, sync); } diff --git a/usr/src/uts/i86pc/os/cmi_hw.c b/usr/src/uts/i86pc/os/cmi_hw.c index 1c2dbeaad6..8e3ae0f0af 100644 --- a/usr/src/uts/i86pc/os/cmi_hw.c +++ b/usr/src/uts/i86pc/os/cmi_hw.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * CPU Module Interface - hardware abstraction. @@ -94,6 +98,7 @@ typedef struct cmi_hdl_impl { #define HDLOPS(hdl) ((hdl)->cmih_ops) #define CMIH_F_INJACTV 0x1ULL +#define CMIH_F_DEAD 0x2ULL /* * Ops structure for handle operations. @@ -847,9 +852,18 @@ ntv_int(cmi_hdl_impl_t *hdl, int int_no) static int ntv_online(cmi_hdl_impl_t *hdl, int new_status, int *old_status) { + int rc; processorid_t cpuid = HDLPRIV(hdl)->cpu_id; - return (p_online_internal(cpuid, new_status, old_status)); + while (mutex_tryenter(&cpu_lock) == 0) { + if (hdl->cmih_flags & CMIH_F_DEAD) + return (EBUSY); + delay(1); + } + rc = p_online_internal_locked(cpuid, new_status, old_status); + mutex_exit(&cpu_lock); + + return (rc); } #else /* __xpv */ @@ -1387,15 +1401,32 @@ void cmi_hdl_rele(cmi_hdl_t ophdl) { cmi_hdl_impl_t *hdl = IMPLHDL(ophdl); - cmi_hdl_ent_t *ent; ASSERT(*hdl->cmih_refcntp > 0); + (void) atomic_dec_32_nv(hdl->cmih_refcntp); +} - if (atomic_dec_32_nv(hdl->cmih_refcntp) > 0) - return; +void +cmi_hdl_destroy(cmi_hdl_t ophdl) +{ + cmi_hdl_impl_t *hdl = IMPLHDL(ophdl); + cmi_hdl_ent_t *ent; + + /* Release the reference count held by cmi_hdl_create(). */ + ASSERT(*hdl->cmih_refcntp > 0); + (void) atomic_dec_32_nv(hdl->cmih_refcntp); + hdl->cmih_flags |= CMIH_F_DEAD; ent = cmi_hdl_ent_lookup(hdl->cmih_chipid, hdl->cmih_coreid, hdl->cmih_strandid); + /* + * Use busy polling instead of condition variable here because + * cmi_hdl_rele() may be called from #MC handler. + */ + while (cmi_hdl_canref(ent)) { + cmi_hdl_rele(ophdl); + delay(1); + } ent->cmae_hdlp = NULL; kmem_free(hdl, sizeof (*hdl)); @@ -1628,6 +1659,33 @@ cmi_ntv_hwstrandid(cpu_t *cp) return (cpuid_get_clogid(cp) % strands_per_core); } + +static void +cmi_ntv_hwdisable_mce_xc(void) +{ + ulong_t cr4; + + cr4 = getcr4(); + cr4 = cr4 & (~CR4_MCE); + setcr4(cr4); +} + +void +cmi_ntv_hwdisable_mce(cmi_hdl_t hdl) +{ + cpuset_t set; + cmi_hdl_impl_t *thdl = IMPLHDL(hdl); + cpu_t *cp = HDLPRIV(thdl); + + if (CPU->cpu_id == cp->cpu_id) { + cmi_ntv_hwdisable_mce_xc(); + } else { + CPUSET_ONLY(set, cp->cpu_id); + xc_call(NULL, NULL, NULL, CPUSET2BV(set), + (xc_func_t)cmi_ntv_hwdisable_mce_xc); + } +} + #endif /* __xpv */ void diff --git a/usr/src/uts/i86pc/os/cms.c b/usr/src/uts/i86pc/os/cms.c index 18d3b046f6..1fe03d3f30 100644 --- a/usr/src/uts/i86pc/os/cms.c +++ b/usr/src/uts/i86pc/os/cms.c @@ -23,6 +23,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ #include <sys/types.h> #include <sys/cpu_module_ms_impl.h> @@ -421,9 +425,19 @@ void cms_fini(cmi_hdl_t hdl) { cms_t *cms = HDL2CMS(hdl); + struct cms_ctl *cdp; if (CMS_OP_PRESENT(cms, cms_fini)) CMS_OPS(cms)->cms_fini(hdl); + + mutex_enter(&cms_load_lock); + cdp = (struct cms_ctl *)cmi_hdl_getspecific(hdl); + if (cdp != NULL) { + if (cdp->cs_cms != NULL) + cms_rele(cdp->cs_cms); + kmem_free(cdp, sizeof (*cdp)); + } + mutex_exit(&cms_load_lock); } boolean_t diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c index 2599d3530a..103955a097 100644 --- a/usr/src/uts/i86pc/os/cpr_impl.c +++ b/usr/src/uts/i86pc/os/cpr_impl.c @@ -337,7 +337,7 @@ i_cpr_pre_resume_cpus() CPUSET_ONLY(cpu_ready_set, boot_cpuid); CPUSET_ONLY(procset, boot_cpuid); - for (who = 0; who < ncpus; who++) { + for (who = 0; who < max_ncpus; who++) { wc_cpu_t *cpup = wc_other_cpus + who; wc_desctbr_t gdt; @@ -363,7 +363,10 @@ i_cpr_pre_resume_cpus() init_real_mode_platter(who, code_length, cpup->wc_cr4, gdt); - if ((err = mach_cpuid_start(who, rm_platter_va)) != 0) { + mutex_enter(&cpu_lock); + err = mach_cpuid_start(who, rm_platter_va); + mutex_exit(&cpu_lock); + if (err != 0) { cmn_err(CE_WARN, "cpu%d: failed to start during " "suspend/resume error %d", who, err); continue; @@ -481,10 +484,13 @@ prt_other_cpus() return; } - for (who = 0; who < ncpus; who++) { + for (who = 0; who < max_ncpus; who++) { wc_cpu_t *cpup = wc_other_cpus + who; + if (!CPU_IN_SET(mp_cpus, who)) + continue; + PMD(PMD_SX, ("prt_other_cpus() who = %d, gdt=%p:%x, " "idt=%p:%x, ldt=%lx, tr=%lx, kgsbase=" AFMT ", sp=%lx\n", who, @@ -1009,7 +1015,7 @@ i_cpr_alloc_cpus(void) */ if (wc_other_cpus == NULL) { - wc_other_cpus = kmem_zalloc(ncpus * sizeof (wc_cpu_t), + wc_other_cpus = kmem_zalloc(max_ncpus * sizeof (wc_cpu_t), KM_SLEEP); } @@ -1022,7 +1028,7 @@ i_cpr_free_cpus(void) wc_cpu_t *wc_cpu; if (wc_other_cpus != NULL) { - for (index = 0; index < ncpus; index++) { + for (index = 0; index < max_ncpus; index++) { wc_cpu = wc_other_cpus + index; if (wc_cpu->wc_saved_stack != NULL) { kmem_free(wc_cpu->wc_saved_stack, @@ -1030,7 +1036,8 @@ i_cpr_free_cpus(void) } } - kmem_free((void *) wc_other_cpus, ncpus * sizeof (wc_cpu_t)); + kmem_free((void *) wc_other_cpus, + max_ncpus * sizeof (wc_cpu_t)); wc_other_cpus = NULL; } } diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 8a14fccaa8..d47d680939 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -430,6 +430,9 @@ uint32_t cpuid_feature_ecx_exclude; uint32_t cpuid_feature_edx_include; uint32_t cpuid_feature_edx_exclude; +/* + * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs. + */ void cpuid_alloc_space(cpu_t *cpu) { @@ -439,6 +442,7 @@ cpuid_alloc_space(cpu_t *cpu) * their cpuid_info struct allocated here. 
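 *
 * (Editor's note: the BSP instead uses the statically allocated
 * cpuid_info0 -- see cpuid_pass1() below -- which is why the
 * assertions here require cpu_id != 0 and a NULL mcpu_cpi slot.)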
*/ ASSERT(cpu->cpu_id != 0); + ASSERT(cpu->cpu_m.mcpu_cpi == NULL); cpu->cpu_m.mcpu_cpi = kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP); } @@ -449,7 +453,8 @@ cpuid_free_space(cpu_t *cpu) struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; int i; - ASSERT(cpu->cpu_id != 0); + ASSERT(cpi != NULL); + ASSERT(cpi != &cpuid_info0); /* * Free up any function 4 related dynamic storage @@ -460,7 +465,8 @@ cpuid_free_space(cpu_t *cpu) kmem_free(cpi->cpi_std_4, cpi->cpi_std_4_size * sizeof (struct cpuid_regs *)); - kmem_free(cpu->cpu_m.mcpu_cpi, sizeof (*cpu->cpu_m.mcpu_cpi)); + kmem_free(cpi, sizeof (*cpi)); + cpu->cpu_m.mcpu_cpi = NULL; } #if !defined(__xpv) @@ -721,9 +727,9 @@ cpuid_pass1(cpu_t *cpu) determine_platform(); #endif /* - * Space statically allocated for cpu0, ensure pointer is set + * Space statically allocated for BSP, ensure pointer is set */ - if (cpu->cpu_id == 0) + if (cpu->cpu_id == 0 && cpu->cpu_m.mcpu_cpi == NULL) cpu->cpu_m.mcpu_cpi = &cpuid_info0; cpi = cpu->cpu_m.mcpu_cpi; ASSERT(cpi != NULL); @@ -3977,9 +3983,9 @@ cpuid_mwait_alloc(cpu_t *cpu) uint32_t *ret; size_t mwait_size; - ASSERT(cpuid_checkpass(cpu, 2)); + ASSERT(cpuid_checkpass(CPU, 2)); - mwait_size = cpu->cpu_m.mcpu_cpi->cpi_mwait.mon_max; + mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max; if (mwait_size == 0) return (NULL); @@ -4016,7 +4022,9 @@ cpuid_mwait_alloc(cpu_t *cpu) void cpuid_mwait_free(cpu_t *cpu) { - ASSERT(cpuid_checkpass(cpu, 2)); + if (cpu->cpu_m.mcpu_cpi == NULL) { + return; + } if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL && cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) { diff --git a/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c index a092046071..896b110e0e 100644 --- a/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c +++ b/usr/src/uts/i86pc/os/cpupm/cpu_acpi.c @@ -733,7 +733,8 @@ cpu_acpi_cache_cst(cpu_acpi_handle_t handle) CPU_ACPI_CSTATES_COUNT(handle) = (uint32_t)cnt; alloc_size = CPU_ACPI_CSTATES_SIZE(cnt); - CPU_ACPI_CSTATES(handle) = kmem_zalloc(alloc_size, KM_SLEEP); + if (CPU_ACPI_CSTATES(handle) == NULL) + CPU_ACPI_CSTATES(handle) = kmem_zalloc(alloc_size, KM_SLEEP); cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); p = cstate; diff --git a/usr/src/uts/i86pc/os/cpupm/cpu_idle.c b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c index 3dd75f00a5..483e8865c7 100644 --- a/usr/src/uts/i86pc/os/cpupm/cpu_idle.c +++ b/usr/src/uts/i86pc/os/cpupm/cpu_idle.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/
@@ -47,6 +47,8 @@
 #define CSTATE_USING_HPET 1
 #define CSTATE_USING_LAT 2
+#define CPU_IDLE_STOP_TIMEOUT 1000
+
 extern void cpu_idle_adaptive(void);
 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
@@ -677,7 +679,7 @@ cpu_idle_init(cpu_t *cp)
     /*
      * Allocate, initialize and install cstate kstat
      */
-    cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
+    cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
         name, "misc",
         KSTAT_TYPE_NAMED,
         sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
@@ -692,8 +694,8 @@ cpu_idle_init(cpu_t *cp)
         cstate->cs_ksp->ks_data_size += MAXNAMELEN;
         cstate->cs_ksp->ks_private = cstate;
         kstat_install(cstate->cs_ksp);
-        cstate++;
     }
+    cstate++;
 }

 cpupm_alloc_domains(cp, CPUPM_C_STATES);
@@ -771,6 +773,19 @@ cpu_idle_fini(cpu_t *cp)
     mutex_exit(&cpu_idle_callb_mutex);
 }

+/*
+ * This function is introduced here to solve a race condition
+ * between the master and the slave touching the c-state data
+ * structures. After the slave calls this idle function and switches
+ * to the non-deep idle function, the master can go on to reclaim
+ * the resources.
+ */
+static void
+cpu_idle_stop_sync(void)
+{
+    /* switch to the non-deep idle function */
+    CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
+}
+
 static void
 cpu_idle_stop(cpu_t *cp)
 {
@@ -778,15 +793,28 @@
     (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
     cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
     cpu_acpi_cstate_t *cstate;
-    uint_t cpu_max_cstates, i;
+    uint_t cpu_max_cstates, i = 0;

-    /*
-     * place the CPUs in a safe place so that we can disable
-     * deep c-state on them.
-     */
-    pause_cpus(NULL);
-    cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
-    start_cpus();
+    mutex_enter(&cpu_idle_callb_mutex);
+    if (idle_cpu == cpu_idle_adaptive) {
+        /*
+         * invoke the slave to call the synchronous idle function.
+         */
+        cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
+        poke_cpu(cp->cpu_id);
+
+        /*
+         * wait until the slave switches to the non-deep idle function,
+         * so that the master is safe to go on to reclaim the resource.
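+         *
+         * Editor's sketch of the handshake (illustrative): the master
+         * stores cpu_idle_stop_sync into the slave's mcpu_idle_cpu,
+         * pokes the slave out of its current idle loop, then polls
+         * below until it observes mcpu_idle_cpu == non_deep_idle_cpu
+         * -- a simple store/poll rendezvous that warns every
+         * CPU_IDLE_STOP_TIMEOUT iterations.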
+ */ + while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) { + drv_usecwait(10); + if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0) + cmn_err(CE_NOTE, "!cpu_idle_stop: the slave" + " idle stop timeout"); + } + } + mutex_exit(&cpu_idle_callb_mutex); cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle); if (cstate) { @@ -937,7 +965,6 @@ cpuidle_cstate_instance(cpu_t *cp) cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state" " object Instance: %d", cpu_id); } - mutex_enter(&cpu_lock); mcpu = &(cp->cpu_m); mcpu->max_cstates = cpu_acpi_get_max_cstates(handle); if (mcpu->max_cstates > CPU_ACPI_C1) { @@ -950,7 +977,6 @@ cpuidle_cstate_instance(cpu_t *cp) cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu; (void) cstate_timer_callback(CST_EVENT_ONE_CSTATE); } - mutex_exit(&cpu_lock); CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result); } while (result < 0); diff --git a/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c index 04113a9485..66b2acc7bf 100644 --- a/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_intel.c @@ -93,8 +93,8 @@ cpupm_intel_init(cpu_t *cp) if (x86_vendor != X86_VENDOR_Intel) return (B_FALSE); - family = cpuid_getfamily(CPU); - model = cpuid_getmodel(CPU); + family = cpuid_getfamily(cp); + model = cpuid_getmodel(cp); cpupm_intel_pdccap = CPUPM_INTEL_PDC_MP; diff --git a/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c index 78ad028890..72302fcf07 100644 --- a/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c +++ b/usr/src/uts/i86pc/os/cpupm/cpupm_mach.c @@ -240,7 +240,7 @@ cpupm_init(cpu_t *cp) if (mcpu->max_cstates > CPU_ACPI_C1) { (void) cstate_timer_callback( CST_EVENT_MULTIPLE_CSTATES); - CPU->cpu_m.mcpu_idle_cpu = cpu_acpi_idle; + cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle; mcpu->mcpu_idle_type = CPU_ACPI_C1; disp_enq_thread = cstate_wakeup; } else { @@ -439,9 +439,13 @@ cpupm_alloc_domains(cpu_t *cp, int state) domain = CPU_ACPI_PSD(handle).sd_domain; type = CPU_ACPI_PSD(handle).sd_type; } else { - mutex_enter(&cpu_lock); - domain = cpuid_get_chipid(cp); - mutex_exit(&cpu_lock); + if (MUTEX_HELD(&cpu_lock)) { + domain = cpuid_get_chipid(cp); + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_chipid(cp); + mutex_exit(&cpu_lock); + } type = CPU_ACPI_HW_ALL; } dom_ptr = &cpupm_pstate_domains; @@ -452,9 +456,13 @@ cpupm_alloc_domains(cpu_t *cp, int state) domain = CPU_ACPI_TSD(handle).sd_domain; type = CPU_ACPI_TSD(handle).sd_type; } else { - mutex_enter(&cpu_lock); - domain = cpuid_get_chipid(cp); - mutex_exit(&cpu_lock); + if (MUTEX_HELD(&cpu_lock)) { + domain = cpuid_get_chipid(cp); + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_chipid(cp); + mutex_exit(&cpu_lock); + } type = CPU_ACPI_HW_ALL; } dom_ptr = &cpupm_tstate_domains; @@ -465,9 +473,13 @@ cpupm_alloc_domains(cpu_t *cp, int state) domain = CPU_ACPI_CSD(handle).sd_domain; type = CPU_ACPI_CSD(handle).sd_type; } else { - mutex_enter(&cpu_lock); - domain = cpuid_get_coreid(cp); - mutex_exit(&cpu_lock); + if (MUTEX_HELD(&cpu_lock)) { + domain = cpuid_get_coreid(cp); + } else { + mutex_enter(&cpu_lock); + domain = cpuid_get_coreid(cp); + mutex_exit(&cpu_lock); + } type = CPU_ACPI_HW_ALL; } dom_ptr = &cpupm_cstate_domains; diff --git a/usr/src/uts/i86pc/os/fakebop.c b/usr/src/uts/i86pc/os/fakebop.c index 1c23b7643c..d38bcb046f 100644 --- a/usr/src/uts/i86pc/os/fakebop.c +++ b/usr/src/uts/i86pc/os/fakebop.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ * All rights reserved.
+ */

 /*
  * This file contains the functionality that mimics the boot operations
@@ -40,6 +44,7 @@
 #include <sys/varargs.h>
 #include <sys/param.h>
 #include <sys/machparam.h>
+#include <sys/machsystm.h>
 #include <sys/archsystm.h>
 #include <sys/boot_console.h>
 #include <sys/cmn_err.h>
@@ -151,12 +156,13 @@
 char fastreboot_onpanic_cmdline[FASTBOOT_SAVED_CMDLINE_LEN];
 static const char fastreboot_onpanic_args[] = " -B fastreboot_onpanic=0";

 /*
- * Pointers to where System Resource Affinity Table (SRAT) and
- * System Locality Information Table (SLIT) are mapped into virtual memory
+ * Pointers to where System Resource Affinity Table (SRAT), System Locality
+ * Information Table (SLIT) and Maximum System Capability Table (MSCT)
+ * are mapped into virtual memory
  */
 struct srat *srat_ptr = NULL;
 struct slit *slit_ptr = NULL;
-
+struct msct *msct_ptr = NULL;

 /*
  * Allocate aligned physical memory at boot time. This allocator allocates
@@ -2077,6 +2083,7 @@ process_madt(struct madt *tp)
 {
     struct madt_processor *cpu, *end;
     uint32_t cpu_count = 0;
+    uint32_t cpu_possible_count = 0;
     uint8_t cpu_apicid_array[UINT8_MAX + 1];

     if (tp != NULL) {
@@ -2094,6 +2101,7 @@ process_madt(struct madt *tp)
                 cpu->apic_id;
                 cpu_count++;
             }
+            cpu_possible_count++;
         }

         cpu = (struct madt_processor *)
@@ -2109,17 +2117,44 @@
     }

     /*
+     * Check whether the plat-max-ncpus property is already set.
+     */
+    if (do_bsys_getproplen(NULL, PLAT_MAX_NCPUS_NAME) < 0) {
+        /*
+         * Set plat-max-ncpus to the maximum number of possible CPUs
+         * given in the MADT if it hasn't been set.
+         * There's no formal way to detect the maximum number of
+         * possible CPUs supported by a platform according to ACPI
+         * spec 3.0b. So the current CPU hotplug implementation
+         * expects that all possible CPUs will have an entry in the
+         * MADT, and sets plat-max-ncpus to the number of entries in
+         * the MADT.
+         * With the introduction of ACPI 4.0, the Maximum System
+         * Capability Table (MSCT) provides the maximum number of CPUs
+         * supported by the platform. If the MSCT is unavailable, fall
+         * back to the old way.
+         */
+        if (tp != NULL)
+            bsetpropsi(PLAT_MAX_NCPUS_NAME, cpu_possible_count);
+    }
+
+    /*
+     * Set boot property boot-max-ncpus to the number of CPUs existing
+     * at boot time. boot-max-ncpus is mainly used for optimization.
+     */
+    if (tp != NULL)
+        bsetpropsi(BOOT_MAX_NCPUS_NAME, cpu_count);
+
+    /*
      * User-set boot-ncpus overrides firmware count
      */
-    if (do_bsys_getproplen(NULL, "boot-ncpus") >= 0)
+    if (do_bsys_getproplen(NULL, BOOT_NCPUS_NAME) >= 0)
         return;

     /*
-     * Set boot property for boot-ncpus to number of CPUs given in MADT
-     * if user hasn't set the property already
+     * Set boot property boot-ncpus to the number of active CPUs given
+     * in the MADT if it hasn't been set yet.
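+     *
+     * Editor's illustration (example values only): on a system whose
+     * MADT lists 8 possible CPUs of which 2 are enabled, booted with
+     * boot-ncpus=1 on the command line, the properties end up as
+     * plat-max-ncpus=8, boot-max-ncpus=2 and boot-ncpus=1.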
*/ if (tp != NULL) - bsetpropsi("boot-ncpus", cpu_count); + bsetpropsi(BOOT_NCPUS_NAME, cpu_count); } static void @@ -2146,6 +2181,7 @@ process_srat(struct srat *tp) } memory; #pragma pack() char prop_name[30]; + uint64_t maxmem = 0; if (tp == NULL) return; @@ -2181,6 +2217,10 @@ process_srat(struct srat *tp) mem_num); bsetprop(prop_name, strlen(prop_name), &memory, sizeof (memory)); + if ((item->i.m.flags & SRAT_HOT_PLUG) && + (memory.addr + memory.length > maxmem)) { + maxmem = memory.addr + memory.length; + } mem_num++; break; case SRAT_X2APIC: @@ -2199,6 +2239,14 @@ process_srat(struct srat *tp) item = (struct srat_item *) (item->len + (caddr_t)item); } + + /* + * The maximum physical address calculated from the SRAT table is more + * accurate than that calculated from the MSCT table. + */ + if (maxmem != 0) { + plat_dr_physmax = btop(maxmem); + } } static void @@ -2224,6 +2272,99 @@ process_slit(struct slit *tp) tp->number * tp->number); } +static struct msct * +process_msct(struct msct *tp) +{ + int last_seen = 0; + int proc_num = 0; + struct msct_proximity_domain *item, *end; + extern uint64_t plat_dr_options; + + ASSERT(tp != NULL); + + end = (void *)(tp->hdr.len + (uintptr_t)tp); + for (item = (void *)((uintptr_t)tp + tp->proximity_domain_offset); + item < end; + item = (void *)(item->length + (uintptr_t)item)) { + /* + * Sanity check according to section 5.2.19.1 of ACPI 4.0. + * Revision 1 + * Length 22 + */ + if (item->revision != 1 || item->length != 22) { + cmn_err(CE_CONT, + "?boot: unknown proximity domain structure in MSCT " + "with rev(%d), len(%d).\n", + (int)item->revision, (int)item->length); + return (NULL); + } else if (item->domain_min > item->domain_max) { + cmn_err(CE_CONT, + "?boot: invalid proximity domain structure in MSCT " + "with domain_min(%u), domain_max(%u).\n", + item->domain_min, item->domain_max); + return (NULL); + } else if (item->domain_min != last_seen) { + /* + * Items must be organized in ascending order of the + * proximity domain enumerations. + */ + cmn_err(CE_CONT, + "?boot: invalid proximity domain structure in MSCT," + " items are not organized in ascending order.\n"); + return (NULL); + } + + /* + * If processor_max is 0 then there would be no CPUs in this + * domain. + */ + if (item->processor_max != 0) { + proc_num += (item->domain_max - item->domain_min + 1) * + item->processor_max; + } + + last_seen = item->domain_max - item->domain_min + 1; + /* + * Break out if all proximity domains have been processed. + * Some BIOSes may have unused items at the end of the MSCT + * table. + */ + if (last_seen > tp->maximum_proximity_domains) { + break; + } + } + if (last_seen != tp->maximum_proximity_domains + 1) { + cmn_err(CE_CONT, + "?boot: invalid proximity domain structure in MSCT, " + "proximity domain count doesn't match.\n"); + return (NULL); + } + + /* + * Set plat-max-ncpus property if it hasn't been set yet. + */ + if (do_bsys_getproplen(NULL, PLAT_MAX_NCPUS_NAME) < 0) { + if (proc_num != 0) { + bsetpropsi(PLAT_MAX_NCPUS_NAME, proc_num); + } + } + + /* + * Use Maximum Physical Address from the MSCT table as the upper limit + * for memory hot-adding by default. It may be overridden by the value + * from the SRAT table or the "plat-dr-physmax" boot option. + */ + plat_dr_physmax = btop(tp->maximum_physical_address + 1); + + /* + * Existence of an MSCT implies CPU/memory hotplug capability for the + * platform.
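The revision/length sanity check in process_msct() above guards a fixed 22-byte record. Assuming the field names from the patch's struct msct_proximity_domain and the offsets given in section 5.2.19.1 of ACPI 4.0, the layout works out as follows (a sketch, not the shipped header):

#pragma pack(1)
struct msct_proximity_domain_sketch {
	uint8_t		revision;	/* offset 0: must be 1 */
	uint8_t		length;		/* offset 1: must be 22 */
	uint32_t	domain_min;	/* offset 2: proximity domain range low */
	uint32_t	domain_max;	/* offset 6: proximity domain range high */
	uint32_t	processor_max;	/* offset 10: max processor capacity */
	uint64_t	memory_max;	/* offset 14: max memory capacity, bytes */
};					/* total: 22 bytes */
#pragma pack()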
+ */ + plat_dr_options |= PLAT_DR_FEATURE_CPU; + plat_dr_options |= PLAT_DR_FEATURE_MEMORY; + + return (tp); +} + #else /* __xpv */ static void enumerate_xen_cpus() @@ -2233,7 +2374,7 @@ enumerate_xen_cpus() /* * User-set boot-ncpus overrides enumeration */ - if (do_bsys_getproplen(NULL, "boot-ncpus") >= 0) + if (do_bsys_getproplen(NULL, BOOT_NCPUS_NAME) >= 0) return; /* @@ -2246,7 +2387,7 @@ enumerate_xen_cpus() if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) == 0) max_id = id; - bsetpropsi("boot-ncpus", max_id+1); + bsetpropsi(BOOT_NCPUS_NAME, max_id+1); } #endif /* __xpv */ @@ -2257,6 +2398,9 @@ build_firmware_properties(void) struct table_header *tp = NULL; #ifndef __xpv + if ((msct_ptr = (struct msct *)find_fw_table("MSCT")) != NULL) + msct_ptr = process_msct(msct_ptr); + if ((tp = find_fw_table("APIC")) != NULL) process_madt((struct madt *)tp); diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c index 1525b7859d..02596478c0 100644 --- a/usr/src/uts/i86pc/os/lgrpplat.c +++ b/usr/src/uts/i86pc/os/lgrpplat.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS @@ -88,14 +92,18 @@ * - lgrp_plat_lat_stats.latencies[][] Table of latencies between same and * different nodes indexed by node ID * - * - lgrp_plat_node_cnt Number of NUMA nodes in system + * - lgrp_plat_node_cnt Number of NUMA nodes in the system on + * non-DR-capable systems, + * maximum possible number of NUMA nodes + * in the system on DR-capable systems. * * - lgrp_plat_node_domain[] Node ID to proximity domain ID mapping * table indexed by node ID (only used * for SRAT) * - * - lgrp_plat_node_memory[] Table with physical address range for - * each node indexed by node ID + * - lgrp_plat_memnode_info[] Table with physical address range for + * each memory node indexed by memory node + * ID * * The code is implemented to make the following always be true: * @@ -113,26 +121,41 @@ * equivalent node ID since we want to keep the node IDs numbered from 0 to * <number of nodes - 1> to minimize cost of searching and potentially space. * - * The code below really tries to do the above. However, the virtual memory - * system expects the memnodes which describe the physical address range for - * each NUMA node to be arranged in ascending order by physical address. (:-( - * Otherwise, the kernel will panic in different semi-random places in the VM - * system. + * With the introduction of support for memory DR operations on x86 platforms, + * things get a little complicated. The addresses of hot-added memory may not + * be contiguous with other memory connected to the same lgrp node. In other + * words, memory addresses may get interleaved among lgrp nodes after memory + * DR operations. To work around this limitation, we have extended the + * relationship between lgrp node and memory node from a 1:1 to a 1:N mapping, + * which means there may be multiple memory nodes associated with an lgrp node + * after memory DR operations. * - * Consequently, this module has to try to sort the nodes in ascending order by - * each node's starting physical address to try to meet this "constraint" in - * the VM system (see lgrp_plat_node_sort()).
Also, the lowest numbered - * proximity domain ID in the system is deteremined and used to make the lowest - * numbered proximity domain map to node 0 in hopes that the proximity domains - * are sorted in ascending order by physical address already even if their IDs - * don't start at 0 (see NODE_DOMAIN_HASH() and lgrp_plat_srat_domains()). - * Finally, it is important to note that these workarounds may not be - * sufficient if/when memory hotplugging is supported and the VM system may - * ultimately need to be fixed to handle this.... + * To minimize the code changes to support memory DR operations, the + * following policies have been adopted. + * 1) On non-DR-capable systems, the relationship among lgroup platform handle, + * node ID and memnode ID is still kept as: + * lgroup platform handle == node ID == memnode ID + * 2) For memory present at boot time on DR-capable platforms, the relationship + * is still kept as is: + * lgroup platform handle == node ID == memnode ID + * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has + * been changed from a 1:1 to a 1:N mapping. Memnode IDs [0, lgrp_plat_node_cnt) + * are reserved for memory present at boot time, and memnode IDs + * [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate + * memnode IDs for hot-added memory. + * 4) All boot code that assumes "node ID == memnode ID" can stay as is, + * because node ID is always equal to memnode ID at boot time. + * 5) The logic related to lgrp_plat_memnode_info_update(), + * plat_pfn_to_mem_node() and lgrp_plat_mem_size() has been enhanced to deal + * with the 1:N mapping. + * 6) The latency probing logic, which assumes "node ID == memnode ID" and may + * be called at run time, is disabled if memory DR is enabled. */ #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ +#include <sys/atomic.h> #include <sys/bootconf.h> #include <sys/cmn_err.h> #include <sys/controlregs.h> @@ -143,6 +166,7 @@ #include <sys/memlist.h> #include <sys/memnode.h> #include <sys/mman.h> +#include <sys/note.h> #include <sys/pci_cfgspace.h> #include <sys/pci_impl.h> #include <sys/param.h> @@ -158,7 +182,8 @@ #include <vm/seg_kmem.h> #include <vm/vm_dep.h> -#include "acpi_fw.h" /* for SRAT and SLIT */ +#include <sys/acpidev.h> +#include "acpi_fw.h" /* for SRAT, SLIT and MSCT */ #define MAX_NODES 8 @@ -187,7 +212,6 @@ ((lgrp_plat_prox_domain_min == UINT32_MAX) ?
(domain) % node_cnt : \ ((domain) - lgrp_plat_prox_domain_min) % node_cnt) - /* * CPU to node ID mapping structure (only used with SRAT) */ @@ -239,14 +263,16 @@ typedef struct node_domain_map { } node_domain_map_t; /* - * Node ID and starting and ending page for physical memory in node + * Node ID and starting and ending page for physical memory in memory node */ -typedef struct node_phys_addr_map { +typedef struct memnode_phys_addr_map { pfn_t start; pfn_t end; int exists; uint32_t prox_domain; -} node_phys_addr_map_t; + uint32_t device_id; + uint_t lgrphand; +} memnode_phys_addr_map_t; /* * Number of CPUs for which we got APIC IDs @@ -278,7 +304,7 @@ static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; /* * Physical address range for memory in each node */ -static node_phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; +static memnode_phys_addr_map_t lgrp_plat_memnode_info[MAX_MEM_NODES]; /* * Statistics gotten from probing @@ -306,6 +332,17 @@ static int lgrp_plat_srat_error = 0; static int lgrp_plat_slit_error = 0; /* + * Whether lgrp topology has been flattened to 2 levels. + */ +static int lgrp_plat_topo_flatten = 0; + + +/* + * Maximum memory node ID in use. + */ +static uint_t lgrp_plat_max_mem_node; + +/* * Allocate lgroup array statically */ static lgrp_t lgrp_space[NLGRP]; @@ -318,7 +355,7 @@ static int nlgrps_alloc; int lgrp_plat_domain_min_enable = 1; /* - * Number of nodes in system + * Maximum possible number of nodes in system */ uint_t lgrp_plat_node_cnt = 1; @@ -343,11 +380,12 @@ int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; /* - * Enable use of ACPI System Resource Affinity Table (SRAT) and System - * Locality Information Table (SLIT) + * Enable use of ACPI System Resource Affinity Table (SRAT), System + * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT) */ int lgrp_plat_srat_enable = 1; int lgrp_plat_slit_enable = 1; +int lgrp_plat_msct_enable = 1; /* * mnode_xwa: set to non-zero value to initiate workaround if large pages are @@ -370,12 +408,10 @@ struct lgrp_stats lgrp_stats[NLGRP]; */ void plat_build_mem_nodes(struct memlist *list); -int plat_lgrphand_to_mem_node(lgrp_handle_t hand); +int plat_mnode_xcheck(pfn_t pfncnt); lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); -int plat_mnode_xcheck(pfn_t pfncnt); - int plat_pfn_to_mem_node(pfn_t pfn); /* @@ -420,11 +456,11 @@ static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, static void lgrp_plat_get_numa_config(void); -static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, +static void lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); -static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, +static int lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats); static void lgrp_plat_main_init(void); @@ -434,13 +470,13 @@ static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, uint32_t domain); -static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain, - int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start, - uint64_t end, uint32_t domain); +static int lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, + int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt, + uint64_t start, uint64_t end, 
uint32_t domain, uint32_t device_id); static void lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, cpu_node_map_t *cpu_node, int cpu_count, - node_phys_addr_map_t *node_memory); + memnode_phys_addr_map_t *memnode_info); static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, @@ -448,24 +484,32 @@ static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); -static int lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, - node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); +static int lgrp_plat_process_slit(struct slit *tp, + node_domain_map_t *node_domain, uint_t node_cnt, + memnode_phys_addr_map_t *memnode_info, + lgrp_plat_latency_stats_t *lat_stats); + +static int lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info, + uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, + lgrp_plat_latency_stats_t *lat_stats); -static int lgrp_plat_process_srat(struct srat *tp, +static int lgrp_plat_process_srat(struct srat *tp, struct msct *mp, uint32_t *prox_domain_min, node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count, - node_phys_addr_map_t *node_memory); + memnode_phys_addr_map_t *memnode_info); static void lgrp_plat_release_bootstrap(void); static int lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min); -static void lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, - lgrp_plat_latency_stats_t *lat_stats); +static int lgrp_plat_msct_domains(struct msct *tp, + uint32_t *prox_domain_min); + +static void lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats); static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, - node_phys_addr_map_t *node_memory); + memnode_phys_addr_map_t *memnode_info); static hrtime_t opt_probe_vendor(int dest_node, int nreads); @@ -529,10 +573,10 @@ plat_build_mem_nodes(struct memlist *list) * This shouldn't happen and rest of code can't deal * with this if it does. */ - if (node < 0 || node >= lgrp_plat_node_cnt || - !lgrp_plat_node_memory[node].exists || - cur_start < lgrp_plat_node_memory[node].start || - cur_start > lgrp_plat_node_memory[node].end) { + if (node < 0 || node >= lgrp_plat_max_mem_node || + !lgrp_plat_memnode_info[node].exists || + cur_start < lgrp_plat_memnode_info[node].start || + cur_start > lgrp_plat_memnode_info[node].end) { cmn_err(CE_PANIC, "Don't know which memnode " "to add installed memory address 0x%lx\n", cur_start); @@ -543,9 +587,9 @@ plat_build_mem_nodes(struct memlist *list) */ cur_end = end; endcnt = 0; - if (lgrp_plat_node_memory[node].exists && - cur_end > lgrp_plat_node_memory[node].end) { - cur_end = lgrp_plat_node_memory[node].end; + if (lgrp_plat_memnode_info[node].exists && + cur_end > lgrp_plat_memnode_info[node].end) { + cur_end = lgrp_plat_memnode_info[node].end; if (mnode_xwa > 1) { /* * sacrifice the last page in each @@ -572,16 +616,6 @@ plat_build_mem_nodes(struct memlist *list) } -int -plat_lgrphand_to_mem_node(lgrp_handle_t hand) -{ - if (max_mem_nodes == 1) - return (0); - - return ((int)hand); -} - - /* * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt * range of pages aligned on pfncnt that crosses an node boundary. 
Returns 1 if @@ -593,9 +627,9 @@ plat_mnode_xcheck(pfn_t pfncnt) int node, prevnode = -1, basenode; pfn_t ea, sa; - for (node = 0; node < lgrp_plat_node_cnt; node++) { + for (node = 0; node < lgrp_plat_max_mem_node; node++) { - if (lgrp_plat_node_memory[node].exists == 0) + if (lgrp_plat_memnode_info[node].exists == 0) continue; if (prevnode == -1) { @@ -605,23 +639,23 @@ plat_mnode_xcheck(pfn_t pfncnt) } /* assume x86 node pfn ranges are in increasing order */ - ASSERT(lgrp_plat_node_memory[node].start > - lgrp_plat_node_memory[prevnode].end); + ASSERT(lgrp_plat_memnode_info[node].start > + lgrp_plat_memnode_info[prevnode].end); /* * continue if the starting address of node is not contiguous * with the previous node. */ - if (lgrp_plat_node_memory[node].start != - (lgrp_plat_node_memory[prevnode].end + 1)) { + if (lgrp_plat_memnode_info[node].start != + (lgrp_plat_memnode_info[prevnode].end + 1)) { basenode = node; prevnode = node; continue; } /* check if the starting address of node is pfncnt aligned */ - if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { + if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) { /* * at this point, node starts at an unaligned boundary @@ -630,14 +664,14 @@ plat_mnode_xcheck(pfn_t pfncnt) * range of length pfncnt that crosses this boundary. */ - sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, + sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end, pfncnt); - ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), + ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start), pfncnt); ASSERT((ea - sa) == pfncnt); - if (sa >= lgrp_plat_node_memory[basenode].start && - ea <= (lgrp_plat_node_memory[node].end + 1)) { + if (sa >= lgrp_plat_memnode_info[basenode].start && + ea <= (lgrp_plat_memnode_info[node].end + 1)) { /* * large page found to cross mnode boundary. * Return Failure if workaround not enabled. @@ -659,9 +693,10 @@ plat_mem_node_to_lgrphand(int mnode) if (max_mem_nodes == 1) return (LGRP_DEFAULT_HANDLE); - return ((lgrp_handle_t)mnode); -} + ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node); + return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand)); +} int plat_pfn_to_mem_node(pfn_t pfn) @@ -671,22 +706,23 @@ plat_pfn_to_mem_node(pfn_t pfn) if (max_mem_nodes == 1) return (0); - for (node = 0; node < lgrp_plat_node_cnt; node++) { + for (node = 0; node < lgrp_plat_max_mem_node; node++) { /* * Skip nodes with no memory */ - if (!lgrp_plat_node_memory[node].exists) + if (!lgrp_plat_memnode_info[node].exists) continue; - if (pfn >= lgrp_plat_node_memory[node].start && - pfn <= lgrp_plat_node_memory[node].end) + membar_consumer(); + if (pfn >= lgrp_plat_memnode_info[node].start && + pfn <= lgrp_plat_memnode_info[node].end) return (node); } /* * Didn't find memnode where this PFN lives which should never happen */ - ASSERT(node < lgrp_plat_node_cnt); + ASSERT(node < lgrp_plat_max_mem_node); return (-1); } @@ -698,7 +734,6 @@ plat_pfn_to_mem_node(pfn_t pfn) /* * Allocate additional space for an lgroup. */ -/* ARGSUSED */ lgrp_t * lgrp_plat_alloc(lgrp_id_t lgrpid) { @@ -713,23 +748,198 @@ lgrp_plat_alloc(lgrp_id_t lgrpid) /* * Platform handling for (re)configuration changes + * + * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug: + * 1) Use cpu_lock to synchronize between lgrp_plat_config() and + * lgrp_plat_cpu_to_hand(). + * 2) Disable latency probing logic by making sure that the flag + * LGRP_PLAT_PROBE_ENABLE is cleared. 
+ * + * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug: + * 1) Only insertions into lgrp_plat_memnode_info at memory hotplug, no + * removal. + * 2) Only expansion of existing entries, no shrinking. + * 3) On the writing side, the DR framework ensures that lgrp_plat_config() is + * called in single-threaded context, and membar_producer() is used to ensure + * that all changes are visible to other CPUs before setting the "exists" + * flag. + * 4) On the reading side, membar_consumer() after checking the "exists" flag + * ensures that the right values are retrieved. + * + * Mechanism to protect lgrp_plat_node_domain[] at hotplug: + * 1) Only insertions into lgrp_plat_node_domain at hotplug, no removal. + * 2) On the writing side, it's single-threaded and membar_producer() is used + * to ensure all changes are visible to other CPUs before setting the "exists" + * flag. + * 3) On the reading side, membar_consumer() after checking the "exists" flag + * ensures that the right values are retrieved. */ -/* ARGSUSED */ void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) { +#ifdef __xpv + _NOTE(ARGUNUSED(flag, arg)); +#else + int rc, node; + cpu_t *cp; + void *hdl = NULL; + uchar_t *sliptr = NULL; + uint32_t domain, apicid, slicnt = 0; + update_membounds_t *mp; + + extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *, + uint32_t *, uint32_t *, uchar_t **); + extern void acpidev_dr_free_cpu_numa_info(void *); + + /* + * This interface is used to support CPU/memory DR operations. + * Don't bother here if the system is still booting or only one lgrp + * node is supported. + */ + if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1) + return; + + switch (flag) { + case LGRP_CONFIG_CPU_ADD: + cp = (cpu_t *)arg; + ASSERT(cp != NULL); + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* Check whether CPU already exists. */ + ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists); + if (lgrp_plat_cpu_node[cp->cpu_id].exists) { + cmn_err(CE_WARN, + "!lgrp: CPU(%d) already exists in cpu_node map.", + cp->cpu_id); + break; + } + + /* Query CPU lgrp information. */ + rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, + &slicnt, &sliptr); + ASSERT(rc == 0); + if (rc != 0) { + cmn_err(CE_WARN, + "!lgrp: failed to query lgrp info for CPU(%d).", + cp->cpu_id); + break; + } + + /* Update node to proximity domain mapping */ + node = lgrp_plat_domain_to_node(lgrp_plat_node_domain, + lgrp_plat_node_cnt, domain); + if (node == -1) { + node = lgrp_plat_node_domain_update( + lgrp_plat_node_domain, lgrp_plat_node_cnt, domain); + ASSERT(node != -1); + if (node == -1) { + acpidev_dr_free_cpu_numa_info(hdl); + cmn_err(CE_WARN, "!lgrp: failed to update " + "node_domain map for domain(%u).", domain); + break; + } + } + + /* Update latency information among lgrps. */ + if (slicnt != 0 && sliptr != NULL) { + if (lgrp_plat_process_sli(domain, sliptr, slicnt, + lgrp_plat_node_domain, lgrp_plat_node_cnt, + &lgrp_plat_lat_stats) != 0) { + cmn_err(CE_WARN, "!lgrp: failed to update " + "latency information for domain (%u).", + domain); + } + } + + /* Update CPU to node mapping. */ + lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain; + lgrp_plat_cpu_node[cp->cpu_id].node = node; + lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid; + lgrp_plat_cpu_node[cp->cpu_id].exists = 1; + lgrp_plat_apic_ncpus++; + + acpidev_dr_free_cpu_numa_info(hdl); + break; + + case LGRP_CONFIG_CPU_DEL: + cp = (cpu_t *)arg; + ASSERT(cp != NULL); + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* Check whether CPU exists.
*/ + ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists); + if (!lgrp_plat_cpu_node[cp->cpu_id].exists) { + cmn_err(CE_WARN, + "!lgrp: CPU(%d) doesn't exist in cpu_node map.", + cp->cpu_id); + break; + } + + /* Query CPU lgrp information. */ + rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, + NULL, NULL); + ASSERT(rc == 0); + if (rc != 0) { + cmn_err(CE_WARN, + "!lgrp: failed to query lgrp info for CPU(%d).", + cp->cpu_id); + break; + } + + /* Update map. */ + ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid); + ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain); + lgrp_plat_cpu_node[cp->cpu_id].exists = 0; + lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX; + lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX; + lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX; + lgrp_plat_apic_ncpus--; + + acpidev_dr_free_cpu_numa_info(hdl); + break; + + case LGRP_CONFIG_MEM_ADD: + mp = (update_membounds_t *)arg; + ASSERT(mp != NULL); + + /* Update latency information among lgrps. */ + if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) { + if (lgrp_plat_process_sli(mp->u_domain, + mp->u_sli_ptr, mp->u_sli_cnt, + lgrp_plat_node_domain, lgrp_plat_node_cnt, + &lgrp_plat_lat_stats) != 0) { + cmn_err(CE_WARN, "!lgrp: failed to update " + "latency information for domain (%u).", + domain); + } + } + + if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain, + lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes, + mp->u_base, mp->u_base + mp->u_length, + mp->u_domain, mp->u_device_id) < 0) { + cmn_err(CE_WARN, + "!lgrp: failed to update latency information for " + "memory (0x%" PRIx64 " - 0x%" PRIx64 ").", + mp->u_base, mp->u_base + mp->u_length); + } + break; + + default: + break; + } +#endif /* __xpv */ } /* * Return the platform handle for the lgroup containing the given CPU */ -/* ARGSUSED */ lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id) { lgrp_handle_t hand; + ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); + if (lgrp_plat_node_cnt == 1) return (LGRP_DEFAULT_HANDLE); @@ -763,6 +973,7 @@ lgrp_plat_init(lgrp_init_stages_t stage) */ lgrp_plat_node_cnt = max_mem_nodes = 1; #else /* __xpv */ + /* * Get boot property for lgroup topology height limit */ @@ -782,14 +993,32 @@ lgrp_plat_init(lgrp_init_stages_t stage) lgrp_plat_slit_enable = (int)value; /* + * Get boot property for enabling/disabling MSCT + */ + if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0) + lgrp_plat_msct_enable = (int)value; + + /* * Initialize as a UMA machine */ if (lgrp_topo_ht_limit() == 1) { lgrp_plat_node_cnt = max_mem_nodes = 1; + lgrp_plat_max_mem_node = 1; return; } lgrp_plat_get_numa_config(); + + /* + * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes + * to support memory DR operations if memory DR is enabled. + */ + lgrp_plat_max_mem_node = lgrp_plat_node_cnt; + if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) { + max_mem_nodes = MAX_MEM_NODES_PER_LGROUP * + lgrp_plat_node_cnt; + ASSERT(max_mem_nodes <= MAX_MEM_NODES); + } #endif /* __xpv */ break; @@ -817,7 +1046,6 @@ lgrp_plat_init(lgrp_init_stages_t stage) * specific, so platform gets to decide its value. It would be nice if the * number was at least proportional to make comparisons more meaningful though. 
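The DR-aware fallback added to lgrp_plat_latency() below boils down to one decision; a distilled sketch with the bookkeeping and the node == src check stripped out:

/*
 * If the local latency is still unknown: with DR enabled it should
 * have been filled in by lgrp_plat_config(), so flatten the topology
 * for safety; without DR, probe from the current CPU as before.
 */
static void
latency_fixup(int src, lgrp_plat_latency_stats_t *lat_stats)
{
	if (lat_stats->latencies[src][src] != 0)
		return;
	if (plat_dr_support_cpu() || plat_dr_support_memory())
		lgrp_plat_2level_setup(lat_stats);
	else
		lgrp_plat_probe();
}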
*/ -/* ARGSUSED */ int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) { @@ -844,13 +1072,29 @@ lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) /* * Probe from current CPU if its lgroup latencies haven't been set yet - * and we are trying to get latency from current CPU to some node + * and we are trying to get latency from current CPU to some node. + * Avoid probing if CPU/memory DR is enabled. */ - node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, - lgrp_plat_cpu_node_nentries); - ASSERT(node >= 0 && node < lgrp_plat_node_cnt); - if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) - lgrp_plat_probe(); + if (lgrp_plat_lat_stats.latencies[src][src] == 0) { + /* + * Latency information should be updated by lgrp_plat_config() + * for DR operations. Something is wrong if we reach here. + * For safety, flatten lgrp topology to two levels. + */ + if (plat_dr_support_cpu() || plat_dr_support_memory()) { + ASSERT(lgrp_plat_lat_stats.latencies[src][src]); + cmn_err(CE_WARN, + "lgrp: failed to get latency information, " + "fall back to two-level topology."); + lgrp_plat_2level_setup(&lgrp_plat_lat_stats); + } else { + node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, + lgrp_plat_cpu_node_nentries); + ASSERT(node >= 0 && node < lgrp_plat_node_cnt); + if (node == src) + lgrp_plat_probe(); + } + } return (lgrp_plat_lat_stats.latencies[src][dest]); } @@ -859,21 +1103,45 @@ /* * Return the maximum number of lgrps supported by the platform. * Before lgrp topology is known it returns an estimate based on the number of - * nodes. Once topology is known it returns the actual maximim number of lgrps - * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and - * dynamic addition of new nodes, this number may not grow during system - * lifetime (yet). + * nodes. Once topology is known it returns: + * 1) the actual maximum number of lgrps created if CPU/memory DR operations + * are not supported. + * 2) the maximum possible number of lgrps if CPU/memory DR operations are + * supported. */ int lgrp_plat_max_lgrps(void) { - return (lgrp_topo_initialized ? - lgrp_alloc_max + 1 : - lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); + if (!lgrp_topo_initialized || plat_dr_support_cpu() || + plat_dr_support_memory()) { + return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); + } else { + return (lgrp_alloc_max + 1); + } } /* + * Count number of memory pages (_t) based on memory node ID (_n) and query + * type (_q). + */ +#define _LGRP_PLAT_MEM_SIZE(_n, _q, _t) \ + if (mem_node_config[_n].exists) { \ + switch (_q) { \ + case LGRP_MEM_SIZE_FREE: \ + _t += MNODE_PGCNT(_n); \ + break; \ + case LGRP_MEM_SIZE_AVAIL: \ + _t += mem_node_memlist_pages(_n, phys_avail); \ + break; \ + case LGRP_MEM_SIZE_INSTALL: \ + _t += mem_node_memlist_pages(_n, phys_install); \ + break; \ + default: \ + break; \ + } \ + } + +/* * Return the number of free pages in an lgroup.
* * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize @@ -896,25 +1164,19 @@ lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) return (lgrp_plat_mem_size_default(plathand, query)); if (plathand != LGRP_NULL_HANDLE) { - mnode = plat_lgrphand_to_mem_node(plathand); - if (mnode >= 0 && mem_node_config[mnode].exists) { - switch (query) { - case LGRP_MEM_SIZE_FREE: - npgs = MNODE_PGCNT(mnode); - break; - case LGRP_MEM_SIZE_AVAIL: - npgs = mem_node_memlist_pages(mnode, - phys_avail); - break; - case LGRP_MEM_SIZE_INSTALL: - npgs = mem_node_memlist_pages(mnode, - phys_install); - break; - default: - break; - } + /* Count memory node present at boot. */ + mnode = (int)plathand; + ASSERT(mnode < lgrp_plat_node_cnt); + _LGRP_PLAT_MEM_SIZE(mnode, query, npgs); + + /* Count possible hot-added memory nodes. */ + for (mnode = lgrp_plat_node_cnt; + mnode < lgrp_plat_max_mem_node; mnode++) { + if (lgrp_plat_memnode_info[mnode].lgrphand == plathand) + _LGRP_PLAT_MEM_SIZE(mnode, query, npgs); } } + return (npgs); } @@ -923,7 +1185,6 @@ * Return the platform handle of the lgroup that contains the physical memory * corresponding to the given page frame number */ -/* ARGSUSED */ lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn) { @@ -979,6 +1240,10 @@ lgrp_plat_probe(void) max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) return; + /* SRAT and SLIT should be enabled if DR operations are enabled. */ + if (plat_dr_support_cpu() || plat_dr_support_memory()) + return; + /* * Determine ID of node containing current CPU */ @@ -1051,14 +1316,13 @@ * - Fallback to just optimizing for local and remote if * latencies didn't look right */ - lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats, + lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats, &lgrp_plat_probe_stats); lgrp_plat_probe_stats.probe_error_code = - lgrp_plat_latency_verify(lgrp_plat_node_memory, + lgrp_plat_latency_verify(lgrp_plat_memnode_info, &lgrp_plat_lat_stats); if (lgrp_plat_probe_stats.probe_error_code) - lgrp_plat_2level_setup(lgrp_plat_node_memory, - &lgrp_plat_lat_stats); + lgrp_plat_2level_setup(&lgrp_plat_lat_stats); } @@ -1078,8 +1342,11 @@ /* - * Update CPU to node mapping for given CPU and proximity domain (and returns - * negative numbers for errors and positive ones for success) + * Update CPU to node mapping for given CPU and proximity domain. + * Return values: + * - zero for success + * - positive numbers for warnings + * - negative numbers for errors */ static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, @@ -1119,6 +1386,13 @@ return (1); /* + * It's invalid to have more than one entry with the same + * local APIC ID in the SRAT table. + */ + if (cpu_node[i].node != UINT_MAX) + return (-2); + + /* * Fill in node and proximity domain IDs */ cpu_node[i].prox_domain = domain; @@ -1128,9 +1402,11 @@ } /* - * Return error when entry for APIC ID wasn't found in table + * It's possible that an apicid doesn't exist in the cpu_node map + * because the user limited the number of CPUs powered on at boot by + * specifying the boot_ncpus kernel option.
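That relaxed convention matters to the SRAT walk: only rc == 0 counts as a mapped processor entry, positive warnings are skipped, and negative values abort. A caller-side sketch of the pattern used by lgrp_plat_process_srat():

static int
count_srat_cpu(node_domain_map_t *node_domain, int node_cnt,
    cpu_node_map_t *cpu_node, int cpu_count, uint32_t apic_id,
    uint32_t domain, int *proc_entry_count)
{
	int rc;

	rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, cpu_node,
	    cpu_count, apic_id, domain);
	if (rc < 0)
		return (-3);		/* malformed SRAT, abort the walk */
	if (rc == 0)
		(*proc_entry_count)++;	/* a CPU entry was mapped */
	return (0);			/* rc > 0: benign skip */
}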
*/ - return (-2); + return (2); } @@ -1189,9 +1465,11 @@ */ node = start = NODE_DOMAIN_HASH(domain, node_cnt); do { - if (node_domain[node].prox_domain == domain && - node_domain[node].exists) - return (node); + if (node_domain[node].exists) { + membar_consumer(); + if (node_domain[node].prox_domain == domain) + return (node); + } node = (node + 1) % node_cnt; } while (node != start); return (-1); @@ -1219,19 +1497,26 @@ lgrp_plat_get_numa_config(void) if (lgrp_plat_apic_ncpus > 0) { int retval; + /* Reserve enough resources if CPU DR is enabled. */ + if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus) + lgrp_plat_cpu_node_nentries = max_ncpus; + else + lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; + /* * Temporarily allocate boot memory to use for CPU to node * mapping since kernel memory allocator isn't alive yet */ lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, - NULL, lgrp_plat_apic_ncpus * sizeof (cpu_node_map_t), + NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t), sizeof (int)); ASSERT(lgrp_plat_cpu_node != NULL); if (lgrp_plat_cpu_node) { - lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t)); + } else { + lgrp_plat_cpu_node_nentries = 0; } /* @@ -1240,10 +1525,10 @@ */ (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); - retval = lgrp_plat_process_srat(srat_ptr, + retval = lgrp_plat_process_srat(srat_ptr, msct_ptr, &lgrp_plat_prox_domain_min, lgrp_plat_node_domain, lgrp_plat_cpu_node, - lgrp_plat_apic_ncpus, lgrp_plat_node_memory); + lgrp_plat_apic_ncpus, lgrp_plat_memnode_info); if (retval <= 0) { lgrp_plat_srat_error = retval; lgrp_plat_node_cnt = 1; @@ -1260,15 +1545,12 @@ if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && is_opteron()) opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, - lgrp_plat_node_memory); + lgrp_plat_memnode_info); /* * Don't bother to setup system for multiple lgroups and only use one * memory node when memory is interleaved between any nodes or there is * only one NUMA node - * - * NOTE: May need to change this for Dynamic Reconfiguration (DR) - * when and if it happens for x86/x64 */ if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { lgrp_plat_node_cnt = max_mem_nodes = 1; @@ -1287,7 +1569,7 @@ /* * There should be one memnode (physical page free list(s)) for - * each node + * each node if memory DR is disabled. */ max_mem_nodes = lgrp_plat_node_cnt; @@ -1303,8 +1585,35 @@ * exists */ lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, - lgrp_plat_node_cnt, lgrp_plat_node_memory, + lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info, &lgrp_plat_lat_stats); + + /* + * Disable support of CPU/memory DR operations if multiple locality + * domains exist in the system and either of the following is true: + * 1) Failed to process the SLIT table. + * 2) Latency probing is enabled by the user.
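The membar_consumer() added to lgrp_plat_domain_to_node() above is the reader half of the publish discipline these tables share: writers fill in all fields, issue membar_producer(), then set "exists"; readers check "exists", issue membar_consumer(), then load the fields. A self-contained sketch of the pairing:

typedef struct {
	volatile int	exists;
	uint32_t	prox_domain;
} entry_t;

static void
publish(entry_t *e, uint32_t domain)
{
	e->prox_domain = domain;
	membar_producer();	/* fields visible before the flag */
	e->exists = 1;
}

static int
matches(entry_t *e, uint32_t domain)
{
	if (!e->exists)
		return (0);
	membar_consumer();	/* flag read ordered before field reads */
	return (e->prox_domain == domain);
}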
+ */ + if (lgrp_plat_node_cnt > 1 && + (plat_dr_support_cpu() || plat_dr_support_memory())) { + if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 || + !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 || + lgrp_plat_apic_ncpus <= 0) { + cmn_err(CE_CONT, + "?lgrp: failed to process ACPI SRAT/SLIT table, " + "disable support of CPU/memory DR operations."); + plat_dr_disable_cpu(); + plat_dr_disable_memory(); + } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) { + cmn_err(CE_CONT, + "?lgrp: latency probing enabled by user, " + "disable support of CPU/memory DR operations."); + plat_dr_disable_cpu(); + plat_dr_disable_memory(); + } + } + + /* Done if the SLIT table was processed successfully. */ if (lgrp_plat_slit_error == 0) return; @@ -1366,7 +1675,7 @@ int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; * latencies be same */ static void -lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, +lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) { int i; @@ -1387,7 +1696,7 @@ if (max_mem_nodes == 1) return; - ASSERT(node_memory != NULL && lat_stats != NULL && + ASSERT(memnode_info != NULL && lat_stats != NULL && probe_stats != NULL); /* @@ -1395,11 +1704,11 @@ * (ie. latency(node0, node1) == latency(node1, node0)) */ for (i = 0; i < lgrp_plat_node_cnt; i++) { - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; for (j = 0; j < lgrp_plat_node_cnt; j++) { - if (!node_memory[j].exists) + if (!memnode_info[j].exists) continue; t1 = lat_stats->latencies[i][j]; @@ -1452,7 +1761,7 @@ */ for (i = 0; i < lgrp_plat_node_cnt; i++) { for (j = 0; j < lgrp_plat_node_cnt; j++) { - if (!node_memory[j].exists) + if (!memnode_info[j].exists) continue; /* * Pick one pair of nodes (i, j) @@ -1469,7 +1778,7 @@ for (k = 0; k < lgrp_plat_node_cnt; k++) { for (l = 0; l < lgrp_plat_node_cnt; l++) { - if (!node_memory[l].exists) + if (!memnode_info[l].exists) continue; /* * Pick another pair of nodes (k, l) @@ -1545,7 +1854,7 @@ min = -1; max = 0; for (i = 0; i < lgrp_plat_node_cnt; i++) { - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; t = lat_stats->latencies[i][i]; if (t == 0) @@ -1559,7 +1868,7 @@ for (i = 0; i < lgrp_plat_node_cnt; i++) { int local; - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; local = lat_stats->latencies[i][i]; @@ -1590,7 +1899,7 @@ lat_stats->latency_max = 0; for (i = 0; i < lgrp_plat_node_cnt; i++) { for (j = 0; j < lgrp_plat_node_cnt; j++) { - if (!node_memory[j].exists) + if (!memnode_info[j].exists) continue; t = lat_stats->latencies[i][j]; if (t > lat_stats->latency_max) @@ -1616,7 +1925,7 @@ * -3 Local >= remote */ static int -lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, +lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats) { int i; @@ -1624,7 +1933,7 @@ u_longlong_t t1; u_longlong_t t2; - ASSERT(node_memory != NULL &&
lat_stats != NULL); + ASSERT(memnode_info != NULL && lat_stats != NULL); /* * Nothing to do when this is an UMA machine, lgroup topology is @@ -1639,10 +1948,10 @@ lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, * (ie. latency(node0, node1) == latency(node1, node0)) */ for (i = 0; i < lgrp_plat_node_cnt; i++) { - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; for (j = 0; j < lgrp_plat_node_cnt; j++) { - if (!node_memory[j].exists) + if (!memnode_info[j].exists) continue; t1 = lat_stats->latencies[i][j]; t2 = lat_stats->latencies[j][i]; @@ -1659,7 +1968,7 @@ lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, */ t1 = lat_stats->latencies[0][0]; for (i = 1; i < lgrp_plat_node_cnt; i++) { - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; t2 = lat_stats->latencies[i][i]; @@ -1681,7 +1990,7 @@ lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, if (t1) { for (i = 0; i < lgrp_plat_node_cnt; i++) { for (j = 0; j < lgrp_plat_node_cnt; j++) { - if (!node_memory[j].exists) + if (!memnode_info[j].exists) continue; t2 = lat_stats->latencies[i][j]; if (i == j || t2 == 0) @@ -1729,8 +2038,7 @@ lgrp_plat_main_init(void) */ if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && lgrp_plat_lat_stats.latency_max == 0) - lgrp_plat_2level_setup(lgrp_plat_node_memory, - &lgrp_plat_lat_stats); + lgrp_plat_2level_setup(&lgrp_plat_lat_stats); return; } @@ -1769,7 +2077,7 @@ lgrp_plat_main_init(void) * Skip this node and leave its probe page NULL * if it doesn't have any memory */ - mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); + mnode = i; if (!mem_node_config[mnode].exists) { lgrp_plat_probe_mem_config.probe_va[i] = NULL; continue; @@ -1815,10 +2123,11 @@ lgrp_plat_main_init(void) * This is a copy of the MAX_MEM_NODES == 1 version of the routine * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup */ -/* ARGSUSED */ static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) { + _NOTE(ARGUNUSED(lgrphand)); + struct memlist *mlist; pgcnt_t npgs = 0; extern struct memlist *phys_avail; @@ -1866,8 +2175,9 @@ lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, * domain and return node ID which is index into mapping table. */ if (!node_domain[node].exists) { - node_domain[node].exists = 1; node_domain[node].prox_domain = domain; + membar_producer(); + node_domain[node].exists = 1; return (node); } @@ -1887,18 +2197,17 @@ lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, return (-1); } - /* * Update node memory information for given proximity domain with specified * starting and ending physical address range (and return positive numbers for * success and negative ones for errors) */ static int -lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, - node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, - uint32_t domain) +lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt, + memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start, + uint64_t end, uint32_t domain, uint32_t device_id) { - int node; + int node, mnode; /* * Get node number for proximity domain @@ -1912,13 +2221,53 @@ lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, } /* + * This function is called during boot if device_id is + * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for + * memory DR operations. 
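The runtime branch below allocates from a split memnode ID space; a sketch of that layout and allocation (the policy is stated in the file's opening comment, and lgrp_plat_max_mem_node is the patch's high-water mark):

/*
 *   [0 .............. lgrp_plat_node_cnt)  boot memnodes, node == memnode
 *   [lgrp_plat_node_cnt .. max_mem_nodes)  hot-added memnodes, 1:N per lgrp
 *
 * Allocation is single-threaded by the DR framework, so a plain
 * increment of the high-water mark suffices.
 */
static int
alloc_hotadd_memnode(int memnode_cnt)
{
	if (lgrp_plat_max_mem_node >= memnode_cnt)
		return (-1);			/* table full */
	return ((int)lgrp_plat_max_mem_node++);
}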
+ */ + if (device_id != ACPI_MEMNODE_DEVID_BOOT) { + ASSERT(lgrp_plat_max_mem_node <= memnode_cnt); + + for (mnode = lgrp_plat_node_cnt; + mnode < lgrp_plat_max_mem_node; mnode++) { + if (memnode_info[mnode].exists && + memnode_info[mnode].prox_domain == domain && + memnode_info[mnode].device_id == device_id) { + if (btop(start) < memnode_info[mnode].start) + memnode_info[mnode].start = btop(start); + if (btop(end) > memnode_info[mnode].end) + memnode_info[mnode].end = btop(end); + return (1); + } + } + + if (lgrp_plat_max_mem_node >= memnode_cnt) { + return (-3); + } else { + lgrp_plat_max_mem_node++; + memnode_info[mnode].start = btop(start); + memnode_info[mnode].end = btop(end); + memnode_info[mnode].prox_domain = domain; + memnode_info[mnode].device_id = device_id; + memnode_info[mnode].lgrphand = node; + membar_producer(); + memnode_info[mnode].exists = 1; + return (0); + } + } + + /* * Create entry in table for node if it doesn't exist */ - if (!node_memory[node].exists) { - node_memory[node].exists = 1; - node_memory[node].start = btop(start); - node_memory[node].end = btop(end); - node_memory[node].prox_domain = domain; + ASSERT(node < memnode_cnt); + if (!memnode_info[node].exists) { + memnode_info[node].start = btop(start); + memnode_info[node].end = btop(end); + memnode_info[node].prox_domain = domain; + memnode_info[node].device_id = device_id; + memnode_info[node].lgrphand = node; + membar_producer(); + memnode_info[node].exists = 1; return (0); } @@ -1928,11 +2277,11 @@ lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, * There may be more than one SRAT memory entry for a domain, so we may * need to update existing start or end address for the node. */ - if (node_memory[node].prox_domain == domain) { - if (btop(start) < node_memory[node].start) - node_memory[node].start = btop(start); - if (btop(end) > node_memory[node].end) - node_memory[node].end = btop(end); + if (memnode_info[node].prox_domain == domain) { + if (btop(start) < memnode_info[node].start) + memnode_info[node].start = btop(start); + if (btop(end) > memnode_info[node].end) + memnode_info[node].end = btop(end); return (1); } return (-2); @@ -1940,16 +2289,14 @@ lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, /* - * Have to sort node by starting physical address because VM system (physical - * page free list management) assumes and expects memnodes to be sorted in - * ascending order by physical address. If not, the kernel will panic in - * potentially a number of different places. (:-( - * NOTE: This workaround will not be sufficient if/when hotplugging memory is - * supported on x86/x64. + * Have to sort nodes by starting physical address because plat_mnode_xcheck() + * assumes and expects memnodes to be sorted in ascending order by physical + * address. 
*/ static void lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, - cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory) + cpu_node_map_t *cpu_node, int cpu_count, + memnode_phys_addr_map_t *memnode_info) { boolean_t found; int i; @@ -1959,7 +2306,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, boolean_t swapped; if (!lgrp_plat_node_sort_enable || node_cnt <= 1 || - node_domain == NULL || node_memory == NULL) + node_domain == NULL || memnode_info == NULL) return; /* @@ -1970,7 +2317,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, /* * Skip entries that don't exist */ - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; /* @@ -1978,7 +2325,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, */ found = B_FALSE; for (j = i + 1; j < node_cnt; j++) { - if (node_memory[j].exists) { + if (memnode_info[j].exists) { found = B_TRUE; break; } @@ -1994,7 +2341,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, * Not sorted if starting address of current entry is bigger * than starting address of next existing entry */ - if (node_memory[i].start > node_memory[j].start) { + if (memnode_info[i].start > memnode_info[j].start) { sorted = B_FALSE; break; } @@ -2017,7 +2364,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, /* * Skip entries that don't exist */ - if (!node_memory[i].exists) + if (!memnode_info[i].exists) continue; /* @@ -2025,7 +2372,7 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, */ found = B_FALSE; for (j = i + 1; j <= n; j++) { - if (node_memory[j].exists) { + if (memnode_info[j].exists) { found = B_TRUE; break; } @@ -2037,8 +2384,8 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, if (found == B_FALSE) break; - if (node_memory[i].start > node_memory[j].start) { - node_phys_addr_map_t save_addr; + if (memnode_info[i].start > memnode_info[j].start) { + memnode_phys_addr_map_t save_addr; node_domain_map_t save_node; /* @@ -2054,12 +2401,12 @@ lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, /* * Swap node to physical memory assignments */ - bcopy(&node_memory[i], &save_addr, - sizeof (node_phys_addr_map_t)); - bcopy(&node_memory[j], &node_memory[i], - sizeof (node_phys_addr_map_t)); - bcopy(&save_addr, &node_memory[j], - sizeof (node_phys_addr_map_t)); + bcopy(&memnode_info[i], &save_addr, + sizeof (memnode_phys_addr_map_t)); + bcopy(&memnode_info[j], &memnode_info[i], + sizeof (memnode_phys_addr_map_t)); + bcopy(&save_addr, &memnode_info[j], + sizeof (memnode_phys_addr_map_t)); swapped = B_TRUE; } } @@ -2226,11 +2573,12 @@ lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) return (-1); /* - * Calculate number of entries in array and return when there's just - * one CPU since that's not very interesting for NUMA + * Calculate number of entries in array and return when the system is + * not very interesting for NUMA. It's not interesting for NUMA if + * system has only one CPU and doesn't support CPU hotplug. 
*/ n = boot_prop_len / sizeof (uint8_t); - if (n == 1) + if (n == 1 && !plat_dr_support_cpu()) return (-2); /* @@ -2243,21 +2591,31 @@ lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) * Just return number of CPU APIC IDs if CPU to node mapping table is * NULL */ - if (cpu_node == NULL) - return (n); + if (cpu_node == NULL) { + if (plat_dr_support_cpu() && n >= boot_ncpus) { + return (boot_ncpus); + } else { + return (n); + } + } /* * Fill in CPU to node ID mapping table with APIC ID for each CPU */ for (i = 0; i < n; i++) { + /* Only add boot CPUs into the map if CPU DR is enabled. */ + if (plat_dr_support_cpu() && i >= boot_ncpus) + break; cpu_node[i].exists = 1; cpu_node[i].apicid = cpu_apicid_array[i]; + cpu_node[i].prox_domain = UINT32_MAX; + cpu_node[i].node = UINT_MAX; } /* * Return number of CPUs based on number of APIC IDs */ - return (n); + return (i); } @@ -2266,11 +2624,14 @@ lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) * NUMA node is from each other */ static int -lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, - node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) +lgrp_plat_process_slit(struct slit *tp, + node_domain_map_t *node_domain, uint_t node_cnt, + memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats) { int i; int j; + int src; + int dst; int localities; hrtime_t max; hrtime_t min; @@ -2284,8 +2645,6 @@ lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, return (2); localities = tp->number; - if (localities != node_cnt) - return (3); min = lat_stats->latency_min; max = lat_stats->latency_max; @@ -2295,11 +2654,21 @@ lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, */ slit_entries = tp->entry; for (i = 0; i < localities; i++) { + src = lgrp_plat_domain_to_node(node_domain, + node_cnt, i); + if (src == -1) + continue; + for (j = 0; j < localities; j++) { uint8_t latency; + dst = lgrp_plat_domain_to_node(node_domain, + node_cnt, j); + if (dst == -1) + continue; + latency = slit_entries[(i * localities) + j]; - lat_stats->latencies[i][j] = latency; + lat_stats->latencies[src][dst] = latency; if (latency < min || min == -1) min = latency; if (latency > max) @@ -2310,7 +2679,7 @@ lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, /* * Verify that latencies/distances given in SLIT look reasonable */ - retval = lgrp_plat_latency_verify(node_memory, lat_stats); + retval = lgrp_plat_latency_verify(memnode_info, lat_stats); if (retval) { /* @@ -2334,20 +2703,110 @@ lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, /* + * Update lgrp latencies according to information returned by ACPI _SLI method. + */ +static int +lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info, + uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, + lgrp_plat_latency_stats_t *lat_stats) +{ + int i; + int src, dst; + uint8_t latency; + hrtime_t max, min; + + if (lat_stats == NULL || sli_info == NULL || + sli_cnt == 0 || domain_id >= sli_cnt) + return (-1); + + src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id); + if (src == -1) { + src = lgrp_plat_node_domain_update(node_domain, node_cnt, + domain_id); + if (src == -1) + return (-1); + } + + /* + * Don't update latency info if topology has been flattened to 2 levels. + */ + if (lgrp_plat_topo_flatten != 0) { + return (0); + } + + /* + * Latency information for proximity domain is ready. + * TODO: support adjusting latency information at runtime. 
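The checks below assume the _SLI buffer holds 2 * sli_cnt bytes: a row of latencies from this domain to every domain, then a column of latencies from every domain back. Self latency must equal ACPI_SLIT_SELF_LATENCY (10, per the SLIT convention) and remote entries must be larger and symmetric. A sketch of the per-entry predicate:

static boolean_t
sli_entry_ok(const uchar_t *sli_info, uint32_t sli_cnt, uint32_t self,
    uint32_t i)
{
	uchar_t row = sli_info[i];		/* this domain -> domain i */
	uchar_t col = sli_info[sli_cnt + i];	/* domain i -> this domain */

	if (i == self)
		return (row == ACPI_SLIT_SELF_LATENCY &&
		    col == ACPI_SLIT_SELF_LATENCY);
	return (row > ACPI_SLIT_SELF_LATENCY &&
	    col > ACPI_SLIT_SELF_LATENCY && row == col);
}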
+ */ + if (lat_stats->latencies[src][src] != 0) { + return (0); + } + + /* Validate latency information. */ + for (i = 0; i < sli_cnt; i++) { + if (i == domain_id) { + if (sli_info[i] != ACPI_SLIT_SELF_LATENCY || + sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) { + return (-1); + } + } else { + if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY || + sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY || + sli_info[i] != sli_info[sli_cnt + i]) { + return (-1); + } + } + } + + min = lat_stats->latency_min; + max = lat_stats->latency_max; + for (i = 0; i < sli_cnt; i++) { + dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i); + if (dst == -1) + continue; + + ASSERT(sli_info[i] == sli_info[sli_cnt + i]); + + /* Update row in latencies matrix. */ + latency = sli_info[i]; + lat_stats->latencies[src][dst] = latency; + if (latency < min || min == -1) + min = latency; + if (latency > max) + max = latency; + + /* Update column in latencies matrix. */ + latency = sli_info[sli_cnt + i]; + lat_stats->latencies[dst][src] = latency; + if (latency < min || min == -1) + min = latency; + if (latency > max) + max = latency; + } + lat_stats->latency_min = min; + lat_stats->latency_max = max; + + return (0); +} + + +/* * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs * and memory are local to each other in the same NUMA node and return number * of nodes */ static int -lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, - node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count, - node_phys_addr_map_t *node_memory) +lgrp_plat_process_srat(struct srat *tp, struct msct *mp, + uint32_t *prox_domain_min, node_domain_map_t *node_domain, + cpu_node_map_t *cpu_node, int cpu_count, + memnode_phys_addr_map_t *memnode_info) { struct srat_item *srat_end; int i; struct srat_item *item; int node_cnt; int proc_entry_count; + int rc; /* * Nothing to do when no SRAT or disabled @@ -2356,11 +2815,21 @@ return (-1); /* - * Determine number of nodes by counting number of proximity domains in - * SRAT and return if number of nodes is 1 or less since don't need to - * read SRAT then + * Try to get domain information from the MSCT. + * Per ACPI 4.0, OSPM will use information provided by the MSCT only + * when the System Resource Affinity Table (SRAT) exists. + */ + node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min); + if (node_cnt <= 0) { + /* + * Determine the number of nodes by counting the number of + * proximity domains in the SRAT. + */ + node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min); + } + /* + * Return if the number of nodes is 1 or less, since the SRAT doesn't + * need to be read then.
*/ - node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min); if (node_cnt == 1) return (1); else if (node_cnt <= 0) @@ -2397,16 +2866,17 @@ lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, } apic_id = item->i.p.apic_id; - if (lgrp_plat_cpu_node_update(node_domain, node_cnt, - cpu_node, cpu_count, apic_id, domain) < 0) + rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, + cpu_node, cpu_count, apic_id, domain); + if (rc < 0) return (-3); - - proc_entry_count++; + else if (rc == 0) + proc_entry_count++; break; case SRAT_MEMORY: /* memory entry */ if (!(item->i.m.flags & SRAT_ENABLED) || - node_memory == NULL) + memnode_info == NULL) break; /* @@ -2418,10 +2888,52 @@ lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, length = item->i.m.len; end = start + length - 1; - if (lgrp_plat_node_memory_update(node_domain, node_cnt, - node_memory, start, end, domain) < 0) + /* + * According to ACPI 4.0, both ENABLE and HOTPLUG flags + * may be set for memory address range entries in SRAT + * table which are reserved for memory hot plug. + * We intersect memory address ranges in SRAT table + * with memory ranges in physinstalled to filter out + * memory address ranges reserved for hot plug. + */ + if (item->i.m.flags & SRAT_HOT_PLUG) { + uint64_t rstart = UINT64_MAX; + uint64_t rend = 0; + struct memlist *ml; + extern struct bootops *bootops; + + memlist_read_lock(); + for (ml = bootops->boot_mem->physinstalled; + ml; ml = ml->ml_next) { + uint64_t tstart = ml->ml_address; + uint64_t tend; + + tend = ml->ml_address + ml->ml_size; + if (tstart > end || tend < start) + continue; + if (start > tstart) + tstart = start; + if (rstart > tstart) + rstart = tstart; + if (end < tend) + tend = end; + if (rend < tend) + rend = tend; + } + memlist_read_unlock(); + start = rstart; + end = rend; + /* Skip this entry if no memory installed. */ + if (start > end) + break; + } + + if (lgrp_plat_memnode_info_update(node_domain, + node_cnt, memnode_info, node_cnt, + start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0) return (-4); break; + case SRAT_X2APIC: /* x2apic CPU entry */ if (!(item->i.xp.flags & SRAT_ENABLED) || cpu_node == NULL) @@ -2434,11 +2946,12 @@ lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, domain = item->i.xp.domain; apic_id = item->i.xp.x2apic_id; - if (lgrp_plat_cpu_node_update(node_domain, node_cnt, - cpu_node, cpu_count, apic_id, domain) < 0) + rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, + cpu_node, cpu_count, apic_id, domain); + if (rc < 0) return (-3); - - proc_entry_count++; + else if (rc == 0) + proc_entry_count++; break; default: @@ -2460,7 +2973,7 @@ lgrp_plat_process_srat(struct srat *tp, uint32_t *prox_domain_min, * physical address */ lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count, - node_memory); + memnode_info); return (node_cnt); } @@ -2665,27 +3178,70 @@ lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min) /* + * Parse domain information in ACPI Maximum System Capability Table (MSCT). + * MSCT table has been verified in function process_msct() in fakebop.c. 
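The SRAT_HOT_PLUG filtering above is an interval intersection against installed memory; the same computation in isolation, using a half-open [*start, *end) convention rather than the patch's inclusive end (a sketch):

static boolean_t
clip_to_installed(const struct memlist *ml, uint64_t *start, uint64_t *end)
{
	uint64_t rstart = UINT64_MAX, rend = 0;

	for (; ml != NULL; ml = ml->ml_next) {
		uint64_t tstart = ml->ml_address;
		uint64_t tend = ml->ml_address + ml->ml_size;

		if (tstart >= *end || tend <= *start)
			continue;		/* no overlap */
		if (tstart < *start)
			tstart = *start;
		if (tend > *end)
			tend = *end;
		if (tstart < rstart)
			rstart = tstart;
		if (tend > rend)
			rend = tend;
	}
	if (rstart >= rend)
		return (B_FALSE);	/* range reserved purely for hot plug */
	*start = rstart;
	*end = rend;
	return (B_TRUE);
}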
+ */
+static int
+lgrp_plat_msct_domains(struct msct *tp, uint32_t *prox_domain_min)
+{
+	int last_seen = 0;
+	uint32_t proxmin = UINT32_MAX;
+	struct msct_proximity_domain *item, *end;
+
+	if (tp == NULL || lgrp_plat_msct_enable == 0)
+		return (-1);
+
+	if (tp->maximum_proximity_domains >= MAX_NODES) {
+		cmn_err(CE_CONT,
+		    "?lgrp: too many proximity domains (%d), max %d "
+		    "supported, disabling support for CPU/memory DR "
+		    "operations.",
+		    tp->maximum_proximity_domains + 1, MAX_NODES);
+		plat_dr_disable_cpu();
+		plat_dr_disable_memory();
+		return (-1);
+	}
+
+	if (prox_domain_min != NULL) {
+		end = (void *)(tp->hdr.len + (uintptr_t)tp);
+		for (item = (void *)((uintptr_t)tp +
+		    tp->proximity_domain_offset); item < end;
+		    item = (void *)(item->length + (uintptr_t)item)) {
+			if (item->domain_min < proxmin) {
+				proxmin = item->domain_min;
+			}
+
+			last_seen = item->domain_max - item->domain_min + 1;
+			/*
+			 * Break out if all proximity domains have been
+			 * processed. Some BIOSes may have unused items
+			 * at the end of the MSCT table.
+			 */
+			if (last_seen > tp->maximum_proximity_domains) {
+				break;
+			}
+		}
+		*prox_domain_min = proxmin;
+	}
+
+	return (tp->maximum_proximity_domains + 1);
+}
+
+
 /*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
-lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
-    lgrp_plat_latency_stats_t *lat_stats)
+lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
{
-	int i;
+	int i, j;

-	ASSERT(node_memory != NULL && lat_stats != NULL);
+	ASSERT(lat_stats != NULL);

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
-		int j;
-
-		if (!node_memory[i].exists)
-			continue;
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
-			if (!node_memory[j].exists)
-				continue;
			if (i == j)
				lat_stats->latencies[i][j] = 2;
			else
@@ -2694,7 +3250,9 @@ lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
	}
	lat_stats->latency_min = 2;
	lat_stats->latency_max = 3;
+	/* TODO: check it.
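+	 * A hedged reading of the call below, inferred from the lgrp
+	 * framework's naming and not verified here: the LGRP_CONFIG_FLATTEN
+	 * event asks the framework to collapse the lgroup topology to at
+	 * most two levels (root plus leaves), matching the two-level
+	 * latency table just built, and lgrp_plat_topo_flatten records
+	 * that the flattening happened.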
*/ lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); + lgrp_plat_topo_flatten = 1; } @@ -2884,7 +3442,7 @@ is_opteron(void) */ static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, - node_phys_addr_map_t *node_memory) + memnode_phys_addr_map_t *memnode_info) { uint_t bus; uint_t dev; @@ -2999,10 +3557,10 @@ opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { /* * Mark node memory as non-existent and set start and - * end addresses to be same in node_memory[] + * end addresses to be same in memnode_info[] */ - node_memory[node].exists = 0; - node_memory[node].start = node_memory[node].end = + memnode_info[node].exists = 0; + memnode_info[node].start = memnode_info[node].end = (pfn_t)-1; continue; } @@ -3011,11 +3569,11 @@ opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, * Mark node memory as existing and remember physical address * range of each node for use later */ - node_memory[node].exists = 1; + memnode_info[node].exists = 1; - node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); + memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); - node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | + memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | OPT_DRAMADDR_LO_MASK_OFF); } diff --git a/usr/src/uts/i86pc/os/machdep.c b/usr/src/uts/i86pc/os/machdep.c index c9f97fc046..e9d08a9fa7 100644 --- a/usr/src/uts/i86pc/os/machdep.c +++ b/usr/src/uts/i86pc/os/machdep.c @@ -23,6 +23,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/types.h> #include <sys/t_lock.h> @@ -147,6 +151,16 @@ int force_shutdown_method = AD_UNKNOWN; char panicbuf[PANICBUFSIZE]; /* + * Flags to control Dynamic Reconfiguration features. + */ +uint64_t plat_dr_options; + +/* + * Maximum physical address for memory DR operations. + */ +uint64_t plat_dr_physmax; + +/* * maxphys - used during physio * klustsize - used for klustering by swapfs and specfs */ @@ -1390,3 +1404,33 @@ lbolt_softint_post(void) { (*setsoftint)(CBE_LOCK_PIL, lbolt_softint_hdl.ih_pending); } + +boolean_t +plat_dr_check_capability(uint64_t features) +{ + return ((plat_dr_options & features) == features); +} + +boolean_t +plat_dr_support_cpu(void) +{ + return (plat_dr_options & PLAT_DR_FEATURE_CPU); +} + +boolean_t +plat_dr_support_memory(void) +{ + return (plat_dr_options & PLAT_DR_FEATURE_MEMORY); +} + +void +plat_dr_enable_capability(uint64_t features) +{ + atomic_or_64(&plat_dr_options, features); +} + +void +plat_dr_disable_capability(uint64_t features) +{ + atomic_and_64(&plat_dr_options, ~features); +} diff --git a/usr/src/uts/i86pc/os/mem_config_arch.c b/usr/src/uts/i86pc/os/mem_config_arch.c new file mode 100644 index 0000000000..191e8cb641 --- /dev/null +++ b/usr/src/uts/i86pc/os/mem_config_arch.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1997-1998 by Sun Microsystems, Inc. + * All rights reserved. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <vm/page.h> +#include <sys/mem_config.h> + +/*ARGSUSED*/ +int +arch_kphysm_del_span_ok(pfn_t base, pgcnt_t npgs) +{ + ASSERT(npgs != 0); + return (0); +} + +/*ARGSUSED*/ +int +arch_kphysm_relocate(pfn_t base, pgcnt_t npgs) +{ + ASSERT(npgs != 0); + return (ENOTSUP); +} + +int +arch_kphysm_del_supported(void) +{ + return (0); +} diff --git a/usr/src/uts/i86pc/os/memnode.c b/usr/src/uts/i86pc/os/memnode.c index 908b2e8cd2..35185bcd84 100644 --- a/usr/src/uts/i86pc/os/memnode.c +++ b/usr/src/uts/i86pc/os/memnode.c @@ -84,7 +84,7 @@ mem_node_add_slice(pfn_t start, pfn_t end) } mnode = PFN_2_MEM_NODE(start); - ASSERT(mnode < max_mem_nodes); + ASSERT(mnode >= 0 && mnode < max_mem_nodes); if (cas32((uint32_t *)&mem_node_config[mnode].exists, 0, 1)) { /* @@ -130,7 +130,7 @@ mem_node_del_slice(pfn_t start, pfn_t end) } mnode = PFN_2_MEM_NODE(start); - ASSERT(mnode < max_mem_nodes); + ASSERT(mnode >= 0 && mnode < max_mem_nodes); ASSERT(mem_node_config[mnode].exists == 1); delta_pgcnt = end - start; diff --git a/usr/src/uts/i86pc/os/microcode.c b/usr/src/uts/i86pc/os/microcode.c index 6f32a86c71..90d9585640 100644 --- a/usr/src/uts/i86pc/os/microcode.c +++ b/usr/src/uts/i86pc/os/microcode.c @@ -141,6 +141,7 @@ void ucode_alloc_space(cpu_t *cp) { ASSERT(cp->cpu_id != 0); + ASSERT(cp->cpu_m.mcpu_ucode_info == NULL); cp->cpu_m.mcpu_ucode_info = kmem_zalloc(sizeof (*cp->cpu_m.mcpu_ucode_info), KM_SLEEP); } @@ -148,9 +149,11 @@ ucode_alloc_space(cpu_t *cp) void ucode_free_space(cpu_t *cp) { - ASSERT(cp->cpu_id != 0); + ASSERT(cp->cpu_m.mcpu_ucode_info != NULL); + ASSERT(cp->cpu_m.mcpu_ucode_info != &cpu_ucode_info0); kmem_free(cp->cpu_m.mcpu_ucode_info, sizeof (*cp->cpu_m.mcpu_ucode_info)); + cp->cpu_m.mcpu_ucode_info = NULL; } /* @@ -1126,7 +1129,10 @@ ucode_check(cpu_t *cp) uint32_t new_rev = 0; ASSERT(cp); - if (cp->cpu_id == 0) + /* + * Space statically allocated for BSP, ensure pointer is set + */ + if (cp->cpu_id == 0 && cp->cpu_m.mcpu_ucode_info == NULL) cp->cpu_m.mcpu_ucode_info = &cpu_ucode_info0; uinfop = cp->cpu_m.mcpu_ucode_info; diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 3f8752638f..bf57dd3c50 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -22,6 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/types.h> #include <sys/sysmacros.h> @@ -107,6 +111,7 @@ mlsetup(struct regs *rp) extern char t0stack[]; extern int post_fastreboot; extern int console; + extern uint64_t plat_dr_options; ASSERT_STACK_ALIGNED(); @@ -354,15 +359,70 @@ mlsetup(struct regs *rp) prom_init("kernel", (void *)NULL); - if (bootprop_getval("boot-ncpus", &prop_value) != 0) + /* User-set option overrides firmware value. 
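+	 * For example, assuming PLAT_DR_OPTIONS_NAME names a boot property
+	 * called "plat-dr-options" (an illustrative name, not confirmed
+	 * here), CPU and memory DR could be requested from the boot loader
+	 * with an option such as
+	 *	-B plat-dr-options=0x3
+	 * where the bits correspond to the PLAT_DR_FEATURE_* flags masked
+	 * below; see eeprom(1M) for setting boot properties persistently.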
*/
+	if (bootprop_getval(PLAT_DR_OPTIONS_NAME, &prop_value) == 0) {
+		plat_dr_options = (uint64_t)prop_value;
+	}
+#if defined(__xpv)
+	/* No support for DR operations on xpv */
+	plat_dr_options = 0;
+#else	/* __xpv */
+	/* Flag PLAT_DR_FEATURE_ENABLED should only be set by DR driver. */
+	plat_dr_options &= ~PLAT_DR_FEATURE_ENABLED;
+#ifndef	__amd64
+	/* Only enable CPU/memory DR on 64-bit kernels. */
+	plat_dr_options &= ~PLAT_DR_FEATURE_MEMORY;
+	plat_dr_options &= ~PLAT_DR_FEATURE_CPU;
+#endif	/* __amd64 */
+#endif	/* __xpv */
+
+	/*
+	 * Get value of "plat_dr_physmax" boot option.
+	 * It overrides values calculated from MSCT or SRAT table.
+	 */
+	if (bootprop_getval(PLAT_DR_PHYSMAX_NAME, &prop_value) == 0) {
+		plat_dr_physmax = ((uint64_t)prop_value) >> PAGESHIFT;
+	}
+
+	/* Get value of boot_ncpus. */
+	if (bootprop_getval(BOOT_NCPUS_NAME, &prop_value) != 0) {
		boot_ncpus = NCPU;
-	else {
+	} else {
		boot_ncpus = (int)prop_value;
		if (boot_ncpus <= 0 || boot_ncpus > NCPU)
			boot_ncpus = NCPU;
	}

-	max_ncpus = boot_max_ncpus = boot_ncpus;
+	/*
+	 * Set max_ncpus and boot_max_ncpus to boot_ncpus if platform doesn't
+	 * support CPU DR operations.
+	 */
+	if (plat_dr_support_cpu() == 0) {
+		max_ncpus = boot_max_ncpus = boot_ncpus;
+	} else {
+		if (bootprop_getval(PLAT_MAX_NCPUS_NAME, &prop_value) != 0) {
+			max_ncpus = NCPU;
+		} else {
+			max_ncpus = (int)prop_value;
+			if (max_ncpus <= 0 || max_ncpus > NCPU) {
+				max_ncpus = NCPU;
+			}
+			if (boot_ncpus > max_ncpus) {
+				boot_ncpus = max_ncpus;
+			}
+		}
+
+		if (bootprop_getval(BOOT_MAX_NCPUS_NAME, &prop_value) != 0) {
+			boot_max_ncpus = boot_ncpus;
+		} else {
+			boot_max_ncpus = (int)prop_value;
+			if (boot_max_ncpus <= 0 || boot_max_ncpus > NCPU) {
+				boot_max_ncpus = boot_ncpus;
+			} else if (boot_max_ncpus > max_ncpus) {
+				boot_max_ncpus = max_ncpus;
+			}
+		}
+	}

	/*
	 * Initialize the lgrp framework
diff --git a/usr/src/uts/i86pc/os/mp_implfuncs.c b/usr/src/uts/i86pc/os/mp_implfuncs.c
index b5742b2c69..3bded8f124 100644
--- a/usr/src/uts/i86pc/os/mp_implfuncs.c
+++ b/usr/src/uts/i86pc/os/mp_implfuncs.c
@@ -22,12 +22,12 @@
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
+#define	PSMI_1_7

#include <sys/vmem.h>
#include <vm/hat.h>
#include <sys/modctl.h>
#include <vm/seg_kmem.h>
-#define	PSMI_1_6
#include <sys/psm.h>
#include <sys/psm_modctl.h>
#include <sys/smp_impldefs.h>
diff --git a/usr/src/uts/i86pc/os/mp_machdep.c b/usr/src/uts/i86pc/os/mp_machdep.c
index 80e371850b..dc4023007c 100644
--- a/usr/src/uts/i86pc/os/mp_machdep.c
+++ b/usr/src/uts/i86pc/os/mp_machdep.c
@@ -24,11 +24,11 @@
 * Use is subject to license terms.
 */
/*
- * Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
*/ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/smp_impldefs.h> #include <sys/psm.h> #include <sys/psm_modctl.h> @@ -157,6 +157,7 @@ void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr; void (*hrtime_tick)(void) = return_instr; int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **) = mach_cpu_create_devinfo; +int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **) = NULL; /* * True if the generic TSC code is our source of hrtime, rather than whatever @@ -864,9 +865,17 @@ mach_get_platform(int owner) total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) / sizeof (void (*)(void)); else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4) - /* no psm_preshutdown function */ + /* no psm_intr_ops function */ total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) / sizeof (void (*)(void)); + else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_5) + /* no psm_state function */ + total_ops = OFFSETOF(struct psm_ops, psm_state) / + sizeof (void (*)(void)); + else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_6) + /* no psm_cpu_ops function */ + total_ops = OFFSETOF(struct psm_ops, psm_cpu_ops) / + sizeof (void (*)(void)); else total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void)); @@ -1042,7 +1051,11 @@ mach_smpinit(void) cpu_id = -1; cpu_id = (*pops->psm_get_next_processorid)(cpu_id); - for (cnt = 0; cpu_id != -1; cnt++) { + /* + * Only add boot_ncpus CPUs to mp_cpus. Other CPUs will be handled + * by CPU DR driver at runtime. + */ + for (cnt = 0; cpu_id != -1 && cnt < boot_ncpus; cnt++) { CPUSET_ADD(cpumask, cpu_id); cpu_id = (*pops->psm_get_next_processorid)(cpu_id); } @@ -1092,7 +1105,7 @@ mach_smpinit(void) psm_enable_intr = pops->psm_enable_intr; /* check for multiple CPUs */ - if (cnt < 2) + if (cnt < 2 && plat_dr_support_cpu() == B_FALSE) return; /* check for MP platforms */ @@ -1480,6 +1493,63 @@ mach_cpuid_start(processorid_t id, void *ctx) return ((*pops->psm_cpu_start)(id, ctx)); } +int +mach_cpu_stop(cpu_t *cp, void *ctx) +{ + struct psm_ops *pops = mach_set[0]; + psm_cpu_request_t request; + + if (pops->psm_cpu_ops == NULL) { + return (ENOTSUP); + } + + ASSERT(cp->cpu_id != -1); + request.pcr_cmd = PSM_CPU_STOP; + request.req.cpu_stop.cpuid = cp->cpu_id; + request.req.cpu_stop.ctx = ctx; + + return ((*pops->psm_cpu_ops)(&request)); +} + +int +mach_cpu_add(mach_cpu_add_arg_t *argp, processorid_t *cpuidp) +{ + int rc; + struct psm_ops *pops = mach_set[0]; + psm_cpu_request_t request; + + if (pops->psm_cpu_ops == NULL) { + return (ENOTSUP); + } + + request.pcr_cmd = PSM_CPU_ADD; + request.req.cpu_add.argp = argp; + request.req.cpu_add.cpuid = -1; + rc = (*pops->psm_cpu_ops)(&request); + if (rc == 0) { + ASSERT(request.req.cpu_add.cpuid != -1); + *cpuidp = request.req.cpu_add.cpuid; + } + + return (rc); +} + +int +mach_cpu_remove(processorid_t cpuid) +{ + struct psm_ops *pops = mach_set[0]; + psm_cpu_request_t request; + + if (pops->psm_cpu_ops == NULL) { + return (ENOTSUP); + } + + request.pcr_cmd = PSM_CPU_REMOVE; + request.req.cpu_remove.cpuid = cpuid; + + return ((*pops->psm_cpu_ops)(&request)); +} + /* * Default handler to create device node for CPU. * One reference count will be held on created device node. 
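 *
 * A caller-side sketch of the psm_cpu_ops plumbing added above (hedged:
 * mach_cpu_add_arg_t's fields are PSM-defined and omitted here, and the
 * DR driver is the intended caller):
 *
 *	processorid_t cpuid;
 *
 *	if (mach_cpu_add(argp, &cpuid) == 0) {
 *		mutex_enter(&cpu_lock);
 *		(void) cpu_configure(cpuid);
 *		mutex_exit(&cpu_lock);
 *	}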
@@ -1569,6 +1639,24 @@ mach_cpu_create_device_node(struct cpu *cp, dev_info_t **dipp)
	return (rv);
}

+/*
+ * The dipp contains one of the following values on return:
+ * - NULL if no device node found
+ * - pointer to device node if found
+ */
+int
+mach_cpu_get_device_node(struct cpu *cp, dev_info_t **dipp)
+{
+	*dipp = NULL;
+	if (psm_cpu_get_devinfo != NULL) {
+		if (psm_cpu_get_devinfo(cp, dipp) == PSM_SUCCESS) {
+			return (PSM_SUCCESS);
+		}
+	}
+
+	return (PSM_FAILURE);
+}
+
 /*ARGSUSED*/
static int
mach_translate_irq(dev_info_t *dip, int irqno)
diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c
index 6fc571b445..bc21812187 100644
--- a/usr/src/uts/i86pc/os/mp_pc.c
+++ b/usr/src/uts/i86pc/os/mp_pc.c
@@ -22,6 +22,10 @@
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ * All rights reserved.
+ */

/*
 * Welcome to the world of the "real mode platter".
@@ -31,6 +35,7 @@
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
+#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
@@ -41,10 +46,21 @@
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
+#include <sys/cpu_event.h>
+#include <sys/sunndi.h>
+#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
+#include <vm/as.h>
+
+extern cpuset_t cpu_ready_set;

-extern void real_mode_start(void);
-extern void real_mode_end(void);
+extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
+extern void real_mode_start_cpu(void);
+extern void real_mode_start_cpu_end(void);
+extern void real_mode_stop_cpu_stage1(void);
+extern void real_mode_stop_cpu_stage1_end(void);
+extern void real_mode_stop_cpu_stage2(void);
+extern void real_mode_stop_cpu_stage2_end(void);
extern void *(*cpu_pause_func)(void *);

void rmp_gdt_init(rm_platter_t *);
@@ -61,22 +77,36 @@ int
mach_cpucontext_init(void)
{
	ushort_t *vec;
+	ulong_t addr;
+	struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

	if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
	    sizeof (vec), PROT_READ | PROT_WRITE)))
		return (-1);
+
	/*
	 * setup secondary cpu bios boot up vector
+	 * Write page offset to 0x467 and page frame number to 0x469.
	 */
-	*vec = (ushort_t)((caddr_t)
-	    ((struct rm_platter *)rm_platter_va)->rm_code - rm_platter_va
-	    + ((ulong_t)rm_platter_va & 0xf));
-	vec[1] = (ushort_t)(rm_platter_pa >> 4);
+	addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
+	vec[0] = (ushort_t)(addr & PAGEOFFSET);
+	vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
	warm_reset_vector = vec;

-	bcopy((caddr_t)real_mode_start,
-	    (caddr_t)((rm_platter_t *)rm_platter_va)->rm_code,
-	    (size_t)real_mode_end - (size_t)real_mode_start);
+	/* Map real mode platter into kas so kernel can access it. */
+	hat_devload(kas.a_hat,
+	    (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
+	    btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
+	    HAT_LOAD_NOCONSIST);
+
+	/* Copy CPU startup code to rm_platter if the system is still booting.
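+	 * When CPU DR is enabled, the copy is instead deferred to
+	 * mach_cpucontext_xalloc(), which re-copies the startup code on
+	 * every hot-add operation because rm_code is shared with the CPU
+	 * stop path and may have been overwritten since boot.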
*/
+	if (!plat_dr_enabled()) {
+		ASSERT((size_t)real_mode_start_cpu_end -
+		    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
+		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
+		    (size_t)real_mode_start_cpu_end -
+		    (size_t)real_mode_start_cpu);
+	}

	return (0);
}
@@ -95,12 +125,53 @@ mach_cpucontext_fini(void)
extern void *long_mode_64(void);
#endif	/* __amd64 */

-void *
-mach_cpucontext_alloc(struct cpu *cp)
+/*ARGSUSED*/
+void
+rmp_gdt_init(rm_platter_t *rm)
+{
+
+#if defined(__amd64)
+	/* Use the kas address space for the CPU startup thread. */
+	if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
+		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
+		    "located above 4G in physical memory (@ 0x%lx)",
+		    MAKECR3(kas.a_hat->hat_htable->ht_pfn));
+
+	/*
+	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
+	 * by code in real_mode_start_cpu():
+	 *
+	 * GDT[0]:  NULL selector
+	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
+	 *
+	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
+	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
+	 * a course of action as any other, though it may cause the entire
+	 * platform to reset in some cases...
+	 */
+	rm->rm_temp_gdt[0] = 0;
+	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
+
+	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
+	rm->rm_temp_gdt_base = rm_platter_pa +
+	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
+	rm->rm_temp_idt_lim = 0;
+	rm->rm_temp_idt_base = 0;
+
+	/*
+	 * Since the CPU needs to jump to protected mode using an identity
+	 * mapped address, we need to calculate it here.
+	 */
+	rm->rm_longmode64_addr = rm_platter_pa +
+	    ((uint32_t)long_mode_64 - (uint32_t)real_mode_start_cpu);
+#endif	/* __amd64 */
+}
+
+static void *
+mach_cpucontext_alloc_tables(struct cpu *cp)
+{
-	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
-	struct cpu_tables *ct;
	struct tss *ntss;
+	struct cpu_tables *ct;

	/*
	 * Allocate space for stack, tss, gdt and idt. We round the size
@@ -109,7 +180,8 @@ mach_cpucontext_alloc(struct cpu *cp)
	 */
	ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
	if ((uintptr_t)ct & PAGEOFFSET)
-		panic("mp_startup_init: cpu%d misaligned tables", cp->cpu_id);
+		panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
+		    cp->cpu_id);

	ntss = cp->cpu_tss = &ct->ct_tss;

@@ -149,93 +221,174 @@ mach_cpucontext_alloc(struct cpu *cp)
	set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
	    sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

+	return (ct);
+}
+
+void *
+mach_cpucontext_xalloc(struct cpu *cp, int optype)
+{
+	size_t len;
+	struct cpu_tables *ct;
+	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
+	static int cpu_halt_code_ready;
+
+	if (optype == MACH_CPUCONTEXT_OP_STOP) {
+		ASSERT(plat_dr_enabled());
+
+		/*
+		 * The WARM_RESET_VECTOR has a limitation that the physical
+		 * address written to it must be page-aligned. To work around
+		 * this limitation, the CPU stop code has been split into
+		 * two stages.
+		 * The stage 2 code, which implements the real logic to halt
+		 * CPUs, is copied to the rm_cpu_halt_code field in the real
+		 * mode platter. The stage 1 code, which simply jumps to the
+		 * stage 2 code in the rm_cpu_halt_code field, is copied to
+		 * rm_code field in the real mode platter and it may be
+		 * overwritten after the CPU has been stopped.
+		 */
+		if (!cpu_halt_code_ready) {
+			/*
+			 * The rm_cpu_halt_code field in the real mode platter
+			 * is used by the CPU stop code only.
So only copy the + * CPU stop stage 2 code into the rm_cpu_halt_code + * field on the first call. + */ + len = (size_t)real_mode_stop_cpu_stage2_end - + (size_t)real_mode_stop_cpu_stage2; + ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE); + bcopy((caddr_t)real_mode_stop_cpu_stage2, + (caddr_t)rm->rm_cpu_halt_code, len); + cpu_halt_code_ready = 1; + } + + /* + * The rm_code field in the real mode platter is shared by + * the CPU start, CPU stop, CPR and fast reboot code. So copy + * the CPU stop stage 1 code into the rm_code field every time. + */ + len = (size_t)real_mode_stop_cpu_stage1_end - + (size_t)real_mode_stop_cpu_stage1; + ASSERT(len <= RM_PLATTER_CODE_SIZE); + bcopy((caddr_t)real_mode_stop_cpu_stage1, + (caddr_t)rm->rm_code, len); + rm->rm_cpu_halted = 0; + + return (cp->cpu_m.mcpu_mach_ctx_ptr); + } else if (optype != MACH_CPUCONTEXT_OP_START) { + return (NULL); + } + + /* + * Only need to allocate tables when starting CPU. + * Tables allocated when starting CPU will be reused when stopping CPU. + */ + ct = mach_cpucontext_alloc_tables(cp); + if (ct == NULL) { + return (NULL); + } + + /* Copy CPU startup code to rm_platter for CPU hot-add operations. */ + if (plat_dr_enabled()) { + bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code, + (size_t)real_mode_start_cpu_end - + (size_t)real_mode_start_cpu); + } + /* * Now copy all that we've set up onto the real mode platter * for the real mode code to digest as part of starting the cpu. */ - rm->rm_idt_base = cp->cpu_idt; rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1; rm->rm_gdt_base = cp->cpu_gdt; rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1; - rm->rm_pdbr = getcr3(); + /* + * CPU needs to access kernel address space after powering on. + * When hot-adding CPU at runtime, directly use top level page table + * of kas other than the return value of getcr3(). getcr3() returns + * current process's top level page table, which may be different from + * the one of kas. + */ + rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn); rm->rm_cpu = cp->cpu_id; rm->rm_x86feature = x86_feature; + + /* + * For hot-adding CPU at runtime, Machine Check and Performance Counter + * should be disabled. They will be enabled on demand after CPU powers + * on successfully + */ rm->rm_cr4 = getcr4(); + rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE); rmp_gdt_init(rm); return (ct); } -/*ARGSUSED*/ void -rmp_gdt_init(rm_platter_t *rm) +mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype) { + struct cpu_tables *ct = arg; -#if defined(__amd64) - - if (getcr3() > 0xffffffffUL) - panic("Cannot initialize CPUs; kernel's 64-bit page tables\n" - "located above 4G in physical memory (@ 0x%lx)", getcr3()); - - /* - * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY - * by code in real_mode_start(): - * - * GDT[0]: NULL selector - * GDT[1]: 64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1 - * - * Clear the IDT as interrupts will be off and a limit of 0 will cause - * the CPU to triple fault and reset on an NMI, seemingly as reasonable - * a course of action as any other, though it may cause the entire - * platform to reset in some cases... 
-	 */
-	rm->rm_temp_gdt[0] = 0;
-	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
-
-	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
-	rm->rm_temp_gdt_base = rm_platter_pa +
-	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
-	rm->rm_temp_idt_lim = 0;
-	rm->rm_temp_idt_base = 0;
+	ASSERT(&ct->ct_tss == cp->cpu_tss);
+	if (optype == MACH_CPUCONTEXT_OP_START) {
+		switch (err) {
+		case 0:
+			/*
+			 * Save pointer for reuse when stopping CPU.
+			 */
+			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
+			break;
+		case ETIMEDOUT:
+			/*
+			 * The processor was poked, but failed to start before
+			 * we gave up waiting for it. In case it starts later,
+			 * don't free anything.
+			 */
+			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
+			break;
+		default:
+			/*
+			 * Some other, passive, error occurred.
+			 */
+			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
+			cp->cpu_tss = NULL;
+			break;
+		}
+	} else if (optype == MACH_CPUCONTEXT_OP_STOP) {
+		switch (err) {
+		case 0:
+			/*
+			 * Free resources allocated when starting CPU.
+			 */
+			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
+			cp->cpu_tss = NULL;
+			cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
+			break;
+		default:
+			/*
+			 * Don't touch table pointer in case of failure.
+			 */
+			break;
+		}
+	} else {
+		ASSERT(0);
+	}
+}

-	/*
-	 * Since the CPU needs to jump to protected mode using an identity
-	 * mapped address, we need to calculate it here.
-	 */
-	rm->rm_longmode64_addr = rm_platter_pa +
-	    ((uint32_t)long_mode_64 - (uint32_t)real_mode_start);
-#endif	/* __amd64 */
+void *
+mach_cpucontext_alloc(struct cpu *cp)
+{
+	return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

-/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
-	struct cpu_tables *ct = arg;
-
-	ASSERT(&ct->ct_tss == cp->cpu_tss);
-
-	switch (err) {
-	case 0:
-		break;
-	case ETIMEDOUT:
-		/*
-		 * The processor was poked, but failed to start before
-		 * we gave up waiting for it. In case it starts later,
-		 * don't free anything.
-		 */
-		break;
-	default:
-		/*
-		 * Some other, passive, error occurred.
-		 */
-		kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
-		cp->cpu_tss = NULL;
-		break;
-	}
+	mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}

/*
@@ -276,25 +429,221 @@ mach_cpu_pause(volatile char *safe)
}

/*
- * Power on CPU.
+ * Power on the target CPU.
 */
-/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
-	ASSERT(MUTEX_HELD(&cpu_lock));
-	return (ENOTSUP);		/* not supported */
+	int error;
+	cpuset_t tempset;
+	processorid_t cpuid;
+
+	ASSERT(cp != NULL);
+	cpuid = cp->cpu_id;
+	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
+		return (ENOTSUP);
+	} else if (cpuid < 0 || cpuid >= max_ncpus) {
+		return (EINVAL);
+	}
+
+	/*
+	 * The current x86 implementation of mp_cpu_configure() and
+	 * mp_cpu_poweron() has a limitation: mp_cpu_poweron() may only be
+	 * called once after calling mp_cpu_configure() for a specific CPU,
+	 * because mp_cpu_poweron() destroys data structures created
+	 * by mp_cpu_configure(). So reject the request if the CPU has already
+	 * been powered on once after calling mp_cpu_configure().
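+	 * (Concretely: after a CPU has been powered off with
+	 * p_online(cpuid, P_POWEROFF), a second power-on attempt via
+	 * p_online(cpuid, P_ONLINE) fails with ENOTSUP until the CPU has
+	 * been unconfigured and configured again.)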
+	 * This limitation only affects the p_online syscall; the DR driver
+	 * is unaffected because it always invokes the public CPU management
+	 * interfaces in the predefined order:
+	 * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
+	 */
+	if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
+		return (ENOTSUP);
+	}
+
+	/*
+	 * Check if there's at least a Mbyte of kmem available
+	 * before attempting to start the cpu.
+	 */
+	if (kmem_avail() < 1024 * 1024) {
+		/*
+		 * Kick off a reap in case that helps us with
+		 * later attempts ..
+		 */
+		kmem_reap();
+		return (ENOMEM);
+	}
+
+	affinity_set(CPU->cpu_id);
+
+	/*
+	 * Start the target CPU. No need to call mach_cpucontext_fini()
+	 * if mach_cpucontext_init() fails.
+	 */
+	if ((error = mach_cpucontext_init()) == 0) {
+		error = mp_start_cpu_common(cp, B_FALSE);
+		mach_cpucontext_fini();
+	}
+	if (error != 0) {
+		affinity_clear();
+		return (error);
+	}
+
+	/* Wait for the target cpu to reach READY state. */
+	tempset = cpu_ready_set;
+	while (!CPU_IN_SET(tempset, cpuid)) {
+		delay(1);
+		tempset = *((volatile cpuset_t *)&cpu_ready_set);
+	}
+
+	/* Mark the target CPU as available for mp operation. */
+	CPUSET_ATOMIC_ADD(mp_cpus, cpuid);
+
+	/* Free the space allocated to hold the microcode file */
+	ucode_cleanup();
+
+	affinity_clear();
+
+	return (0);
+}
+
+#define	MP_CPU_DETACH_MAX_TRIES		5
+#define	MP_CPU_DETACH_DELAY		100
+
+static int
+mp_cpu_detach_driver(dev_info_t *dip)
+{
+	int i;
+	int rv = EBUSY;
+	dev_info_t *pdip;
+
+	pdip = ddi_get_parent(dip);
+	ASSERT(pdip != NULL);
+	/*
+	 * Check if caller holds pdip busy - can cause deadlocks in
+	 * e_ddi_branch_unconfigure(), which calls devfs_clean().
+	 */
+	if (DEVI_BUSY_OWNED(pdip)) {
+		return (EDEADLOCK);
+	}
+
+	for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
+		if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
+			rv = 0;
+			break;
+		}
+		DELAY(MP_CPU_DETACH_DELAY);
+	}
+
+	return (rv);
}

/*
- * Power off CPU.
+ * Power off the target CPU.
+ * Note: cpu_lock will be released and then reacquired.
 */
-/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
-	ASSERT(MUTEX_HELD(&cpu_lock));
-	return (ENOTSUP);		/* not supported */
+	int rv = 0;
+	void *ctx;
+	dev_info_t *dip = NULL;
+	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
+	extern void cpupm_start(cpu_t *);
+	extern void cpupm_stop(cpu_t *);
+
+	ASSERT(cp != NULL);
+	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
+	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
+
+	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
+		return (ENOTSUP);
+	}
+	/*
+	 * There is no support for powering off cpu0 yet.
+	 * There are many pieces of code which have a hard dependency on cpu0.
+	 */
+	if (cp->cpu_id == 0) {
+		return (ENOTSUP);
+	}
+
+	if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
+		return (ENXIO);
+	}
+	ASSERT(dip != NULL);
+	if (mp_cpu_detach_driver(dip) != 0) {
+		rv = EBUSY;
+		goto out_online;
+	}
+
+	/* Allocate CPU context for stopping */
+	if (mach_cpucontext_init() != 0) {
+		rv = ENXIO;
+		goto out_online;
+	}
+	ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
+	if (ctx == NULL) {
+		rv = ENXIO;
+		goto out_context_fini;
+	}
+
+	cpupm_stop(cp);
+	cpu_event_fini_cpu(cp);
+
+	if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
+		cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
+		cp->cpu_m.mcpu_cmi_hdl = NULL;
+	}
+
+	rv = mach_cpu_stop(cp, ctx);
+	if (rv != 0) {
+		goto out_enable_cmi;
+	}
+
+	/* Wait until the target CPU has been halted.
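+	 * The stage 2 halt code in the real mode platter is expected to
+	 * store 0xdead in rm_cpu_halted just before parking the CPU, so
+	 * spinning on that value is what confirms the halt; the 0xffff
+	 * written afterwards re-arms the field for the next stop request.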
*/ + while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) { + delay(1); + } + rm->rm_cpu_halted = 0xffff; + + /* CPU_READY has been cleared by mach_cpu_stop. */ + ASSERT((cp->cpu_flags & CPU_READY) == 0); + ASSERT((cp->cpu_flags & CPU_RUNNING) == 0); + cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF; + CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id); + + mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP); + mach_cpucontext_fini(); + + return (0); + +out_enable_cmi: + { + cmi_hdl_t hdl; + + if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), + cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) { + if (x86_feature & X86_MCA) + cmi_mca_init(hdl); + cp->cpu_m.mcpu_cmi_hdl = hdl; + } + } + cpu_event_init_cpu(cp); + cpupm_start(cp); + mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP); + +out_context_fini: + mach_cpucontext_fini(); + +out_online: + (void) e_ddi_branch_configure(dip, NULL, 0); + + if (rv != EAGAIN && rv != ETIME) { + rv = ENXIO; + } + + return (rv); } /* diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 0ba377f67c..146f879a66 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -23,10 +23,15 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/types.h> #include <sys/thread.h> #include <sys/cpuvar.h> +#include <sys/cpu.h> #include <sys/t_lock.h> #include <sys/param.h> #include <sys/proc.h> @@ -34,6 +39,7 @@ #include <sys/class.h> #include <sys/cmn_err.h> #include <sys/debug.h> +#include <sys/note.h> #include <sys/asm_linkage.h> #include <sys/x_call.h> #include <sys/systm.h> @@ -60,6 +66,7 @@ #include <sys/reboot.h> #include <sys/kdi_machimpl.h> #include <vm/hat_i86.h> +#include <vm/vm_dep.h> #include <sys/memnode.h> #include <sys/pci_cfgspace.h> #include <sys/mach_mmu.h> @@ -71,8 +78,11 @@ struct cpu cpus[1]; /* CPU data */ struct cpu *cpu[NCPU] = {&cpus[0]}; /* pointers to all CPUs */ +struct cpu *cpu_free_list; /* list for released CPUs */ cpu_core_t cpu_core[NCPU]; /* cpu_core structures */ +#define cpu_next_free cpu_prev + /* * Useful for disabling MP bring-up on a MP capable system. */ @@ -94,7 +104,8 @@ int flushes_require_xcalls; cpuset_t cpu_ready_set; /* initialized in startup() */ -static void mp_startup(void); +static void mp_startup_boot(void); +static void mp_startup_hotplug(void); static void cpu_sep_enable(void); static void cpu_sep_disable(void); @@ -108,7 +119,6 @@ void init_cpu_info(struct cpu *cp) { processor_info_t *pi = &cp->cpu_type_info; - char buf[CPU_IDSTRLEN]; /* * Get clock-frequency property for the CPU. @@ -131,14 +141,18 @@ init_cpu_info(struct cpu *cp) if (fpu_exists) (void) strcpy(pi->pi_fputypes, "i387 compatible"); - (void) cpuid_getidstr(cp, buf, sizeof (buf)); + cp->cpu_idstr = kmem_zalloc(CPU_IDSTRLEN, KM_SLEEP); + cp->cpu_brandstr = kmem_zalloc(CPU_IDSTRLEN, KM_SLEEP); - cp->cpu_idstr = kmem_alloc(strlen(buf) + 1, KM_SLEEP); - (void) strcpy(cp->cpu_idstr, buf); - - (void) cpuid_getbrandstr(cp, buf, sizeof (buf)); - cp->cpu_brandstr = kmem_alloc(strlen(buf) + 1, KM_SLEEP); - (void) strcpy(cp->cpu_brandstr, buf); + /* + * If called for the BSP, cp is equal to current CPU. + * For non-BSPs, cpuid info of cp is not ready yet, so use cpuid info + * of current CPU as default values for cpu_idstr and cpu_brandstr. 
+ * They will be corrected in mp_startup_common() after cpuid_pass1() + * has been invoked on target CPU. + */ + (void) cpuid_getidstr(CPU, cp->cpu_idstr, CPU_IDSTRLEN); + (void) cpuid_getbrandstr(CPU, cp->cpu_brandstr, CPU_IDSTRLEN); } /* @@ -228,9 +242,11 @@ init_cpu_syscall(struct cpu *cp) * * Allocate and initialize the cpu structure, TRAPTRACE buffer, and the * startup and idle threads for the specified CPU. + * Parameter boot is true for boot time operations and is false for CPU + * DR operations. */ -struct cpu * -mp_startup_init(int cpun) +static struct cpu * +mp_cpu_configure_common(int cpun, boolean_t boot) { struct cpu *cp; kthread_id_t tp; @@ -247,27 +263,25 @@ mp_startup_init(int cpun) trap_trace_ctl_t *ttc = &trap_trace_ctl[cpun]; #endif + ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(cpun < NCPU && cpu[cpun] == NULL); - cp = kmem_zalloc(sizeof (*cp), KM_SLEEP); -#if !defined(__xpv) - if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { - cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU); - cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; - } else -#endif - cp->cpu_m.mcpu_idle_cpu = cpu_idle; + if (cpu_free_list == NULL) { + cp = kmem_zalloc(sizeof (*cp), KM_SLEEP); + } else { + cp = cpu_free_list; + cpu_free_list = cp->cpu_next_free; + } cp->cpu_m.mcpu_istamp = cpun << 16; - procp = curthread->t_procp; + /* Create per CPU specific threads in the process p0. */ + procp = &p0; - mutex_enter(&cpu_lock); /* * Initialize the dispatcher first. */ disp_cpu_init(cp); - mutex_exit(&cpu_lock); cpu_vm_data_init(cp); @@ -294,14 +308,21 @@ mp_startup_init(int cpun) tp->t_disp_queue = cp->cpu_disp; /* - * Setup thread to start in mp_startup. + * Setup thread to start in mp_startup_common. */ sp = tp->t_stk; - tp->t_pc = (uintptr_t)mp_startup; tp->t_sp = (uintptr_t)(sp - MINFRAME); #if defined(__amd64) tp->t_sp -= STACK_ENTRY_ALIGN; /* fake a call */ #endif + /* + * Setup thread start entry point for boot or hotplug. + */ + if (boot) { + tp->t_pc = (uintptr_t)mp_startup_boot; + } else { + tp->t_pc = (uintptr_t)mp_startup_hotplug; + } cp->cpu_id = cpun; cp->cpu_self = cp; @@ -312,12 +333,13 @@ mp_startup_init(int cpun) /* * cpu_base_spl must be set explicitly here to prevent any blocking - * operations in mp_startup from causing the spl of the cpu to drop - * to 0 (allowing device interrupts before we're ready) in resume(). + * operations in mp_startup_common from causing the spl of the cpu + * to drop to 0 (allowing device interrupts before we're ready) in + * resume(). * cpu_base_spl MUST remain at LOCK_LEVEL until the cpu is CPU_READY. * As an extra bit of security on DEBUG kernels, this is enforced with - * an assertion in mp_startup() -- before cpu_base_spl is set to its - * proper value. + * an assertion in mp_startup_common() -- before cpu_base_spl is set + * to its proper value. */ cp->cpu_base_spl = ipltospl(LOCK_LEVEL); @@ -392,6 +414,15 @@ mp_startup_init(int cpun) * alloc space for cpuid info */ cpuid_alloc_space(cp); +#if !defined(__xpv) + if ((x86_feature & X86_MWAIT) && idle_cpu_prefer_mwait) { + cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(cp); + cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; + } else +#endif + cp->cpu_m.mcpu_idle_cpu = cpu_idle; + + init_cpu_info(cp); /* * alloc space for ucode_info @@ -408,31 +439,34 @@ mp_startup_init(int cpun) ttc->ttc_next = ttc->ttc_first; ttc->ttc_limit = ttc->ttc_first + trap_trace_bufsize; #endif + /* * Record that we have another CPU. 
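	 * cpu_lock is not taken here: callers of mp_cpu_configure_common()
	 * are required to hold it across the whole operation, as checked by
	 * the ASSERT(MUTEX_HELD(&cpu_lock)) at the top of this function.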
*/ - mutex_enter(&cpu_lock); /* * Initialize the interrupt threads for this CPU */ cpu_intr_alloc(cp, NINTR_THREADS); + + cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF; + cpu_set_state(cp); + /* * Add CPU to list of available CPUs. It'll be on the active list - * after mp_startup(). + * after mp_startup_common(). */ cpu_add_unit(cp); - mutex_exit(&cpu_lock); return (cp); } /* - * Undo what was done in mp_startup_init + * Undo what was done in mp_cpu_configure_common */ static void -mp_startup_fini(struct cpu *cp, int error) +mp_cpu_unconfigure_common(struct cpu *cp, int error) { - mutex_enter(&cpu_lock); + ASSERT(MUTEX_HELD(&cpu_lock)); /* * Remove the CPU from the list of available CPUs. @@ -451,7 +485,6 @@ mp_startup_fini(struct cpu *cp, int error) * leave the fundamental data structures intact. */ cp->cpu_flags = 0; - mutex_exit(&cpu_lock); return; } @@ -468,8 +501,7 @@ mp_startup_fini(struct cpu *cp, int error) */ segkp_release(segkp, cp->cpu_intr_stack - (INTR_STACK_SIZE - SA(MINFRAME))); - - mutex_exit(&cpu_lock); + cp->cpu_intr_stack = NULL; #ifdef TRAPTRACE /* @@ -485,10 +517,26 @@ mp_startup_fini(struct cpu *cp, int error) hat_cpu_offline(cp); - cpuid_free_space(cp); - ucode_free_space(cp); + /* Free CPU ID string and brand string. */ + if (cp->cpu_idstr) { + kmem_free(cp->cpu_idstr, CPU_IDSTRLEN); + cp->cpu_idstr = NULL; + } + if (cp->cpu_brandstr) { + kmem_free(cp->cpu_brandstr, CPU_IDSTRLEN); + cp->cpu_brandstr = NULL; + } + +#if !defined(__xpv) + if (cp->cpu_m.mcpu_mwait != NULL) { + cpuid_mwait_free(cp); + cp->cpu_m.mcpu_mwait = NULL; + } +#endif + cpuid_free_space(cp); + if (cp->cpu_idt != CPU->cpu_idt) kmem_free(cp->cpu_idt, PAGESIZE); cp->cpu_idt = NULL; @@ -496,6 +544,12 @@ mp_startup_fini(struct cpu *cp, int error) kmem_free(cp->cpu_gdt, PAGESIZE); cp->cpu_gdt = NULL; + if (cp->cpu_supp_freqs != NULL) { + size_t len = strlen(cp->cpu_supp_freqs) + 1; + kmem_free(cp->cpu_supp_freqs, len); + cp->cpu_supp_freqs = NULL; + } + teardown_vaddr_for_ppcopy(cp); kcpc_hw_fini(cp); @@ -505,15 +559,13 @@ mp_startup_fini(struct cpu *cp, int error) cpu_vm_data_destroy(cp); - mutex_enter(&cpu_lock); + xc_fini_cpu(cp); disp_cpu_fini(cp); - mutex_exit(&cpu_lock); -#if !defined(__xpv) - if (cp->cpu_m.mcpu_mwait != NULL) - cpuid_mwait_free(cp); -#endif - kmem_free(cp, sizeof (*cp)); + ASSERT(cp != CPU0); + bzero(cp, sizeof (*cp)); + cp->cpu_next_free = cpu_free_list; + cpu_free_list = cp; } /* @@ -529,8 +581,8 @@ mp_startup_fini(struct cpu *cp, int error) * system. * * workaround_errata is invoked early in mlsetup() for CPU 0, and in - * mp_startup() for all slave CPUs. Slaves process workaround_errata prior - * to acknowledging their readiness to the master, so this routine will + * mp_startup_common() for all slave CPUs. Slaves process workaround_errata + * prior to acknowledging their readiness to the master, so this routine will * never be executed by multiple CPUs in parallel, thus making updates to * global data safe. * @@ -1214,77 +1266,82 @@ workaround_errata_end() #endif } -static cpuset_t procset; - /* - * Start a single cpu, assuming that the kernel context is available - * to successfully start another cpu. - * - * (For example, real mode code is mapped into the right place - * in memory and is ready to be run.) + * The procset_slave and procset_master are used to synchronize + * between the control CPU and the target CPU when starting CPUs. 
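+ * The handshake is symmetric: the signaling side sets the CPU's bit in
+ * the set and spins until the peer clears it, while the waiting side
+ * spins until the bit appears and then clears it. For a CPU start the
+ * rendezvous runs roughly as follows (a sketch of the code below):
+ *
+ *	target:  mp_startup_signal(&procset_slave, id)	(I'm alive)
+ *	control: consumes the bit, runs tsc_sync_master()
+ *	target:  cpuid passes 2-4, then
+ *		 mp_startup_signal(&procset_slave, id)	(cpuid done)
+ *	control: mp_startup_wait(&procset_slave, id), then
+ *		 cpupm_init()/pg_cpu_init() on the target's behalf
+ *	control: mp_startup_signal(&procset_master, id)
+ *	target:  mp_startup_wait(&procset_master, id) and continues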
*/ +static cpuset_t procset_slave, procset_master; + +static void +mp_startup_wait(cpuset_t *sp, processorid_t cpuid) +{ + cpuset_t tempset; + + for (tempset = *sp; !CPU_IN_SET(tempset, cpuid); + tempset = *(volatile cpuset_t *)sp) { + SMT_PAUSE(); + } + CPUSET_ATOMIC_DEL(*(cpuset_t *)sp, cpuid); +} + +static void +mp_startup_signal(cpuset_t *sp, processorid_t cpuid) +{ + cpuset_t tempset; + + CPUSET_ATOMIC_ADD(*(cpuset_t *)sp, cpuid); + for (tempset = *sp; CPU_IN_SET(tempset, cpuid); + tempset = *(volatile cpuset_t *)sp) { + SMT_PAUSE(); + } +} + int -start_cpu(processorid_t who) +mp_start_cpu_common(cpu_t *cp, boolean_t boot) { + _NOTE(ARGUNUSED(boot)); + void *ctx; - cpu_t *cp; int delays; int error = 0; + cpuset_t tempset; + processorid_t cpuid; +#ifndef __xpv + extern void cpupm_init(cpu_t *); +#endif - ASSERT(who != 0); - - /* - * Check if there's at least a Mbyte of kmem available - * before attempting to start the cpu. - */ - if (kmem_avail() < 1024 * 1024) { - /* - * Kick off a reap in case that helps us with - * later attempts .. - */ - kmem_reap(); - return (ENOMEM); + ASSERT(cp != NULL); + cpuid = cp->cpu_id; + ctx = mach_cpucontext_alloc(cp); + if (ctx == NULL) { + cmn_err(CE_WARN, + "cpu%d: failed to allocate context", cp->cpu_id); + return (EAGAIN); } - - cp = mp_startup_init(who); - if ((ctx = mach_cpucontext_alloc(cp)) == NULL || - (error = mach_cpu_start(cp, ctx)) != 0) { - - /* - * Something went wrong before we even started it - */ - if (ctx) - cmn_err(CE_WARN, - "cpu%d: failed to start error %d", - cp->cpu_id, error); - else - cmn_err(CE_WARN, - "cpu%d: failed to allocate context", cp->cpu_id); - - if (ctx) - mach_cpucontext_free(cp, ctx, error); - else - error = EAGAIN; /* hmm. */ - mp_startup_fini(cp, error); + error = mach_cpu_start(cp, ctx); + if (error != 0) { + cmn_err(CE_WARN, + "cpu%d: failed to start, error %d", cp->cpu_id, error); + mach_cpucontext_free(cp, ctx, error); return (error); } - for (delays = 0; !CPU_IN_SET(procset, who); delays++) { + for (delays = 0, tempset = procset_slave; !CPU_IN_SET(tempset, cpuid); + delays++) { if (delays == 500) { /* * After five seconds, things are probably looking * a bit bleak - explain the hang. */ cmn_err(CE_NOTE, "cpu%d: started, " - "but not running in the kernel yet", who); + "but not running in the kernel yet", cpuid); } else if (delays > 2000) { /* * We waited at least 20 seconds, bail .. */ error = ETIMEDOUT; - cmn_err(CE_WARN, "cpu%d: timed out", who); + cmn_err(CE_WARN, "cpu%d: timed out", cpuid); mach_cpucontext_free(cp, ctx, error); - mp_startup_fini(cp, error); return (error); } @@ -1292,37 +1349,105 @@ start_cpu(processorid_t who) * wait at least 10ms, then check again.. */ delay(USEC_TO_TICK_ROUNDUP(10000)); + tempset = *((volatile cpuset_t *)&procset_slave); } + CPUSET_ATOMIC_DEL(procset_slave, cpuid); mach_cpucontext_free(cp, ctx, 0); #ifndef __xpv if (tsc_gethrtime_enable) - tsc_sync_master(who); + tsc_sync_master(cpuid); #endif if (dtrace_cpu_init != NULL) { + (*dtrace_cpu_init)(cpuid); + } + + /* + * During CPU DR operations, the cpu_lock is held by current + * (the control) thread. We can't release the cpu_lock here + * because that will break the CPU DR logic. + * On the other hand, CPUPM and processor group initialization + * routines need to access the cpu_lock. So we invoke those + * routines here on behalf of mp_startup_common(). + * + * CPUPM and processor group initialization routines depend + * on the cpuid probing results. 
Wait for mp_startup_common()
+	 * to signal that cpuid probing is done.
+	 */
+	mp_startup_wait(&procset_slave, cpuid);
+#ifndef __xpv
+	cpupm_init(cp);
+#endif
+	(void) pg_cpu_init(cp, B_FALSE);
+	cpu_set_state(cp);
+	mp_startup_signal(&procset_master, cpuid);
+
+	return (0);
+}
+
+/*
+ * Start a single cpu, assuming that the kernel context is available
+ * to successfully start another cpu.
+ *
+ * (For example, real mode code is mapped into the right place
+ * in memory and is ready to be run.)
+ */
+int
+start_cpu(processorid_t who)
+{
+	cpu_t *cp;
+	int error = 0;
+	cpuset_t tempset;
+
+	ASSERT(who != 0);
+
+	/*
+	 * Check if there's at least a Mbyte of kmem available
+	 * before attempting to start the cpu.
+	 */
+	if (kmem_avail() < 1024 * 1024) {
		/*
-		 * DTrace CPU initialization expects cpu_lock to be held.
+		 * Kick off a reap in case that helps us with
+		 * later attempts ..
		 */
-		mutex_enter(&cpu_lock);
-		(*dtrace_cpu_init)(who);
-		mutex_exit(&cpu_lock);
+		kmem_reap();
+		return (ENOMEM);
+	}
+
+	/*
+	 * First configure cpu.
+	 */
+	cp = mp_cpu_configure_common(who, B_TRUE);
+	ASSERT(cp != NULL);
+
+	/*
+	 * Then start cpu.
+	 */
+	error = mp_start_cpu_common(cp, B_TRUE);
+	if (error != 0) {
+		mp_cpu_unconfigure_common(cp, error);
+		return (error);
	}

-	while (!CPU_IN_SET(cpu_ready_set, who))
-		delay(1);
+	mutex_exit(&cpu_lock);
+	tempset = cpu_ready_set;
+	while (!CPU_IN_SET(tempset, who)) {
+		drv_usecwait(1);
+		tempset = *((volatile cpuset_t *)&cpu_ready_set);
+	}
+	mutex_enter(&cpu_lock);

	return (0);
}

-
-/*ARGSUSED*/
void
start_other_cpus(int cprboot)
{
+	_NOTE(ARGUNUSED(cprboot));
+
	uint_t who;
-	uint_t skipped = 0;
	uint_t bootcpuid = 0;

	/*
@@ -1347,9 +1472,12 @@ start_other_cpus(int cprboot)
	CPUSET_ADD(cpu_ready_set, bootcpuid);

	/*
-	 * if only 1 cpu or not using MP, skip the rest of this
+	 * skip the rest of this if
+	 * . only 1 cpu detected and system isn't hotplug-capable
+	 * . not using MP
	 */
-	if (CPUSET_ISNULL(mp_cpus) || use_mp == 0) {
+	if ((CPUSET_ISNULL(mp_cpus) && plat_dr_support_cpu() == 0) ||
+	    use_mp == 0) {
		if (use_mp == 0)
			cmn_err(CE_CONT, "?***** Not in MP mode\n");
		goto done;
@@ -1375,18 +1503,13 @@ start_other_cpus(int cprboot)
	affinity_set(CPU_CURRENT);

	for (who = 0; who < NCPU; who++) {
-
		if (!CPU_IN_SET(mp_cpus, who))
			continue;
		ASSERT(who != bootcpuid);
-		if (ncpus >= max_ncpus) {
-			skipped = who;
-			continue;
-		}
-		if (start_cpu(who) != 0)
-			CPUSET_DEL(mp_cpus, who);

		mutex_enter(&cpu_lock);
+		if (start_cpu(who) != 0)
+			CPUSET_DEL(mp_cpus, who);
		cpu_state_change_notify(who, CPU_SETUP);
		mutex_exit(&cpu_lock);
	}

	affinity_clear();

-	if (skipped) {
+	mach_cpucontext_fini();
+
+done:
+	if (get_hwenv() == HW_NATIVE)
+		workaround_errata_end();
+	cmi_post_mpstartup();
+
+	if (use_mp && ncpus != boot_max_ncpus) {
		cmn_err(CE_NOTE,
		    "System detected %d cpus, but "
		    "only %d cpu(s) were enabled during boot.",
-		    skipped + 1, ncpus);
+		    boot_max_ncpus, ncpus);
		cmn_err(CE_NOTE,
		    "Use \"boot-ncpus\" parameter to enable more CPU(s). "
		    "See eeprom(1M).");
	}
-
-done:
-	if (get_hwenv() == HW_NATIVE)
-		workaround_errata_end();
-	mach_cpucontext_fini();
-
-	cmi_post_mpstartup();
}

-/*
- * Dummy functions - no i86pc platforms support dynamic cpu allocation.
- */ -/*ARGSUSED*/ int mp_cpu_configure(int cpuid) { - return (ENOTSUP); /* not supported */ + cpu_t *cp; + + if (use_mp == 0 || plat_dr_support_cpu() == 0) { + return (ENOTSUP); + } + + cp = cpu_get(cpuid); + if (cp != NULL) { + return (EALREADY); + } + + /* + * Check if there's at least a Mbyte of kmem available + * before attempting to start the cpu. + */ + if (kmem_avail() < 1024 * 1024) { + /* + * Kick off a reap in case that helps us with + * later attempts .. + */ + kmem_reap(); + return (ENOMEM); + } + + cp = mp_cpu_configure_common(cpuid, B_FALSE); + ASSERT(cp != NULL && cpu_get(cpuid) == cp); + + return (cp != NULL ? 0 : EAGAIN); } -/*ARGSUSED*/ int mp_cpu_unconfigure(int cpuid) { - return (ENOTSUP); /* not supported */ + cpu_t *cp; + + if (use_mp == 0 || plat_dr_support_cpu() == 0) { + return (ENOTSUP); + } else if (cpuid < 0 || cpuid >= max_ncpus) { + return (EINVAL); + } + + cp = cpu_get(cpuid); + if (cp == NULL) { + return (ENODEV); + } + mp_cpu_unconfigure_common(cp, 0); + + return (0); } /* * Startup function for 'other' CPUs (besides boot cpu). * Called from real_mode_start. * - * WARNING: until CPU_READY is set, mp_startup and routines called by - * mp_startup should not call routines (e.g. kmem_free) that could call + * WARNING: until CPU_READY is set, mp_startup_common and routines called by + * mp_startup_common should not call routines (e.g. kmem_free) that could call * hat_unload which requires CPU_READY to be set. */ -void -mp_startup(void) +static void +mp_startup_common(boolean_t boot) { - struct cpu *cp = CPU; + cpu_t *cp = CPU; uint_t new_x86_feature; - extern void cpu_event_init_cpu(cpu_t *); -#ifndef __xpv - extern void cpupm_init(cpu_t *); -#endif const char *fmt = "?cpu%d: %b\n"; + extern void cpu_event_init_cpu(cpu_t *); /* * We need to get TSC on this proc synced (i.e., any delta @@ -1457,8 +1613,8 @@ mp_startup(void) * interrupts, cmn_err, etc. */ - /* Let cpu0 continue into tsc_sync_master() */ - CPUSET_ATOMIC_ADD(procset, cp->cpu_id); + /* Let the control CPU continue into tsc_sync_master() */ + mp_startup_signal(&procset_slave, cp->cpu_id); #ifndef __xpv if (tsc_gethrtime_enable) @@ -1537,39 +1693,42 @@ mp_startup(void) if (workaround_errata(cp) != 0) panic("critical workaround(s) missing for cpu%d", cp->cpu_id); + /* + * We can touch cpu_flags here without acquiring the cpu_lock here + * because the cpu_lock is held by the control CPU which is running + * mp_start_cpu_common(). + * Need to clear CPU_QUIESCED flag before calling any function which + * may cause thread context switching, such as kmem_alloc() etc. + * The idle thread checks for CPU_QUIESCED flag and loops for ever if + * it's set. So the startup thread may have no chance to switch back + * again if it's switched away with CPU_QUIESCED set. + */ + cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED); + cpuid_pass2(cp); cpuid_pass3(cp); (void) cpuid_pass4(cp); - init_cpu_info(cp); - - mutex_enter(&cpu_lock); - - cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS; - - cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_idstr); - cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_brandstr); - - if (dtrace_cpu_init != NULL) { - (*dtrace_cpu_init)(cp->cpu_id); - } - /* - * Fill out cpu_ucode_info. Update microcode if necessary. + * Correct cpu_idstr and cpu_brandstr on target CPU after + * cpuid_pass1() is done. 
*/ - ucode_check(cp); + (void) cpuid_getidstr(cp, cp->cpu_idstr, CPU_IDSTRLEN); + (void) cpuid_getbrandstr(cp, cp->cpu_brandstr, CPU_IDSTRLEN); - mutex_exit(&cpu_lock); + cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS; post_startup_cpu_fixups(); + cpu_event_init_cpu(cp); + /* * Enable preemption here so that contention for any locks acquired - * later in mp_startup may be preempted if the thread owning those - * locks is continuously executing on other CPUs (for example, this - * CPU must be preemptible to allow other CPUs to pause it during their - * startup phases). It's safe to enable preemption here because the - * CPU state is pretty-much fully constructed. + * later in mp_startup_common may be preempted if the thread owning + * those locks is continuously executing on other CPUs (for example, + * this CPU must be preemptible to allow other CPUs to pause it during + * their startup phases). It's safe to enable preemption here because + * the CPU state is pretty-much fully constructed. */ curthread->t_preempt = 0; @@ -1577,29 +1736,31 @@ mp_startup(void) ASSERT(cp->cpu_base_spl == ipltospl(LOCK_LEVEL)); set_base_spl(); /* Restore the spl to its proper value */ - cpu_event_init_cpu(cp); -#ifndef __xpv - cpupm_init(cp); -#endif - + pghw_physid_create(cp); /* - * Processor group initialization for this CPU is dependent on the - * cpuid probing, which must be done in the context of the current - * CPU, as well as the CPU's device node initialization (for ACPI). + * Delegate initialization tasks, which need to access the cpu_lock, + * to mp_start_cpu_common() because we can't acquire the cpu_lock here + * during CPU DR operations. */ - mutex_enter(&cpu_lock); - pghw_physid_create(cp); - (void) pg_cpu_init(cp, B_FALSE); + mp_startup_signal(&procset_slave, cp->cpu_id); + mp_startup_wait(&procset_master, cp->cpu_id); pg_cmt_cpu_startup(cp); - mutex_exit(&cpu_lock); + + if (boot) { + mutex_enter(&cpu_lock); + cp->cpu_flags &= ~CPU_OFFLINE; + cpu_enable_intr(cp); + cpu_add_active(cp); + mutex_exit(&cpu_lock); + } /* Enable interrupts */ (void) spl0(); - mutex_enter(&cpu_lock); - cpu_enable_intr(cp); - cpu_add_active(cp); - mutex_exit(&cpu_lock); + /* + * Fill out cpu_ucode_info. Update microcode if necessary. + */ + ucode_check(cp); #ifndef __xpv { @@ -1616,6 +1777,7 @@ mp_startup(void) cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL) { if (x86_feature & X86_MCA) cmi_mca_init(hdl); + cp->cpu_m.mcpu_cmi_hdl = hdl; } } #endif /* __xpv */ @@ -1630,16 +1792,13 @@ mp_startup(void) */ CPUSET_ATOMIC_ADD(cpu_ready_set, cp->cpu_id); - /* - * Because mp_startup() gets fired off after init() starts, we - * can't use the '?' trick to do 'boot -v' printing - so we - * always direct the 'cpu .. online' messages to the log. - */ - cmn_err(CE_CONT, "!cpu%d initialization complete - online\n", - cp->cpu_id); - (void) mach_cpu_create_device_node(cp, NULL); + cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_idstr); + cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_brandstr); + cmn_err(CE_CONT, "?cpu%d initialization complete - online\n", + cp->cpu_id); + /* * Now we are done with the startup thread, so free it up. */ @@ -1648,6 +1807,23 @@ mp_startup(void) /*NOTREACHED*/ } +/* + * Startup function for 'other' CPUs at boot time (besides boot cpu). + */ +static void +mp_startup_boot(void) +{ + mp_startup_common(B_TRUE); +} + +/* + * Startup function for hotplug CPUs at runtime. + */ +void +mp_startup_hotplug(void) +{ + mp_startup_common(B_FALSE); +} /* * Start CPU on user request. 
@@ -1663,7 +1839,6 @@ mp_cpu_start(struct cpu *cp) /* * Stop CPU on user request. */ -/* ARGSUSED */ int mp_cpu_stop(struct cpu *cp) { @@ -1713,31 +1888,42 @@ cpu_enable_intr(struct cpu *cp) psm_enable_intr(cp->cpu_id); } - -/*ARGSUSED*/ void mp_cpu_faulted_enter(struct cpu *cp) { -#ifndef __xpv - cmi_hdl_t hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), - cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); +#ifdef __xpv + _NOTE(ARGUNUSED(cp)); +#else + cmi_hdl_t hdl = cp->cpu_m.mcpu_cmi_hdl; if (hdl != NULL) { + cmi_hdl_hold(hdl); + } else { + hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), + cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); + } + if (hdl != NULL) { cmi_faulted_enter(hdl); cmi_hdl_rele(hdl); } #endif } -/*ARGSUSED*/ void mp_cpu_faulted_exit(struct cpu *cp) { -#ifndef __xpv - cmi_hdl_t hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), - cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); +#ifdef __xpv + _NOTE(ARGUNUSED(cp)); +#else + cmi_hdl_t hdl = cp->cpu_m.mcpu_cmi_hdl; if (hdl != NULL) { + cmi_hdl_hold(hdl); + } else { + hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), + cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); + } + if (hdl != NULL) { cmi_faulted_exit(hdl); cmi_hdl_rele(hdl); } diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index b4b235cf5c..c0e9c2404b 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -22,6 +22,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/types.h> #include <sys/t_lock.h> @@ -63,6 +67,7 @@ #include <sys/dumphdr.h> #include <sys/bootconf.h> +#include <sys/memlist_plat.h> #include <sys/varargs.h> #include <sys/promif.h> #include <sys/modctl.h> @@ -116,7 +121,7 @@ #include <sys/systeminfo.h> #include <sys/multiboot.h> -#ifdef __xpv +#ifdef __xpv #include <sys/hypervisor.h> #include <sys/xen_mmu.h> @@ -130,6 +135,10 @@ extern void xen_late_startup(void); struct xen_evt_data cpu0_evt_data; +#else /* __xpv */ +#include <sys/memlist_impl.h> + +extern void mem_config_init(void); #endif /* __xpv */ extern void progressbar_init(void); @@ -315,6 +324,17 @@ pgcnt_t segkpsize = 0; pgcnt_t segziosize = 0; /* size of zio segment in pages */ /* + * A static DR page_t VA map is reserved that can map the page structures + * for a domain's entire RA space. The pages that back this space are + * dynamically allocated and need not be physically contiguous. The DR + * map size is derived from KPM size. + * This mechanism isn't used by x86 yet, so just stubs here. + */ +int ppvm_enable = 0; /* Static virtual map for page structs */ +page_t *ppvm_base = NULL; /* Base of page struct map */ +pgcnt_t ppvm_size = 0; /* Size of page struct map */ + +/* * VA range available to the debugger */ const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE; @@ -965,20 +985,38 @@ startup_memlist(void) if (prom_debug) print_memlist("boot physinstalled", bootops->boot_mem->physinstalled); - installed_top_size(bootops->boot_mem->physinstalled, &physmax, + installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax, &physinstalled, &memblocks); PRM_DEBUG(physmax); PRM_DEBUG(physinstalled); PRM_DEBUG(memblocks); /* + * Compute maximum physical address for memory DR operations. + * Memory DR operations are unsupported on xpv or 32bit OSes. 
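+	 * As a worked example: a CPU whose cpuid reports 46 physical
+	 * address bits yields plat_dr_physmax = btop(1ULL << 46), the
+	 * page frame number at the 64 TB boundary, which is then clamped
+	 * to PHYSMEM_MAX64 below.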
+ */ +#ifdef __amd64 + if (plat_dr_support_memory()) { + if (plat_dr_physmax == 0) { + uint_t pabits = UINT_MAX; + + cpuid_get_addrsize(CPU, &pabits, NULL); + plat_dr_physmax = btop(1ULL << pabits); + } + if (plat_dr_physmax > PHYSMEM_MAX64) + plat_dr_physmax = PHYSMEM_MAX64; + } else +#endif + plat_dr_physmax = 0; + + /* * Examine the bios reserved memory to find out: * - the number of discontiguous segments of memory. */ if (prom_debug) print_memlist("boot reserved mem", bootops->boot_mem->rsvdmem); - installed_top_size(bootops->boot_mem->rsvdmem, &rsvd_high_pfn, + installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn, &rsvd_pgcnt, &rsvdmemblocks); PRM_DEBUG(rsvd_high_pfn); PRM_DEBUG(rsvd_pgcnt); @@ -1137,9 +1175,14 @@ startup_memlist(void) * memory is at addresses above 1 TB. When adjusted, segkpm_base must * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte). */ - if (physmax + 1 > mmu_btop(TERABYTE)) { + if (physmax + 1 > mmu_btop(TERABYTE) || + plat_dr_physmax > mmu_btop(TERABYTE)) { uint64_t kpm_resv_amount = mmu_ptob(physmax + 1); + if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) { + kpm_resv_amount = mmu_ptob(plat_dr_physmax); + } + segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount), KERNEL_REDZONE_SIZE)); /* down from top VA */ @@ -1186,6 +1229,16 @@ startup_memlist(void) panic("physavail was too big!"); if (prom_debug) print_memlist("phys_avail", phys_avail); +#ifndef __xpv + /* + * Free unused memlist items, which may be used by memory DR driver + * at runtime. + */ + if ((caddr_t)current < (caddr_t)memlist + memlist_sz) { + memlist_free_block((caddr_t)current, + (caddr_t)memlist + memlist_sz - (caddr_t)current); + } +#endif /* * Build bios reserved memspace @@ -1196,6 +1249,16 @@ startup_memlist(void) panic("bios_rsvd was too big!"); if (prom_debug) print_memlist("bios_rsvd", bios_rsvd); +#ifndef __xpv + /* + * Free unused memlist items, which may be used by memory DR driver + * at runtime. + */ + if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) { + memlist_free_block((caddr_t)current, + (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current); + } +#endif /* * setup page coloring @@ -1391,7 +1454,11 @@ startup_kmem(void) } #endif -#ifdef __xpv +#ifndef __xpv + if (plat_dr_support_memory()) { + mem_config_init(); + } +#else /* __xpv */ /* * Some of the xen start information has to be relocated up * into the kernel's permanent address space. 
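plat_dr_physmax above is derived from the CPU's physical-address width (via cpuid_get_addrsize()) and clamped to PHYSMEM_MAX64, so KPM sizing can cover memory that may be hot-added later. The same arithmetic as a standalone sketch; the page shift and the 64 TB cap are illustrative stand-ins for the kernel's mmu constants:

#include <stdint.h>

#define PAGESHIFT_SKETCH	12
#define BTOP_SKETCH(x)		((uint64_t)(x) >> PAGESHIFT_SKETCH)
#define PHYSMEM_CAP_SKETCH	BTOP_SKETCH(64ULL << 40)	/* assume 64 TB */

static uint64_t
dr_physmax_sketch(unsigned pabits)	/* physical address bits, e.g. 46 */
{
	uint64_t maxpfn = BTOP_SKETCH(1ULL << pabits);

	return (maxpfn > PHYSMEM_CAP_SKETCH ? PHYSMEM_CAP_SKETCH : maxpfn);
}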
@@ -1406,7 +1473,7 @@ startup_kmem(void) */ CPU->cpu_m.mcpu_vcpu_info = &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id]; -#endif +#endif /* __xpv */ PRM_POINT("startup_kmem() done"); } @@ -1591,8 +1658,10 @@ startup_modules(void) if ((get_hwenv() != HW_XEN_HVM) && (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU), cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL && - (x86_feature & X86_MCA)) + (x86_feature & X86_MCA)) { cmi_mca_init(hdl); + CPU->cpu_m.mcpu_cmi_hdl = hdl; + } #endif /* __xpv */ /* @@ -1701,7 +1770,11 @@ layout_kernel_va(void) #if defined(__amd64) kpm_vbase = (caddr_t)segkpm_base; - kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); + if (physmax + 1 < plat_dr_physmax) { + kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax)); + } else { + kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1)); + } if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base) panic("not enough room for kpm!"); PRM_DEBUG(kpm_size); @@ -2342,10 +2415,6 @@ release_bootstrap(void) rm_platter_va = i86devmap(pfn, 1, PROT_READ | PROT_WRITE | PROT_EXEC); rm_platter_pa = ptob(pfn); - hat_devload(kas.a_hat, - (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE, - pfn, PROT_READ | PROT_WRITE | PROT_EXEC, - HAT_LOAD_NOCONSIST); break; } if (pfn == btop(1*1024*1024) && use_mp) diff --git a/usr/src/uts/i86pc/os/x_call.c b/usr/src/uts/i86pc/os/x_call.c index 33b75c1a5a..17d5123508 100644 --- a/usr/src/uts/i86pc/os/x_call.c +++ b/usr/src/uts/i86pc/os/x_call.c @@ -22,6 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #include <sys/types.h> #include <sys/param.h> @@ -186,11 +190,11 @@ xc_extract(xc_msg_t **queue) return (old_head); } - /* * Initialize the machcpu fields used for cross calls */ static uint_t xc_initialized = 0; + void xc_init_cpu(struct cpu *cpup) { @@ -198,34 +202,100 @@ xc_init_cpu(struct cpu *cpup) int c; /* - * add a new msg to each existing CPU's free list, as well as one for - * my list for each of them. ncpus has an inconsistent value when this - * function is called, so use cpup->cpu_id. + * Allocate message buffers for the new CPU. */ - for (c = 0; c < cpup->cpu_id; ++c) { - if (cpu[c] == NULL) - continue; - msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); - msg->xc_command = XC_MSG_FREE; - msg->xc_master = c; - xc_insert(&cpu[c]->cpu_m.xc_free, msg); + for (c = 0; c < max_ncpus; ++c) { + if (plat_dr_support_cpu()) { + /* + * Allocate a message buffer for every CPU possible + * in system, including our own, and add them to our xc + * message queue. + */ + msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); + msg->xc_command = XC_MSG_FREE; + msg->xc_master = cpup->cpu_id; + xc_insert(&cpup->cpu_m.xc_free, msg); + } else if (cpu[c] != NULL && cpu[c] != cpup) { + /* + * Add a new message buffer to each existing CPU's free + * list, as well as one for my list for each of them. + * Note: cpu0 is statically inserted into cpu[] array, + * so need to check cpu[c] isn't cpup itself to avoid + * allocating extra message buffers for cpu0. + */ + msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); + msg->xc_command = XC_MSG_FREE; + msg->xc_master = c; + xc_insert(&cpu[c]->cpu_m.xc_free, msg); + + msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); + msg->xc_command = XC_MSG_FREE; + msg->xc_master = cpup->cpu_id; + xc_insert(&cpup->cpu_m.xc_free, msg); + } + } + if (!plat_dr_support_cpu()) { + /* + * Add one for self messages if CPU hotplug is disabled. 
+ */ msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); msg->xc_command = XC_MSG_FREE; msg->xc_master = cpup->cpu_id; xc_insert(&cpup->cpu_m.xc_free, msg); } + if (!xc_initialized) + xc_initialized = 1; +} + +void +xc_fini_cpu(struct cpu *cpup) +{ + xc_msg_t *msg; + + ASSERT((cpup->cpu_flags & CPU_READY) == 0); + ASSERT(cpup->cpu_m.xc_msgbox == NULL); + ASSERT(cpup->cpu_m.xc_work_cnt == 0); + + while ((msg = xc_extract(&cpup->cpu_m.xc_free)) != NULL) { + kmem_free(msg, sizeof (*msg)); + } +} + +#define XC_FLUSH_MAX_WAITS 1000 + +/* Flush inflight message buffers. */ +int +xc_flush_cpu(struct cpu *cpup) +{ + int i; + + ASSERT((cpup->cpu_flags & CPU_READY) == 0); + /* - * Add one for self messages + * Pause all working CPUs, which ensures that there's no CPU in + * function xc_common(). + * This is used to work around a race condition window in xc_common() + * between checking CPU_READY flag and increasing working item count. */ - msg = kmem_zalloc(sizeof (*msg), KM_SLEEP); - msg->xc_command = XC_MSG_FREE; - msg->xc_master = cpup->cpu_id; - xc_insert(&cpup->cpu_m.xc_free, msg); + pause_cpus(cpup); + start_cpus(); - if (!xc_initialized) - xc_initialized = 1; + for (i = 0; i < XC_FLUSH_MAX_WAITS; i++) { + if (cpup->cpu_m.xc_work_cnt == 0) { + break; + } + DELAY(1); + } + for (; i < XC_FLUSH_MAX_WAITS; i++) { + if (!BT_TEST(xc_priority_set, cpup->cpu_id)) { + break; + } + DELAY(1); + } + + return (i >= XC_FLUSH_MAX_WAITS ? ETIME : 0); } /* @@ -416,7 +486,7 @@ xc_common( * Post messages to all CPUs involved that are CPU_READY */ CPU->cpu_m.xc_wait_cnt = 0; - for (c = 0; c < ncpus; ++c) { + for (c = 0; c < max_ncpus; ++c) { if (!BT_TEST(set, c)) continue; cpup = cpu[c]; @@ -487,7 +557,7 @@ xc_priority_common( /* * Wait briefly for any previous xc_priority to have finished. */ - for (c = 0; c < ncpus; ++c) { + for (c = 0; c < max_ncpus; ++c) { cpup = cpu[c]; if (cpup == NULL || !(cpup->cpu_flags & CPU_READY)) continue; @@ -527,7 +597,7 @@ xc_priority_common( * Post messages to all CPUs involved that are CPU_READY * We'll always IPI, plus bang on the xc_msgbox for i86_mwait() */ - for (c = 0; c < ncpus; ++c) { + for (c = 0; c < max_ncpus; ++c) { if (!BT_TEST(set, c)) continue; cpup = cpu[c]; diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile index 11ebb58187..7fa9f8f6ef 100644 --- a/usr/src/uts/i86pc/sys/Makefile +++ b/usr/src/uts/i86pc/sys/Makefile @@ -60,6 +60,7 @@ HDRS= \ psm_types.h \ rm_platter.h \ smp_impldefs.h \ + sbd_ioctl.h \ vm_machparam.h \ x_call.h \ xc_levels.h \ diff --git a/usr/src/uts/i86pc/sys/acpidev.h b/usr/src/uts/i86pc/sys/acpidev.h index e0fbaa4999..6d11277aaf 100644 --- a/usr/src/uts/i86pc/sys/acpidev.h +++ b/usr/src/uts/i86pc/sys/acpidev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. */ @@ -28,8 +28,10 @@ #include <sys/types.h> #include <sys/obpdefs.h> #include <sys/sunddi.h> +#ifdef _KERNEL #include <sys/acpi/acpi.h> #include <sys/acpica.h> +#endif #ifdef __cplusplus extern "C" { @@ -55,23 +57,19 @@ extern "C" { #define ACPIDEV_HID_CPU "ACPI0007" #define ACPIDEV_HID_PCI_HOSTBRIDGE "PNP0A03" #define ACPIDEV_HID_PCIE_HOSTBRIDGE "PNP0A08" +#define ACPIDEV_HID_PCIEX_HOSTBRIDGE "PNP0A08" #define ACPIDEV_HID_MEMORY "PNP0C80" -/* Common ACPI object names. */ -#define ACPIDEV_OBJECT_NAME_SB ACPI_NS_SYSTEM_BUS -#define ACPIDEV_OBJECT_NAME_PR "_PR_" - -/* Common ACPI method names. 
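xc_flush_cpu() in the x_call.c hunk above drains in-flight cross-call messages with a bounded poll: up to XC_FLUSH_MAX_WAITS one-microsecond delays, returning ETIME rather than hanging a CPU-remove operation. A simplified user-level analogue of that wait loop (the atomic counter stands in for xc_work_cnt):

#include <errno.h>
#include <stdatomic.h>
#include <unistd.h>

#define FLUSH_MAX_WAITS	1000

static int
drain_sketch(_Atomic int *work_cnt)
{
	int i;

	for (i = 0; i < FLUSH_MAX_WAITS; i++) {
		if (atomic_load(work_cnt) == 0)
			return (0);
		(void) usleep(1);	/* DELAY(1) in the kernel */
	}
	return (ETIME);	/* caller decides whether to retry or fail the DR op */
}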
*/ -#define ACPIDEV_METHOD_NAME_MAT "_MAT" - /* Device names for ACPI objects. */ #define ACPIDEV_NODE_NAME_ROOT "fw" +#define ACPIDEV_NODE_NAME_ACPIDR "acpidr" #define ACPIDEV_NODE_NAME_CONTAINER "container" #define ACPIDEV_NODE_NAME_MODULE_SBD "sb" #define ACPIDEV_NODE_NAME_MODULE_CPU "socket" #define ACPIDEV_NODE_NAME_CPU "cpu" #define ACPIDEV_NODE_NAME_PROCESSOR "cpus" #define ACPIDEV_NODE_NAME_MEMORY "mem" +#define ACPIDEV_NODE_NAME_PCI "pci" /* Device types for ACPI objects. */ #define ACPIDEV_TYPE_ROOTNEX "acpirootnex" @@ -81,15 +79,31 @@ extern "C" { #define ACPIDEV_TYPE_CONTAINER "acpicontainer" #define ACPIDEV_TYPE_CPU "acpicpu" #define ACPIDEV_TYPE_MEMORY "acpimemory" +#define ACPIDEV_TYPE_PCI "pci" +#define ACPIDEV_TYPE_PCIEX "pciex" /* Device property names for ACPI objects. */ #define ACPIDEV_PROP_NAME_UNIT_ADDR "unit-address" #define ACPIDEV_PROP_NAME_ACPI_UID "acpi-uid" #define ACPIDEV_PROP_NAME_PROCESSOR_ID "acpi-processor-id" #define ACPIDEV_PROP_NAME_LOCALAPIC_ID "apic-id" +#define ACPIDEV_PROP_NAME_PROXIMITY_ID "proximity-id" #define ACPIDEV_PROP_NAME_UID_FORMAT "acpidev-uid-format" +/* Miscellaneous strings. */ +#define ACPIDEV_CMD_OST_PREFIX "acpi-update-status" +#define ACPIDEV_CMD_OST_INPROGRESS "acpi-update-status=inprogress" +#define ACPIDEV_CMD_OST_SUCCESS "acpi-update-status=success" +#define ACPIDEV_CMD_OST_FAILURE "acpi-update-status=failure" +#define ACPIDEV_CMD_OST_NOOP "acpi-update-status=noop" + +#define ACPIDEV_EVENT_TYPE_ATTR_NAME "acpi-event-type" +#define ACPIDEV_EVENT_TYPE_BUS_CHECK "bus_check" +#define ACPIDEV_EVENT_TYPE_DEVICE_CHECK "device_check" +#define ACPIDEV_EVENT_TYPE_DEVICE_CHECK_LIGHT "device_check_light" +#define ACPIDEV_EVENT_TYPE_EJECT_REQUEST "eject_request" + /* ACPI device class Id. */ typedef enum acpidev_class_id { ACPIDEV_CLASS_ID_INVALID = 0, @@ -99,6 +113,8 @@ typedef enum acpidev_class_id { ACPIDEV_CLASS_ID_CONTAINER = 4, ACPIDEV_CLASS_ID_CPU = 5, ACPIDEV_CLASS_ID_MEMORY = 6, + ACPIDEV_CLASS_ID_PCI = 7, + ACPIDEV_CLASS_ID_PCIEX = 8, ACPIDEV_CLASS_ID_MAX } acpidev_class_id_t; @@ -106,9 +122,88 @@ typedef enum acpidev_class_id { #define ACPIDEV_OUSER_NO_CPU 0x1 #define ACPIDEV_OUSER_NO_MEM 0x2 #define ACPIDEV_OUSER_NO_CONTAINER 0x4 +#define ACPIDEV_OUSER_NO_PCI 0x8 +#define ACPIDEV_OUSER_NO_CACHE 0x10000 #ifdef _KERNEL +/* Common ACPI object names. */ +#define ACPIDEV_OBJECT_NAME_SB ACPI_NS_SYSTEM_BUS +#define ACPIDEV_OBJECT_NAME_PR "_PR_" + +/* Common ACPI method names. */ +#define ACPIDEV_METHOD_NAME_MAT "_MAT" +#define ACPIDEV_METHOD_NAME_EJ0 "_EJ0" +#define ACPIDEV_METHOD_NAME_EDL "_EDL" +#define ACPIDEV_METHOD_NAME_EJD "_EJD" +#define ACPIDEV_METHOD_NAME_OST "_OST" +#define ACPIDEV_METHOD_NAME_PXM "_PXM" +#define ACPIDEV_METHOD_NAME_SLI "_SLI" + +/* Source event code for _OST. */ +#define ACPI_OST_EVENT_EJECTING 0x103 +#define ACPI_OST_EVENT_INSERTING 0x200 + +/* Status code for _OST. */ +#define ACPI_OST_STA_SUCCESS 0x0 + +/* Non-specific failure. */ +#define ACPI_OST_STA_FAILURE 0x1 + +/* Unrecognized Notify Code. */ +#define ACPI_OST_STA_NOT_SUPPORT 0x2 + +/* Device ejection not supported by OSPM. */ +#define ACPI_OST_STA_EJECT_NOT_SUPPORT 0x80 + +/* Device in use by application. */ +#define ACPI_OST_STA_EJECT_IN_USE 0x81 + +/* Device Busy. */ +#define ACPI_OST_STA_EJECT_BUSY 0x82 + +/* Ejection dependency is busy or not supported for ejection by OSPM. */ +#define ACPI_OST_STA_EJECT_DEPENDENCY 0x83 + +/* Ejection is in progress (pending). 
*/ +#define ACPI_OST_STA_EJECT_IN_PROGRESS 0x84 + +/* Device insertion in progress (pending). */ +#define ACPI_OST_STA_INSERT_IN_PROGRESS 0x80 + +/* Device driver load failure. */ +#define ACPI_OST_STA_INSERT_DRIVER 0x81 + +/* Device insertion not supported by OSPM. */ +#define ACPI_OST_STA_INSERT_NOT_SUPPORT 0x82 + +/* + * Insertion failure + * Resources Unavailable as described by the following bit encodings: + * Bit[3] Bus Numbers + * Bit[2] Interrupts + * Bit[1] I/O + * Bit[0] Memory + */ +#define ACPI_OST_STA_INSERT_NO_RESOURCE 0x90 +#define ACPI_OST_STA_INSERT_NO_BUS 0x8 +#define ACPI_OST_STA_INSERT_NO_INTR 0x4 +#define ACPI_OST_STA_INSERT_NO_IO 0x2 +#define ACPI_OST_STA_INSERT_NO_MEM 0x1 + +/* + * According to the ACPI specification, self latency (entry[n][n]) in the + * SLIT table should be 10. + */ +#define ACPI_SLIT_SELF_LATENCY 10 + +/* + * The DR driver assigns a unique device id for each hot-added memory device. + * ACPI_MEMNODE_DEVID_BOOT is assigned to memory devices present at boot, + * which is distinguished from device ids assigned by the DR driver. + */ +#define ACPI_MEMNODE_DEVID_BOOT UINT32_MAX + /* Forward declaration */ typedef struct acpidev_data_impl *acpidev_data_handle_t; typedef struct acpidev_walk_info acpidev_walk_info_t; @@ -197,7 +292,7 @@ typedef ACPI_STATUS (* acpidev_probe_t)(acpidev_walk_info_t *); typedef acpidev_filter_result_t (* acpidev_filter_t)(acpidev_walk_info_t *, char *, int); typedef ACPI_STATUS (* acpidev_init_t)(acpidev_walk_info_t *); -typedef ACPI_STATUS (* acpidev_fini_t)(ACPI_HANDLE, dev_info_t *, +typedef void (* acpidev_fini_t)(ACPI_HANDLE, acpidev_data_handle_t, acpidev_class_t *); /* Device class driver interface. */ @@ -237,6 +332,7 @@ extern acpidev_class_t acpidev_class_device; extern acpidev_class_t acpidev_class_container; extern acpidev_class_t acpidev_class_cpu; extern acpidev_class_t acpidev_class_memory; +extern acpidev_class_t acpidev_class_pci; /* * Class driver lists. @@ -309,6 +405,7 @@ extern ACPI_STATUS acpidev_process_object(acpidev_walk_info_t *infop, #define ACPIDEV_PROCESS_FLAG_OFFLINE 0x400 /* Put device into offline. */ #define ACPIDEV_PROCESS_FLAG_NOTAG 0x800 /* Skip tag dip with object. */ #define ACPIDEV_PROCESS_FLAG_SYNCSTATUS 0x1000 /* Sync object status. */ +#define ACPIDEV_PROCESS_FLAG_HOLDBRANCH 0x10000 /* Hold device branch. */ /* * Filter ACPI objects according to filter rules, generate devname if needed. @@ -346,10 +443,16 @@ extern void acpidev_data_destroy_handle(ACPI_HANDLE hdl); extern ACPI_HANDLE acpidev_data_get_object(acpidev_data_handle_t hdl); extern dev_info_t *acpidev_data_get_devinfo(acpidev_data_handle_t hdl); extern int acpidev_data_get_status(acpidev_data_handle_t hdl); +extern boolean_t acpidev_data_dr_capable(acpidev_data_handle_t hdl); +extern boolean_t acpidev_data_dr_ready(acpidev_data_handle_t hdl); +extern boolean_t acpidev_data_dr_failed(acpidev_data_handle_t hdl); extern void acpidev_data_set_flag(acpidev_data_handle_t hdl, uint32_t flag); extern void acpidev_data_clear_flag(acpidev_data_handle_t hdl, uint32_t flag); extern uint32_t acpidev_data_get_flag(acpidev_data_handle_t hdl, uint32_t flag); +/* ACPI system event handler has been registered. */ +#define ACPIDEV_DATA_HANDLER_READY 0x1 + /* * Try to generate meaningful device unit address from uid. * Return buf on success and NULL on failure. 
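On a failed hot-insert, the 0x90 "resources unavailable" status above is ORed with the bits naming the exhausted resources before being reported to firmware through acpidev_eval_ost(), declared in the next hunk. A small sketch composing such a status; the macro names are local copies of the values defined above:

#define OST_INSERT_NO_RESOURCE	0x90	/* ACPI_OST_STA_INSERT_NO_RESOURCE */
#define OST_INSERT_NO_MEM	0x1	/* ACPI_OST_STA_INSERT_NO_MEM */
#define OST_INSERT_NO_INTR	0x4	/* ACPI_OST_STA_INSERT_NO_INTR */

static unsigned
ost_insert_failure_sketch(int no_mem, int no_intr)
{
	unsigned status = OST_INSERT_NO_RESOURCE;

	if (no_mem)
		status |= OST_INSERT_NO_MEM;
	if (no_intr)
		status |= OST_INSERT_NO_INTR;
	/* e.g. both exhausted: 0x90 | 0x1 | 0x4 == 0x95 */
	return (status);
}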
@@ -430,6 +533,26 @@ typedef ACPI_STATUS (* acpidev_apic_walker_t)(ACPI_SUBTABLE_HEADER *, void *); extern ACPI_STATUS acpidev_walk_apic(ACPI_BUFFER *bufp, ACPI_HANDLE hdl, char *method, acpidev_apic_walker_t func, void *context); +/* + * Evaluate _OST method under object, which is used to support hotplug events. + * hdl: object handle + * code: _OST source event code + * status: _OST result status code + * bufp and len: optional third parameter for _OST. + */ +extern ACPI_STATUS acpidev_eval_ost(ACPI_HANDLE hdl, uint32_t code, + uint32_t status, char *bufp, size_t len); + +/* + * Evaluate _EJ0 method under object. + */ +extern ACPI_STATUS acpidev_eval_ej0(ACPI_HANDLE hdl); + +/* + * Evaluate _PXM method under object. + */ +extern ACPI_STATUS acpidev_eval_pxm(ACPI_HANDLE hdl, uint32_t *idp); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/sys/acpidev_dr.h b/usr/src/uts/i86pc/sys/acpidev_dr.h new file mode 100644 index 0000000000..6219f98bf1 --- /dev/null +++ b/usr/src/uts/i86pc/sys/acpidev_dr.h @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ +/* + * Interfaces to support System Board Dynamic Reconfiguration. + */ + +#ifndef _SYS_ACPIDEV_DR_H +#define _SYS_ACPIDEV_DR_H +#include <sys/types.h> +#include <sys/obpdefs.h> +#include <sys/cpuvar.h> +#include <sys/memlist.h> +#include <sys/sunddi.h> +#include <sys/acpi/acpi.h> +#include <sys/acpica.h> +#include <sys/acpidev.h> +#include <sys/acpidev_rsc.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* Maximum number of DR capable system boards supported. */ +#define ACPIDEV_DR_MAX_BOARDS 0x40 +#define ACPIDEV_DR_SEGS_PER_MEM_DEV 0x10 +#define ACPIDEV_DR_MEMLISTS_PER_SEG 0x10 +#define ACPIDEV_DR_MAX_MEMLIST_ENTRIES 0x10000 + +#define ACPIDEV_DR_PROP_PORTID "portid" +#define ACPIDEV_DR_PROP_BOARDNUM OBP_BOARDNUM +#define ACPIDEV_DR_PROP_DEVNAME OBP_NAME + +/* + * Format strings for DR capable system boards. + * They will be used as attachment point names. + */ +#define ACPIDEV_DR_CPU_BD_FMT "CPU%u" +#define ACPIDEV_DR_MEMORY_BD_FMT "MEM%u" +#define ACPIDEV_DR_IO_BD_FMT "IO%u" +#define ACPIDEV_DR_SYSTEM_BD_FMT "SB%u" + +typedef enum { + ACPIDEV_INVALID_BOARD = 0, + ACPIDEV_CPU_BOARD, + ACPIDEV_MEMORY_BOARD, + ACPIDEV_IO_BOARD, + ACPIDEV_SYSTEM_BOARD +} acpidev_board_type_t; + +/* Check whether the system is DR capable.
*/ +extern int acpidev_dr_capable(void); + +extern uint32_t acpidev_dr_max_boards(void); +extern uint32_t acpidev_dr_max_mem_units_per_board(void); +extern uint32_t acpidev_dr_max_io_units_per_board(void); +extern uint32_t acpidev_dr_max_cmp_units_per_board(void); +extern uint32_t acpidev_dr_max_cpu_units_per_cmp(void); +extern uint32_t acpidev_dr_max_segments_per_mem_device(void); +extern uint32_t acpidev_dr_max_memlists_per_segment(void); +extern ACPI_STATUS acpidev_dr_get_mem_alignment(ACPI_HANDLE hdl, uint64_t *ap); + +/* Initialize support of DR operations. */ +extern void acpidev_dr_init(void); + +/* Scan for DR capable boards and set up the environment for DR operations. */ +extern void acpidev_dr_check(acpidev_walk_info_t *infop); + +/* + * Initialize DR interfaces to enable DR operations. + */ +extern ACPI_STATUS acpidev_dr_initialize(dev_info_t *pdip); + +/* Get ACPI handle of the DR capable board. */ +extern ACPI_STATUS acpidev_dr_get_board_handle(uint_t board, + ACPI_HANDLE *hdlp); + +/* Get board type of the DR capable board. */ +extern acpidev_board_type_t acpidev_dr_get_board_type(ACPI_HANDLE hdl); + +/* Get board number of the DR capable board. */ +extern ACPI_STATUS acpidev_dr_get_board_number(ACPI_HANDLE hdl, + uint32_t *bnump); + +/* Get board name of the DR capable board. */ +extern ACPI_STATUS acpidev_dr_get_board_name(ACPI_HANDLE hdl, + char *buf, size_t len); + +/* Get attachment point of the DR capable board. */ +extern ACPI_STATUS acpidev_dr_get_attachment_point(ACPI_HANDLE hdl, + char *buf, size_t len); + +/* + * Figure out device type of the object/device. + * It only supports device types which may be involved in DR operations. + */ +extern acpidev_class_id_t acpidev_dr_device_get_class(ACPI_HANDLE hdl); + +/* Get memory device index/id. */ +extern ACPI_STATUS acpidev_dr_device_get_memory_index(ACPI_HANDLE hdl, + uint32_t *idxp); + +/* Check whether the device is a DR capable board or not. */ +extern int acpidev_dr_device_is_board(ACPI_HANDLE hdl); + +/* Check whether the device is present or not. */ +extern int acpidev_dr_device_is_present(ACPI_HANDLE hdl); + +/* Check whether the device is powered-on or not. */ +extern int acpidev_dr_device_is_powered(ACPI_HANDLE hdl); + +/* Check whether the device is DR capable. */ +extern int acpidev_dr_device_hotplug_capable(ACPI_HANDLE hdl); + +/* Check whether the device has an eject device list. */ +extern int acpidev_dr_device_has_edl(ACPI_HANDLE hdl); + +/* + * Simulate OBP property interfaces to support the drmach driver, + * so we can keep drmach consistent with the SPARC version. + * Return size of data copied to buf if it's big enough, + * otherwise return size of buffer needed. + */ +extern int acpidev_dr_device_getprop(ACPI_HANDLE hdl, char *name, + caddr_t buf, size_t len); + +/* + * Get "reg" or "assigned-address" property of the device. + * Return "assigned-address" property if assigned is non-zero, + * otherwise return "reg" property. + * Caller needs to release returned resources by calling + * acpidev_dr_device_free_regspec(). + */ +extern ACPI_STATUS acpidev_dr_device_get_regspec(ACPI_HANDLE hdl, + boolean_t assigned, acpidev_regspec_t **regpp, uint_t *cntp); + +/* Free resources returned by acpidev_dr_device_get_regspec(). */ +extern void acpidev_dr_device_free_regspec(acpidev_regspec_t *regp, + uint_t count); + +/* Walk devices in eject device list (ACPI _EDL method).
*/ +extern ACPI_STATUS acpidev_dr_device_walk_edl(ACPI_HANDLE hdl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval); + +/* Walk devices in eject dependency list (ACPI _EJD method). */ +extern ACPI_STATUS acpidev_dr_device_walk_ejd(ACPI_HANDLE hdl, + ACPI_WALK_CALLBACK cb, void *arg, void **retval); + +/* + * Walk child and dependent devices which may be involved in DR operations. + * PCI host bridges embedded in physical processors may be presented in eject + * device list instead of as children of processors. + */ +extern ACPI_STATUS acpidev_dr_device_walk_device(ACPI_HANDLE hdl, + uint_t max_lvl, ACPI_WALK_CALLBACK cb, void *arg, void **retval); + +/* Check whether the device is in working state without any error. */ +extern ACPI_STATUS acpidev_dr_device_check_status(ACPI_HANDLE hdl); + +/* Power on the device. */ +extern ACPI_STATUS acpidev_dr_device_poweron(ACPI_HANDLE hdl); + +/* Power off the device. */ +extern ACPI_STATUS acpidev_dr_device_poweroff(ACPI_HANDLE hdl); + +/* + * Create device nodes for hot-added devices under hdl. + * Return: + * AE_OK: on success + * AE_SUPPORT: if it's not capable of DR operation. + * AE_ERROR: for other errors + */ +extern ACPI_STATUS acpidev_dr_device_insert(ACPI_HANDLE hdl); + +/* + * Destroy device nodes to be removed under hdl. + * AE_OK: on success + * AE_SUPPORT: if it's not capable of DR operation. + * AE_ERROR: for other errors + */ +extern ACPI_STATUS acpidev_dr_device_remove(ACPI_HANDLE hdl); + +/* Block dynamic reconfiguration operations. */ +extern void acpidev_dr_lock_all(void); + +/* Unblock dynamic reconfiguration operations. */ +extern void acpidev_dr_unlock_all(void); + +extern ACPI_STATUS acpidev_dr_allocate_cpuid(ACPI_HANDLE hdl, + processorid_t *idp); +extern ACPI_STATUS acpidev_dr_free_cpuid(ACPI_HANDLE hdl); + +/* + * Query NUMA relative information for the CPU device. + * It returns APIC id, Proximity id and latency information of the CPU device. + * Latency information is retrieved from the ACPI _SLI method or the ACPI SLIT + * table. + */ +extern int acpidev_dr_get_cpu_numa_info(cpu_t *cp, void **hdlpp, + uint32_t *apicidp, uint32_t *pxmidp, uint32_t *slicntp, uchar_t **slipp); + +/* + * Release resources allocated by acpidev_dr_get_cpu_numa_info(). + */ +extern void acpidev_dr_free_cpu_numa_info(void *hdlp); + +/* + * Query NUMA relative information for a memory device. + * It returns proximity id and latency information of the memory device. + * Latency information is obtained from the ACPI _SLI method or the ACPI + * SLIT table. + */ +extern ACPI_STATUS acpidev_dr_get_mem_numa_info(ACPI_HANDLE hdl, + struct memlist *ml, void **hdlpp, uint32_t *pxmidp, + uint32_t *slicntp, uchar_t **slipp); + +/* + * Release resources allocated by acpidev_dr_get_mem_numa_info(). + */ +extern void acpidev_dr_free_mem_numa_info(void *hdlp); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACPIDEV_DR_H */ diff --git a/usr/src/uts/i86pc/sys/acpidev_impl.h b/usr/src/uts/i86pc/sys/acpidev_impl.h index 666ab5578d..c01ea9f5cb 100644 --- a/usr/src/uts/i86pc/sys/acpidev_impl.h +++ b/usr/src/uts/i86pc/sys/acpidev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
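acpidev_dr_device_walk_edl(), declared above, takes the standard ACPICA ACPI_WALK_CALLBACK. A sketch of counting the devices on a board's eject device list, assuming the ACPICA types from sys/acpi/acpi.h are in scope; the counting context is illustrative and error handling is elided:

static ACPI_STATUS
count_edl_dev_sketch(ACPI_HANDLE hdl, UINT32 lvl, void *ctx, void **retp)
{
	(void) hdl;
	(void) lvl;
	(void) retp;		/* unused in this sketch */
	(*(int *)ctx)++;	/* one more device on the _EDL list */
	return (AE_OK);
}

static int
count_edl_sketch(ACPI_HANDLE board)
{
	int cnt = 0;

	(void) acpidev_dr_device_walk_edl(board, count_edl_dev_sketch,
	    &cnt, NULL);
	return (cnt);
}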
*/ @@ -27,10 +27,13 @@ #define _SYS_ACPIDEV_IMPL_H #include <sys/types.h> #include <sys/cmn_err.h> +#include <sys/bitmap.h> +#include <sys/synch.h> #include <sys/sunddi.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> #include <sys/acpidev.h> +#include <sys/acpidev_dr.h> #ifdef __cplusplus extern "C" { @@ -54,12 +57,52 @@ struct acpidev_data_impl { dev_info_t *aod_dip; acpidev_class_t *aod_class; acpidev_class_list_t **aod_class_list; + acpidev_board_type_t aod_bdtype; /* Type of board. */ + uint32_t aod_bdnum; /* Board # for DR. */ + uint32_t aod_portid; /* Port id for DR. */ + uint32_t aod_bdidx; /* Index # of AP */ + volatile uint32_t aod_chidx; /* Index # of child */ + uint32_t aod_memidx; /* Index # of memory */ + acpidev_class_id_t aod_class_id; /* Dev type for DR. */ }; #define ACPIDEV_ODF_STATUS_VALID 0x1 #define ACPIDEV_ODF_DEVINFO_CREATED 0x2 #define ACPIDEV_ODF_DEVINFO_TAGGED 0x4 -#define ACPIDEV_ODF_DEVINFO_OFFLINE 0x8 +#define ACPIDEV_ODF_HOTPLUG_CAPABLE 0x100 +#define ACPIDEV_ODF_HOTPLUG_READY 0x200 +#define ACPIDEV_ODF_HOTPLUG_FAILED 0x400 + +#define ACPIDEV_DR_IS_BOARD(hdl) \ + ((hdl)->aod_iflag & ACPIDEV_ODF_HOTPLUG_CAPABLE) + +#define ACPIDEV_DR_SET_BOARD(hdl) \ + (hdl)->aod_iflag |= ACPIDEV_ODF_HOTPLUG_CAPABLE + +#define ACPIDEV_DR_IS_READY(hdl) \ + ((hdl)->aod_iflag & ACPIDEV_ODF_HOTPLUG_READY) + +#define ACPIDEV_DR_SET_READY(hdl) \ + (hdl)->aod_iflag |= ACPIDEV_ODF_HOTPLUG_READY + +#define ACPIDEV_DR_IS_FAILED(hdl) \ + ((hdl)->aod_iflag & ACPIDEV_ODF_HOTPLUG_FAILED) + +#define ACPIDEV_DR_SET_FAILED(hdl) \ + (hdl)->aod_iflag |= ACPIDEV_ODF_HOTPLUG_FAILED + +#define ACPIDEV_DR_IS_WORKING(hdl) \ + (((hdl)->aod_iflag & (ACPIDEV_ODF_HOTPLUG_READY | \ + ACPIDEV_ODF_HOTPLUG_FAILED)) == ACPIDEV_ODF_HOTPLUG_READY) + +#define ACPIDEV_DR_IS_PROCESSED(hdl) \ + ((hdl)->aod_iflag & (ACPIDEV_ODF_HOTPLUG_READY | \ + ACPIDEV_ODF_HOTPLUG_FAILED | ACPIDEV_ODF_HOTPLUG_CAPABLE)) + +#define ACPIDEV_DR_BOARD_READY(hdl) \ + (((hdl)->aod_iflag & \ + (ACPIDEV_ODF_HOTPLUG_READY | ACPIDEV_ODF_HOTPLUG_CAPABLE)) == \ + (ACPIDEV_ODF_HOTPLUG_READY | ACPIDEV_ODF_HOTPLUG_CAPABLE)) /* * List of registered device class drivers. @@ -83,6 +126,23 @@ typedef struct acpidev_pseudo_uid_head { acpidev_pseudo_uid_t *apuh_first; } acpidev_pseudo_uid_head_t; +typedef struct acpidev_dr_capacity { + uint_t cpu_vendor; + uint_t cpu_family; + uint_t cpu_model_min; + uint_t cpu_model_max; + uint_t cpu_step_min; + uint_t cpu_step_max; + boolean_t hotplug_supported; + uint64_t memory_alignment; +} acpidev_dr_capacity_t; + +extern int acpidev_dr_enable; +extern krwlock_t acpidev_class_lock; +extern ulong_t acpidev_object_type_mask[BT_BITOUL(ACPI_TYPE_NS_NODE_MAX + 1)]; +extern ACPI_TABLE_SRAT *acpidev_srat_tbl_ptr; +extern ACPI_TABLE_SLIT *acpidev_slit_tbl_ptr; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/sys/acpinex.h b/usr/src/uts/i86pc/sys/acpinex.h index be21c8e0e2..4aa6f19636 100644 --- a/usr/src/uts/i86pc/sys/acpinex.h +++ b/usr/src/uts/i86pc/sys/acpinex.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. 
*/ @@ -62,6 +62,10 @@ typedef struct { char ans_path[MAXPATHLEN]; } acpinex_softstate_t; +extern void acpinex_event_init(void); +extern void acpinex_event_fini(void); +extern int acpinex_event_scan(acpinex_softstate_t *, boolean_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index d0aea5d877..48500c4396 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -502,7 +502,7 @@ typedef struct apic_irq { * adjust APIC_PADSZ as we add/modify any member of apic_cpus_info. We also * don't want the compiler to optimize apic_cpus_info. */ -#define APIC_PADSZ 19 +#define APIC_PADSZ 15 #pragma pack(1) typedef struct apic_cpus_info { @@ -517,6 +517,7 @@ typedef struct apic_cpus_info { uchar_t aci_current[MAXIPL]; /* Current IRQ at each IPL */ uint32_t aci_bound; /* # of user requested binds ? */ uint32_t aci_temp_bound; /* # of non user IRQ binds */ + uint32_t aci_processor_id; /* Only used in ACPI mode. */ uchar_t aci_idle; /* The CPU is idle */ /* * Fill to make sure each struct is in separate 64-byte cache line. @@ -527,6 +528,8 @@ typedef struct apic_cpus_info { #define APIC_CPU_ONLINE 1 #define APIC_CPU_INTR_ENABLE 2 +#define APIC_CPU_FREE 4 /* APIC CPU slot is free */ +#define APIC_CPU_DIRTY 8 /* Slot was once used */ /* * APIC ops to support various flavors of APIC like APIC and x2APIC. @@ -784,6 +787,7 @@ extern int apic_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp, psm_intr_op_t intr_op, int *result); extern int apic_state(psm_state_request_t *); extern boolean_t apic_cpu_in_range(int cpu); +extern processorid_t apic_find_next_cpu_intr(void); extern int apic_check_msi_support(); extern apic_irq_t *apic_find_irq(dev_info_t *dip, struct intrspec *ispec, int type); @@ -849,6 +853,7 @@ extern int apic_diff_for_redistribution; extern int apic_poweroff_method; extern int apic_enable_acpi; extern int apic_nproc; +extern int apic_max_nproc; extern int apic_next_bind_cpu; extern int apic_redistribute_sample_interval; extern int apic_multi_msi_enable; diff --git a/usr/src/uts/i86pc/sys/dr.h b/usr/src/uts/i86pc/sys/dr.h new file mode 100644 index 0000000000..492eaeaae9 --- /dev/null +++ b/usr/src/uts/i86pc/sys/dr.h @@ -0,0 +1,500 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#ifndef _SYS_DR_H +#define _SYS_DR_H +#include <sys/types.h> +#include <sys/note.h> +#include <sys/processor.h> +#include <sys/obpdefs.h> +#include <sys/memlist.h> +#include <sys/mem_config.h> +#include <sys/param.h> /* for MAXPATHLEN */ +#include <sys/varargs.h> +#include <sys/sbd_ioctl.h> +#include <sys/dr_util.h> +#include <sys/drmach.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * helper macros for constructing and reporting internal error messages. + * NOTE: each module which uses one or more of these macros is expected + * to supply a char *dr_ie_fmt string containing the SCCS filename + * expansion macro (percent M percent) and a sprintf %d to render the + * line number argument. + */ +#define DR_INTERNAL_ERROR(hp) \ + drerr_new(1, ESBD_INTERNAL, dr_ie_fmt, __LINE__) + +#define DR_OP_INTERNAL_ERROR(hp) \ + drerr_set_c(CE_WARN, &(hp)->h_err, \ + ESBD_INTERNAL, dr_ie_fmt, __LINE__) + +#define DR_DEV_INTERNAL_ERROR(cp) \ + drerr_set_c(CE_WARN, &(cp)->sbdev_error, \ + ESBD_INTERNAL, dr_ie_fmt, __LINE__) + +/* + * Macros for keeping an error code and an associated list of integers. + */ +#define DR_MAX_ERR_INT (32) +#define DR_GET_E_CODE(sep) ((sep)->e_code) +#define DR_SET_E_CODE(sep, en) ((sep)->e_code = (en)) +#define DR_GET_E_RSC(sep) ((sep)->e_rsc) + +/* Number of device node types. */ +#define DR_MAXNUM_NT 3 + +/* used to map sbd_comp_type_t to array index */ +#define DEVSET_NIX(t) \ + (((t) == SBD_COMP_CPU) ? 0 : \ + ((t) == SBD_COMP_MEM) ? 1 : \ + ((t) == SBD_COMP_IO) ? 2 : \ + ((t) == SBD_COMP_CMP) ? 0 : DR_MAXNUM_NT) + +/* + * Format of dr_devset_t bit masks: + * + * 64 56 48 40 32 24 16 8 0 + * |....|IIII|IIII|IIII|IIII|MMMM|MMMM|CCCC|CCCC|CCCC|CCCC|CCCC|CCCC|CCCC|CCCC| + * + * 1 = indicates respective component present/attached. + * I = I/O, M = Memory, C = CPU. + */ +#define DEVSET_CPU_OFFSET 0 +#define DEVSET_CPU_NUMBER 32 +#define DEVSET_MEM_OFFSET (DEVSET_CPU_OFFSET + DEVSET_CPU_NUMBER) +#define DEVSET_MEM_NUMBER 8 +#define DEVSET_IO_OFFSET (DEVSET_MEM_OFFSET + DEVSET_MEM_NUMBER) +#define DEVSET_IO_NUMBER 16 +#define DEVSET_MAX_BITS (DEVSET_IO_OFFSET + DEVSET_IO_NUMBER) + +#define DEVSET_BIX(t) \ + (((t) == SBD_COMP_CPU) ? DEVSET_CPU_OFFSET : \ + ((t) == SBD_COMP_MEM) ? DEVSET_MEM_OFFSET : \ + ((t) == SBD_COMP_IO) ? DEVSET_IO_OFFSET : \ + ((t) == SBD_COMP_CMP) ? DEVSET_CPU_OFFSET : 0) + +#define DEVSET_NT2DEVPOS(t, u) (((t) == SBD_COMP_CMP) ?\ + (DEVSET_BIX(t) + (u) * MAX_CORES_PER_CMP) : DEVSET_BIX(t) + (u)) + +#if (DEVSET_MAX_BITS <= 64) typedef uint64_t dr_devset_t; + +#define DEVSET_ONEUNIT ((dr_devset_t)1) +#define DEVSET_ANYUNIT ((dr_devset_t)(-1)) +#define DEVSET_CPU_NMASK ((dr_devset_t)((1ULL << DEVSET_CPU_NUMBER) - 1)) +#define DEVSET_MEM_NMASK ((dr_devset_t)((1ULL << DEVSET_MEM_NUMBER) - 1)) +#define DEVSET_IO_NMASK ((dr_devset_t)((1ULL << DEVSET_IO_NUMBER) - 1)) +#define DEVSET_CMP_NMASK ((dr_devset_t)((1ULL << MAX_CORES_PER_CMP) - 1)) + +#define DEVSET_NMASK(t) \ + (((t) == SBD_COMP_CPU) ? DEVSET_CPU_NMASK : \ + ((t) == SBD_COMP_MEM) ? DEVSET_MEM_NMASK : \ + ((t) == SBD_COMP_IO) ? DEVSET_IO_NMASK : \ + ((t) == SBD_COMP_CMP) ? DEVSET_CPU_NMASK : 0) + +#define DEVSET_MASK \ + ((DEVSET_CPU_NMASK << DEVSET_CPU_OFFSET) | \ + (DEVSET_MEM_NMASK << DEVSET_MEM_OFFSET) | \ + (DEVSET_IO_NMASK << DEVSET_IO_OFFSET)) + +#define DEVSET(t, u) \ + (((u) == DEVSET_ANYUNIT) ? \ + ((DEVSET_NMASK(t) << DEVSET_NT2DEVPOS((t), 0)) & \ + DEVSET_MASK) : \ + ((t) == SBD_COMP_CMP) ?
\ + (DEVSET_CMP_NMASK << DEVSET_NT2DEVPOS((t), (u))) : \ + (DEVSET_ONEUNIT << DEVSET_NT2DEVPOS((t), (u)))) + +#define DEVSET_IS_NULL(ds) ((ds) == 0) +#define DEVSET_IN_SET(ds, t, u) (((ds) & DEVSET((t), (u))) != 0) +#define DEVSET_ADD(ds, t, u) ((ds) |= DEVSET((t), (u))) +#define DEVSET_DEL(ds, t, u) ((ds) &= ~DEVSET((t), (u))) +#define DEVSET_AND(ds1, ds2) ((ds1) & (ds2)) +#define DEVSET_OR(ds1, ds2) ((ds1) | (ds2)) +#define DEVSET_NAND(ds1, ds2) ((ds1) & ~(ds2)) +#define DEVSET_GET_UNITSET(ds, t) \ + (((ds) & DEVSET((t), DEVSET_ANYUNIT)) >> DEVSET_NT2DEVPOS((t), 0)) +#define DEVSET_FMT_STR "0x%" PRIx64 "" +#define DEVSET_FMT_ARG(ds) (ds) +#else /* DEVSET_MAX_BITS <= 64 */ +#error please implement devset with bitmap to support more than 64 devices +#endif /* DEVSET_MAX_BITS <= 64 */ + +/* + * Ops for dr_board_t.b_dev_* + */ +#define DR_DEV_IS(ds, cp) DEVSET_IN_SET( \ + (cp)->sbdev_bp->b_dev_##ds, \ + (cp)->sbdev_type, \ + (cp)->sbdev_unum) + +#define DR_DEV_ADD(ds, cp) DEVSET_ADD( \ + (cp)->sbdev_bp->b_dev_##ds, \ + (cp)->sbdev_type, \ + (cp)->sbdev_unum) + +#define DR_DEV_DEL(ds, cp) DEVSET_DEL( \ + (cp)->sbdev_bp->b_dev_##ds, \ + (cp)->sbdev_type, \ + (cp)->sbdev_unum) + +/* + * Ops for dr_board_t.b_dev_present + */ +#define DR_DEV_IS_PRESENT(cp) DR_DEV_IS(present, cp) +#define DR_DEV_SET_PRESENT(cp) DR_DEV_ADD(present, cp) +#define DR_DEV_CLR_PRESENT(cp) DR_DEV_DEL(present, cp) + +/* + * Ops for dr_board_t.b_dev_attached + */ +#define DR_DEV_IS_ATTACHED(cp) DR_DEV_IS(attached, cp) +#define DR_DEV_SET_ATTACHED(cp) DR_DEV_ADD(attached, cp) +#define DR_DEV_CLR_ATTACHED(cp) DR_DEV_DEL(attached, cp) + +/* + * Ops for dr_board_t.b_dev_released + */ +#define DR_DEV_IS_RELEASED(cp) DR_DEV_IS(released, cp) +#define DR_DEV_SET_RELEASED(cp) DR_DEV_ADD(released, cp) +#define DR_DEV_CLR_RELEASED(cp) DR_DEV_DEL(released, cp) + +/* + * Ops for dr_board_t.b_dev_unreferenced + */ +#define DR_DEV_IS_UNREFERENCED(cp) DR_DEV_IS(unreferenced, cp) +#define DR_DEV_SET_UNREFERENCED(cp) DR_DEV_ADD(unreferenced, cp) +#define DR_DEV_CLR_UNREFERENCED(cp) DR_DEV_DEL(unreferenced, cp) + +#define DR_DEVS_PRESENT(bp) \ + ((bp)->b_dev_present) +#define DR_DEVS_ATTACHED(bp) \ + ((bp)->b_dev_attached) +#define DR_DEVS_RELEASED(bp) \ + ((bp)->b_dev_released) +#define DR_DEVS_UNREFERENCED(bp) \ + ((bp)->b_dev_unreferenced) +#define DR_DEVS_UNATTACHED(bp) \ + ((bp)->b_dev_present & ~(bp)->b_dev_attached) +#define DR_DEVS_CONFIGURE(bp, devs) \ + ((bp)->b_dev_attached = (devs)) +#define DR_DEVS_DISCONNECT(bp, devs) \ + ((bp)->b_dev_present &= ~(devs)) +#define DR_DEVS_CANCEL(bp, devs) \ + ((bp)->b_dev_released &= ~(devs), \ + (bp)->b_dev_unreferenced &= ~(devs)) + +/* + * CMP Specific Helpers + */ +#define DR_CMP_CORE_UNUM(cmp, core) ((cmp) * MAX_CORES_PER_CMP + (core)) + +/* + * For CPU and CMP devices, DR_UNUM2SBD_UNUM is used to extract the physical + * CPU/CMP id from the device id. + */ +#define DR_UNUM2SBD_UNUM(n, d) \ + ((d) == SBD_COMP_CPU ? ((n) / MAX_CORES_PER_CMP) : \ + (d) == SBD_COMP_CMP ? ((n) / MAX_CORES_PER_CMP) : (n)) + +/* + * Some stuff to assist in debug.
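A worked example of the dr_devset_t layout defined above: CPUs occupy bits 0-31, memory units bits 32-39, and I/O units bits 40-55, so marking CPU unit 3 and memory unit 1 present sets bits 3 and 33. A standalone check, assuming a non-CMP configuration where DEVSET_NT2DEVPOS reduces to offset + unit:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t devset = 0;

	devset |= UINT64_C(1) << (0 + 3);	/* DEVSET(SBD_COMP_CPU, 3) */
	devset |= UINT64_C(1) << (32 + 1);	/* DEVSET(SBD_COMP_MEM, 1) */
	(void) printf("0x%" PRIx64 "\n", devset);	/* prints 0x200000008 */
	return (0);
}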
+ */ +#ifdef DEBUG +#define DRDBG_STATE 0x00000001 +#define DRDBG_QR 0x00000002 +#define DRDBG_CPU 0x00000004 +#define DRDBG_MEM 0x00000008 +#define DRDBG_IO 0x00000010 + +#define PR_ALL if (dr_debug) printf +#define PR_STATE if (dr_debug & DRDBG_STATE) printf +#define PR_QR if (dr_debug & DRDBG_QR) prom_printf +#define PR_CPU if (dr_debug & DRDBG_CPU) printf +#define PR_MEM if (dr_debug & DRDBG_MEM) printf +#define PR_IO if (dr_debug & DRDBG_IO) printf +#define PR_MEMLIST_DUMP if (dr_debug & DRDBG_MEM) MEMLIST_DUMP + +extern uint_t dr_debug; +#else /* DEBUG */ +#define PR_ALL _NOTE(CONSTANTCONDITION) if (0) printf +#define PR_STATE PR_ALL +#define PR_QR PR_ALL +#define PR_CPU PR_ALL +#define PR_MEM PR_ALL +#define PR_IO PR_ALL +#define PR_MEMLIST_DUMP _NOTE(CONSTANTCONDITION) if (0) MEMLIST_DUMP + +#endif /* DEBUG */ + +/* + * dr_board_t b_sflags. + */ +#define DR_BSLOCK 0x01 /* for blocking status (protected by b_slock) */ + +typedef const char *fn_t; + +/* + * Unsafe devices based on dr.conf prop "unsupported-io-drivers" + */ +typedef struct { + char **devnames; + uint_t ndevs; +} dr_unsafe_devs_t; + +/* + * Device states. + * PARTIAL state is really only relevant for board state. + */ +typedef enum { + DR_STATE_EMPTY = 0, + DR_STATE_OCCUPIED, + DR_STATE_CONNECTED, + DR_STATE_UNCONFIGURED, + DR_STATE_PARTIAL, /* part connected, part configured */ + DR_STATE_CONFIGURED, + DR_STATE_RELEASE, + DR_STATE_UNREFERENCED, + DR_STATE_FATAL, + DR_STATE_MAX +} dr_state_t; + +typedef struct dr_handle { + struct dr_board *h_bd; + sbd_error_t *h_err; + int h_op_intr; /* nz if op interrupted */ + dev_t h_dev; /* dev_t of opened device */ + int h_cmd; /* PIM ioctl argument */ + int h_mode; /* device open mode */ + sbd_cmd_t h_sbdcmd; /* copied-in ioctl cmd struct */ + sbd_ioctl_arg_t *h_iap; /* ptr to caller-space cmd struct */ + dr_devset_t h_devset; /* based on h_dev */ + uint_t h_ndi; + drmach_opts_t h_opts; /* command-line platform options */ +} dr_handle_t; + +typedef struct dr_common_unit { + dr_state_t sbdev_state; + sbd_state_t sbdev_ostate; + sbd_cond_t sbdev_cond; + time_t sbdev_time; + int sbdev_busy; + struct dr_board *sbdev_bp; + int sbdev_unum; + sbd_comp_type_t sbdev_type; + drmachid_t sbdev_id; + char sbdev_path[MAXNAMELEN]; + sbd_error_t *sbdev_error; +} dr_common_unit_t; + +typedef struct dr_mem_unit { + dr_common_unit_t sbm_cm; /* mem-unit state */ + uint_t sbm_flags; + pfn_t sbm_basepfn; + pgcnt_t sbm_npages; + pgcnt_t sbm_pageslost; + struct memlist *sbm_dyn_segs; /* kphysm_add_dynamic segs */ + /* + * The following fields are used during + * the memory detach process only. sbm_mlist + * will be used to store the board memlist + * following a detach. The memlist will be + * used to re-attach the board when configuring + * the unit directly after an unconfigure. + */ + struct dr_mem_unit *sbm_peer; + struct memlist *sbm_mlist; + struct memlist *sbm_del_mlist; + memhandle_t sbm_memhandle; + uint64_t sbm_alignment_mask; + uint64_t sbm_slice_base; + uint64_t sbm_slice_top; + uint64_t sbm_slice_size; +} dr_mem_unit_t; + +/* + * Currently only maintain state information for individual + * components. 
+ */ +typedef struct dr_cpu_unit { + dr_common_unit_t sbc_cm; /* cpu-unit state */ + processorid_t sbc_cpu_id; + cpu_flag_t sbc_cpu_flags; /* snapshot of CPU flags */ + ushort_t sbc_pad1; /* padded for compatibility */ + int sbc_speed; + int sbc_ecache; + int sbc_cpu_impl; +} dr_cpu_unit_t; + +typedef struct dr_io_unit { + dr_common_unit_t sbi_cm; /* io-unit state */ +} dr_io_unit_t; + +typedef union { + dr_common_unit_t du_common; + dr_mem_unit_t du_mem; + dr_cpu_unit_t du_cpu; + dr_io_unit_t du_io; +} dr_dev_unit_t; + +typedef struct dr_board { + kmutex_t b_lock; /* lock for this board struct */ + kmutex_t b_slock; /* lock for status on the board */ + kcondvar_t b_scv; /* condvar for status on the board */ + int b_sflags; /* for serializing status */ + sbd_state_t b_rstate; /* board's cfgadm receptacle state */ + sbd_state_t b_ostate; /* board's cfgadm occupant state */ + sbd_cond_t b_cond; /* cfgadm condition */ + int b_busy; + int b_assigned; + time_t b_time; /* time of last board operation */ + char b_type[MAXNAMELEN]; + drmachid_t b_id; + int b_num; /* board number */ + int b_ndev; /* # of devices on board */ + dev_info_t *b_dip; /* dip for make-nodes */ + dr_state_t b_state; /* board DR state */ + dr_devset_t b_dev_present; /* present mask */ + dr_devset_t b_dev_attached; /* attached mask */ + dr_devset_t b_dev_released; /* released mask */ + dr_devset_t b_dev_unreferenced; /* unreferenced mask */ + char b_path[MAXNAMELEN]; + dr_dev_unit_t *b_dev[DR_MAXNUM_NT]; +} dr_board_t; + +/* + * dr_quiesce.c interfaces + */ +struct dr_sr_handle; +typedef struct dr_sr_handle dr_sr_handle_t; + +extern dr_sr_handle_t *dr_get_sr_handle(dr_handle_t *handle); +extern void dr_release_sr_handle(dr_sr_handle_t *srh); +extern int dr_suspend(dr_sr_handle_t *srh); +extern void dr_resume(dr_sr_handle_t *srh); +extern void dr_check_devices(dev_info_t *dip, int *refcount, + dr_handle_t *handle, uint64_t *arr, int *idx, + int len, int *refcount_non_gldv3); +extern int dr_pt_test_suspend(dr_handle_t *hp); + +/* + * dr_cpu.c interface + */ +extern void dr_init_cpu_unit(dr_cpu_unit_t *cp); +extern int dr_pre_attach_cpu(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_attach_cpu(dr_handle_t *hp, dr_common_unit_t *cp); +extern int dr_post_attach_cpu(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_pre_release_cpu(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_pre_detach_cpu(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_detach_cpu(dr_handle_t *hp, dr_common_unit_t *cp); +extern int dr_post_detach_cpu(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_cpu_status(dr_handle_t *hp, dr_devset_t devset, + sbd_dev_stat_t *dsp); +extern int dr_cancel_cpu(dr_cpu_unit_t *cp); +extern int dr_disconnect_cpu(dr_cpu_unit_t *cp); + + +/* + * dr_mem.c interface + */ +extern void dr_init_mem_unit(dr_mem_unit_t *mp); +extern int dr_pre_attach_mem(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_attach_mem(dr_handle_t *hp, dr_common_unit_t *cp); +extern int dr_post_attach_mem(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_pre_release_mem(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_release_mem(dr_common_unit_t *cp); +extern void dr_release_mem_done(dr_common_unit_t *cp); +extern int dr_pre_detach_mem(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_detach_mem(dr_handle_t *, 
dr_common_unit_t *); +extern int dr_post_detach_mem(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_mem_status(dr_handle_t *hp, dr_devset_t devset, + sbd_dev_stat_t *dsp); +extern int dr_cancel_mem(dr_mem_unit_t *mp); +extern int dr_disconnect_mem(dr_mem_unit_t *mp); + +/* + * dr_io.c interface + */ +extern void dr_init_io_unit(dr_io_unit_t *io); +extern int dr_pre_attach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_attach_io(dr_handle_t *hp, dr_common_unit_t *cp); +extern int dr_post_attach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_pre_release_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_pre_detach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern void dr_detach_io(dr_handle_t *hp, dr_common_unit_t *cp); +extern int dr_post_detach_io(dr_handle_t *hp, + dr_common_unit_t **devlist, int devnum); +extern int dr_io_status(dr_handle_t *hp, dr_devset_t devset, + sbd_dev_stat_t *dsp); +extern int dr_disconnect_io(dr_io_unit_t *ip); + + +/* + * dr.c interface + */ +extern void dr_op_err(int ce, dr_handle_t *hp, int code, char *fmt, ...); +extern void dr_dev_err(int ce, dr_common_unit_t *cp, int code); + +extern dr_cpu_unit_t *dr_get_cpu_unit(dr_board_t *bp, int unit_num); +extern dr_mem_unit_t *dr_get_mem_unit(dr_board_t *bp, int unit_num); +extern dr_io_unit_t *dr_get_io_unit(dr_board_t *bp, int unit_num); + +extern dr_board_t *dr_lookup_board(int board_num); +extern int dr_release_dev_done(dr_common_unit_t *cp); +extern char *dr_nt_to_dev_type(int type); +extern void dr_device_transition(dr_common_unit_t *cp, + dr_state_t new_state); +extern void dr_lock_status(dr_board_t *bp); +extern void dr_unlock_status(dr_board_t *bp); +extern int dr_cmd_flags(dr_handle_t *hp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DR_H */ diff --git a/usr/src/uts/i86pc/sys/dr_util.h b/usr/src/uts/i86pc/sys/dr_util.h new file mode 100644 index 0000000000..46a66c759b --- /dev/null +++ b/usr/src/uts/i86pc/sys/dr_util.h @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DR_UTIL_H_ +#define _SYS_DR_UTIL_H_ +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/memlist.h> +#include <sys/varargs.h> +#include <sys/sbd_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define GETSTRUCT(t, n) \ + ((t *)kmem_zalloc((size_t)(n) * sizeof (t), KM_SLEEP)) +#define FREESTRUCT(p, t, n) \ + (kmem_free((caddr_t)(p), sizeof (t) * (size_t)(n))) + +#define DRERR_SET_C(epps, eppn) \ + if (*(epps) == NULL) \ + *(epps) = *(eppn); \ + else \ + sbd_err_clear(eppn) + +#ifdef DEBUG +#define MEMLIST_DUMP(ml) memlist_dump(ml) +#else +#define MEMLIST_DUMP(ml) +#endif + +extern sbd_error_t *sbd_err_new(int e_code, char *fmt, va_list args); +extern void sbd_err_log(sbd_error_t *ep, int ce); +extern void sbd_err_clear(sbd_error_t **ep); +extern void sbd_err_set_c(sbd_error_t **ep, int ce, + int e_code, char *fmt, ...); +extern void sbd_err_set(sbd_error_t **ep, int ce, + int e_code, char *fmt, ...); + +extern sbd_error_t *drerr_new(int log, int e_code, char *fmt, ...); +extern sbd_error_t *drerr_new_v(int e_code, char *fmt, va_list args); +extern void drerr_set_c(int log, sbd_error_t **ep, + int e_code, char *fmt, ...); + +extern void dr_memlist_delete(struct memlist *mlist); +extern void memlist_dump(struct memlist *mlist); +extern int dr_memlist_intersect(struct memlist *al, + struct memlist *bl); +extern void dr_memlist_coalesce(struct memlist *mlist); +extern struct memlist *dr_memlist_dup(struct memlist *mlist); +extern struct memlist *dr_memlist_add_span(struct memlist *mlist, + uint64_t base, uint64_t len); +extern struct memlist *dr_memlist_del_span(struct memlist *mlist, + uint64_t base, uint64_t len); +extern struct memlist *dr_memlist_cat_span(struct memlist *mlist, + uint64_t base, uint64_t len); + +/* + * These are all utilities internal for DR. There are + * similar functions in common/os which have similar names. + * We rename them to make sure there is no name space + * conflict. + */ +#define memlist_delete dr_memlist_delete +#define memlist_intersect dr_memlist_intersect +#define memlist_coalesce dr_memlist_coalesce +#define memlist_dup dr_memlist_dup +#define memlist_add_span dr_memlist_add_span +#define memlist_del_span dr_memlist_del_span +#define memlist_cat_span dr_memlist_cat_span + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DR_UTIL_H_ */ diff --git a/usr/src/uts/i86pc/sys/drmach.h b/usr/src/uts/i86pc/sys/drmach.h new file mode 100644 index 0000000000..2d2a2cff7b --- /dev/null +++ b/usr/src/uts/i86pc/sys/drmach.h @@ -0,0 +1,177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. 
+ * All rights reserved. + */ + +#ifndef _SYS_DRMACH_H_ +#define _SYS_DRMACH_H_ +#include <sys/types.h> +#include <sys/memlist.h> +#include <sys/ddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/sysevent.h> +#include <sys/x86_archext.h> +#include <sys/sbd_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_BOARDS drmach_max_boards() +#define MAX_MEM_UNITS_PER_BOARD drmach_max_mem_units_per_board() +#define MAX_IO_UNITS_PER_BOARD drmach_max_io_units_per_board() +#define MAX_CMP_UNITS_PER_BOARD drmach_max_cmp_units_per_board() + +/* DR uses MAX_CORES_PER_CMP as number of logical CPUs within a CMP. */ +#define MAX_CORES_PER_CMP drmach_max_core_per_cmp() + +/* Maximum possible logical CPUs per board. */ +#define MAX_CPU_UNITS_PER_BOARD (MAX_CMP_UNITS_PER_BOARD * MAX_CORES_PER_CMP) + +/* Check whether CPU is CMP. True if chip has more than one core/thread. */ +#define CPU_IMPL_IS_CMP(impl) (MAX_CORES_PER_CMP > 1) + +/* CPU implementation ID for Intel Nehalem CPU. */ +#define X86_CPU_IMPL_NEHALEM_EX 0x062E0000 +#define X86_CPU_IMPL_UNKNOWN 0x00000000 + +/* returned with drmach_board_find_devices callback */ +#define DRMACH_DEVTYPE_CPU "cpu" +#define DRMACH_DEVTYPE_MEM "memory" +#define DRMACH_DEVTYPE_PCI "pci" + +/* + * x86 platform specific routines currently only defined + * in drmach_acpi.c and referenced by DR. + */ + +typedef void *drmachid_t; + +typedef struct { + boolean_t assigned; + boolean_t powered; + boolean_t configured; + boolean_t busy; + boolean_t empty; + sbd_cond_t cond; + char type[SBD_TYPE_LEN]; + char info[SBD_MAX_INFO]; +} drmach_status_t; + +typedef struct { + int size; + char *copts; +} drmach_opts_t; + +typedef struct { + uint64_t mi_basepa; + uint64_t mi_size; + uint64_t mi_slice_base; + uint64_t mi_slice_top; + uint64_t mi_slice_size; + uint64_t mi_alignment_mask; +} drmach_mem_info_t; + +extern uint_t drmach_max_boards(void); +extern uint_t drmach_max_io_units_per_board(void); +extern uint_t drmach_max_cmp_units_per_board(void); +extern uint_t drmach_max_mem_units_per_board(void); +extern uint_t drmach_max_core_per_cmp(void); + +extern sbd_error_t *drmach_get_dip(drmachid_t id, dev_info_t **dip); +extern sbd_error_t *drmach_release(drmachid_t id); +extern sbd_error_t *drmach_pre_op(int cmd, drmachid_t id, + drmach_opts_t *opts, void *devsetp); +extern sbd_error_t *drmach_post_op(int cmd, drmachid_t id, + drmach_opts_t *opts, int rv); +extern sbd_error_t *drmach_configure(drmachid_t id, int flags); +extern sbd_error_t *drmach_unconfigure(drmachid_t id, int flags); +extern sbd_error_t *drmach_status(drmachid_t id, drmach_status_t *stat); +extern sbd_error_t *drmach_passthru(drmachid_t id, + drmach_opts_t *opts); + +extern sbd_error_t *drmach_board_find_devices(drmachid_t id, void *a, + sbd_error_t *(*found)(void *a, const char *, int, drmachid_t)); +extern int drmach_board_lookup(int bnum, drmachid_t *id); +extern sbd_error_t *drmach_board_name(int bnum, char *buf, int buflen); +extern sbd_error_t *drmach_board_assign(int bnum, drmachid_t *id); +extern sbd_error_t *drmach_board_unassign(drmachid_t id); +extern sbd_error_t *drmach_board_poweroff(drmachid_t id); +extern sbd_error_t *drmach_board_poweron(drmachid_t id); +extern sbd_error_t *drmach_board_test(drmachid_t id, drmach_opts_t *opts, + int force); +extern sbd_error_t *drmach_board_connect(drmachid_t id, + drmach_opts_t *opts); +extern sbd_error_t *drmach_board_disconnect(drmachid_t id, + drmach_opts_t *opts); +extern sbd_error_t 
*drmach_board_deprobe(drmachid_t id); +extern int drmach_board_is_floating(drmachid_t); + +extern sbd_error_t *drmach_cpu_get_id(drmachid_t id, processorid_t *cpuid); +extern sbd_error_t *drmach_cpu_get_impl(drmachid_t id, int *ip); +extern sbd_error_t *drmach_cpu_disconnect(drmachid_t id); + +extern sbd_error_t *drmach_io_is_attached(drmachid_t id, int *yes); +extern sbd_error_t *drmach_io_post_attach(drmachid_t id); +extern sbd_error_t *drmach_io_pre_release(drmachid_t id); +extern sbd_error_t *drmach_io_unrelease(drmachid_t id); +extern sbd_error_t *drmach_io_post_release(drmachid_t id); + +extern sbd_error_t *drmach_mem_get_slice_info(drmachid_t id, + uint64_t *basepa, uint64_t *endpa, + uint64_t *sizep); +extern sbd_error_t *drmach_mem_get_memlist(drmachid_t id, + struct memlist **ml); +extern sbd_error_t *drmach_mem_get_info(drmachid_t, drmach_mem_info_t *); +extern sbd_error_t *drmach_mem_enable(drmachid_t id); +extern sbd_error_t *drmach_mem_disable(drmachid_t id); +extern sbd_error_t *drmach_mem_add_span(drmachid_t id, + uint64_t basepa, uint64_t size); +extern sbd_error_t *drmach_mem_del_span(drmachid_t id, + uint64_t basepa, uint64_t size); +extern sbd_error_t *drmach_copy_rename_init( + drmachid_t dst_id, drmachid_t src_id, + struct memlist *src_copy_ml, + drmachid_t *pgm_id); +extern sbd_error_t *drmach_copy_rename_fini(drmachid_t id); +extern void drmach_copy_rename(drmachid_t id); +extern int drmach_copy_rename_need_suspend(drmachid_t id); + +extern int drmach_log_sysevent(int board, char *hint, int flag, + int verbose); + +extern int drmach_verify_sr(dev_info_t *dip, int sflag); +extern void drmach_suspend_last(); +extern void drmach_resume_first(); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DRMACH_H_ */ diff --git a/usr/src/uts/i86pc/sys/fastboot_msg.h b/usr/src/uts/i86pc/sys/fastboot_msg.h index a35fcf2f67..4c7ee7f8b5 100644 --- a/usr/src/uts/i86pc/sys/fastboot_msg.h +++ b/usr/src/uts/i86pc/sys/fastboot_msg.h @@ -45,6 +45,7 @@ fastboot_nosup_msg(FBNS_DEFAULT, "") fastboot_nosup_msg(FBNS_SUSPEND, " after suspend/resume") fastboot_nosup_msg(FBNS_FMAHWERR, " due to FMA recovery from hardware error") +fastboot_nosup_msg(FBNS_HOTPLUG, " after DR operations") /* * Should ALWAYS be the last one. diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index b652331715..67b9a07439 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -63,6 +63,7 @@ typedef void *cpu_pri_lev_t; struct cpuid_info; struct cpu_ucode_info; +struct cmi_hdl; /* * A note about the hypervisor affinity bits: a one bit in the affinity mask @@ -135,7 +136,9 @@ struct machcpu { struct cpu_ucode_info *mcpu_ucode_info; - void *mcpu_pm_mach_state; + void *mcpu_pm_mach_state; + struct cmi_hdl *mcpu_cmi_hdl; + void *mcpu_mach_ctx_ptr; /* * A stamp that is unique per processor and changes diff --git a/usr/src/uts/i86pc/sys/machsystm.h b/usr/src/uts/i86pc/sys/machsystm.h index b98a48326e..8c79243a29 100644 --- a/usr/src/uts/i86pc/sys/machsystm.h +++ b/usr/src/uts/i86pc/sys/machsystm.h @@ -23,6 +23,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ #ifndef _SYS_MACHSYSTM_H #define _SYS_MACHSYSTM_H @@ -51,11 +55,30 @@ extern "C" { #ifdef _KERNEL +typedef enum mach_cpu_add_arg_type { + MACH_CPU_ARG_LOCAL_APIC, + MACH_CPU_ARG_LOCAL_X2APIC, +} mach_cpu_add_arg_type_t; + +typedef struct mach_cpu_add_arg { + mach_cpu_add_arg_type_t type; + union { + struct { + uint32_t apic_id; + uint32_t proc_id; + } apic; + } arg; +} mach_cpu_add_arg_t; + extern void mach_cpu_idle(void); extern void mach_cpu_halt(char *); extern int mach_cpu_start(cpu_t *, void *); extern int mach_cpuid_start(processorid_t, void *); +extern int mach_cpu_stop(cpu_t *, void *); +extern int mach_cpu_add(mach_cpu_add_arg_t *, processorid_t *); +extern int mach_cpu_remove(processorid_t); extern int mach_cpu_create_device_node(cpu_t *, dev_info_t **); +extern int mach_cpu_get_device_node(cpu_t *, dev_info_t **); extern int Cpudelay; extern void setcpudelay(void); @@ -120,10 +143,16 @@ extern int use_mp; extern struct cpu cpus[]; /* pointer to other cpus */ extern struct cpu *cpu[]; /* pointer to all cpus */ +/* Operation types for extended mach_cpucontext interfaces */ +#define MACH_CPUCONTEXT_OP_START 0 +#define MACH_CPUCONTEXT_OP_STOP 1 + extern int mach_cpucontext_init(void); extern void mach_cpucontext_fini(void); extern void *mach_cpucontext_alloc(struct cpu *); extern void mach_cpucontext_free(struct cpu *, void *, int); +extern void *mach_cpucontext_xalloc(struct cpu *, int); +extern void mach_cpucontext_xfree(struct cpu *, void *, int, int); extern void rmp_gdt_init(rm_platter_t *); extern uintptr_t hole_start, hole_end; @@ -145,6 +174,52 @@ extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp); extern int force_shutdown_method; +/* Dynamic Reconfiguration capability interface. */ +#define PLAT_DR_OPTIONS_NAME "plat-dr-options" +#define PLAT_DR_PHYSMAX_NAME "plat-dr-physmax" +#define PLAT_MAX_NCPUS_NAME "plat-max-ncpus" +#define BOOT_MAX_NCPUS_NAME "boot-max-ncpus" +#define BOOT_NCPUS_NAME "boot-ncpus" + +#define PLAT_DR_FEATURE_CPU 0x1 +#define PLAT_DR_FEATURE_MEMORY 0x2 +#define PLAT_DR_FEATURE_ENABLED 0x1000000 + +#define plat_dr_enabled() \ + plat_dr_check_capability(PLAT_DR_FEATURE_ENABLED) + +#define plat_dr_enable() \ + plat_dr_enable_capability(PLAT_DR_FEATURE_ENABLED) + +#define plat_dr_disable_cpu() \ + plat_dr_disable_capability(PLAT_DR_FEATURE_CPU) +#define plat_dr_disable_memory() \ + plat_dr_disable_capability(PLAT_DR_FEATURE_MEMORY) + +extern boolean_t plat_dr_support_cpu(void); +extern boolean_t plat_dr_support_memory(void); +extern boolean_t plat_dr_check_capability(uint64_t features); +extern void plat_dr_enable_capability(uint64_t features); +extern void plat_dr_disable_capability(uint64_t features); + +#pragma weak plat_dr_support_cpu +#pragma weak plat_dr_support_memory + +/* + * Used to communicate DR updates to platform lgroup framework + */ +typedef struct { + uint64_t u_base; + uint64_t u_length; + uint32_t u_domain; + uint32_t u_device_id; + uint32_t u_sli_cnt; + uchar_t *u_sli_ptr; +} update_membounds_t; + +/* Maximum physical page number (PFN) for memory DR operations. */ +extern uint64_t plat_dr_physmax; + #ifdef __xpv #include <sys/xen_mmu.h> extern page_t *page_get_high_mfn(mfn_t); diff --git a/usr/src/uts/i86pc/sys/memnode.h b/usr/src/uts/i86pc/sys/memnode.h index dde3931627..2bcb623617 100644 --- a/usr/src/uts/i86pc/sys/memnode.h +++ b/usr/src/uts/i86pc/sys/memnode.h @@ -41,8 +41,9 @@ extern "C" { * memory slice ID and the high-order bits are the SSM nodeid. 
*/ +#define MAX_MEM_NODES_PER_LGROUP 3 #ifndef MAX_MEM_NODES -#define MAX_MEM_NODES (8) +#define MAX_MEM_NODES (8 * MAX_MEM_NODES_PER_LGROUP) #endif /* MAX_MEM_NODES */ #define PFN_2_MEM_NODE(pfn) \ @@ -57,14 +58,12 @@ extern "C" { */ extern int plat_pfn_to_mem_node(pfn_t); -extern int plat_lgrphand_to_mem_node(lgrp_handle_t); extern void plat_assign_lgrphand_to_mem_node(lgrp_handle_t, int); extern lgrp_handle_t plat_mem_node_to_lgrphand(int); extern void plat_slice_add(pfn_t, pfn_t); extern void plat_slice_del(pfn_t, pfn_t); #pragma weak plat_pfn_to_mem_node -#pragma weak plat_lgrphand_to_mem_node #pragma weak plat_mem_node_to_lgrphand #pragma weak plat_slice_add #pragma weak plat_slice_del diff --git a/usr/src/uts/i86pc/sys/psm_types.h b/usr/src/uts/i86pc/sys/psm_types.h index 17e9db17b1..24ad7d700f 100644 --- a/usr/src/uts/i86pc/sys/psm_types.h +++ b/usr/src/uts/i86pc/sys/psm_types.h @@ -23,12 +23,14 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #ifndef _SYS_PSM_TYPES_H #define _SYS_PSM_TYPES_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Platform Specific Module Types */ @@ -88,6 +90,31 @@ typedef struct psm_state_req { } req; } psm_state_request_t; +typedef enum psm_cpu_op_e { + PSM_CPU_ADD = 1, + PSM_CPU_REMOVE, + PSM_CPU_STOP +} psm_cpu_op_t; + +typedef struct psm_cpu_request { + psm_cpu_op_t pcr_cmd; + union { + struct { + processorid_t cpuid; + void *argp; + } cpu_add; + + struct { + processorid_t cpuid; + } cpu_remove; + + struct { + processorid_t cpuid; + void *ctx; + } cpu_stop; + } req; +} psm_cpu_request_t; + struct psm_ops { int (*psm_probe)(void); @@ -106,7 +133,7 @@ struct psm_ops { void (*psm_unset_idlecpu)(processorid_t cpun); #if defined(PSMI_1_3) || defined(PSMI_1_4) || defined(PSMI_1_5) || \ - defined(PSMI_1_6) + defined(PSMI_1_6) || defined(PSMI_1_7) int (*psm_clkinit)(int hertz); #else void (*psm_clkinit)(int hertz); @@ -117,14 +144,14 @@ struct psm_ops { hrtime_t (*psm_gethrtime)(void); processorid_t (*psm_get_next_processorid)(processorid_t cpu_id); -#if defined(PSMI_1_5) || defined(PSMI_1_6) +#if defined(PSMI_1_5) || defined(PSMI_1_6) || defined(PSMI_1_7) int (*psm_cpu_start)(processorid_t cpun, caddr_t ctxt); #else void (*psm_cpu_start)(processorid_t cpun, caddr_t rm_code); #endif int (*psm_post_cpu_start)(void); #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4) || \ - defined(PSMI_1_5) || defined(PSMI_1_6) + defined(PSMI_1_5) || defined(PSMI_1_6) || defined(PSMI_1_7) void (*psm_shutdown)(int cmd, int fcn); #else void (*psm_shutdown)(void); @@ -140,26 +167,30 @@ struct psm_ops { #endif void (*psm_notify_error)(int level, char *errmsg); #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4) || \ - defined(PSMI_1_5) || defined(PSMI_1_6) + defined(PSMI_1_5) || defined(PSMI_1_6) || defined(PSMI_1_7) void (*psm_notify_func)(int msg); #endif #if defined(PSMI_1_3) || defined(PSMI_1_4) || defined(PSMI_1_5) || \ - defined(PSMI_1_6) + defined(PSMI_1_6) || defined(PSMI_1_7) void (*psm_timer_reprogram)(hrtime_t time); void (*psm_timer_enable)(void); void (*psm_timer_disable)(void); void (*psm_post_cyclic_setup)(void *arg); #endif -#if defined(PSMI_1_4) || defined(PSMI_1_5) || defined(PSMI_1_6) +#if defined(PSMI_1_4) || defined(PSMI_1_5) || defined(PSMI_1_6) || \ + defined(PSMI_1_7) void (*psm_preshutdown)(int cmd, int fcn); #endif -#if defined(PSMI_1_5) || defined(PSMI_1_6) +#if defined(PSMI_1_5) || defined(PSMI_1_6) || 
defined(PSMI_1_7) int (*psm_intr_ops)(dev_info_t *dip, ddi_intr_handle_impl_t *handle, psm_intr_op_t op, int *result); #endif -#if defined(PSMI_1_6) +#if defined(PSMI_1_6) || defined(PSMI_1_7) int (*psm_state)(psm_state_request_t *request); #endif +#if defined(PSMI_1_7) + int (*psm_cpu_ops)(psm_cpu_request_t *reqp); +#endif }; @@ -183,7 +214,8 @@ struct psm_info { #define PSM_INFO_VER01_3 0x8604 #define PSM_INFO_VER01_4 0x8605 #define PSM_INFO_VER01_5 0x8606 -#define PSM_INFO_VER01_6 0x8706 +#define PSM_INFO_VER01_6 0x8607 +#define PSM_INFO_VER01_7 0x8608 #define PSM_INFO_VER01_X (PSM_INFO_VER01_1 & 0xFFF0) /* ver 1.X */ /* diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h index b21c54dd63..48e141126b 100644 --- a/usr/src/uts/i86pc/sys/rm_platter.h +++ b/usr/src/uts/i86pc/sys/rm_platter.h @@ -22,12 +22,14 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ #ifndef _SYS_RM_PLATTER_H #define _SYS_RM_PLATTER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/tss.h> #include <sys/segments.h> @@ -36,8 +38,12 @@ extern "C" { #endif +#define RM_PLATTER_CODE_SIZE 0x400 +#define RM_PLATTER_CPU_HALT_CODE_SIZE 0x100 + typedef struct rm_platter { - char rm_code[1024]; + char rm_code[RM_PLATTER_CODE_SIZE]; + char rm_cpu_halt_code[RM_PLATTER_CPU_HALT_CODE_SIZE]; #if defined(__amd64) /* * The compiler will want to 64-bit align the 64-bit rm_gdt_base @@ -59,7 +65,7 @@ typedef struct rm_platter { */ uint32_t rm_idt_pad; #endif /* __amd64 */ - ushort_t rm_filler2; /* till I am sure that pragma works */ + ushort_t rm_cpu_halted; /* non-zero if CPU has been halted */ ushort_t rm_idt_lim; /* stuff for lidt */ gate_desc_t *rm_idt_base; uint_t rm_pdbr; /* cr3 value */ diff --git a/usr/src/uts/i86pc/sys/sbd_ioctl.h b/usr/src/uts/i86pc/sys/sbd_ioctl.h new file mode 100644 index 0000000000..1eab543aa3 --- /dev/null +++ b/usr/src/uts/i86pc/sys/sbd_ioctl.h @@ -0,0 +1,655 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ + +#ifndef _SBD_IOCTL_H +#define _SBD_IOCTL_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/obpdefs.h> +#include <sys/processor.h> +#include <sys/param.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM +typedef enum { + SBD_COMP_NONE, + SBD_COMP_CPU, + SBD_COMP_MEM, + SBD_COMP_IO, + SBD_COMP_CMP, + SBD_COMP_UNKNOWN +} sbd_comp_type_t; + +typedef enum { + SBD_STAT_NONE = 0, + SBD_STAT_EMPTY, + SBD_STAT_DISCONNECTED, + SBD_STAT_CONNECTED, + SBD_STAT_UNCONFIGURED, + SBD_STAT_CONFIGURED +} sbd_state_t; + +typedef enum { + SBD_COND_UNKNOWN = 0, + SBD_COND_OK, + SBD_COND_FAILING, + SBD_COND_FAILED, + SBD_COND_UNUSABLE +} sbd_cond_t; + +typedef int sbd_busy_t; + +#define SBD_MAX_UNSAFE 16 +#define SBD_TYPE_LEN 12 +#define SBD_NULL_UNIT -1 + +typedef struct { + sbd_comp_type_t c_type; + int c_unit; + char c_name[OBP_MAXPROPNAME]; +} sbd_comp_id_t; + +typedef struct { + sbd_comp_id_t c_id; + sbd_state_t c_ostate; + sbd_cond_t c_cond; + sbd_busy_t c_busy; + uint_t c_sflags; + time_t c_time; +} sbd_cm_stat_t; + +#define ci_type c_id.c_type +#define ci_unit c_id.c_unit +#define ci_name c_id.c_name + +typedef struct { + sbd_cm_stat_t cs_cm; + int cs_isbootproc; + processorid_t cs_cpuid; + int cs_speed; + int cs_ecache; +} sbd_cpu_stat_t; + +#define cs_type cs_cm.ci_type +#define cs_unit cs_cm.ci_unit +#define cs_name cs_cm.ci_name +#define cs_ostate cs_cm.c_ostate +#define cs_cond cs_cm.c_cond +#define cs_busy cs_cm.c_busy +#define cs_suspend cs_cm.c_sflags +#define cs_time cs_cm.c_time + +typedef struct { + sbd_cm_stat_t ms_cm; + int ms_interleave; + pfn_t ms_basepfn; + pgcnt_t ms_totpages; + pgcnt_t ms_detpages; + pgcnt_t ms_pageslost; + pgcnt_t ms_managed_pages; + pgcnt_t ms_noreloc_pages; + pgcnt_t ms_noreloc_first; + pgcnt_t ms_noreloc_last; + int ms_cage_enabled; + int ms_peer_is_target; /* else peer is source */ + char ms_peer_ap_id[MAXPATHLEN]; /* board's AP name */ +} sbd_mem_stat_t; + +#define ms_type ms_cm.ci_type +#define ms_unit ms_cm.ci_unit +#define ms_name ms_cm.ci_name +#define ms_ostate ms_cm.c_ostate +#define ms_cond ms_cm.c_cond +#define ms_busy ms_cm.c_busy +#define ms_suspend ms_cm.c_sflags +#define ms_time ms_cm.c_time + +typedef struct { + sbd_cm_stat_t is_cm; + int is_referenced; + int is_unsafe_count; + int is_unsafe_list[SBD_MAX_UNSAFE]; + char is_pathname[MAXPATHLEN]; +} sbd_io_stat_t; + +#define is_type is_cm.ci_type +#define is_unit is_cm.ci_unit +#define is_name is_cm.ci_name +#define is_ostate is_cm.c_ostate +#define is_cond is_cm.c_cond +#define is_busy is_cm.c_busy +#define is_suspend is_cm.c_sflags +#define is_time is_cm.c_time + +/* This constant must be the max of the max cores on all platforms */ +#define SBD_MAX_CORES_PER_CMP 64 + +typedef struct { + sbd_cm_stat_t ps_cm; + processorid_t ps_cpuid[SBD_MAX_CORES_PER_CMP]; + int ps_ncores; + int ps_speed; + int ps_ecache; +} sbd_cmp_stat_t; + +#define ps_type ps_cm.ci_type +#define ps_unit ps_cm.ci_unit +#define ps_name ps_cm.ci_name +#define ps_ostate ps_cm.c_ostate +#define ps_cond ps_cm.c_cond +#define ps_busy ps_cm.c_busy +#define ps_suspend ps_cm.c_sflags +#define ps_time ps_cm.c_time + +typedef union { + sbd_cm_stat_t d_cm; + sbd_cpu_stat_t d_cpu; + sbd_mem_stat_t d_mem; + sbd_io_stat_t d_io; + sbd_cmp_stat_t d_cmp; +} sbd_dev_stat_t; + +#define ds_type d_cm.ci_type +#define ds_unit d_cm.ci_unit +#define ds_name d_cm.ci_name +#define ds_ostate d_cm.c_ostate +#define ds_cond d_cm.c_cond +#define ds_busy d_cm.c_busy +#define ds_suspend d_cm.c_sflags +#define ds_time d_cm.c_time + 
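+/*
+ * Usage sketch (hypothetical caller code, not part of the interface):
+ * the ds_* aliases above let a consumer inspect the common fields of a
+ * sbd_dev_stat_t before selecting the type-specific view, e.g.
+ *
+ *	sbd_dev_stat_t dst;
+ *
+ *	if (dst.ds_type == SBD_COMP_CPU)
+ *		cpuid = dst.d_cpu.cs_cpuid;
+ *	else if (dst.ds_type == SBD_COMP_MEM)
+ *		npages = dst.d_mem.ms_totpages;
+ */
+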
+#define SBD_MAX_INFO 256 + +typedef struct { + int s_board; + char s_type[SBD_TYPE_LEN]; + char s_info[SBD_MAX_INFO]; + sbd_state_t s_rstate; + sbd_state_t s_ostate; + sbd_cond_t s_cond; + sbd_busy_t s_busy; + time_t s_time; + uint_t s_power:1; + uint_t s_assigned:1; + uint_t s_platopts; + int s_nstat; + sbd_dev_stat_t s_stat[1]; +} sbd_stat_t; + +typedef struct { + sbd_comp_id_t c_id; + uint_t c_flags; + int c_len; + caddr_t c_opts; +} sbd_cm_cmd_t; + +typedef struct { + sbd_cm_cmd_t g_cm; + int g_ncm; +} sbd_getncm_cmd_t; + +typedef struct { + sbd_cm_cmd_t s_cm; + int s_nbytes; + caddr_t s_statp; +} sbd_stat_cmd_t; + +typedef union { + sbd_cm_cmd_t cmd_cm; + sbd_getncm_cmd_t cmd_getncm; + sbd_stat_cmd_t cmd_stat; +} sbd_cmd_t; + +typedef struct { + int e_code; + char e_rsc[MAXPATHLEN]; +} sbd_error_t; + +typedef struct { + sbd_cmd_t i_cmd; + sbd_error_t i_err; +} sbd_ioctl_arg_t; + +typedef struct { + int t_base; + int t_bnd; + char **t_text; +} sbd_etab_t; + +#define i_flags i_cmd.cmd_cm.c_flags +#define i_len i_cmd.cmd_cm.c_len +#define i_opts i_cmd.cmd_cm.c_opts +#define ic_type i_cmd.cmd_cm.ci_type +#define ic_name i_cmd.cmd_cm.ci_name +#define ic_unit i_cmd.cmd_cm.ci_unit +#define ie_code i_err.e_code +#define ie_rsc i_err.e_rsc + +#define _SBD_IOC (('D' << 16) | ('R' << 8)) + +#define SBD_CMD_ASSIGN (_SBD_IOC | 0x01) +#define SBD_CMD_UNASSIGN (_SBD_IOC | 0x02) +#define SBD_CMD_POWERON (_SBD_IOC | 0x03) +#define SBD_CMD_POWEROFF (_SBD_IOC | 0x04) +#define SBD_CMD_TEST (_SBD_IOC | 0x05) +#define SBD_CMD_CONNECT (_SBD_IOC | 0x06) +#define SBD_CMD_CONFIGURE (_SBD_IOC | 0x07) +#define SBD_CMD_UNCONFIGURE (_SBD_IOC | 0x08) +#define SBD_CMD_DISCONNECT (_SBD_IOC | 0x09) +#define SBD_CMD_STATUS (_SBD_IOC | 0x0a) +#define SBD_CMD_GETNCM (_SBD_IOC | 0x0b) +#define SBD_CMD_PASSTHRU (_SBD_IOC | 0x0c) + +#define SBD_CHECK_SUSPEND(cmd, c_sflags) \ + (((c_sflags) >> (((cmd) & 0xf) - 1)) & 0x01) + +#define SBD_SET_SUSPEND(cmd, c_sflags) \ + ((c_sflags) |= (0x01 << (((cmd) & 0xf) - 1))) + +#define SBD_CHECK_PLATOPTS(cmd, c_platopts) \ + (((c_platopts) >> (((cmd) & 0xf) - 1)) & 0x01) + +#define SBD_SET_PLATOPTS(cmd, c_platopts) \ + ((c_platopts) &= ~(0x01 << (((cmd) & 0xf) - 1))) + +#define SBD_FLAG_FORCE 0x1 +#define SBD_FLAG_ALLCMP 0x2 +#define SBD_FLAG_QUIESCE_OKAY 0x4 + +#if defined(_SYSCALL32) + +typedef struct { + int32_t c_type; + int32_t c_unit; + char c_name[OBP_MAXPROPNAME]; +} sbd_comp_id32_t; + +typedef struct { + sbd_comp_id32_t c_id; + int32_t c_ostate; + int32_t c_cond; + int32_t c_busy; + uint32_t c_sflags; + time32_t c_time; +} sbd_cm_stat32_t; + +typedef struct { + sbd_cm_stat32_t cs_cm; + int32_t cs_isbootproc; + int32_t cs_cpuid; + int32_t cs_speed; + int32_t cs_ecache; +} sbd_cpu_stat32_t; + +typedef struct { + sbd_cm_stat32_t ms_cm; + int32_t ms_interleave; + uint32_t ms_basepfn; + uint32_t ms_totpages; + uint32_t ms_detpages; + int32_t ms_pageslost; + uint32_t ms_managed_pages; + uint32_t ms_noreloc_pages; + uint32_t ms_noreloc_first; + uint32_t ms_noreloc_last; + int32_t ms_cage_enabled; + int32_t ms_peer_is_target; + char ms_peer_ap_id[MAXPATHLEN]; +} sbd_mem_stat32_t; + +typedef struct { + sbd_cm_stat32_t is_cm; + int32_t is_referenced; + int32_t is_unsafe_count; + int32_t is_unsafe_list[SBD_MAX_UNSAFE]; + char is_pathname[MAXPATHLEN]; +} sbd_io_stat32_t; + +typedef struct { + sbd_cm_stat32_t ps_cm; + int32_t ps_cpuid[SBD_MAX_CORES_PER_CMP]; + int32_t ps_ncores; + int32_t ps_speed; + int32_t ps_ecache; +} sbd_cmp_stat32_t; + +typedef union { + sbd_cm_stat32_t d_cm; + 
sbd_cpu_stat32_t d_cpu; + sbd_mem_stat32_t d_mem; + sbd_io_stat32_t d_io; + sbd_cmp_stat32_t d_cmp; +} sbd_dev_stat32_t; + +typedef struct { + int32_t s_board; + char s_type[SBD_TYPE_LEN]; + char s_info[SBD_MAX_INFO]; + int32_t s_rstate; + int32_t s_ostate; + int32_t s_cond; + int32_t s_busy; + time32_t s_time; + uint32_t s_power:1; + uint32_t s_assigned:1; + uint32_t s_platopts; + int32_t s_nstat; + sbd_dev_stat32_t s_stat[1]; +} sbd_stat32_t; + +typedef struct { + int32_t e_code; + char e_rsc[MAXPATHLEN]; +} sbd_error32_t; + +typedef struct { + sbd_comp_id32_t c_id; + uint32_t c_flags; + int32_t c_len; + caddr32_t c_opts; +} sbd_cm_cmd32_t; + +typedef struct { + sbd_cm_cmd32_t g_cm; + int32_t g_ncm; +} sbd_getncm_cmd32_t; + +typedef struct { + sbd_cm_cmd32_t s_cm; + int32_t s_nbytes; + caddr32_t s_statp; +} sbd_stat_cmd32_t; + +typedef union { + sbd_cm_cmd32_t cmd_cm; + sbd_getncm_cmd32_t cmd_getncm; + sbd_stat_cmd32_t cmd_stat; +} sbd_cmd32_t; + +typedef struct { + sbd_cmd32_t i_cmd; + sbd_error32_t i_err; +} sbd_ioctl_arg32_t; + +typedef struct { + int32_t t_base; + int32_t t_bnd; + char **t_text; +} sbd_etab32_t; + +#endif /* _SYSCALL32 */ +#endif /* _ASM */ + +/* Common error codes */ + +#define ESBD_NOERROR 0 /* no error */ +#define ESBD_INTERNAL 1 /* Internal error */ +#define ESBD_NOMEM 2 /* Insufficient memory */ +#define ESBD_PROTO 3 /* Protocol error */ +#define ESBD_BUSY 4 /* Device busy */ +#define ESBD_NODEV 5 /* No such device */ +#define ESBD_ALREADY 6 /* Operation already in progress */ +#define ESBD_IO 7 /* I/O error */ +#define ESBD_FAULT 8 /* Bad address */ +#define ESBD_EMPTY_BD 9 /* No device(s) on board */ +#define ESBD_INVAL 10 /* Invalid argument */ +#define ESBD_STATE 11 /* Invalid state transition */ +#define ESBD_FATAL_STATE 12 /* Device in fatal state */ +#define ESBD_OUTSTANDING 13 /* Outstanding error */ +#define ESBD_SUSPEND 14 /* Device failed to suspend */ +#define ESBD_RESUME 15 /* Device failed to resume */ +#define ESBD_UTHREAD 16 /* Cannot stop user thread */ +#define ESBD_RTTHREAD 17 /* Cannot quiesce realtime thread */ +#define ESBD_KTHREAD 18 /* Cannot stop kernel thread */ +#define ESBD_OFFLINE 19 /* Failed to off-line */ +#define ESBD_ONLINE 20 /* Failed to on-line */ +#define ESBD_CPUSTART 21 /* Failed to start CPU */ +#define ESBD_CPUSTOP 22 /* Failed to stop CPU */ +#define ESBD_INVAL_COMP 23 /* Invalid component type */ +#define ESBD_KCAGE_OFF 24 /* Kernel cage is disabled */ +#define ESBD_NO_TARGET 25 /* No available memory target */ +#define ESBD_HW_PROGRAM 26 /* Hardware programming error */ +#define ESBD_MEM_NOTVIABLE 27 /* VM viability test failed */ +#define ESBD_MEM_REFUSED 28 /* Memory operation refused */ +#define ESBD_MEM_NONRELOC 29 /* Non-relocatable pages in span */ +#define ESBD_MEM_CANCELLED 30 /* Memory operation cancelled */ +#define ESBD_MEMFAIL 31 /* Memory operation failed */ +#define ESBD_MEMONLINE 32 /* Can't unconfig cpu if mem online */ +#define ESBD_QUIESCE_REQD 33 + /* Operator confirmation for quiesce is required */ +#define ESBD_MEMINTLV 34 + /* Memory is interleaved across boards */ +#define ESBD_CPUONLINE 35 + /* Can't config memory if not all cpus are online */ +#define ESBD_UNSAFE 36 /* Unsafe driver present */ +#define ESBD_INVAL_OPT 37 /* option invalid */ + +/* Starcat error codes */ + +#define ESTC_NONE 1000 /* No error */ +#define ESTC_GETPROP 1001 /* Cannot read property value */ +#define ESTC_BNUM 1002 /* Invalid board number */ +#define ESTC_CONFIGBUSY 1003 + /* Cannot proceed; Board is configured or 
busy */ +#define ESTC_PROBE 1004 /* Solaris failed to probe */ +#define ESTC_DEPROBE 1005 /* Solaris failed to deprobe */ +#define ESTC_MOVESIGB 1006 /* Firmware move-cpu0 failed */ +#define ESTC_SUPPORT 1007 /* Operation not supported */ +#define ESTC_DRVFAIL 1008 /* Device driver failure */ +#define ESTC_UNKPTCMD 1012 /* Unrecognized platform command */ +#define ESTC_NOTID 1013 + /* drmach parameter is not a valid ID */ +#define ESTC_INAPPROP 1014 + /* drmach parameter is inappropriate for operation */ +#define ESTC_INTERNAL 1015 /* Unexpected internal condition */ +#define ESTC_MBXRQST 1016 + /* Mailbox framework failure: outgoing */ +#define ESTC_MBXRPLY 1017 + /* Mailbox framework failure: incoming */ +#define ESTC_NOACL 1018 /* Board is not in domain ACL */ +#define ESTC_NOT_ASSIGNED 1019 /* Board is not assigned to domain */ +#define ESTC_NOT_ACTIVE 1020 /* Board is not active */ +#define ESTC_EMPTY_SLOT 1021 /* Slot is empty */ +#define ESTC_POWER_OFF 1022 /* Board is powered off */ +#define ESTC_TEST_IN_PROGRESS 1023 /* Board is already being tested */ +#define ESTC_TESTING_BUSY 1024 + /* Wait: All SC test resources are in use */ +#define ESTC_TEST_REQUIRED 1025 /* Board requires test prior to use */ +#define ESTC_TEST_ABORTED 1026 /* Board test has been aborted */ +#define ESTC_MBOX_UNKNOWN 1027 + /* Unknown error type received from SC */ +#define ESTC_TEST_STATUS_UNKNOWN 1028 + /* Test completed with unknown status */ +#define ESTC_TEST_RESULT_UNKNOWN 1029 + /* Unknown test result returned by SC */ +#define ESTC_TEST_FAILED 1030 + /* SMS hpost reported error, see POST log for details */ +#define ESTC_UNAVAILABLE 1031 /* Slot is unavailable to the domain */ +#define ESTC_NZ_LPA 1032 /* Nonzero LPA not yet supported */ +#define ESTC_IOSWITCH 1033 + /* Cannot unconfigure I/O board: tunnel switch failed */ +#define ESTC_IOCAGE_NO_CPU_AVAIL 1034 + /* No CPU available for I/O cage test. 
*/ +#define ESTC_SMS_ERR_RECOVERABLE 1035 + /* SMS reported recoverable error: check SMS status and Retry */ +#define ESTC_SMS_ERR_UNRECOVERABLE 1036 + /* SMS reported unrecoverable error: Board is Unusable */ +#define ESTC_NWSWITCH 1037 + /* Cannot unconfigure I/O board: network switch failed */ + +/* Starfire error codes */ + +#define ESTF_NONE 2000 /* No error */ +#define ESTF_GETPROP 2001 /* Cannot read property value */ +#define ESTF_GETPROPLEN 2002 /* Cannot determine property length */ +#define ESTF_BNUM 2003 /* Invalid board number */ +#define ESTF_CONFIGBUSY 2004 + /* Cannot proceed; Board is configured or busy */ +#define ESTF_NOCPUID 2005 /* No CPU specified for connect */ +#define ESTF_PROBE 2006 /* Firmware probe failed */ +#define ESTF_DEPROBE 2007 /* Firmware deprobe failed */ +#define ESTF_MOVESIGB 2008 /* Firmware move-cpu0 failed */ +#define ESTF_JUGGLE 2009 /* Cannot move SIGB assignment */ +#define ESTF_HASSIGB 2010 + /* Cannot disconnect CPU; SIGB is currently assigned */ +#define ESTF_SUPPORT 2011 /* Operation not supported */ +#define ESTF_DRVFAIL 2012 /* Device driver failure */ +#define ESTF_SETCPUVAL 2013 + /* Must specify a CPU on the given board */ +#define ESTF_NODEV 2014 /* No such device */ +#define ESTF_INTERBOARD 2015 + /* Memory configured with inter-board interleaving */ +#define ESTF_UNKPTCMD 2016 /* Unrecognized platform command */ +#define ESTF_NOTID 2017 /* drmach parameter is not a valid ID */ +#define ESTF_INAPPROP 2018 + /* drmach parameter is inappropriate for operation */ +#define ESTF_INTERNAL 2019 /* Unexpected internal condition */ + +/* Daktari error codes */ + +#define EDAK_NONE 3000 /* no error */ +#define EDAK_INTERNAL 3001 /* Internal error */ +#define EDAK_NOFRUINFO 3002 /* Didn't receive fru info */ +#define EDAK_NONDR_BOARD 3003 + /* DR is not supported on this board type */ +#define EDAK_POWERON 3004 /* Power on request failed */ +#define EDAK_POWEROK 3005 /* Failed to power on */ +#define EDAK_INTERRUPTED 3006 /* Operation interrupted */ +#define EDAK_BOARDINIT 3007 /* Board initialization failed */ +#define EDAK_CPUINIT 3008 /* CPU initialization failed */ +#define EDAK_MEMFAIL 3009 /* Memory operation failed */ + +/* Serengeti error codes */ + +#define ESGT_NONE 4000 /* no error */ +#define ESGT_INTERNAL 4001 /* Internal error */ +#define ESGT_INVAL 4002 /* Invalid argument */ +#define ESGT_MEMFAIL 4003 /* Memory operation failed */ +#define ESGT_PROBE 4004 /* Board probe failed */ +#define ESGT_DEPROBE 4005 /* Board deprobe failed */ +#define ESGT_JUGGLE_BOOTPROC 4006 /* Failed to juggle bootproc */ +#define ESGT_NOT_CPUTYPE 4007 /* Not a cpu device */ +#define ESGT_NO_DEV_TYPE 4008 /* Cannot find device type */ +#define ESGT_BAD_PORTID 4009 /* Bad port id */ +#define ESGT_RESUME 4010 /* Failed to resume device */ +#define ESGT_SUSPEND 4011 /* Failed to suspend device */ +#define ESGT_KTHREAD 4012 /* failed to stop kernel thd */ +#define ESGT_UNSAFE 4013 /* unsafe */ +#define ESGT_RTTHREAD 4014 /* real time threads */ +#define ESGT_UTHREAD 4015 /* failed to stop user thd */ +#define ESGT_PROM_ATTACH 4016 /* prom failed attach board */ +#define ESGT_PROM_DETACH 4017 /* prom failed detach board */ +#define ESGT_SC_ERR 4018 /* SC returned a failure */ +#define ESGT_GET_BOARD_STAT 4019 /* Failed to obtain board information */ +#define ESGT_WAKEUPCPU 4020 /* Failed to wake up cpu */ +#define ESGT_STOPCPU 4021 /* Failed to stop cpu */ +/* Serengeti SC return codes */ +#define ESGT_HW_FAIL 4022 /* Hardware Failure */ +#define ESGT_BD_ACCESS 4023 /* Board access denied */ +#define ESGT_STALE_CMP 4024 /* Stale components */ +#define ESGT_STALE_OBJ 4025 /* Stale objects */ +#define ESGT_NO_SEPROM_SPACE 4026 /* No SEPROM space */ +#define ESGT_NOT_SUPP 4027 /* Operation not supported */ +#define ESGT_NO_MEM 4028 /* No Memory */ + +/* OPL error codes */ + +#define EOPL_GETPROP 5001 /* Cannot read property value */ +#define EOPL_BNUM 5002 /* Invalid board number */ +#define EOPL_CONFIGBUSY 5003 + /* Cannot proceed; Board is configured or busy */ +#define EOPL_PROBE 5004 /* Firmware probe failed */ +#define EOPL_DEPROBE 5005 /* Firmware deprobe failed */ +#define EOPL_SUPPORT 5006 /* Operation not supported */ +#define EOPL_DRVFAIL 5007 /* Device driver failure */ +#define EOPL_UNKPTCMD 5008 /* Unrecognized platform command */ +#define EOPL_NOTID 5009 /* drmach parameter is not a valid ID */ +#define EOPL_INAPPROP 5010 + /* drmach parameter is inappropriate for operation */ +#define EOPL_INTERNAL 5011 /* Unexpected internal condition */ +#define EOPL_FINDDEVICE 5012 /* Firmware cannot find node. */ +#define EOPL_MC_SETUP 5013 /* Cannot setup memory node */ +#define EOPL_CPU_STATE 5014 /* Invalid CPU/core state */ +#define EOPL_MC_OPL 5015 /* Cannot find mc-opl interface */ +#define EOPL_SCF_FMEM 5016 /* Cannot find scf_fmem interface */ +#define EOPL_FMEM_SETUP 5017 /* Error setting up FMEM buffer */ +#define EOPL_SCF_FMEM_START 5018 /* scf_fmem_start error */ +#define EOPL_FMEM_ERROR 5019 /* FMEM error */ +#define EOPL_SCF_FMEM_CANCEL 5020 /* scf_fmem_cancel error */ +#define EOPL_FMEM_XC_TIMEOUT 5021 /* xcall timeout */ +#define EOPL_FMEM_COPY_TIMEOUT 5022 /* DR parallel copy timeout */ +#define EOPL_FMEM_SCF_BUSY 5023 /* SCF busy */ +#define EOPL_FMEM_RETRY_OUT 5024 /* SCF IO Retry Error */ +#define EOPL_FMEM_TIMEOUT 5025 /* FMEM command timeout */ +#define EOPL_FMEM_HW_ERROR 5026 /* Hardware error */ +#define EOPL_FMEM_TERMINATE 5027 /* FMEM operation Terminated */ +#define EOPL_FMEM_COPY_ERROR 5028 /* Memory copy error */ +#define EOPL_FMEM_SCF_ERR 5029 /* SCF error */ +#define EOPL_MIXED_CPU 5030 + /* Cannot add SPARC64-VI to domain booted with all SPARC64-VII CPUs */ +#define EOPL_FMEM_SCF_OFFLINE 5031 /* SCF OFFLINE */ + +/* X86 error codes */ + +#define EX86_GETPROP 10001 /* Cannot read property value */ +#define EX86_BNUM 10002 /* Invalid board number */ +#define EX86_NOTID 10003 /* drmach parameter is not a valid ID */ +#define EX86_INAPPROP 10004 + /* drmach parameter is inappropriate for operation */ +#define EX86_PROBE 10005 /* Firmware probe failed */ +#define EX86_DEPROBE 10006 /* Firmware deprobe failed */ +#define EX86_SUPPORT 10007 /* Operation not supported */ +#define EX86_INTERNAL 10008 /* Unexpected internal condition */ +#define EX86_CONFIGBUSY 10009 + /* Cannot proceed, board is configured or busy */ +#define EX86_POWERBUSY 10010 /* Cannot proceed, board is powered */ +#define EX86_CONNECTBUSY 10011 /* Cannot proceed, board is connected */ +#define EX86_INVALID_ARG 10012 /* Invalid argument */ +#define EX86_DRVFAIL 10013 /* Device driver failure */ +#define EX86_UNKPTCMD 10014 /* Unrecognized platform command */ +#define EX86_ALLOC_CPUID 10015 /* Failed to allocate processor id */ +#define EX86_FREE_CPUID 10016 /* Failed to release processor id */ +#define EX86_POWERON 10017 /* Failed to power on board */ +#define EX86_POWEROFF 10018 /* Failed to power off board */ +#define EX86_MC_SETUP 10019 /* Cannot setup memory node */ +#define EX86_ACPIWALK 10020 /* Cannot walk ACPI namespace */ +#define 
EX86_WALK_DEPENDENCY 10021 + /* Failed to check dependency for board */ +#define EX86_IN_FAILURE 10022 /* Board is in failure state */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SBD_IOCTL_H */ diff --git a/usr/src/uts/i86pc/sys/x_call.h b/usr/src/uts/i86pc/sys/x_call.h index ca778a4c21..65b5813b95 100644 --- a/usr/src/uts/i86pc/sys/x_call.h +++ b/usr/src/uts/i86pc/sys/x_call.h @@ -68,6 +68,8 @@ typedef struct xc_msg { #if defined(_KERNEL) extern void xc_init_cpu(struct cpu *); +extern void xc_fini_cpu(struct cpu *); +extern int xc_flush_cpu(struct cpu *); extern uint_t xc_serv(caddr_t, caddr_t); #define CPUSET2BV(set) ((ulong_t *)(void *)&(set)) diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index ade54caab2..6ff1216d76 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -22,6 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* @@ -188,6 +192,10 @@ int pt_kern; extern void atomic_orb(uchar_t *addr, uchar_t val); extern void atomic_andb(uchar_t *addr, uchar_t val); +#ifndef __xpv +extern pfn_t memseg_get_start(struct memseg *); +#endif + #define PP_GETRM(pp, rmmask) (pp->p_nrm & rmmask) #define PP_ISMOD(pp) PP_GETRM(pp, P_MOD) #define PP_ISREF(pp) PP_GETRM(pp, P_REF) @@ -4050,8 +4058,8 @@ hat_cpu_offline(struct cpu *cpup) ASSERT(cpup != CPU); CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id); - x86pte_cpu_fini(cpup); hat_vlp_teardown(cpup); + x86pte_cpu_fini(cpup); } /* @@ -4363,7 +4371,96 @@ void hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) {} -#ifdef __xpv +#ifndef __xpv +void +hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, + offset_t kpm_pages_off) +{ + _NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off)); + pfn_t base, end; + + /* + * kphysm_add_memory_dynamic() does not set nkpmpgs + * when page_t memory is externally allocated. That + * code must properly calculate nkpmpgs in all cases + * if nkpmpgs needs to be used at some point. + */ + + /* + * The meta (page_t) pages for dynamically added memory are allocated + * either from the incoming memory itself or from existing memory. + * In the former case the base of the incoming pages will be different + * than the base of the dynamic segment so call memseg_get_start() to + * get the actual base of the incoming memory for each case. + */ + + base = memseg_get_start(msp); + end = msp->pages_end; + + hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(base), + mmu_ptob(end - base), base, PROT_READ | PROT_WRITE, + HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); +} + +void +hat_kpm_addmem_mseg_insert(struct memseg *msp) +{ + _NOTE(ARGUNUSED(msp)); +} + +void +hat_kpm_addmem_memsegs_update(struct memseg *msp) +{ + _NOTE(ARGUNUSED(msp)); +} + +/* + * Return end of metadata for an already setup memseg. + * X86 platforms don't need per-page meta data to support kpm. + */ +caddr_t +hat_kpm_mseg_reuse(struct memseg *msp) +{ + return ((caddr_t)msp->epages); +} + +void +hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) +{ + _NOTE(ARGUNUSED(msp, mspp)); + ASSERT(0); +} + +void +hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, + struct memseg *lo, struct memseg *mid, struct memseg *hi) +{ + _NOTE(ARGUNUSED(msp, mspp, lo, mid, hi)); + ASSERT(0); +} + +/* + * Walk the memsegs chain, applying func to each memseg span. 
+ */ +void +hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg) +{ + pfn_t pbase, pend; + void *base; + size_t size; + struct memseg *msp; + + for (msp = memsegs; msp; msp = msp->next) { + pbase = msp->pages_base; + pend = msp->pages_end; + base = ptob(pbase) + kpm_vbase; + size = ptob(pend - pbase); + func(arg, base, size); + } +} + +#else /* __xpv */ + /* * There are specific Hypervisor calls to establish and remove mappings * to grant table references and the privcmd driver. We have to ensure @@ -4406,5 +4503,5 @@ hat_release_mapping(hat_t *hat, caddr_t addr) htable_release(ht); htable_release(ht); XPV_ALLOW_MIGRATE(); - } -#endif +} +#endif /* __xpv */ diff --git a/usr/src/uts/i86pc/vm/vm_dep.h b/usr/src/uts/i86pc/vm/vm_dep.h index ae2b6c12c1..753aa9d146 100644 --- a/usr/src/uts/i86pc/vm/vm_dep.h +++ b/usr/src/uts/i86pc/vm/vm_dep.h @@ -22,6 +22,10 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ /* * UNIX machine dependent virtual memory support. @@ -71,7 +75,14 @@ extern void plcnt_inc_dec(page_t *, int, int, long, int); */ #define PLCNT_XFER_NORELOC(pp) -#define PLCNT_MODIFY_MAX(pfn, cnt) mtype_modify_max(pfn, (pgcnt_t)cnt) +/* + * macro to modify the page list max counts when memory is added to + * the page lists during startup (add_physmem) or during a DR operation + * when memory is added (kphysm_add_memory_dynamic) or deleted + * (kphysm_del_cleanup). + */ +#define PLCNT_MODIFY_MAX(pfn, cnt) mtype_modify_max(pfn, cnt) + extern int memrange_num(pfn_t); extern int pfn_2_mtype(pfn_t); extern int mtype_func(int, int, uint_t); @@ -82,7 +93,7 @@ extern int mnode_range_cnt(int); /* * candidate counters in vm_pagelist.c are indexed by color and range */ -#define NUM_MEM_RANGES 4 /* memory range types */ +#define NUM_MEM_RANGES 4 /* memory range types */ #define MAX_MNODE_MRANGES NUM_MEM_RANGES #define MNODE_RANGE_CNT(mnode) mnode_range_cnt(mnode) #define MNODE_MAX_MRANGE(mnode) memrange_num(mem_node_config[mnode].physbase) @@ -161,6 +172,25 @@ extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); rw_exit(&page_ctrs_rwlock[(mnode)]); \ } +/* + * macro to call page_ctrs_adjust() when memory is added + * during a DR operation. + */ +#define PAGE_CTRS_ADJUST(pfn, cnt, rv) { \ + spgcnt_t _cnt = (spgcnt_t)(cnt); \ + int _mn; \ + pgcnt_t _np; \ + pfn_t _pfn = (pfn); \ + pfn_t _endpfn = _pfn + _cnt; \ + while (_pfn < _endpfn) { \ + _mn = PFN_2_MEM_NODE(_pfn); \ + _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - _pfn; \ + _pfn += _np; \ + if ((rv = page_ctrs_adjust(_mn)) != 0) \ + break; \ + } \ +} + #define PAGE_GET_COLOR_SHIFT(szc, nszc) \ (hw_page_array[(nszc)].hp_shift - hw_page_array[(szc)].hp_shift) @@ -524,6 +554,7 @@ extern page_t *page_freelist_split(uchar_t, uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); extern page_t *page_freelist_coalesce(int, uchar_t, uint_t, uint_t, int, pfn_t); +extern void page_freelist_coalesce_all(int); extern uint_t page_get_pagecolors(uint_t); extern void pfnzero(pfn_t, uint_t, uint_t); diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index 63c8129ddf..79c0ee073e 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -22,6 +22,10 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. 
+ */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -127,7 +131,6 @@ int largepagesupport = 0; extern uint_t page_create_new; extern uint_t page_create_exists; extern uint_t page_create_putbacks; -extern uint_t page_create_putbacks; /* * Allow users to disable the kernel's use of SSE. */ @@ -142,6 +145,8 @@ typedef struct { pfn_t mnr_pfnhi; int mnr_mnode; int mnr_memrange; /* index into memranges[] */ + int mnr_next; /* next lower PA mnoderange */ + int mnr_exists; /* maintain page list stats */ pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */ pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */ @@ -175,6 +180,11 @@ typedef struct { */ #define PFN_4GIG 0x100000 #define PFN_16MEG 0x1000 +/* Indices into the memory range (arch_memranges) array. */ +#define MRI_4G 0 +#define MRI_2G 1 +#define MRI_16M 2 +#define MRI_0 3 static pfn_t arch_memranges[NUM_MEM_RANGES] = { PFN_4GIG, /* pfn range for 4G and above */ 0x80000, /* pfn range for 2G-4G */ @@ -191,6 +201,8 @@ int nranges = NUM_MEM_RANGES; mnoderange_t *mnoderanges; int mnoderangecnt; int mtype4g; +int mtype16m; +int mtypetop; /* index of highest pfn'ed mnoderange */ /* * 4g memory management variables for systems with more than 4g of memory: @@ -220,7 +232,6 @@ int mtype4g; * for requests that don't specifically require it. */ -#define LOTSFREE4G (maxmem4g >> lotsfree4gshift) #define DESFREE4G (maxmem4g >> desfree4gshift) #define RESTRICT4G_ALLOC \ @@ -230,7 +241,6 @@ static pgcnt_t maxmem4g; static pgcnt_t freemem4g; static int physmax4g; static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */ -static int lotsfree4gshift = 3; /* * 16m memory management: @@ -246,7 +256,7 @@ static int lotsfree4gshift = 3; * are not restricted. */ -#define FREEMEM16M MTYPE_FREEMEM(0) +#define FREEMEM16M MTYPE_FREEMEM(mtype16m) #define DESFREE16M desfree16m #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \ ((freemem != 0) && ((flags & PG_PANIC) == 0) && \ @@ -337,6 +347,9 @@ hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1]; kmutex_t *fpc_mutex[NPC_MUTEX]; kmutex_t *cpc_mutex[NPC_MUTEX]; +/* Lock to protect mnoderanges array for memory DR operations. */ +static kmutex_t mnoderange_lock; + /* * Only let one thread at a time try to coalesce large pages, to * prevent them from working against each other. 
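With these changes mnoderanges is no longer a densely packed, sorted array: each entry now records whether it is populated (mnr_exists) and the index of the next-lower range (mnr_next), forming a chain ordered by descending base PFN and headed by mtypetop. A minimal traversal sketch (illustrative only, built from the fields and globals declared above):

	int mt;

	/* Visit each populated mnoderange, highest physical range first. */
	for (mt = mtypetop; mt != -1; mt = mnoderanges[mt].mnr_next) {
		ASSERT(mnoderanges[mt].mnr_exists);
		/* PFNs in [mnr_pfnlo, mnr_pfnhi] belong to mtype mt. */
	}

The hunks below rewrite the existing descending scans (pfn_2_mtype(), mtype_func(), mtype_modify_max() and page_get_anylist()) in terms of this chain.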
@@ -951,7 +964,8 @@ pfn_2_mtype(pfn_t pfn) #else int n; - for (n = mnoderangecnt - 1; n >= 0; n--) { + /* Always start from highest pfn and work our way down */ + for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { if (pfn >= mnoderanges[n].mnr_pfnlo) { break; } @@ -1063,7 +1077,7 @@ retry: * verify that pages being returned from allocator have correct DMA attribute */ #ifndef DEBUG -#define check_dma(a, b, c) (0) +#define check_dma(a, b, c) (void)(0) #else static void check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt) @@ -1253,7 +1267,12 @@ mnode_range_cnt(int mnode) void mnode_range_setup(mnoderange_t *mnoderanges) { + mnoderange_t *mp = mnoderanges; int mnode, mri; + int mindex = 0; /* current index into mnoderanges array */ + int i, j; + pfn_t hipfn; + int last, hi; for (mnode = 0; mnode < max_mem_nodes; mnode++) { if (mem_node_config[mnode].exists == 0) @@ -1272,26 +1291,168 @@ mnode_range_setup(mnoderange_t *mnoderanges) mem_node_config[mnode].physmax); mnoderanges->mnr_mnode = mnode; mnoderanges->mnr_memrange = mri; + mnoderanges->mnr_exists = 1; mnoderanges++; + mindex++; if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) mri--; else break; } } + + /* + * For now do a simple sort of the mnoderanges array to fill in + * the mnr_next fields. Since mindex is expected to be relatively + * small, using a simple O(N^2) algorithm. + */ + for (i = 0; i < mindex; i++) { + if (mp[i].mnr_pfnlo == 0) /* find lowest */ + break; + } + ASSERT(i < mindex); + last = i; + mtype16m = last; + mp[last].mnr_next = -1; + for (i = 0; i < mindex - 1; i++) { + hipfn = (pfn_t)(-1); + hi = -1; + /* find next highest mnode range */ + for (j = 0; j < mindex; j++) { + if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo && + mp[j].mnr_pfnlo < hipfn) { + hipfn = mp[j].mnr_pfnlo; + hi = j; + } + } + mp[hi].mnr_next = last; + last = hi; + } + mtypetop = last; } +#ifndef __xpv +/* + * Update mnoderanges for memory hot-add DR operations. + */ +static void +mnode_range_add(int mnode) +{ + int *prev; + int n, mri; + pfn_t start, end; + extern void membar_sync(void); + + ASSERT(0 <= mnode && mnode < max_mem_nodes); + ASSERT(mem_node_config[mnode].exists); + start = mem_node_config[mnode].physbase; + end = mem_node_config[mnode].physmax; + ASSERT(start <= end); + mutex_enter(&mnoderange_lock); + +#ifdef DEBUG + /* Check whether it interleaves with other memory nodes. */ + for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { + ASSERT(mnoderanges[n].mnr_exists); + if (mnoderanges[n].mnr_mnode == mnode) + continue; + ASSERT(start > mnoderanges[n].mnr_pfnhi || + end < mnoderanges[n].mnr_pfnlo); + } +#endif /* DEBUG */ + + mri = nranges - 1; + while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) + mri--; + while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { + /* Check whether mtype already exists. */ + for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { + if (mnoderanges[n].mnr_mnode == mnode && + mnoderanges[n].mnr_memrange == mri) { + mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), + start); + mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), + end); + break; + } + } + + /* Add a new entry if it doesn't exist yet. */ + if (n == -1) { + /* Try to find an unused entry in mnoderanges array. 
*/ + for (n = 0; n < mnoderangecnt; n++) { + if (mnoderanges[n].mnr_exists == 0) + break; + } + ASSERT(n < mnoderangecnt); + mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start); + mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end); + mnoderanges[n].mnr_mnode = mnode; + mnoderanges[n].mnr_memrange = mri; + mnoderanges[n].mnr_exists = 1; + /* Page 0 should always be present. */ + for (prev = &mtypetop; + mnoderanges[*prev].mnr_pfnlo > start; + prev = &mnoderanges[*prev].mnr_next) { + ASSERT(mnoderanges[*prev].mnr_next >= 0); + ASSERT(mnoderanges[*prev].mnr_pfnlo > end); + } + mnoderanges[n].mnr_next = *prev; + membar_sync(); + *prev = n; + } + + if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) + mri--; + else + break; + } + + mutex_exit(&mnoderange_lock); +} + +/* + * Update mnoderanges for memory hot-removal DR operations. + */ +static void +mnode_range_del(int mnode) +{ + _NOTE(ARGUNUSED(mnode)); + ASSERT(0 <= mnode && mnode < max_mem_nodes); + /* TODO: support deletion operation. */ + ASSERT(0); +} + +void +plat_slice_add(pfn_t start, pfn_t end) +{ + mem_node_add_slice(start, end); + if (plat_dr_enabled()) { + mnode_range_add(PFN_2_MEM_NODE(start)); + } +} + +void +plat_slice_del(pfn_t start, pfn_t end) +{ + ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end)); + ASSERT(plat_dr_enabled()); + mnode_range_del(PFN_2_MEM_NODE(start)); + mem_node_del_slice(start, end); +} +#endif /* __xpv */ + /*ARGSUSED*/ int mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) { - int mtype = mnoderangecnt - 1; + int mtype = mtypetop; #if !defined(__xpv) #if defined(__i386) /* * set the mtype range - * - kmem requests needs to be below 4g if restricted_kmemalloc is set. + * - kmem requests need to be below 4g if restricted_kmemalloc is set. * - for non kmem requests, set range to above 4g if memory below 4g * runs low. */ @@ -1334,8 +1495,8 @@ mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) int mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) { - int mtype = mnoderangecnt - 1; -#if !defined(__ixpv) + int mtype = mtypetop; +#if !defined(__xpv) if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { *flags |= PGI_MT_RANGE16M; } else { @@ -1349,7 +1510,7 @@ mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) /* * Determine if the mnode range specified in mtype contains memory belonging * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains - * the range of indices from high pfn to 0, 16m or 4g. + * the range from high pfn to 0, 16m or 4g. * * Return first mnode range type index found otherwise return -1 if none found. 
*/ @@ -1357,18 +1518,20 @@ int mtype_func(int mnode, int mtype, uint_t flags) { if (flags & PGI_MT_RANGE) { - int mtlim = 0; + int mnr_lim = MRI_0; - if (flags & PGI_MT_NEXT) - mtype--; + if (flags & PGI_MT_NEXT) { + mtype = mnoderanges[mtype].mnr_next; + } if (flags & PGI_MT_RANGE4G) - mtlim = mtype4g + 1; /* exclude 0-4g range */ + mnr_lim = MRI_4G; /* exclude 0-4g range */ else if (flags & PGI_MT_RANGE16M) - mtlim = 1; /* exclude 0-16m range */ - while (mtype >= mtlim) { + mnr_lim = MRI_16M; /* exclude 0-16m range */ + while (mtype != -1 && + mnoderanges[mtype].mnr_memrange <= mnr_lim) { if (mnoderanges[mtype].mnr_mnode == mnode) return (mtype); - mtype--; + mtype = mnoderanges[mtype].mnr_next; } } else if (mnoderanges[mtype].mnr_mnode == mnode) { return (mtype); @@ -1378,34 +1541,40 @@ mtype_func(int mnode, int mtype, uint_t flags) /* * Update the page list max counts with the pfn range specified by the - * input parameters. Called from add_physmem() when physical memory with - * page_t's are initially added to the page lists. + * input parameters. */ void mtype_modify_max(pfn_t startpfn, long cnt) { - int mtype = 0; - pfn_t endpfn = startpfn + cnt, pfn; - pgcnt_t inc; - - ASSERT(cnt > 0); + int mtype; + pgcnt_t inc; + spgcnt_t scnt = (spgcnt_t)(cnt); + pgcnt_t acnt = ABS(scnt); + pfn_t endpfn = startpfn + acnt; + pfn_t pfn, lo; if (!physmax4g) return; - for (pfn = startpfn; pfn < endpfn; ) { - if (pfn <= mnoderanges[mtype].mnr_pfnhi) { - if (endpfn < mnoderanges[mtype].mnr_pfnhi) { - inc = endpfn - pfn; + mtype = mtypetop; + for (pfn = endpfn; pfn > startpfn; ) { + ASSERT(mtype != -1); + lo = mnoderanges[mtype].mnr_pfnlo; + if (pfn > lo) { + if (startpfn >= lo) { + inc = pfn - startpfn; } else { - inc = mnoderanges[mtype].mnr_pfnhi - pfn + 1; + inc = pfn - lo; + } + if (mnoderanges[mtype].mnr_memrange != MRI_4G) { + if (scnt > 0) + maxmem4g += inc; + else + maxmem4g -= inc; } - if (mtype <= mtype4g) - maxmem4g += inc; - pfn += inc; + pfn -= inc; } - mtype++; - ASSERT(mtype < mnoderangecnt || pfn >= endpfn); + mtype = mnoderanges[mtype].mnr_next; } } @@ -1418,6 +1587,7 @@ mtype_2_mrange(int mtype) void mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) { + _NOTE(ARGUNUSED(mnode)); ASSERT(mnoderanges[mtype].mnr_mnode == mnode); *pfnlo = mnoderanges[mtype].mnr_pfnlo; *pfnhi = mnoderanges[mtype].mnr_pfnhi; @@ -1462,6 +1632,7 @@ plcnt_init(caddr_t addr) void plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) { + _NOTE(ARGUNUSED(pp)); #ifdef DEBUG int bin = PP_2_BIN(pp); @@ -1470,7 +1641,7 @@ plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) cnt); #endif ASSERT(mtype == PP_2_MTYPE(pp)); - if (physmax4g && mtype <= mtype4g) + if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G) atomic_add_long(&freemem4g, cnt); if (flags & PG_CACHE_LIST) atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); @@ -1485,7 +1656,7 @@ plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) int mnode_pgcnt(int mnode) { - int mtype = mnoderangecnt - 1; + int mtype = mtypetop; int flags = PGI_MT_RANGE0; pgcnt_t pgcnt = 0; @@ -1505,6 +1676,7 @@ mnode_pgcnt(int mnode) size_t page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) { + _NOTE(ARGUNUSED(l2_linesz)); size_t colorsz = 0; int i; int colors; @@ -1519,14 +1691,19 @@ page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) /* * Reduce the memory ranges lists if we don't have large amounts * of memory. This avoids searching known empty free lists. 
+ * To support memory DR operations, we need to keep memory ranges + * for possible memory hot-add operations. */ - i = memrange_num(physmax); + if (plat_dr_physmax > physmax) + i = memrange_num(plat_dr_physmax); + else + i = memrange_num(physmax); #if defined(__i386) - if (i > 0) + if (i > MRI_4G) restricted_kmemalloc = 0; #endif /* physmax greater than 4g */ - if (i == 0) + if (i == MRI_4G) physmax4g = 1; #endif /* !__xpv */ memranges += i; @@ -1619,6 +1796,16 @@ page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) /* size for mnoderanges */ for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++) mnoderangecnt += mnode_range_cnt(i); + if (plat_dr_support_memory()) { + /* + * Reserve enough space for memory DR operations. + * Two extra mnoderanges for possible fragmentation, + * one for the 2G boundary and the other for the 4G boundary. + * We don't expect a memory board crossing the 16M boundary + * for memory hot-add operations on x86 platforms. + */ + mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt; + } colorsz = mnoderangecnt * sizeof (mnoderange_t); /* size for fpc_mutex and cpc_mutex */ @@ -3122,8 +3309,8 @@ page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, * ordering. */ if (dma_attr == NULL) { - n = 0; - m = mnoderangecnt - 1; + n = mtype16m; + m = mtypetop; fullrange = 1; VM_STAT_ADD(pga_vmstats.pga_nulldmaattr); } else { @@ -3136,6 +3323,10 @@ page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, if (dma_attr->dma_attr_align > MMU_PAGESIZE) return (NULL); + /* Sanity check the dma_attr */ + if (pfnlo > pfnhi) + return (NULL); + n = pfn_2_mtype(pfnlo); m = pfn_2_mtype(pfnhi); @@ -3144,13 +3335,10 @@ page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, } VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange); - if (n > m) - return (NULL); - szc = 0; - /* cylcing thru mtype handled by RANGE0 if n == 0 */ - if (n == 0) { + /* cycling thru mtype handled by RANGE0 if n == mtype16m */ + if (n == mtype16m) { flags |= PGI_MT_RANGE0; n = m; } @@ -3164,7 +3352,8 @@ page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, /* * allocate pages from high pfn to low. */ - for (mtype = m; mtype >= n; mtype--) { + mtype = m; + do { if (fullrange != 0) { pp = page_get_mnode_freelist(mnode, bin, mtype, szc, flags); @@ -3181,7 +3370,8 @@ page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr, check_dma(dma_attr, pp, 1); return (pp); } - } + } while (mtype != n && + (mtype = mnoderanges[mtype].mnr_next) != -1); if (!local_failed_stat) { lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); local_failed_stat = 1; diff --git a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c index f0a257d1fd..8bc46f8e3e 100644 --- a/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c +++ b/usr/src/uts/i86xpv/cpu/generic_cpu/gcpu_poll_xpv.c @@ -161,6 +161,12 @@ gcpu_mca_poll_init(cmi_hdl_t hdl) } } +/* Undo the setup done by gcpu_mca_poll_init(). */ +void +gcpu_mca_poll_fini(cmi_hdl_t hdl) +{ +} + void gcpu_mca_poll_start(cmi_hdl_t hdl) { diff --git a/usr/src/uts/i86xpv/io/psm/xpv_psm.c b/usr/src/uts/i86xpv/io/psm/xpv_psm.c index 07db030ed5..c59d1015a0 100644 --- a/usr/src/uts/i86xpv/io/psm/xpv_psm.c +++ b/usr/src/uts/i86xpv/io/psm/xpv_psm.c @@ -24,7 +24,7 @@ * Use is subject to license terms. 
*/ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/mutex.h> #include <sys/types.h> @@ -1672,7 +1672,8 @@ static struct psm_ops xen_psm_ops = { (void (*)(void *arg))NULL, /* psm_post_cyclic_setup */ (void (*)(int, int))NULL, /* psm_preshutdown */ xen_intr_ops, /* Advanced DDI Interrupt framework */ - (int (*)(psm_state_request_t *))NULL /* psm_state */ + (int (*)(psm_state_request_t *))NULL, /* psm_state */ + (int (*)(psm_cpu_request_t *))NULL /* psm_cpu_ops */ }; static struct psm_info xen_psm_info = { diff --git a/usr/src/uts/i86xpv/io/psm/xpv_uppc.c b/usr/src/uts/i86xpv/io/psm/xpv_uppc.c index 9427ead955..06049ed9e0 100644 --- a/usr/src/uts/i86xpv/io/psm/xpv_uppc.c +++ b/usr/src/uts/i86xpv/io/psm/xpv_uppc.c @@ -24,7 +24,7 @@ * Use is subject to license terms. */ -#define PSMI_1_6 +#define PSMI_1_7 #include <sys/mutex.h> #include <sys/types.h> @@ -891,7 +891,8 @@ static struct psm_ops xen_uppc_ops = { (int (*)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t, int *))NULL, /* psm_intr_ops */ - (int (*)(psm_state_request_t *))NULL /* psm_state */ + (int (*)(psm_state_request_t *))NULL, /* psm_state */ + (int (*)(psm_cpu_request_t *))NULL /* psm_cpu_ops */ }; static struct psm_info xen_uppc_info = { diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index 87e1323772..807cb8f790 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1359,6 +1359,17 @@ fcnname/**/_info: \ END_MODULE(acpica); #endif +/* + * Stubs for acpidev + */ +#ifndef ACPIDEV_MODULE + MODULE(acpidev,misc); + NO_UNLOAD_STUB(acpidev, acpidev_dr_get_cpu_numa_info, nomod_minus_one) ; + NO_UNLOAD_STUB(acpidev, acpidev_dr_free_cpu_numa_info, + nomod_minus_one) ; + END_MODULE(acpidev); +#endif + #ifndef IPNET_MODULE MODULE(ipnet,drv); STUB(ipnet, ipnet_if_getdev, nomod_zero); diff --git a/usr/src/uts/intel/ia32/os/sundep.c b/usr/src/uts/intel/ia32/os/sundep.c index 9aca9f4ea1..13f04658aa 100644 --- a/usr/src/uts/intel/ia32/os/sundep.c +++ b/usr/src/uts/intel/ia32/os/sundep.c @@ -115,9 +115,10 @@ check_boot_version(int boots_version) * 1) the pfn of the highest installed physical page, * 2) the number of pages installed * 3) the number of distinct contiguous regions these pages fall into. + * 4) the number of contiguous memory ranges */ void -installed_top_size( +installed_top_size_ex( struct memlist *list, /* pointer to start of installed list */ pfn_t *high_pfn, /* return ptr for top value */ pgcnt_t *pgcnt, /* return ptr for sum of installed pages */ @@ -141,6 +142,21 @@ installed_top_size( *ranges = cnt; } +void +installed_top_size( + struct memlist *list, /* pointer to start of installed list */ + pfn_t *high_pfn, /* return ptr for top value */ + pgcnt_t *pgcnt) /* return ptr for sum of installed pages */ +{ + int ranges; + + installed_top_size_ex(list, high_pfn, pgcnt, &ranges); +} + +void +phys_install_has_changed(void) +{} + /* * Copy in a memory list from boot to kernel, with a filter function * to remove pages. 
The filter function can increase the address and/or diff --git a/usr/src/uts/intel/io/acpica/acpica.c b/usr/src/uts/intel/io/acpica/acpica.c index 088b1f0eb4..30a8d79be1 100644 --- a/usr/src/uts/intel/io/acpica/acpica.c +++ b/usr/src/uts/intel/io/acpica/acpica.c @@ -36,6 +36,7 @@ #include <sys/modctl.h> #include <sys/open.h> #include <sys/stat.h> +#include <sys/spl.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/esunddi.h> @@ -121,8 +122,11 @@ _init(void) int error = EBUSY; int status; extern int (*acpi_fp_setwake)(); + extern kmutex_t cpu_map_lock; mutex_init(&acpica_module_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&cpu_map_lock, NULL, MUTEX_SPIN, + (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL)); if ((error = mod_install(&modlinkage)) != 0) { mutex_destroy(&acpica_module_lock); diff --git a/usr/src/uts/intel/io/acpica/osl.c b/usr/src/uts/intel/io/acpica/osl.c index 7593bde61a..6cb08ee27e 100644 --- a/usr/src/uts/intel/io/acpica/osl.c +++ b/usr/src/uts/intel/io/acpica/osl.c @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2009, Intel Corporation. + * Copyright (c) 2009-2010, Intel Corporation. * All rights reserved. */ /* @@ -98,7 +98,7 @@ static int scanning_d2a_map = 0; static int d2a_done = 0; /* features supported by ACPICA and ACPI device configuration. */ -uint64_t acpica_core_features = 0; +uint64_t acpica_core_features = ACPI_FEATURE_OSI_MODULE; static uint64_t acpica_devcfg_features = 0; /* set by acpi_poweroff() in PSMs and appm_ioctl() in acpippm for S3 */ @@ -112,7 +112,7 @@ struct cpu_map_item { ACPI_HANDLE obj; }; -static kmutex_t cpu_map_lock; +kmutex_t cpu_map_lock; static struct cpu_map_item **cpu_map = NULL; static int cpu_map_count_max = 0; static int cpu_map_count = 0; @@ -1552,7 +1552,7 @@ acpica_get_handle_cpu(int cpu_id, ACPI_HANDLE *rh) } } if (i < cpu_map_count && (cpu_map[i]->obj != NULL)) { - *rh = cpu_map[cpu_id]->obj; + *rh = cpu_map[i]->obj; mutex_exit(&cpu_map_lock); return (AE_OK); } @@ -1567,7 +1567,7 @@ acpica_get_handle_cpu(int cpu_id, ACPI_HANDLE *rh) } } if (i < cpu_map_count && (cpu_map[i]->obj != NULL)) { - *rh = cpu_map[cpu_id]->obj; + *rh = cpu_map[i]->obj; mutex_exit(&cpu_map_lock); return (AE_OK); } @@ -2209,6 +2209,82 @@ acpica_get_cpu_object_by_apicid(UINT32 apicid, ACPI_HANDLE *hdlp) return (rc); } +ACPI_STATUS +acpica_get_cpu_id_by_object(ACPI_HANDLE hdl, processorid_t *cpuidp) +{ + int i; + ACPI_STATUS rc = AE_NOT_EXIST; + + ASSERT(cpuidp != NULL); + if (hdl == NULL || cpuidp == NULL) { + return (rc); + } + + *cpuidp = -1; + mutex_enter(&cpu_map_lock); + for (i = 0; i < cpu_map_count; i++) { + if (cpu_map[i]->obj == hdl && cpu_map[i]->cpu_id != -1) { + *cpuidp = cpu_map[i]->cpu_id; + rc = AE_OK; + break; + } + } + mutex_exit(&cpu_map_lock); + + return (rc); +} + +ACPI_STATUS +acpica_get_apicid_by_object(ACPI_HANDLE hdl, UINT32 *rp) +{ + int i; + ACPI_STATUS rc = AE_NOT_EXIST; + + ASSERT(rp != NULL); + if (hdl == NULL || rp == NULL) { + return (rc); + } + + *rp = UINT32_MAX; + mutex_enter(&cpu_map_lock); + for (i = 0; i < cpu_map_count; i++) { + if (cpu_map[i]->obj == hdl && + cpu_map[i]->apic_id != UINT32_MAX) { + *rp = cpu_map[i]->apic_id; + rc = AE_OK; + break; + } + } + mutex_exit(&cpu_map_lock); + + return (rc); +} + +ACPI_STATUS +acpica_get_procid_by_object(ACPI_HANDLE hdl, UINT32 *rp) +{ + int i; + ACPI_STATUS rc = AE_NOT_EXIST; + + ASSERT(rp != NULL); + if (hdl == NULL || rp == NULL) { + return (rc); + } + + *rp = UINT32_MAX; + mutex_enter(&cpu_map_lock); + for (i = 0; i < cpu_map_count; i++) { 
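+		/*
+		 * Unlike the cpu_id and apic_id lookups above, any entry
+		 * matching the handle is accepted here: proc_id has no
+		 * reserved "invalid" sentinel to screen out.
+		 */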
+		if (cpu_map[i]->obj == hdl) {
+			*rp = cpu_map[i]->proc_id;
+			rc = AE_OK;
+			break;
+		}
+	}
+	mutex_exit(&cpu_map_lock);
+
+	return (rc);
+}
+
 void
 acpica_set_core_feature(uint64_t features)
 {
diff --git a/usr/src/uts/intel/sys/acpica.h b/usr/src/uts/intel/sys/acpica.h
index 5f74655418..ff55dcb302 100644
--- a/usr/src/uts/intel/sys/acpica.h
+++ b/usr/src/uts/intel/sys/acpica.h
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2009-2010, Intel Corporation.
  * All rights reserved.
  */
 
@@ -133,6 +133,7 @@ typedef struct iflag {
 #define	ACPI_DEVCFG_CPU		0x1
 #define	ACPI_DEVCFG_MEMORY	0x2
 #define	ACPI_DEVCFG_CONTAINER	0x4
+#define	ACPI_DEVCFG_PCI		0x8
 
 /*
  * master_ops.c
@@ -187,6 +188,9 @@ extern ACPI_STATUS acpica_unmap_cpu(processorid_t);
 extern ACPI_STATUS acpica_get_cpu_object_by_cpuid(processorid_t, ACPI_HANDLE *);
 extern ACPI_STATUS acpica_get_cpu_object_by_procid(UINT32, ACPI_HANDLE *);
 extern ACPI_STATUS acpica_get_cpu_object_by_apicid(UINT32, ACPI_HANDLE *);
+extern ACPI_STATUS acpica_get_cpu_id_by_object(ACPI_HANDLE, processorid_t *);
+extern ACPI_STATUS acpica_get_apicid_by_object(ACPI_HANDLE, UINT32 *);
+extern ACPI_STATUS acpica_get_procid_by_object(ACPI_HANDLE, UINT32 *);
 
 extern uint64_t acpica_get_core_feature(uint64_t);
 extern void acpica_set_core_feature(uint64_t);
diff --git a/usr/src/uts/intel/sys/bootconf.h b/usr/src/uts/intel/sys/bootconf.h
index f626fbbb79..4f6c48c395 100644
--- a/usr/src/uts/intel/sys/bootconf.h
+++ b/usr/src/uts/intel/sys/bootconf.h
@@ -50,6 +50,7 @@ extern "C" {
 #define	BP_CPU_APICID_ARRAY	"cpu_apicid_array"
 #define	BP_LGRP_SLIT_ENABLE	"lgrp_slit_enable"
 #define	BP_LGRP_SRAT_ENABLE	"lgrp_srat_enable"
+#define	BP_LGRP_MSCT_ENABLE	"lgrp_msct_enable"
 #define	BP_LGRP_TOPO_LEVELS	"lgrp_topo_levels"
 
 /*
@@ -213,18 +214,6 @@ extern int octet_to_hexascii(const void *, uint_t, char *, uint_t *);
 extern int dhcpinit(void);
 
-/*
- * XXX	The memlist stuff belongs in a header of its own
- */
-extern int check_boot_version(int);
-extern void size_physavail(struct memlist *, pgcnt_t *, int *, pfn_t);
-extern int copy_physavail(struct memlist *, struct memlist **,
-    uint_t, uint_t);
-extern void installed_top_size(struct memlist *, pfn_t *, pgcnt_t *, int *);
-extern int check_memexp(struct memlist *, uint_t);
-extern void copy_memlist_filter(struct memlist *, struct memlist **,
-    void (*filter)(uint64_t *, uint64_t *));
-
 extern struct bootops *bootops;
 extern int netboot;
 extern int swaploaded;
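For context, a minimal user-level sketch of the reverse-lookup pattern the new acpica_get_*_by_object() routines share, and of the indexing fix in acpica_get_handle_cpu() above: the match must be read back through the loop index, not through the searched key, because the map is not indexed by cpu_id. Types and names here (cpu_map_item, cpu_map_lookup) are simplified stand-ins, not the kernel code.

#include <stdio.h>
#include <stddef.h>

typedef void *acpi_handle_t;	/* stand-in for ACPI_HANDLE */

struct cpu_map_item {
	int		cpu_id;
	unsigned	proc_id;
	unsigned	apic_id;
	acpi_handle_t	obj;
};

/*
 * Return 0 and store the cpu_id on success, -1 if the handle is not
 * mapped.  The result is taken from map[i] -- the slot where the match
 * was found -- which is exactly what the hunk above corrects.
 */
static int
cpu_map_lookup(struct cpu_map_item *map, int count, acpi_handle_t hdl,
    int *cpuidp)
{
	int i;

	if (hdl == NULL || cpuidp == NULL)
		return (-1);
	for (i = 0; i < count; i++) {
		if (map[i].obj == hdl && map[i].cpu_id != -1) {
			*cpuidp = map[i].cpu_id;
			return (0);
		}
	}
	return (-1);
}

int
main(void)
{
	int dummy0, dummy1, id;
	struct cpu_map_item map[] = {
		{ 1, 0x10, 0x20, &dummy1 },	/* deliberately unsorted */
		{ 0, 0x00, 0x00, &dummy0 },
	};

	if (cpu_map_lookup(map, 2, &dummy1, &id) == 0)
		printf("handle maps to cpu %d\n", id);	/* prints 1 */
	return (0);
}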
diff --git a/usr/src/uts/intel/sys/cpu_module.h b/usr/src/uts/intel/sys/cpu_module.h
index 9b64b8527a..8d801b07f5 100644
--- a/usr/src/uts/intel/sys/cpu_module.h
+++ b/usr/src/uts/intel/sys/cpu_module.h
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -143,7 +143,6 @@ extern cmi_errno_t cmi_hdl_rdmsr(cmi_hdl_t, uint_t, uint64_t *);
 extern cmi_errno_t cmi_hdl_wrmsr(cmi_hdl_t, uint_t, uint64_t);
 
 extern void cmi_hdl_enable_mce(cmi_hdl_t);
-
 extern uint_t cmi_hdl_vendor(cmi_hdl_t);
 extern const char *cmi_hdl_vendorstr(cmi_hdl_t);
 extern uint_t cmi_hdl_family(cmi_hdl_t);
@@ -172,6 +171,7 @@ extern uint_t cmi_ntv_hwchipid(cpu_t *);
 extern uint_t cmi_ntv_hwprocnodeid(cpu_t *);
 extern uint_t cmi_ntv_hwcoreid(cpu_t *);
 extern uint_t cmi_ntv_hwstrandid(cpu_t *);
+extern void cmi_ntv_hwdisable_mce(cmi_hdl_t);
 #endif	/* __xpv */
 
 typedef struct cmi_mca_regs {
@@ -210,6 +210,7 @@ extern void cmi_mca_trap(struct regs *);
 extern boolean_t cmi_panic_on_ue(void);
 
 extern void cmi_mc_register(cmi_hdl_t, const struct cmi_mc_ops *, void *);
+extern cmi_errno_t cmi_mc_register_global(const struct cmi_mc_ops *, void *);
 extern void cmi_mc_sw_memscrub_disable(void);
 extern cmi_errno_t cmi_mc_patounum(uint64_t, uint8_t, uint8_t, uint32_t, int,
     mc_unum_t *);
diff --git a/usr/src/uts/intel/sys/memlist_plat.h b/usr/src/uts/intel/sys/memlist_plat.h
new file mode 100644
index 0000000000..065f8ed37a
--- /dev/null
+++ b/usr/src/uts/intel/sys/memlist_plat.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef	_SYS_MEMLIST_PLAT_H
+#define	_SYS_MEMLIST_PLAT_H
+
+/*
+ * Boot time configuration information objects
+ */
+
+#include <sys/types.h>
+#include <sys/memlist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int check_boot_version(int);
+extern void copy_memlist_filter(struct memlist *, struct memlist **,
+    void (*filter)(uint64_t *, uint64_t *));
+extern void installed_top_size(struct memlist *, pfn_t *, pgcnt_t *);
+extern void installed_top_size_ex(struct memlist *, pfn_t *, pgcnt_t *, int *);
+extern void phys_install_has_changed(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_MEMLIST_PLAT_H */
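The copy_memlist_filter() prototype moved into this new header takes a filter callback that may adjust each span's address and size while the list is copied. A minimal user-level sketch of that shape (simplified memlist, illustrative names; the kernel version allocates differently and links both directions):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct memlist {			/* simplified; the real one also links prev */
	uint64_t	ml_address;
	uint64_t	ml_size;
	struct memlist	*ml_next;
};

/*
 * Copy src to a freshly allocated list, letting the filter adjust each
 * span; spans filtered down to zero size are dropped.
 */
static struct memlist *
memlist_copy_filter(const struct memlist *src,
    void (*filter)(uint64_t *, uint64_t *))
{
	struct memlist *head = NULL, **tailp = &head;

	for (; src != NULL; src = src->ml_next) {
		uint64_t addr = src->ml_address, size = src->ml_size;

		if (filter != NULL)
			filter(&addr, &size);
		if (size == 0)
			continue;
		struct memlist *m = malloc(sizeof (*m));
		m->ml_address = addr;
		m->ml_size = size;
		m->ml_next = NULL;
		*tailp = m;
		tailp = &m->ml_next;
	}
	return (head);
}

/* Example filter: trim everything below 1 MiB from each span. */
static void
trim_low(uint64_t *addrp, uint64_t *sizep)
{
	const uint64_t low = 1ULL << 20;

	if (*addrp < low) {
		uint64_t cut = low - *addrp;
		*sizep = (*sizep > cut) ? *sizep - cut : 0;
		*addrp = low;
	}
}

int
main(void)
{
	struct memlist a = { 0x0, 0x200000, NULL };	/* 0..2 MiB */
	struct memlist *c = memlist_copy_filter(&a, trim_low);

	printf("%#llx..%#llx\n", (unsigned long long)c->ml_address,
	    (unsigned long long)(c->ml_address + c->ml_size));
	return (0);
}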
diff --git a/usr/src/uts/sun4/os/ddi_impl.c b/usr/src/uts/sun4/os/ddi_impl.c
index d01eabae19..f3efc433c5 100644
--- a/usr/src/uts/sun4/os/ddi_impl.c
+++ b/usr/src/uts/sun4/os/ddi_impl.c
@@ -48,12 +48,8 @@
 #include <sys/systeminfo.h>
 #include <sys/fpu/fpusystm.h>
 #include <sys/vm.h>
-#include <sys/fs/dv_node.h>
-#include <sys/fs/snode.h>
 #include <sys/ddi_isa.h>
-#include <sys/modhash.h>
 #include <sys/modctl.h>
-#include <sys/sunldi_impl.h>
 
 dev_info_t *get_intr_parent(dev_info_t *, dev_info_t *,
     ddi_intr_handle_impl_t *);
@@ -1954,937 +1950,3 @@ peekpoke_mem(ddi_ctl_enum_t cmd, peekpoke_ctlops_t *in_args)
 
 	return (err);
 }
-
-/*
- * Platform independent DR routines
- */
-
-static int
-ndi2errno(int n)
-{
-	int err = 0;
-
-	switch (n) {
-	case NDI_NOMEM:
-		err = ENOMEM;
-		break;
-	case NDI_BUSY:
-		err = EBUSY;
-		break;
-	case NDI_FAULT:
-		err = EFAULT;
-		break;
-	case NDI_FAILURE:
-		err = EIO;
-		break;
-	case NDI_SUCCESS:
-		break;
-	case NDI_BADHANDLE:
-	default:
-		err = EINVAL;
-		break;
-	}
-	return (err);
-}
-
-/*
- * Prom tree node list
- */
-struct ptnode {
-	pnode_t		nodeid;
-	struct ptnode	*next;
-};
-
-/*
- * Prom tree walk arg
- */
-struct pta {
-	dev_info_t	*pdip;
-	devi_branch_t	*bp;
-	uint_t		flags;
-	dev_info_t	*fdip;
-	struct ptnode	*head;
-};
-
-static void
-visit_node(pnode_t nodeid, struct pta *ap)
-{
-	struct ptnode	**nextp;
-	int		(*select)(pnode_t, void *, uint_t);
-
-	ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE);
-
-	select = ap->bp->create.prom_branch_select;
-
-	ASSERT(select);
-
-	if (select(nodeid, ap->bp->arg, 0) == DDI_SUCCESS) {
-
-		for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next)
-			;
-
-		*nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP);
-
-		(*nextp)->nodeid = nodeid;
-	}
-
-	if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD)
-		return;
-
-	nodeid = prom_childnode(nodeid);
-	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
-		visit_node(nodeid, ap);
-		nodeid = prom_nextnode(nodeid);
-	}
-}
-
-/*
- * NOTE: The caller of this function must check for device contracts
- * or LDI callbacks against this dip before setting the dip offline.
- */
-static int
-set_infant_dip_offline(dev_info_t *dip, void *arg)
-{
-	char	*path = (char *)arg;
-
-	ASSERT(dip);
-	ASSERT(arg);
-
-	if (i_ddi_node_state(dip) >= DS_ATTACHED) {
-		(void) ddi_pathname(dip, path);
-		cmn_err(CE_WARN, "Attempt to set offline flag on attached "
-		    "node: %s", path);
-		return (DDI_FAILURE);
-	}
-
-	mutex_enter(&(DEVI(dip)->devi_lock));
-	if (!DEVI_IS_DEVICE_OFFLINE(dip))
-		DEVI_SET_DEVICE_OFFLINE(dip);
-	mutex_exit(&(DEVI(dip)->devi_lock));
-
-	return (DDI_SUCCESS);
-}
-
-typedef struct result {
-	char	*path;
-	int	result;
-} result_t;
-
-static int
-dip_set_offline(dev_info_t *dip, void *arg)
-{
-	int end;
-	result_t *resp = (result_t *)arg;
-
-	ASSERT(dip);
-	ASSERT(resp);
-
-	/*
-	 * We stop the walk if e_ddi_offline_notify() returns
-	 * failure, because this implies that one or more consumers
-	 * (either LDI or contract based) has blocked the offline.
-	 * So there is no point in conitnuing the walk
-	 */
-	if (e_ddi_offline_notify(dip) == DDI_FAILURE) {
-		resp->result = DDI_FAILURE;
-		return (DDI_WALK_TERMINATE);
-	}
-
-	/*
-	 * If set_infant_dip_offline() returns failure, it implies
-	 * that we failed to set a particular dip offline. This
-	 * does not imply that the offline as a whole should fail.
-	 * We want to do the best we can, so we continue the walk.
-	 */
-	if (set_infant_dip_offline(dip, resp->path) == DDI_SUCCESS)
-		end = DDI_SUCCESS;
-	else
-		end = DDI_FAILURE;
-
-	e_ddi_offline_finalize(dip, end);
-
-	return (DDI_WALK_CONTINUE);
-}
-
-/*
- * The call to e_ddi_offline_notify() exists for the
- * unlikely error case that a branch we are trying to
- * create already exists and has device contracts or LDI
- * event callbacks against it.
- *
- * We allow create to succeed for such branches only if
- * no constraints block the offline.
- */
-static int
-branch_set_offline(dev_info_t *dip, char *path)
-{
-	int		circ;
-	int		end;
-	result_t	res;
-
-
-	if (e_ddi_offline_notify(dip) == DDI_FAILURE) {
-		return (DDI_FAILURE);
-	}
-
-	if (set_infant_dip_offline(dip, path) == DDI_SUCCESS)
-		end = DDI_SUCCESS;
-	else
-		end = DDI_FAILURE;
-
-	e_ddi_offline_finalize(dip, end);
-
-	if (end == DDI_FAILURE)
-		return (DDI_FAILURE);
-
-	res.result = DDI_SUCCESS;
-	res.path = path;
-
-	ndi_devi_enter(dip, &circ);
-	ddi_walk_devs(ddi_get_child(dip), dip_set_offline, &res);
-	ndi_devi_exit(dip, circ);
-
-	return (res.result);
-}
-
-/*ARGSUSED*/
-static int
-create_prom_branch(void *arg, int has_changed)
-{
-	int		circ;
-	int		exists, rv;
-	pnode_t		nodeid;
-	struct ptnode	*tnp;
-	dev_info_t	*dip;
-	struct pta	*ap = arg;
-	devi_branch_t	*bp;
-	char		*path;
-
-	ASSERT(ap);
-	ASSERT(ap->fdip == NULL);
-	ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip));
-
-	bp = ap->bp;
-
-	nodeid = ddi_get_nodeid(ap->pdip);
-	if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) {
-		cmn_err(CE_WARN, "create_prom_branch: invalid "
-		    "nodeid: 0x%x", nodeid);
-		return (EINVAL);
-	}
-
-	ap->head = NULL;
-
-	nodeid = prom_childnode(nodeid);
-	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
-		visit_node(nodeid, ap);
-		nodeid = prom_nextnode(nodeid);
-	}
-
-	if (ap->head == NULL)
-		return (ENODEV);
-
-	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-	rv = 0;
-	while ((tnp = ap->head) != NULL) {
-		ap->head = tnp->next;
-
-		ndi_devi_enter(ap->pdip, &circ);
-
-		/*
-		 * Check if the branch already exists.
-		 */
-		exists = 0;
-		dip = e_ddi_nodeid_to_dip(tnp->nodeid);
-		if (dip != NULL) {
-			exists = 1;
-
-			/* Parent is held busy, so release hold */
-			ndi_rele_devi(dip);
-#ifdef DEBUG
-			cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists"
-			    " for nodeid 0x%x", (void *)dip, tnp->nodeid);
-#endif
-		} else {
-			dip = i_ddi_create_branch(ap->pdip, tnp->nodeid);
-		}
-
-		kmem_free(tnp, sizeof (struct ptnode));
-
-		/*
-		 * Hold the branch if it is not already held
-		 */
-		if (dip && !exists) {
-			e_ddi_branch_hold(dip);
-		}
-
-		ASSERT(dip == NULL || e_ddi_branch_held(dip));
-
-		/*
-		 * Set all dips in the newly created branch offline so that
-		 * only a "configure" operation can attach
-		 * the branch
-		 */
-		if (dip == NULL || branch_set_offline(dip, path)
-		    == DDI_FAILURE) {
-			ndi_devi_exit(ap->pdip, circ);
-			rv = EIO;
-			continue;
-		}
-
-		ASSERT(ddi_get_parent(dip) == ap->pdip);
-
-		ndi_devi_exit(ap->pdip, circ);
-
-		if (ap->flags & DEVI_BRANCH_CONFIGURE) {
-			int error = e_ddi_branch_configure(dip, &ap->fdip, 0);
-			if (error && rv == 0)
-				rv = error;
-		}
-
-		/*
-		 * Invoke devi_branch_callback() (if it exists) only for
-		 * newly created branches
-		 */
-		if (bp->devi_branch_callback && !exists)
-			bp->devi_branch_callback(dip, bp->arg, 0);
-	}
-
-	kmem_free(path, MAXPATHLEN);
-
-	return (rv);
-}
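The sid_node_create() routine removed below drives node creation with the DDI_WALK return-code protocol: CONTINUE descends, PRUNECHILD skips children, PRUNESIB stops sibling creation, TERMINATE stops both. A self-contained sketch of that control flow (names and the demo callback are illustrative; the real code creates dev_info nodes rather than printing):

#include <stdio.h>

enum walk { WALK_CONTINUE, WALK_PRUNESIB, WALK_PRUNECHILD,
	WALK_TERMINATE, WALK_ERROR };

static int nodes_left = 5;	/* demo state so the recursion terminates */

static enum walk
demo_cb(int depth)
{
	if (nodes_left-- <= 0)
		return (WALK_TERMINATE);
	printf("created node at depth %d\n", depth);
	return (depth < 2 ? WALK_CONTINUE : WALK_PRUNECHILD);
}

/* Recursive creation in the style of sid_node_create(). */
static enum walk
build(int depth, enum walk (*cb)(int))
{
	enum walk rv, child;

	rv = cb(depth);
	switch (rv) {
	case WALK_CONTINUE:
	case WALK_PRUNESIB:
		/* create children until one prunes its siblings or errors */
		child = WALK_CONTINUE;
		while (child == WALK_CONTINUE)
			child = build(depth + 1, cb);
		if (child == WALK_ERROR)
			rv = child;
		break;
	case WALK_TERMINATE:
		/* no children here, and no further siblings of this node */
		rv = WALK_PRUNESIB;
		break;
	case WALK_PRUNECHILD:
		/* skip children, but let the parent continue with siblings */
		rv = WALK_CONTINUE;
		break;
	default:
		break;
	}
	return (rv);
}

int
main(void)
{
	(void) build(0, demo_cb);
	return (0);
}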
-
-static int
-sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp)
-{
-	int		rv, circ, len;
-	int		i, flags, ret;
-	dev_info_t	*dip;
-	char		*nbuf;
-	char		*path;
-	static const char	*noname = "<none>";
-
-	ASSERT(pdip);
-	ASSERT(DEVI_BUSY_OWNED(pdip));
-
-	flags = 0;
-
-	/*
-	 * Creating the root of a branch ?
-	 */
-	if (rdipp) {
-		*rdipp = NULL;
-		flags = DEVI_BRANCH_ROOT;
-	}
-
-	ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip);
-	rv = bp->create.sid_branch_create(dip, bp->arg, flags);
-
-	nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP);
-
-	if (rv == DDI_WALK_ERROR) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: Error setting"
-		    " properties on devinfo node %p", (void *)dip);
-		goto fail;
-	}
-
-	len = OBP_MAXDRVNAME;
-	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
-	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len)
-	    != DDI_PROP_SUCCESS) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has"
-		    "no name property", (void *)dip);
-		goto fail;
-	}
-
-	ASSERT(i_ddi_node_state(dip) == DS_PROTO);
-	if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)"
-		    " for devinfo node %p", nbuf, (void *)dip);
-		goto fail;
-	}
-
-	kmem_free(nbuf, OBP_MAXDRVNAME);
-
-	/*
-	 * Ignore bind failures just like boot does
-	 */
-	(void) ndi_devi_bind_driver(dip, 0);
-
-	switch (rv) {
-	case DDI_WALK_CONTINUE:
-	case DDI_WALK_PRUNESIB:
-		ndi_devi_enter(dip, &circ);
-
-		i = DDI_WALK_CONTINUE;
-		for (; i == DDI_WALK_CONTINUE; ) {
-			i = sid_node_create(dip, bp, NULL);
-		}
-
-		ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB);
-		if (i == DDI_WALK_ERROR)
-			rv = i;
-		/*
-		 * If PRUNESIB stop creating siblings
-		 * of dip's child. Subsequent walk behavior
-		 * is determined by rv returned by dip.
-		 */
-
-		ndi_devi_exit(dip, circ);
-		break;
-	case DDI_WALK_TERMINATE:
-		/*
-		 * Don't create children and ask our parent
-		 * to not create siblings either.
-		 */
-		rv = DDI_WALK_PRUNESIB;
-		break;
-	case DDI_WALK_PRUNECHILD:
-		/*
-		 * Don't create children, but ask parent to continue
-		 * with siblings.
-		 */
-		rv = DDI_WALK_CONTINUE;
-		break;
-	default:
-		ASSERT(0);
-		break;
-	}
-
-	if (rdipp)
-		*rdipp = dip;
-
-	/*
-	 * Set device offline - only the "configure" op should cause an attach.
-	 * Note that it is safe to set the dip offline without checking
-	 * for either device contract or layered driver (LDI) based constraints
-	 * since there cannot be any contracts or LDI opens of this device.
-	 * This is because this node is a newly created dip with the parent busy
-	 * held, so no other thread can come in and attach this dip. A dip that
-	 * has never been attached cannot have contracts since by definition
-	 * a device contract (an agreement between a process and a device minor
-	 * node) can only be created against a device that has minor nodes
-	 * i.e is attached. Similarly an LDI open will only succeed if the
-	 * dip is attached. We assert below that the dip is not attached.
-	 */
-	ASSERT(i_ddi_node_state(dip) < DS_ATTACHED);
-	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-	ret = set_infant_dip_offline(dip, path);
-	ASSERT(ret == DDI_SUCCESS);
-	kmem_free(path, MAXPATHLEN);
-
-	return (rv);
-fail:
-	(void) ndi_devi_free(dip);
-	kmem_free(nbuf, OBP_MAXDRVNAME);
-	return (DDI_WALK_ERROR);
-}
-
-static int
-create_sid_branch(
-	dev_info_t	*pdip,
-	devi_branch_t	*bp,
-	dev_info_t	**dipp,
-	uint_t		flags)
-{
-	int		rv = 0, state = DDI_WALK_CONTINUE;
-	dev_info_t	*rdip;
-
-	while (state == DDI_WALK_CONTINUE) {
-		int	circ;
-
-		ndi_devi_enter(pdip, &circ);
-
-		state = sid_node_create(pdip, bp, &rdip);
-		if (rdip == NULL) {
-			ndi_devi_exit(pdip, circ);
-			ASSERT(state == DDI_WALK_ERROR);
-			break;
-		}
-
-		e_ddi_branch_hold(rdip);
-
-		ndi_devi_exit(pdip, circ);
-
-		if (flags & DEVI_BRANCH_CONFIGURE) {
-			int error = e_ddi_branch_configure(rdip, dipp, 0);
-			if (error && rv == 0)
-				rv = error;
-		}
-
-		/*
-		 * devi_branch_callback() is optional
-		 */
-		if (bp->devi_branch_callback)
-			bp->devi_branch_callback(rdip, bp->arg, 0);
-	}
-
-	ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB);
-
-	return (state == DDI_WALK_ERROR ? EIO : rv);
-}
-
-int
-e_ddi_branch_create(
-	dev_info_t	*pdip,
-	devi_branch_t	*bp,
-	dev_info_t	**dipp,
-	uint_t		flags)
-{
-	int prom_devi, sid_devi, error;
-
-	if (pdip == NULL || bp == NULL || bp->type == 0)
-		return (EINVAL);
-
-	prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0;
-	sid_devi = (bp->type == DEVI_BRANCH_SID) ? 1 : 0;
-
-	if (prom_devi && bp->create.prom_branch_select == NULL)
-		return (EINVAL);
-	else if (sid_devi && bp->create.sid_branch_create == NULL)
-		return (EINVAL);
-	else if (!prom_devi && !sid_devi)
-		return (EINVAL);
-
-	if (flags & DEVI_BRANCH_EVENT)
-		return (EINVAL);
-
-	if (prom_devi) {
-		struct pta pta = {0};
-
-		pta.pdip = pdip;
-		pta.bp = bp;
-		pta.flags = flags;
-
-		error = prom_tree_access(create_prom_branch, &pta, NULL);
-
-		if (dipp)
-			*dipp = pta.fdip;
-		else if (pta.fdip)
-			ndi_rele_devi(pta.fdip);
-	} else {
-		error = create_sid_branch(pdip, bp, dipp, flags);
-	}
-
-	return (error);
-}
-
-int
-e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags)
-{
-	int		rv;
-	char		*devnm;
-	dev_info_t	*pdip;
-
-	if (dipp)
-		*dipp = NULL;
-
-	if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT))
-		return (EINVAL);
-
-	pdip = ddi_get_parent(rdip);
-
-	ndi_hold_devi(pdip);
-
-	if (!e_ddi_branch_held(rdip)) {
-		ndi_rele_devi(pdip);
-		cmn_err(CE_WARN, "e_ddi_branch_configure: "
-		    "dip(%p) not held", (void *)rdip);
-		return (EINVAL);
-	}
-
-	if (i_ddi_node_state(rdip) < DS_INITIALIZED) {
-		/*
-		 * First attempt to bind a driver. If we fail, return
-		 * success (On some platforms, dips for some device
-		 * types (CPUs) may not have a driver)
-		 */
-		if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) {
-			ndi_rele_devi(pdip);
-			return (0);
-		}
-
-		if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) {
-			rv = NDI_FAILURE;
-			goto out;
-		}
-	}
-
-	ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED);
-
-	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
-
-	(void) ddi_deviname(rdip, devnm);
-
-	if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip,
-	    NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) {
-		/* release hold from ndi_devi_config_one() */
-		ndi_rele_devi(rdip);
-	}
-
-	kmem_free(devnm, MAXNAMELEN + 1);
-out:
-	if (rv != NDI_SUCCESS && dipp && rdip) {
-		ndi_hold_devi(rdip);
-		*dipp = rdip;
-	}
-	ndi_rele_devi(pdip);
-	return (ndi2errno(rv));
-}
-
-void
-e_ddi_branch_hold(dev_info_t *rdip)
-{
-	if (e_ddi_branch_held(rdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held");
-		return;
-	}
-
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) {
-		DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD;
-		DEVI(rdip)->devi_ref++;
-	}
-	ASSERT(DEVI(rdip)->devi_ref > 0);
-	mutex_exit(&DEVI(rdip)->devi_lock);
-}
-
-int
-e_ddi_branch_held(dev_info_t *rdip)
-{
-	int rv = 0;
-
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) &&
-	    DEVI(rdip)->devi_ref > 0) {
-		rv = 1;
-	}
-	mutex_exit(&DEVI(rdip)->devi_lock);
-
-	return (rv);
-}
-void
-e_ddi_branch_rele(dev_info_t *rdip)
-{
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD;
-	DEVI(rdip)->devi_ref--;
-	mutex_exit(&DEVI(rdip)->devi_lock);
-}
-
-int
-e_ddi_branch_unconfigure(
-	dev_info_t *rdip,
-	dev_info_t **dipp,
-	uint_t flags)
-{
-	int	circ, rv;
-	int	destroy;
-	char	*devnm;
-	uint_t	nflags;
-	dev_info_t *pdip;
-
-	if (dipp)
-		*dipp = NULL;
-
-	if (rdip == NULL)
-		return (EINVAL);
-
-	pdip = ddi_get_parent(rdip);
-
-	ASSERT(pdip);
-
-	/*
-	 * Check if caller holds pdip busy - can cause deadlocks during
-	 * devfs_clean()
-	 */
-	if (DEVI_BUSY_OWNED(pdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent"
-		    " devinfo node(%p) is busy held", (void *)pdip);
-		return (EINVAL);
-	}
-
-	destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0;
-
-	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
-
-	ndi_devi_enter(pdip, &circ);
-	(void) ddi_deviname(rdip, devnm);
-	ndi_devi_exit(pdip, circ);
-
-	/*
-	 * ddi_deviname() returns a component name with / prepended.
-	 */
-	(void) devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE);
-
-	ndi_devi_enter(pdip, &circ);
-
-	/*
-	 * Recreate device name as it may have changed state (init/uninit)
-	 * when parent busy lock was dropped for devfs_clean()
-	 */
-	(void) ddi_deviname(rdip, devnm);
-
-	if (!e_ddi_branch_held(rdip)) {
-		kmem_free(devnm, MAXNAMELEN + 1);
-		ndi_devi_exit(pdip, circ);
-		cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held",
-		    destroy ? "destroy" : "unconfigure", (void *)rdip);
-		return (EINVAL);
-	}
-
-	/*
-	 * Release hold on the branch. This is ok since we are holding the
-	 * parent busy. If rdip is not removed, we must do a hold on the
-	 * branch before returning.
-	 */
-	e_ddi_branch_rele(rdip);
-
-	nflags = NDI_DEVI_OFFLINE;
-	if (destroy || (flags & DEVI_BRANCH_DESTROY)) {
-		nflags |= NDI_DEVI_REMOVE;
-		destroy = 1;
-	} else {
-		nflags |= NDI_UNCONFIG;		/* uninit but don't remove */
-	}
-
-	if (flags & DEVI_BRANCH_EVENT)
-		nflags |= NDI_POST_EVENT;
-
-	if (i_ddi_devi_attached(pdip) &&
-	    (i_ddi_node_state(rdip) >= DS_INITIALIZED)) {
-		rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags);
-	} else {
-		rv = e_ddi_devi_unconfig(rdip, dipp, nflags);
-		if (rv == NDI_SUCCESS) {
-			ASSERT(!destroy || ddi_get_child(rdip) == NULL);
-			rv = ndi_devi_offline(rdip, nflags);
-		}
-	}
-
-	if (!destroy || rv != NDI_SUCCESS) {
-		/* The dip still exists, so do a hold */
-		e_ddi_branch_hold(rdip);
-	}
-out:
-	kmem_free(devnm, MAXNAMELEN + 1);
-	ndi_devi_exit(pdip, circ);
-	return (ndi2errno(rv));
-}
-
-int
-e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag)
-{
-	return (e_ddi_branch_unconfigure(rdip, dipp,
-	    flag|DEVI_BRANCH_DESTROY));
-}
-
-/*
- * Number of chains for hash table
- */
-#define	NUMCHAINS	17
-
-/*
- * Devinfo busy arg
- */
-struct devi_busy {
-	int dv_total;
-	int s_total;
-	mod_hash_t *dv_hash;
-	mod_hash_t *s_hash;
-	int (*callback)(dev_info_t *, void *, uint_t);
-	void *arg;
-};
-
-static int
-visit_dip(dev_info_t *dip, void *arg)
-{
-	uintptr_t sbusy, dvbusy, ref;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(bsp->callback);
-
-	/*
-	 * A dip cannot be busy if its reference count is 0
-	 */
-	if ((ref = e_ddi_devi_holdcnt(dip)) == 0) {
-		return (bsp->callback(dip, bsp->arg, 0));
-	}
-
-	if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy))
-		dvbusy = 0;
-
-	/*
-	 * To catch device opens currently maintained on specfs common snodes.
-	 */
-	if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
-		sbusy = 0;
-
-#ifdef	DEBUG
-	if (ref < sbusy || ref < dvbusy) {
-		cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu "
-		    "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref);
-	}
-#endif
-
-	dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy;
-
-	return (bsp->callback(dip, bsp->arg, dvbusy));
-}
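The visit_snode()/visit_dvnode() walkers below accumulate per-device open counts by looking up the current total, adding the new opens, and storing the result back. A user-level sketch of that accumulation idiom, with a tiny fixed table standing in for the kernel's mod_hash (all names illustrative):

#include <stdio.h>
#include <stddef.h>

#define	NBUCKETS	8

struct bucket {
	const void	*key;
	unsigned long	val;
};

static struct bucket tab[NBUCKETS];

static void
count_add(const void *key, unsigned long count)
{
	int i;

	for (i = 0; i < NBUCKETS; i++) {
		if (tab[i].key == key) {	/* existing entry: accumulate */
			tab[i].val += count;
			return;
		}
		if (tab[i].key == NULL) {	/* free slot: first sighting */
			tab[i].key = key;
			tab[i].val = count;
			return;
		}
	}
	fprintf(stderr, "table full\n");	/* a real hash would grow */
}

int
main(void)
{
	int devA, devB;

	count_add(&devA, 2);
	count_add(&devB, 1);
	count_add(&devA, 3);
	printf("devA=%lu devB=%lu\n", tab[0].val, tab[1].val);	/* 5 and 1 */
	return (0);
}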
-
-static int
-visit_snode(struct snode *sp, void *arg)
-{
-	uintptr_t sbusy;
-	dev_info_t *dip;
-	int count;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(sp);
-
-	/*
-	 * The stable lock is held. This prevents
-	 * the snode and its associated dip from
-	 * going away.
-	 */
-	dip = NULL;
-	count = spec_devi_open_count(sp, &dip);
-
-	if (count <= 0)
-		return (DDI_WALK_CONTINUE);
-
-	ASSERT(dip);
-
-	if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
-		sbusy = count;
-	else
-		sbusy += count;
-
-	if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) {
-		cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, "
-		    "sbusy = %lu", "e_ddi_branch_referenced",
-		    (void *)dip, sbusy);
-	}
-
-	bsp->s_total += count;
-
-	return (DDI_WALK_CONTINUE);
-}
-
-static void
-visit_dvnode(struct dv_node *dv, void *arg)
-{
-	uintptr_t dvbusy;
-	uint_t count;
-	struct vnode *vp;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(dv && dv->dv_devi);
-
-	vp = DVTOV(dv);
-
-	mutex_enter(&vp->v_lock);
-	count = vp->v_count;
-	mutex_exit(&vp->v_lock);
-
-	if (!count)
-		return;
-
-	if (mod_hash_remove(bsp->dv_hash, dv->dv_devi,
-	    (mod_hash_val_t *)&dvbusy))
-		dvbusy = count;
-	else
-		dvbusy += count;
-
-	if (mod_hash_insert(bsp->dv_hash, dv->dv_devi,
-	    (mod_hash_val_t)dvbusy)) {
-		cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, "
-		    "dvbusy=%lu", "e_ddi_branch_referenced",
-		    (void *)dv->dv_devi, dvbusy);
-	}
-
-	bsp->dv_total += count;
-}
-
-/*
- * Returns reference count on success or -1 on failure.
- */
-int
-e_ddi_branch_referenced(
-	dev_info_t *rdip,
-	int (*callback)(dev_info_t *dip, void *arg, uint_t ref),
-	void *arg)
-{
-	int circ;
-	char *path;
-	dev_info_t *pdip;
-	struct devi_busy bsa = {0};
-
-	ASSERT(rdip);
-
-	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
-	ndi_hold_devi(rdip);
-
-	pdip = ddi_get_parent(rdip);
-
-	ASSERT(pdip);
-
-	/*
-	 * Check if caller holds pdip busy - can cause deadlocks during
-	 * devfs_walk()
-	 */
-	if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: "
-		    "devinfo branch(%p) not held or parent busy held",
-		    (void *)rdip);
-		ndi_rele_devi(rdip);
-		kmem_free(path, MAXPATHLEN);
-		return (-1);
-	}
-
-	ndi_devi_enter(pdip, &circ);
-	(void) ddi_pathname(rdip, path);
-	ndi_devi_exit(pdip, circ);
-
-	bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS,
-	    mod_hash_null_valdtor, sizeof (struct dev_info));
-
-	bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS,
-	    mod_hash_null_valdtor, sizeof (struct snode));
-
-	if (devfs_walk(path, visit_dvnode, &bsa)) {
-		cmn_err(CE_WARN, "e_ddi_branch_referenced: "
-		    "devfs walk failed for: %s", path);
-		kmem_free(path, MAXPATHLEN);
-		bsa.s_total = bsa.dv_total = -1;
-		goto out;
-	}
-
-	kmem_free(path, MAXPATHLEN);
-
-	/*
-	 * Walk the snode table to detect device opens, which are currently
-	 * maintained on specfs common snodes.
-	 */
-	spec_snode_walk(visit_snode, &bsa);
-
-	if (callback == NULL)
-		goto out;
-
-	bsa.callback = callback;
-	bsa.arg = arg;
-
-	if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) {
-		ndi_devi_enter(rdip, &circ);
-		ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa);
-		ndi_devi_exit(rdip, circ);
-	}
-
-out:
-	ndi_rele_devi(rdip);
-	mod_hash_destroy_ptrhash(bsa.s_hash);
-	mod_hash_destroy_ptrhash(bsa.dv_hash);
-	return (bsa.s_total > bsa.dv_total ? bsa.s_total : bsa.dv_total);
-}
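The e_ddi_branch_hold()/_held()/_rele() trio removed above implements a hold flag that piggybacks on a reference count, so a branch contributes at most one reference no matter how many times hold is attempted. A user-level sketch of that idiom, with pthreads standing in for kernel mutexes (names simplified):

#include <pthread.h>
#include <stdio.h>

#define	BRANCH_HELD	0x1

struct node {
	pthread_mutex_t	lock;
	unsigned	flags;
	int		ref;
};

static void
branch_hold(struct node *np)
{
	pthread_mutex_lock(&np->lock);
	if ((np->flags & BRANCH_HELD) == 0) {	/* idempotent */
		np->flags |= BRANCH_HELD;
		np->ref++;
	}
	pthread_mutex_unlock(&np->lock);
}

static int
branch_held(struct node *np)
{
	int rv;

	pthread_mutex_lock(&np->lock);
	rv = (np->flags & BRANCH_HELD) != 0 && np->ref > 0;
	pthread_mutex_unlock(&np->lock);
	return (rv);
}

static void
branch_rele(struct node *np)
{
	pthread_mutex_lock(&np->lock);
	np->flags &= ~BRANCH_HELD;
	np->ref--;
	pthread_mutex_unlock(&np->lock);
}

int
main(void)
{
	struct node n = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	branch_hold(&n);
	branch_hold(&n);			/* second hold is a no-op */
	printf("held=%d ref=%d\n", branch_held(&n), n.ref);	/* 1 1 */
	branch_rele(&n);
	printf("held=%d ref=%d\n", branch_held(&n), n.ref);	/* 0 0 */
	return (0);
}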
diff --git a/usr/src/uts/sun4u/sys/machsystm.h b/usr/src/uts/sun4u/sys/machsystm.h
index 76e96a56d6..ccacd922b4 100644
--- a/usr/src/uts/sun4u/sys/machsystm.h
+++ b/usr/src/uts/sun4u/sys/machsystm.h
@@ -26,8 +26,6 @@
 #ifndef _SYS_MACHSYSTM_H
 #define	_SYS_MACHSYSTM_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Numerous platform-dependent interfaces that don't seem to belong
  * in any other header file.
@@ -357,46 +355,6 @@ extern uint64_t cpu_pa[];
 extern void ptl1_init_cpu(struct cpu *);
 
 /*
- * Defines for DR interfaces
- */
-#define	DEVI_BRANCH_CHILD	0x01	/* Walk immediate children of root */
-#define	DEVI_BRANCH_CONFIGURE	0x02	/* Configure branch after create */
-#define	DEVI_BRANCH_DESTROY	0x04	/* Destroy branch after unconfigure */
-#define	DEVI_BRANCH_EVENT	0x08	/* Post NDI event */
-#define	DEVI_BRANCH_PROM	0x10	/* Branches derived from PROM nodes */
-#define	DEVI_BRANCH_SID		0x20	/* SID node branches */
-#define	DEVI_BRANCH_ROOT	0x40	/* Node is the root of a branch */
-
-typedef struct devi_branch {
-	void		*arg;
-	void		(*devi_branch_callback)(dev_info_t *, void *, uint_t);
-	int		type;
-	union {
-		int	(*prom_branch_select)(pnode_t, void *, uint_t);
-		int	(*sid_branch_create)(dev_info_t *, void *, uint_t);
-	} create;
-} devi_branch_t;
-
-
-/*
- * Prototypes which really belongs to sunddi.c, and should be moved to
- * sunddi.c if there is another platform using these calls.
- */
-extern int e_ddi_branch_create(dev_info_t *pdip, devi_branch_t *bp,
-    dev_info_t **dipp, uint_t flags);
-extern int e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern int e_ddi_branch_unconfigure(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern int e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern void e_ddi_branch_hold(dev_info_t *rdip);
-extern void e_ddi_branch_rele(dev_info_t *rdip);
-extern int e_ddi_branch_held(dev_info_t *rdip);
-extern int e_ddi_branch_referenced(dev_info_t *rdip,
-    int (*cb)(dev_info_t *dip, void *, uint_t), void *arg);
-
-/*
  * Constants which define the "hole" in the 64-bit sfmmu address space.
  * These are set to specific values by the CPU module code.
  */
diff --git a/usr/src/uts/sun4v/sys/machsystm.h b/usr/src/uts/sun4v/sys/machsystm.h
index 832a634d0f..d9c795ecf9 100644
--- a/usr/src/uts/sun4v/sys/machsystm.h
+++ b/usr/src/uts/sun4v/sys/machsystm.h
@@ -341,46 +341,6 @@ extern uint64_t cpu_pa[];
 extern void ptl1_init_cpu(struct cpu *);
 
 /*
- * Defines for DR interfaces
- */
-#define	DEVI_BRANCH_CHILD	0x01	/* Walk immediate children of root */
-#define	DEVI_BRANCH_CONFIGURE	0x02	/* Configure branch after create */
-#define	DEVI_BRANCH_DESTROY	0x04	/* Destroy branch after unconfigure */
-#define	DEVI_BRANCH_EVENT	0x08	/* Post NDI event */
-#define	DEVI_BRANCH_PROM	0x10	/* Branches derived from PROM nodes */
-#define	DEVI_BRANCH_SID		0x20	/* SID node branches */
-#define	DEVI_BRANCH_ROOT	0x40	/* Node is the root of a branch */
-
-typedef struct devi_branch {
-	void		*arg;
-	void		(*devi_branch_callback)(dev_info_t *, void *, uint_t);
-	int		type;
-	union {
-		int	(*prom_branch_select)(pnode_t, void *, uint_t);
-		int	(*sid_branch_create)(dev_info_t *, void *, uint_t);
-	} create;
-} devi_branch_t;
-
-
-/*
- * Prototypes which really belongs to sunddi.c, and should be moved to
- * sunddi.c if there is another platform using these calls.
- */
-extern int e_ddi_branch_create(dev_info_t *pdip, devi_branch_t *bp,
-    dev_info_t **dipp, uint_t flags);
-extern int e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern int e_ddi_branch_unconfigure(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern int e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp,
-    uint_t flags);
-extern void e_ddi_branch_hold(dev_info_t *rdip);
-extern void e_ddi_branch_rele(dev_info_t *rdip);
-extern int e_ddi_branch_held(dev_info_t *rdip);
-extern int e_ddi_branch_referenced(dev_info_t *rdip,
-    int (*cb)(dev_info_t *dip, void *, uint_t), void *arg);
-
-/*
  * Constants which define the "hole" in the 64-bit sfmmu address space.
  * These are set to specific values by the CPU module code.
  */
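The devi_branch_t removed from both machsystm.h copies is a tagged union: the type field selects which callback member is valid, and e_ddi_branch_create() validates the pair before dispatching. A simplified, self-contained sketch of that shape (all types and names here are stand-ins, not the kernel interface):

#include <stdio.h>

#define	BRANCH_PROM	0x10
#define	BRANCH_SID	0x20

typedef struct branch {
	void	*arg;
	int	type;
	union {
		int	(*prom_select)(int nodeid, void *arg);
		int	(*sid_create)(void *dip, void *arg);
	} create;
} branch_t;

static int
branch_create(branch_t *bp)
{
	if (bp == NULL || bp->type == 0)
		return (-1);
	/* the tag must match a non-NULL callback before dispatch */
	if (bp->type == BRANCH_PROM && bp->create.prom_select == NULL)
		return (-1);
	if (bp->type == BRANCH_SID && bp->create.sid_create == NULL)
		return (-1);

	if (bp->type == BRANCH_PROM)
		return (bp->create.prom_select(0, bp->arg));
	else if (bp->type == BRANCH_SID)
		return (bp->create.sid_create(NULL, bp->arg));
	return (-1);
}

static int
pick_all(int nodeid, void *arg)
{
	(void) nodeid;
	(void) arg;
	printf("prom branch selected\n");
	return (0);
}

int
main(void)
{
	branch_t b = { NULL, BRANCH_PROM, { .prom_select = pick_all } };

	return (branch_create(&b));
}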