diff options
Diffstat (limited to 'usr/src')
27 files changed, 2819 insertions, 52 deletions
diff --git a/usr/src/cmd/devfsadm/Makefile.com b/usr/src/cmd/devfsadm/Makefile.com index b446b148ff..cec58108c8 100644 --- a/usr/src/cmd/devfsadm/Makefile.com +++ b/usr/src/cmd/devfsadm/Makefile.com @@ -21,7 +21,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2019, Joyent, Inc. # This target builds both a command (daemon) and various shared objects. This # isn't a typical target, and the inclusion of both library and command @@ -71,7 +71,8 @@ LINK_OBJS_CMN = \ dtrace_link.o \ vscan_link.o \ zfs_link.o \ - zut_link.o + zut_link.o \ + sensor_link.o LINK_OBJS = $(LINK_OBJS_CMN) \ $(LINK_OBJS_$(MACH)) @@ -164,7 +165,7 @@ install: all \ clean: - $(RM) $(OBJS) + $(RM) $(OBJS) lint: $(DEVFSADM_MOD).ln $(LINT_MODULES) diff --git a/usr/src/cmd/devfsadm/devfsadm.c b/usr/src/cmd/devfsadm/devfsadm.c index f81d5b5d67..52f4f4c0da 100644 --- a/usr/src/cmd/devfsadm/devfsadm.c +++ b/usr/src/cmd/devfsadm/devfsadm.c @@ -23,6 +23,7 @@ * Copyright 2016 Toomas Soome <tsoome@me.com> * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019, Joyent, Inc. */ /* @@ -2073,6 +2074,16 @@ class_ok(char *class) return (DEVFSADM_SUCCESS); } + /* + * Some create tabs operate on multiple classes of devices because the + * kernel doesn't have a good way for a driver to indicate that a + * particular minor's class is different from that of the dev_info_t + * it belongs to. As such, we'll always fail to match those here. + */ + if (class == NULL) { + return (DEVFSADM_FAILURE); + } + for (i = 0; i < num_classes; i++) { if (strcmp(class, classes[i]) == 0) { return (DEVFSADM_SUCCESS); @@ -3717,10 +3728,10 @@ do_inst_sync(char *filename, char *instfilename) * safely, the database is flushed to a temporary file, then moved into place. 
* * The following files are used during this process: - * /etc/path_to_inst: The path_to_inst file - * /etc/path_to_inst.<pid>: Contains data flushed from the kernel - * /etc/path_to_inst.old: The backup file - * /etc/path_to_inst.old.<pid>: Temp file for creating backup + * /etc/path_to_inst: The path_to_inst file + * /etc/path_to_inst.<pid>: Contains data flushed from the kernel + * /etc/path_to_inst.old: The backup file + * /etc/path_to_inst.old.<pid>: Temp file for creating backup * */ static void @@ -7803,7 +7814,7 @@ add_verbose_id(char *mid) * returns DEVFSADM_TRUE if contents is a minor node in /devices. * If mn_root is not NULL, mn_root is set to: * if contents is a /dev node, mn_root = contents - * OR + * OR * if contents is a /devices node, mn_root set to the '/' * following /devices. */ diff --git a/usr/src/cmd/devfsadm/sensor_link.c b/usr/src/cmd/devfsadm/sensor_link.c new file mode 100644 index 0000000000..7a2b48af75 --- /dev/null +++ b/usr/src/cmd/devfsadm/sensor_link.c @@ -0,0 +1,79 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* + * Create /dev links for various sensors. The sensor series of node types + * all begin with ddi_sensor. After which, there is a series of : delimited + * paths in the node type. Those represent the directory under /dev/sensors in which + * the nodes should ultimately be created. + * + * For example, ddi_sensor:temperature:cpu would cause us to place the named + * minor under /dev/sensors/temperature/cpu/. 
Currently it is up to drivers to + * not conflict in names or if there is a fear of conflicting, make sure their + * minor is unique. + */ + +#include <devfsadm.h> +#include <string.h> + +#define SENSORS_BASE "sensors" + +static int +sensor_link(di_minor_t minor, di_node_t node) +{ + const char *t, *minor_name, *dir_path = NULL; + char *type, *c; + char buf[PATH_MAX]; + size_t len; + + if ((t = di_minor_nodetype(minor)) == NULL) { + return (DEVFSADM_CONTINUE); + } + + if ((minor_name = di_minor_name(minor)) == NULL) { + return (DEVFSADM_CONTINUE); + } + + if ((type = strdup(t)) == NULL) { + return (DEVFSADM_TERMINATE); + } + + while ((c = strchr(type, ':')) != NULL) { + if (dir_path == NULL) { + dir_path = c + 1; + } + *c = '/'; + } + + if (dir_path == NULL || *dir_path == '\0') { + len = snprintf(buf, sizeof (buf), "%s/%s", SENSORS_BASE, + minor_name); + } else { + len = snprintf(buf, sizeof (buf), "%s/%s/%s", SENSORS_BASE, + dir_path, minor_name); + } + + if (len < sizeof (buf)) { + (void) devfsadm_mklink(buf, node, minor, 0); + } + + free(type); + return (DEVFSADM_CONTINUE); +} + +static devfsadm_create_t sensor_create_cbt[] = { + { NULL, "ddi_sensor", NULL, TYPE_PARTIAL, ILEVEL_0, sensor_link } +}; +DEVFSADM_CREATE_INIT_V0(sensor_create_cbt); diff --git a/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.c b/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.c new file mode 100644 index 0000000000..28fcf3e314 --- /dev/null +++ b/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.c @@ -0,0 +1,261 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. 
+ */ + +/* + * This file provides routines to interact with the kernel sensor framework. + * Currently, modules that require interacting with a kernel sensor need to + * build this file as part of the module. This takes care of all the work of + * setting up and creating the temperature sensor, given a path to that sensor. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <libnvpair.h> +#include <sys/sensors.h> +#include <sys/fm/protocol.h> +#include <fm/topo_mod.h> + +#define TOPO_METH_TOPO_SENSOR_TEMP "topo_sensor_temp_reading" +#define TOPO_METH_TOPO_SENSOR_TEMP_DESC "Kernel Temperature Reading" +#define TOPO_METH_TOPO_SENSOR_TEMP_VERSION 0 + +static int +topo_sensor_temp_read(topo_mod_t *mod, tnode_t *node, topo_version_t vers, + nvlist_t *in, nvlist_t **out) +{ + int fd = -1, ret; + nvlist_t *args, *nvl; + char *path; + sensor_ioctl_temperature_t temp; + double degrees; + + if (vers != TOPO_METH_TOPO_SENSOR_TEMP_VERSION) { + return (topo_mod_seterrno(mod, ETOPO_METHOD_VERNEW)); + } + + if (nvlist_lookup_nvlist(in, TOPO_PROP_ARGS, &args) != 0 || + nvlist_lookup_string(args, TOPO_IO_DEV_PATH, &path) != 0) { + topo_mod_dprintf(mod, "failed to lookup sensor path from " + "property %s", TOPO_IO_DEV_PATH); + return (topo_mod_seterrno(mod, EMOD_NVL_INVAL)); + } + + if ((fd = open(path, O_RDONLY)) < 0) { + topo_mod_dprintf(mod, "failed to open sensor path %s: %s", + path, strerror(errno)); + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + + (void) memset(&temp, '\0', sizeof (temp)); + if (ioctl(fd, SENSOR_IOCTL_TEMPERATURE, &temp) != 0) { + topo_mod_dprintf(mod, "failed to read temperature sensor " + "%s: %s", path, strerror(errno)); + ret = topo_mod_seterrno(mod, EMOD_UNKNOWN); + goto out; + } + + /* + * Check to see if we need to change the value to get it into an + * accurate reading. 
Positive values indicate that the temperature + * reading is in a fractional number of degrees and that each degree + * contains temp.sit_gran steps. A negative number means that the + * temperature reading represents temp.sit_gran degrees. + */ + degrees = (double)temp.sit_temp; + if (temp.sit_gran > 1) { + degrees /= (double)temp.sit_gran; + } else if (temp.sit_gran < -1) { + degrees *= (double)labs(temp.sit_gran); + } + + if (topo_mod_nvalloc(mod, &nvl, NV_UNIQUE_NAME) != 0) { + topo_mod_dprintf(mod, "failed to allocate output temperature " + "nvl"); + ret = topo_mod_seterrno(mod, EMOD_NOMEM); + goto out; + } + + if (nvlist_add_string(nvl, TOPO_PROP_VAL_NAME, TOPO_SENSOR_READING) != + 0 || + nvlist_add_uint32(nvl, TOPO_PROP_VAL_TYPE, TOPO_TYPE_DOUBLE) != 0 || + nvlist_add_double(nvl, TOPO_PROP_VAL_VAL, degrees) != 0) { + topo_mod_dprintf(mod, "failed to add members to output " + "temperature nvlist"); + nvlist_free(nvl); + ret = topo_mod_seterrno(mod, EMOD_NOMEM); + goto out; + } + + *out = nvl; + ret = 0; +out: + if (fd >= 0) { + (void) close(fd); + } + return (ret); +} + +static const topo_method_t topo_sensor_temp_fac_methods[] = { + { TOPO_METH_TOPO_SENSOR_TEMP, TOPO_METH_TOPO_SENSOR_TEMP_DESC, + TOPO_METH_TOPO_SENSOR_TEMP_VERSION, TOPO_STABILITY_INTERNAL, + topo_sensor_temp_read }, + { NULL } +}; + +static topo_sensor_unit_t +topo_sensor_units(const sensor_ioctl_temperature_t *temp) +{ + switch (temp->sit_unit) { + case SENSOR_UNIT_CELSIUS: + return (TOPO_SENSOR_UNITS_DEGREES_C); + case SENSOR_UNIT_FAHRENHEIT: + return (TOPO_SENSOR_UNITS_DEGREES_F); + case SENSOR_UNIT_KELVIN: + return (TOPO_SENSOR_UNITS_DEGREES_K); + default: + return (TOPO_SENSOR_UNITS_UNSPECIFIED); + } +} + +int +topo_sensor_create_temp_sensor(topo_mod_t *mod, tnode_t *pnode, + const char *path, const char *fname) +{ + int fd, ret, err; + sensor_ioctl_kind_t sik; + sensor_ioctl_temperature_t temp; + tnode_t *fnode = NULL; + topo_pgroup_info_t pgi; + nvlist_t *reader_arg = NULL; + + 
topo_mod_dprintf(mod, "attempting to create sensor for %s at %s", + topo_node_name(pnode), path); + + (void) memset(&sik, '\0', sizeof (sik)); + (void) memset(&temp, '\0', sizeof (temp)); + + if ((fd = open(path, O_RDONLY)) < 0) { + topo_mod_dprintf(mod, "failed to open sensor path %s: %s", + path, strerror(errno)); + + /* + * We always try to create temperature sensors; however, they + * may not exist or be supported on the system in question. + * Therefore ENOENT is totally acceptable. + */ + if (errno == ENOENT) { + return (0); + } + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + + if (ioctl(fd, SENSOR_IOCTL_TYPE, &sik) != 0) { + topo_mod_dprintf(mod, "failed to verify sensor kind for sensor " + "%s: %s", path, strerror(errno)); + ret = topo_mod_seterrno(mod, EMOD_UNKNOWN); + goto out; + } + + if (sik.sik_kind != SENSOR_KIND_TEMPERATURE) { + topo_mod_dprintf(mod, "sensor kind for %s is not temperature, " + "found 0x%x", path, sik.sik_kind); + ret = topo_mod_seterrno(mod, EMOD_UNKNOWN); + goto out; + } + + if (ioctl(fd, SENSOR_IOCTL_TEMPERATURE, &temp) != 0) { + topo_mod_dprintf(mod, "failed to read temperature sensor " + "%s: %s", path, strerror(errno)); + ret = topo_mod_seterrno(mod, EMOD_UNKNOWN); + goto out; + } + + (void) close(fd); + fd = -1; + + if ((fnode = topo_node_facbind(mod, pnode, fname, + TOPO_FAC_TYPE_SENSOR)) == NULL) { + topo_mod_dprintf(mod, "failed to bind temperature facility " + "node to %s: %d", path, topo_mod_errno(mod)); + ret = -1; + goto out; + } + + pgi.tpi_name = TOPO_PGROUP_FACILITY; + pgi.tpi_namestab = TOPO_STABILITY_PRIVATE; + pgi.tpi_datastab = TOPO_STABILITY_PRIVATE; + pgi.tpi_version = 1; + + if (topo_pgroup_create(fnode, &pgi, &err) != 0) { + topo_mod_dprintf(mod, "failed to create facility pgroup: %s", + topo_strerror(err)); + ret = topo_mod_seterrno(mod, err); + goto out; + } + + if (topo_prop_set_string(fnode, TOPO_PGROUP_FACILITY, + TOPO_SENSOR_CLASS, TOPO_PROP_IMMUTABLE, + TOPO_SENSOR_CLASS_THRESHOLD, &err) != 0 || 
+ topo_prop_set_uint32(fnode, TOPO_PGROUP_FACILITY, + TOPO_FACILITY_TYPE, TOPO_PROP_IMMUTABLE, TOPO_SENSOR_TYPE_TEMP, + &err) != 0 || + topo_prop_set_uint32(fnode, TOPO_PGROUP_FACILITY, + TOPO_SENSOR_UNITS, TOPO_PROP_IMMUTABLE, topo_sensor_units(&temp), + &err) != 0) { + topo_mod_dprintf(mod, "failed to set properties for sensor " + "%s: %s", path, topo_strerror(err)); + ret = topo_mod_seterrno(mod, err); + goto out; + + } + + if (topo_method_register(mod, fnode, topo_sensor_temp_fac_methods) < 0) { + topo_mod_dprintf(mod, "failed to register reading methods on " + "%s", path); + ret = -1; + goto out; + } + + if (topo_mod_nvalloc(mod, &reader_arg, NV_UNIQUE_NAME) != 0 || + nvlist_add_string(reader_arg, TOPO_IO_DEV_PATH, path) != 0) { + topo_mod_dprintf(mod, "Failed to set up reader argument nvl"); + ret = topo_mod_seterrno(mod, EMOD_NOMEM); + goto out; + } + + if (topo_prop_method_register(fnode, TOPO_PGROUP_FACILITY, + TOPO_SENSOR_READING, TOPO_TYPE_DOUBLE, TOPO_METH_TOPO_SENSOR_TEMP, + reader_arg, &err) != 0) { + topo_mod_dprintf(mod, "failed to set argument for sensor %s: " + "%s", path, topo_strerror(err)); + ret = topo_mod_seterrno(mod, err); + goto out; + } + + nvlist_free(reader_arg); + return (0); +out: + if (fd >= 0) { + (void) close(fd); + } + + topo_node_unbind(fnode); + nvlist_free(reader_arg); + return (ret); +} diff --git a/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.h b/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.h new file mode 100644 index 0000000000..ff6e1ea92e --- /dev/null +++ b/usr/src/lib/fm/topo/modules/common/shared/topo_sensor.h @@ -0,0 +1,34 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +#ifndef _TOPO_SENSOR_H +#define _TOPO_SENSOR_H + +/* + * Routines to interact with the common kernel sensor framework. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int topo_sensor_create_temp_sensor(topo_mod_t *, tnode_t *, const char *, + const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _TOPO_SENSOR_H */ diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/Makefile b/usr/src/lib/fm/topo/modules/i86pc/chip/Makefile index 3da69e6ce2..f56686faf1 100644 --- a/usr/src/lib/fm/topo/modules/i86pc/chip/Makefile +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/Makefile @@ -22,16 +22,22 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2019, Joyent, Inc. MODULE = chip ARCH = i86pc CLASS = arch +SHAREDDIR = ../../common/shared/ + MODULESRCS = chip.c chip_label.c chip_subr.c chip_amd.c chip_intel.c\ -chip_serial.c chip_smbios.c +chip_serial.c chip_smbios.c chip_temp.o +MODULESRCS += topo_sensor.c include ../../Makefile.plugin LDLIBS += -lipmi -lfmd_agent -lumem -lsmbios -lkstat -# not linted -SMATCH=off +CPPFLAGS += -I$(SHAREDDIR) + +%.o: $(SHAREDDIR)/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/chip.c b/usr/src/lib/fm/topo/modules/i86pc/chip/chip.c index cdd799cc0b..c81f01c3e9 100644 --- a/usr/src/lib/fm/topo/modules/i86pc/chip/chip.c +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/chip.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019, Joyent, Inc. 
*/ #include <unistd.h> @@ -403,6 +403,13 @@ create_core(topo_mod_t *mod, tnode_t *pnode, nvlist_t *cpu, if (topo_node_range_create(mod, core, STRAND_NODE_NAME, 0, 255) != 0) return (-1); + + /* + * Creating a temperature sensor may fail because the sensor + * doesn't exist or due to internal reasons. At the moment, we + * swallow any such errors that occur. + */ + (void) chip_create_core_temp_sensor(mod, core); } if (!is_xpv()) { @@ -644,6 +651,13 @@ create_chip(topo_mod_t *mod, tnode_t *pnode, topo_instance_t min, } create_mc = B_TRUE; + + /* + * Creating a temperature sensor may fail because the sensor + * doesn't exist or due to internal reasons. At the moment, we + * swallow any such errors that occur. + */ + (void) chip_create_chip_temp_sensor(mod, chip); } if (FM_AWARE_SMBIOS(mod)) { diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/chip.h b/usr/src/lib/fm/topo/modules/i86pc/chip/chip.h index 8b5ad3b88f..b4fd850996 100644 --- a/usr/src/lib/fm/topo/modules/i86pc/chip/chip.h +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/chip.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019, Joyent, Inc. */ #ifndef _CHIP_H @@ -221,6 +221,11 @@ extern const char *chip_rev_smbios_get(topo_mod_t *, id_t); extern id_t memnode_to_smbiosid(topo_mod_t *, uint16_t, const char *, uint64_t, void *); +/* + * Prototypes for chip_temp.c + */ +extern int chip_create_chip_temp_sensor(topo_mod_t *, tnode_t *); +extern int chip_create_core_temp_sensor(topo_mod_t *, tnode_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/chip_label.c b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_label.c index 67c35058c3..4275bc46f6 100644 --- a/usr/src/lib/fm/topo/modules/i86pc/chip/chip_label.c +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_label.c @@ -22,6 +22,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * + * Copyright 2019, Joyent, Inc. */ #include <stdio.h> @@ -561,8 +563,8 @@ get_num_chips(topo_mod_t *mod) nchip = -1; break; } - if ((bitmap & (1 << chipid)) != 0) { - bitmap |= (1 << chipid); + if ((bitmap & (1ULL << chipid)) != 0) { + bitmap |= (1ULL << chipid); nchip++; } } @@ -660,7 +662,7 @@ a4fplus_chip_label(topo_mod_t *mod, tnode_t *node, topo_version_t vers, * * This function computes the DIMM slot number using the following formula: * - * slot = cs - (cs % 2) + channel + offset + * slot = cs - (cs % 2) + channel + offset */ /* ARGSUSED */ int diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/chip_subr.c b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_subr.c index a83f31dbb4..53fd7852ef 100644 --- a/usr/src/lib/fm/topo/modules/i86pc/chip/chip_subr.c +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_subr.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019, Joyent, Inc. */ /* @@ -230,7 +230,7 @@ mkrsrc(topo_mod_t *mod, tnode_t *pnode, const char *name, int inst, { *nvl = topo_mod_hcfmri(mod, pnode, FM_HC_SCHEME_VERSION, name, inst, NULL, auth, NULL, NULL, NULL); - return (nvl != NULL ? 0 : -1); /* caller must free nvlist */ + return (*nvl != NULL ? 0 : -1); /* caller must free nvlist */ } /* diff --git a/usr/src/lib/fm/topo/modules/i86pc/chip/chip_temp.c b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_temp.c new file mode 100644 index 0000000000..89f8d57fb6 --- /dev/null +++ b/usr/src/lib/fm/topo/modules/i86pc/chip/chip_temp.c @@ -0,0 +1,91 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <libnvpair.h> +#include <sys/sensors.h> +#include <sys/fm/protocol.h> +#include <fm/topo_mod.h> +#include <topo_sensor.h> + +#include "chip.h" + +static const char *chip_sensor_base = "/dev/sensors/temperature/cpu"; + +int +chip_create_core_temp_sensor(topo_mod_t *mod, tnode_t *pnode) +{ + int err; + int32_t chip, core; + char buf[PATH_MAX]; + struct stat st; + + core = topo_node_instance(pnode); + if (topo_prop_get_int32(pnode, PGNAME(CORE), CORE_CHIP_ID, &chip, + &err) != 0) { + return (topo_mod_seterrno(mod, err)); + } + + if (snprintf(buf, sizeof (buf), "%s/chip%d.core%d", chip_sensor_base, + chip, core) >= sizeof (buf)) { + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + + /* + * Some systems have per-core sensors. Others have it on a per-die aka + * procnode basis. Check to see if the file exists before we attempt to + * do something. 
+ */ + if (stat(buf, &st) != 0) { + int32_t procnode; + + if (errno != ENOENT) { + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + + if (topo_prop_get_int32(pnode, PGNAME(CORE), CORE_PROCNODE_ID, + &procnode, &err) != 0) { + return (topo_mod_seterrno(mod, err)); + } + + if (snprintf(buf, sizeof (buf), "%s/procnode.%d", + chip_sensor_base, procnode) >= sizeof (buf)) { + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + } + + return (topo_sensor_create_temp_sensor(mod, pnode, buf, "temp")); +} + +int +chip_create_chip_temp_sensor(topo_mod_t *mod, tnode_t *pnode) +{ + int32_t chip; + char buf[PATH_MAX]; + + chip = topo_node_instance(pnode); + + if (snprintf(buf, sizeof (buf), "%s/chip%d", chip_sensor_base, + chip) >= sizeof (buf)) { + return (topo_mod_seterrno(mod, EMOD_UNKNOWN)); + } + + return (topo_sensor_create_temp_sensor(mod, pnode, buf, "temp")); +} diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile index c984ee588a..6ad3c6b28f 100644 --- a/usr/src/man/man7d/Makefile +++ b/usr/src/man/man7d/Makefile @@ -12,7 +12,7 @@ # # Copyright 2011, Richard Lowe # Copyright 2016 Garrett D'Amore <garrett@damore.org> -# Copyright (c) 2017, Joyent, Inc. +# Copyright 2019, Joyent, Inc. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2018 Nexenta Systems, Inc. # Copyright 2019 Peter Tribble @@ -20,10 +20,11 @@ include $(SRC)/Makefile.master -MANSECT= 7d +MANSECT= 7d _MANFILES= aac.7d \ afe.7d \ + amdf17nbdf.7d \ audio.7d \ audio1575.7d \ audioens.7d \ @@ -38,6 +39,7 @@ _MANFILES= aac.7d \ bscv.7d \ chxge.7d \ console.7d \ + coretemp.7d \ cpuid.7d \ dca.7d \ dcam1394.7d \ diff --git a/usr/src/man/man7d/amdf17nbdf.7d b/usr/src/man/man7d/amdf17nbdf.7d new file mode 100644 index 0000000000..739eab6c82 --- /dev/null +++ b/usr/src/man/man7d/amdf17nbdf.7d @@ -0,0 +1,53 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. 
+.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2019, Joyent, Inc. +.\" +.Dd March 20, 2019 +.Dt AMDF17NBDF 7D +.Os +.Sh NAME +.Nm amdf17nbdf +.Nd AMD Family 17h Northbridge and Data Fabric Driver +.Sh SYNOPSIS +.Pa /dev/sensors/temperature/cpu/* +.Sh DESCRIPTION +The +.Nm +driver provides the system access to the Northbridge and Data Fabric +devices on AMD Family 17h +.Pq Zen +processors allowing the operating system to communicate with the system +management unit +.Pq SMU . +.Pp +From this, the driver exposes temperature sensors. +On Family 17h systems, temperature sensors exist for each Zeppelin die, +of which there may be multiple in a single package. +This means that each sensor covers more than one core. +.Pp +Temperature information is available to the system via the fault +management architecture +.Pq FMA . +The file system location and programming interface to the +.Nm +driver are considered +.Sy Volatile , +subject to change without notice, and should not be used directly. +Raw temperature information can be dumped through the FMA developer +utility fmtopo. +.Sh SEE ALSO +.Xr fmadm 1M +.Rs +.%A AMD +.%B Open-Source Register Reference For AMD Family 17h Processors Models 00h-2Fh +.%D July, 2018 +.Re diff --git a/usr/src/man/man7d/coretemp.7d b/usr/src/man/man7d/coretemp.7d new file mode 100644 index 0000000000..2ac1008e55 --- /dev/null +++ b/usr/src/man/man7d/coretemp.7d @@ -0,0 +1,49 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. 
+.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2019, Joyent, Inc. +.\" +.Dd March 20, 2019 +.Dt CORETEMP 7D +.Os +.Sh NAME +.Nm coretemp +.Nd Intel core-family temperature sensor driver +.Sh SYNOPSIS +.Pa /dev/sensors/temperature/cpu/* +.Sh DESCRIPTION +The +.Nm +driver provides the system with a means of reading the per-core and, +when available, per-package digital temperature sensors on Intel CPUs. +Currently, the +.Nm +driver supports Intel Core family processors after Penryn +microarchitecture and Intel Atom processors starting with the Silvermont +microarchitecture. +.Pp +Temperature information is available to the system via the fault +management architecture +.Pq FMA . +The file system location and programming interface to the +.Nm +driver are considered +.Sy Volatile , +subject to change without notice, and should not be used directly. +Raw temperature information can be dumped through the FMA developer +utility fmtopo. +.Sh SEE ALSO +.Xr fmadm 1M +.Rs +.%A Intel Corporation +.%B Intel 64 and IA-32 Architectures Software Developer's Manual +.%V Volume 3 (3A, 3B, 3C & 3D): System Programming Guide +.Re diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 5a6d7a204c..909160f2db 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -21,9 +21,8 @@ # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2019, Joyent, Inc. # Copyright 2013 Garrett D'Amore <garrett@damore.org> -# Copyright 2015, Joyent, Inc. All rights reserved. # Copyright 2013 Saso Kiselkov. All rights reserved. # Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com> # Copyright 2017 Nexenta Systems, Inc. 
@@ -518,6 +517,7 @@ CHKHDRS= \ sema_impl.h \ semaphore.h \ sendfile.h \ + sensors.h \ ser_sync.h \ session.h \ sha1.h \ diff --git a/usr/src/uts/common/sys/sensors.h b/usr/src/uts/common/sys/sensors.h new file mode 100644 index 0000000000..b9ca9f1f3f --- /dev/null +++ b/usr/src/uts/common/sys/sensors.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +#ifndef _SYS_SENSORS_H +#define _SYS_SENSORS_H + +/* + * Consolidated sensor ioctls for various parts of the operating system. These + * interfaces should not be relied on at all. They are evolving and will change + * as we add more to the system for this. This may eventually become a larger + * framework, though it's more likely we'll consolidate that in userland. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * List of different possible kinds of sensors. + */ +#define SENSOR_KIND_UNKNOWN 0x00 +#define SENSOR_KIND_TEMPERATURE 0x01 + +/* + * Lists of units that sensors may have. + */ +#define SENSOR_UNIT_UNKNOWN 0x00 +#define SENSOR_UNIT_CELSIUS 0x01 +#define SENSOR_UNIT_FAHRENHEIT 0x02 +#define SENSOR_UNIT_KELVIN 0x03 + +#define SENSOR_IOCTL (('s' << 24) | ('e' << 16) | ('n' << 8)) + +/* + * Ask the sensor what kind of sensor it is. + */ +#define SENSOR_IOCTL_TYPE (SENSOR_IOCTL | 0x01) + +typedef struct sensor_ioctl_kind { + uint64_t sik_kind; +} sensor_ioctl_kind_t; + +/* + * Ask the sensor for a temperature measurement. The sensor is responsible for + * returning the units it's in. A temperature measurement is broken down into a + * signed value and a notion of its granularity. 
The sit_gran member indicates + * the granularity: the number of increments per degree in the temperature + * measurement (the sit_temp member). sit_gran is signed and the sign indicates + * whether one needs to multiply or divide the granularity. For example, a + * value that set sit_gran to 10 would mean that the value in sit_temp was in + * 10ths of a degree and that to get the actual value in degrees, one would + * divide by 10. On the other hand, a negative value means that we effectively + * have to multiply to get there. For example, a value of -2 would indicate that + * each value in sit_temp indicated two degrees and to get the temperature in + * degrees you would multiply sit_temp by two. + */ +#define SENSOR_IOCTL_TEMPERATURE (SENSOR_IOCTL | 0x02) + +typedef struct sensor_ioctl_temperature { + uint32_t sit_unit; + int32_t sit_gran; + int64_t sit_temp; +} sensor_ioctl_temperature_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SENSORS_H */ diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index b260971a89..3026dc961a 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. 
*/ #ifndef _SYS_SUNDDI_H @@ -202,13 +203,13 @@ extern "C" { #define DDI_NT_KEYBOARD "ddi_keyboard" /* keyboard device */ -#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */ +#define DDI_NT_PARALLEL "ddi_parallel" /* parallel port */ #define DDI_NT_PRINTER "ddi_printer" /* printer device */ #define DDI_NT_UGEN "ddi_generic:usb" /* USB generic drv */ -#define DDI_NT_SMP "ddi_sas_smp" /* smp devcies */ +#define DDI_NT_SMP "ddi_sas_smp" /* smp devcies */ #define DDI_NT_NEXUS "ddi_ctl:devctl" /* nexus drivers */ @@ -260,6 +261,11 @@ extern "C" { #define DDI_NT_INTRCTL "ddi_tool_intr" /* tool intr access */ /* + * Various device types used for sensors. + */ +#define DDI_NT_SENSOR_TEMP_CPU "ddi_sensor:temperature:cpu" + +/* * DDI event definitions */ #define EC_DEVFS "EC_devfs" /* Event class devfs */ @@ -839,7 +845,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, * allocated by property provider via kmem_alloc. Requester * is responsible for freeing returned property via kmem_free. * - * Arguments: + * Arguments: * * dev: Input: dev_t of property. * dip: Input: dev_info_t pointer of child. @@ -850,7 +856,7 @@ ddi_prop_op_nblocks_blksize(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, * valuep: Output: Addr of callers buffer pointer. * lengthp:Output: *lengthp will contain prop length on exit. * - * Possible Returns: + * Possible Returns: * * DDI_PROP_SUCCESS: Prop found and returned. 
* DDI_PROP_NOT_FOUND: Prop not found diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 3b0133ce07..1bf09bcf08 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -1037,7 +1037,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = { "fma4", "tbm", "avx512_vnni", - "amd_pcec" + "amd_pcec", + "core_thermal", + "pkg_thermal" }; boolean_t @@ -2392,6 +2394,41 @@ cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) } } +/* + * Gather relevant CPU features from leaf 6 which covers thermal information. We + * always gather leaf 6 if it's supported; however, we only look for features on + * Intel systems as AMD does not currently define any of the features we look + * for below. + */ +static void +cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset) +{ + struct cpuid_regs *cp; + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; + + if (cpi->cpi_maxeax < 6) { + return; + } + + cp = &cpi->cpi_std[6]; + cp->cp_eax = 6; + cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0; + (void) __cpuid_insn(cp); + platform_cpuid_mangle(cpi->cpi_vendor, 6, cp); + + if (cpi->cpi_vendor != X86_VENDOR_Intel) { + return; + } + + if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) { + add_x86_feature(featureset, X86FSET_CORE_THERMAL); + } + + if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) { + add_x86_feature(featureset, X86FSET_PKG_THERMAL); + } +} + void cpuid_pass1(cpu_t *cpu, uchar_t *featureset) { @@ -3230,6 +3267,7 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) } cpuid_pass1_topology(cpu, featureset); + cpuid_pass1_thermal(cpu, featureset); /* * Synthesize chip "revision" and socket type @@ -3293,9 +3331,9 @@ cpuid_pass2(cpu_t *cpu) cp->cp_eax = n; /* - * n == 7 was handled in pass 1 + * leaves 6 and 7 were handled in pass 1 */ - if (n == 7) + if (n == 6 || n == 7) continue; /* @@ -6443,7 +6481,7 @@ cpuid_arat_supported(void) if (cpi->cpi_maxeax >= 6) { regs.cp_eax = 6; (void) cpuid_insn(NULL, &regs); - return (regs.cp_eax & CPUID_CSTATE_ARAT); + return 
(regs.cp_eax & CPUID_INTC_EAX_ARAT); } else { return (0); } @@ -6477,7 +6515,7 @@ cpuid_iepb_supported(struct cpu *cp) regs.cp_eax = 0x6; (void) cpuid_insn(NULL, &regs); - return (regs.cp_ecx & CPUID_EPB_SUPPORT); + return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS); } /* diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index bfcfe1dc52..b2ad69e8c1 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -21,7 +21,7 @@ # # Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright 2018, Joyent, Inc. +# Copyright 2019, Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # @@ -101,7 +101,7 @@ GENUNIX_OBJS += \ # CORE_OBJS += \ prmachdep.o - + LX_CGROUP_OBJS += \ cgrps_node.o \ cgrps_vfsops.o \ @@ -423,3 +423,13 @@ VMXNET3S_OBJS = vmxnet3_main.o \ # VMware PVSCSI SCSI Controller # PVSCSI_OBJS = pvscsi.o + +# +# Intel Temperature Module +# +CORETEMP_OBJS = coretemp.o + +# +# AMD Family 17 northbridge driver +# +AMDF17NBDF_OBJS = amdf17nbdf.o diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index 00785ef1be..e23797aeac 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -21,7 +21,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. -# Copyright 2016 Joyent, Inc. +# Copyright 2019, Joyent, Inc. # Copyright 2016 Garrett D'Amore <garrett@damore.org> # Copyright 2018 Nexenta Systems, Inc. 
# @@ -214,7 +214,7 @@ DRV_KMODS += audiopci DRV_KMODS += audiosolo DRV_KMODS += audiots DRV_KMODS += audiovia823x -DRV_KMODS += bl +DRV_KMODS += bl DRV_KMODS += blkdev DRV_KMODS += bge DRV_KMODS += bofi @@ -360,7 +360,7 @@ DRV_KMODS += ural DRV_KMODS += uath DRV_KMODS += urtw DRV_KMODS += vgatext -DRV_KMODS += vmxnet +DRV_KMODS += vmxnet DRV_KMODS += vnd DRV_KMODS += vnic DRV_KMODS += vscan @@ -506,9 +506,9 @@ DRV_KMODS += xhci # DRV_KMODS += usbgem DRV_KMODS += axf -DRV_KMODS += udmf +DRV_KMODS += udmf DRV_KMODS += upf -DRV_KMODS += urf +DRV_KMODS += urf # # 1394 modules @@ -522,7 +522,7 @@ DRV_KMODS += dcam1394 # InfiniBand pseudo drivers # DRV_KMODS += ib ibp eibnx eoib rdsib sdp iser daplt hermon tavor sol_ucma sol_uverbs -DRV_KMODS += sol_umad +DRV_KMODS += sol_umad # # Brand modules @@ -753,3 +753,9 @@ DACF_KMODS += net_dacf # global cross check. # LINTFLAGS += -D_MACHDEP -I$(UTSBASE)/i86pc + +# +# Sensor related drivers +# +DRV_KMODS += amdf17nbdf +DRV_KMODS += coretemp diff --git a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules index 998fb97496..723cd2fd84 100644 --- a/usr/src/uts/intel/Makefile.rules +++ b/usr/src/uts/intel/Makefile.rules @@ -21,7 +21,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# Copyright 2019 Joyent, Inc. All rights reserved. +# Copyright 2019, Joyent, Inc. # Copyright 2017 Nexenta Systems, Inc. 
# @@ -154,10 +154,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/amd8111s/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/amdf17nbdf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/amr/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/coretemp/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/amdf17nbdf/Makefile b/usr/src/uts/intel/amdf17nbdf/Makefile new file mode 100644 index 0000000000..a5543f176f --- /dev/null +++ b/usr/src/uts/intel/amdf17nbdf/Makefile @@ -0,0 +1,47 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, Joyent, Inc. +# + +UTSBASE = ../.. 
+ +MODULE = amdf17nbdf +OBJECTS = $(AMDF17NBDF_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/amdf17nb + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(CONFMOD) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/coretemp/Makefile b/usr/src/uts/intel/coretemp/Makefile new file mode 100644 index 0000000000..9ce4a8ab56 --- /dev/null +++ b/usr/src/uts/intel/coretemp/Makefile @@ -0,0 +1,54 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, Joyent, Inc. +# + +UTSBASE = ../.. + +MODULE = coretemp +OBJECTS = $(CORETEMP_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/coretemp + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(CONFMOD) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Because we need to use cross calls directly, we must include the +# definitions below. Once CMI rdmsr routines have been fixed, we can +# remove this and move out of the platform specific driver world. 
+# +CPPFLAGS += -I$(UTSBASE)/i86pc/ + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/io/amdf17nbdf/amdf17nbdf.c b/usr/src/uts/intel/io/amdf17nbdf/amdf17nbdf.c new file mode 100644 index 0000000000..11bddfa515 --- /dev/null +++ b/usr/src/uts/intel/io/amdf17nbdf/amdf17nbdf.c @@ -0,0 +1,1015 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* + * AMD Family 17 Northbridge and Data Fabric Driver + * + * This driver attaches to the AMD Family 17h northbridge and data fabric bus. + * Each Zeppelin die ('processor node' in cpuid.c parlance) has its own + * northbridge and access to the data fabric bus. The northbridge and data + * fabric both provide access to various features such as: + * + * - The System Management Network (SMN) + * - Data Fabric via Fabric Indirect Config Access (FICAA) + * + * These are required to access things such as temperature sensors or memory + * controller configuration registers. + * + * In AMD Family 17h systems, the 'northbridge' is an ASIC that is part of the + * package that contains many I/O capabilities related to things like PCI + * express, etc. The 'data fabric' is the means by which different components + * both inside the socket and multiple sockets are connected together. 
Both the + * northbridge and the data fabric have dedicated PCI devices which the + * operating system can use to interact with them. + * + * ------------------------ + * Mapping Devices Together + * ------------------------ + * + * The operating system needs to expose things like temperature sensors and DRAM + * configuration registers in terms that are meaningful to the system such as + * logical CPUs, cores, etc. This driver attaches to the PCI IDs that represent + * the northbridge and data fabric; however, there are multiple PCI devices (one + * per die) that exist. This driver does manage to map all of these three things + * together; however, it requires some acrobatics. Unfortunately, there's no + * direct way to map a northbridge to its corresponding die. However, we can map + * a CPU die to a data fabric PCI device and a data fabric PCI device to a + * corresponding northbridge PCI device. + * + * In current Zen based products, there is a direct mapping between processor + * nodes and a data fabric PCI device. All of the devices are on PCI Bus 0 and + * start from Device 0x18. Device 0x18 maps to processor node 0, 0x19 to + * processor node 1, etc. This means that to map a logical CPU to a data fabric + * device, we take its processor node id, add it to 0x18 and find the PCI device + * that is on bus 0 at the resulting device number. As each data fabric device + * is attached based on its PCI ID, we add it to the global list, amd_nbdf_dfs, + * that is in the amdf17nbdf_t structure. + * + * The northbridge PCI device has a defined device and function, but the PCI bus + * that it's on can vary. Each die has its own series of PCI buses that are + * assigned to it and the northbridge PCI device is on the first of the + * die-specific PCI buses for each die. This also means that the northbridge + * will not show up + * on PCI bus 0, which is the PCI bus that all of the data fabric devices are + * on. 
While conventionally the northbridge with the lowest PCI bus value + * would correspond to processor node zero, hardware does not guarantee that at + * all. Because we don't want to be at the mercy of firmware, we don't rely on + * this ordering, even though we have yet to find a system that deviates from + * this scheme. + * + * One of the registers in the data fabric device's function 0 + * (AMDF17_DF_CFG_ADDR_CTL), happens to have the first PCI bus that is + * associated with the processor node. This means, that we can map a data fabric + * device to a northbridge by finding the northbridge whose PCI bus matches the + * value in the corresponding data fabric's AMDF17_DF_CFG_ADDR_CTL. + * + * This means that we can map a northbridge to a data fabric device and a data + * fabric device to a die. Because these are 1:1 mappings, there is a transitive + * relationship and therefore we know which northbridge is associated with which + * processor die. This is summarized in the following image: + * + * +-------+ +----------------------------+ +--------------+ + * | Die 0 | ---> | Data Fabric PCI BDF 0/18/0 |-------> | Northbridge | + * +-------+ | AMDF17_DF_CFG_ADDR: bus 10 | | PCI 10/0/0 | + * ... +----------------------------+ +--------------+ + * +-------+ +------------------------------+ +--------------+ + * | Die n | ---> | Data Fabric PCI BDF 0/18+n/0 |-------> | Northbridge | + * +-------+ | AMDF17_DF_CFG_ADDR: bus 133 | | PCI 133/0/0 | + * +------------------------------+ +--------------+ + * + * Note, the PCI buses used by the northbridges here are arbitrary. They do not + * reflect the actual values by hardware; however, the bus/device/function (BDF) + * of the data fabric accurately models hardware. All of the BDF values are in + * hex. 
+ * + * ------------------------------- + * Attach and Detach Complications + * ------------------------------- + * + * Because we need to map different PCI devices together, this means that we + * have multiple dev_info_t structures that we need to manage. Each of these is + * independently attached and detached. While this is easily managed for attach, + * it is not for detach. + * + * Once a device has been detached it will only come back if we have an active + * minor node that will be accessed. While we have minor nodes associated with + * the northbridges, we don't with the data fabric devices. This means that if + * they are detached, nothing would ever cause them to be reattached. The system + * also doesn't provide us a way or any guarantees around making sure that we're + * attached to all such devices before we detach. As a result, unfortunately, + * it's easier to basically have detach always fail. + * + * To help with both development and issues that arise in the field, there is a + * knob, amdf17nbdf_allow_detach, which if set to a non-zero value, will allow + * instances to detach. + * + * --------------- + * Exposed Devices + * --------------- + * + * Currently we expose a single set of character devices which represent + * temperature sensors for this family of processors. Because temperature + * sensors exist on a per-processor node basis, we create a single minor node + * for each one. Because our naming matches the cpuid naming, FMA can match that + * up to logical CPUs and take care of matching the sensors appropriately. We + * internally rate limit the sensor updates to 100ms, which is controlled by the + * global amdf17nbdf_cache_ms. 
+ */ + +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/types.h> +#include <sys/file.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/list.h> +#include <sys/pci.h> +#include <sys/stddef.h> +#include <sys/stat.h> +#include <sys/x86_archext.h> +#include <sys/cpuvar.h> +#include <sys/sensors.h> + +/* + * The range of minors that we'll allow. + */ +#define AMDF17_MINOR_LOW 1 +#define AMDF17_MINOR_HIGH INT32_MAX + +/* + * This is the value of the first PCI data fabric device that globally exists. + * It always maps to AMD's first nodeid (what we call cpi_procnodeid). + */ +#define AMDF17_DF_FIRST_DEVICE 0x18 + +/* + * The data fabric devices are defined to always be on PCI bus zero. + */ +#define AMDF17_DF_BUSNO 0x00 + +/* + * This register contains the BUS A of the processor node that corresponds + * to the data fabric device. + */ +#define AMDF17_DF_CFG_ADDR_CTL 0x84 +#define AMDF17_DF_CFG_ADDR_CTL_MASK 0xff + +/* + * Northbridge registers that are related to accessing the SMN. One writes to + * the SMN address register and then can read from the SMN data register. + */ +#define AMDF17_NB_SMN_ADDR 0x60 +#define AMDF17_NB_SMN_DATA 0x64 + +/* + * The following are register offsets and the meaning of their bits related to + * temperature. These addresses are addresses in the System Management Network + * which is accessed through the northbridge. They are not addresses in PCI + * configuration space. + */ +#define AMDF17_SMU_THERMAL_CURTEMP 0x00059800 +#define AMDF17_SMU_THERMAL_CURTEMP_TEMPERATURE(x) ((x) >> 21) +#define AMDF17_SMU_THERMAL_CURTEMP_RANGE_SEL (1 << 19) + +#define AMDF17_SMU_THERMAL_CURTEMP_RANGE_ADJ (-49) +#define AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS 3 +#define AMDF17_SMU_THERMAL_CURTEMP_BITS_MASK 0x7 + +/* + * The temperature sensor in family 17 is measured in terms of 0.125 C steps. 
+ */ +#define AMDF17_THERMAL_GRANULARITY 8 + +struct amdf17nb; +struct amdf17df; + +typedef struct amdf17nb { + list_node_t amd_nb_link; + dev_info_t *amd_nb_dip; + ddi_acc_handle_t amd_nb_cfgspace; + uint_t amd_nb_bus; + uint_t amd_nb_dev; + uint_t amd_nb_func; + struct amdf17df *amd_nb_df; + uint_t amd_nb_procnodeid; + id_t amd_nb_temp_minor; + hrtime_t amd_nb_temp_last_read; + int amd_nb_temp_off; + uint32_t amd_nb_temp_reg; + /* Values derived from the above */ + int64_t amd_nb_temp; +} amdf17nb_t; + +typedef struct amdf17df { + list_node_t amd_df_link; + dev_info_t *amd_df_f0_dip; + ddi_acc_handle_t amd_df_f0_cfgspace; + uint_t amd_df_procnodeid; + uint_t amd_df_iobus; + amdf17nb_t *amd_df_nb; +} amdf17df_t; + +typedef struct amdf17nbdf { + kmutex_t amd_nbdf_lock; + id_space_t *amd_nbdf_minors; + list_t amd_nbdf_nbs; + list_t amd_nbdf_dfs; +} amdf17nbdf_t; + +typedef enum { + AMD_NBDF_TYPE_UNKNOWN, + AMD_NBDF_TYPE_NORTHBRIDGE, + AMD_NBDF_TYPE_DATA_FABRIC +} amdf17nbdf_type_t; + +typedef struct { + uint16_t amd_nbdft_pci_did; + amdf17nbdf_type_t amd_nbdft_type; +} amdf17nbdf_table_t; + +static const amdf17nbdf_table_t amdf17nbdf_dev_map[] = { + /* Family 17h Ryzen, Epyc Models 00h-0fh (Zen uarch) */ + { 0x1450, AMD_NBDF_TYPE_NORTHBRIDGE }, + { 0x1460, AMD_NBDF_TYPE_DATA_FABRIC }, + { PCI_EINVAL16 } +}; + +typedef struct { + const char *amd_nbdfo_brand; + uint_t amd_nbdfo_family; + int amd_nbdfo_off; +} amdf17nbdf_offset_t; + +/* + * AMD processors report a control temperature (called Tctl) which may be + * different from the junction temperature, which is the value that is actually + * measured from the die (sometimes called Tdie or Tjct). This is done so that + * socket-based environmental monitoring can be consistent from a platform + * perspective, but doesn't help us. 
Unfortunately, these values aren't in + * datasheets that we can find, but have been documented partially in a series + * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software + * for Windows. + * + * The brand strings below may contain partial matches such as in the + * Threadripper + * cases so we can match the entire family of processors. The offset value is + * the quantity in degrees that we should adjust Tctl to reach Tdie. + */ +static const amdf17nbdf_offset_t amdf17nbdf_offsets[] = { + { "AMD Ryzen 5 1600X", 0x17, -20 }, + { "AMD Ryzen 7 1700X", 0x17, -20 }, + { "AMD Ryzen 7 1800X", 0x17, -20 }, + { "AMD Ryzen 7 2700X", 0x17, -10 }, + { "AMD Ryzen Threadripper 19", 0x17, -27 }, + { "AMD Ryzen Threadripper 29", 0x17, -27 }, + { NULL } +}; + +/* + * This indicates a number of milliseconds that we should wait between reads. + * This is somewhat arbitrary, but the goal is to reduce cross call activity + * and reflect that the sensor may not update all the time. + */ +uint_t amdf17nbdf_cache_ms = 100; + +/* + * This indicates whether detach is allowed. It is not by default. See the + * theory statement section 'Attach and Detach Complications' for more + * information. + */ +uint_t amdf17nbdf_allow_detach = 0; + +/* + * Global data that we keep regarding the device. 
+ */ +amdf17nbdf_t *amdf17nbdf; + +static amdf17nb_t * +amdf17nbdf_lookup_nb(amdf17nbdf_t *nbdf, minor_t minor) +{ + ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock)); + + if (minor < AMDF17_MINOR_LOW || minor > AMDF17_MINOR_HIGH) { + return (NULL); + } + + for (amdf17nb_t *nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL; + nb = list_next(&nbdf->amd_nbdf_nbs, nb)) { + if ((id_t)minor == nb->amd_nb_temp_minor) { + return (nb); + } + } + + return (NULL); +} + +static void +amdf17nbdf_cleanup_nb(amdf17nbdf_t *nbdf, amdf17nb_t *nb) +{ + if (nb == NULL) + return; + + ddi_remove_minor_node(nb->amd_nb_dip, NULL); + if (nb->amd_nb_temp_minor > 0) { + id_free(nbdf->amd_nbdf_minors, nb->amd_nb_temp_minor); + } + if (nb->amd_nb_cfgspace != NULL) { + pci_config_teardown(&nb->amd_nb_cfgspace); + } + kmem_free(nb, sizeof (amdf17nb_t)); +} + +static void +amdf17nbdf_cleanup_df(amdf17df_t *df) +{ + if (df == NULL) + return; + + if (df->amd_df_f0_cfgspace != NULL) { + pci_config_teardown(&df->amd_df_f0_cfgspace); + } + kmem_free(df, sizeof (amdf17df_t)); +} + +static int +amdf17nbdf_smn_read(amdf17nbdf_t *nbdf, amdf17nb_t *nb, uint32_t addr, + uint32_t *valp) +{ + VERIFY(MUTEX_HELD(&nbdf->amd_nbdf_lock)); + + pci_config_put32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_ADDR, addr); + *valp = pci_config_get32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_DATA); + + return (0); +} + +static int +amdf17nbdf_temp_read(amdf17nbdf_t *nbdf, amdf17nb_t *nb) +{ + int ret; + uint32_t reg, rawtemp, decimal; + + ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock)); + + /* + * Update the last read time first. Even if this fails, we want to make + * sure that we latch the fact that we tried. + */ + nb->amd_nb_temp_last_read = gethrtime(); + if ((ret = amdf17nbdf_smn_read(nbdf, nb, AMDF17_SMU_THERMAL_CURTEMP, + &reg)) != 0) { + return (ret); + } + + nb->amd_nb_temp_reg = reg; + + /* + * Take the primary temperature value and break apart its decimal value + * from its main value. 
+ */ + rawtemp = AMDF17_SMU_THERMAL_CURTEMP_TEMPERATURE(reg); + decimal = rawtemp & AMDF17_SMU_THERMAL_CURTEMP_BITS_MASK; + rawtemp = rawtemp >> AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS; + + if ((reg & AMDF17_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) { + rawtemp += AMDF17_SMU_THERMAL_CURTEMP_RANGE_ADJ; + } + rawtemp += nb->amd_nb_temp_off; + nb->amd_nb_temp = rawtemp << AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS; + nb->amd_nb_temp += decimal; + + return (0); +} + +static int +amdf17nbdf_temp_init(amdf17nbdf_t *nbdf, amdf17nb_t *nb) +{ + uint_t i, family; + char buf[256]; + + if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) { + dev_err(nb->amd_nb_dip, CE_WARN, "!failed to read processor " + "brand string, brand larger than internal buffer"); + return (EOVERFLOW); + } + + family = cpuid_getfamily(CPU); + + for (i = 0; amdf17nbdf_offsets[i].amd_nbdfo_brand != NULL; i++) { + if (family != amdf17nbdf_offsets[i].amd_nbdfo_family) + continue; + if (strncmp(buf, amdf17nbdf_offsets[i].amd_nbdfo_brand, + strlen(amdf17nbdf_offsets[i].amd_nbdfo_brand)) == 0) { + nb->amd_nb_temp_off = + amdf17nbdf_offsets[i].amd_nbdfo_off; + break; + } + } + + return (amdf17nbdf_temp_read(nbdf, nb)); +} + +static amdf17nbdf_type_t +amdf17nbdf_dip_type(uint16_t dev) +{ + uint_t i; + const amdf17nbdf_table_t *tp = amdf17nbdf_dev_map; + + for (i = 0; tp[i].amd_nbdft_pci_did != PCI_EINVAL16; i++) { + if (tp[i].amd_nbdft_pci_did == dev) { + return (tp[i].amd_nbdft_type); + } + } + + return (AMD_NBDF_TYPE_UNKNOWN); +} + +static boolean_t +amdf17nbdf_map(amdf17nbdf_t *nbdf, amdf17nb_t *nb, amdf17df_t *df) +{ + int ret; + char buf[128]; + + ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock)); + + /* + * This means that we encountered a duplicate. We're going to stop + * processing, but we're not going to fail its attach at this point. 
+ */ + if (nb->amd_nb_df != NULL) { + dev_err(nb->amd_nb_dip, CE_WARN, "!trying to map NB %u/%u/%u " + "to DF procnode %u, but NB is already mapped to DF " + "procnode %u!", + nb->amd_nb_bus, nb->amd_nb_dev, nb->amd_nb_func, + df->amd_df_procnodeid, nb->amd_nb_df->amd_df_procnodeid); + return (B_TRUE); + } + + /* + * Now that we have found a mapping, initialize our temperature + * information and create the minor node. + */ + nb->amd_nb_procnodeid = df->amd_df_procnodeid; + nb->amd_nb_temp_minor = id_alloc(nbdf->amd_nbdf_minors); + + if ((ret = amdf17nbdf_temp_init(nbdf, nb)) != 0) { + dev_err(nb->amd_nb_dip, CE_WARN, "!failed to init SMN " + "temperature data on node %u: %d", nb->amd_nb_procnodeid, + ret); + return (B_FALSE); + } + + if (snprintf(buf, sizeof (buf), "procnode.%u", nb->amd_nb_procnodeid) >= + sizeof (buf)) { + dev_err(nb->amd_nb_dip, CE_WARN, "!unexpected buffer name " + "overrun assembling temperature minor %u", + nb->amd_nb_procnodeid); + return (B_FALSE); + } + + if (ddi_create_minor_node(nb->amd_nb_dip, buf, S_IFCHR, + nb->amd_nb_temp_minor, DDI_NT_SENSOR_TEMP_CPU, 0) != DDI_SUCCESS) { + dev_err(nb->amd_nb_dip, CE_WARN, "!failed to create minor node " + "%s", buf); + return (B_FALSE); + } + + /* + * Now that it's all done, note that they're mapped to each other. 
+ */ + nb->amd_nb_df = df; + df->amd_df_nb = nb; + + return (B_TRUE); +} + +static boolean_t +amdf17nbdf_add_nb(amdf17nbdf_t *nbdf, amdf17nb_t *nb) +{ + amdf17df_t *df; + boolean_t ret = B_TRUE; + + mutex_enter(&nbdf->amd_nbdf_lock); + list_insert_tail(&nbdf->amd_nbdf_nbs, nb); + for (df = list_head(&nbdf->amd_nbdf_dfs); df != NULL; + df = list_next(&nbdf->amd_nbdf_dfs, df)) { + if (nb->amd_nb_bus == df->amd_df_iobus) { + ret = amdf17nbdf_map(nbdf, nb, df); + break; + } + } + mutex_exit(&nbdf->amd_nbdf_lock); + + return (ret); +} + +static boolean_t +amdf17nbdf_add_df(amdf17nbdf_t *nbdf, amdf17df_t *df) +{ + amdf17nb_t *nb; + boolean_t ret = B_TRUE; + + mutex_enter(&nbdf->amd_nbdf_lock); + list_insert_tail(&nbdf->amd_nbdf_dfs, df); + for (nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL; + nb = list_next(&nbdf->amd_nbdf_nbs, nb)) { + if (nb->amd_nb_bus == df->amd_df_iobus) { + ret = amdf17nbdf_map(nbdf, nb, df); + } + } + mutex_exit(&nbdf->amd_nbdf_lock); + + return (ret); +} + +static boolean_t +amdf17nbdf_attach_nb(amdf17nbdf_t *nbdf, dev_info_t *dip, ddi_acc_handle_t hdl, + uint_t bus, uint_t dev, uint_t func) +{ + amdf17nb_t *nb; + + nb = kmem_zalloc(sizeof (amdf17nb_t), KM_SLEEP); + nb->amd_nb_dip = dip; + nb->amd_nb_cfgspace = hdl; + nb->amd_nb_bus = bus; + nb->amd_nb_dev = dev; + nb->amd_nb_func = func; + /* + * Set this to a value we won't get from the processor. 
+ */ + nb->amd_nb_procnodeid = UINT_MAX; + + if (!amdf17nbdf_add_nb(nbdf, nb)) { + amdf17nbdf_cleanup_nb(nbdf, nb); + return (B_FALSE); + } + + return (B_TRUE); +} + +static boolean_t +amdf17nbdf_attach_df(amdf17nbdf_t *nbdf, dev_info_t *dip, ddi_acc_handle_t hdl, + uint_t bus, uint_t dev, uint_t func) +{ + amdf17df_t *df; + + if (bus != AMDF17_DF_BUSNO) { + dev_err(dip, CE_WARN, "!encountered data fabric device with " + "unexpected PCI bus assignment, found 0x%x, expected 0x%x", + bus, AMDF17_DF_BUSNO); + return (B_FALSE); + } + + if (dev < AMDF17_DF_FIRST_DEVICE) { + dev_err(dip, CE_WARN, "!encountered data fabric device with " + "PCI device assignment below the first minimum device " + "(0x%x): 0x%x", AMDF17_DF_FIRST_DEVICE, dev); + return (B_FALSE); + } + + /* + * At the moment we only care about function 0. However, we may care + * about Function 4 in the future which has access to the FICAA. + * However, only function zero should ever be attached, so this is just + * an extra precaution. 
+ */ + if (func != 0) { + dev_err(dip, CE_WARN, "!encountered data fabric device with " + "unxpected PCI function assignment, found 0x%x, expected " + "0x0", func); + return (B_FALSE); + } + + df = kmem_zalloc(sizeof (amdf17df_t), KM_SLEEP); + df->amd_df_f0_dip = dip; + df->amd_df_f0_cfgspace = hdl; + df->amd_df_procnodeid = dev - AMDF17_DF_FIRST_DEVICE; + df->amd_df_iobus = pci_config_get32(hdl, AMDF17_DF_CFG_ADDR_CTL) & + AMDF17_DF_CFG_ADDR_CTL_MASK; + + if (!amdf17nbdf_add_df(nbdf, df)) { + amdf17nbdf_cleanup_df(df); + return (B_FALSE); + } + + return (B_TRUE); +} + +static int +amdf17nbdf_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + amdf17nbdf_t *nbdf = amdf17nbdf; + minor_t m; + + if (crgetzoneid(credp) != GLOBAL_ZONEID || drv_priv(credp)) { + return (EPERM); + } + + if ((flags & (FEXCL | FNDELAY | FWRITE)) != 0) { + return (EINVAL); + } + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + m = getminor(*devp); + + /* + * Sanity check the minor + */ + mutex_enter(&nbdf->amd_nbdf_lock); + if (amdf17nbdf_lookup_nb(nbdf, m) == NULL) { + mutex_exit(&nbdf->amd_nbdf_lock); + return (ENXIO); + } + mutex_exit(&nbdf->amd_nbdf_lock); + + return (0); +} + +static int +amdf17nbdf_ioctl_kind(intptr_t arg, int mode) +{ + sensor_ioctl_kind_t kind; + + bzero(&kind, sizeof (sensor_ioctl_kind_t)); + kind.sik_kind = SENSOR_KIND_TEMPERATURE; + + if (ddi_copyout((void *)&kind, (void *)arg, + sizeof (sensor_ioctl_kind_t), mode & FKIOCTL) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +amdf17nbdf_ioctl_temp(amdf17nbdf_t *nbdf, minor_t minor, intptr_t arg, int mode) +{ + amdf17nb_t *nb; + hrtime_t diff; + sensor_ioctl_temperature_t temp; + + bzero(&temp, sizeof (temp)); + + mutex_enter(&nbdf->amd_nbdf_lock); + nb = amdf17nbdf_lookup_nb(nbdf, minor); + if (nb == NULL) { + mutex_exit(&nbdf->amd_nbdf_lock); + return (ENXIO); + } + + diff = NSEC2MSEC(gethrtime() - nb->amd_nb_temp_last_read); + if (diff > 0 && diff > (hrtime_t)amdf17nbdf_cache_ms) { + int 
ret; + + ret = amdf17nbdf_temp_read(nbdf, nb); + if (ret != 0) { + mutex_exit(&nbdf->amd_nbdf_lock); + return (ret); + } + } + + temp.sit_unit = SENSOR_UNIT_CELSIUS; + temp.sit_temp = nb->amd_nb_temp; + temp.sit_gran = AMDF17_THERMAL_GRANULARITY; + mutex_exit(&nbdf->amd_nbdf_lock); + + if (ddi_copyout(&temp, (void *)arg, sizeof (temp), + mode & FKIOCTL) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +amdf17nbdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + minor_t m; + amdf17nbdf_t *nbdf = amdf17nbdf; + + if ((mode & FREAD) == 0) { + return (EINVAL); + } + + m = getminor(dev); + + switch (cmd) { + case SENSOR_IOCTL_TYPE: + return (amdf17nbdf_ioctl_kind(arg, mode)); + case SENSOR_IOCTL_TEMPERATURE: + return (amdf17nbdf_ioctl_temp(nbdf, m, arg, mode)); + default: + return (ENOTTY); + } +} + +/* + * We don't really do any state tracking on close, so for now, just allow it to + * always succeed. + */ +static int +amdf17nbdf_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + return (0); +} + +static int +amdf17nbdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + uint_t nregs; + int *regs; + uint_t bus, dev, func; + uint16_t pci_did; + ddi_acc_handle_t pci_hdl; + amdf17nbdf_type_t type; + amdf17nbdf_t *nbdf = amdf17nbdf; + + if (cmd == DDI_RESUME) + return (DDI_SUCCESS); + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, "reg", + &regs, &nregs) != DDI_PROP_SUCCESS) { + dev_err(dip, CE_WARN, "!failed to find pci 'reg' property"); + return (DDI_FAILURE); + } + + if (nregs < 1) { + ddi_prop_free(regs); + return (DDI_FAILURE); + } + + bus = PCI_REG_BUS_G(regs[0]); + dev = PCI_REG_DEV_G(regs[0]); + func = PCI_REG_FUNC_G(regs[0]); + + ddi_prop_free(regs); + + if (pci_config_setup(dip, &pci_hdl) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "!failed to map pci devices"); + return (DDI_FAILURE); + } + + pci_did = pci_config_get16(pci_hdl, PCI_CONF_DEVID); + + type = 
amdf17nbdf_dip_type(pci_did); + switch (type) { + case AMD_NBDF_TYPE_NORTHBRIDGE: + if (!amdf17nbdf_attach_nb(nbdf, dip, pci_hdl, bus, dev, func)) { + return (DDI_FAILURE); + } + break; + case AMD_NBDF_TYPE_DATA_FABRIC: + if (!amdf17nbdf_attach_df(nbdf, dip, pci_hdl, bus, dev, func)) { + return (DDI_FAILURE); + } + break; + default: + pci_config_teardown(&pci_hdl); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * Unfortunately, it's hard for us to really support detach here. The problem is + * that we need both the data fabric devices and the northbridges to make sure + * that we map everything. However, only the northbridges actually create minor + * nodes that'll be opened and thus trigger them to reattach when accessed. What + * we should probably look at doing in the future is making this into a nexus + * driver that enumerates children like a temperature driver. + */ +static int +amdf17nbdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + amdf17nbdf_t *nbdf = amdf17nbdf; + + if (cmd == DDI_SUSPEND) + return (DDI_SUCCESS); + + if (nbdf == NULL) { + return (DDI_FAILURE); + } + + if (amdf17nbdf_allow_detach == 0) { + return (DDI_FAILURE); + } + + mutex_enter(&nbdf->amd_nbdf_lock); + for (amdf17nb_t *nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL; + nb = list_next(&nbdf->amd_nbdf_nbs, nb)) { + if (dip == nb->amd_nb_dip) { + list_remove(&nbdf->amd_nbdf_nbs, nb); + if (nb->amd_nb_df != NULL) { + ASSERT3P(nb->amd_nb_df->amd_df_nb, ==, nb); + nb->amd_nb_df->amd_df_nb = NULL; + } + amdf17nbdf_cleanup_nb(nbdf, nb); + mutex_exit(&nbdf->amd_nbdf_lock); + return (DDI_SUCCESS); + } + } + + for (amdf17df_t *df = list_head(&nbdf->amd_nbdf_dfs); df != NULL; + df = list_next(&nbdf->amd_nbdf_nbs, df)) { + if (dip == df->amd_df_f0_dip) { + list_remove(&nbdf->amd_nbdf_dfs, df); + if (df->amd_df_nb != NULL) { + ASSERT3P(df->amd_df_nb->amd_nb_df, ==, df); + df->amd_df_nb->amd_nb_df = NULL; + } + amdf17nbdf_cleanup_df(df); + mutex_exit(&nbdf->amd_nbdf_lock); + 
return (DDI_SUCCESS); + } + } + mutex_exit(&nbdf->amd_nbdf_lock); + + return (DDI_FAILURE); +} + +static int +amdf17nbdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **resultp) +{ + dev_t dev; + minor_t minor; + amdf17nbdf_t *nbdf; + amdf17nb_t *nb; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + case DDI_INFO_DEVT2INSTANCE: + break; + default: + return (DDI_FAILURE); + } + + dev = (dev_t)arg; + minor = getminor(dev); + nbdf = amdf17nbdf; + + mutex_enter(&nbdf->amd_nbdf_lock); + nb = amdf17nbdf_lookup_nb(nbdf, (id_t)minor); + if (nb == NULL) { + mutex_exit(&nbdf->amd_nbdf_lock); + return (DDI_FAILURE); + } + if (cmd == DDI_INFO_DEVT2DEVINFO) { + *resultp = nb->amd_nb_dip; + } else { + int inst = ddi_get_instance(nb->amd_nb_dip); + *resultp = (void *)(uintptr_t)inst; + } + mutex_exit(&nbdf->amd_nbdf_lock); + + return (DDI_SUCCESS); +} + +static void +amdf17nbdf_destroy(amdf17nbdf_t *nbdf) +{ + amdf17nb_t *nb; + amdf17df_t *df; + + while ((nb = list_remove_head(&nbdf->amd_nbdf_nbs)) != NULL) { + amdf17nbdf_cleanup_nb(nbdf, nb); + } + list_destroy(&nbdf->amd_nbdf_nbs); + + while ((df = list_remove_head(&nbdf->amd_nbdf_dfs)) != NULL) { + amdf17nbdf_cleanup_df(df); + } + list_destroy(&nbdf->amd_nbdf_dfs); + + if (nbdf->amd_nbdf_minors != NULL) { + id_space_destroy(nbdf->amd_nbdf_minors); + } + + mutex_destroy(&nbdf->amd_nbdf_lock); + kmem_free(nbdf, sizeof (amdf17nbdf_t)); +} + +static amdf17nbdf_t * +amdf17nbdf_create(void) +{ + amdf17nbdf_t *nbdf; + + nbdf = kmem_zalloc(sizeof (amdf17nbdf_t), KM_SLEEP); + mutex_init(&nbdf->amd_nbdf_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nbdf->amd_nbdf_nbs, sizeof (amdf17nb_t), + offsetof(amdf17nb_t, amd_nb_link)); + list_create(&nbdf->amd_nbdf_dfs, sizeof (amdf17df_t), + offsetof(amdf17df_t, amd_df_link)); + if ((nbdf->amd_nbdf_minors = id_space_create("amdf17nbdf_minors", + AMDF17_MINOR_LOW, AMDF17_MINOR_HIGH)) == NULL) { + amdf17nbdf_destroy(nbdf); + return (NULL); + } + + return (nbdf); +} + +static 
struct cb_ops amdf17nbdf_cb_ops = { + .cb_open = amdf17nbdf_open, + .cb_close = amdf17nbdf_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = amdf17nbdf_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_flag = D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +static struct dev_ops amdf17nbdf_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = amdf17nbdf_getinfo, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = amdf17nbdf_attach, + .devo_detach = amdf17nbdf_detach, + .devo_reset = nodev, + .devo_power = ddi_power, + .devo_quiesce = ddi_quiesce_not_needed, + .devo_cb_ops = &amdf17nbdf_cb_ops +}; + +static struct modldrv amdf17nbdf_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "AMD Family 17h Driver", + .drv_dev_ops = &amdf17nbdf_dev_ops +}; + +static struct modlinkage amdf17nbdf_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &amdf17nbdf_modldrv, NULL } +}; + +int +_init(void) +{ + int ret; + amdf17nbdf_t *nbdf; + + if ((nbdf = amdf17nbdf_create()) == NULL) { + return (ENOMEM); + } + + if ((ret = mod_install(&amdf17nbdf_modlinkage)) != 0) { + amdf17nbdf_destroy(amdf17nbdf); + return (ret); + } + + amdf17nbdf = nbdf; + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&amdf17nbdf_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&amdf17nbdf_modlinkage)) != 0) { + return (ret); + } + + amdf17nbdf_destroy(amdf17nbdf); + amdf17nbdf = NULL; + return (ret); +} diff --git a/usr/src/uts/intel/io/coretemp/coretemp.c b/usr/src/uts/intel/io/coretemp/coretemp.c new file mode 100644 index 0000000000..e21d385991 --- /dev/null +++ b/usr/src/uts/intel/io/coretemp/coretemp.c @@ -0,0 +1,784 @@ +/* + * This file and its contents are supplied under the terms of the + * Common 
Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* + * Intel CPU Thermal sensor driver + * + * The MSRs used here were introduced with the 'Core' family processors + * and have since spread beyond there, even to the Atom line. Currently, + * temperature sensors exist on a per-core basis and optionally on a per-package + * basis. The temperature sensor exposes a reading that's relative to the + * processor's maximum junction temperature, often referred to as Tj Max. We + * currently only support models where we can determine that junction + * temperature programmatically. For older processors, we would need to track + * down the datasheet. Unfortunately, the values here are often on a per-brand + * string basis. That is, two CPUs with the same model and stepping that have + * been binned differently may have different junction temperatures. + * + * The temperature is exposed through /dev and uses a semi-standard sensor + * framework. We expose one minor node per CPU core and one minor node per CPU + * package, if that is supported. Reads are rate-limited in the driver at 100ms + * by default per the global variable coretemp_cache_ms. 
+ */ + +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/types.h> +#include <sys/file.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/list.h> +#include <sys/stddef.h> +#include <sys/cmn_err.h> +#include <sys/id_space.h> +#include <sys/x86_archext.h> +#include <sys/cpu_module.h> +#include <sys/ontrap.h> +#include <sys/cpuvar.h> +#include <sys/x_call.h> +#include <sys/sensors.h> + +#define CORETEMP_MINOR_MIN 1 +#define CORETEMP_MINOR_MAX INT32_MAX + +typedef struct coretemp_core { + list_node_t ctc_link; + id_t ctc_core_minor; + id_t ctc_pkg_minor; + enum cmi_hdl_class ctc_class; + uint_t ctc_chip; + uint_t ctc_core; + uint_t ctc_strand; + uint_t ctc_tjmax; + hrtime_t ctc_last_read; + uint64_t ctc_core_status; + uint64_t ctc_core_intr; + uint64_t ctc_pkg_status; + uint64_t ctc_pkg_intr; + uint64_t ctc_invalid_reads; + /* The following fields are derived from above */ + uint_t ctc_temperature; + uint_t ctc_resolution; + uint_t ctc_pkg_temperature; +} coretemp_core_t; + +typedef struct coretemp { + dev_info_t *coretemp_dip; + id_space_t *coretemp_ids; + cpuset_t *coretemp_cpuset; + boolean_t coretemp_pkg; + kmutex_t coretemp_mutex; + list_t coretemp_cores; +} coretemp_t; + +coretemp_t *coretemp; + +/* + * This indicates a number of milliseconds that we should wait between reads. + * This is somewhat arbitrary, but the goal is to reduce cross call activity + * and reflect that the sensor may not update all the time. 
+ */ +uint_t coretemp_cache_ms = 100; + +static int +coretemp_rdmsr_xc(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) +{ + uint_t msr = (uint_t)arg1; + uint64_t *valp = (uint64_t *)arg2; + cmi_errno_t *errp = (cmi_errno_t *)arg3; + + on_trap_data_t otd; + + if (on_trap(&otd, OT_DATA_ACCESS) == 0) { + if (checked_rdmsr(msr, valp) == 0) { + *errp = CMI_SUCCESS; + } else { + *errp = CMIERR_NOTSUP; + } + } else { + *errp = CMIERR_MSRGPF; + } + no_trap(); + + return (0); +} + +/* + * This really should just be a call to the CMI handle to provide us the MSR. + * However, that routine, cmi_hdl_rdmsr(), cannot be safely used until it is + * fixed for use outside of a panic-like context. + */ +static int +coretemp_rdmsr(coretemp_t *ct, cmi_hdl_t hdl, uint_t msr, uint64_t *valp) +{ + id_t cpu = cmi_hdl_logical_id(hdl); + int ret = CMI_SUCCESS; + + ASSERT(MUTEX_HELD(&ct->coretemp_mutex)); + kpreempt_disable(); + if (CPU->cpu_id == cpu) { + (void) coretemp_rdmsr_xc((xc_arg_t)msr, (xc_arg_t)valp, + (xc_arg_t)&ret); + } else { + cpuset_only(ct->coretemp_cpuset, (uint_t)cpu); + xc_call((xc_arg_t)msr, (xc_arg_t)valp, (xc_arg_t)&ret, + (ulong_t *)ct->coretemp_cpuset, coretemp_rdmsr_xc); + } + kpreempt_enable(); + + return (ret); +} + +static int +coretemp_cmi_errno(cmi_errno_t e) +{ + switch (e) { + case CMIERR_NOTSUP: + return (ENOTSUP); + default: + return (EIO); + } +} + +/* + * Answer the question of whether or not the driver can support the CPU in + * question. Right now we have the following constraints for supporting the CPU: + * + * o The CPU is made by Intel + * o The CPU has the Digital Thermal Sensor + * o The CPU family is 6, which is usually implicit from the above + * o We can determine its junction temperature through an MSR + * + * If we can't determine the junction temperature programatically, then we need + * to set up tables of CPUs to do so. This can be fleshed out and improved. 
+ */ +static boolean_t +coretemp_supported(void) +{ + uint_t model; + + if (cpuid_getvendor(CPU) != X86_VENDOR_Intel) { + return (B_FALSE); + } + + if (!is_x86_feature(x86_featureset, X86FSET_CORE_THERMAL)) { + return (B_FALSE); + } + + if (cpuid_getfamily(CPU) != 6) { + return (B_FALSE); + } + + model = cpuid_getmodel(CPU); + if (model <= INTC_MODEL_PENRYN || model == INTC_MODEL_SILVERTHORNE || + model == INTC_MODEL_LINCROFT || model == INTC_MODEL_PENWELL || + model == INTC_MODEL_CLOVERVIEW || model == INTC_MODEL_CEDARVIEW) { + return (B_FALSE); + } + + return (B_TRUE); +} + +static coretemp_core_t * +coretemp_lookup_core(coretemp_t *ct, minor_t minor) +{ + coretemp_core_t *ctc; + + ASSERT(MUTEX_HELD(&ct->coretemp_mutex)); + + if (minor < CORETEMP_MINOR_MIN || minor > CORETEMP_MINOR_MAX) { + return (NULL); + } + + for (ctc = list_head(&ct->coretemp_cores); ctc != NULL; + ctc = list_next(&ct->coretemp_cores, ctc)) { + if (ctc->ctc_core_minor == (id_t)minor || + (ctc->ctc_pkg_minor >= CORETEMP_MINOR_MIN && + ctc->ctc_pkg_minor == (id_t)minor)) { + return (ctc); + } + } + + return (NULL); +} + + +/* + * We need to determine the value of Tj Max as all temperature sensors are + * derived from this value. The ease of this depends on how old the processor in + * question is. The Core family processors after Penryn have support for an MSR + * that tells us what to go for. In the Atom family, processors starting with + * Silvermont have support for an MSR that documents this value. For older + * processors, one needs to track down the datasheet for a specific processor. + * Two processors in the same family/model may have different values of Tj Max. + * At the moment, we only support this on processors that have that MSR. 
+ */ +static int +coretemp_calculate_tjmax(coretemp_t *ct, coretemp_core_t *ctc, cmi_hdl_t hdl) +{ + cmi_errno_t e; + int err = 0; + uint64_t val = 0; + + e = coretemp_rdmsr(ct, hdl, MSR_TEMPERATURE_TARGET, &val); + if (e == CMI_SUCCESS && val != 0) { + ctc->ctc_tjmax = MSR_TEMPERATURE_TARGET_TARGET(val); + } else if (val == 0) { + err = EINVAL; + } else { + err = coretemp_cmi_errno(e); + } + + return (err); +} + +static int +coretemp_read(coretemp_t *ct, coretemp_core_t *ctc, cmi_hdl_t hdl) +{ + cmi_errno_t e; + int err = 0; + uint64_t val = 0; + + ctc->ctc_last_read = gethrtime(); + + e = coretemp_rdmsr(ct, hdl, MSR_IA32_THERM_STATUS, &val); + if (e == CMI_SUCCESS) { + ctc->ctc_core_status = val; + } else { + err = coretemp_cmi_errno(e); + dev_err(ct->coretemp_dip, CE_WARN, "!failed to get core " + "thermal status on %u/%u: %d", ctc->ctc_chip, ctc->ctc_core, + err); + return (err); + } + + e = coretemp_rdmsr(ct, hdl, MSR_IA32_THERM_INTERRUPT, &val); + if (e == CMI_SUCCESS) { + ctc->ctc_core_intr = val; + } else { + err = coretemp_cmi_errno(e); + dev_err(ct->coretemp_dip, CE_WARN, "!failed to get core " + "thermal interrupt on %u/%u: %d", ctc->ctc_chip, + ctc->ctc_core, err); + return (err); + } + + /* + * If the last read wasn't valid, then we should keep the current state. + */ + if ((ctc->ctc_core_status & IA32_THERM_STATUS_READ_VALID) != 0) { + uint_t diff; + diff = IA32_THERM_STATUS_READING(ctc->ctc_core_status); + + if (diff >= ctc->ctc_tjmax) { + dev_err(ct->coretemp_dip, CE_WARN, "!found invalid " + "core temperature on %u/%u: readout: %u, Tjmax: " + "%u, raw: 0x%" PRIx64, ctc->ctc_chip, + ctc->ctc_core, diff, ctc->ctc_tjmax, + ctc->ctc_core_status); + ctc->ctc_invalid_reads++; + } else { + ctc->ctc_temperature = ctc->ctc_tjmax - diff; + } + } else { + ctc->ctc_invalid_reads++; + } + + ctc->ctc_resolution = + IA32_THERM_STATUS_RESOLUTION(ctc->ctc_core_status); + + /* + * If we have package support and this is core zero, then update the + * package data. 
+ */ + if (ct->coretemp_pkg && ctc->ctc_core == 0) { + uint_t diff; + + e = coretemp_rdmsr(ct, hdl, MSR_IA32_PACKAGE_THERM_STATUS, + &val); + if (e == CMI_SUCCESS) { + ctc->ctc_pkg_status = val; + } else { + err = coretemp_cmi_errno(e); + dev_err(ct->coretemp_dip, CE_WARN, "!failed to get " + "package thermal status on %u: %d", ctc->ctc_chip, + err); + return (err); + } + + e = coretemp_rdmsr(ct, hdl, MSR_IA32_PACKAGE_THERM_INTERRUPT, + &val); + if (e == CMI_SUCCESS) { + ctc->ctc_pkg_intr = val; + } else { + err = coretemp_cmi_errno(e); + dev_err(ct->coretemp_dip, CE_WARN, "!failed to get " + "package thermal interrupt on %u: %d", + ctc->ctc_chip, err); + return (err); + } + + diff = IA32_PKG_THERM_STATUS_READING(ctc->ctc_pkg_status); + if (diff >= ctc->ctc_tjmax) { + dev_err(ct->coretemp_dip, CE_WARN, "!found invalid " + "package temperature on %u: readout: %u, tjmax: " + "%u, raw: 0x%" PRIx64, ctc->ctc_chip, diff, + ctc->ctc_tjmax, ctc->ctc_pkg_status); + ctc->ctc_invalid_reads++; + + } else { + ctc->ctc_pkg_temperature = ctc->ctc_tjmax - diff; + } + } + + return (0); +} + +static int +coretemp_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + coretemp_t *ct = coretemp; + + if (crgetzoneid(credp) != GLOBAL_ZONEID || drv_priv(credp)) { + return (EPERM); + } + + if ((flags & (FEXCL | FNDELAY | FWRITE)) != 0) { + return (EINVAL); + } + + if (otype != OTYP_CHR) { + return (EINVAL); + } + + /* + * Sanity check the minor + */ + mutex_enter(&ct->coretemp_mutex); + if (coretemp_lookup_core(ct, getminor(*devp)) == NULL) { + mutex_exit(&ct->coretemp_mutex); + return (ENXIO); + } + mutex_exit(&ct->coretemp_mutex); + + return (0); +} + +static int +coretemp_ioctl_kind(intptr_t arg, int mode) +{ + sensor_ioctl_kind_t kind; + + bzero(&kind, sizeof (kind)); + kind.sik_kind = SENSOR_KIND_TEMPERATURE; + + if (ddi_copyout((void *)&kind, (void *)arg, sizeof (kind), + mode & FKIOCTL) != 0) { + return (EFAULT); + } + + return (0); +} + +static int 
+coretemp_ioctl_temp(coretemp_t *ct, minor_t minor, intptr_t arg, int mode) +{ + coretemp_core_t *ctc; + hrtime_t diff; + sensor_ioctl_temperature_t temp; + + bzero(&temp, sizeof (temp)); + + mutex_enter(&ct->coretemp_mutex); + ctc = coretemp_lookup_core(ct, minor); + if (ctc == NULL) { + mutex_exit(&ct->coretemp_mutex); + return (ENXIO); + } + + diff = NSEC2MSEC(gethrtime() - ctc->ctc_last_read); + if (diff > 0 && diff > (hrtime_t)coretemp_cache_ms) { + int ret; + cmi_hdl_t hdl; + + if ((hdl = cmi_hdl_lookup(ctc->ctc_class, ctc->ctc_chip, + ctc->ctc_core, ctc->ctc_strand)) == NULL) { + mutex_exit(&ct->coretemp_mutex); + return (ENXIO); + } + ret = coretemp_read(ct, ctc, hdl); + cmi_hdl_rele(hdl); + if (ret != 0) { + mutex_exit(&ct->coretemp_mutex); + return (ret); + } + } + + temp.sit_unit = SENSOR_UNIT_CELSIUS; + if ((id_t)minor == ctc->ctc_core_minor) { + temp.sit_temp = ctc->ctc_temperature; + } else { + temp.sit_temp = ctc->ctc_pkg_temperature; + } + + /* + * The resolution field is in whole units of degrees Celsius. + */ + temp.sit_gran = ctc->ctc_resolution; + if (ctc->ctc_resolution > 1) { + temp.sit_gran *= -1; + } + mutex_exit(&ct->coretemp_mutex); + + if (ddi_copyout(&temp, (void *)arg, sizeof (temp), + mode & FKIOCTL) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +coretemp_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + coretemp_t *ct = coretemp; + + if ((mode & FREAD) == 0) { + return (EINVAL); + } + + switch (cmd) { + case SENSOR_IOCTL_TYPE: + return (coretemp_ioctl_kind(arg, mode)); + case SENSOR_IOCTL_TEMPERATURE: + return (coretemp_ioctl_temp(ct, getminor(dev), arg, mode)); + default: + return (ENOTTY); + } +} + +/* + * We don't really do any state tracking on close, so for now, just allow it to + * always succeed. 
+ */ +static int +coretemp_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + return (0); +} + +static void +coretemp_fini_core(coretemp_t *ct, coretemp_core_t *ctc) +{ + if (ctc->ctc_core_minor > 0) + id_free(ct->coretemp_ids, ctc->ctc_core_minor); + if (ctc->ctc_pkg_minor > 0) + id_free(ct->coretemp_ids, ctc->ctc_pkg_minor); + kmem_free(ctc, sizeof (coretemp_core_t)); +} + +static void +coretemp_destroy(coretemp_t *ct) +{ + coretemp_core_t *ctc; + + ddi_remove_minor_node(ct->coretemp_dip, NULL); + + while ((ctc = list_remove_head(&ct->coretemp_cores)) != NULL) { + coretemp_fini_core(ct, ctc); + } + list_destroy(&ct->coretemp_cores); + + if (ct->coretemp_cpuset != NULL) { + cpuset_free(ct->coretemp_cpuset); + } + + if (ct->coretemp_ids != NULL) { + id_space_destroy(ct->coretemp_ids); + } + + mutex_destroy(&ct->coretemp_mutex); + kmem_free(ct, sizeof (coretemp_t)); +} + +static int +coretemp_init_core(cmi_hdl_t hdl, void *arg1, void *arg2, void *arg3) +{ + coretemp_t *ct = arg1; + boolean_t *walkerr = arg2; + coretemp_core_t *ctc; + uint_t chip, core; + int err; + + chip = cmi_hdl_chipid(hdl); + core = cmi_hdl_coreid(hdl); + + /* + * The temperature sensor only exists on a per-core basis. Therefore we + * ignore any non-zero strand. 
+ */ + if (cmi_hdl_strandid(hdl) != 0) { + return (CMI_HDL_WALK_NEXT); + } + + ctc = kmem_zalloc(sizeof (coretemp_core_t), KM_SLEEP); + ctc->ctc_class = cmi_hdl_class(hdl); + ctc->ctc_chip = chip; + ctc->ctc_core = core; + ctc->ctc_strand = 0; + ctc->ctc_core_minor = id_alloc(ct->coretemp_ids); + if (ct->coretemp_pkg && ctc->ctc_core == 0) { + ctc->ctc_pkg_minor = id_alloc(ct->coretemp_ids); + } + + if ((err = coretemp_calculate_tjmax(ct, ctc, hdl)) != 0) { + dev_err(ct->coretemp_dip, CE_WARN, + "failed to read Tj Max on %u/%u: %d", chip, core, err); + *walkerr = B_TRUE; + coretemp_fini_core(ct, ctc); + return (CMI_HDL_WALK_DONE); + } + + if ((err = coretemp_read(ct, ctc, hdl)) != 0) { + dev_err(ct->coretemp_dip, CE_WARN, + "failed to take initial temperature reading on %u/%u: %d", + chip, core, err); + *walkerr = B_TRUE; + coretemp_fini_core(ct, ctc); + return (CMI_HDL_WALK_DONE); + } + + list_insert_tail(&ct->coretemp_cores, ctc); + + return (CMI_HDL_WALK_NEXT); +} + +static boolean_t +coretemp_create_minors(coretemp_t *ct) +{ + coretemp_core_t *ctc; + + for (ctc = list_head(&ct->coretemp_cores); ctc != NULL; + ctc = list_next(&ct->coretemp_cores, ctc)) { + int ret; + char buf[128]; + + if (snprintf(buf, sizeof (buf), "chip%u.core%u", ctc->ctc_chip, + ctc->ctc_core) >= sizeof (buf)) { + return (B_FALSE); + } + ret = ddi_create_minor_node(ct->coretemp_dip, buf, S_IFCHR, + ctc->ctc_core_minor, DDI_NT_SENSOR_TEMP_CPU, 0); + if (ret != DDI_SUCCESS) { + dev_err(ct->coretemp_dip, CE_WARN, "!failed to create " + "minor node %s", buf); + return (B_FALSE); + } + + if (ctc->ctc_core != 0) + continue; + + if (snprintf(buf, sizeof (buf), "chip%u", ctc->ctc_chip) >= + sizeof (buf)) { + return (B_FALSE); + } + + ret = ddi_create_minor_node(ct->coretemp_dip, buf, S_IFCHR, + ctc->ctc_pkg_minor, DDI_NT_SENSOR_TEMP_CPU, 0); + if (ret != DDI_SUCCESS) { + dev_err(ct->coretemp_dip, CE_WARN, "!failed to create " + "minor node %s", buf); + return (B_FALSE); + } + } + + return (B_TRUE); 
+} + +static int +coretemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + boolean_t walkerr; + coretemp_t *ct = NULL; + + if (cmd == DDI_RESUME) { + /* + * Currently suspend and resume for this driver are nops. + */ + return (DDI_SUCCESS); + } + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (coretemp != NULL) { + return (DDI_FAILURE); + } + + ct = kmem_zalloc(sizeof (coretemp_t), KM_SLEEP); + ct->coretemp_dip = dip; + ct->coretemp_pkg = is_x86_feature(x86_featureset, X86FSET_PKG_THERMAL); + list_create(&ct->coretemp_cores, sizeof (coretemp_core_t), + offsetof(coretemp_core_t, ctc_link)); + mutex_init(&ct->coretemp_mutex, NULL, MUTEX_DRIVER, NULL); + ct->coretemp_cpuset = cpuset_alloc(KM_SLEEP); + if ((ct->coretemp_ids = id_space_create("coretemp_minors", 1, + INT32_MAX)) == NULL) { + goto fail; + } + + mutex_enter(&ct->coretemp_mutex); + walkerr = B_FALSE; + cmi_hdl_walk(coretemp_init_core, ct, &walkerr, NULL); + + if (walkerr) { + mutex_exit(&ct->coretemp_mutex); + goto fail; + } + + if (!coretemp_create_minors(ct)) { + mutex_exit(&ct->coretemp_mutex); + goto fail; + } + + coretemp = ct; + mutex_exit(&ct->coretemp_mutex); + return (DDI_SUCCESS); +fail: + coretemp = NULL; + coretemp_destroy(ct); + return (DDI_FAILURE); + +} + +static int +coretemp_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, + void **resultp) +{ + int ret; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = coretemp->coretemp_dip; + ret = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + ret = DDI_SUCCESS; + break; + default: + ret = DDI_FAILURE; + break; + } + + return (ret); +} + +static int +coretemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + coretemp_t *ct; + + if (cmd == DDI_SUSPEND) { + return (DDI_SUCCESS); + } + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + if (coretemp == NULL) { + return (DDI_FAILURE); + } + + ct = coretemp; + coretemp = NULL; + coretemp_destroy(ct); + + return (DDI_SUCCESS); +} + 
+static struct cb_ops coretemp_cb_ops = { + .cb_open = coretemp_open, + .cb_close = coretemp_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = coretemp_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_flag = D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +static struct dev_ops coretemp_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = coretemp_getinfo, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = coretemp_attach, + .devo_detach = coretemp_detach, + .devo_reset = nodev, + .devo_power = ddi_power, + .devo_quiesce = ddi_quiesce_not_needed, + .devo_cb_ops = &coretemp_cb_ops +}; + +static struct modldrv coretemp_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "Intel CPU/Package thermal sensor", + .drv_dev_ops = &coretemp_dev_ops +}; + +static struct modlinkage coretemp_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &coretemp_modldrv, NULL } +}; + +int +_init(void) +{ + if (!coretemp_supported()) { + return (ENOTSUP); + } + + return (mod_install(&coretemp_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&coretemp_modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&coretemp_modlinkage)); +} diff --git a/usr/src/uts/intel/io/coretemp/coretemp.conf b/usr/src/uts/intel/io/coretemp/coretemp.conf new file mode 100644 index 0000000000..1880a2fa16 --- /dev/null +++ b/usr/src/uts/intel/io/coretemp/coretemp.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, Joyent, Inc. +# + +name="coretemp" parent="pseudo" instance=0; diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 87fac33563..0545633682 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -235,6 +235,38 @@ extern "C" { #define CPUID_INTC_ECX_AHF64 0x00100000 /* LAHF and SAHF in long mode */ /* + * Intel uses cpuid leaf 6 to cover various thermal and power control + * operations. + */ +#define CPUID_INTC_EAX_DTS 0x00000001 /* Digital Thermal Sensor */ +#define CPUID_INTC_EAX_TURBO 0x00000002 /* Turboboost */ +#define CPUID_INTC_EAX_ARAT 0x00000004 /* APIC-Timer-Always-Running */ +/* bit 3 is reserved */ +#define CPUID_INTC_EAX_PLN 0x00000010 /* Power limit notification */ +#define CPUID_INTC_EAX_ECMD 0x00000020 /* Clock mod. duty cycle */ +#define CPUID_INTC_EAX_PTM 0x00000040 /* Package thermal management */ +#define CPUID_INTC_EAX_HWP 0x00000080 /* HWP base registers */ +#define CPUID_INTC_EAX_HWP_NOT 0x00000100 /* HWP Notification */ +#define CPUID_INTC_EAX_HWP_ACT 0x00000200 /* HWP Activity Window */ +#define CPUID_INTC_EAX_HWP_EPR 0x00000400 /* HWP Energy Perf. Pref. 
*/ +#define CPUID_INTC_EAX_HWP_PLR 0x00000800 /* HWP Package Level Request */ +/* bit 12 is reserved */ +#define CPUID_INTC_EAX_HDC 0x00002000 /* HDC */ +#define CPUID_INTC_EAX_TURBO3 0x00004000 /* Turbo Boost Max Tech 3.0 */ +#define CPUID_INTC_EAX_HWP_CAP 0x00008000 /* HWP Capabilities */ +#define CPUID_INTC_EAX_HWP_PECI 0x00010000 /* HWP PECI override */ +#define CPUID_INTC_EAX_HWP_FLEX 0x00020000 /* Flexible HWP */ +#define CPUID_INTC_EAX_HWP_FAST 0x00040000 /* Fast IA32_HWP_REQUEST */ +/* bit 19 is reserved */ +#define CPUID_INTC_EAX_HWP_IDLE 0x00100000 /* Ignore Idle Logical HWP */ + +#define CPUID_INTC_EBX_DTS_NTRESH(x) ((x) & 0xf) + +#define CPUID_INTC_ECX_MAPERF 0x00000001 /* IA32_MPERF / IA32_APERF */ +/* bits 1-2 are reserved */ +#define CPUID_INTC_ECX_PERFBIAS 0x00000008 /* IA32_ENERGY_PERF_BIAS */ + +/* * Intel also uses cpuid leaf 7 to have additional instructions and features. * Like some other leaves, but unlike the current ones we care about, it * requires us to specify both a leaf in %eax and a sub-leaf in %ecx. 
To deal @@ -481,6 +513,74 @@ extern "C" { #define IA32_VMX_EPT_VPID_INVEPT_SINGLE (1UL << 25) #define IA32_VMX_EPT_VPID_INVEPT_ALL (1UL << 26) +/* + * Intel Thermal MSRs + */ +#define MSR_IA32_THERM_INTERRUPT 0x19b +#define IA32_THERM_INTERRUPT_HIGH_IE 0x00000001 +#define IA32_THERM_INTERRUPT_LOW_IE 0x00000002 +#define IA32_THERM_INTERRUPT_PROCHOT_IE 0x00000004 +#define IA32_THERM_INTERRUPT_FORCEPR_IE 0x00000008 +#define IA32_THERM_INTERRUPT_CRIT_IE 0x00000010 +#define IA32_THERM_INTERRUPT_TR1_VAL(x) (((x) >> 8) & 0x7f) +#define IA32_THERM_INTTERUPT_TR1_IE 0x00008000 +#define IA32_THERM_INTTERUPT_TR2_VAL(x) (((x) >> 16) & 0x7f) +#define IA32_THERM_INTERRUPT_TR2_IE 0x00800000 +#define IA32_THERM_INTERRUPT_PL_NE 0x01000000 + +#define MSR_IA32_THERM_STATUS 0x19c +#define IA32_THERM_STATUS_STATUS 0x00000001 +#define IA32_THERM_STATUS_STATUS_LOG 0x00000002 +#define IA32_THERM_STATUS_PROCHOT 0x00000004 +#define IA32_THERM_STATUS_PROCHOT_LOG 0x00000008 +#define IA32_THERM_STATUS_CRIT_STATUS 0x00000010 +#define IA32_THERM_STATUS_CRIT_LOG 0x00000020 +#define IA32_THERM_STATUS_TR1_STATUS 0x00000040 +#define IA32_THERM_STATUS_TR1_LOG 0x00000080 +#define IA32_THERM_STATUS_TR2_STATUS 0x00000100 +#define IA32_THERM_STATUS_TR2_LOG 0x00000200 +#define IA32_THERM_STATUS_POWER_LIMIT_STATUS 0x00000400 +#define IA32_THERM_STATUS_POWER_LIMIT_LOG 0x00000800 +#define IA32_THERM_STATUS_CURRENT_STATUS 0x00001000 +#define IA32_THERM_STATUS_CURRENT_LOG 0x00002000 +#define IA32_THERM_STATUS_CROSS_DOMAIN_STATUS 0x00004000 +#define IA32_THERM_STATUS_CROSS_DOMAIN_LOG 0x00008000 +#define IA32_THERM_STATUS_READING(x) (((x) >> 16) & 0x7f) +#define IA32_THERM_STATUS_RESOLUTION(x) (((x) >> 27) & 0x0f) +#define IA32_THERM_STATUS_READ_VALID 0x80000000 + +#define MSR_TEMPERATURE_TARGET 0x1a2 +#define MSR_TEMPERATURE_TARGET_TARGET(x) (((x) >> 16) & 0xff) +/* + * Not all models support the offset. Refer to the Intel SDM Volume 4 for a list + * of which models have support for which bits. 
+ */ +#define MSR_TEMPERATURE_TARGET_OFFSET(x) (((x) >> 24) & 0x0f) + +#define MSR_IA32_PACKAGE_THERM_STATUS 0x1b1 +#define IA32_PKG_THERM_STATUS_STATUS 0x00000001 +#define IA32_PKG_THERM_STATUS_STATUS_LOG 0x00000002 +#define IA32_PKG_THERM_STATUS_PROCHOT 0x00000004 +#define IA32_PKG_THERM_STATUS_PROCHOT_LOG 0x00000008 +#define IA32_PKG_THERM_STATUS_CRIT_STATUS 0x00000010 +#define IA32_PKG_THERM_STATUS_CRIT_LOG 0x00000020 +#define IA32_PKG_THERM_STATUS_TR1_STATUS 0x00000040 +#define IA32_PKG_THERM_STATUS_TR1_LOG 0x00000080 +#define IA32_PKG_THERM_STATUS_TR2_STATUS 0x00000100 +#define IA32_PKG_THERM_STATUS_TR2_LOG 0x00000200 +#define IA32_PKG_THERM_STATUS_READING(x) (((x) >> 16) & 0x7f) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x1b2 +#define IA32_PKG_THERM_INTERRUPT_HIGH_IE 0x00000001 +#define IA32_PKG_THERM_INTERRUPT_LOW_IE 0x00000002 +#define IA32_PKG_THERM_INTERRUPT_PROCHOT_IE 0x00000004 +#define IA32_PKG_THERM_INTERRUPT_OVERHEAT_IE 0x00000010 +#define IA32_PKG_THERM_INTERRUPT_TR1_VAL(x) (((x) >> 8) & 0x7f) +#define IA32_PKG_THERM_INTTERUPT_TR1_IE 0x00008000 +#define IA32_PKG_THERM_INTTERUPT_TR2_VAL(x) (((x) >> 16) & 0x7f) +#define IA32_PKG_THERM_INTERRUPT_TR2_IE 0x00800000 +#define IA32_PKG_THERM_INTERRUPT_PL_NE 0x01000000 + #define MCI_CTL_VALUE 0xffffffff #define MTRR_TYPE_UC 0 @@ -605,6 +705,8 @@ extern "C" { #define X86FSET_TBM 90 #define X86FSET_AVX512VNNI 91 #define X86FSET_AMD_PCEC 92 +#define X86FSET_CORE_THERMAL 93 +#define X86FSET_PKG_THERMAL 94 /* * Intel Deep C-State invariant TSC in leaf 0x80000007. @@ -612,16 +714,6 @@ extern "C" { #define CPUID_TSC_CSTATE_INVARIANCE (0x100) /* - * Intel Deep C-state always-running local APIC timer - */ -#define CPUID_CSTATE_ARAT (0x4) - -/* - * Intel ENERGY_PERF_BIAS MSR indicated by feature bit CPUID.6.ECX[3]. - */ -#define CPUID_EPB_SUPPORT (1 << 3) - -/* * Intel TSC deadline timer */ #define CPUID_DEADLINE_TSC (1 << 24) @@ -888,7 +980,9 @@ extern "C" { * Definitions for Intel processor models. 
These are all for Family 6 * processors. This list and the Atom set below it are not exhuastive. */ +#define INTC_MODEL_YONAH 0x0e #define INTC_MODEL_MEROM 0x0f +#define INTC_MODEL_MEROM_L 0x16 #define INTC_MODEL_PENRYN 0x17 #define INTC_MODEL_DUNNINGTON 0x1d @@ -974,7 +1068,7 @@ extern "C" { #if defined(_KERNEL) || defined(_KMEMUSER) -#define NUM_X86_FEATURES 93 +#define NUM_X86_FEATURES 95 extern uchar_t x86_featureset[]; extern void free_x86_featureset(void *featureset); |