From 74079a53e205d2eeb75b215833ddc684a1db3088 Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Thu, 16 Sep 2021 08:07:40 -0700 Subject: 14092 imc(7D) should not be under i86pc Reviewed by: Gergő Mihály Doma Reviewed by: Andy Fiddaman Reviewed by: Toomas Soome Approved by: Dan McDonald MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- usr/src/cmd/fm/mcdecode/Makefile | 2 +- usr/src/pkg/manifests/driver-cpu-mc.p5m | 15 +- usr/src/test/os-tests/tests/imc/Makefile | 2 +- usr/src/uts/i86pc/Makefile.files | 7 - usr/src/uts/i86pc/Makefile.i86pc | 1 - usr/src/uts/i86pc/Makefile.rules | 8 - usr/src/uts/i86pc/imc/Makefile | 43 - usr/src/uts/i86pc/imcstub/Makefile | 41 - usr/src/uts/i86pc/io/imc/imc.c | 3011 ------------------------------ usr/src/uts/i86pc/io/imc/imc.conf | 16 - usr/src/uts/i86pc/io/imc/imc.h | 941 ---------- usr/src/uts/i86pc/io/imc/imcstub.c | 81 - usr/src/uts/intel/Makefile.files | 7 + usr/src/uts/intel/Makefile.intel | 5 + usr/src/uts/intel/Makefile.rules | 8 + usr/src/uts/intel/imc/Makefile | 43 + usr/src/uts/intel/imcstub/Makefile | 41 + usr/src/uts/intel/io/imc/imc.c | 3011 ++++++++++++++++++++++++++++++ usr/src/uts/intel/io/imc/imc.conf | 16 + usr/src/uts/intel/io/imc/imc.h | 941 ++++++++++ usr/src/uts/intel/io/imc/imcstub.c | 81 + 21 files changed, 4161 insertions(+), 4160 deletions(-) delete mode 100644 usr/src/uts/i86pc/imc/Makefile delete mode 100644 usr/src/uts/i86pc/imcstub/Makefile delete mode 100644 usr/src/uts/i86pc/io/imc/imc.c delete mode 100644 usr/src/uts/i86pc/io/imc/imc.conf delete mode 100644 usr/src/uts/i86pc/io/imc/imc.h delete mode 100644 usr/src/uts/i86pc/io/imc/imcstub.c create mode 100644 usr/src/uts/intel/imc/Makefile create mode 100644 usr/src/uts/intel/imcstub/Makefile create mode 100644 usr/src/uts/intel/io/imc/imc.c create mode 100644 usr/src/uts/intel/io/imc/imc.conf create mode 100644 usr/src/uts/intel/io/imc/imc.h create mode 100644 usr/src/uts/intel/io/imc/imcstub.c 
(limited to 'usr/src') diff --git a/usr/src/cmd/fm/mcdecode/Makefile b/usr/src/cmd/fm/mcdecode/Makefile index 6cfbde5556..841725e4f7 100644 --- a/usr/src/cmd/fm/mcdecode/Makefile +++ b/usr/src/cmd/fm/mcdecode/Makefile @@ -26,7 +26,7 @@ ROOTLIBFMD = $(ROOT)/usr/lib/fm/fmd ROOTPROG = $(ROOTLIBFMD)/$(PROG) $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG -CPPFLAGS += -I$(SRC)/uts/i86pc/io/imc +CPPFLAGS += -I$(SRC)/uts/intel/io/imc LDLIBS += -lnvpair CSTD = $(CSTD_GNU99) diff --git a/usr/src/pkg/manifests/driver-cpu-mc.p5m b/usr/src/pkg/manifests/driver-cpu-mc.p5m index d8e6016674..9e7f7f8e3b 100644 --- a/usr/src/pkg/manifests/driver-cpu-mc.p5m +++ b/usr/src/pkg/manifests/driver-cpu-mc.p5m @@ -19,15 +19,12 @@ set name=pkg.summary value="CPU Memory Controller Drivers" set name=pkg.description value="CPU Memory Controller Drivers" set name=info.classification value=org.opensolaris.category.2008:System/Hardware set name=variant.arch value=i386 -dir path=platform group=sys -dir path=platform/i86pc group=sys -dir path=platform/i86pc/$(ARCH64) group=sys -dir path=platform/i86pc/kernel group=sys -dir path=platform/i86pc/kernel/drv group=sys -dir path=platform/i86pc/kernel/drv/$(ARCH64) group=sys -file path=platform/i86pc/kernel/drv/$(ARCH64)/imc group=sys -file path=platform/i86pc/kernel/drv/$(ARCH64)/imcstub group=sys -file path=platform/i86pc/kernel/drv/imc.conf group=sys +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +file path=kernel/drv/$(ARCH64)/imc group=sys +file path=kernel/drv/$(ARCH64)/imcstub group=sys +file path=kernel/drv/imc.conf group=sys dir path=usr/share/man dir path=usr/share/man/man7d file path=usr/share/man/man7d/imc.7d diff --git a/usr/src/test/os-tests/tests/imc/Makefile b/usr/src/test/os-tests/tests/imc/Makefile index d62c8048f1..cbf3d81654 100644 --- a/usr/src/test/os-tests/tests/imc/Makefile +++ b/usr/src/test/os-tests/tests/imc/Makefile @@ -39,7 +39,7 @@ include $(SRC)/cmd/Makefile.cmd include 
$(SRC)/test/Makefile.com include $(SRC)/cmd/Makefile.ctf -CPPFLAGS += -I$(SRC)/uts/i86pc/io/imc +CPPFLAGS += -I$(SRC)/uts/intel/io/imc CMDS = $(PROG:%=$(TESTDIR)/%) $(CMDS) := FILEMODE = 0555 diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 9b83a780a5..3f387f508c 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -321,10 +321,3 @@ ASSYM_DEPS += \ CPR_IMPL_OBJS = cpr_impl.o cpr_wakecode.o $(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h - -# -# Intel Integrated Memory Controller -# (Sandy Bridge - Cascade Lake) -# -IMC_OBJS = imc.o imc_decode.o imc_dump.o -IMCSTUB_OBJS = imcstub.o diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index 2ce11ac8f6..e803d33801 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -245,7 +245,6 @@ DRV_KMODS += amd_iommu DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe -DRV_KMODS += imc imcstub DRV_KMODS += vmm DRV_KMODS += viona DRV_KMODS += ppt diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index f07b4b866b..9610e075fa 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -121,14 +121,6 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) -$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/imc/%.c - $(COMPILE.c) -o $@ $< - $(CTFCONVERT_O) - -$(OBJS_DIR)/%.o: $(SRC)/common/mc/imc/%.c - $(COMPILE.c) -o $@ $< - $(CTFCONVERT_O) - $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/pci/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/i86pc/imc/Makefile b/usr/src/uts/i86pc/imc/Makefile deleted file mode 100644 index 32f24db809..0000000000 --- a/usr/src/uts/i86pc/imc/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. 
-# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2019 Joyent, Inc. -# - -UTSBASE = ../.. - -MODULE = imc -OBJECTS = $(IMC_OBJS:%=$(OBJS_DIR)/%) -ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) -CONF_SRCDIR = $(UTSBASE)/i86pc/io/imc - -include $(UTSBASE)/i86pc/Makefile.i86pc - -ALL_TARGET = $(BINARY) $(CONFMOD) -INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) - -CPPFLAGS += -I$(CONF_SRCDIR) -LDFLAGS += -dy - -.KEEP_STATE: - -def: $(DEF_DEPS) - -all: $(ALL_DEPS) - -clean: $(CLEAN_DEPS) - -clobber: $(CLOBBER_DEPS) - -install: $(INSTALL_DEPS) - -include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/imcstub/Makefile b/usr/src/uts/i86pc/imcstub/Makefile deleted file mode 100644 index 6c0026ef51..0000000000 --- a/usr/src/uts/i86pc/imcstub/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2019 Joyent, Inc. -# - -UTSBASE = ../.. 
- -MODULE = imcstub -OBJECTS = $(IMCSTUB_OBJS:%=$(OBJS_DIR)/%) -ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) - -include $(UTSBASE)/i86pc/Makefile.i86pc - -ALL_TARGET = $(BINARY) -INSTALL_TARGET = $(BINARY) $(ROOTMODULE) - -LDFLAGS += -dy -Ndrv/imc - -.KEEP_STATE: - -def: $(DEF_DEPS) - -all: $(ALL_DEPS) - -clean: $(CLEAN_DEPS) - -clobber: $(CLOBBER_DEPS) - -install: $(INSTALL_DEPS) - -include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/io/imc/imc.c b/usr/src/uts/i86pc/io/imc/imc.c deleted file mode 100644 index e1dbfbfc2e..0000000000 --- a/usr/src/uts/i86pc/io/imc/imc.c +++ /dev/null @@ -1,3011 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2019 Joyent, Inc. - */ - -/* - * Generic Intel Integrated Memory Controller (IMC) Driver - * - * This driver talks to the CPU's IMC to understand the detailed topology of the - * processor and to determine how to map between physical addresses to the - * corresponding DIMM. This driver supports the following generations of Intel - * chips: - * - * - Sandy Bridge - * - Ivy Bridge - * - Haswell - * - Broadwell - * - Skylake / Cascade Lake - * - * Memory Decoding - * --------------- - * - * For more detailed summaries of the memory decoding process, please refer to - * the Intel External Design Specifications for the corresponding processor. - * What follows is a rough overview of how the memory decoding system works. - * - * First, we'd like to define the following concepts: - * - * SYSTEM ADDRESS - * - * This is a physical address that the operating system normally uses. 
This - * address may refer to DRAM, it may refer to memory mapped PCI - * configuration space or device registers, or it may refer to other parts - * of the system's memory map, such as the extended advanced programmable - * interrupt controller (xAPIC), etc. - * - * DIMM - * - * Dual-inline memory module. This refers to a physical stick of volatile - * memory that is inserted into a slot on the motherboard. - * - * RANK - * - * A potential sub-division of a DIMM. A DIMM's memory capacity is divided - * into a number of equal sized ranks. For example, an 8 GiB DIMM, may have - * 1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks. - * - * RANK ADDRESS - * - * An address that exists in the context of a given rank on a DIMM. All - * ranks have overlapping addresses, so the address 0x400 exists on all - * ranks on a given DIMM. - * - * CHANNEL - * - * Multiple DIMMs may be combined into a single channel. The channel - * represents the combined memory of all the DIMMs. A given channel only - * ever exists on a socket and is bound to a single memory controller. - * - * CHANNEL ADDRESS - * - * This is an address that exists logically on a channel. Each address on a - * channel maps to a corresponding DIMM that exists on that channel. The - * address space on one channel is independent from that on another. This - * means that address 0x1000 can exist on each memory channel in the - * system. - * - * INTERLEAVE - * - * There are several different cases where interleaving occurs on the - * system. For example, addresses may be interleaved across sockets, - * memory channels, or DIMM ranks. When addresses are interleaved, then - * some number of bits in an address are used to select which target to go - * to (usually through a look up table). The effect of interleaving is that - * addresses that are next to one another may not all go to the same - * device. The following image shows a non-interleaving case. 
- * - * 0x0fff +-----+ +-----+ 0x7ff - * | |\___________/| | - * | | __________ | (b) | - * | | / \| | - * 0x0800 |=====|= +-----+ 0x000 +-----+ 0x7ff - * | | \______________________________/| | - * | | _______________________________ | (a) | - * | |/ \| | - * 0x0000 +-----+ +-----+ 0x000 - * - * In this example of non-interleaving, addresses 0x0000 to 0x07ff go to - * device (a). While, addresses 0x08000 to 0xfff, go to device (b). - * However, each range is divided into the same number of components. - * - * If instead, we were to look at that with interleaving, what we might say - * is that rather than splitting the range in half, we might say that if - * the address has bit 8 set (0x100), then it goes to (b), otherwise it - * goes to (a). This means that addresses 0x000 to 0x0ff, would go to (a). - * 0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a) - * again, and then 0x300 to 0x2ff would go back to (b). This would continue - * for a while. This would instead look something more like: - * - * - * 0x0fff +-----+ A: 0x7ff +---------+ B: 0x7ff +---------+ - * | (b) | | e00-eff | | f00-fff | - * 0x0f00 |-----| 0x700 +---------+ 0x700 +---------+ - * | (a) | | c00-cff | | d00-dff | - * 0x0e00 ~~~~~~~ 0x600 +---------+ 0x600 +---------+ - * *** | a00-aff | | b00-bff | - * 0x0400 ~~~~~~~ 0x500 +---------+ 0x500 +---------+ - * | (b) | | 800-8ff | | 900-9ff | - * 0x0300 |-----| 0x400 +---------+ 0x400 +---------+ - * | (a) | | 600-6ff | | 700-7ff | - * 0x0200 |-----| 0x300 +---------+ 0x300 +---------+ - * | (b) | | 400-4ff | | 500-5ff | - * 0x0100 |-----| 0x200 +---------+ 0x200 +---------+ - * | (a) | | 200-2ff | | 300-3ff | - * 0x0000 +-----+ 0x100 +---------+ 0x100 +---------+ - * | 000-0ff | | 100-1ff | - * 0x000 +---------+ 0x000 +---------+ - * - * In this example we've performed two-way interleaving. The number of ways - * that something can interleave varies based on what we're interleaving - * between. 
- * - * MEMORY CONTROLLER - * - * A given processor die (see uts/i86pc/os/cpuid.c) contains a number of - * memory controllers. Usually 1 or two. Each memory controller supports a - * given number of DIMMs, which are divided across multiple channels. - * - * TARGET ADDRESS DECODER - * - * The target address decoder (TAD) is responsible for taking a system - * address and transforming it into a channel address based on the rules - * that are present. Each memory controller has a corresponding TAD. The - * TAD is often contained in a device called a 'Home Agent'. - * - * SYSTEM ADDRESS DECODER - * - * The system address decoder (SAD) is responsible for taking a system - * address and directing it to the right place, whether this be memory or - * otherwise. There is a single memory controller per socket (see - * uts/i86pc/os/cpuid.c) that is shared between all the cores currently. - * - * NODE IDENTIFIER - * - * The node identifier is used to uniquely identify an element in the - * various routing topologies on the die (see uts/i86pc/os/cpuid.c for the - * definition of 'die'). One can roughly think about this as a unique - * identifier for the socket itself. In general, the primary node ID for a - * socket should map to the socket APIC ID. - * - * Finding Devices - * --------------- - * - * There is a bit of a chicken and egg problem on Intel systems and in the - * device driver interface. The information that we need in the system is spread - * out amongst a large number of different PCI devices that the processor - * exposes. The number of such devices can vary based on the processor - * generation and the specific SKU in the processor. To deal with this, we break - * the driver into two different components: a stub driver and the full driver. - * - * The stub driver has aliases for all known PCI devices that we might attach to - * in a given generation on the system. This driver is called 'imcstub'. 
When a - * stub attaches, it just registers itself with the main driver, upon which it - * has a module dependency. - * - * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it - * kicks off a scan of the device tree which takes place in a task queue. Once - * there, it determines the number of devices that it expects to exist by - * walking the tree and comparing it against the generation-specific table. - * - * If all devices are found, we'll go ahead and read through all the devices and - * build a map of all the information we need to understand the topology of the - * system and to be able to decode addresses. We do this here, because we can be - * asked to perform decoding in dangerous contexts (after taking an MCE, panic, - * etc) where we don't want to have to rely on the broader kernel functioning at - * this point in time. - * - * Once our topology is built, we'll create minor nodes which are used by the - * fault management architecture to query for information and register our - * decoding functionality with the kernel. - * - * PCI Numbering - * ------------- - * - * For each device that we care about, Intel defines the device and function - * that we can expect to find the information and PCI configuration space - * registers that we care about at. However, the PCI bus is not well defined. - * Devices that are on the same socket use the same set of bus numbers; however, - * some sockets have multiple device numbers that they'll use to represent - * different classes. These bus numbers are programmed by systems firmware as - * part of powering on the system. This means, that we need the ability to - * map together these disparate ranges ourselves. - * - * There is a device called a utility box (UBOX), which exists per-socket and - * maps the different sockets together. We use this to determine which devices - * correspond to which sockets. 
- * - * Mapping Sockets - * --------------- - * - * Another wrinkle is that the way that the OS sees the numbering of the CPUs is - * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more - * information). However, to map to the corresponding socket, we need to look at - * the socket's node ID. The order of PCI buses in the system is not required to - * have any relation to the socket ID. Therefore, we have to have yet another - * indirection table in the imc_t. - * - * Exposing Data - * ------------- - * - * We expose topology data to FMA using the OS-private memory controller - * interfaces. By creating minor nodes of the type, 'ddi_mem_ctrl', there are a - * number of specific interfaces that we can then implement. The ioctl API asks - * us for a snapshot of data, which basically has us go through and send an - * nvlist_t to userland. This nvlist_t is constructed as part of the scan - * process. This nvlist uses the version 1 format, which more explicitly encodes - * the topology in a series of nested nvlists. - * - * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the - * decoder and ask it to perform decoding. - * - * Decoding Addresses - * ------------------ - * - * The decoding logic can be found in common/imc/imc_decode.c. This file is - * shared between the kernel and userland to allow for easier testing and - * additional flexibility in operation. The decoding process happens in a few - * different phases. - * - * The first phase, is to determine which memory controller on which socket is - * responsible for this data. To determine this, we use the system address - * decoder and walk the rules, looking for the correct target. There are various - * manipulations to the address that exist which are used to determine which - * index we use. The way that we interpret the output of the rule varies - * somewhat based on the generation. Sandy Bridge just has a node ID which - * points us to the socket with its single IMC. 
On Ivy Bridge through Broadwell, - * the memory controller to use is also encoded in part of the node ID. Finally, - * on Skylake, the SAD tells us which socket to look at. The socket in question - * then has a routing table which tells us which channel on which memory - * controller that is local to that socket. - * - * Once we have the target memory controller, we walk the list of target address - * decoder rules. These rules can help tell us which channel we care about - * (which is required on Sandy Bridge through Broadwell) and then describe some - * amount of the interleaving rules which are used to turn the system address - * into a channel address. - * - * Once we know the channel and the channel address, we walk the rank interleave - * rules which help us determine which DIMM and the corresponding rank on it - * that the corresponding channel address is on. It also has logic that we need - * to use to determine how to transform a channel address into an address on - * that specific rank. Once we have that, then the initial decoding is done. - * - * The logic in imc_decode.c is abstracted away from the broader kernel CMI - * logic. This is on purpose and allows us not only an easier time unit testing - * the logic, but also allows us to express more high fidelity errors that are - * translated into a much smaller subset. This logic is exercised in the - * 'imc_test' program which is built in 'test/os-tests/tests/imc'. - * - * Limitations - * ----------- - * - * Currently, this driver has the following limitations: - * - * o It doesn't decode the row and column addresses. - * o It doesn't encode from a DIMM address to a system address. - * o It doesn't properly support lockstep and mirroring modes on Sandy Bridge - - * Broadwell platforms. - * o It doesn't support virtual lockstep and adaptive mirroring on Purley - * platforms. - * o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs. 
- * o It doesn't know how to decode three way channel interleaving. - * - * None of these are intrinsic problems to the driver, it's mostly a matter of - * having proper documentation and testing. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "imc.h" - -/* - * These tables contain generational data that varies between processor - * generation such as the maximum number of sockets, memory controllers, and the - * offsets of the various registers. - */ - -static const imc_gen_data_t imc_gen_data_snb = { - .igd_max_sockets = 4, - .igd_max_imcs = 2, - .igd_max_channels = 4, - .igd_max_dimms = 3, - .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, - .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, - IMC_REG_MC_MTR2 }, - .igd_mcmtr_offset = 0x7c, - .igd_tolm_offset = 0x80, - .igd_tohm_low_offset = 0x84, - .igd_sad_dram_offset = 0x80, - .igd_sad_ndram_rules = 10, - .igd_sad_nodeid_offset = 0x40, - .igd_tad_nrules = 12, - .igd_tad_rule_offset = 0x40, - .igd_tad_chan_offset = 0x90, - .igd_tad_sysdef = 0x80, - .igd_tad_sysdef2 = 0x84, - .igd_mc_mirror = 0xac, - .igd_rir_nways = 5, - .igd_rir_way_offset = 0x108, - .igd_rir_nileaves = 8, - .igd_rir_ileave_offset = 0x120, - .igd_ubox_cpubusno_offset = 0xd0, -}; - -static const imc_gen_data_t imc_gen_data_ivb = { - .igd_max_sockets = 4, - .igd_max_imcs = 2, - .igd_max_channels = 4, - .igd_max_dimms = 3, - .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, - .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, - IMC_REG_MC_MTR2 }, - .igd_mcmtr_offset = 0x7c, - .igd_tolm_offset = 0x80, - .igd_tohm_low_offset = 0x84, - .igd_sad_dram_offset = 0x60, - .igd_sad_ndram_rules = 20, - .igd_sad_nodeid_offset = 0x40, - .igd_tad_nrules = 12, - .igd_tad_rule_offset = 0x40, - .igd_tad_chan_offset = 0x90, - .igd_tad_sysdef = 0x80, - .igd_tad_sysdef2 = 0x84, - .igd_mc_mirror = 0xac, - .igd_rir_nways = 5, - 
.igd_rir_way_offset = 0x108, - .igd_rir_nileaves = 8, - .igd_rir_ileave_offset = 0x120, - .igd_ubox_cpubusno_offset = 0xd0, -}; - -static const imc_gen_data_t imc_gen_data_has_brd = { - .igd_max_sockets = 4, - .igd_max_imcs = 2, - .igd_max_channels = 4, - .igd_max_dimms = 3, - .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX, - .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, - IMC_REG_MC_MTR2 }, - .igd_mcmtr_offset = 0x7c, - .igd_tolm_offset = 0xd0, - .igd_tohm_low_offset = 0xd4, - .igd_tohm_hi_offset = 0xd8, - .igd_sad_dram_offset = 0x60, - .igd_sad_ndram_rules = 20, - .igd_sad_nodeid_offset = 0x40, - .igd_tad_nrules = 12, - .igd_tad_rule_offset = 0x40, - .igd_tad_chan_offset = 0x90, - .igd_tad_sysdef = 0x80, - .igd_tad_sysdef2 = 0x84, - .igd_mc_mirror = 0xac, - .igd_rir_nways = 5, - .igd_rir_way_offset = 0x108, - .igd_rir_nileaves = 8, - .igd_rir_ileave_offset = 0x120, - .igd_ubox_cpubusno_offset = 0xd0, -}; - -static const imc_gen_data_t imc_gen_data_skx = { - .igd_max_sockets = 8, - .igd_max_imcs = 2, - .igd_max_channels = 3, - .igd_max_dimms = 2, - .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, - .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 }, - .igd_mcmtr_offset = 0x87c, - .igd_topo_offset = 0x88, - .igd_tolm_offset = 0xd0, - .igd_tohm_low_offset = 0xd4, - .igd_tohm_hi_offset = 0xd8, - .igd_sad_dram_offset = 0x60, - .igd_sad_ndram_rules = 24, - .igd_sad_nodeid_offset = 0xc0, - .igd_tad_nrules = 8, - .igd_tad_rule_offset = 0x850, - .igd_tad_chan_offset = 0x90, - .igd_rir_nways = 4, - .igd_rir_way_offset = 0x108, - .igd_rir_nileaves = 4, - .igd_rir_ileave_offset = 0x120, - .igd_ubox_cpubusno_offset = 0xcc, -}; - -/* - * This table contains all of the devices that we're looking for from a stub - * perspective. These are organized by generation. Different generations behave - * in slightly different ways. For example, Sandy Bridge through Broadwell use - * unique PCI IDs for each PCI device/function combination that appears. 
Whereas - * Skylake based systems use the same PCI ID; however, different device/function - * values indicate that the IDs are used for different purposes. - */ -/* BEGIN CSTYLED */ -static const imc_stub_table_t imc_stub_table[] = { - /* Sandy Bridge */ - { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" }, - { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" }, - { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" }, - { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" }, - { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" }, - { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" }, - { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" }, - { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" }, - { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" }, - { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" }, - { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" }, - { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" }, - /* Ivy Bridge */ - { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" }, - { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" }, - { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 
Channel 2 Info" }, - { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" }, - { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" }, - { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" }, - { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" }, - { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" }, - { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" }, - { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" }, - { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" }, - /* Haswell */ - { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" }, - { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" }, - { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" }, - { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. 
Vritualization" }, - { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" }, - { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" }, - { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" }, - { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" }, - /* Broadwell Devices */ - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" }, - { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" }, - { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" }, - { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. 
Vritualization" }, - { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" }, - { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" }, - { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" }, - { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" }, - /* Skylake and Cascade Lake Devices */ - { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" }, - - /* - * There is one SAD MC Route type device per core! Because of this a - * wide array of device and functions are allocated. For now, we list - * all 28 of them out. 
- */ - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" }, - { 
IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" }, - - { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" }, - { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" }, -}; -/* END CSTYLED */ - -#define IMC_PCI_VENDOR_INTC 0x8086 - -/* - * Our IMC data is global and statically set up during a combination of - * _init(9E) and attach(9E). While we have a module dependency between the PCI - * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't - * guarantee that the imc driver has finished attaching. As such we make sure - * that it can operate without it being attached in any way. - */ -static imc_t *imc_data = NULL; - -/* - * By default we should not allow the stubs to detach as we don't have a good - * way of forcing them to attach again. This is provided in case someone does - * want to allow the driver to unload. - */ -int imc_allow_detach = 0; - -static void -imc_set_gen_data(imc_t *imc) -{ - switch (imc->imc_gen) { - case IMC_GEN_SANDY: - imc->imc_gen_data = &imc_gen_data_snb; - break; - case IMC_GEN_IVY: - imc->imc_gen_data = &imc_gen_data_ivb; - break; - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - imc->imc_gen_data = &imc_gen_data_has_brd; - break; - case IMC_GEN_SKYLAKE: - imc->imc_gen_data = &imc_gen_data_skx; - break; - default: - dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " - "set to unknown generation: %u", imc->imc_gen); - } -} - -/* - * If our device (dev_info_t) does not have a non-zero unit address, then - * devfsadmd will not pay attention to us at all. 
Therefore we need to set the - * unit address below, before we create minor nodes. - * - * The rest of the system expects us to have one minor node per socket. The - * minor node ID should be the ID of the socket. - */ -static boolean_t -imc_create_minors(imc_t *imc) -{ - uint_t i; - - ddi_set_name_addr(imc->imc_dip, "1"); - for (i = 0; i < imc->imc_nsockets; i++) { - char buf[MAXNAMELEN]; - - if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >= - sizeof (buf)) { - goto fail; - } - - if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i, - "ddi_mem_ctrl", 0) != DDI_SUCCESS) { - dev_err(imc->imc_dip, CE_WARN, "failed to create " - "minor node %u: %s", i, buf); - goto fail; - } - } - return (B_TRUE); - -fail: - ddi_remove_minor_node(imc->imc_dip, NULL); - return (B_FALSE); -} - -/* - * Check the current MC route value for this SAD. On Skylake systems there is - * one per core. Every core should agree. If not, we will not trust the SAD - * MCROUTE values and this will cause system address decoding to fail on - * skylake. - */ -static void -imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub) -{ - uint32_t val; - - val = pci_config_get32(stub->istub_cfgspace, - IMC_REG_SKX_SAD_MC_ROUTE_TABLE); - if (val == PCI_EINVAL32) { - sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; - return; - } - - if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) { - sad->isad_flags |= IMC_SAD_MCROUTE_VALID; - sad->isad_mcroute.ismc_raw_mcroute = val; - return; - } - - /* - * Occasionally we see MC ROUTE table entries with a value of zero. - * We should ignore those for now. - */ - if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) { - dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch " - "with socket. SAD has val 0x%x, system has %x\n", - val, sad->isad_mcroute.ismc_raw_mcroute); - sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE; - } -} - -/* - * On Skylake, many of the devices that we care about are on separate PCI Buses. 
- * These can be mapped together by the DECS register. However, we need to know - * how to map different buses together so that we can more usefully associate - * information. The set of buses is all present in the DECS register. We'll - * effectively assign sockets to buses. This is also still something that comes - * up on pre-Skylake systems as well. - */ -static boolean_t -imc_map_buses(imc_t *imc) -{ - imc_stub_t *stub; - uint_t nsock; - - /* - * Find the UBOX_DECS registers so we can establish socket mappings. On - * Skylake, there are three different sets of buses that we need to - * cover all of our devices, while there are only two before that. - */ - for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL; - stub = AVL_NEXT(&imc->imc_stubs, stub)) { - uint32_t busno; - - if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) { - continue; - } - - busno = pci_config_get32(stub->istub_cfgspace, - imc->imc_gen_data->igd_ubox_cpubusno_offset); - if (busno == PCI_EINVAL32) { - dev_err(imc->imc_dip, CE_WARN, "failed to read " - "UBOX_DECS CPUBUSNO0: invalid PCI read"); - return (B_FALSE); - } - - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - imc->imc_sockets[nsock].isock_nbus = 3; - imc->imc_sockets[nsock].isock_bus[0] = - IMC_UBOX_CPUBUSNO_0(busno); - imc->imc_sockets[nsock].isock_bus[1] = - IMC_UBOX_CPUBUSNO_1(busno); - imc->imc_sockets[nsock].isock_bus[2] = - IMC_UBOX_CPUBUSNO_2(busno); - } else { - imc->imc_sockets[nsock].isock_bus[0] = - IMC_UBOX_CPUBUSNO_0(busno); - imc->imc_sockets[nsock].isock_bus[1] = - IMC_UBOX_CPUBUSNO_1(busno); - imc->imc_sockets[nsock].isock_nbus = 2; - } - nsock++; - } - imc->imc_nsockets = nsock; - - return (B_TRUE); -} - -/* - * For a given stub that we've found, map it to its corresponding socket based - * on the PCI bus that it has. 
- */ -static imc_socket_t * -imc_map_find_socket(imc_t *imc, imc_stub_t *stub) -{ - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - uint_t bus; - - for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) { - if (imc->imc_sockets[i].isock_bus[bus] == - stub->istub_bus) { - return (&imc->imc_sockets[i]); - } - } - } - - return (NULL); -} - -static boolean_t -imc_map_stubs(imc_t *imc) -{ - imc_stub_t *stub; - - if (!imc_map_buses(imc)) { - return (B_FALSE); - } - - stub = avl_first(&imc->imc_stubs); - for (stub = avl_first(&imc->imc_stubs); stub != NULL; - stub = AVL_NEXT(&imc->imc_stubs, stub)) { - imc_socket_t *sock = imc_map_find_socket(imc, stub); - - if (sock == NULL) { - dev_err(imc->imc_dip, CE_WARN, "found stub type %u " - "PCI%x,%x with bdf %u/%u/%u that does not match a " - "known PCI bus for any of %u sockets", - stub->istub_table->imcs_type, stub->istub_vid, - stub->istub_did, stub->istub_bus, stub->istub_dev, - stub->istub_func, imc->imc_nsockets); - continue; - } - - /* - * We don't have to worry about duplicates here. We check to - * make sure that we have unique bdfs here. - */ - switch (stub->istub_table->imcs_type) { - case IMC_TYPE_MC0_M2M: - sock->isock_imcs[0].icn_m2m = stub; - break; - case IMC_TYPE_MC1_M2M: - sock->isock_imcs[1].icn_m2m = stub; - break; - case IMC_TYPE_MC0_MAIN0: - sock->isock_nimc++; - sock->isock_imcs[0].icn_main0 = stub; - - /* - * On Skylake, the MAIN0 does double duty as channel - * zero and as the TAD. - */ - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - sock->isock_imcs[0].icn_nchannels++; - sock->isock_imcs[0].icn_channels[0].ich_desc = - stub; - sock->isock_tad[0].itad_stub = stub; - sock->isock_ntad++; - } - break; - case IMC_TYPE_MC0_MAIN1: - sock->isock_imcs[0].icn_main1 = stub; - break; - case IMC_TYPE_MC1_MAIN0: - sock->isock_nimc++; - sock->isock_imcs[1].icn_main0 = stub; - - /* - * On Skylake, the MAIN0 does double duty as channel - * zero and as the TAD. 
- */ - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - sock->isock_imcs[1].icn_nchannels++; - sock->isock_imcs[1].icn_channels[0].ich_desc = - stub; - sock->isock_tad[1].itad_stub = stub; - sock->isock_ntad++; - } - break; - case IMC_TYPE_MC1_MAIN1: - sock->isock_imcs[1].icn_main1 = stub; - break; - case IMC_TYPE_MC0_CHANNEL0: - sock->isock_imcs[0].icn_nchannels++; - sock->isock_imcs[0].icn_channels[0].ich_desc = stub; - break; - case IMC_TYPE_MC0_CHANNEL1: - sock->isock_imcs[0].icn_nchannels++; - sock->isock_imcs[0].icn_channels[1].ich_desc = stub; - break; - case IMC_TYPE_MC0_CHANNEL2: - sock->isock_imcs[0].icn_nchannels++; - sock->isock_imcs[0].icn_channels[2].ich_desc = stub; - break; - case IMC_TYPE_MC0_CHANNEL3: - sock->isock_imcs[0].icn_nchannels++; - sock->isock_imcs[0].icn_channels[3].ich_desc = stub; - break; - case IMC_TYPE_MC1_CHANNEL0: - sock->isock_imcs[1].icn_nchannels++; - sock->isock_imcs[1].icn_channels[0].ich_desc = stub; - break; - case IMC_TYPE_MC1_CHANNEL1: - sock->isock_imcs[1].icn_nchannels++; - sock->isock_imcs[1].icn_channels[1].ich_desc = stub; - break; - case IMC_TYPE_MC1_CHANNEL2: - sock->isock_imcs[1].icn_nchannels++; - sock->isock_imcs[1].icn_channels[2].ich_desc = stub; - break; - case IMC_TYPE_MC1_CHANNEL3: - sock->isock_imcs[1].icn_nchannels++; - sock->isock_imcs[1].icn_channels[3].ich_desc = stub; - break; - case IMC_TYPE_SAD_DRAM: - sock->isock_sad.isad_dram = stub; - break; - case IMC_TYPE_SAD_MMIO: - sock->isock_sad.isad_mmio = stub; - break; - case IMC_TYPE_SAD_MISC: - sock->isock_sad.isad_tolh = stub; - break; - case IMC_TYPE_VTD_MISC: - /* - * Some systems have multiple VT-D Misc. entry points - * in the system. In this case, only use the first one - * we find. 
- */ - if (imc->imc_gvtd_misc == NULL) { - imc->imc_gvtd_misc = stub; - } - break; - case IMC_TYPE_SAD_MCROUTE: - ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE); - imc_mcroute_check(imc, &sock->isock_sad, stub); - break; - case IMC_TYPE_UBOX: - sock->isock_ubox = stub; - break; - case IMC_TYPE_HA0: - sock->isock_ntad++; - sock->isock_tad[0].itad_stub = stub; - break; - case IMC_TYPE_HA1: - sock->isock_ntad++; - sock->isock_tad[1].itad_stub = stub; - break; - case IMC_TYPE_UBOX_CPUBUSNO: - sock->isock_cpubusno = stub; - break; - default: - /* - * Attempt to still attach if we can. - */ - dev_err(imc->imc_dip, CE_WARN, "Encountered unknown " - "IMC type (%u) on PCI %x,%x", - stub->istub_table->imcs_type, - stub->istub_vid, stub->istub_did); - break; - } - } - - return (B_TRUE); -} - -/* - * Go through and fix up various aspects of the stubs mappings on systems. The - * following are a list of what we need to fix up: - * - * 1. On Haswell and newer systems, there is only one global VT-d device. We - * need to go back and map that to all of the per-socket imc_sad_t entries. - */ -static void -imc_fixup_stubs(imc_t *imc) -{ - if (imc->imc_gen >= IMC_GEN_HASWELL) { - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh, - ==, NULL); - imc->imc_sockets[i].isock_sad.isad_tolh = - imc->imc_gvtd_misc; - } - } -} - -/* - * In the wild we've hit a few odd cases where not all devices are exposed that - * we might expect by firmware. In particular we've seen and validate the - * following cases: - * - * o We don't find all of the channel devices that we expect, e.g. we have the - * stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW - * with an E5-2630v3. 
- */ -static boolean_t -imc_validate_stubs(imc_t *imc) -{ - for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) { - imc_socket_t *socket = &imc->imc_sockets[sock]; - - for (uint_t mc = 0; mc < socket->isock_nimc; mc++) { - imc_mc_t *mcp = &socket->isock_imcs[mc]; - - for (uint_t chan = 0; chan < mcp->icn_nchannels; - chan++) { - if (mcp->icn_channels[chan].ich_desc == NULL) { - dev_err(imc->imc_dip, CE_WARN, - "!missing device for socket %u/" - "imc %u/channel %u", sock, mc, - chan); - return (B_FALSE); - } - } - } - } - - return (B_TRUE); -} - -/* - * Attempt to map all of the discovered sockets to the corresponding APIC based - * socket. We do these mappings by getting the node id of the socket and - * adjusting it to make sure that no home agent is present in it. We use the - * UBOX to avoid any home agent related bits that are present in other - * registers. - */ -static void -imc_map_sockets(imc_t *imc) -{ - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - uint32_t nodeid; - ddi_acc_handle_t h; - - h = imc->imc_sockets[i].isock_ubox->istub_cfgspace; - nodeid = pci_config_get32(h, - imc->imc_gen_data->igd_sad_nodeid_offset); - if (nodeid == PCI_EINVAL32) { - imc->imc_sockets[i].isock_valid |= - IMC_SOCKET_V_BAD_NODEID; - continue; - } - - imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid); - imc->imc_spointers[nodeid] = &imc->imc_sockets[i]; - } -} - -/* - * Decode the MTR, accounting for variances between processor generations. - */ -static void -imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr) -{ - uint8_t disable; - - /* - * Check present first, before worrying about anything else. 
- */ - if (imc->imc_gen < IMC_GEN_SKYLAKE && - IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) { - dimm->idimm_present = B_FALSE; - return; - } else if (imc->imc_gen >= IMC_GEN_SKYLAKE && - IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) { - dimm->idimm_present = B_FALSE; - return; - } - - dimm->idimm_present = B_TRUE; - dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE; - if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN || - dimm->idimm_ncolumns > IMC_MTR_CA_MAX) { - dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS; - } - - dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE; - if (dimm->idimm_nrows < IMC_MTR_RA_MIN || - dimm->idimm_nrows > IMC_MTR_RA_MAX) { - dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS; - } - - /* - * Determine Density, this information is not present on Sandy Bridge. - */ - switch (imc->imc_gen) { - case IMC_GEN_IVY: - dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr); - break; - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) { - case 0: - default: - dimm->idimm_density = 0; - dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; - break; - case 1: - dimm->idimm_density = 2; - break; - case 2: - dimm->idimm_density = 4; - break; - case 3: - dimm->idimm_density = 8; - break; - } - break; - case IMC_GEN_SKYLAKE: - switch (IMC_MTR_DENSITY_SKX(mtr)) { - case 0: - default: - dimm->idimm_density = 0; - dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; - break; - case 1: - dimm->idimm_density = 2; - break; - case 2: - dimm->idimm_density = 4; - break; - case 3: - dimm->idimm_density = 8; - break; - case 4: - dimm->idimm_density = 16; - break; - case 5: - dimm->idimm_density = 12; - break; - } - break; - case IMC_GEN_UNKNOWN: - case IMC_GEN_SANDY: - dimm->idimm_density = 0; - break; - } - - /* - * The values of width are the same on IVY->SKX, but the bits are - * different. This doesn't exist on SNB. 
- */ - if (imc->imc_gen > IMC_GEN_SANDY) { - uint8_t width; - - if (imc->imc_gen >= IMC_GEN_BROADWELL) { - width = IMC_MTR_WIDTH_BRD_SKX(mtr); - } else { - width = IMC_MTR_WIDTH_IVB_HAS(mtr); - } - switch (width) { - case 0: - dimm->idimm_width = 4; - break; - case 1: - dimm->idimm_width = 8; - break; - case 2: - dimm->idimm_width = 16; - break; - default: - dimm->idimm_width = 0; - dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH; - break; - } - } else { - dimm->idimm_width = 0; - } - - dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr); - switch (imc->imc_gen) { - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - case IMC_GEN_SKYLAKE: - if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) { - dimm->idimm_nranks = 0; - dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; - } - break; - default: - if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) { - dimm->idimm_nranks = 0; - dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; - } - } - - disable = IMC_MTR_RANK_DISABLE(mtr); - dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0; - dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0; - dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0; - dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0; - - /* - * Only Haswell and later have this information. - */ - if (imc->imc_gen >= IMC_GEN_HASWELL) { - dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0; - dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0; - dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr); - if (dimm->idimm_3dsranks != 0) { - dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks; - } - } - - - if (icn->icn_dimm_type == IMC_DIMM_DDR4) { - dimm->idimm_nbanks = 16; - } else { - dimm->idimm_nbanks = 8; - } - - /* - * To calculate the DIMM size we need first take the number of rows and - * columns. This gives us the number of slots per chip. In a given rank - * there are nbanks of these. There are nrank entries of those. Each of - * these slots can fit a byte. 
- */ - dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 * - (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows)); -} - -static void -imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan) -{ - uint_t i; - - /* - * There's one register for each DIMM that might be present, we always - * read that information to determine information about the DIMMs. - */ - chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms; - for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { - uint32_t mtr; - imc_dimm_t *dimm = &chan->ich_dimms[i]; - - bzero(dimm, sizeof (imc_dimm_t)); - mtr = pci_config_get32(chan->ich_desc->istub_cfgspace, - imc->imc_gen_data->igd_mtr_offsets[i]); - dimm->idimm_mtr = mtr; - /* - * We don't really expect to get a bad PCIe read. However, if we - * do, treat that for the moment as though the DIMM is bad. - */ - if (mtr == PCI_EINVAL32) { - dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ; - continue; - } - - imc_decode_mtr(imc, icn, dimm, mtr); - } -} - -static boolean_t -imc_fill_controller(imc_t *imc, imc_mc_t *icn) -{ - uint32_t mcmtr; - - mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace, - imc->imc_gen_data->igd_mcmtr_offset); - if (mcmtr == PCI_EINVAL32) { - icn->icn_invalid = B_TRUE; - return (B_FALSE); - } - - icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0; - if (imc->imc_gen < IMC_GEN_SKYLAKE) { - icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0; - } else { - icn->icn_lockstep = B_FALSE; - } - - icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0; - - /* - * SNB and IVB only support DDR3. Haswell and Broadwell may support - * DDR4, depends on the SKU. Skylake only supports DDR4. 
- */ - switch (imc->imc_gen) { - case IMC_GEN_SANDY: - case IMC_GEN_IVY: - icn->icn_dimm_type = IMC_DIMM_DDR3; - break; - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) { - icn->icn_dimm_type = IMC_DIMM_DDR4; - } else { - icn->icn_dimm_type = IMC_DIMM_DDR3; - } - break; - default: - /* - * Skylake and on are all DDR4. - */ - icn->icn_dimm_type = IMC_DIMM_DDR4; - break; - } - - if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) { - icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace, - imc->imc_gen_data->igd_topo_offset); - } - - return (B_TRUE); -} - -/* - * Walk the IMC data and fill in the information on DIMMs and the memory - * controller configurations. - */ -static void -imc_fill_data(imc_t *imc) -{ - uint_t csock, cmc, cchan; - - for (csock = 0; csock < imc->imc_nsockets; csock++) { - imc_socket_t *sock = &imc->imc_sockets[csock]; - - for (cmc = 0; cmc < sock->isock_nimc; cmc++) { - imc_mc_t *icn = &sock->isock_imcs[cmc]; - - if (!imc_fill_controller(imc, icn)) - continue; - - for (cchan = 0; cchan < icn->icn_nchannels; cchan++) { - imc_fill_dimms(imc, icn, - &icn->icn_channels[cchan]); - } - } - } -} - -static nvlist_t * -imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm) -{ - nvlist_t *nvl; - - nvl = fnvlist_alloc(); - fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT, - dimm->idimm_present); - if (!dimm->idimm_present) { - return (nvl); - } - - fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size); - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS, - dimm->idimm_ncolumns); - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS, - dimm->idimm_nrows); - - if (imc->imc_gen > IMC_GEN_SANDY) { - fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY, - dimm->idimm_density * (1ULL << 30)); - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH, - dimm->idimm_width); - } - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS, - dimm->idimm_nranks); - fnvlist_add_uint32(nvl, 
MCINTEL_NVLIST_V1_DIMM_BANKS, - dimm->idimm_nbanks); - fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS, - dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE); - - if (imc->imc_gen >= IMC_GEN_HASWELL) { - fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL, - dimm->idimm_hdrl); - fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP, - dimm->idimm_hdrl_parity); - if (dimm->idimm_3dsranks > 0) { - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK, - dimm->idimm_3dsranks); - } - } - - return (nvl); -} - -static nvlist_t * -imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan) -{ - nvlist_t *nvl; - nvlist_t *dimms[IMC_MAX_DIMMPERCHAN]; - uint_t i; - - nvl = fnvlist_alloc(); - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC, - imc->imc_gen_data->igd_max_dimms); - for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { - dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]); - } - - fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS, - dimms, i); - - for (; i > 0; i--) { - nvlist_free(dimms[i-1]); - } - - return (nvl); -} - -static nvlist_t * -imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn) -{ - nvlist_t *nvl; - nvlist_t *channels[IMC_MAX_CHANPERMC]; - uint_t i; - - nvl = fnvlist_alloc(); - fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels); - fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC, - icn->icn_ecc); - if (icn->icn_lockstep) { - fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE, - MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK); - } else { - fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE, - MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP); - - } - - if (icn->icn_closed) { - fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, - MCINTEL_NVLIST_V1_MC_POLICY_CLOSED); - } else { - fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, - MCINTEL_NVLIST_V1_MC_POLICY_OPEN); - } - - for (i = 0; i < icn->icn_nchannels; i++) { - channels[i] = imc_nvl_create_channel(imc, - &icn->icn_channels[i]); - } - 
fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS, - channels, icn->icn_nchannels); - for (i = 0; i < icn->icn_nchannels; i++) { - nvlist_free(channels[i]); - } - - return (nvl); -} - -static void -imc_nvl_pack(imc_socket_t *sock, boolean_t sleep) -{ - char *buf = NULL; - size_t len = 0; - int kmflag; - - if (sock->isock_nvl == NULL) - return; - - if (sock->isock_buf != NULL) - return; - - if (sleep) { - kmflag = KM_SLEEP; - } else { - kmflag = KM_NOSLEEP | KM_NORMALPRI; - } - - if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR, - kmflag) != 0) { - return; - } - - sock->isock_buf = buf; - sock->isock_buflen = len; - sock->isock_gen++; -} - -static void -imc_decoder_pack(imc_t *imc) -{ - char *buf = NULL; - size_t len = 0; - - if (imc->imc_decoder_buf != NULL) - return; - - if (imc->imc_decoder_dump == NULL) { - imc->imc_decoder_dump = imc_dump_decoder(imc); - } - - if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR, - KM_NOSLEEP | KM_NORMALPRI) != 0) { - return; - } - - imc->imc_decoder_buf = buf; - imc->imc_decoder_len = len; -} - -static void -imc_nvl_create(imc_t *imc) -{ - uint_t csock; - for (csock = 0; csock < imc->imc_nsockets; csock++) { - uint_t i; - nvlist_t *nvl; - nvlist_t *mcs[IMC_MAX_IMCPERSOCK]; - imc_socket_t *sock = &imc->imc_sockets[csock]; - - nvl = fnvlist_alloc(); - fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR, - MCINTEL_NVLIST_VERS1); - fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC, - sock->isock_nimc); - - for (i = 0; i < sock->isock_nimc; i++) { - mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]); - } - - fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS, - mcs, sock->isock_nimc); - - for (i = 0; i < sock->isock_nimc; i++) { - nvlist_free(mcs[i]); - } - - sock->isock_nvl = nvl; - imc_nvl_pack(sock, B_TRUE); - } -} - -/* - * Determine the top of low and high memory. These determine whether transaction - * addresses target main memory or not. 
Unfortunately, the way that these are - * stored and fetched changes with different generations. - */ -static void -imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad) -{ - uint32_t tolm, tohm_low, tohm_hi; - - tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace, - imc->imc_gen_data->igd_tolm_offset); - tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace, - imc->imc_gen_data->igd_tohm_low_offset); - if (imc->imc_gen_data->igd_tohm_hi_offset != 0) { - tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace, - imc->imc_gen_data->igd_tohm_hi_offset); - } else { - tohm_hi = 0; - } - - if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 || - tohm_hi == PCI_EINVAL32) { - sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; - return; - } - - switch (imc->imc_gen) { - case IMC_GEN_SANDY: - case IMC_GEN_IVY: - sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) << - IMC_TOLM_SNB_IVY_SHIFT; - sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) << - IMC_TOLM_SNB_IVY_SHIFT; - break; - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - case IMC_GEN_SKYLAKE: - sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK; - sad->isad_tohm = ((uint64_t)tohm_low & - IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32); - - /* - * Adjust the values to turn them into an exclusive range. 
- */ - sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL; - sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL; - break; - default: - dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " - "set to unknown generation: %u", imc->imc_gen); - return; - } -} - -static void -imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule, - uint32_t raw) -{ - uint_t attr; - uint64_t limit; - bzero(rule, sizeof (imc_sad_rule_t)); - - rule->isr_raw_dram = raw; - rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0; - if (imc->imc_gen < IMC_GEN_SKYLAKE) { - switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) { - case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6: - rule->isr_imode = IMC_SAD_IMODE_8t6; - break; - case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR: - rule->isr_imode = IMC_SAD_IMODE_8t6XOR; - break; - } - } else { - switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) { - case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6: - rule->isr_imode = IMC_SAD_IMODE_8t6; - break; - case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8: - rule->isr_imode = IMC_SAD_IMODE_10t8; - break; - case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12: - rule->isr_imode = IMC_SAD_IMODE_14t12; - break; - case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30: - rule->isr_imode = IMC_SAD_IMODE_32t30; - break; - } - } - - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - attr = IMC_SAD_DRAM_ATTR_SKX(raw); - } else { - attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw); - } - - switch (attr) { - case IMC_SAD_DRAM_ATTR_DRAM: - rule->isr_type = IMC_SAD_TYPE_DRAM; - break; - case IMC_SAD_DRAM_ATTR_MMCFG: - rule->isr_type = IMC_SAD_TYPE_MMCFG; - break; - case IMC_SAD_DRAM_ATTR_NXM: - if (imc->imc_gen < IMC_GEN_SKYLAKE) { - sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; - } - rule->isr_type = IMC_SAD_TYPE_NXM; - break; - default: - sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; - break; - } - - /* - * Fetch the limit which represents bits 45:26 and then adjust this so - * that it is exclusive. 
- */ - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - limit = IMC_SAD_DRAM_LIMIT_SKX(raw); - } else { - limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw); - } - rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) + - IMC_SAD_DRAM_LIMIT_EXCLUSIVE; - - /* - * The rest of this does not apply to Sandy Bridge. - */ - if (imc->imc_gen == IMC_GEN_SANDY) - return; - - if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) { - rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0; - return; - } - - switch (IMC_SAD_DRAM_MOD23_SKX(raw)) { - case IMC_SAD_DRAM_MOD23_MOD3: - rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3; - break; - case IMC_SAD_DRAM_MOD23_MOD2_C01: - rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01; - break; - case IMC_SAD_DRAM_MOD23_MOD2_C12: - rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12; - break; - case IMC_SAD_DRAM_MOD23_MOD2_C02: - rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02; - break; - } - - rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0; - switch (IMC_SAD_DRAM_MOD3_SKX(raw)) { - case IMC_SAD_DRAM_MOD3_MODE_45t6: - rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6; - break; - case IMC_SAD_DRAM_MOD3_MODE_45t8: - rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8; - break; - case IMC_SAD_DRAM_MOD3_MODE_45t12: - rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12; - break; - default: - sad->isad_valid |= IMC_SAD_V_BAD_MOD3; - break; - } -} - -static void -imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw) -{ - uint_t i; - uint32_t mlen, mbase, skipbits, skipafter; - - rule->isr_raw_interleave = raw; - - /* - * Right now all architectures always have the maximum number of SAD - * interleave targets. - */ - rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE; - - /* - * Sandy Bridge has a gap in the interleave list due to the fact that it - * uses a smaller length. 
- */ - if (imc->imc_gen > IMC_GEN_SANDY) { - mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN; - mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK; - skipbits = skipafter = 0; - } else { - mlen = IMC_SAD_ILEAVE_SNB_LEN; - mbase = IMC_SAD_ILEAVE_SNB_MASK; - skipbits = 2; - skipafter = 4; - } - - for (i = 0; i < rule->isr_ntargets; i++) { - uint32_t mask, shift; - - shift = i * mlen; - if (i >= skipafter) - shift += skipbits; - mask = mbase << shift; - rule->isr_targets[i] = (raw & mask) >> shift; - } -} - -static void -imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad) -{ - uint_t i; - off_t off; - - sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules; - for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset; - i < sad->isad_nrules; i++, off += sizeof (uint64_t)) { - uint32_t dram, interleave; - imc_sad_rule_t *rule = &sad->isad_rules[i]; - - dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off); - interleave = pci_config_get32(sad->isad_dram->istub_cfgspace, - off + 4); - - if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) { - sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; - return; - } - - imc_sad_fill_rule(imc, sad, rule, dram); - imc_sad_fill_rule_interleave(imc, rule, interleave); - } -} - -static void -imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad) -{ - uint_t i; - imc_sad_mcroute_table_t *mc = &sad->isad_mcroute; - - if (imc->imc_gen < IMC_GEN_SKYLAKE) - return; - if (sad->isad_valid != 0) - return; - - mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES; - for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) { - uint_t chanoff, ringoff; - - ringoff = i * IMC_MC_ROUTE_RING_BITS; - chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET; - - mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >> - ringoff) & IMC_MC_ROUTE_RING_MASK; - mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >> - chanoff) & IMC_MC_ROUTE_CHAN_MASK; - } -} - -/* - * Initialize the SAD. To do this we have to do a few different things: - * - * 1. Determine where the top of low and high memory is. 
- * 2. Read and decode all of the rules for the SAD - * 3. On systems with a route table, decode the raw routes - * - * At this point in time, we treat TOLM and TOHM as a per-socket construct, even - * though it really should be global, this just makes life a bit simpler. - */ -static void -imc_decoder_init_sad(imc_t *imc) -{ - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad); - imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad); - imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad); - } -} - -static void -imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev, - imc_tad_rule_t *rule, uint32_t val) -{ - uint64_t limit; - - limit = IMC_TAD_LIMIT(val); - rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) + - IMC_TAD_LIMIT_EXCLUSIVE; - rule->itr_raw = val; - - switch (IMC_TAD_SOCK_WAY(val)) { - case IMC_TAD_SOCK_WAY_1: - rule->itr_sock_way = 1; - break; - case IMC_TAD_SOCK_WAY_2: - rule->itr_sock_way = 2; - break; - case IMC_TAD_SOCK_WAY_4: - rule->itr_sock_way = 4; - break; - case IMC_TAD_SOCK_WAY_8: - rule->itr_sock_way = 8; - break; - } - - rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1; - rule->itr_sock_gran = IMC_TAD_GRAN_64B; - rule->itr_chan_gran = IMC_TAD_GRAN_64B; - - /* - * Starting with Skylake the targets that are used are no longer part of - * the TAD. Those come from the IMC route table. 
- */ - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - rule->itr_ntargets = 0; - return; - } - - rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS; - rule->itr_targets[0] = IMC_TAD_TARG0(val); - rule->itr_targets[1] = IMC_TAD_TARG1(val); - rule->itr_targets[2] = IMC_TAD_TARG2(val); - rule->itr_targets[3] = IMC_TAD_TARG3(val); - - if (prev == NULL) { - rule->itr_base = 0; - } else { - rule->itr_base = prev->itr_limit + 1; - } -} - -static void -imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule, - uint32_t val) -{ - uint64_t base; - - rule->itr_raw_gran = val; - base = IMC_TAD_BASE_BASE(val); - rule->itr_base = base << IMC_TAD_BASE_SHIFT; - - switch (IMC_TAD_BASE_CHAN_GRAN(val)) { - case IMC_TAD_BASE_CHAN_GRAN_64B: - rule->itr_sock_gran = IMC_TAD_GRAN_64B; - break; - case IMC_TAD_BASE_CHAN_GRAN_256B: - rule->itr_sock_gran = IMC_TAD_GRAN_256B; - break; - case IMC_TAD_BASE_CHAN_GRAN_4KB: - rule->itr_sock_gran = IMC_TAD_GRAN_4KB; - break; - default: - tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN; - return; - } - - switch (IMC_TAD_BASE_SOCK_GRAN(val)) { - case IMC_TAD_BASE_SOCK_GRAN_64B: - rule->itr_sock_gran = IMC_TAD_GRAN_64B; - break; - case IMC_TAD_BASE_SOCK_GRAN_256B: - rule->itr_sock_gran = IMC_TAD_GRAN_256B; - break; - case IMC_TAD_BASE_SOCK_GRAN_4KB: - rule->itr_sock_gran = IMC_TAD_GRAN_4KB; - break; - case IMC_TAD_BASE_SOCK_GRAN_1GB: - rule->itr_sock_gran = IMC_TAD_GRAN_1GB; - break; - } -} - -/* - * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's - * suggested that the channel wayness will take this into account and therefore - * should be accurately reflected. 
- */ -static void -imc_tad_read_rules(imc_t *imc, imc_tad_t *tad) -{ - uint_t i; - off_t baseoff; - imc_tad_rule_t *prev; - - tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules; - for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset, - prev = NULL; i < tad->itad_nrules; - i++, baseoff += sizeof (uint32_t)) { - uint32_t val; - off_t off; - imc_tad_rule_t *rule = &tad->itad_rules[i]; - - /* - * On Skylake, the TAD rules are split among two registers. The - * latter set mimics what exists on pre-Skylake. - */ - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - off = baseoff + IMC_SKX_WAYNESS_OFFSET; - } else { - off = baseoff; - } - - val = pci_config_get32(tad->itad_stub->istub_cfgspace, off); - if (val == PCI_EINVAL32) { - tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; - return; - } - - imc_tad_fill_rule(imc, tad, prev, rule, val); - prev = rule; - if (imc->imc_gen < IMC_GEN_SKYLAKE) - continue; - - val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff); - if (val == PCI_EINVAL32) { - tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; - return; - } - - imc_tad_fill_skx(imc, tad, rule, val); - } -} - -/* - * Check for features which change how decoding works. - */ -static void -imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc) -{ - uint32_t val; - - /* - * Determine whether or not lockstep mode or mirroring are enabled. - * These change the behavior of how we're supposed to interpret channel - * wayness. Lockstep is available in the TAD's features. Mirroring is - * available on the IMC's features. This isn't present in Skylake+. On - * Skylake Mirorring is a property of the SAD rule and there is no - * lockstep. 
- */ - switch (imc->imc_gen) { - case IMC_GEN_SANDY: - case IMC_GEN_IVY: - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - val = pci_config_get32(tad->itad_stub->istub_cfgspace, - imc->imc_gen_data->igd_tad_sysdef); - if (val == PCI_EINVAL32) { - tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; - return; - } - if (IMC_TAD_SYSDEF_LOCKSTEP(val)) { - tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP; - } - - val = pci_config_get32(mc->icn_main1->istub_cfgspace, - imc->imc_gen_data->igd_mc_mirror); - if (val == PCI_EINVAL32) { - tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; - return; - } - if (IMC_MC_MIRROR_SNB_BRD(val)) { - tad->itad_flags |= IMC_TAD_FLAG_MIRROR; - } - break; - default: - break; - } - - /* - * Now, go through and look at values that'll change how we do the - * channel index and adddress calculation. These are only present - * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge - * and they don't exist on Skylake+. - */ - switch (imc->imc_gen) { - case IMC_GEN_IVY: - case IMC_GEN_HASWELL: - case IMC_GEN_BROADWELL: - val = pci_config_get32(tad->itad_stub->istub_cfgspace, - imc->imc_gen_data->igd_tad_sysdef2); - if (val == PCI_EINVAL32) { - tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; - return; - } - if (IMC_TAD_SYSDEF2_SHIFTUP(val)) { - tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT; - } - if (IMC_TAD_SYSDEF2_SHIFTUP(val)) { - tad->itad_flags |= IMC_TAD_FLAG_CHANHASH; - } - break; - default: - break; - } -} - -/* - * Read the IMC channel interleave records - */ -static void -imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan) -{ - uint_t i; - off_t off; - - chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules; - for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset; - i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) { - uint32_t val; - uint64_t offset; - - val = pci_config_get32(chan->ich_desc->istub_cfgspace, - off); - if (val == PCI_EINVAL32) { - chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; - return; - } - - if (imc->imc_gen >= 
IMC_GEN_SKYLAKE) { - offset = IMC_TADCHAN_OFFSET_SKX(val); - } else { - offset = IMC_TADCHAN_OFFSET_SNB_BRD(val); - } - - chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT; - chan->ich_tad_offsets_raw[i] = val; - } -} - -static void -imc_decoder_init_tad(imc_t *imc) -{ - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - uint_t j; - - for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) { - imc_tad_read_features(imc, - &imc->imc_sockets[i].isock_tad[j], - &imc->imc_sockets[i].isock_imcs[j]); - imc_tad_read_rules(imc, - &imc->imc_sockets[i].isock_tad[j]); - } - } - - for (i = 0; i < imc->imc_nsockets; i++) { - uint_t j; - imc_socket_t *sock = &imc->imc_sockets[i]; - - for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { - uint_t k; - imc_mc_t *mc = &sock->isock_imcs[j]; - - for (k = 0; k < mc->icn_nchannels; k++) { - imc_channel_t *chan = &mc->icn_channels[k]; - imc_tad_read_interleave(imc, chan); - } - } - } -} - -static void -imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan, - imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig) -{ - uint_t i; - off_t off, incr; - - /* - * Rank interleave offset registers come in two forms. Either they are - * contiguous for a given wayness, meaning that all of the entries for - * wayness zero are contiguous, or they are sparse, meaning that there - * is a bank for entry zero for all wayness, then entry one for all - * wayness, etc. 
- */ - if (contig) { - off = imc->imc_gen_data->igd_rir_ileave_offset + - (rirno * imc->imc_gen_data->igd_rir_nileaves * - sizeof (uint32_t)); - incr = sizeof (uint32_t); - } else { - off = imc->imc_gen_data->igd_rir_ileave_offset + - (rirno * sizeof (uint32_t)); - incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t); - } - for (i = 0; i < rank->irle_nentries; i++, off += incr) { - uint32_t val; - uint64_t offset; - imc_rank_ileave_entry_t *ent = &rank->irle_entries[i]; - - val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); - if (val == PCI_EINVAL32) { - chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; - return; - } - - switch (imc->imc_gen) { - case IMC_GEN_BROADWELL: - ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val); - break; - default: - ent->irle_target = IMC_RIR_OFFSET_TARGET(val); - break; - } - if (imc->imc_gen >= IMC_GEN_HASWELL) { - offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val); - } else { - offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val); - } - ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT; - } -} - -static void -imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan) -{ - uint_t i; - off_t off; - - chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways; - for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset; - i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) { - uint32_t val; - uint64_t lim; - imc_rank_ileave_t *ent = &chan->ich_rankileaves[i]; - - val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); - if (val == PCI_EINVAL32) { - chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; - return; - } - - ent->irle_raw = val; - ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0; - ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val); - ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val); - if (imc->imc_gen >= IMC_GEN_HASWELL) { - lim = IMC_RIR_LIMIT_HAS_SKX(val); - } else { - lim = IMC_RIR_LIMIT_SNB_IVB(val); - } - - ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) + - IMC_RIR_LIMIT_EXCLUSIVE; - - ent->irle_nentries = 
imc->imc_gen_data->igd_rir_nileaves; - if (imc->imc_gen >= IMC_GEN_SKYLAKE) { - imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE); - } else { - imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE); - } - } -} - -static void -imc_decoder_init_rir(imc_t *imc) -{ - uint_t i; - - for (i = 0; i < imc->imc_nsockets; i++) { - uint_t j; - imc_socket_t *sock = &imc->imc_sockets[i]; - - for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { - uint_t k; - imc_mc_t *mc = &sock->isock_imcs[j]; - - for (k = 0; k < mc->icn_nchannels; k++) { - imc_channel_t *chan = &mc->icn_channels[k]; - imc_rir_read_wayness(imc, chan); - } - } - } -} - -static cmi_errno_t -imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, - uint32_t synd, int syndtype, mc_unum_t *unump) -{ - imc_t *imc = arg; - uint_t i; - imc_decode_state_t dec; - - bzero(&dec, sizeof (dec)); - if (!imc_decode_pa(imc, pa, &dec)) { - switch (dec.ids_fail) { - case IMC_DECODE_F_LEGACY_RANGE: - case IMC_DECODE_F_OUTSIDE_DRAM: - return (CMIERR_MC_NOTDIMMADDR); - default: - return (CMIERR_MC_BADSTATE); - } - } - - unump->unum_board = 0; - /* - * The chip id needs to be in the order that the OS expects it, which - * may not be our order. 
- */ - for (i = 0; i < imc->imc_nsockets; i++) { - if (imc->imc_spointers[i] == dec.ids_socket) - break; - } - if (i == imc->imc_nsockets) { - return (CMIERR_MC_BADSTATE); - } - unump->unum_chip = i; - unump->unum_mc = dec.ids_tadid; - unump->unum_chan = dec.ids_channelid; - unump->unum_cs = dec.ids_dimmid; - unump->unum_rank = dec.ids_rankid; - unump->unum_offset = dec.ids_rankaddr; - for (i = 0; i < MC_UNUM_NDIMM; i++) { - unump->unum_dimms[i] = MC_INVALNUM; - } - - return (CMI_SUCCESS); -} - -static cmi_errno_t -imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa) -{ - return (CMIERR_UNKNOWN); -} - -static const cmi_mc_ops_t imc_mc_ops = { - .cmi_mc_patounum = imc_mc_patounum, - .cmi_mc_unumtopa = imc_mc_unumtopa -}; - -/* - * This is where we really finish attaching and become open for business. This - * occurs once we have all of the expected stubs attached. Here's where all of - * the real fun begins. - */ -static void -imc_attach_complete(void *arg) -{ - imc_t *imc = arg; - cmi_errno_t err; - - imc_set_gen_data(imc); - - /* - * On SKX and newer, we can fail to map PCI buses at this point due to - * bad PCIe reads. - */ - if (!imc_map_stubs(imc)) { - goto done; - } - - if (!imc_validate_stubs(imc)) { - imc->imc_flags |= IMC_F_VALIDATE_FAILED; - goto done; - } - - imc_fixup_stubs(imc); - imc_map_sockets(imc); - - if (!imc_create_minors(imc)) { - goto done; - } - - imc_fill_data(imc); - imc_nvl_create(imc); - - /* - * Gather additional information that we need so that we can properly - * initialize the memory decoder and encoder. - */ - imc_decoder_init_sad(imc); - imc_decoder_init_tad(imc); - imc_decoder_init_rir(imc); - - /* - * Register decoder functions. This may fail. If so, try and complain - * loudly, but stay active to allow other data to be useful. Register a - * global handle. 
- */ - if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) { - imc->imc_flags |= IMC_F_MCREG_FAILED; - dev_err(imc->imc_dip, CE_WARN, "failed to register memory " - "decoding operations: 0x%x", err); - } - -done: - mutex_enter(&imc->imc_lock); - imc->imc_flags &= IMC_F_ATTACH_DISPATCHED; - imc->imc_flags |= IMC_F_ATTACH_COMPLETE; - mutex_exit(&imc->imc_lock); -} - -static int -imc_stub_comparator(const void *l, const void *r) -{ - const imc_stub_t *sl = l, *sr = r; - if (sl->istub_bus > sr->istub_bus) - return (1); - if (sl->istub_bus < sr->istub_bus) - return (-1); - if (sl->istub_dev > sr->istub_dev) - return (1); - if (sl->istub_dev < sr->istub_dev) - return (-1); - if (sl->istub_func > sr->istub_func) - return (1); - if (sl->istub_func < sr->istub_func) - return (-1); - return (0); -} - -static int -imc_stub_scan_cb(dev_info_t *dip, void *arg) -{ - int vid, did; - const imc_stub_table_t *table; - imc_t *imc = arg; - int *regs; - uint_t i, nregs; - - if (dip == ddi_root_node()) { - return (DDI_WALK_CONTINUE); - } - - /* - * Get the dev info name. PCI devices will always be children of PCI - * devices today on x86. If we reach something that has a device name - * that's not PCI, then we can prune it's children. - */ - if (strncmp("pci", ddi_get_name(dip), 3) != 0) { - return (DDI_WALK_PRUNECHILD); - } - - /* - * Get the device and vendor ID and see if this is something the imc - * knows about or cares about. 
- */ - vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "vendor-id", PCI_EINVAL16); - did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "device-id", PCI_EINVAL16); - if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { - return (DDI_WALK_CONTINUE); - } - - if (vid != IMC_PCI_VENDOR_INTC) { - return (DDI_WALK_PRUNECHILD); - } - - if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "reg", ®s, &nregs) != DDI_PROP_SUCCESS) { - return (DDI_WALK_CONTINUE); - } - - if (nregs == 0) { - ddi_prop_free(regs); - return (DDI_WALK_CONTINUE); - } - - - table = NULL; - for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { - if (imc_stub_table[i].imcs_devid == did && - imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && - imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { - table = &imc_stub_table[i]; - break; - } - } - ddi_prop_free(regs); - - /* - * Not a match, not interesting. - */ - if (table == NULL) { - return (DDI_WALK_CONTINUE); - } - - mutex_enter(&imc->imc_lock); - imc->imc_nscanned++; - mutex_exit(&imc->imc_lock); - - return (DDI_WALK_CONTINUE); -} - -/* - * From here, go through and see how many of the devices that we know about. - */ -static void -imc_stub_scan(void *arg) -{ - imc_t *imc = arg; - boolean_t dispatch = B_FALSE; - - /* - * Zero out the scan results in case we've been detached and reattached. - */ - mutex_enter(&imc->imc_lock); - imc->imc_nscanned = 0; - mutex_exit(&imc->imc_lock); - - ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc); - - mutex_enter(&imc->imc_lock); - imc->imc_flags |= IMC_F_SCAN_COMPLETE; - imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED; - - /* - * If the scan found no nodes, then that means that we're on a hardware - * platform that we don't support. Therefore, there's no reason to do - * anything here. 
- */ - if (imc->imc_nscanned == 0) { - imc->imc_flags |= IMC_F_UNSUP_PLATFORM; - mutex_exit(&imc->imc_lock); - return; - } - - if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { - imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; - dispatch = B_TRUE; - } - - mutex_exit(&imc->imc_lock); - - if (dispatch) { - (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, - imc, DDI_SLEEP); - } -} - -/* - * By default, refuse to allow stubs to detach. - */ -int -imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - imc_stub_t *stub; - imc_t *imc = imc_data; - - mutex_enter(&imc->imc_lock); - - /* - * By default, we do not allow stubs to detach. However, if the driver - * has attached to devices on a platform it doesn't recognize or - * support or if the override flag has been set, then allow detach to - * proceed. - */ - if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 && - imc_allow_detach == 0) { - mutex_exit(&imc->imc_lock); - return (DDI_FAILURE); - } - - for (stub = avl_first(&imc->imc_stubs); stub != NULL; - stub = AVL_NEXT(&imc->imc_stubs, stub)) { - if (stub->istub_dip == dip) { - break; - } - } - - /* - * A device was attached to us that we somehow don't know about. Allow - * this to proceed. - */ - if (stub == NULL) { - mutex_exit(&imc->imc_lock); - return (DDI_SUCCESS); - } - - pci_config_teardown(&stub->istub_cfgspace); - avl_remove(&imc->imc_stubs, stub); - kmem_free(stub, sizeof (imc_stub_t)); - mutex_exit(&imc->imc_lock); - - return (DDI_SUCCESS); -} - -int -imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - imc_stub_t *stub, *lookup; - int did, vid, *regs; - uint_t i, nregs; - const imc_stub_table_t *table; - avl_index_t idx; - boolean_t dispatch = B_FALSE; - imc_t *imc = imc_data; - - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - /* - * We've been asked to attach a stub. First, determine if this is even a - * PCI device that we should care about. Then, append it to our global - * list and kick off the configuration task. 
Note that we do this - * configuration task in a taskq so that we don't interfere with the - * normal attach / detach path processing. - */ - if (strncmp("pci", ddi_get_name(dip), 3) != 0) { - return (DDI_FAILURE); - } - - /* - * Get the device and vendor ID and see if this is something the imc - * knows about or cares about. - */ - vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "vendor-id", PCI_EINVAL16); - did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "device-id", PCI_EINVAL16); - if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { - return (DDI_FAILURE); - } - - /* - * Only accept INTC parts on the imc driver. - */ - if (vid != IMC_PCI_VENDOR_INTC) { - return (DDI_FAILURE); - } - - if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "reg", ®s, &nregs) != DDI_PROP_SUCCESS) { - return (DDI_FAILURE); - } - - if (nregs == 0) { - ddi_prop_free(regs); - return (DDI_FAILURE); - } - - /* - * Determine if this matches a known device. - */ - table = NULL; - for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { - if (imc_stub_table[i].imcs_devid == did && - imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && - imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { - table = &imc_stub_table[i]; - break; - } - } - - if (i == ARRAY_SIZE(imc_stub_table)) { - ddi_prop_free(regs); - return (DDI_FAILURE); - } - - /* - * We've found something. Make sure the generation matches our current - * one. If it does, construct the entry and append it to the list. 
- */ - mutex_enter(&imc->imc_lock); - if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen != - table->imcs_gen) { - mutex_exit(&imc->imc_lock); - ddi_prop_free(regs); - dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) " - "that has different hardware generation (%u) from current " - "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen); - return (DDI_FAILURE); - } else { - imc->imc_gen = table->imcs_gen; - } - mutex_exit(&imc->imc_lock); - - stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP); - stub->istub_dip = dip; - stub->istub_vid = vid; - stub->istub_did = did; - stub->istub_bus = PCI_REG_BUS_G(regs[0]); - stub->istub_dev = PCI_REG_DEV_G(regs[0]); - stub->istub_func = PCI_REG_FUNC_G(regs[0]); - ddi_prop_free(regs); - stub->istub_table = table; - - if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) { - kmem_free(stub, sizeof (stub)); - dev_err(dip, CE_WARN, "Failed to set up PCI config space " - "for IMC stub device %s (%u/%u)", ddi_node_name(dip), - vid, did); - return (DDI_FAILURE); - } - - mutex_enter(&imc->imc_lock); - if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) { - dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate " - "bdf %u/%u/%u with %s (%u/%u), not attaching", - ddi_node_name(imc->imc_dip), vid, did, - stub->istub_bus, stub->istub_dev, stub->istub_func, - ddi_node_name(lookup->istub_dip), lookup->istub_vid, - lookup->istub_did); - mutex_exit(&imc->imc_lock); - pci_config_teardown(&stub->istub_cfgspace); - kmem_free(stub, sizeof (stub)); - - return (DDI_FAILURE); - } - avl_insert(&imc->imc_stubs, stub, idx); - - if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE && - avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { - imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; - dispatch = B_TRUE; - } - mutex_exit(&imc->imc_lock); - - if (dispatch) { - (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, - imc, DDI_SLEEP); - } - - return (DDI_SUCCESS); -} - -static int -imc_open(dev_t *devp, 
int flag, int otyp, cred_t *credp) -{ - imc_t *imc = imc_data; - - if ((flag & (FEXCL | FNDELAY)) != 0) - return (EINVAL); - - if (otyp != OTYP_CHR) - return (EINVAL); - - mutex_enter(&imc->imc_lock); - - if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) { - mutex_exit(&imc->imc_lock); - return (ENOTSUP); - } - - /* - * It's possible that someone has come in during the window between when - * we've created the minor node and when we've finished doing work. - */ - if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) { - mutex_exit(&imc->imc_lock); - return (EAGAIN); - } - - /* - * It's not clear how someone would get a minor that we didn't create. - * But be paranoid and make sure. - */ - if (getminor(*devp) >= imc->imc_nsockets) { - mutex_exit(&imc->imc_lock); - return (EINVAL); - } - - /* - * Make sure this socket entry has been filled in. - */ - if (imc->imc_spointers[getminor(*devp)] == NULL) { - mutex_exit(&imc->imc_lock); - return (EINVAL); - } - - mutex_exit(&imc->imc_lock); - - return (0); -} - -static void -imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode) -{ - imc_decode_state_t dec; - uint_t i; - - bzero(&dec, sizeof (dec)); - if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) { - encode->mcei_err = (uint32_t)dec.ids_fail; - encode->mcei_errdata = dec.ids_fail_data; - return; - } - - encode->mcei_errdata = 0; - encode->mcei_err = 0; - encode->mcei_board = 0; - for (i = 0; i < imc->imc_nsockets; i++) { - if (imc->imc_spointers[i] == dec.ids_socket) - break; - } - encode->mcei_chip = i; - encode->mcei_mc = dec.ids_tadid; - encode->mcei_chan = dec.ids_channelid; - encode->mcei_dimm = dec.ids_dimmid; - encode->mcei_rank_addr = dec.ids_rankaddr; - encode->mcei_rank = dec.ids_rankid; - encode->mcei_row = UINT32_MAX; - encode->mcei_column = UINT32_MAX; - encode->mcei_pad = 0; -} - -static int -imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, - int *rvalp) -{ - int ret; - minor_t m; - mc_snapshot_info_t info; - mc_encode_ioc_t encode; - imc_t 
*imc = imc_data; - imc_socket_t *sock; - - mutex_enter(&imc->imc_lock); - m = getminor(dev); - if (m >= imc->imc_nsockets) { - ret = EINVAL; - goto done; - } - sock = imc->imc_spointers[m]; - if (sock == NULL) { - ret = EINVAL; - goto done; - } - - /* - * Note, other memory controller drivers don't check mode for reading - * data nor do they care who can read it from a credential perspective. - * As such we don't either at this time. - */ - switch (cmd) { - case MC_IOC_SNAPSHOT_INFO: - imc_nvl_pack(sock, B_FALSE); - if (sock->isock_buf == NULL) { - ret = EIO; - break; - } - - info.mcs_size = sock->isock_buflen; - info.mcs_gen = sock->isock_gen; - - if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { - ret = EFAULT; - break; - } - - ret = 0; - break; - case MC_IOC_SNAPSHOT: - imc_nvl_pack(sock, B_FALSE); - if (sock->isock_buf == NULL) { - ret = EIO; - break; - } - - if (ddi_copyout(sock->isock_buf, (void *)arg, - sock->isock_buflen, mode) != 0) { - ret = EFAULT; - break; - } - - ret = 0; - break; - case MC_IOC_DECODE_SNAPSHOT_INFO: - imc_decoder_pack(imc); - if (imc->imc_decoder_buf == NULL) { - ret = EIO; - break; - } - - info.mcs_size = imc->imc_decoder_len; - info.mcs_gen = imc->imc_spointers[0]->isock_gen; - - if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { - ret = EFAULT; - break; - } - - ret = 0; - break; - case MC_IOC_DECODE_SNAPSHOT: - imc_decoder_pack(imc); - if (imc->imc_decoder_buf == NULL) { - ret = EIO; - break; - } - - if (ddi_copyout(imc->imc_decoder_buf, (void *)arg, - imc->imc_decoder_len, mode) != 0) { - ret = EFAULT; - break; - } - - ret = 0; - break; - case MC_IOC_DECODE_PA: - if (crgetzoneid(credp) != GLOBAL_ZONEID || - drv_priv(credp) != 0) { - ret = EPERM; - break; - } - - if (ddi_copyin((void *)arg, &encode, sizeof (encode), - mode & FKIOCTL) != 0) { - ret = EPERM; - break; - } - - imc_ioctl_decode(imc, &encode); - ret = 0; - - if (ddi_copyout(&encode, (void *)arg, sizeof (encode), - mode & FKIOCTL) != 0) { - ret 
= EPERM; - break; - } - break; - default: - ret = EINVAL; - goto done; - } - -done: - mutex_exit(&imc->imc_lock); - return (ret); -} - -static int -imc_close(dev_t dev, int flag, int otyp, cred_t *credp) -{ - return (0); -} - -static int -imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - if (cmd != DDI_ATTACH) { - return (DDI_FAILURE); - } - - if (imc_data == NULL || imc_data->imc_dip != NULL) { - return (DDI_FAILURE); - } - - mutex_enter(&imc_data->imc_lock); - if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1, - TASKQ_DEFAULTPRI, 0)) == NULL) { - mutex_exit(&imc_data->imc_lock); - return (DDI_FAILURE); - } - - imc_data->imc_dip = dip; - imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED; - mutex_exit(&imc_data->imc_lock); - - (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data, - DDI_SLEEP); - - return (DDI_SUCCESS); -} - -/* - * We only export a single instance. - */ -static int -imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) -{ - /* - * getinfo(9E) shouldn't be called if we're not attached. But be - * paranoid. - */ - if (imc_data == NULL || imc_data->imc_dip == NULL) { - return (DDI_FAILURE); - } - - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *resultp = imc_data->imc_dip; - break; - case DDI_INFO_DEVT2INSTANCE: - *resultp = (void *)0; - break; - default: - return (DDI_FAILURE); - } - - return (DDI_SUCCESS); -} - -static int -imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - if (cmd != DDI_DETACH) { - return (DDI_FAILURE); - } - - if (imc_data == NULL || imc_data->imc_dip) { - return (DDI_FAILURE); - } - - mutex_enter(&imc_data->imc_lock); - - /* - * While a scan or attach is outstanding, don't allow us to detach. 
- */ - if ((imc_data->imc_flags & - (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) { - mutex_exit(&imc_data->imc_lock); - return (DDI_FAILURE); - } - - /* - * Because the stub driver depends on the imc driver, we shouldn't be - * able to have any entries in this list when we detach. However, we - * check just to make sure. - */ - if (!avl_is_empty(&imc_data->imc_stubs)) { - mutex_exit(&imc_data->imc_lock); - return (DDI_FAILURE); - } - - nvlist_free(imc_data->imc_decoder_dump); - imc_data->imc_decoder_dump = NULL; - if (imc_data->imc_decoder_buf != NULL) { - kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len); - imc_data->imc_decoder_buf = NULL; - imc_data->imc_decoder_len = 0; - } - - ddi_remove_minor_node(imc_data->imc_dip, NULL); - imc_data->imc_dip = NULL; - mutex_exit(&imc_data->imc_lock); - - ddi_taskq_wait(imc_data->imc_taskq); - ddi_taskq_destroy(imc_data->imc_taskq); - imc_data->imc_taskq = NULL; - - return (DDI_SUCCESS); -} - -static void -imc_free(void) -{ - if (imc_data == NULL) { - return; - } - - VERIFY(avl_is_empty(&imc_data->imc_stubs)); - avl_destroy(&imc_data->imc_stubs); - mutex_destroy(&imc_data->imc_lock); - kmem_free(imc_data, sizeof (imc_t)); - imc_data = NULL; -} - -static void -imc_alloc(void) -{ - imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP); - - mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL); - avl_create(&imc_data->imc_stubs, imc_stub_comparator, - sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link)); -} - -static struct cb_ops imc_cb_ops = { - .cb_open = imc_open, - .cb_close = imc_close, - .cb_strategy = nodev, - .cb_print = nodev, - .cb_dump = nodev, - .cb_read = nodev, - .cb_write = nodev, - .cb_ioctl = imc_ioctl, - .cb_devmap = nodev, - .cb_mmap = nodev, - .cb_segmap = nodev, - .cb_chpoll = nochpoll, - .cb_prop_op = ddi_prop_op, - .cb_flag = D_MP, - .cb_rev = CB_REV, - .cb_aread = nodev, - .cb_awrite = nodev -}; - -static struct dev_ops imc_dev_ops = { - .devo_rev = DEVO_REV, - .devo_refcnt = 
0, - .devo_getinfo = imc_getinfo, - .devo_identify = nulldev, - .devo_probe = nulldev, - .devo_attach = imc_attach, - .devo_detach = imc_detach, - .devo_reset = nodev, - .devo_cb_ops = &imc_cb_ops, - .devo_quiesce = ddi_quiesce_not_needed -}; - -static struct modldrv imc_modldrv = { - .drv_modops = &mod_driverops, - .drv_linkinfo = "Intel Integrated Memory Controller Driver", - .drv_dev_ops = &imc_dev_ops -}; - -static struct modlinkage imc_modlinkage = { - .ml_rev = MODREV_1, - .ml_linkage = { &imc_modldrv, NULL } -}; - -int -_init(void) -{ - int ret; - - if ((ret = mod_install(&imc_modlinkage)) == 0) { - imc_alloc(); - } - - return (ret); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&imc_modlinkage, modinfop)); -} - -int -_fini(void) -{ - int ret; - - if ((ret = mod_remove(&imc_modlinkage)) == 0) { - imc_free(); - } - return (ret); -} diff --git a/usr/src/uts/i86pc/io/imc/imc.conf b/usr/src/uts/i86pc/io/imc/imc.conf deleted file mode 100644 index 7f55dc2cae..0000000000 --- a/usr/src/uts/i86pc/io/imc/imc.conf +++ /dev/null @@ -1,16 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2019 Joyent, Inc. -# - -name="imc" parent="pseudo" instance=0; diff --git a/usr/src/uts/i86pc/io/imc/imc.h b/usr/src/uts/i86pc/io/imc/imc.h deleted file mode 100644 index 5f3def4930..0000000000 --- a/usr/src/uts/i86pc/io/imc/imc.h +++ /dev/null @@ -1,941 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2019 Joyent, Inc. - */ - -#ifndef _INTEL_IMC_H -#define _INTEL_IMC_H - -#include -#include -#include -#include - -/* - * This header file contains the definitions used for the various generations of - * the Intel IMC driver. - */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The maximum number of sockets that the IMC driver supports. This is currently - * determined by the Purley platforms (Skylake) which support up to 8 CPUs. - */ -#define IMC_MAX_SOCKETS 8 - -/* - * The maximum number of memory controllers that exist per socket. Currently all - * supported platforms (Sandy Bridge -> Skylake) support at most two. - */ -#define IMC_MAX_IMCPERSOCK 2 - -/* - * The maximum number of channels that exist per IMC. Currently Skylake supports - * 3 per IMC. On certain configurations of Haswell/Broadwell, there is only a - * single IMC which supports all 4 channels. - */ -#define IMC_MAX_CHANPERMC 4 - -/* - * The maximum number of DIMMs that exist per channel. On Skylake this is two - * DIMMs. However, Sandy Bridge through Broadwell support three. - */ -#define IMC_MAX_DIMMPERCHAN 3 - -/* - * The maximum number of rank disable bits per DIMM. This is currently - * consistent across all generations that have these bits. - */ -#define IMC_MAX_RANK_DISABLE 4 - -/* - * The number of different PCI buses that we need to record for a given - * platform. Pre-Skylake there are only two that are required, one for the IIO - * and one for the non-IIO. On Skylake, more PCI buses are used. - */ -#define IMC_MAX_PCIBUSES 3 - -/* - * Macros to take apart the node id for a given processor. These assume that - * we're reading the nodeid from the UBox and not from the SAD control. 
- */ -#define IMC_NODEID_UBOX_MASK(x) ((x) & 0x7) - -/* - * On Ivy Bridge through Broadwell, the node id that is found in the SAD targets - * has the HA indicator as NodeID[2]. This means that the actual target node of - * the socket is NodeID[3] | NodeID[1:0]. - */ -#define IMC_NODEID_IVY_BRD_UPPER(x) BITX(x, 3, 3) -#define IMC_NODEID_IVY_BRD_LOWER(x) BITX(x, 1, 0) -#define IMC_NODEID_IVY_BRD_HA(x) BITX(x, 2, 2) - -/* - * Macros to take apart the MCMTR register bits that we care about. - */ -#define IMC_MCMTR_CLOSED_PAGE(x) BITX(x, 0, 0) -#define IMC_MCMTR_LOCKSTEP(x) BITX(x, 1, 1) -#define IMC_MCMTR_ECC_ENABLED(x) BITX(x, 2, 2) - -#define IMC_MCMTR_DDR4_HAS_BRD(x) BITX(x, 14, 14) - -/* - * Macros to take apart the dimmmtr_* registers in different generations. While - * there are similarities, these often end up different between generations and - * chips. These macros use a range of CPUs that they're valid for in the name. - * Macros with no suffix are valid for all currently supported CPUs. 
- */ - -#define IMC_REG_MC_MTR0 0x80 -#define IMC_REG_MC_MTR1 0x84 -#define IMC_REG_MC_MTR2 0x88 - -#define IMC_MTR_CA_WIDTH(x) BITX(x, 1, 0) -#define IMC_MTR_CA_BASE 10 -#define IMC_MTR_CA_MIN 10 -#define IMC_MTR_CA_MAX 12 - -#define IMC_MTR_RA_WIDTH(x) BITX(x, 4, 2) -#define IMC_MTR_RA_BASE 12 -#define IMC_MTR_RA_MIN 13 -#define IMC_MTR_RA_MAX 18 - -#define IMC_MTR_DENSITY_IVY_BRD(x) BITX(x, 6, 5) -#define IMC_MTR_DENSITY_SKX(x) BITX(x, 7, 5) - -#define IMC_MTR_WIDTH_IVB_HAS(x) BITX(x, 8, 7) -#define IMC_MTR_WIDTH_BRD_SKX(x) BITX(x, 9, 8) - -#define IMC_MTR_DDR_RANKS(x) BITX(x, 13, 12) -#define IMC_MTR_DDR_RANKS_MAX 4 -#define IMC_MTR_DDR_RANKS_MAX_HAS_SKX 8 - -#define IMC_MTR_PRESENT_SNB_BRD(x) BITX(x, 14, 14) -#define IMC_MTR_PRESENT_SKYLAKE(x) BITX(x, 15, 15) - -#define IMC_MTR_RANK_DISABLE(x) BITX(x, 19, 16) - -#define IMC_MTR_DDR4_ENABLE_HAS_BRD(x) BITX(x, 20, 20) -#define IMC_MTR_HDRL_HAS_SKX(x) BITX(x, 21, 21) -#define IMC_MTR_HDRL_PARITY_HAS_SKX(x) BITX(x, 22, 22) -#define IMC_MTR_3DSRANKS_HAS_SKX(x) BITX(x, 24, 23) - -/* - * Data for the RASENABLES register. - */ -#define IMC_MC_MIRROR_SNB_BRD(x) BITX(x, 0, 0) - -/* - * The maximum number of SAD rules that exist on all supported platforms. - */ -#define IMC_MAX_SAD_RULES 24 - -/* - * The maximum number of targets that can be interleaved in a sad rule. - */ -#define IMC_MAX_SAD_INTERLEAVE 8 - -/* - * The maximum number of route entries that exist in SAD. This is only used on - * SKX. - */ -#define IMC_MAX_SAD_MCROUTES 6 - -/* - * Definitions used to decode the MC Route table. Note that at this time this is - * very Skylake specific (as it's the only platform it's supported on). - */ -#define IMC_REG_SKX_SAD_MC_ROUTE_TABLE 0xb4 -#define IMC_MC_ROUTE_RING_BITS 3 -#define IMC_MC_ROUTE_RING_MASK 0x7 -#define IMC_MC_ROUTE_CHAN_BITS 2 -#define IMC_MC_ROUTE_CHAN_MASK 0x3 -#define IMC_MC_ROUTE_CHAN_OFFSET 18 - -/* - * Definitions to help decode TOLM (top of low memory) and TOHM (top of high - * memory). 
The way this is done varies based on generation. These regions are - * currently always 64-MByte aligned - * - * On Sandy Bridge and Ivy Bridge the low four bits of TOLM are bits 31:28. TOHM - * is a single register. Bits 20:0 map to bits 45:25. Both registers represent - * the upper limit (as in one higher than the max DRAM value). - * - * On Haswell through Skylake, TOLM is represented as a 32-bit quantity. No - * shifting is required. However, only bits 31:26 are present. TOHM is spread - * out among two registers. The lower 32-bits is masked in a similar fashion. In - * both cases, these registers represent an inclusive range where we don't care - * about other bits. To deal with this we'll increment the lowest bit we care - * about to make it an exclusive range. - * - * Based on the above, we have opted to make both ranges in the IMC driver - * normalized to an _exclusive_ value. - * - * Ivy Bridge has the values in both the CBo SAD and a VT-d section; however, we - * use the CBo SAD which is why it looks like Sandy Bridge and not Haswell. - */ - -#define IMC_TOLM_SNB_IVY_MASK 0xf -#define IMC_TOLM_SNB_IVY_SHIFT 28 -#define IMC_TOHM_SNB_IVY_MASK 0x1fffff -#define IMC_TOHM_SNB_IVY_SHIFT 25 - -#define IMC_TOLM_HAS_SKX_MASK 0xfc000000 -#define IMC_TOLM_HAS_SKY_EXCL (1 << 26) -#define IMC_TOHM_LOW_HAS_SKX_MASK 0xfc000000 -#define IMC_TOHM_HAS_SKY_EXCL (1 << 26) - -/* - * Definitions to decode SAD values. These are sometimes subtlety different - * across generations. 
- */ -#define IMC_SAD_DRAM_RULE_ENABLE(x) BITX(x, 0, 0) - -#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(x) BITX(x, 1, 1) -#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR 0 -#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6 1 - -#define IMC_SAD_DRAM_INTERLEAVE_SKX(x) BITX(x, 2, 1) -#define IMC_SAD_DRAM_INTERLEAVE_SKX_8t6 0 -#define IMC_SAD_DRAM_INTERLEAVE_SKX_10t8 1 -#define IMC_SAD_DRAM_INTERLEAVE_SKX_14t12 2 -#define IMC_SAD_DRAM_INTERLEAVE_SKX_32t30 3 - -#define IMC_SAD_DRAM_ATTR_SNB_BRD(x) BITX(x, 3, 2) -#define IMC_SAD_DRAM_ATTR_SKX(x) BITX(x, 4, 3) -#define IMC_SAD_DRAM_ATTR_DRAM 0 -#define IMC_SAD_DRAM_ATTR_MMCFG 1 -#define IMC_SAD_DRAM_ATTR_NXM 2 - -#define IMC_SAD_DRAM_MOD23_SKX(x) BITX(x, 6, 5) -#define IMC_SAD_DRAM_MOD23_MOD3 0 -#define IMC_SAD_DRAM_MOD23_MOD2_C01 1 -#define IMC_SAD_DRAM_MOD23_MOD2_C12 2 -#define IMC_SAD_DRAM_MOD23_MOD2_C02 3 - -#define IMC_SAD_DRAM_LIMIT_SNB_BRD(x) BITX(x, 25, 6) -#define IMC_SAD_DRAM_LIMIT_SKX(x) BITX(x, 26, 7) -#define IMC_SAD_DRAM_LIMIT_SHIFT 26 -#define IMC_SAD_DRAM_LIMIT_EXCLUSIVE (1 << IMC_SAD_DRAM_LIMIT_SHIFT) - -#define IMC_SAD_DRAM_A7_IVB_BRD(x) BITX(x, 26, 26) -#define IMC_SAD_DRAM_MOD3_SKX(x) BITX(x, 27, 27) -#define IMC_SAD_DRAM_MOD3_MODE_SKX(x) BITX(x, 31, 30) -#define IMC_SAD_DRAM_MOD3_MODE_45t6 0 -#define IMC_SAD_DRAM_MOD3_MODE_45t8 1 -#define IMC_SAD_DRAM_MOD3_MODE_45t12 2 - -#define IMC_SAD_ILEAVE_SNB_MASK 0x7 -#define IMC_SAD_ILEAVE_SNB_LEN 3 -#define IMC_SAD_ILEAVE_IVB_SKX_MASK 0xf -#define IMC_SAD_ILEAVE_IVB_SKX_LEN 4 - -/* - * The interleave targets on Skylake use the upper bit to indicate whether it is - * referring to a local memory controller or if it actually refers to another - * node that is far away. The maximum value includes the upper bit which is used - * to indicate whether it is remote or far. - */ -#define IMC_SAD_ILEAVE_SKX_LOCAL(x) BITX(x, 3, 3) -#define IMC_SAD_ILEAVE_SKX_TARGET(x) BITX(x, 2, 0) -#define IMC_SAD_ILEAVE_SKX_MAX 0xf - -/* - * Maximum number of TAD tables that we need to consider. 
On Sandy Bridge - * through Broadwell this is based on the number of home agents that are present - * in the system. On Sandy Bridge there is one, on others, there are up to two. - * On Skylake, there is one TAD per IMC. - */ -#define IMC_MAX_TAD 2 - -/* - * Maximum number of TAD rules on any of the supported processors. - */ -#define IMC_MAX_TAD_RULES 12 - -/* - * Maximum number of interleave targets. Note, this only applies to Sandy Bridge - * through Broadwell. Skylake gets this information in another form. - */ -#define IMC_MAX_TAD_TARGETS 4 - -/* - * Offset between the base TAD rule and the corresponding wayness rule on - * Skylake. - */ -#define IMC_SKX_WAYNESS_OFFSET 0x30 - -/* - * Various macros to decode the TAD rules. - */ -#define IMC_TAD_LIMIT(x) BITX(x, 31, 12) -#define IMC_TAD_LIMIT_SHIFT 26 -#define IMC_TAD_LIMIT_EXCLUSIVE (1 << IMC_TAD_LIMIT_SHIFT) - -#define IMC_TAD_SOCK_WAY(x) BITX(x, 11, 10) -#define IMC_TAD_SOCK_WAY_1 0 -#define IMC_TAD_SOCK_WAY_2 1 -#define IMC_TAD_SOCK_WAY_4 2 -#define IMC_TAD_SOCK_WAY_8 3 -#define IMC_TAD_CHAN_WAY(x) BITX(x, 9, 8) -#define IMC_TAD_TARG3(x) BITX(x, 7, 6) -#define IMC_TAD_TARG2(x) BITX(x, 5, 4) -#define IMC_TAD_TARG1(x) BITX(x, 3, 2) -#define IMC_TAD_TARG0(x) BITX(x, 1, 0) - -#define IMC_TAD_SNB_BRD_NTARGETS 4 - -/* - * These are registers specific to the Skylake and newer TAD BASE registers. 
- */ -#define IMC_TAD_BASE_BASE(x) BITX(x, 31, 12) -#define IMC_TAD_BASE_SHIFT 26 - -#define IMC_TAD_BASE_CHAN_GRAN(x) BITX(x, 7, 6) -#define IMC_TAD_BASE_CHAN_GRAN_64B 0 -#define IMC_TAD_BASE_CHAN_GRAN_256B 1 -#define IMC_TAD_BASE_CHAN_GRAN_4KB 2 - -#define IMC_TAD_BASE_SOCK_GRAN(x) BITX(x, 5, 4) -#define IMC_TAD_BASE_SOCK_GRAN_64B 0 -#define IMC_TAD_BASE_SOCK_GRAN_256B 1 -#define IMC_TAD_BASE_SOCK_GRAN_4KB 2 -#define IMC_TAD_BASE_SOCK_GRAN_1GB 3 - -#define IMC_TADCHAN_OFFSET_SNB_BRD(x) BITX(x, 25, 6) -#define IMC_TADCHAN_OFFSET_SKX(x) BITX(x, 23, 4) -#define IMC_TADCHAN_OFFSET_SHIFT 26 - -/* - * Macros to get at various TAD features. - */ -#define IMC_TAD_SYSDEF_LOCKSTEP(x) BITX(x, 7, 7) -#define IMC_TAD_SYSDEF2_SHIFTUP(x) BITX(x, 22, 22) -#define IMC_TAD_SYSDEF2_CHANHASH(x) BITX(x, 21, 21) - -/* - * Maximum number of different wayness entries that exist across the various IMC - * generations. Each wayness then has a maximum number of target entries. - */ -#define IMC_MAX_RANK_WAYS 5 -#define IMC_MAX_RANK_INTERLEAVES 8 - -/* - * Macros to take apart the rank interleave wayness and offset registers. - */ -#define IMC_RIR_WAYNESS_ENABLED(x) BITX(x, 31, 31) -#define IMC_RIR_WAYNESS_WAY(x) BITX(x, 29, 28) -#define IMC_RIR_LIMIT_HAS_SKX(x) BITX(x, 11, 1) -#define IMC_RIR_LIMIT_SNB_IVB(x) BITX(x, 10, 1) -#define IMC_RIR_LIMIT_SHIFT 29 -#define IMC_RIR_LIMIT_EXCLUSIVE (1 << IMC_RIR_LIMIT_SHIFT) - -/* - * Currently, everything other than Broadwell has the same value for the target - * offset. - */ -#define IMC_RIR_OFFSET_TARGET_BRD(x) BITX(x, 23, 20) -#define IMC_RIR_OFFSET_TARGET(x) BITX(x, 19, 16) -#define IMC_RIR_OFFSET_OFFSET_HAS_SKX(x) BITX(x, 15, 2) -#define IMC_RIR_OFFSET_OFFSET_SNB_IVB(x) BITX(x, 14, 2) -#define IMC_RIR_OFFSET_SHIFT 29 - -/* - * Definitions to cover manipulations of open and closed pages. - */ -#define IMC_PAGE_BITS_CLOSED 6 -#define IMC_PAGE_BITS_OPEN 13 - -/* - * Macros to decode and understand the CPUBUSNO registers in the UBOX_DECS. 
- */ -#define IMC_UBOX_CPUBUSNO_0(x) BITX(x, 7, 0) -#define IMC_UBOX_CPUBUSNO_1(x) BITX(x, 15, 8) -#define IMC_UBOX_CPUBUSNO_2(x) BITX(x, 23, 16) - -/* - * Hardware generations supported by the IMC driver. - */ -typedef enum { - IMC_GEN_UNKNOWN = 0, - IMC_GEN_SANDY, - IMC_GEN_IVY, - IMC_GEN_HASWELL, - IMC_GEN_BROADWELL, - /* - * IMC_GEN_SKYLAKE also covers Cascade Lake. The two are similar to the - * point of even having the same PCI IDs for all of the devices. The - * only difference in the cpuid signature between them is the stepping, - * hence we do not have a separate Cascade Lake target here, as it's - * really the same as Skylake. - */ - IMC_GEN_SKYLAKE -} imc_gen_t; - -/* - * Generation specific limits. - */ -typedef struct imc_gen_data { - uint_t igd_max_sockets; - uint_t igd_max_imcs; - uint_t igd_max_channels; - uint_t igd_max_dimms; - uint_t igd_max_ranks; - uint_t igd_mtr_offsets[IMC_MAX_DIMMPERCHAN]; - uint_t igd_mcmtr_offset; - uint_t igd_topo_offset; - uint_t igd_num_mcroutes; - uint_t igd_tolm_offset; - uint_t igd_tohm_low_offset; - uint_t igd_tohm_hi_offset; - uint_t igd_sad_dram_offset; - uint_t igd_sad_ndram_rules; - uint_t igd_sad_nodeid_offset; - uint_t igd_tad_nrules; - uint_t igd_tad_rule_offset; - uint_t igd_tad_chan_offset; - uint_t igd_tad_sysdef; - uint_t igd_tad_sysdef2; - uint_t igd_mc_mirror; - uint_t igd_rir_nways; - uint_t igd_rir_way_offset; - uint_t igd_rir_nileaves; - uint_t igd_rir_ileave_offset; - uint_t igd_ubox_cpubusno_offset; -} imc_gen_data_t; - -/* - * Different types of PCI devices that show up on the core that we may need to - * attach to. 
- */ -typedef enum { - IMC_TYPE_UNKNOWN = 0, - IMC_TYPE_MC0_M2M, /* SKX Only */ - IMC_TYPE_MC1_M2M, /* SKX Only */ - IMC_TYPE_MC0_MAIN0, - IMC_TYPE_MC0_MAIN1, - IMC_TYPE_MC1_MAIN0, - IMC_TYPE_MC1_MAIN1, - IMC_TYPE_MC0_CHANNEL0, - IMC_TYPE_MC0_CHANNEL1, - IMC_TYPE_MC0_CHANNEL2, - IMC_TYPE_MC0_CHANNEL3, - IMC_TYPE_MC1_CHANNEL0, - IMC_TYPE_MC1_CHANNEL1, - IMC_TYPE_MC1_CHANNEL2, - IMC_TYPE_MC1_CHANNEL3, - IMC_TYPE_SAD_DRAM, - IMC_TYPE_SAD_MMIO, - /* - * We want to note which device has the TOLM and TOHM registers. - * Unfortunately this is a rather complicated affair. On Sandy Bridge - * they are a part of the IMC_TYPE_SAD_MMIO. On Ivy Bridge, it's on its - * own dedicated device on the CBo. - * - * On Haswell onward, these move to the VT-D misc. registers. On Haswell - * and Broadwell, only one of these exist in the system. However, on - * Skylake these exist per socket. - */ - IMC_TYPE_SAD_MISC, - IMC_TYPE_VTD_MISC, - /* - * On SKX this exists on a per-core basis. It contains the memory - * controller routing table. - */ - IMC_TYPE_SAD_MCROUTE, - IMC_TYPE_UBOX, - IMC_TYPE_UBOX_CPUBUSNO, - IMC_TYPE_HA0, - IMC_TYPE_HA1, -} imc_type_t; - -/* - * Each entry in the stub table represents a device that we might attach to in a - * given generation. This is only defined in the kernel to make it easier to - * build the imc decoder in userland for testing. 
- */ -#ifdef _KERNEL -typedef struct imc_stub_table { - imc_gen_t imcs_gen; - imc_type_t imcs_type; - uint16_t imcs_devid; - uint16_t imcs_pcidev; - uint16_t imcs_pcifunc; - const char *imcs_desc; -} imc_stub_table_t; - -typedef struct imc_stub { - avl_node_t istub_link; - dev_info_t *istub_dip; - uint16_t istub_vid; - uint16_t istub_did; - uint16_t istub_bus; - uint16_t istub_dev; - uint16_t istub_func; - ddi_acc_handle_t istub_cfgspace; - const imc_stub_table_t *istub_table; -} imc_stub_t; -#else -typedef struct imc_stub { - void *istub_unused; -} imc_stub_t; -#endif /* _KERNEL */ - -typedef enum { - IMC_F_UNSUP_PLATFORM = (1 << 0), - IMC_F_SCAN_DISPATCHED = (1 << 1), - IMC_F_SCAN_COMPLETE = (1 << 2), - IMC_F_ATTACH_DISPATCHED = (1 << 3), - IMC_F_ATTACH_COMPLETE = (1 << 4), - IMC_F_MCREG_FAILED = (1 << 5), - IMC_F_VALIDATE_FAILED = (1 << 6) -} imc_flags_t; - -#define IMC_F_ALL_FLAGS (IMC_F_UNSUP_PLATFORM | IMC_F_SCAN_DISPATCHED | \ - IMC_F_SCAN_COMPLETE | IMC_F_ATTACH_DISPATCHED | IMC_F_ATTACH_COMPLETE | \ - IMC_F_MCREG_FAILED | IMC_F_VALIDATE_FAILED) - -typedef enum imc_dimm_type { - IMC_DIMM_UNKNOWN, - IMC_DIMM_DDR3, - IMC_DIMM_DDR4, - IMC_DIMM_NVDIMM -} imc_dimm_type_t; - -typedef enum imc_dimm_valid { - IMC_DIMM_V_VALID = 0, - IMC_DIMM_V_BAD_PCI_READ = (1 << 0), - IMC_DIMM_V_BAD_ROWS = (1 << 1), - IMC_DIMM_V_BAD_COLUMNS = (1 << 2), - IMC_DIMM_V_BAD_DENSITY = (1 << 3), - IMC_DIMM_V_BAD_WIDTH = (1 << 4), - IMC_DIMM_V_BAD_RANKS = (1 << 5) -} imc_dimm_valid_t; - -typedef struct imc_dimm { - imc_dimm_valid_t idimm_valid; - boolean_t idimm_present; - uint8_t idimm_3dsranks; - boolean_t idimm_hdrl_parity; - boolean_t idimm_hdrl; - boolean_t idimm_ranks_disabled[IMC_MAX_RANK_DISABLE]; - uint8_t idimm_nbanks; - uint8_t idimm_nranks; - uint8_t idimm_width; - uint8_t idimm_density; /* In GiB */ - uint8_t idimm_nrows; - uint8_t idimm_ncolumns; - /* Synthesized */ - uint64_t idimm_size; - /* Raw data */ - uint32_t idimm_mtr; -} imc_dimm_t; - -typedef struct 
imc_rank_ileave_entry { - uint8_t irle_target; - uint64_t irle_offset; -} imc_rank_ileave_entry_t; - -typedef struct imc_rank_ileave { - boolean_t irle_enabled; - uint32_t irle_raw; - uint8_t irle_nways; - uint8_t irle_nwaysbits; - uint64_t irle_limit; - uint_t irle_nentries; - imc_rank_ileave_entry_t irle_entries[IMC_MAX_RANK_INTERLEAVES]; -} imc_rank_ileave_t; - -typedef enum imc_channel_valid { - IMC_CHANNEL_V_VALID = 0, - IMC_CHANNEL_V_BAD_PCI_READ = 1 << 0, -} imc_channel_valid_t; - -typedef struct imc_channel { - imc_channel_valid_t ich_valid; - imc_stub_t *ich_desc; - uint_t ich_ndimms; - imc_dimm_t ich_dimms[IMC_MAX_DIMMPERCHAN]; - uint_t ich_ntad_offsets; - uint32_t ich_tad_offsets_raw[IMC_MAX_TAD_RULES]; - uint64_t ich_tad_offsets[IMC_MAX_TAD_RULES]; - uint_t ich_nrankileaves; - imc_rank_ileave_t ich_rankileaves[IMC_MAX_RANK_WAYS]; -} imc_channel_t; - -typedef struct imc_controller { - imc_stub_t *icn_main0; - imc_stub_t *icn_main1; - imc_stub_t *icn_m2m; - boolean_t icn_invalid; - imc_dimm_type_t icn_dimm_type; - boolean_t icn_ecc; - boolean_t icn_lockstep; - boolean_t icn_closed; - uint32_t icn_topo; - uint_t icn_nchannels; - imc_channel_t icn_channels[IMC_MAX_CHANPERMC]; -} imc_mc_t; - -typedef enum imc_sad_rule_type { - IMC_SAD_TYPE_DRAM, - IMC_SAD_TYPE_MMCFG, - IMC_SAD_TYPE_NXM -} imc_sad_rule_type_t; - -typedef enum imc_sad_rule_imode { - IMC_SAD_IMODE_8t6, - IMC_SAD_IMODE_8t6XOR, - IMC_SAD_IMODE_10t8, - IMC_SAD_IMODE_14t12, - IMC_SAD_IMODE_32t30 -} imc_sad_rule_imode_t; - -typedef enum imc_sad_rule_mod_mode { - IMC_SAD_MOD_MODE_NONE, - IMC_SAD_MOD_MODE_45t6, - IMC_SAD_MOD_MODE_45t8, - IMC_SAD_MOD_MODE_45t12 -} imc_sad_rule_mod_mode_t; - -typedef enum imc_sad_rule_mod_type { - IMC_SAD_MOD_TYPE_NONE, - IMC_SAD_MOD_TYPE_MOD3, - IMC_SAD_MOD_TYPE_MOD2_01, - IMC_SAD_MOD_TYPE_MOD2_12, - IMC_SAD_MOD_TYPE_MOD2_02 -} imc_sad_rule_mod_type_t; - -typedef struct imc_sad_mcroute_entry { - uint8_t ismce_imc; /* ID of the target IMC */ - uint8_t ismce_pchannel; /* 
ID of the target physical channel */ -} imc_sad_mcroute_entry_t; - -typedef struct imc_sad_mcroute_table { - uint32_t ismc_raw_mcroute; - uint_t ismc_nroutes; - imc_sad_mcroute_entry_t ismc_mcroutes[IMC_MAX_SAD_MCROUTES]; -} imc_sad_mcroute_table_t; - -/* - * This rule represents a single SAD entry. - */ -typedef struct imc_sad_rule { - uint32_t isr_raw_dram; - uint32_t isr_raw_interleave; - boolean_t isr_enable; - boolean_t isr_a7mode; - boolean_t isr_need_mod3; - uint64_t isr_limit; - imc_sad_rule_type_t isr_type; - imc_sad_rule_imode_t isr_imode; - imc_sad_rule_mod_mode_t isr_mod_mode; - imc_sad_rule_mod_type_t isr_mod_type; - uint_t isr_ntargets; - uint8_t isr_targets[IMC_MAX_SAD_INTERLEAVE]; -} imc_sad_rule_t; - -typedef enum imc_sad_flags { - IMC_SAD_MCROUTE_VALID = 1 << 0, -} imc_sad_flags_t; - -typedef enum imc_sad_valid { - IMC_SAD_V_VALID = 0, - IMC_SAD_V_BAD_PCI_READ = 1 << 0, - IMC_SAD_V_BAD_MCROUTE = 1 << 1, - IMC_SAD_V_BAD_DRAM_ATTR = 1 << 2, - IMC_SAD_V_BAD_MOD3 = 1 << 3, -} imc_sad_valid_t; - -typedef struct imc_sad { - imc_sad_flags_t isad_flags; - imc_sad_valid_t isad_valid; - imc_stub_t *isad_dram; - imc_stub_t *isad_mmio; - imc_stub_t *isad_tolh; - uint64_t isad_tolm; - uint64_t isad_tohm; - uint_t isad_nrules; - imc_sad_rule_t isad_rules[IMC_MAX_SAD_RULES]; - imc_sad_mcroute_table_t isad_mcroute; -} imc_sad_t; - -typedef enum imc_tad_gran { - IMC_TAD_GRAN_64B = 0, - IMC_TAD_GRAN_256B, - IMC_TAD_GRAN_4KB, - IMC_TAD_GRAN_1GB -} imc_tad_gran_t; - -typedef struct imc_tad_rule { - uint64_t itr_base; - uint64_t itr_limit; - uint32_t itr_raw; - uint32_t itr_raw_gran; - uint8_t itr_sock_way; - uint8_t itr_chan_way; - imc_tad_gran_t itr_sock_gran; - imc_tad_gran_t itr_chan_gran; - uint_t itr_ntargets; - uint8_t itr_targets[IMC_MAX_TAD_TARGETS]; -} imc_tad_rule_t; - -typedef enum imc_tad_valid { - IMC_TAD_V_VALID = 1 << 0, - IMC_TAD_V_BAD_PCI_READ = 1 << 1, - IMC_TAD_V_BAD_CHAN_GRAN = 1 << 2 -} imc_tad_valid_t; - -typedef enum imc_tad_flags { - 
IMC_TAD_FLAG_CHANSHIFT = 1 << 0, - IMC_TAD_FLAG_CHANHASH = 1 << 1, - IMC_TAD_FLAG_MIRROR = 1 << 2, - IMC_TAD_FLAG_LOCKSTEP = 1 << 3 -} imc_tad_flags_t; - -typedef struct imc_tad { - imc_tad_valid_t itad_valid; - imc_stub_t *itad_stub; - imc_tad_flags_t itad_flags; - uint_t itad_nrules; - imc_tad_rule_t itad_rules[IMC_MAX_TAD_RULES]; -} imc_tad_t; - -typedef enum imc_socket_valid { - IMC_SOCKET_V_VALID = 0, - IMC_SOCKET_V_BAD_NODEID = 1 << 0 -} imc_socket_valid_t; - -typedef struct imc_socket { - imc_socket_valid_t isock_valid; - uint_t isock_bus[IMC_MAX_PCIBUSES]; - uint_t isock_nbus; - uint_t isock_gen; - nvlist_t *isock_nvl; - char *isock_buf; - size_t isock_buflen; - imc_sad_t isock_sad; - uint_t isock_ntad; - imc_tad_t isock_tad[IMC_MAX_TAD]; - imc_stub_t *isock_ubox; - imc_stub_t *isock_cpubusno; - uint32_t isock_nodeid; - uint_t isock_nimc; - imc_mc_t isock_imcs[IMC_MAX_IMCPERSOCK]; -} imc_socket_t; - -typedef struct imc { - /* - * The initial members here are only used in the kernel. This is done to - * make it easier for us to be able to define a version of this to use - * in testing. - */ -#ifdef _KERNEL - dev_info_t *imc_dip; - kmutex_t imc_lock; - imc_flags_t imc_flags; - const imc_gen_data_t *imc_gen_data; - ddi_taskq_t *imc_taskq; - uint_t imc_nscanned; - avl_tree_t imc_stubs; - nvlist_t *imc_decoder_dump; - char *imc_decoder_buf; - size_t imc_decoder_len; -#endif /* _KERNEL */ - imc_gen_t imc_gen; - - /* - * Data about the memory in the system - */ - uint_t imc_nsockets; - imc_socket_t imc_sockets[IMC_MAX_SOCKETS]; - -#ifdef _KERNEL - /* - * The imc_sockets[] array is organized based on increasing PCI Bus ID. - * This array maps the socket id that user land thinks of back to the - * actual underlying socket in case hardware does not put them in order. - */ - imc_socket_t *imc_spointers[IMC_MAX_SOCKETS]; - - /* - * Store the IIO global VT-D misc. device. While there are sometimes - * multiple on the system, we only keep a single one around. 
- */ - imc_stub_t *imc_gvtd_misc; -#endif -} imc_t; - - -/* - * Decoder failure reasons - */ -typedef enum imc_decode_failure { - IMC_DECODE_F_NONE = 0, - /* - * Indicates that the memory address fell into a reserved legacy range. - * The legacy range index is stored in the failure data. - */ - IMC_DECODE_F_LEGACY_RANGE, - /* - * Indicates that we had bad socket data. The socket in question is - * noted in the failure data. - */ - IMC_DECODE_F_BAD_SOCKET, - /* - * Indicates that we had bad SAD data. The socket the SAD is associated - * with is noted in the failure data. - */ - IMC_DECODE_F_BAD_SAD, - /* - * Indicates that the address was not contained in conventional, low, - * or high memory. - */ - IMC_DECODE_F_OUTSIDE_DRAM, - /* - * Indicates that no valid SAD rule was found for the address. - */ - IMC_DECODE_F_NO_SAD_RULE, - /* - * Indicates that the SAD interleave target was beyond the valid index. - */ - IMC_DECODE_F_BAD_SAD_INTERLEAVE, - /* - * Indicates that the route suggested a remote processor we can't find. - */ - IMC_DECODE_F_BAD_REMOTE_MC_ROUTE, - /* - * Indicates that we ended up in a loop trying to find the right socket - * to use. - */ - IMC_DECODE_F_SAD_SEARCH_LOOP, - /* - * Indicates that we encountered a SAD rule that asked for inconsistent - * mod rules. - */ - IMC_DECODE_F_SAD_BAD_MOD, - /* - * Indicates that the socket or tad rule we found doesn't actually point - * to something that we know about. - */ - IMC_DECODE_F_SAD_BAD_SOCKET, - IMC_DECODE_F_SAD_BAD_TAD, - /* - * Indicates that we could not find a matching tad rule. - */ - IMC_DECODE_F_NO_TAD_RULE, - /* - * Indicates that we encountered the TAD channel 3-way interleave that - * we don't support. - */ - IMC_DECODE_F_TAD_3_ILEAVE, - /* - * Indicates that we had a bad target index. - */ - IMC_DECODE_F_TAD_BAD_TARGET_INDEX, - /* - * Indicates that we have a bad channel ID. 
- */ - IMC_DECODE_F_BAD_CHANNEL_ID, - /* - * Indicates that the TAD rule offset in the channel interleave was - * incorrect. - */ - IMC_DECODE_F_BAD_CHANNEL_TAD_OFFSET, - /* - * We couldn't find a valid rank interleave rule. - */ - IMC_DECODE_F_NO_RIR_RULE, - /* - * Indicates that the index of the rank interleaving target was bad. - */ - IMC_DECODE_F_BAD_RIR_ILEAVE_TARGET, - /* - * Indicates that the calculated DIMM represents an invalid DIMM that is - * beyond the number of supported DIMMS per channel on the platform. - */ - IMC_DECODE_F_BAD_DIMM_INDEX, - /* - * Indicates that the specified DIMM is not preset; however, it is a - * valid DIMM number. - */ - IMC_DECODE_F_DIMM_NOT_PRESENT, - /* - * Indicates that the specified rank on the DIMM is more than the number - * of ranks that the DIMM has. - */ - IMC_DECODE_F_BAD_DIMM_RANK, - /* - * Indicates that the channel offset is larger than the system address, - * meaning that we would end up with an underflow if we continued. The - * equivalent is true for the rank address. - */ - IMC_DECODE_F_CHANOFF_UNDERFLOW, - IMC_DECODE_F_RANKOFF_UNDERFLOW, -} imc_decode_failure_t; - -/* - * Decoder state tracking - */ -typedef struct imc_decode_state { - imc_decode_failure_t ids_fail; - uint64_t ids_fail_data; - uint64_t ids_pa; - uint64_t ids_chanaddr; - uint64_t ids_rankaddr; - uint32_t ids_nodeid; - uint32_t ids_tadid; - uint32_t ids_channelid; - uint32_t ids_physrankid; - uint32_t ids_dimmid; - uint32_t ids_rankid; - const imc_socket_t *ids_socket; - const imc_sad_t *ids_sad; - const imc_sad_rule_t *ids_sad_rule; - const imc_tad_t *ids_tad; - const imc_tad_rule_t *ids_tad_rule; - const imc_mc_t *ids_mc; - const imc_channel_t *ids_chan; - const imc_rank_ileave_t *ids_rir; - const imc_dimm_t *ids_dimm; -} imc_decode_state_t; - -#ifdef _KERNEL - -/* - * Functions needed for the stub drivers. 
- */ -extern int imc_attach_stub(dev_info_t *, ddi_attach_cmd_t); -extern int imc_detach_stub(dev_info_t *, ddi_detach_cmd_t); - -/* - * Decoder related functions - */ -extern void imc_decoder_init(imc_t *); - -extern nvlist_t *imc_dump_decoder(imc_t *); -#else /* !_KERNEL */ -extern boolean_t imc_restore_decoder(nvlist_t *, imc_t *); -#endif /* _KERNEL */ - -extern boolean_t imc_decode_pa(const imc_t *, uint64_t, imc_decode_state_t *); - - -#ifdef __cplusplus -} -#endif - -#endif /* _INTEL_IMC_H */ diff --git a/usr/src/uts/i86pc/io/imc/imcstub.c b/usr/src/uts/i86pc/io/imc/imcstub.c deleted file mode 100644 index ee020dd5c4..0000000000 --- a/usr/src/uts/i86pc/io/imc/imcstub.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2019 Joyent, Inc. - */ - -/* - * This is a stub driver that is used by the main imcstub driver to attach - * component PCI devices so that it can access their dev_info_t. 
- */ - -#include -#include -#include -#include -#include - -#include "imc.h" - - -static int -imcstub_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) -{ - return (imc_attach_stub(dip, cmd)); -} - -static int -imcstub_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) -{ - return (imc_detach_stub(dip, cmd)); -} - -static struct dev_ops imcstub_dev_ops = { - .devo_rev = DEVO_REV, - .devo_refcnt = 0, - .devo_getinfo = nodev, - .devo_identify = nodev, - .devo_probe = nulldev, - .devo_attach = imcstub_attach, - .devo_detach = imcstub_detach, - .devo_reset = nodev, - .devo_quiesce = ddi_quiesce_not_needed -}; - -static struct modldrv imcstub_modldrv = { - .drv_modops = &mod_driverops, - .drv_linkinfo = "IMC Stub driver", - .drv_dev_ops = &imcstub_dev_ops -}; - -static struct modlinkage imcstub_modlinkage = { - .ml_rev = MODREV_1, - .ml_linkage = { &imcstub_modldrv, NULL } -}; - -int -_init(void) -{ - return (mod_install(&imcstub_modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&imcstub_modlinkage, modinfop)); -} - -int -_fini(void) -{ - return (mod_remove(&imcstub_modlinkage)); -} diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index 336c25d739..2e7455a93a 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -351,3 +351,10 @@ AMDZEN_STUB_OBJS = amdzen_stub.o SMNTEMP_OBJS = smntemp.o USMN_OBJS = usmn.o ZEN_UDF_OBJS = zen_udf.o + +# +# Intel Integrated Memory Controller +# (Sandy Bridge - Cascade Lake) +# +IMC_OBJS = imc.o imc_decode.o imc_dump.o +IMCSTUB_OBJS = imcstub.o diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index 4d1d2664c3..3fafc22c66 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -738,3 +738,8 @@ DRV_KMODS += smntemp DRV_KMODS += amdzen DRV_KMODS += amdzen_stub DRV_KMODS += usmn zen_udf + +# +# Intel Integrated Memory Controller +# +DRV_KMODS += imc imcstub diff --git 
a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules index 1c6786c283..84ecfad278 100644 --- a/usr/src/uts/intel/Makefile.rules +++ b/usr/src/uts/intel/Makefile.rules @@ -157,6 +157,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/hotplug/pcicfg/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/imc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(SRC)/common/mc/imc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/ipmi/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/imc/Makefile b/usr/src/uts/intel/imc/Makefile new file mode 100644 index 0000000000..752b7b8544 --- /dev/null +++ b/usr/src/uts/intel/imc/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. 
+ +MODULE = imc +OBJECTS = $(IMC_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/imc + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(CONFMOD) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +CPPFLAGS += -I$(CONF_SRCDIR) +LDFLAGS += -dy + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/imcstub/Makefile b/usr/src/uts/intel/imcstub/Makefile new file mode 100644 index 0000000000..f7e38ce1e7 --- /dev/null +++ b/usr/src/uts/intel/imcstub/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. + +MODULE = imcstub +OBJECTS = $(IMCSTUB_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +LDFLAGS += -dy -Ndrv/imc + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/io/imc/imc.c b/usr/src/uts/intel/io/imc/imc.c new file mode 100644 index 0000000000..e1dbfbfc2e --- /dev/null +++ b/usr/src/uts/intel/io/imc/imc.c @@ -0,0 +1,3011 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Generic Intel Integrated Memory Controller (IMC) Driver + * + * This driver talks to the CPU's IMC to understand the detailed topology of the + * processor and to determine how to map between physical addresses to the + * corresponding DIMM. This driver supports the following generations of Intel + * chips: + * + * - Sandy Bridge + * - Ivy Bridge + * - Haswell + * - Broadwell + * - Skylake / Cascade Lake + * + * Memory Decoding + * --------------- + * + * For more detailed summaries of the memory decoding process, please refer to + * the Intel External Design Specifications for the corresponding processor. + * What follows is a rough overview of how the memory decoding system works. + * + * First, we'd like to define the following concepts: + * + * SYSTEM ADDRESS + * + * This is a physical address that the operating system normally uses. This + * address may refer to DRAM, it may refer to memory mapped PCI + * configuration space or device registers, or it may refer to other parts + * of the system's memory map, such as the extended advanced programmable + * interrupt controller (xAPIC), etc. + * + * DIMM + * + * Dual-inline memory module. This refers to a physical stick of volatile + * memory that is inserted into a slot on the motherboard. + * + * RANK + * + * A potential sub-division of a DIMM. A DIMM's memory capacity is divided + * into a number of equal sized ranks. For example, an 8 GiB DIMM, may have + * 1 8 GiB rank, 2 4 GiB ranks, or 4 2 GiB ranks. + * + * RANK ADDRESS + * + * An address that exists in the context of a given rank on a DIMM. 
All + * ranks have overlapping addresses, so the address 0x400 exists on all + * ranks on a given DIMM. + * + * CHANNEL + * + * Multiple DIMMs may be combined into a single channel. The channel + * represents the combined memory of all the DIMMs. A given channel only + * ever exists on a socket and is bound to a single memory controller. + * + * CHANNEL ADDRESS + * + * This is an address that exists logically on a channel. Each address on a + * channel maps to a corresponding DIMM that exists on that channel. The + * address space on one channel is independent from that on another. This + * means that address 0x1000 can exist on each memory channel in the + * system. + * + * INTERLEAVE + * + * There are several different cases where interleaving occurs on the + * system. For example, addresses may be interleaved across sockets, + * memory channels, or DIMM ranks. When addresses are interleaved, then + * some number of bits in an address are used to select which target to go + * to (usually through a look up table). The effect of interleaving is that + * addresses that are next to one another may not all go to the same + * device. The following image shows a non-interleaving case. + * + * 0x0fff +-----+ +-----+ 0x7ff + * | |\___________/| | + * | | __________ | (b) | + * | | / \| | + * 0x0800 |=====|= +-----+ 0x000 +-----+ 0x7ff + * | | \______________________________/| | + * | | _______________________________ | (a) | + * | |/ \| | + * 0x0000 +-----+ +-----+ 0x000 + * + * In this example of non-interleaving, addresses 0x0000 to 0x07ff go to + * device (a). While, addresses 0x0800 to 0x0fff, go to device (b). + * However, each range is divided into the same number of components. + * + * If instead, we were to look at that with interleaving, what we might say + * is that rather than splitting the range in half, we might say that if + * the address has bit 8 set (0x100), then it goes to (b), otherwise it + * goes to (a). 
This means that addresses 0x000 to 0x0ff, would go to (a). + * 0x100 to 0x1ff would go to (b). 0x200 to 0x2ff would go back to (a) + * again, and then 0x300 to 0x3ff would go back to (b). This would continue + * for a while. This would instead look something more like: + * + * + * 0x0fff +-----+ A: 0x7ff +---------+ B: 0x7ff +---------+ + * | (b) | | e00-eff | | f00-fff | + * 0x0f00 |-----| 0x700 +---------+ 0x700 +---------+ + * | (a) | | c00-cff | | d00-dff | + * 0x0e00 ~~~~~~~ 0x600 +---------+ 0x600 +---------+ + * *** | a00-aff | | b00-bff | + * 0x0400 ~~~~~~~ 0x500 +---------+ 0x500 +---------+ + * | (b) | | 800-8ff | | 900-9ff | + * 0x0300 |-----| 0x400 +---------+ 0x400 +---------+ + * | (a) | | 600-6ff | | 700-7ff | + * 0x0200 |-----| 0x300 +---------+ 0x300 +---------+ + * | (b) | | 400-4ff | | 500-5ff | + * 0x0100 |-----| 0x200 +---------+ 0x200 +---------+ + * | (a) | | 200-2ff | | 300-3ff | + * 0x0000 +-----+ 0x100 +---------+ 0x100 +---------+ + * | 000-0ff | | 100-1ff | + * 0x000 +---------+ 0x000 +---------+ + * + * In this example we've performed two-way interleaving. The number of ways + * that something can interleave varies based on what we're interleaving + * between. + * + * MEMORY CONTROLLER + * + * A given processor die (see uts/i86pc/os/cpuid.c) contains a number of + * memory controllers. Usually 1 or two. Each memory controller supports a + * given number of DIMMs, which are divided across multiple channels. + * + * TARGET ADDRESS DECODER + * + * The target address decoder (TAD) is responsible for taking a system + * address and transforming it into a channel address based on the rules + * that are present. Each memory controller has a corresponding TAD. The + * TAD is often contained in a device called a 'Home Agent'. + * + * SYSTEM ADDRESS DECODER + * + * The system address decoder (SAD) is responsible for taking a system + * address and directing it to the right place, whether this be memory or + * otherwise. 
There is a single memory controller per socket (see + * uts/i86pc/os/cpuid.c) that is shared between all the cores currently. + * + * NODE IDENTIFIER + * + * The node identifier is used to uniquely identify an element in the + * various routing topologies on the die (see uts/i86pc/os/cpuid.c for the + * definition of 'die'). One can roughly think about this as a unique + * identifier for the socket itself. In general, the primary node ID for a + * socket should map to the socket APIC ID. + * + * Finding Devices + * --------------- + * + * There is a bit of a chicken and egg problem on Intel systems and in the + * device driver interface. The information that we need in the system is spread + * out amongst a large number of different PCI devices that the processor + * exposes. The number of such devices can vary based on the processor + * generation and the specific SKU in the processor. To deal with this, we break + * the driver into two different components: a stub driver and the full driver. + * + * The stub driver has aliases for all known PCI devices that we might attach to + * in a given generation on the system. This driver is called 'imcstub'. When a + * stub attaches, it just registers itself with the main driver, upon which it + * has a module dependency. + * + * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it + * kicks off a scan of the device tree which takes place in a task queue. Once + * there, it determines the number of devices that it expects to exist by + * walking the tree and comparing it against the generation-specific table. + * + * If all devices are found, we'll go ahead and read through all the devices and + * build a map of all the information we need to understand the topology of the + * system and to be able to decode addresses. 
We do this here, because we can be + * asked to perform decoding in dangerous contexts (after taking an MCE, panic, + * etc) where we don't want to have to rely on the broader kernel functioning at + * this point in time. + * + * Once our topology is built, we'll create minor nodes which are used by the + * fault management architecture to query for information and register our + * decoding functionality with the kernel. + * + * PCI Numbering + * ------------- + * + * For each device that we care about, Intel defines the device and function + * that we can expect to find the information and PCI configuration space + * registers that we care about at. However, the PCI bus is not well defined. + * Devices that are on the same socket use the same set of bus numbers; however, + * some sockets have multiple device numbers that they'll use to represent + * different classes. These bus numbers are programmed by systems firmware as + * part of powering on the system. This means, that we need the ability to + * map together these disparate ranges ourselves. + * + * There is a device called a utility box (UBOX), which exists per-socket and + * maps the different sockets together. We use this to determine which devices + * correspond to which sockets. + * + * Mapping Sockets + * --------------- + * + * Another wrinkle is that the way that the OS sees the numbering of the CPUs is + * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more + * information). However, to map to the corresponding socket, we need to look at + * the socket's node ID. The order of PCI buses in the system is not required to + * have any relation to the socket ID. Therefore, we have to have yet another + * indirection table in the imc_t. + * + * Exposing Data + * ------------- + * + * We expose topology data to FMA using the OS-private memory controller + * interfaces. 
By creating minor nodes of the type, 'ddi_mem_ctrl', there are a + * number of specific interfaces that we can then implement. The ioctl API asks + * us for a snapshot of data, which basically has us go through and send an + * nvlist_t to userland. This nvlist_t is constructed as part of the scan + * process. This nvlist uses the version 1 format, which more explicitly encodes + * the topology in a series of nested nvlists. + * + * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the + * decoder and ask it to perform decoding. + * + * Decoding Addresses + * ------------------ + * + * The decoding logic can be found in common/imc/imc_decode.c. This file is + * shared between the kernel and userland to allow for easier testing and + * additional flexibility in operation. The decoding process happens in a few + * different phases. + * + * The first phase, is to determine which memory controller on which socket is + * responsible for this data. To determine this, we use the system address + * decoder and walk the rules, looking for the correct target. There are various + * manipulations to the address that exist which are used to determine which + * index we use. The way that we interpret the output of the rule varies + * somewhat based on the generation. Sandy Bridge just has a node ID which + * points us to the socket with its single IMC. On Ivy Bridge through Broadwell, + * the memory controller to use is also encoded in part of the node ID. Finally, + * on Skylake, the SAD tells us which socket to look at. The socket in question + * then has a routing table which tells us which channel on which memory + * controller that is local to that socket. + * + * Once we have the target memory controller, we walk the list of target address + * decoder rules. 
These rules can help tell us which channel we care about + * (which is required on Sandy Bridge through Broadwell) and then describe some + * amount of the interleaving rules which are used to turn the system address + * into a channel address. + * + * Once we know the channel and the channel address, we walk the rank interleave + * rules which help us determine which DIMM and the corresponding rank on it + * that the corresponding channel address is on. It also has logic that we need + * to use to determine how to transform a channel address into an address on + * that specific rank. Once we have that, then the initial decoding is done. + * + * The logic in imc_decode.c is abstracted away from the broader kernel CMI + * logic. This is on purpose and allows us not only an easier time unit testing + * the logic, but also allows us to express more high fidelity errors that are + * translated into a much smaller subset. This logic is exercised in the + * 'imc_test' program which is built in 'test/os-tests/tests/imc'. + * + * Limitations + * ----------- + * + * Currently, this driver has the following limitations: + * + * o It doesn't decode the row and column addresses. + * o It doesn't encode from a DIMM address to a system address. + * o It doesn't properly support lockstep and mirroring modes on Sandy Bridge - + * Broadwell platforms. + * o It doesn't support virtual lockstep and adaptive mirroring on Purley + * platforms. + * o It doesn't properly handle Intel Optane (3D-X Point) NVDIMMs. + * o It doesn't know how to decode three way channel interleaving. + * + * None of these are intrinsic problems to the driver, it's mostly a matter of + * having proper documentation and testing. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "imc.h" + +/* + * These tables contain generational data that varies between processor + * generation such as the maximum number of sockets, memory controllers, and the + * offsets of the various registers. + */ + +static const imc_gen_data_t imc_gen_data_snb = { + .igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0x80, + .igd_tohm_low_offset = 0x84, + .igd_sad_dram_offset = 0x80, + .igd_sad_ndram_rules = 10, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_ivb = { + .igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0x80, + .igd_tohm_low_offset = 0x84, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 20, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_has_brd = { + 
.igd_max_sockets = 4, + .igd_max_imcs = 2, + .igd_max_channels = 4, + .igd_max_dimms = 3, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1, + IMC_REG_MC_MTR2 }, + .igd_mcmtr_offset = 0x7c, + .igd_tolm_offset = 0xd0, + .igd_tohm_low_offset = 0xd4, + .igd_tohm_hi_offset = 0xd8, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 20, + .igd_sad_nodeid_offset = 0x40, + .igd_tad_nrules = 12, + .igd_tad_rule_offset = 0x40, + .igd_tad_chan_offset = 0x90, + .igd_tad_sysdef = 0x80, + .igd_tad_sysdef2 = 0x84, + .igd_mc_mirror = 0xac, + .igd_rir_nways = 5, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 8, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xd0, +}; + +static const imc_gen_data_t imc_gen_data_skx = { + .igd_max_sockets = 8, + .igd_max_imcs = 2, + .igd_max_channels = 3, + .igd_max_dimms = 2, + .igd_max_ranks = IMC_MTR_DDR_RANKS_MAX, + .igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 }, + .igd_mcmtr_offset = 0x87c, + .igd_topo_offset = 0x88, + .igd_tolm_offset = 0xd0, + .igd_tohm_low_offset = 0xd4, + .igd_tohm_hi_offset = 0xd8, + .igd_sad_dram_offset = 0x60, + .igd_sad_ndram_rules = 24, + .igd_sad_nodeid_offset = 0xc0, + .igd_tad_nrules = 8, + .igd_tad_rule_offset = 0x850, + .igd_tad_chan_offset = 0x90, + .igd_rir_nways = 4, + .igd_rir_way_offset = 0x108, + .igd_rir_nileaves = 4, + .igd_rir_ileave_offset = 0x120, + .igd_ubox_cpubusno_offset = 0xcc, +}; + +/* + * This table contains all of the devices that we're looking for from a stub + * perspective. These are organized by generation. Different generations behave + * in slightly different ways. For example, Sandy Bridge through Broadwell use + * unique PCI IDs for each PCI device/function combination that appears. Whereas + * Skylake based systems use the same PCI ID; however, different device/function + * values indicate that the IDs are used for different purposes. 
+ */ +/* BEGIN CSTYLED */ +static const imc_stub_table_t imc_stub_table[] = { + /* Sandy Bridge */ + { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 0" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" }, + { IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" }, + { IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" }, + { IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" }, + { IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" }, + /* Ivy Bridge */ + { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 15, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 15, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 15, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 15, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD 
DRAM Rules" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" }, + { IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" }, + { IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" }, + { IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" }, + { IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" }, + { IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" }, + /* Haswell */ + { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" }, + { IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" }, + { IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. 
Virtualization" }, + { IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" }, + { IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" }, + { IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" }, + { IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" }, + /* Broadwell Devices */ + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" }, + { IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" }, + { IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" }, + { IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. 
Virtualization" }, + { IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" }, + { IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" }, + { IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" }, + { IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" }, + /* Skylake and Cascade Lake Devices */ + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 0 M2M" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 0 Main / Channel 0" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" }, + + /* + * There is one SAD MC Route type device per core! Because of this a + * wide array of device and functions are allocated. For now, we list + * all 28 of them out. 
+ */ + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" }, + { 
IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" }, + + { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" }, + { IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" }, +}; +/* END CSTYLED */ + +#define IMC_PCI_VENDOR_INTC 0x8086 + +/* + * Our IMC data is global and statically set up during a combination of + * _init(9E) and attach(9E). While we have a module dependency between the PCI + * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't + * guarantee that the imc driver has finished attaching. As such we make sure + * that it can operate without it being attached in any way. + */ +static imc_t *imc_data = NULL; + +/* + * By default we should not allow the stubs to detach as we don't have a good + * way of forcing them to attach again. This is provided in case someone does + * want to allow the driver to unload. + */ +int imc_allow_detach = 0; + +static void +imc_set_gen_data(imc_t *imc) +{ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + imc->imc_gen_data = &imc_gen_data_snb; + break; + case IMC_GEN_IVY: + imc->imc_gen_data = &imc_gen_data_ivb; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + imc->imc_gen_data = &imc_gen_data_has_brd; + break; + case IMC_GEN_SKYLAKE: + imc->imc_gen_data = &imc_gen_data_skx; + break; + default: + dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " + "set to unknown generation: %u", imc->imc_gen); + } +} + +/* + * If our device (dev_info_t) does not have a non-zero unit address, then + * devfsadmd will not pay attention to us at all. 
Therefore we need to set the + * unit address below, before we create minor nodes. + * + * The rest of the system expects us to have one minor node per socket. The + * minor node ID should be the ID of the socket. + */ +static boolean_t +imc_create_minors(imc_t *imc) +{ + uint_t i; + + ddi_set_name_addr(imc->imc_dip, "1"); + for (i = 0; i < imc->imc_nsockets; i++) { + char buf[MAXNAMELEN]; + + if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >= + sizeof (buf)) { + goto fail; + } + + if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i, + "ddi_mem_ctrl", 0) != DDI_SUCCESS) { + dev_err(imc->imc_dip, CE_WARN, "failed to create " + "minor node %u: %s", i, buf); + goto fail; + } + } + return (B_TRUE); + +fail: + ddi_remove_minor_node(imc->imc_dip, NULL); + return (B_FALSE); +} + +/* + * Check the current MC route value for this SAD. On Skylake systems there is + * one per core. Every core should agree. If not, we will not trust the SAD + * MCROUTE values and this will cause system address decoding to fail on + * skylake. + */ +static void +imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub) +{ + uint32_t val; + + val = pci_config_get32(stub->istub_cfgspace, + IMC_REG_SKX_SAD_MC_ROUTE_TABLE); + if (val == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) { + sad->isad_flags |= IMC_SAD_MCROUTE_VALID; + sad->isad_mcroute.ismc_raw_mcroute = val; + return; + } + + /* + * Occasionally we see MC ROUTE table entries with a value of zero. + * We should ignore those for now. + */ + if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) { + dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch " + "with socket. SAD has val 0x%x, system has %x\n", + val, sad->isad_mcroute.ismc_raw_mcroute); + sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE; + } +} + +/* + * On Skylake, many of the devices that we care about are on separate PCI Buses. 
+ * These can be mapped together by the DECS register. However, we need to know + * how to map different buses together so that we can more usefully associate + * information. The set of buses is all present in the DECS register. We'll + * effectively assign sockets to buses. This is also still something that comes + * up on pre-Skylake systems as well. + */ +static boolean_t +imc_map_buses(imc_t *imc) +{ + imc_stub_t *stub; + uint_t nsock; + + /* + * Find the UBOX_DECS registers so we can establish socket mappings. On + * Skylake, there are three different sets of buses that we need to + * cover all of our devices, while there are only two before that. + */ + for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + uint32_t busno; + + if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) { + continue; + } + + busno = pci_config_get32(stub->istub_cfgspace, + imc->imc_gen_data->igd_ubox_cpubusno_offset); + if (busno == PCI_EINVAL32) { + dev_err(imc->imc_dip, CE_WARN, "failed to read " + "UBOX_DECS CPUBUSNO0: invalid PCI read"); + return (B_FALSE); + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + imc->imc_sockets[nsock].isock_nbus = 3; + imc->imc_sockets[nsock].isock_bus[0] = + IMC_UBOX_CPUBUSNO_0(busno); + imc->imc_sockets[nsock].isock_bus[1] = + IMC_UBOX_CPUBUSNO_1(busno); + imc->imc_sockets[nsock].isock_bus[2] = + IMC_UBOX_CPUBUSNO_2(busno); + } else { + imc->imc_sockets[nsock].isock_bus[0] = + IMC_UBOX_CPUBUSNO_0(busno); + imc->imc_sockets[nsock].isock_bus[1] = + IMC_UBOX_CPUBUSNO_1(busno); + imc->imc_sockets[nsock].isock_nbus = 2; + } + nsock++; + } + imc->imc_nsockets = nsock; + + return (B_TRUE); +} + +/* + * For a given stub that we've found, map it to its corresponding socket based + * on the PCI bus that it has. 
+ */ +static imc_socket_t * +imc_map_find_socket(imc_t *imc, imc_stub_t *stub) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t bus; + + for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) { + if (imc->imc_sockets[i].isock_bus[bus] == + stub->istub_bus) { + return (&imc->imc_sockets[i]); + } + } + } + + return (NULL); +} + +static boolean_t +imc_map_stubs(imc_t *imc) +{ + imc_stub_t *stub; + + if (!imc_map_buses(imc)) { + return (B_FALSE); + } + + stub = avl_first(&imc->imc_stubs); + for (stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + imc_socket_t *sock = imc_map_find_socket(imc, stub); + + if (sock == NULL) { + dev_err(imc->imc_dip, CE_WARN, "found stub type %u " + "PCI%x,%x with bdf %u/%u/%u that does not match a " + "known PCI bus for any of %u sockets", + stub->istub_table->imcs_type, stub->istub_vid, + stub->istub_did, stub->istub_bus, stub->istub_dev, + stub->istub_func, imc->imc_nsockets); + continue; + } + + /* + * We don't have to worry about duplicates here. We check to + * make sure that we have unique bdfs here. + */ + switch (stub->istub_table->imcs_type) { + case IMC_TYPE_MC0_M2M: + sock->isock_imcs[0].icn_m2m = stub; + break; + case IMC_TYPE_MC1_M2M: + sock->isock_imcs[1].icn_m2m = stub; + break; + case IMC_TYPE_MC0_MAIN0: + sock->isock_nimc++; + sock->isock_imcs[0].icn_main0 = stub; + + /* + * On Skylake, the MAIN0 does double duty as channel + * zero and as the TAD. + */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[0].ich_desc = + stub; + sock->isock_tad[0].itad_stub = stub; + sock->isock_ntad++; + } + break; + case IMC_TYPE_MC0_MAIN1: + sock->isock_imcs[0].icn_main1 = stub; + break; + case IMC_TYPE_MC1_MAIN0: + sock->isock_nimc++; + sock->isock_imcs[1].icn_main0 = stub; + + /* + * On Skylake, the MAIN0 does double duty as channel + * zero and as the TAD. 
+ */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[0].ich_desc = + stub; + sock->isock_tad[1].itad_stub = stub; + sock->isock_ntad++; + } + break; + case IMC_TYPE_MC1_MAIN1: + sock->isock_imcs[1].icn_main1 = stub; + break; + case IMC_TYPE_MC0_CHANNEL0: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[0].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL1: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[1].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL2: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[2].ich_desc = stub; + break; + case IMC_TYPE_MC0_CHANNEL3: + sock->isock_imcs[0].icn_nchannels++; + sock->isock_imcs[0].icn_channels[3].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL0: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[0].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL1: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[1].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL2: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[2].ich_desc = stub; + break; + case IMC_TYPE_MC1_CHANNEL3: + sock->isock_imcs[1].icn_nchannels++; + sock->isock_imcs[1].icn_channels[3].ich_desc = stub; + break; + case IMC_TYPE_SAD_DRAM: + sock->isock_sad.isad_dram = stub; + break; + case IMC_TYPE_SAD_MMIO: + sock->isock_sad.isad_mmio = stub; + break; + case IMC_TYPE_SAD_MISC: + sock->isock_sad.isad_tolh = stub; + break; + case IMC_TYPE_VTD_MISC: + /* + * Some systems have multiple VT-D Misc. entry points + * in the system. In this case, only use the first one + * we find. 
+ */ + if (imc->imc_gvtd_misc == NULL) { + imc->imc_gvtd_misc = stub; + } + break; + case IMC_TYPE_SAD_MCROUTE: + ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE); + imc_mcroute_check(imc, &sock->isock_sad, stub); + break; + case IMC_TYPE_UBOX: + sock->isock_ubox = stub; + break; + case IMC_TYPE_HA0: + sock->isock_ntad++; + sock->isock_tad[0].itad_stub = stub; + break; + case IMC_TYPE_HA1: + sock->isock_ntad++; + sock->isock_tad[1].itad_stub = stub; + break; + case IMC_TYPE_UBOX_CPUBUSNO: + sock->isock_cpubusno = stub; + break; + default: + /* + * Attempt to still attach if we can. + */ + dev_err(imc->imc_dip, CE_WARN, "Encountered unknown " + "IMC type (%u) on PCI %x,%x", + stub->istub_table->imcs_type, + stub->istub_vid, stub->istub_did); + break; + } + } + + return (B_TRUE); +} + +/* + * Go through and fix up various aspects of the stubs mappings on systems. The + * following are a list of what we need to fix up: + * + * 1. On Haswell and newer systems, there is only one global VT-d device. We + * need to go back and map that to all of the per-socket imc_sad_t entries. + */ +static void +imc_fixup_stubs(imc_t *imc) +{ + if (imc->imc_gen >= IMC_GEN_HASWELL) { + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh, + ==, NULL); + imc->imc_sockets[i].isock_sad.isad_tolh = + imc->imc_gvtd_misc; + } + } +} + +/* + * In the wild we've hit a few odd cases where not all devices are exposed that + * we might expect by firmware. In particular we've seen and validate the + * following cases: + * + * o We don't find all of the channel devices that we expect, e.g. we have the + * stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW + * with an E5-2630v3. 
+ */ +static boolean_t +imc_validate_stubs(imc_t *imc) +{ + for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) { + imc_socket_t *socket = &imc->imc_sockets[sock]; + + for (uint_t mc = 0; mc < socket->isock_nimc; mc++) { + imc_mc_t *mcp = &socket->isock_imcs[mc]; + + for (uint_t chan = 0; chan < mcp->icn_nchannels; + chan++) { + if (mcp->icn_channels[chan].ich_desc == NULL) { + dev_err(imc->imc_dip, CE_WARN, + "!missing device for socket %u/" + "imc %u/channel %u", sock, mc, + chan); + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} + +/* + * Attempt to map all of the discovered sockets to the corresponding APIC based + * socket. We do these mappings by getting the node id of the socket and + * adjusting it to make sure that no home agent is present in it. We use the + * UBOX to avoid any home agent related bits that are present in other + * registers. + */ +static void +imc_map_sockets(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint32_t nodeid; + ddi_acc_handle_t h; + + h = imc->imc_sockets[i].isock_ubox->istub_cfgspace; + nodeid = pci_config_get32(h, + imc->imc_gen_data->igd_sad_nodeid_offset); + if (nodeid == PCI_EINVAL32) { + imc->imc_sockets[i].isock_valid |= + IMC_SOCKET_V_BAD_NODEID; + continue; + } + + imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid); + imc->imc_spointers[nodeid] = &imc->imc_sockets[i]; + } +} + +/* + * Decode the MTR, accounting for variances between processor generations. + */ +static void +imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr) +{ + uint8_t disable; + + /* + * Check present first, before worrying about anything else. 
+ */ + if (imc->imc_gen < IMC_GEN_SKYLAKE && + IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) { + dimm->idimm_present = B_FALSE; + return; + } else if (imc->imc_gen >= IMC_GEN_SKYLAKE && + IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) { + dimm->idimm_present = B_FALSE; + return; + } + + dimm->idimm_present = B_TRUE; + dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE; + if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN || + dimm->idimm_ncolumns > IMC_MTR_CA_MAX) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS; + } + + dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE; + if (dimm->idimm_nrows < IMC_MTR_RA_MIN || + dimm->idimm_nrows > IMC_MTR_RA_MAX) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS; + } + + /* + * Determine Density, this information is not present on Sandy Bridge. + */ + switch (imc->imc_gen) { + case IMC_GEN_IVY: + dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr); + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) { + case 0: + default: + dimm->idimm_density = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; + break; + case 1: + dimm->idimm_density = 2; + break; + case 2: + dimm->idimm_density = 4; + break; + case 3: + dimm->idimm_density = 8; + break; + } + break; + case IMC_GEN_SKYLAKE: + switch (IMC_MTR_DENSITY_SKX(mtr)) { + case 0: + default: + dimm->idimm_density = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY; + break; + case 1: + dimm->idimm_density = 2; + break; + case 2: + dimm->idimm_density = 4; + break; + case 3: + dimm->idimm_density = 8; + break; + case 4: + dimm->idimm_density = 16; + break; + case 5: + dimm->idimm_density = 12; + break; + } + break; + case IMC_GEN_UNKNOWN: + case IMC_GEN_SANDY: + dimm->idimm_density = 0; + break; + } + + /* + * The values of width are the same on IVY->SKX, but the bits are + * different. This doesn't exist on SNB. 
+ */ + if (imc->imc_gen > IMC_GEN_SANDY) { + uint8_t width; + + if (imc->imc_gen >= IMC_GEN_BROADWELL) { + width = IMC_MTR_WIDTH_BRD_SKX(mtr); + } else { + width = IMC_MTR_WIDTH_IVB_HAS(mtr); + } + switch (width) { + case 0: + dimm->idimm_width = 4; + break; + case 1: + dimm->idimm_width = 8; + break; + case 2: + dimm->idimm_width = 16; + break; + default: + dimm->idimm_width = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH; + break; + } + } else { + dimm->idimm_width = 0; + } + + dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr); + switch (imc->imc_gen) { + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + case IMC_GEN_SKYLAKE: + if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) { + dimm->idimm_nranks = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; + } + break; + default: + if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) { + dimm->idimm_nranks = 0; + dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS; + } + } + + disable = IMC_MTR_RANK_DISABLE(mtr); + dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0; + dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0; + dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0; + dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0; + + /* + * Only Haswell and later have this information. + */ + if (imc->imc_gen >= IMC_GEN_HASWELL) { + dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0; + dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0; + dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr); + if (dimm->idimm_3dsranks != 0) { + dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks; + } + } + + + if (icn->icn_dimm_type == IMC_DIMM_DDR4) { + dimm->idimm_nbanks = 16; + } else { + dimm->idimm_nbanks = 8; + } + + /* + * To calculate the DIMM size we need first take the number of rows and + * columns. This gives us the number of slots per chip. In a given rank + * there are nbanks of these. There are nrank entries of those. Each of + * these slots can fit a byte. 
+ */ + dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 * + (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows)); +} + +static void +imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan) +{ + uint_t i; + + /* + * There's one register for each DIMM that might be present, we always + * read that information to determine information about the DIMMs. + */ + chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms; + for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { + uint32_t mtr; + imc_dimm_t *dimm = &chan->ich_dimms[i]; + + bzero(dimm, sizeof (imc_dimm_t)); + mtr = pci_config_get32(chan->ich_desc->istub_cfgspace, + imc->imc_gen_data->igd_mtr_offsets[i]); + dimm->idimm_mtr = mtr; + /* + * We don't really expect to get a bad PCIe read. However, if we + * do, treat that for the moment as though the DIMM is bad. + */ + if (mtr == PCI_EINVAL32) { + dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ; + continue; + } + + imc_decode_mtr(imc, icn, dimm, mtr); + } +} + +static boolean_t +imc_fill_controller(imc_t *imc, imc_mc_t *icn) +{ + uint32_t mcmtr; + + mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace, + imc->imc_gen_data->igd_mcmtr_offset); + if (mcmtr == PCI_EINVAL32) { + icn->icn_invalid = B_TRUE; + return (B_FALSE); + } + + icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0; + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0; + } else { + icn->icn_lockstep = B_FALSE; + } + + icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0; + + /* + * SNB and IVB only support DDR3. Haswell and Broadwell may support + * DDR4, depends on the SKU. Skylake only supports DDR4. 
+ */ + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + icn->icn_dimm_type = IMC_DIMM_DDR3; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) { + icn->icn_dimm_type = IMC_DIMM_DDR4; + } else { + icn->icn_dimm_type = IMC_DIMM_DDR3; + } + break; + default: + /* + * Skylake and on are all DDR4. + */ + icn->icn_dimm_type = IMC_DIMM_DDR4; + break; + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) { + icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace, + imc->imc_gen_data->igd_topo_offset); + } + + return (B_TRUE); +} + +/* + * Walk the IMC data and fill in the information on DIMMs and the memory + * controller configurations. + */ +static void +imc_fill_data(imc_t *imc) +{ + uint_t csock, cmc, cchan; + + for (csock = 0; csock < imc->imc_nsockets; csock++) { + imc_socket_t *sock = &imc->imc_sockets[csock]; + + for (cmc = 0; cmc < sock->isock_nimc; cmc++) { + imc_mc_t *icn = &sock->isock_imcs[cmc]; + + if (!imc_fill_controller(imc, icn)) + continue; + + for (cchan = 0; cchan < icn->icn_nchannels; cchan++) { + imc_fill_dimms(imc, icn, + &icn->icn_channels[cchan]); + } + } + } +} + +static nvlist_t * +imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm) +{ + nvlist_t *nvl; + + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT, + dimm->idimm_present); + if (!dimm->idimm_present) { + return (nvl); + } + + fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS, + dimm->idimm_ncolumns); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS, + dimm->idimm_nrows); + + if (imc->imc_gen > IMC_GEN_SANDY) { + fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY, + dimm->idimm_density * (1ULL << 30)); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH, + dimm->idimm_width); + } + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS, + dimm->idimm_nranks); + fnvlist_add_uint32(nvl, 
MCINTEL_NVLIST_V1_DIMM_BANKS, + dimm->idimm_nbanks); + fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS, + dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE); + + if (imc->imc_gen >= IMC_GEN_HASWELL) { + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL, + dimm->idimm_hdrl); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP, + dimm->idimm_hdrl_parity); + if (dimm->idimm_3dsranks > 0) { + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK, + dimm->idimm_3dsranks); + } + } + + return (nvl); +} + +static nvlist_t * +imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan) +{ + nvlist_t *nvl; + nvlist_t *dimms[IMC_MAX_DIMMPERCHAN]; + uint_t i; + + nvl = fnvlist_alloc(); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC, + imc->imc_gen_data->igd_max_dimms); + for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) { + dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]); + } + + fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS, + dimms, i); + + for (; i > 0; i--) { + nvlist_free(dimms[i-1]); + } + + return (nvl); +} + +static nvlist_t * +imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn) +{ + nvlist_t *nvl; + nvlist_t *channels[IMC_MAX_CHANPERMC]; + uint_t i; + + nvl = fnvlist_alloc(); + fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels); + fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC, + icn->icn_ecc); + if (icn->icn_lockstep) { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE, + MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK); + } else { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE, + MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP); + + } + + if (icn->icn_closed) { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, + MCINTEL_NVLIST_V1_MC_POLICY_CLOSED); + } else { + fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY, + MCINTEL_NVLIST_V1_MC_POLICY_OPEN); + } + + for (i = 0; i < icn->icn_nchannels; i++) { + channels[i] = imc_nvl_create_channel(imc, + &icn->icn_channels[i]); + } + 
fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS, + channels, icn->icn_nchannels); + for (i = 0; i < icn->icn_nchannels; i++) { + nvlist_free(channels[i]); + } + + return (nvl); +} + +static void +imc_nvl_pack(imc_socket_t *sock, boolean_t sleep) +{ + char *buf = NULL; + size_t len = 0; + int kmflag; + + if (sock->isock_nvl == NULL) + return; + + if (sock->isock_buf != NULL) + return; + + if (sleep) { + kmflag = KM_SLEEP; + } else { + kmflag = KM_NOSLEEP | KM_NORMALPRI; + } + + if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR, + kmflag) != 0) { + return; + } + + sock->isock_buf = buf; + sock->isock_buflen = len; + sock->isock_gen++; +} + +static void +imc_decoder_pack(imc_t *imc) +{ + char *buf = NULL; + size_t len = 0; + + if (imc->imc_decoder_buf != NULL) + return; + + if (imc->imc_decoder_dump == NULL) { + imc->imc_decoder_dump = imc_dump_decoder(imc); + } + + if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR, + KM_NOSLEEP | KM_NORMALPRI) != 0) { + return; + } + + imc->imc_decoder_buf = buf; + imc->imc_decoder_len = len; +} + +static void +imc_nvl_create(imc_t *imc) +{ + uint_t csock; + for (csock = 0; csock < imc->imc_nsockets; csock++) { + uint_t i; + nvlist_t *nvl; + nvlist_t *mcs[IMC_MAX_IMCPERSOCK]; + imc_socket_t *sock = &imc->imc_sockets[csock]; + + nvl = fnvlist_alloc(); + fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR, + MCINTEL_NVLIST_VERS1); + fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC, + sock->isock_nimc); + + for (i = 0; i < sock->isock_nimc; i++) { + mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]); + } + + fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS, + mcs, sock->isock_nimc); + + for (i = 0; i < sock->isock_nimc; i++) { + nvlist_free(mcs[i]); + } + + sock->isock_nvl = nvl; + imc_nvl_pack(sock, B_TRUE); + } +} + +/* + * Determine the top of low and high memory. These determine whether transaction + * addresses target main memory or not. 
Unfortunately, the way that these are + * stored and fetched changes with different generations. + */ +static void +imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad) +{ + uint32_t tolm, tohm_low, tohm_hi; + + tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tolm_offset); + tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tohm_low_offset); + if (imc->imc_gen_data->igd_tohm_hi_offset != 0) { + tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace, + imc->imc_gen_data->igd_tohm_hi_offset); + } else { + tohm_hi = 0; + } + + if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 || + tohm_hi == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + switch (imc->imc_gen) { + case IMC_GEN_SANDY: + case IMC_GEN_IVY: + sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) << + IMC_TOLM_SNB_IVY_SHIFT; + sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) << + IMC_TOLM_SNB_IVY_SHIFT; + break; + case IMC_GEN_HASWELL: + case IMC_GEN_BROADWELL: + case IMC_GEN_SKYLAKE: + sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK; + sad->isad_tohm = ((uint64_t)tohm_low & + IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32); + + /* + * Adjust the values to turn them into an exclusive range. 
+ */ + sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL; + sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL; + break; + default: + dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: " + "set to unknown generation: %u", imc->imc_gen); + return; + } +} + +static void +imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule, + uint32_t raw) +{ + uint_t attr; + uint64_t limit; + bzero(rule, sizeof (imc_sad_rule_t)); + + rule->isr_raw_dram = raw; + rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0; + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) { + case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6: + rule->isr_imode = IMC_SAD_IMODE_8t6; + break; + case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR: + rule->isr_imode = IMC_SAD_IMODE_8t6XOR; + break; + } + } else { + switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) { + case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6: + rule->isr_imode = IMC_SAD_IMODE_8t6; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8: + rule->isr_imode = IMC_SAD_IMODE_10t8; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12: + rule->isr_imode = IMC_SAD_IMODE_14t12; + break; + case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30: + rule->isr_imode = IMC_SAD_IMODE_32t30; + break; + } + } + + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + attr = IMC_SAD_DRAM_ATTR_SKX(raw); + } else { + attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw); + } + + switch (attr) { + case IMC_SAD_DRAM_ATTR_DRAM: + rule->isr_type = IMC_SAD_TYPE_DRAM; + break; + case IMC_SAD_DRAM_ATTR_MMCFG: + rule->isr_type = IMC_SAD_TYPE_MMCFG; + break; + case IMC_SAD_DRAM_ATTR_NXM: + if (imc->imc_gen < IMC_GEN_SKYLAKE) { + sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; + } + rule->isr_type = IMC_SAD_TYPE_NXM; + break; + default: + sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR; + break; + } + + /* + * Fetch the limit which represents bits 45:26 and then adjust this so + * that it is exclusive. 
+ */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + limit = IMC_SAD_DRAM_LIMIT_SKX(raw); + } else { + limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw); + } + rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) + + IMC_SAD_DRAM_LIMIT_EXCLUSIVE; + + /* + * The rest of this does not apply to Sandy Bridge. + */ + if (imc->imc_gen == IMC_GEN_SANDY) + return; + + if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) { + rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0; + return; + } + + switch (IMC_SAD_DRAM_MOD23_SKX(raw)) { + case IMC_SAD_DRAM_MOD23_MOD3: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C01: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C12: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12; + break; + case IMC_SAD_DRAM_MOD23_MOD2_C02: + rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02; + break; + } + + rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0; + switch (IMC_SAD_DRAM_MOD3_SKX(raw)) { + case IMC_SAD_DRAM_MOD3_MODE_45t6: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6; + break; + case IMC_SAD_DRAM_MOD3_MODE_45t8: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8; + break; + case IMC_SAD_DRAM_MOD3_MODE_45t12: + rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12; + break; + default: + sad->isad_valid |= IMC_SAD_V_BAD_MOD3; + break; + } +} + +static void +imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw) +{ + uint_t i; + uint32_t mlen, mbase, skipbits, skipafter; + + rule->isr_raw_interleave = raw; + + /* + * Right now all architectures always have the maximum number of SAD + * interleave targets. + */ + rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE; + + /* + * Sandy Bridge has a gap in the interleave list due to the fact that it + * uses a smaller length. 
+ */ + if (imc->imc_gen > IMC_GEN_SANDY) { + mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN; + mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK; + skipbits = skipafter = 0; + } else { + mlen = IMC_SAD_ILEAVE_SNB_LEN; + mbase = IMC_SAD_ILEAVE_SNB_MASK; + skipbits = 2; + skipafter = 4; + } + + for (i = 0; i < rule->isr_ntargets; i++) { + uint32_t mask, shift; + + shift = i * mlen; + if (i >= skipafter) + shift += skipbits; + mask = mbase << shift; + rule->isr_targets[i] = (raw & mask) >> shift; + } +} + +static void +imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad) +{ + uint_t i; + off_t off; + + sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules; + for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset; + i < sad->isad_nrules; i++, off += sizeof (uint64_t)) { + uint32_t dram, interleave; + imc_sad_rule_t *rule = &sad->isad_rules[i]; + + dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off); + interleave = pci_config_get32(sad->isad_dram->istub_cfgspace, + off + 4); + + if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) { + sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ; + return; + } + + imc_sad_fill_rule(imc, sad, rule, dram); + imc_sad_fill_rule_interleave(imc, rule, interleave); + } +} + +static void +imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad) +{ + uint_t i; + imc_sad_mcroute_table_t *mc = &sad->isad_mcroute; + + if (imc->imc_gen < IMC_GEN_SKYLAKE) + return; + if (sad->isad_valid != 0) + return; + + mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES; + for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) { + uint_t chanoff, ringoff; + + ringoff = i * IMC_MC_ROUTE_RING_BITS; + chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET; + + mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >> + ringoff) & IMC_MC_ROUTE_RING_MASK; + mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >> + chanoff) & IMC_MC_ROUTE_CHAN_MASK; + } +} + +/* + * Initialize the SAD. To do this we have to do a few different things: + * + * 1. Determine where the top of low and high memory is. 
+ * 2. Read and decode all of the rules for the SAD + * 3. On systems with a route table, decode the raw routes + * + * At this point in time, we treat TOLM and TOHM as a per-socket construct, even + * though it really should be global, this just makes life a bit simpler. + */ +static void +imc_decoder_init_sad(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad); + imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad); + imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad); + } +} + +static void +imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev, + imc_tad_rule_t *rule, uint32_t val) +{ + uint64_t limit; + + limit = IMC_TAD_LIMIT(val); + rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) + + IMC_TAD_LIMIT_EXCLUSIVE; + rule->itr_raw = val; + + switch (IMC_TAD_SOCK_WAY(val)) { + case IMC_TAD_SOCK_WAY_1: + rule->itr_sock_way = 1; + break; + case IMC_TAD_SOCK_WAY_2: + rule->itr_sock_way = 2; + break; + case IMC_TAD_SOCK_WAY_4: + rule->itr_sock_way = 4; + break; + case IMC_TAD_SOCK_WAY_8: + rule->itr_sock_way = 8; + break; + } + + rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1; + rule->itr_sock_gran = IMC_TAD_GRAN_64B; + rule->itr_chan_gran = IMC_TAD_GRAN_64B; + + /* + * Starting with Skylake the targets that are used are no longer part of + * the TAD. Those come from the IMC route table. 
+ */
+	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
+		rule->itr_ntargets = 0;
+		return;
+	}
+
+	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
+	rule->itr_targets[0] = IMC_TAD_TARG0(val);
+	rule->itr_targets[1] = IMC_TAD_TARG1(val);
+	rule->itr_targets[2] = IMC_TAD_TARG2(val);
+	rule->itr_targets[3] = IMC_TAD_TARG3(val);
+
+	if (prev == NULL) {
+		rule->itr_base = 0;
+	} else {
+		rule->itr_base = prev->itr_limit + 1;
+	}
+}
+
+static void
+imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
+    uint32_t val)
+{
+	uint64_t base;
+
+	rule->itr_raw_gran = val;
+	base = IMC_TAD_BASE_BASE(val);
+	rule->itr_base = base << IMC_TAD_BASE_SHIFT;
+
+	switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
+	case IMC_TAD_BASE_CHAN_GRAN_64B:
+		rule->itr_chan_gran = IMC_TAD_GRAN_64B;
+		break;
+	case IMC_TAD_BASE_CHAN_GRAN_256B:
+		rule->itr_chan_gran = IMC_TAD_GRAN_256B;
+		break;
+	case IMC_TAD_BASE_CHAN_GRAN_4KB:
+		rule->itr_chan_gran = IMC_TAD_GRAN_4KB;
+		break;
+	default:
+		tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
+		return;
+	}
+
+	switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
+	case IMC_TAD_BASE_SOCK_GRAN_64B:
+		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
+		break;
+	case IMC_TAD_BASE_SOCK_GRAN_256B:
+		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
+		break;
+	case IMC_TAD_BASE_SOCK_GRAN_4KB:
+		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
+		break;
+	case IMC_TAD_BASE_SOCK_GRAN_1GB:
+		rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
+		break;
+	}
+}
+
+/*
+ * When mirroring is enabled, at least in Sandy Bridge to Broadwell, it's
+ * suggested that the channel wayness will take this into account and therefore
+ * should be accurately reflected.
+ */ +static void +imc_tad_read_rules(imc_t *imc, imc_tad_t *tad) +{ + uint_t i; + off_t baseoff; + imc_tad_rule_t *prev; + + tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules; + for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset, + prev = NULL; i < tad->itad_nrules; + i++, baseoff += sizeof (uint32_t)) { + uint32_t val; + off_t off; + imc_tad_rule_t *rule = &tad->itad_rules[i]; + + /* + * On Skylake, the TAD rules are split among two registers. The + * latter set mimics what exists on pre-Skylake. + */ + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + off = baseoff + IMC_SKX_WAYNESS_OFFSET; + } else { + off = baseoff; + } + + val = pci_config_get32(tad->itad_stub->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + + imc_tad_fill_rule(imc, tad, prev, rule, val); + prev = rule; + if (imc->imc_gen < IMC_GEN_SKYLAKE) + continue; + + val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff); + if (val == PCI_EINVAL32) { + tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ; + return; + } + + imc_tad_fill_skx(imc, tad, rule, val); + } +} + +/* + * Check for features which change how decoding works. + */ +static void +imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc) +{ + uint32_t val; + + /* + * Determine whether or not lockstep mode or mirroring are enabled. + * These change the behavior of how we're supposed to interpret channel + * wayness. Lockstep is available in the TAD's features. Mirroring is + * available on the IMC's features. This isn't present in Skylake+. On + * Skylake Mirorring is a property of the SAD rule and there is no + * lockstep. 
+ */
+	switch (imc->imc_gen) {
+	case IMC_GEN_SANDY:
+	case IMC_GEN_IVY:
+	case IMC_GEN_HASWELL:
+	case IMC_GEN_BROADWELL:
+		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
+		    imc->imc_gen_data->igd_tad_sysdef);
+		if (val == PCI_EINVAL32) {
+			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
+			return;
+		}
+		if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
+			tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
+		}
+
+		val = pci_config_get32(mc->icn_main1->istub_cfgspace,
+		    imc->imc_gen_data->igd_mc_mirror);
+		if (val == PCI_EINVAL32) {
+			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
+			return;
+		}
+		if (IMC_MC_MIRROR_SNB_BRD(val)) {
+			tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Now, go through and look at values that'll change how we do the
+	 * channel index and address calculation. These are only present
+	 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
+	 * and they don't exist on Skylake+.
+	 */
+	switch (imc->imc_gen) {
+	case IMC_GEN_IVY:
+	case IMC_GEN_HASWELL:
+	case IMC_GEN_BROADWELL:
+		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
+		    imc->imc_gen_data->igd_tad_sysdef2);
+		if (val == PCI_EINVAL32) {
+			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
+			return;
+		}
+		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
+			tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
+		}
+		if (IMC_TAD_SYSDEF2_CHANHASH(val)) {
+			tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Read the IMC channel interleave records
+ */
+static void
+imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
+{
+	uint_t i;
+	off_t off;
+
+	chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
+	for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
+	    i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
+		uint32_t val;
+		uint64_t offset;
+
+		val = pci_config_get32(chan->ich_desc->istub_cfgspace,
+		    off);
+		if (val == PCI_EINVAL32) {
+			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
+			return;
+		}
+
+		if (imc->imc_gen >=
IMC_GEN_SKYLAKE) { + offset = IMC_TADCHAN_OFFSET_SKX(val); + } else { + offset = IMC_TADCHAN_OFFSET_SNB_BRD(val); + } + + chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT; + chan->ich_tad_offsets_raw[i] = val; + } +} + +static void +imc_decoder_init_tad(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + + for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) { + imc_tad_read_features(imc, + &imc->imc_sockets[i].isock_tad[j], + &imc->imc_sockets[i].isock_imcs[j]); + imc_tad_read_rules(imc, + &imc->imc_sockets[i].isock_tad[j]); + } + } + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + imc_socket_t *sock = &imc->imc_sockets[i]; + + for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { + uint_t k; + imc_mc_t *mc = &sock->isock_imcs[j]; + + for (k = 0; k < mc->icn_nchannels; k++) { + imc_channel_t *chan = &mc->icn_channels[k]; + imc_tad_read_interleave(imc, chan); + } + } + } +} + +static void +imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan, + imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig) +{ + uint_t i; + off_t off, incr; + + /* + * Rank interleave offset registers come in two forms. Either they are + * contiguous for a given wayness, meaning that all of the entries for + * wayness zero are contiguous, or they are sparse, meaning that there + * is a bank for entry zero for all wayness, then entry one for all + * wayness, etc. 
+ */ + if (contig) { + off = imc->imc_gen_data->igd_rir_ileave_offset + + (rirno * imc->imc_gen_data->igd_rir_nileaves * + sizeof (uint32_t)); + incr = sizeof (uint32_t); + } else { + off = imc->imc_gen_data->igd_rir_ileave_offset + + (rirno * sizeof (uint32_t)); + incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t); + } + for (i = 0; i < rank->irle_nentries; i++, off += incr) { + uint32_t val; + uint64_t offset; + imc_rank_ileave_entry_t *ent = &rank->irle_entries[i]; + + val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; + return; + } + + switch (imc->imc_gen) { + case IMC_GEN_BROADWELL: + ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val); + break; + default: + ent->irle_target = IMC_RIR_OFFSET_TARGET(val); + break; + } + if (imc->imc_gen >= IMC_GEN_HASWELL) { + offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val); + } else { + offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val); + } + ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT; + } +} + +static void +imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan) +{ + uint_t i; + off_t off; + + chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways; + for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset; + i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) { + uint32_t val; + uint64_t lim; + imc_rank_ileave_t *ent = &chan->ich_rankileaves[i]; + + val = pci_config_get32(chan->ich_desc->istub_cfgspace, off); + if (val == PCI_EINVAL32) { + chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ; + return; + } + + ent->irle_raw = val; + ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0; + ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val); + ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val); + if (imc->imc_gen >= IMC_GEN_HASWELL) { + lim = IMC_RIR_LIMIT_HAS_SKX(val); + } else { + lim = IMC_RIR_LIMIT_SNB_IVB(val); + } + + ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) + + IMC_RIR_LIMIT_EXCLUSIVE; + + ent->irle_nentries = 
imc->imc_gen_data->igd_rir_nileaves; + if (imc->imc_gen >= IMC_GEN_SKYLAKE) { + imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE); + } else { + imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE); + } + } +} + +static void +imc_decoder_init_rir(imc_t *imc) +{ + uint_t i; + + for (i = 0; i < imc->imc_nsockets; i++) { + uint_t j; + imc_socket_t *sock = &imc->imc_sockets[i]; + + for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) { + uint_t k; + imc_mc_t *mc = &sock->isock_imcs[j]; + + for (k = 0; k < mc->icn_nchannels; k++) { + imc_channel_t *chan = &mc->icn_channels[k]; + imc_rir_read_wayness(imc, chan); + } + } + } +} + +static cmi_errno_t +imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo, + uint32_t synd, int syndtype, mc_unum_t *unump) +{ + imc_t *imc = arg; + uint_t i; + imc_decode_state_t dec; + + bzero(&dec, sizeof (dec)); + if (!imc_decode_pa(imc, pa, &dec)) { + switch (dec.ids_fail) { + case IMC_DECODE_F_LEGACY_RANGE: + case IMC_DECODE_F_OUTSIDE_DRAM: + return (CMIERR_MC_NOTDIMMADDR); + default: + return (CMIERR_MC_BADSTATE); + } + } + + unump->unum_board = 0; + /* + * The chip id needs to be in the order that the OS expects it, which + * may not be our order. 
+ */ + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_spointers[i] == dec.ids_socket) + break; + } + if (i == imc->imc_nsockets) { + return (CMIERR_MC_BADSTATE); + } + unump->unum_chip = i; + unump->unum_mc = dec.ids_tadid; + unump->unum_chan = dec.ids_channelid; + unump->unum_cs = dec.ids_dimmid; + unump->unum_rank = dec.ids_rankid; + unump->unum_offset = dec.ids_rankaddr; + for (i = 0; i < MC_UNUM_NDIMM; i++) { + unump->unum_dimms[i] = MC_INVALNUM; + } + + return (CMI_SUCCESS); +} + +static cmi_errno_t +imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa) +{ + return (CMIERR_UNKNOWN); +} + +static const cmi_mc_ops_t imc_mc_ops = { + .cmi_mc_patounum = imc_mc_patounum, + .cmi_mc_unumtopa = imc_mc_unumtopa +}; + +/* + * This is where we really finish attaching and become open for business. This + * occurs once we have all of the expected stubs attached. Here's where all of + * the real fun begins. + */ +static void +imc_attach_complete(void *arg) +{ + imc_t *imc = arg; + cmi_errno_t err; + + imc_set_gen_data(imc); + + /* + * On SKX and newer, we can fail to map PCI buses at this point due to + * bad PCIe reads. + */ + if (!imc_map_stubs(imc)) { + goto done; + } + + if (!imc_validate_stubs(imc)) { + imc->imc_flags |= IMC_F_VALIDATE_FAILED; + goto done; + } + + imc_fixup_stubs(imc); + imc_map_sockets(imc); + + if (!imc_create_minors(imc)) { + goto done; + } + + imc_fill_data(imc); + imc_nvl_create(imc); + + /* + * Gather additional information that we need so that we can properly + * initialize the memory decoder and encoder. + */ + imc_decoder_init_sad(imc); + imc_decoder_init_tad(imc); + imc_decoder_init_rir(imc); + + /* + * Register decoder functions. This may fail. If so, try and complain + * loudly, but stay active to allow other data to be useful. Register a + * global handle. 
+ */ + if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) { + imc->imc_flags |= IMC_F_MCREG_FAILED; + dev_err(imc->imc_dip, CE_WARN, "failed to register memory " + "decoding operations: 0x%x", err); + } + +done: + mutex_enter(&imc->imc_lock); + imc->imc_flags &= ~IMC_F_ATTACH_DISPATCHED; + imc->imc_flags |= IMC_F_ATTACH_COMPLETE; + mutex_exit(&imc->imc_lock); +} + +static int +imc_stub_comparator(const void *l, const void *r) +{ + const imc_stub_t *sl = l, *sr = r; + if (sl->istub_bus > sr->istub_bus) + return (1); + if (sl->istub_bus < sr->istub_bus) + return (-1); + if (sl->istub_dev > sr->istub_dev) + return (1); + if (sl->istub_dev < sr->istub_dev) + return (-1); + if (sl->istub_func > sr->istub_func) + return (1); + if (sl->istub_func < sr->istub_func) + return (-1); + return (0); +} + +static int +imc_stub_scan_cb(dev_info_t *dip, void *arg) +{ + int vid, did; + const imc_stub_table_t *table; + imc_t *imc = arg; + int *regs; + uint_t i, nregs; + + if (dip == ddi_root_node()) { + return (DDI_WALK_CONTINUE); + } + + /* + * Get the dev info name. PCI devices will always be children of PCI + * devices today on x86. If we reach something that has a device name + * that's not PCI, then we can prune it's children. + */ + if (strncmp("pci", ddi_get_name(dip), 3) != 0) { + return (DDI_WALK_PRUNECHILD); + } + + /* + * Get the device and vendor ID and see if this is something the imc + * knows about or cares about. 
+ */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "vendor-id", PCI_EINVAL16); + did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "device-id", PCI_EINVAL16); + if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { + return (DDI_WALK_CONTINUE); + } + + if (vid != IMC_PCI_VENDOR_INTC) { + return (DDI_WALK_PRUNECHILD); + } + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", &regs, &nregs) != DDI_PROP_SUCCESS) { + return (DDI_WALK_CONTINUE); + } + + if (nregs == 0) { + ddi_prop_free(regs); + return (DDI_WALK_CONTINUE); + } + + + table = NULL; + for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { + if (imc_stub_table[i].imcs_devid == did && + imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && + imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { + table = &imc_stub_table[i]; + break; + } + } + ddi_prop_free(regs); + + /* + * Not a match, not interesting. + */ + if (table == NULL) { + return (DDI_WALK_CONTINUE); + } + + mutex_enter(&imc->imc_lock); + imc->imc_nscanned++; + mutex_exit(&imc->imc_lock); + + return (DDI_WALK_CONTINUE); +} + +/* + * From here, go through and see how many of the devices that we know about. + */ +static void +imc_stub_scan(void *arg) +{ + imc_t *imc = arg; + boolean_t dispatch = B_FALSE; + + /* + * Zero out the scan results in case we've been detached and reattached. + */ + mutex_enter(&imc->imc_lock); + imc->imc_nscanned = 0; + mutex_exit(&imc->imc_lock); + + ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc); + + mutex_enter(&imc->imc_lock); + imc->imc_flags |= IMC_F_SCAN_COMPLETE; + imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED; + + /* + * If the scan found no nodes, then that means that we're on a hardware + * platform that we don't support. Therefore, there's no reason to do + * anything here. 
+ */ + if (imc->imc_nscanned == 0) { + imc->imc_flags |= IMC_F_UNSUP_PLATFORM; + mutex_exit(&imc->imc_lock); + return; + } + + if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { + imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; + dispatch = B_TRUE; + } + + mutex_exit(&imc->imc_lock); + + if (dispatch) { + (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, + imc, DDI_SLEEP); + } +} + +/* + * By default, refuse to allow stubs to detach. + */ +int +imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + imc_stub_t *stub; + imc_t *imc = imc_data; + + mutex_enter(&imc->imc_lock); + + /* + * By default, we do not allow stubs to detach. However, if the driver + * has attached to devices on a platform it doesn't recognize or + * support or if the override flag has been set, then allow detach to + * proceed. + */ + if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 && + imc_allow_detach == 0) { + mutex_exit(&imc->imc_lock); + return (DDI_FAILURE); + } + + for (stub = avl_first(&imc->imc_stubs); stub != NULL; + stub = AVL_NEXT(&imc->imc_stubs, stub)) { + if (stub->istub_dip == dip) { + break; + } + } + + /* + * A device was attached to us that we somehow don't know about. Allow + * this to proceed. + */ + if (stub == NULL) { + mutex_exit(&imc->imc_lock); + return (DDI_SUCCESS); + } + + pci_config_teardown(&stub->istub_cfgspace); + avl_remove(&imc->imc_stubs, stub); + kmem_free(stub, sizeof (imc_stub_t)); + mutex_exit(&imc->imc_lock); + + return (DDI_SUCCESS); +} + +int +imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + imc_stub_t *stub, *lookup; + int did, vid, *regs; + uint_t i, nregs; + const imc_stub_table_t *table; + avl_index_t idx; + boolean_t dispatch = B_FALSE; + imc_t *imc = imc_data; + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + /* + * We've been asked to attach a stub. First, determine if this is even a + * PCI device that we should care about. Then, append it to our global + * list and kick off the configuration task. 
Note that we do this + * configuration task in a taskq so that we don't interfere with the + * normal attach / detach path processing. + */ + if (strncmp("pci", ddi_get_name(dip), 3) != 0) { + return (DDI_FAILURE); + } + + /* + * Get the device and vendor ID and see if this is something the imc + * knows about or cares about. + */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "vendor-id", PCI_EINVAL16); + did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "device-id", PCI_EINVAL16); + if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) { + return (DDI_FAILURE); + } + + /* + * Only accept INTC parts on the imc driver. + */ + if (vid != IMC_PCI_VENDOR_INTC) { + return (DDI_FAILURE); + } + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", &regs, &nregs) != DDI_PROP_SUCCESS) { + return (DDI_FAILURE); + } + + if (nregs == 0) { + ddi_prop_free(regs); + return (DDI_FAILURE); + } + + /* + * Determine if this matches a known device. + */ + table = NULL; + for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) { + if (imc_stub_table[i].imcs_devid == did && + imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) && + imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) { + table = &imc_stub_table[i]; + break; + } + } + + if (i == ARRAY_SIZE(imc_stub_table)) { + ddi_prop_free(regs); + return (DDI_FAILURE); + } + + /* + * We've found something. Make sure the generation matches our current + * one. If it does, construct the entry and append it to the list. 
+ */ + mutex_enter(&imc->imc_lock); + if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen != + table->imcs_gen) { + mutex_exit(&imc->imc_lock); + ddi_prop_free(regs); + dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) " + "that has different hardware generation (%u) from current " + "generation (%u)", vid, did, table->imcs_gen, imc->imc_gen); + return (DDI_FAILURE); + } else { + imc->imc_gen = table->imcs_gen; + } + mutex_exit(&imc->imc_lock); + + stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP); + stub->istub_dip = dip; + stub->istub_vid = vid; + stub->istub_did = did; + stub->istub_bus = PCI_REG_BUS_G(regs[0]); + stub->istub_dev = PCI_REG_DEV_G(regs[0]); + stub->istub_func = PCI_REG_FUNC_G(regs[0]); + ddi_prop_free(regs); + stub->istub_table = table; + + if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) { + kmem_free(stub, sizeof (imc_stub_t)); + dev_err(dip, CE_WARN, "Failed to set up PCI config space " + "for IMC stub device %s (%u/%u)", ddi_node_name(dip), + vid, did); + return (DDI_FAILURE); + } + + mutex_enter(&imc->imc_lock); + if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) { + dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate " + "bdf %u/%u/%u with %s (%u/%u), not attaching", + ddi_node_name(imc->imc_dip), vid, did, + stub->istub_bus, stub->istub_dev, stub->istub_func, + ddi_node_name(lookup->istub_dip), lookup->istub_vid, + lookup->istub_did); + mutex_exit(&imc->imc_lock); + pci_config_teardown(&stub->istub_cfgspace); + kmem_free(stub, sizeof (imc_stub_t)); + + return (DDI_FAILURE); + } + avl_insert(&imc->imc_stubs, stub, idx); + + if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE && + avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) { + imc->imc_flags |= IMC_F_ATTACH_DISPATCHED; + dispatch = B_TRUE; + } + mutex_exit(&imc->imc_lock); + + if (dispatch) { + (void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete, + imc, DDI_SLEEP); + } + + return (DDI_SUCCESS); +} + +static int +imc_open(dev_t *devp, 
int flag, int otyp, cred_t *credp) +{ + imc_t *imc = imc_data; + + if ((flag & (FEXCL | FNDELAY)) != 0) + return (EINVAL); + + if (otyp != OTYP_CHR) + return (EINVAL); + + mutex_enter(&imc->imc_lock); + + if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) { + mutex_exit(&imc->imc_lock); + return (ENOTSUP); + } + + /* + * It's possible that someone has come in during the window between when + * we've created the minor node and when we've finished doing work. + */ + if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) { + mutex_exit(&imc->imc_lock); + return (EAGAIN); + } + + /* + * It's not clear how someone would get a minor that we didn't create. + * But be paranoid and make sure. + */ + if (getminor(*devp) >= imc->imc_nsockets) { + mutex_exit(&imc->imc_lock); + return (EINVAL); + } + + /* + * Make sure this socket entry has been filled in. + */ + if (imc->imc_spointers[getminor(*devp)] == NULL) { + mutex_exit(&imc->imc_lock); + return (EINVAL); + } + + mutex_exit(&imc->imc_lock); + + return (0); +} + +static void +imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode) +{ + imc_decode_state_t dec; + uint_t i; + + bzero(&dec, sizeof (dec)); + if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) { + encode->mcei_err = (uint32_t)dec.ids_fail; + encode->mcei_errdata = dec.ids_fail_data; + return; + } + + encode->mcei_errdata = 0; + encode->mcei_err = 0; + encode->mcei_board = 0; + for (i = 0; i < imc->imc_nsockets; i++) { + if (imc->imc_spointers[i] == dec.ids_socket) + break; + } + encode->mcei_chip = i; + encode->mcei_mc = dec.ids_tadid; + encode->mcei_chan = dec.ids_channelid; + encode->mcei_dimm = dec.ids_dimmid; + encode->mcei_rank_addr = dec.ids_rankaddr; + encode->mcei_rank = dec.ids_rankid; + encode->mcei_row = UINT32_MAX; + encode->mcei_column = UINT32_MAX; + encode->mcei_pad = 0; +} + +static int +imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + mc_snapshot_info_t info; + mc_encode_ioc_t encode; + imc_t 
*imc = imc_data; + imc_socket_t *sock; + + mutex_enter(&imc->imc_lock); + m = getminor(dev); + if (m >= imc->imc_nsockets) { + ret = EINVAL; + goto done; + } + sock = imc->imc_spointers[m]; + if (sock == NULL) { + ret = EINVAL; + goto done; + } + + /* + * Note, other memory controller drivers don't check mode for reading + * data nor do they care who can read it from a credential perspective. + * As such we don't either at this time. + */ + switch (cmd) { + case MC_IOC_SNAPSHOT_INFO: + imc_nvl_pack(sock, B_FALSE); + if (sock->isock_buf == NULL) { + ret = EIO; + break; + } + + info.mcs_size = sock->isock_buflen; + info.mcs_gen = sock->isock_gen; + + if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_SNAPSHOT: + imc_nvl_pack(sock, B_FALSE); + if (sock->isock_buf == NULL) { + ret = EIO; + break; + } + + if (ddi_copyout(sock->isock_buf, (void *)arg, + sock->isock_buflen, mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_SNAPSHOT_INFO: + imc_decoder_pack(imc); + if (imc->imc_decoder_buf == NULL) { + ret = EIO; + break; + } + + info.mcs_size = imc->imc_decoder_len; + info.mcs_gen = imc->imc_spointers[0]->isock_gen; + + if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_SNAPSHOT: + imc_decoder_pack(imc); + if (imc->imc_decoder_buf == NULL) { + ret = EIO; + break; + } + + if (ddi_copyout(imc->imc_decoder_buf, (void *)arg, + imc->imc_decoder_len, mode) != 0) { + ret = EFAULT; + break; + } + + ret = 0; + break; + case MC_IOC_DECODE_PA: + if (crgetzoneid(credp) != GLOBAL_ZONEID || + drv_priv(credp) != 0) { + ret = EPERM; + break; + } + + if (ddi_copyin((void *)arg, &encode, sizeof (encode), + mode & FKIOCTL) != 0) { + ret = EPERM; + break; + } + + imc_ioctl_decode(imc, &encode); + ret = 0; + + if (ddi_copyout(&encode, (void *)arg, sizeof (encode), + mode & FKIOCTL) != 0) { + ret 
= EPERM; + break; + } + break; + default: + ret = EINVAL; + goto done; + } + +done: + mutex_exit(&imc->imc_lock); + return (ret); +} + +static int +imc_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + return (0); +} + +static int +imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + if (imc_data == NULL || imc_data->imc_dip != NULL) { + return (DDI_FAILURE); + } + + mutex_enter(&imc_data->imc_lock); + if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + imc_data->imc_dip = dip; + imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED; + mutex_exit(&imc_data->imc_lock); + + (void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data, + DDI_SLEEP); + + return (DDI_SUCCESS); +} + +/* + * We only export a single instance. + */ +static int +imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) +{ + /* + * getinfo(9E) shouldn't be called if we're not attached. But be + * paranoid. + */ + if (imc_data == NULL || imc_data->imc_dip == NULL) { + return (DDI_FAILURE); + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = imc_data->imc_dip; + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + break; + default: + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + if (imc_data == NULL || imc_data->imc_dip == NULL) { + return (DDI_FAILURE); + } + + mutex_enter(&imc_data->imc_lock); + + /* + * While a scan or attach is outstanding, don't allow us to detach. 
+ */ + if ((imc_data->imc_flags & + (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + /* + * Because the stub driver depends on the imc driver, we shouldn't be + * able to have any entries in this list when we detach. However, we + * check just to make sure. + */ + if (!avl_is_empty(&imc_data->imc_stubs)) { + mutex_exit(&imc_data->imc_lock); + return (DDI_FAILURE); + } + + nvlist_free(imc_data->imc_decoder_dump); + imc_data->imc_decoder_dump = NULL; + if (imc_data->imc_decoder_buf != NULL) { + kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len); + imc_data->imc_decoder_buf = NULL; + imc_data->imc_decoder_len = 0; + } + + ddi_remove_minor_node(imc_data->imc_dip, NULL); + imc_data->imc_dip = NULL; + mutex_exit(&imc_data->imc_lock); + + ddi_taskq_wait(imc_data->imc_taskq); + ddi_taskq_destroy(imc_data->imc_taskq); + imc_data->imc_taskq = NULL; + + return (DDI_SUCCESS); +} + +static void +imc_free(void) +{ + if (imc_data == NULL) { + return; + } + + VERIFY(avl_is_empty(&imc_data->imc_stubs)); + avl_destroy(&imc_data->imc_stubs); + mutex_destroy(&imc_data->imc_lock); + kmem_free(imc_data, sizeof (imc_t)); + imc_data = NULL; +} + +static void +imc_alloc(void) +{ + imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP); + + mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL); + avl_create(&imc_data->imc_stubs, imc_stub_comparator, + sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link)); +} + +static struct cb_ops imc_cb_ops = { + .cb_open = imc_open, + .cb_close = imc_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = imc_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_flag = D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +static struct dev_ops imc_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 
0, + .devo_getinfo = imc_getinfo, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = imc_attach, + .devo_detach = imc_detach, + .devo_reset = nodev, + .devo_cb_ops = &imc_cb_ops, + .devo_quiesce = ddi_quiesce_not_needed +}; + +static struct modldrv imc_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "Intel Integrated Memory Controller Driver", + .drv_dev_ops = &imc_dev_ops +}; + +static struct modlinkage imc_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &imc_modldrv, NULL } +}; + +int +_init(void) +{ + int ret; + + if ((ret = mod_install(&imc_modlinkage)) == 0) { + imc_alloc(); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&imc_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&imc_modlinkage)) == 0) { + imc_free(); + } + return (ret); +} diff --git a/usr/src/uts/intel/io/imc/imc.conf b/usr/src/uts/intel/io/imc/imc.conf new file mode 100644 index 0000000000..7f55dc2cae --- /dev/null +++ b/usr/src/uts/intel/io/imc/imc.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +name="imc" parent="pseudo" instance=0; diff --git a/usr/src/uts/intel/io/imc/imc.h b/usr/src/uts/intel/io/imc/imc.h new file mode 100644 index 0000000000..5f3def4930 --- /dev/null +++ b/usr/src/uts/intel/io/imc/imc.h @@ -0,0 +1,941 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _INTEL_IMC_H +#define _INTEL_IMC_H + +#include +#include +#include +#include + +/* + * This header file contains the definitions used for the various generations of + * the Intel IMC driver. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The maximum number of sockets that the IMC driver supports. This is currently + * determined by the Purley platforms (Skylake) which support up to 8 CPUs. + */ +#define IMC_MAX_SOCKETS 8 + +/* + * The maximum number of memory controllers that exist per socket. Currently all + * supported platforms (Sandy Bridge -> Skylake) support at most two. + */ +#define IMC_MAX_IMCPERSOCK 2 + +/* + * The maximum number of channels that exist per IMC. Currently Skylake supports + * 3 per IMC. On certain configurations of Haswell/Broadwell, there is only a + * single IMC which supports all 4 channels. + */ +#define IMC_MAX_CHANPERMC 4 + +/* + * The maximum number of DIMMs that exist per channel. On Skylake this is two + * DIMMs. However, Sandy Bridge through Broadwell support three. + */ +#define IMC_MAX_DIMMPERCHAN 3 + +/* + * The maximum number of rank disable bits per DIMM. This is currently + * consistent across all generations that have these bits. + */ +#define IMC_MAX_RANK_DISABLE 4 + +/* + * The number of different PCI buses that we need to record for a given + * platform. Pre-Skylake there are only two that are required, one for the IIO + * and one for the non-IIO. On Skylake, more PCI buses are used. + */ +#define IMC_MAX_PCIBUSES 3 + +/* + * Macros to take apart the node id for a given processor. These assume that + * we're reading the nodeid from the UBox and not from the SAD control. 
+ */ +#define IMC_NODEID_UBOX_MASK(x) ((x) & 0x7) + +/* + * On Ivy Bridge through Broadwell, the node id that is found in the SAD targets + * has the HA indicator as NodeID[2]. This means that the actual target node of + * the socket is NodeID[3] | NodeID[1:0]. + */ +#define IMC_NODEID_IVY_BRD_UPPER(x) BITX(x, 3, 3) +#define IMC_NODEID_IVY_BRD_LOWER(x) BITX(x, 1, 0) +#define IMC_NODEID_IVY_BRD_HA(x) BITX(x, 2, 2) + +/* + * Macros to take apart the MCMTR register bits that we care about. + */ +#define IMC_MCMTR_CLOSED_PAGE(x) BITX(x, 0, 0) +#define IMC_MCMTR_LOCKSTEP(x) BITX(x, 1, 1) +#define IMC_MCMTR_ECC_ENABLED(x) BITX(x, 2, 2) + +#define IMC_MCMTR_DDR4_HAS_BRD(x) BITX(x, 14, 14) + +/* + * Macros to take apart the dimmmtr_* registers in different generations. While + * there are similarities, these often end up different between generations and + * chips. These macros use a range of CPUs that they're valid for in the name. + * Macros with no suffix are valid for all currently supported CPUs. 
+ */ + +#define IMC_REG_MC_MTR0 0x80 +#define IMC_REG_MC_MTR1 0x84 +#define IMC_REG_MC_MTR2 0x88 + +#define IMC_MTR_CA_WIDTH(x) BITX(x, 1, 0) +#define IMC_MTR_CA_BASE 10 +#define IMC_MTR_CA_MIN 10 +#define IMC_MTR_CA_MAX 12 + +#define IMC_MTR_RA_WIDTH(x) BITX(x, 4, 2) +#define IMC_MTR_RA_BASE 12 +#define IMC_MTR_RA_MIN 13 +#define IMC_MTR_RA_MAX 18 + +#define IMC_MTR_DENSITY_IVY_BRD(x) BITX(x, 6, 5) +#define IMC_MTR_DENSITY_SKX(x) BITX(x, 7, 5) + +#define IMC_MTR_WIDTH_IVB_HAS(x) BITX(x, 8, 7) +#define IMC_MTR_WIDTH_BRD_SKX(x) BITX(x, 9, 8) + +#define IMC_MTR_DDR_RANKS(x) BITX(x, 13, 12) +#define IMC_MTR_DDR_RANKS_MAX 4 +#define IMC_MTR_DDR_RANKS_MAX_HAS_SKX 8 + +#define IMC_MTR_PRESENT_SNB_BRD(x) BITX(x, 14, 14) +#define IMC_MTR_PRESENT_SKYLAKE(x) BITX(x, 15, 15) + +#define IMC_MTR_RANK_DISABLE(x) BITX(x, 19, 16) + +#define IMC_MTR_DDR4_ENABLE_HAS_BRD(x) BITX(x, 20, 20) +#define IMC_MTR_HDRL_HAS_SKX(x) BITX(x, 21, 21) +#define IMC_MTR_HDRL_PARITY_HAS_SKX(x) BITX(x, 22, 22) +#define IMC_MTR_3DSRANKS_HAS_SKX(x) BITX(x, 24, 23) + +/* + * Data for the RASENABLES register. + */ +#define IMC_MC_MIRROR_SNB_BRD(x) BITX(x, 0, 0) + +/* + * The maximum number of SAD rules that exist on all supported platforms. + */ +#define IMC_MAX_SAD_RULES 24 + +/* + * The maximum number of targets that can be interleaved in a sad rule. + */ +#define IMC_MAX_SAD_INTERLEAVE 8 + +/* + * The maximum number of route entries that exist in SAD. This is only used on + * SKX. + */ +#define IMC_MAX_SAD_MCROUTES 6 + +/* + * Definitions used to decode the MC Route table. Note that at this time this is + * very Skylake specific (as it's the only platform it's supported on). + */ +#define IMC_REG_SKX_SAD_MC_ROUTE_TABLE 0xb4 +#define IMC_MC_ROUTE_RING_BITS 3 +#define IMC_MC_ROUTE_RING_MASK 0x7 +#define IMC_MC_ROUTE_CHAN_BITS 2 +#define IMC_MC_ROUTE_CHAN_MASK 0x3 +#define IMC_MC_ROUTE_CHAN_OFFSET 18 + +/* + * Definitions to help decode TOLM (top of low memory) and TOHM (top of high + * memory). 
The way this is done varies based on generation. These regions are + * currently always 64-MByte aligned + * + * On Sandy Bridge and Ivy Bridge the low four bits of TOLM are bits 31:28. TOHM + * is a single register. Bits 20:0 map to bits 45:25. Both registers represent + * the upper limit (as in one higher than the max DRAM value). + * + * On Haswell through Skylake, TOLM is represented as a 32-bit quantity. No + * shifting is required. However, only bits 31:26 are present. TOHM is spread + * out among two registers. The lower 32-bits is masked in a similar fashion. In + * both cases, these registers represent an inclusive range where we don't care + * about other bits. To deal with this we'll increment the lowest bit we care + * about to make it an exclusive range. + * + * Based on the above, we have opted to make both ranges in the IMC driver + * normalized to an _exclusive_ value. + * + * Ivy Bridge has the values in both the CBo SAD and a VT-d section; however, we + * use the CBo SAD which is why it looks like Sandy Bridge and not Haswell. + */ + +#define IMC_TOLM_SNB_IVY_MASK 0xf +#define IMC_TOLM_SNB_IVY_SHIFT 28 +#define IMC_TOHM_SNB_IVY_MASK 0x1fffff +#define IMC_TOHM_SNB_IVY_SHIFT 25 + +#define IMC_TOLM_HAS_SKX_MASK 0xfc000000 +#define IMC_TOLM_HAS_SKY_EXCL (1 << 26) +#define IMC_TOHM_LOW_HAS_SKX_MASK 0xfc000000 +#define IMC_TOHM_HAS_SKY_EXCL (1 << 26) + +/* + * Definitions to decode SAD values. These are sometimes subtlety different + * across generations. 
+ */ +#define IMC_SAD_DRAM_RULE_ENABLE(x) BITX(x, 0, 0) + +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(x) BITX(x, 1, 1) +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR 0 +#define IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6 1 + +#define IMC_SAD_DRAM_INTERLEAVE_SKX(x) BITX(x, 2, 1) +#define IMC_SAD_DRAM_INTERLEAVE_SKX_8t6 0 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_10t8 1 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_14t12 2 +#define IMC_SAD_DRAM_INTERLEAVE_SKX_32t30 3 + +#define IMC_SAD_DRAM_ATTR_SNB_BRD(x) BITX(x, 3, 2) +#define IMC_SAD_DRAM_ATTR_SKX(x) BITX(x, 4, 3) +#define IMC_SAD_DRAM_ATTR_DRAM 0 +#define IMC_SAD_DRAM_ATTR_MMCFG 1 +#define IMC_SAD_DRAM_ATTR_NXM 2 + +#define IMC_SAD_DRAM_MOD23_SKX(x) BITX(x, 6, 5) +#define IMC_SAD_DRAM_MOD23_MOD3 0 +#define IMC_SAD_DRAM_MOD23_MOD2_C01 1 +#define IMC_SAD_DRAM_MOD23_MOD2_C12 2 +#define IMC_SAD_DRAM_MOD23_MOD2_C02 3 + +#define IMC_SAD_DRAM_LIMIT_SNB_BRD(x) BITX(x, 25, 6) +#define IMC_SAD_DRAM_LIMIT_SKX(x) BITX(x, 26, 7) +#define IMC_SAD_DRAM_LIMIT_SHIFT 26 +#define IMC_SAD_DRAM_LIMIT_EXCLUSIVE (1 << IMC_SAD_DRAM_LIMIT_SHIFT) + +#define IMC_SAD_DRAM_A7_IVB_BRD(x) BITX(x, 26, 26) +#define IMC_SAD_DRAM_MOD3_SKX(x) BITX(x, 27, 27) +#define IMC_SAD_DRAM_MOD3_MODE_SKX(x) BITX(x, 31, 30) +#define IMC_SAD_DRAM_MOD3_MODE_45t6 0 +#define IMC_SAD_DRAM_MOD3_MODE_45t8 1 +#define IMC_SAD_DRAM_MOD3_MODE_45t12 2 + +#define IMC_SAD_ILEAVE_SNB_MASK 0x7 +#define IMC_SAD_ILEAVE_SNB_LEN 3 +#define IMC_SAD_ILEAVE_IVB_SKX_MASK 0xf +#define IMC_SAD_ILEAVE_IVB_SKX_LEN 4 + +/* + * The interleave targets on Skylake use the upper bit to indicate whether it is + * referring to a local memory controller or if it actually refers to another + * node that is far away. The maximum value includes the upper bit which is used + * to indicate whether it is remote or far. + */ +#define IMC_SAD_ILEAVE_SKX_LOCAL(x) BITX(x, 3, 3) +#define IMC_SAD_ILEAVE_SKX_TARGET(x) BITX(x, 2, 0) +#define IMC_SAD_ILEAVE_SKX_MAX 0xf + +/* + * Maximum number of TAD tables that we need to consider. 
On Sandy Bridge + * through Broadwell this is based on the number of home agents that are present + * in the system. On Sandy Bridge there is one, on others, there are up to two. + * On Skylake, there is one TAD per IMC. + */ +#define IMC_MAX_TAD 2 + +/* + * Maximum number of TAD rules on any of the supported processors. + */ +#define IMC_MAX_TAD_RULES 12 + +/* + * Maximum number of interleave targets. Note, this only applies to Sandy Bridge + * through Broadwell. Skylake gets this information in another form. + */ +#define IMC_MAX_TAD_TARGETS 4 + +/* + * Offset between the base TAD rule and the corresponding wayness rule on + * Skylake. + */ +#define IMC_SKX_WAYNESS_OFFSET 0x30 + +/* + * Various macros to decode the TAD rules. + */ +#define IMC_TAD_LIMIT(x) BITX(x, 31, 12) +#define IMC_TAD_LIMIT_SHIFT 26 +#define IMC_TAD_LIMIT_EXCLUSIVE (1 << IMC_TAD_LIMIT_SHIFT) + +#define IMC_TAD_SOCK_WAY(x) BITX(x, 11, 10) +#define IMC_TAD_SOCK_WAY_1 0 +#define IMC_TAD_SOCK_WAY_2 1 +#define IMC_TAD_SOCK_WAY_4 2 +#define IMC_TAD_SOCK_WAY_8 3 +#define IMC_TAD_CHAN_WAY(x) BITX(x, 9, 8) +#define IMC_TAD_TARG3(x) BITX(x, 7, 6) +#define IMC_TAD_TARG2(x) BITX(x, 5, 4) +#define IMC_TAD_TARG1(x) BITX(x, 3, 2) +#define IMC_TAD_TARG0(x) BITX(x, 1, 0) + +#define IMC_TAD_SNB_BRD_NTARGETS 4 + +/* + * These are registers specific to the Skylake and newer TAD BASE registers. 
+ */ +#define IMC_TAD_BASE_BASE(x) BITX(x, 31, 12) +#define IMC_TAD_BASE_SHIFT 26 + +#define IMC_TAD_BASE_CHAN_GRAN(x) BITX(x, 7, 6) +#define IMC_TAD_BASE_CHAN_GRAN_64B 0 +#define IMC_TAD_BASE_CHAN_GRAN_256B 1 +#define IMC_TAD_BASE_CHAN_GRAN_4KB 2 + +#define IMC_TAD_BASE_SOCK_GRAN(x) BITX(x, 5, 4) +#define IMC_TAD_BASE_SOCK_GRAN_64B 0 +#define IMC_TAD_BASE_SOCK_GRAN_256B 1 +#define IMC_TAD_BASE_SOCK_GRAN_4KB 2 +#define IMC_TAD_BASE_SOCK_GRAN_1GB 3 + +#define IMC_TADCHAN_OFFSET_SNB_BRD(x) BITX(x, 25, 6) +#define IMC_TADCHAN_OFFSET_SKX(x) BITX(x, 23, 4) +#define IMC_TADCHAN_OFFSET_SHIFT 26 + +/* + * Macros to get at various TAD features. + */ +#define IMC_TAD_SYSDEF_LOCKSTEP(x) BITX(x, 7, 7) +#define IMC_TAD_SYSDEF2_SHIFTUP(x) BITX(x, 22, 22) +#define IMC_TAD_SYSDEF2_CHANHASH(x) BITX(x, 21, 21) + +/* + * Maximum number of different wayness entries that exist across the various IMC + * generations. Each wayness then has a maximum number of target entries. + */ +#define IMC_MAX_RANK_WAYS 5 +#define IMC_MAX_RANK_INTERLEAVES 8 + +/* + * Macros to take apart the rank interleave wayness and offset registers. + */ +#define IMC_RIR_WAYNESS_ENABLED(x) BITX(x, 31, 31) +#define IMC_RIR_WAYNESS_WAY(x) BITX(x, 29, 28) +#define IMC_RIR_LIMIT_HAS_SKX(x) BITX(x, 11, 1) +#define IMC_RIR_LIMIT_SNB_IVB(x) BITX(x, 10, 1) +#define IMC_RIR_LIMIT_SHIFT 29 +#define IMC_RIR_LIMIT_EXCLUSIVE (1 << IMC_RIR_LIMIT_SHIFT) + +/* + * Currently, everything other than Broadwell has the same value for the target + * offset. + */ +#define IMC_RIR_OFFSET_TARGET_BRD(x) BITX(x, 23, 20) +#define IMC_RIR_OFFSET_TARGET(x) BITX(x, 19, 16) +#define IMC_RIR_OFFSET_OFFSET_HAS_SKX(x) BITX(x, 15, 2) +#define IMC_RIR_OFFSET_OFFSET_SNB_IVB(x) BITX(x, 14, 2) +#define IMC_RIR_OFFSET_SHIFT 29 + +/* + * Definitions to cover manipulations of open and closed pages. + */ +#define IMC_PAGE_BITS_CLOSED 6 +#define IMC_PAGE_BITS_OPEN 13 + +/* + * Macros to decode and understand the CPUBUSNO registers in the UBOX_DECS. 
+ */ +#define IMC_UBOX_CPUBUSNO_0(x) BITX(x, 7, 0) +#define IMC_UBOX_CPUBUSNO_1(x) BITX(x, 15, 8) +#define IMC_UBOX_CPUBUSNO_2(x) BITX(x, 23, 16) + +/* + * Hardware generations supported by the IMC driver. + */ +typedef enum { + IMC_GEN_UNKNOWN = 0, + IMC_GEN_SANDY, + IMC_GEN_IVY, + IMC_GEN_HASWELL, + IMC_GEN_BROADWELL, + /* + * IMC_GEN_SKYLAKE also covers Cascade Lake. The two are similar to the + * point of even having the same PCI IDs for all of the devices. The + * only difference in the cpuid signature between them is the stepping, + * hence we do not have a separate Cascade Lake target here, as it's + * really the same as Skylake. + */ + IMC_GEN_SKYLAKE +} imc_gen_t; + +/* + * Generation specific limits. + */ +typedef struct imc_gen_data { + uint_t igd_max_sockets; + uint_t igd_max_imcs; + uint_t igd_max_channels; + uint_t igd_max_dimms; + uint_t igd_max_ranks; + uint_t igd_mtr_offsets[IMC_MAX_DIMMPERCHAN]; + uint_t igd_mcmtr_offset; + uint_t igd_topo_offset; + uint_t igd_num_mcroutes; + uint_t igd_tolm_offset; + uint_t igd_tohm_low_offset; + uint_t igd_tohm_hi_offset; + uint_t igd_sad_dram_offset; + uint_t igd_sad_ndram_rules; + uint_t igd_sad_nodeid_offset; + uint_t igd_tad_nrules; + uint_t igd_tad_rule_offset; + uint_t igd_tad_chan_offset; + uint_t igd_tad_sysdef; + uint_t igd_tad_sysdef2; + uint_t igd_mc_mirror; + uint_t igd_rir_nways; + uint_t igd_rir_way_offset; + uint_t igd_rir_nileaves; + uint_t igd_rir_ileave_offset; + uint_t igd_ubox_cpubusno_offset; +} imc_gen_data_t; + +/* + * Different types of PCI devices that show up on the core that we may need to + * attach to. 
+ */ +typedef enum { + IMC_TYPE_UNKNOWN = 0, + IMC_TYPE_MC0_M2M, /* SKX Only */ + IMC_TYPE_MC1_M2M, /* SKX Only */ + IMC_TYPE_MC0_MAIN0, + IMC_TYPE_MC0_MAIN1, + IMC_TYPE_MC1_MAIN0, + IMC_TYPE_MC1_MAIN1, + IMC_TYPE_MC0_CHANNEL0, + IMC_TYPE_MC0_CHANNEL1, + IMC_TYPE_MC0_CHANNEL2, + IMC_TYPE_MC0_CHANNEL3, + IMC_TYPE_MC1_CHANNEL0, + IMC_TYPE_MC1_CHANNEL1, + IMC_TYPE_MC1_CHANNEL2, + IMC_TYPE_MC1_CHANNEL3, + IMC_TYPE_SAD_DRAM, + IMC_TYPE_SAD_MMIO, + /* + * We want to note which device has the TOLM and TOHM registers. + * Unfortunately this is a rather complicated affair. On Sandy Bridge + * they are a part of the IMC_TYPE_SAD_MMIO. On Ivy Bridge, it's on its + * own dedicated device on the CBo. + * + * On Haswell onward, these move to the VT-D misc. registers. On Haswell + * and Broadwell, only one of these exist in the system. However, on + * Skylake these exist per socket. + */ + IMC_TYPE_SAD_MISC, + IMC_TYPE_VTD_MISC, + /* + * On SKX this exists on a per-core basis. It contains the memory + * controller routing table. + */ + IMC_TYPE_SAD_MCROUTE, + IMC_TYPE_UBOX, + IMC_TYPE_UBOX_CPUBUSNO, + IMC_TYPE_HA0, + IMC_TYPE_HA1, +} imc_type_t; + +/* + * Each entry in the stub table represents a device that we might attach to in a + * given generation. This is only defined in the kernel to make it easier to + * build the imc decoder in userland for testing. 
+ */ +#ifdef _KERNEL +typedef struct imc_stub_table { + imc_gen_t imcs_gen; + imc_type_t imcs_type; + uint16_t imcs_devid; + uint16_t imcs_pcidev; + uint16_t imcs_pcifunc; + const char *imcs_desc; +} imc_stub_table_t; + +typedef struct imc_stub { + avl_node_t istub_link; + dev_info_t *istub_dip; + uint16_t istub_vid; + uint16_t istub_did; + uint16_t istub_bus; + uint16_t istub_dev; + uint16_t istub_func; + ddi_acc_handle_t istub_cfgspace; + const imc_stub_table_t *istub_table; +} imc_stub_t; +#else +typedef struct imc_stub { + void *istub_unused; +} imc_stub_t; +#endif /* _KERNEL */ + +typedef enum { + IMC_F_UNSUP_PLATFORM = (1 << 0), + IMC_F_SCAN_DISPATCHED = (1 << 1), + IMC_F_SCAN_COMPLETE = (1 << 2), + IMC_F_ATTACH_DISPATCHED = (1 << 3), + IMC_F_ATTACH_COMPLETE = (1 << 4), + IMC_F_MCREG_FAILED = (1 << 5), + IMC_F_VALIDATE_FAILED = (1 << 6) +} imc_flags_t; + +#define IMC_F_ALL_FLAGS (IMC_F_UNSUP_PLATFORM | IMC_F_SCAN_DISPATCHED | \ + IMC_F_SCAN_COMPLETE | IMC_F_ATTACH_DISPATCHED | IMC_F_ATTACH_COMPLETE | \ + IMC_F_MCREG_FAILED | IMC_F_VALIDATE_FAILED) + +typedef enum imc_dimm_type { + IMC_DIMM_UNKNOWN, + IMC_DIMM_DDR3, + IMC_DIMM_DDR4, + IMC_DIMM_NVDIMM +} imc_dimm_type_t; + +typedef enum imc_dimm_valid { + IMC_DIMM_V_VALID = 0, + IMC_DIMM_V_BAD_PCI_READ = (1 << 0), + IMC_DIMM_V_BAD_ROWS = (1 << 1), + IMC_DIMM_V_BAD_COLUMNS = (1 << 2), + IMC_DIMM_V_BAD_DENSITY = (1 << 3), + IMC_DIMM_V_BAD_WIDTH = (1 << 4), + IMC_DIMM_V_BAD_RANKS = (1 << 5) +} imc_dimm_valid_t; + +typedef struct imc_dimm { + imc_dimm_valid_t idimm_valid; + boolean_t idimm_present; + uint8_t idimm_3dsranks; + boolean_t idimm_hdrl_parity; + boolean_t idimm_hdrl; + boolean_t idimm_ranks_disabled[IMC_MAX_RANK_DISABLE]; + uint8_t idimm_nbanks; + uint8_t idimm_nranks; + uint8_t idimm_width; + uint8_t idimm_density; /* In GiB */ + uint8_t idimm_nrows; + uint8_t idimm_ncolumns; + /* Synthesized */ + uint64_t idimm_size; + /* Raw data */ + uint32_t idimm_mtr; +} imc_dimm_t; + +typedef struct 
imc_rank_ileave_entry { + uint8_t irle_target; + uint64_t irle_offset; +} imc_rank_ileave_entry_t; + +typedef struct imc_rank_ileave { + boolean_t irle_enabled; + uint32_t irle_raw; + uint8_t irle_nways; + uint8_t irle_nwaysbits; + uint64_t irle_limit; + uint_t irle_nentries; + imc_rank_ileave_entry_t irle_entries[IMC_MAX_RANK_INTERLEAVES]; +} imc_rank_ileave_t; + +typedef enum imc_channel_valid { + IMC_CHANNEL_V_VALID = 0, + IMC_CHANNEL_V_BAD_PCI_READ = 1 << 0, +} imc_channel_valid_t; + +typedef struct imc_channel { + imc_channel_valid_t ich_valid; + imc_stub_t *ich_desc; + uint_t ich_ndimms; + imc_dimm_t ich_dimms[IMC_MAX_DIMMPERCHAN]; + uint_t ich_ntad_offsets; + uint32_t ich_tad_offsets_raw[IMC_MAX_TAD_RULES]; + uint64_t ich_tad_offsets[IMC_MAX_TAD_RULES]; + uint_t ich_nrankileaves; + imc_rank_ileave_t ich_rankileaves[IMC_MAX_RANK_WAYS]; +} imc_channel_t; + +typedef struct imc_controller { + imc_stub_t *icn_main0; + imc_stub_t *icn_main1; + imc_stub_t *icn_m2m; + boolean_t icn_invalid; + imc_dimm_type_t icn_dimm_type; + boolean_t icn_ecc; + boolean_t icn_lockstep; + boolean_t icn_closed; + uint32_t icn_topo; + uint_t icn_nchannels; + imc_channel_t icn_channels[IMC_MAX_CHANPERMC]; +} imc_mc_t; + +typedef enum imc_sad_rule_type { + IMC_SAD_TYPE_DRAM, + IMC_SAD_TYPE_MMCFG, + IMC_SAD_TYPE_NXM +} imc_sad_rule_type_t; + +typedef enum imc_sad_rule_imode { + IMC_SAD_IMODE_8t6, + IMC_SAD_IMODE_8t6XOR, + IMC_SAD_IMODE_10t8, + IMC_SAD_IMODE_14t12, + IMC_SAD_IMODE_32t30 +} imc_sad_rule_imode_t; + +typedef enum imc_sad_rule_mod_mode { + IMC_SAD_MOD_MODE_NONE, + IMC_SAD_MOD_MODE_45t6, + IMC_SAD_MOD_MODE_45t8, + IMC_SAD_MOD_MODE_45t12 +} imc_sad_rule_mod_mode_t; + +typedef enum imc_sad_rule_mod_type { + IMC_SAD_MOD_TYPE_NONE, + IMC_SAD_MOD_TYPE_MOD3, + IMC_SAD_MOD_TYPE_MOD2_01, + IMC_SAD_MOD_TYPE_MOD2_12, + IMC_SAD_MOD_TYPE_MOD2_02 +} imc_sad_rule_mod_type_t; + +typedef struct imc_sad_mcroute_entry { + uint8_t ismce_imc; /* ID of the target IMC */ + uint8_t ismce_pchannel; /* 
ID of the target physical channel */ +} imc_sad_mcroute_entry_t; + +typedef struct imc_sad_mcroute_table { + uint32_t ismc_raw_mcroute; + uint_t ismc_nroutes; + imc_sad_mcroute_entry_t ismc_mcroutes[IMC_MAX_SAD_MCROUTES]; +} imc_sad_mcroute_table_t; + +/* + * This rule represents a single SAD entry. + */ +typedef struct imc_sad_rule { + uint32_t isr_raw_dram; + uint32_t isr_raw_interleave; + boolean_t isr_enable; + boolean_t isr_a7mode; + boolean_t isr_need_mod3; + uint64_t isr_limit; + imc_sad_rule_type_t isr_type; + imc_sad_rule_imode_t isr_imode; + imc_sad_rule_mod_mode_t isr_mod_mode; + imc_sad_rule_mod_type_t isr_mod_type; + uint_t isr_ntargets; + uint8_t isr_targets[IMC_MAX_SAD_INTERLEAVE]; +} imc_sad_rule_t; + +typedef enum imc_sad_flags { + IMC_SAD_MCROUTE_VALID = 1 << 0, +} imc_sad_flags_t; + +typedef enum imc_sad_valid { + IMC_SAD_V_VALID = 0, + IMC_SAD_V_BAD_PCI_READ = 1 << 0, + IMC_SAD_V_BAD_MCROUTE = 1 << 1, + IMC_SAD_V_BAD_DRAM_ATTR = 1 << 2, + IMC_SAD_V_BAD_MOD3 = 1 << 3, +} imc_sad_valid_t; + +typedef struct imc_sad { + imc_sad_flags_t isad_flags; + imc_sad_valid_t isad_valid; + imc_stub_t *isad_dram; + imc_stub_t *isad_mmio; + imc_stub_t *isad_tolh; + uint64_t isad_tolm; + uint64_t isad_tohm; + uint_t isad_nrules; + imc_sad_rule_t isad_rules[IMC_MAX_SAD_RULES]; + imc_sad_mcroute_table_t isad_mcroute; +} imc_sad_t; + +typedef enum imc_tad_gran { + IMC_TAD_GRAN_64B = 0, + IMC_TAD_GRAN_256B, + IMC_TAD_GRAN_4KB, + IMC_TAD_GRAN_1GB +} imc_tad_gran_t; + +typedef struct imc_tad_rule { + uint64_t itr_base; + uint64_t itr_limit; + uint32_t itr_raw; + uint32_t itr_raw_gran; + uint8_t itr_sock_way; + uint8_t itr_chan_way; + imc_tad_gran_t itr_sock_gran; + imc_tad_gran_t itr_chan_gran; + uint_t itr_ntargets; + uint8_t itr_targets[IMC_MAX_TAD_TARGETS]; +} imc_tad_rule_t; + +typedef enum imc_tad_valid { + IMC_TAD_V_VALID = 1 << 0, + IMC_TAD_V_BAD_PCI_READ = 1 << 1, + IMC_TAD_V_BAD_CHAN_GRAN = 1 << 2 +} imc_tad_valid_t; + +typedef enum imc_tad_flags { + 
IMC_TAD_FLAG_CHANSHIFT = 1 << 0, + IMC_TAD_FLAG_CHANHASH = 1 << 1, + IMC_TAD_FLAG_MIRROR = 1 << 2, + IMC_TAD_FLAG_LOCKSTEP = 1 << 3 +} imc_tad_flags_t; + +typedef struct imc_tad { + imc_tad_valid_t itad_valid; + imc_stub_t *itad_stub; + imc_tad_flags_t itad_flags; + uint_t itad_nrules; + imc_tad_rule_t itad_rules[IMC_MAX_TAD_RULES]; +} imc_tad_t; + +typedef enum imc_socket_valid { + IMC_SOCKET_V_VALID = 0, + IMC_SOCKET_V_BAD_NODEID = 1 << 0 +} imc_socket_valid_t; + +typedef struct imc_socket { + imc_socket_valid_t isock_valid; + uint_t isock_bus[IMC_MAX_PCIBUSES]; + uint_t isock_nbus; + uint_t isock_gen; + nvlist_t *isock_nvl; + char *isock_buf; + size_t isock_buflen; + imc_sad_t isock_sad; + uint_t isock_ntad; + imc_tad_t isock_tad[IMC_MAX_TAD]; + imc_stub_t *isock_ubox; + imc_stub_t *isock_cpubusno; + uint32_t isock_nodeid; + uint_t isock_nimc; + imc_mc_t isock_imcs[IMC_MAX_IMCPERSOCK]; +} imc_socket_t; + +typedef struct imc { + /* + * The initial members here are only used in the kernel. This is done to + * make it easier for us to be able to define a version of this to use + * in testing. + */ +#ifdef _KERNEL + dev_info_t *imc_dip; + kmutex_t imc_lock; + imc_flags_t imc_flags; + const imc_gen_data_t *imc_gen_data; + ddi_taskq_t *imc_taskq; + uint_t imc_nscanned; + avl_tree_t imc_stubs; + nvlist_t *imc_decoder_dump; + char *imc_decoder_buf; + size_t imc_decoder_len; +#endif /* _KERNEL */ + imc_gen_t imc_gen; + + /* + * Data about the memory in the system + */ + uint_t imc_nsockets; + imc_socket_t imc_sockets[IMC_MAX_SOCKETS]; + +#ifdef _KERNEL + /* + * The imc_sockets[] array is organized based on increasing PCI Bus ID. + * This array maps the socket id that user land thinks of back to the + * actual underlying socket in case hardware does not put them in order. + */ + imc_socket_t *imc_spointers[IMC_MAX_SOCKETS]; + + /* + * Store the IIO global VT-D misc. device. While there are sometimes + * multiple on the system, we only keep a single one around. 
+ */ + imc_stub_t *imc_gvtd_misc; +#endif +} imc_t; + + +/* + * Decoder failure reasons + */ +typedef enum imc_decode_failure { + IMC_DECODE_F_NONE = 0, + /* + * Indicates that the memory address fell into a reserved legacy range. + * The legacy range index is stored in the failure data. + */ + IMC_DECODE_F_LEGACY_RANGE, + /* + * Indicates that we had bad socket data. The socket in question is + * noted in the failure data. + */ + IMC_DECODE_F_BAD_SOCKET, + /* + * Indicates that we had bad SAD data. The socket the SAD is associated + * with is noted in the failure data. + */ + IMC_DECODE_F_BAD_SAD, + /* + * Indicates that the address was not contained in conventional, low, + * or high memory. + */ + IMC_DECODE_F_OUTSIDE_DRAM, + /* + * Indicates that no valid SAD rule was found for the address. + */ + IMC_DECODE_F_NO_SAD_RULE, + /* + * Indicates that the SAD interleave target was beyond the valid index. + */ + IMC_DECODE_F_BAD_SAD_INTERLEAVE, + /* + * Indicates that the route suggested a remote processor we can't find. + */ + IMC_DECODE_F_BAD_REMOTE_MC_ROUTE, + /* + * Indicates that we ended up in a loop trying to find the right socket + * to use. + */ + IMC_DECODE_F_SAD_SEARCH_LOOP, + /* + * Indicates that we encountered a SAD rule that asked for inconsistent + * mod rules. + */ + IMC_DECODE_F_SAD_BAD_MOD, + /* + * Indicates that the socket or tad rule we found doesn't actually point + * to something that we know about. + */ + IMC_DECODE_F_SAD_BAD_SOCKET, + IMC_DECODE_F_SAD_BAD_TAD, + /* + * Indicates that we could not find a matching tad rule. + */ + IMC_DECODE_F_NO_TAD_RULE, + /* + * Indicates that we encountered the TAD channel 3-way interleave that + * we don't support. + */ + IMC_DECODE_F_TAD_3_ILEAVE, + /* + * Indicates that we had a bad target index. + */ + IMC_DECODE_F_TAD_BAD_TARGET_INDEX, + /* + * Indicates that we have a bad channel ID. 
+ */ + IMC_DECODE_F_BAD_CHANNEL_ID, + /* + * Indicates that the TAD rule offset in the channel interleave was + * incorrect. + */ + IMC_DECODE_F_BAD_CHANNEL_TAD_OFFSET, + /* + * We couldn't find a valid rank interleave rule. + */ + IMC_DECODE_F_NO_RIR_RULE, + /* + * Indicates that the index of the rank interleaving target was bad. + */ + IMC_DECODE_F_BAD_RIR_ILEAVE_TARGET, + /* + * Indicates that the calculated DIMM represents an invalid DIMM that is + * beyond the number of supported DIMMS per channel on the platform. + */ + IMC_DECODE_F_BAD_DIMM_INDEX, + /* + * Indicates that the specified DIMM is not preset; however, it is a + * valid DIMM number. + */ + IMC_DECODE_F_DIMM_NOT_PRESENT, + /* + * Indicates that the specified rank on the DIMM is more than the number + * of ranks that the DIMM has. + */ + IMC_DECODE_F_BAD_DIMM_RANK, + /* + * Indicates that the channel offset is larger than the system address, + * meaning that we would end up with an underflow if we continued. The + * equivalent is true for the rank address. + */ + IMC_DECODE_F_CHANOFF_UNDERFLOW, + IMC_DECODE_F_RANKOFF_UNDERFLOW, +} imc_decode_failure_t; + +/* + * Decoder state tracking + */ +typedef struct imc_decode_state { + imc_decode_failure_t ids_fail; + uint64_t ids_fail_data; + uint64_t ids_pa; + uint64_t ids_chanaddr; + uint64_t ids_rankaddr; + uint32_t ids_nodeid; + uint32_t ids_tadid; + uint32_t ids_channelid; + uint32_t ids_physrankid; + uint32_t ids_dimmid; + uint32_t ids_rankid; + const imc_socket_t *ids_socket; + const imc_sad_t *ids_sad; + const imc_sad_rule_t *ids_sad_rule; + const imc_tad_t *ids_tad; + const imc_tad_rule_t *ids_tad_rule; + const imc_mc_t *ids_mc; + const imc_channel_t *ids_chan; + const imc_rank_ileave_t *ids_rir; + const imc_dimm_t *ids_dimm; +} imc_decode_state_t; + +#ifdef _KERNEL + +/* + * Functions needed for the stub drivers. 
+ */ +extern int imc_attach_stub(dev_info_t *, ddi_attach_cmd_t); +extern int imc_detach_stub(dev_info_t *, ddi_detach_cmd_t); + +/* + * Decoder related functions + */ +extern void imc_decoder_init(imc_t *); + +extern nvlist_t *imc_dump_decoder(imc_t *); +#else /* !_KERNEL */ +extern boolean_t imc_restore_decoder(nvlist_t *, imc_t *); +#endif /* _KERNEL */ + +extern boolean_t imc_decode_pa(const imc_t *, uint64_t, imc_decode_state_t *); + + +#ifdef __cplusplus +} +#endif + +#endif /* _INTEL_IMC_H */ diff --git a/usr/src/uts/intel/io/imc/imcstub.c b/usr/src/uts/intel/io/imc/imcstub.c new file mode 100644 index 0000000000..ee020dd5c4 --- /dev/null +++ b/usr/src/uts/intel/io/imc/imcstub.c @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * This is a stub driver that is used by the main imcstub driver to attach + * component PCI devices so that it can access their dev_info_t. 
+ */ + +#include +#include +#include +#include +#include + +#include "imc.h" + + +static int +imcstub_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + return (imc_attach_stub(dip, cmd)); +} + +static int +imcstub_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + return (imc_detach_stub(dip, cmd)); +} + +static struct dev_ops imcstub_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = nodev, + .devo_identify = nodev, + .devo_probe = nulldev, + .devo_attach = imcstub_attach, + .devo_detach = imcstub_detach, + .devo_reset = nodev, + .devo_quiesce = ddi_quiesce_not_needed +}; + +static struct modldrv imcstub_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "IMC Stub driver", + .drv_dev_ops = &imcstub_dev_ops +}; + +static struct modlinkage imcstub_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &imcstub_modldrv, NULL } +}; + +int +_init(void) +{ + return (mod_install(&imcstub_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&imcstub_modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&imcstub_modlinkage)); +} -- cgit v1.2.3