summaryrefslogtreecommitdiff
path: root/usr/src/uts/i86pc/os/cpuid.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/i86pc/os/cpuid.c')
-rw-r--r--usr/src/uts/i86pc/os/cpuid.c1771
1 files changed, 1379 insertions, 392 deletions
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index 1291b6180d..ddc09a4951 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -34,9 +34,852 @@
/*
* Copyright (c) 2019, Joyent, Inc.
*/
+
/*
- * Various routines to handle identification
- * and classification of x86 processors.
+ * CPU Identification logic
+ *
+ * The purpose of this file and its companion, cpuid_subr.c, is to help deal
+ * with the identification of CPUs, their features, and their topologies. More
+ * specifically, this file helps drive the following:
+ *
+ * 1. Enumeration of features of the processor which are used by the kernel to
+ * determine what features to enable or disable. These may be instruction set
+ * enhancements or features that we use.
+ *
+ * 2. Enumeration of instruction set architecture (ISA) additions that userland
+ * will be told about through the auxiliary vector.
+ *
+ * 3. Understanding the physical topology of the CPU such as the number of
+ * caches, how many cores it has, whether or not it supports symmetric
+ * multi-processing (SMT), etc.
+ *
+ * ------------------------
+ * CPUID History and Basics
+ * ------------------------
+ *
+ * The cpuid instruction was added by Intel roughly around the time that the
+ * original Pentium was introduced. The purpose of cpuid was to tell in a
+ * programmatic fashion information about the CPU that previously was guessed
+ * at. For example, an important part of cpuid is that we can know what
+ * extensions to the ISA exist. If you use an invalid opcode you would get a
+ * #UD, so this method allows a program (whether a user program or the kernel)
+ * to determine what exists without crashing or getting a SIGILL. Of course,
+ * this was also during the era of the clones and the AMD Am5x86. The vendor
+ * name shows up first in cpuid for a reason.
+ *
+ * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
+ * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
+ * its own meaning. The different leaves are broken down into different regions:
+ *
+ * [ 0, 7fffffff ] This region is called the 'basic'
+ * region. This region is generally defined
+ * by Intel, though some of the original
+ * portions have different meanings based
+ * on the manufacturer. These days, Intel
+ * adds most new features to this region.
+ * AMD adds non-Intel compatible
+ * information in the third, extended
+ * region. Intel uses this for everything
+ * including ISA extensions, CPU
+ * features, cache information, topology,
+ * and more.
+ *
+ * There is a hole carved out of this
+ * region which is reserved for
+ * hypervisors.
+ *
+ * [ 40000000, 4fffffff ] This region, which is found in the
+ * middle of the previous region, is
+ * explicitly promised to never be used by
+ * CPUs. Instead, it is used by hypervisors
+ * to communicate information about
+ * themselves to the operating system. The
+ * values and details are unique for each
+ * hypervisor.
+ *
+ * [ 80000000, ffffffff ] This region is called the 'extended'
+ * region. Some of the low leaves mirror
+ * parts of the basic leaves. This region
+ * has generally been used by AMD for
+ * various extensions. For example, AMD-
+ * specific information about caches,
+ * features, and topology are found in this
+ * region.
+ *
+ * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx,
+ * and %edx, and then issue the cpuid instruction. At the first leaf in each of
+ * the ranges, one of the primary things returned is the maximum valid leaf in
+ * that range. This allows for discovery of what range of CPUID is valid.
+ *
+ * The CPUs have potentially surprising behavior when using an invalid leaf or
+ * unimplemented leaf. If the requested leaf is within the valid basic or
+ * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
+ * set to zero. However, if you specify a leaf that is outside of a valid range,
+ * then instead it will be filled with the last valid _basic_ leaf. For example,
+ * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
+ * an invalid extended leaf will return the information for leaf 3.
+ *
+ * Some leaves are broken down into sub-leaves. This means that the value
+ * depends on both the leaf asked for in %eax and a secondary register. For
+ * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
+ * additional information. Or when getting topology information in leaf 0xb, the
+ * initial value in %ecx changes which level of the topology that you are
+ * getting information about.
+ *
+ * cpuid values are always kept to 32 bits regardless of whether or not the
+ * program is in 64-bit mode. When executing in 64-bit mode, the upper
+ * 32 bits of the register are always set to zero so that way the values are the
+ * same regardless of execution mode.
+ *
+ * ----------------------
+ * Identifying Processors
+ * ----------------------
+ *
+ * We can identify a processor in two steps. The first step looks at cpuid leaf
+ * 0. Leaf 0 contains the processor's vendor information. This is done by
+ * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is
+ * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
+ *
+ * From there, a processor is identified by a combination of three different
+ * values:
+ *
+ * 1. Family
+ * 2. Model
+ * 3. Stepping
+ *
+ * Each vendor uses the family and model to uniquely identify a processor. The
+ * way that family and model are changed depends on the vendor. For example,
+ * Intel has been using family 0x6 for almost all of their processors since the
+ * Pentium Pro/Pentium II era, often called the P6. The model is used to
+ * identify the exact processor. Different models are often used for the client
+ * (consumer) and server parts. Even though each processor often has major
+ * architectural differences, they still are considered the same family by
+ * Intel.
+ *
+ * On the other hand, each major AMD architecture generally has its own family.
+ * For example, the K8 is family 0xf, Bulldozer 0x15, and Zen 0x17. Within it
+ * the model number is used to help identify specific processors.
+ *
+ * The stepping is used to refer to a revision of a specific microprocessor. The
+ * term comes from equipment used to produce masks that are used to create
+ * integrated circuits.
+ *
+ * The information is present in leaf 1, %eax. In technical documentation you
+ * will see the terms extended model and extended family. The original family,
+ * model, and stepping fields were each 4 bits wide. If the values in either
+ * are 0xf, then one is to consult the extended model and extended family, which
+ * take previously reserved bits and allow for a larger number of models and add
+ * 0xf to them.
+ *
+ * When we process this information, we store the full family, model, and
+ * stepping in the struct cpuid_info members cpi_family, cpi_model, and
+ * cpi_step, respectively. Whenever you are performing comparisons with the
+ * family, model, and stepping, you should use these members and not the raw
+ * values from cpuid. If you must use the raw values from cpuid directly, you
+ * must make sure that you add the extended model and family to the base model
+ * and family.
+ *
+ * In general, we do not use information about the family, model, and stepping
+ * to determine whether or not a feature is present; that is generally driven by
+ * specific leaves. However, when something we care about on the processor is
+ * not considered 'architectural' meaning that it is specific to a set of
+ * processors and not promised in the architecture model to be consistent from
+ * generation to generation, then we will fall back on this information. The
+ * most common cases where this comes up is when we have to workaround errata in
+ * the processor, are dealing with processor-specific features such as CPU
+ * performance counters, or we want to provide additional information for things
+ * such as fault management.
+ *
+ * While processors also do have a brand string, which is the name that people
+ * are familiar with when buying the processor, they are not meant for
+ * programmatic consumption. That is what the family, model, and stepping are
+ * for.
+ *
+ * ------------
+ * CPUID Passes
+ * ------------
+ *
+ * As part of performing feature detection, we break this into several different
+ * passes. The passes are as follows:
+ *
+ * Pass 0 This is a primordial pass done in locore.s to deal with
+ * Cyrix CPUs that don't support cpuid. The reality is that
+ * we likely don't run on them any more, but there is still
+ * logic for handling them.
+ *
+ * Pass 1 This is the primary pass and is responsible for doing a
+ * large number of different things:
+ *
+ * 1. Determine which vendor manufactured the CPU and
+ * determining the family, model, and stepping information.
+ *
+ * 2. Gathering a large number of feature flags to
+ * determine which features the CPU supports and which
+ * indicate things that we need to do other work in the OS
+ * to enable. Features detected this way are added to the
+ * x86_featureset which can be queried to
+ * determine what we should do. This includes processing
+ * all of the basic and extended CPU features that we care
+ * about.
+ *
+ * 3. Determining the CPU's topology. This includes
+ * information about how many cores and threads are present
+ * in the package. It also is responsible for figuring out
+ * which logical CPUs are potentially part of the same core
+ * and what other resources they might share. For more
+ * information see the 'Topology' section.
+ *
+ * 4. Determining the set of CPU security-specific features
+ * that we need to worry about and determine the
+ * appropriate set of workarounds.
+ *
+ * Pass 1 on the boot CPU occurs before KMDB is started.
+ *
+ * Pass 2 The second pass is done after startup(). Here, we check
+ * other miscellaneous features. Most of this is gathering
+ * additional basic and extended features that we'll use in
+ * later passes or for debugging support.
+ *
+ * Pass 3 The third pass occurs after the kernel memory allocator
+ * has been fully initialized. This gathers information
+ * where we might need dynamic memory available for our
+ * uses. This includes several varying width leaves that
+ * have cache information and the processor's brand string.
+ *
+ * Pass 4 The fourth and final normal pass is performed after the
+ * kernel has brought most everything online. This is
+ * invoked from post_startup(). In this pass, we go through
+ * the set of features that we have enabled and turn that
+ * into the hardware auxiliary vector features that
+ * userland receives. This is used by userland, primarily
+ * by the run-time link-editor (RTLD), though userland
+ * software could also refer to it directly.
+ *
+ * Microcode After a microcode update, we do a selective rescan of
+ * the cpuid leaves to determine what features have
+ * changed. Microcode updates can provide more details
+ * about security related features to deal with issues like
+ * Spectre and L1TF. On occasion, vendors have violated
+ * their contract and removed bits. However, we don't try
+ * to detect that because that puts us in a situation that
+ * we really can't deal with. As such, the only thing we
+ * rescan are security related features today. See
+ * cpuid_pass_ucode().
+ *
+ * All of the passes (except pass 0) are run on all CPUs. However, for the most
+ * part we only care about what the boot CPU says about this information and use
+ * the other CPUs as a rough guide to sanity check that we have the same feature
+ * set.
+ *
+ * We do not support running multiple logical CPUs with disjoint, let alone
+ * different, feature sets.
+ *
+ * ------------------
+ * Processor Topology
+ * ------------------
+ *
+ * One of the important things that we need to do is to understand the topology
+ * of the underlying processor. When we say topology in this case, we're trying
+ * to understand the relationship between the logical CPUs that the operating
+ * system sees and the underlying physical layout. Different logical CPUs may
+ * share different resources which can have important consequences for the
+ * performance of the system. For example, they may share caches, execution
+ * units, and more.
+ *
+ * The topology of the processor changes from generation to generation and
+ * vendor to vendor. Along with that, different vendors use different
+ * terminology, and the operating system itself uses occasionally overlapping
+ * terminology. It's important to understand what this topology looks like so
+ * one can understand the different things that we try to calculate and
+ * determine.
+ *
+ * To get started, let's talk about a little bit of terminology that we've used
+ * so far, is used throughout this file, and is fairly generic across multiple
+ * vendors:
+ *
+ * CPU
+ * A central processing unit (CPU) refers to a logical and/or virtual
+ * entity that the operating system can execute instructions on. The
+ * underlying resources for this CPU may be shared between multiple
+ * entities; however, to the operating system it is a discrete unit.
+ *
+ * PROCESSOR and PACKAGE
+ *
+ * Generally, when we use the term 'processor' on its own, we are referring
+ * to the physical entity that one buys and plugs into a board. However,
+ * because processor has been overloaded and one might see it used to mean
+ * multiple different levels, we will instead use the term 'package' for
+ * the rest of this file. The term package comes from the electrical
+ * engineering side and refers to the physical entity that encloses the
+ * electronics inside. Strictly speaking the package can contain more than
+ * just the CPU, for example, on many processors it may also have what's
+ * called an 'integrated graphical processing unit (GPU)'. Because the
+ * package can encapsulate multiple units, it is the largest physical unit
+ * that we refer to.
+ *
+ * SOCKET
+ *
+ * A socket refers to a unit on a system board (generally the motherboard)
+ * that can receive a package. A single package, or processor, is plugged
+ * into a single socket. A system may have multiple sockets. Often times,
+ * the term socket is used interchangeably with package and refers to the
+ * electrical component that has been plugged in, and not the receptacle itself.
+ *
+ * CORE
+ *
+ * A core refers to the physical instantiation of a CPU, generally, with a
+ * full set of hardware resources available to it. A package may contain
+ * multiple cores inside of it or it may just have a single one. A
+ * processor with more than one core is often referred to as 'multi-core'.
+ * In illumos, we will use the feature X86FSET_CMP to refer to a system
+ * that has 'multi-core' processors.
+ *
+ * A core may expose a single logical CPU to the operating system, or it
+ * may expose multiple CPUs, which we call threads, defined below.
+ *
+ * Some resources may still be shared by cores in the same package. For
+ * example, many processors will share the level 3 cache between cores.
+ * Some AMD generations share hardware resources between cores. For more
+ * information on that see the section 'AMD Topology'.
+ *
+ * THREAD and STRAND
+ *
+ * In this file, generally a thread refers to a hardware resource and not
+ * the operating system's logical abstraction. A thread is always exposed
+ * as an independent logical CPU to the operating system. A thread belongs
+ * to a specific core. A core may have more than one thread. When that is
+ * the case, the threads that are part of the same core are often referred
+ * to as 'siblings'.
+ *
+ * When multiple threads exist, this is generally referred to as
+ * simultaneous multi-threading (SMT). When Intel introduced this in their
+ * processors they called it hyper-threading (HT). When multiple threads
+ * are active in a core, they split the resources of the core. For example,
+ * two threads may share the same set of hardware execution units.
+ *
+ * The operating system often uses the term 'strand' to refer to a thread.
+ * This helps disambiguate it from the software concept.
+ *
+ * CHIP
+ *
+ * Unfortunately, the term 'chip' is dramatically overloaded. At its most
+ * base meaning, it is used to refer to a single integrated circuit, which
+ * may or may not be the only thing in the package. In illumos, when you
+ * see the term 'chip' it is almost always referring to the same thing as
+ * the 'package'. However, many vendors may use chip to refer to one of
+ * many integrated circuits that have been placed in the package. As an
+ * example, see the subsequent definition.
+ *
+ * To try and keep things consistent, we will only use chip when referring
+ * to the entire integrated circuit package, with the exception of the
+ * definition of multi-chip module (because it is in the name) and use the
+ * term 'die' when we want the more general, potential sub-component
+ * definition.
+ *
+ * DIE
+ *
+ * A die refers to an integrated circuit. Inside of the package there may
+ * be a single die or multiple dies. This is sometimes called a 'chip' in
+ * vendor's parlance, but in this file, we use the term die to refer to a
+ * subcomponent.
+ *
+ * MULTI-CHIP MODULE
+ *
+ * A multi-chip module (MCM) refers to putting multiple distinct chips that
+ * are connected together in the same package. When a multi-chip design is
+ * used, generally each chip is manufactured independently and then joined
+ * together in the package. For example, on AMD's Zen microarchitecture
+ * (family 0x17), the package contains several dies (the second meaning of
+ * chip from above) that are connected together.
+ *
+ * CACHE
+ *
+ * A cache is a part of the processor that maintains copies of recently
+ * accessed memory. Caches are split into levels and then into types.
+ * Commonly there are one to three levels, called level one, two, and
+ * three. The lower the level, the smaller it is, the closer it is to the
+ * execution units of the CPU, and the faster it is to access. The layout
+ * and design of the cache come in many different flavors, consult other
+ * resources for a discussion of those.
+ *
+ * Caches are generally split into two types, the instruction and data
+ * cache. The caches contain what their names suggest, the instruction
+ * cache has executable program text, while the data cache has all other
+ * memory that the processor accesses. As of this writing, data is kept
+ * coherent between all of the caches on x86, so if one modifies program
+ * text before it is executed, that will be in the data cache, and the
+ * instruction cache will be synchronized with that change when the
+ * processor actually executes those instructions. This coherency also
+ * covers the fact that data could show up in multiple caches.
+ *
+ * Generally, the lowest level caches are specific to a core. However, the
+ * last layer cache is shared between some number of cores. The number of
+ * CPUs sharing this last level cache is important. This has implications
+ * for the choices that the scheduler makes, as accessing memory that might
+ * be in a remote cache after thread migration can be quite expensive.
+ *
+ * Sometimes, the word cache is abbreviated with a '$', because in US
+ * English the word cache is pronounced the same as cash. So L1D$ refers to
+ * the L1 data cache, and L2$ would be the L2 cache. This will not be used
+ * in the rest of this theory statement for clarity.
+ *
+ * MEMORY CONTROLLER
+ *
+ * The memory controller is a component that provides access to DRAM. Each
+ * memory controller can access a set number of DRAM channels. Each channel
+ * can have a number of DIMMs (sticks of memory) associated with it. A
+ * given package may have more than one memory controller. The association
+ * of the memory controller to a group of cores is important as it is
+ * cheaper to access memory on the controller that you are associated with.
+ *
+ * NUMA
+ *
+ * NUMA or non-uniform memory access, describes a way that systems are
+ * built. On x86, any processor core can address all of the memory in the
+ * system. However, when using multiple sockets or possibly within a
+ * multi-chip module, some of that memory is physically closer and some of
+ * it is further. Memory that is further away is more expensive to access.
+ * Consider the following image of multiple sockets with memory:
+ *
+ * +--------+ +--------+
+ * | DIMM A | +----------+ +----------+ | DIMM D |
+ * +--------+-+ | | | | +-+------+-+
+ * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
+ * +--------+-+ | | | | +-+------+-+
+ * | DIMM C | +----------+ +----------+ | DIMM F |
+ * +--------+ +--------+
+ *
+ * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
+ * closer to DIMMs D-F. This means that it is cheaper for socket 0 to
+ * access DIMMs A-C and more expensive to access D-F as it has to go
+ * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
+ * D-F are cheaper than A-C. While the socket form is the most common, when
+ * using multi-chip modules, this can also sometimes occur. For another
+ * example of this that's more involved, see the AMD topology section.
+ *
+ *
+ * Intel Topology
+ * --------------
+ *
+ * Most Intel processors since Nehalem, (as of this writing the current gen
+ * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
+ * the package is a single monolithic die. MCMs currently aren't used. Most
+ * parts have three levels of caches, with the L3 cache being shared between
+ * all of the cores on the package. The L1/L2 cache is generally specific to
+ * an individual core. The following image shows at a simplified level what
+ * this looks like. The memory controller is commonly part of something called
+ * the 'Uncore', that used to be separate physical chips that were not a part of
+ * the package, but are now part of the same chip.
+ *
+ * +-----------------------------------------------------------------------+
+ * | Package |
+ * | +-------------------+ +-------------------+ +-------------------+ |
+ * | | Core | | Core | | Core | |
+ * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
+ * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | |
+ * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | |
+ * | | +--------+ | | | | +--------+ | | | | +--------+ | | | |
+ * | | | Thread | | | | | | Thread | | | | | | Thread | | | | |
+ * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | |
+ * | | +--------------+ | | +--------------+ | | +--------------+ | |
+ * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | |
+ * | | +--------------+ | | +--------------+ | | +--------------+ | |
+ * | +-------------------+ +-------------------+ +-------------------+ |
+ * | +-------------------------------------------------------------------+ |
+ * | | Shared L3 Cache | |
+ * | +-------------------------------------------------------------------+ |
+ * | +-------------------------------------------------------------------+ |
+ * | | Memory Controller | |
+ * | +-------------------------------------------------------------------+ |
+ * +-----------------------------------------------------------------------+
+ *
+ * A side effect of this current architecture is that what we care about from a
+ * scheduling and topology perspective, is simplified. In general we care about
+ * understanding which logical CPUs are part of the same core and socket.
+ *
+ * To determine the relationship between threads and cores, Intel initially used
+ * the identifier in the advanced programmable interrupt controller (APIC). They
+ * also added cpuid leaf 4 to give additional information about the number of
+ * threads and CPUs in the processor. With the addition of x2apic (which
+ * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
+ * additional cpuid topology leaf 0xB was added.
+ *
+ * AMD Topology
+ * ------------
+ *
+ * When discussing AMD topology, we want to break this into three distinct
+ * generations of topology. There's the basic topology that has been used in
+ * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
+ * with family 0x15 (Bulldozer), and there's the topology that was introduced
+ * with family 0x17 (Zen). AMD also has some additional terminology that's worth
+ * talking about.
+ *
+ * Until the introduction of family 0x17 (Zen), AMD did not implement something
+ * that they considered SMT. Whether or not the AMD processors have SMT
+ * influences many things including scheduling and reliability, availability,
+ * and serviceability (RAS) features.
+ *
+ * NODE
+ *
+ * AMD uses the term node to refer to a die that contains a number of cores
+ * and I/O resources. Depending on the processor family and model, more
+ * than one node can be present in the package. When there is more than one
+ * node this indicates a multi-chip module. Usually each node has its own
+ * access to memory and I/O devices. This is important and generally
+ * different from the corresponding Intel Nehalem-Skylake+ processors. As a
+ * result, we track this relationship in the operating system.
+ *
+ * In processors with an L3 cache, the L3 cache is generally shared across
+ * the entire node, though the way this is carved up varies from generation
+ * to generation.
+ *
+ * BULLDOZER
+ *
+ * Starting with the Bulldozer family (0x15) and continuing until the
+ * introduction of the Zen microarchitecture, AMD introduced the idea of a
+ * compute unit. In a compute unit, two traditional cores share a number of
+ * hardware resources. Critically, they share the FPU, L1 instruction
+ * cache, and the L2 cache. Several compute units were then combined inside
+ * of a single node. Because the integer execution units, L1 data cache,
+ * and some other resources were not shared between the cores, AMD never
+ * considered this to be SMT.
+ *
+ * ZEN
+ *
+ * The Zen family (0x17) uses a multi-chip module (MCM) design, the module
+ * is called Zeppelin. These modules are similar to the idea of nodes used
+ * previously. Each of these nodes has two DRAM channels which all of the
+ * cores in the node can access uniformly. These nodes are linked together
+ * in the package, creating a NUMA environment.
+ *
+ * The Zeppelin die itself contains two different 'core complexes'. Each
+ * core complex consists of four cores which each have two threads, for a
+ * total of 8 logical CPUs per complex. Unlike other generations,
+ * where all the logical CPUs in a given node share the L3 cache, here each
+ * core complex has its own shared L3 cache.
+ *
+ * A further thing that we need to consider is that in some configurations,
+ * particularly with the Threadripper line of processors, not every die
+ * actually has its memory controllers wired up to actual memory channels.
+ * This means that some cores have memory attached to them and others
+ * don't.
+ *
+ * To put Zen in perspective, consider the following images:
+ *
+ * +--------------------------------------------------------+
+ * | Core Complex |
+ * | +-------------------+ +-------------------+ +---+ |
+ * | | Core +----+ | | Core +----+ | | | |
+ * | | +--------+ | L2 | | | +--------+ | L2 | | | | |
+ * | | | Thread | +----+ | | | Thread | +----+ | | | |
+ * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | |
+ * | | | Thread | |L1| | | | Thread | |L1| | | 3 | |
+ * | | +--------+ +--+ | | +--------+ +--+ | | | |
+ * | +-------------------+ +-------------------+ | C | |
+ * | +-------------------+ +-------------------+ | a | |
+ * | | Core +----+ | | Core +----+ | | c | |
+ * | | +--------+ | L2 | | | +--------+ | L2 | | | h | |
+ * | | | Thread | +----+ | | | Thread | +----+ | | e | |
+ * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | |
+ * | | | Thread | |L1| | | | Thread | |L1| | | | |
+ * | | +--------+ +--+ | | +--------+ +--+ | | | |
+ * | +-------------------+ +-------------------+ +---+ |
+ * | |
+ * +--------------------------------------------------------+
+ *
+ * This first image represents a single Zen core complex that consists of four
+ * cores.
+ *
+ *
+ * +--------------------------------------------------------+
+ * | Zeppelin Die |
+ * | +--------------------------------------------------+ |
+ * | | I/O Units (PCIe, SATA, USB, etc.) | |
+ * | +--------------------------------------------------+ |
+ * | HH |
+ * | +-----------+ HH +-----------+ |
+ * | | | HH | | |
+ * | | Core |==========| Core | |
+ * | | Complex |==========| Complex | |
+ * | | | HH | | |
+ * | +-----------+ HH +-----------+ |
+ * | HH |
+ * | +--------------------------------------------------+ |
+ * | | Memory Controller | |
+ * | +--------------------------------------------------+ |
+ * | |
+ * +--------------------------------------------------------+
+ *
+ * This image represents a single Zeppelin Die. Note how both cores are
+ * connected to the same memory controller and I/O units. While each core
+ * complex has its own L3 cache as seen in the first image, they both have
+ * uniform access to memory.
+ *
+ *
+ * PP PP
+ * PP PP
+ * +----------PP---------------------PP---------+
+ * | PP PP |
+ * | +-----------+ +-----------+ |
+ * | | | | | |
+ * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
+ * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
+ * | | | | | |
+ * | +-----------+ooo ...+-----------+ |
+ * | HH ooo ... HH |
+ * | HH oo.. HH |
+ * | HH ..oo HH |
+ * | HH ... ooo HH |
+ * | +-----------+... ooo+-----------+ |
+ * | | | | | |
+ * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM
+ * MMMMMMMMM| Die |==========| Die |MMMMMMMMM
+ * | | | | | |
+ * | +-----------+ +-----------+ |
+ * | PP PP |
+ * +----------PP---------------------PP---------+
+ * PP PP
+ * PP PP
+ *
+ * This image represents a single Zen package. In this example, it has four
+ * Zeppelin dies, though some configurations only have a single one. In this
+ * example, each die is directly connected to the next. Also, each die is
+ * represented as being connected to memory by the 'M' character and connected
+ * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
+ * die is made up of two core complexes, we have multiple different NUMA
+ * domains that we care about for these systems.
+ *
+ * CPUID LEAVES
+ *
+ * There are a few different CPUID leaves that we can use to try and understand
+ * the actual state of the world. As part of the introduction of family 0xf, AMD
+ * added CPUID leaf 0x80000008. This leaf tells us the number of logical
+ * processors that are in the system. Because families before Zen didn't have
+ * SMT, this was always the number of cores that were in the system. However, it
+ * should always be thought of as the number of logical threads to be consistent
+ * between generations. In addition we also get the size of the APIC ID that is
+ * used to represent the number of logical processors. This is important for
+ * deriving topology information.
+ *
+ * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
+ * bit between Bulldozer and later families, but it is quite useful in
+ * determining the topology information. Because this information has changed
+ * across family generations, it's worth calling out what these mean
+ * explicitly. The registers have the following meanings:
+ *
+ * %eax The APIC ID. The entire register is defined to have a 32-bit
+ * APIC ID, even though on systems without x2apic support, it will
+ * be limited to 8 bits.
+ *
+ * %ebx On Bulldozer-era systems this contains information about the
+ * number of cores that are in a compute unit (cores that share
+ * resources). It also contains a per-package compute unit ID that
+ * identifies which compute unit the logical CPU is a part of.
+ *
+ * On Zen-era systems this instead contains the number of threads
+ * per core and the ID of the core that the logical CPU is a part
+ * of. Note, this ID is unique only to the package, it is not
+ * globally unique across the entire system.
+ *
+ * %ecx This contains the number of nodes that exist in the package. It
+ * also contains an ID that identifies which node the logical CPU
+ * is a part of.
+ *
+ * Finally, we also use cpuid leaf 0x8000001D to determine information about the
+ * cache layout to determine which logical CPUs are sharing which caches.
+ *
+ * illumos Topology
+ * ----------------
+ *
+ * Based on the above we synthesize the information into several different
+ * variables that we store in the 'struct cpuid_info'. We'll go into the details
+ * of what each member is supposed to represent and their uniqueness. In
+ * general, there are two levels of uniqueness that we care about. We care about
+ * an ID that is globally unique. That means that it will be unique across all
+ * entities in the system. For example, the default logical CPU ID is globally
+ * unique. On the other hand, there is some information that we only care about
+ * being unique within the context of a single package / socket. Here are the
+ * variables that we keep track of and their meaning.
+ *
+ * Several of the values that are asking for an identifier, with the exception
+ * of cpi_apicid, are allowed to be synthetic.
+ *
+ *
+ * cpi_apicid
+ *
+ * This is the value of the CPU's APIC id. This should be the full 32-bit
+ * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
+ * APIC ID. This value is globally unique between all logical CPUs across
+ * all packages. This is usually required by the APIC.
+ *
+ * cpi_chipid
+ *
+ * This value indicates the ID of the package that the logical CPU is a
+ * part of. This value is allowed to be synthetic. It is usually derived by
+ * taking the CPU's APIC ID and determining how many bits are used to
+ * represent CPU cores in the package. All logical CPUs that are part of
+ * the same package must have the same value.
+ *
+ * cpi_coreid
+ *
+ * This represents the ID of a CPU core. Two logical CPUs should only have
+ * the same cpi_coreid value if they are part of the same core. These
+ * values may be synthetic. On systems that support SMT, this value is
+ * usually derived from the APIC ID, otherwise it is often synthetic and
+ * just set to the value of the cpu_id in the cpu_t.
+ *
+ * cpi_pkgcoreid
+ *
+ * This is similar to the cpi_coreid in that logical CPUs that are part of
+ * the same core should have the same ID. The main difference is that these
+ * values are only required to be unique to a given socket.
+ *
+ * cpi_clogid
+ *
+ * This represents the logical ID of a logical CPU. This value should be
+ * unique within a given socket for each logical CPU. This is allowed to be
+ * synthetic, though it is usually based off of the CPU's apic ID. The
+ * broader system expects that logical CPUs that are part of the same
+ * core have contiguous numbers. For example, if there were two threads per
+ * core, then the core IDs divided by two should be the same and the first
+ * modulus two should be zero and the second one. For example, IDs 4 and 5
+ * indicate two logical CPUs that are part of the same core. But IDs 5 and
+ * 6 represent two logical CPUs that are part of different cores.
+ *
+ * While it is common for the cpi_coreid and the cpi_clogid to be derived
+ * from the same source, strictly speaking, they don't have to be and the
+ * two values should be considered logically independent. One should not
+ * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
+ * some kind of relationship. While this is tempting, we've seen cases on
+ * AMD family 0xf where the system's cpu id is not related to its APIC ID.
+ *
+ * cpi_ncpu_per_chip
+ *
+ * This value indicates the total number of logical CPUs that exist in the
+ * physical package. Critically, this is not the number of logical CPUs
+ * that exist for just the single core.
+ *
+ * This value should be the same for all logical CPUs in the same package.
+ *
+ * cpi_ncore_per_chip
+ *
+ * This value indicates the total number of physical CPU cores that exist
+ * in the package. The system compares this value with cpi_ncpu_per_chip to
+ * determine if simultaneous multi-threading (SMT) is enabled. When
+ * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
+ * the X86FSET_HTT feature is not set. If this value is greater than one,
+ * then we consider the processor to have the feature X86FSET_CMP, to
+ * indicate that there is support for more than one core.
+ *
+ * This value should be the same for all logical CPUs in the same package.
+ *
+ * cpi_procnodes_per_pkg
+ *
+ * This value indicates the number of 'nodes' that exist in the package.
+ * When processors are actually a multi-chip module, this represents the
+ * number of such modules that exist in the package. Currently, on Intel
+ * based systems this member is always set to 1.
+ *
+ * This value should be the same for all logical CPUs in the same package.
+ *
+ * cpi_procnodeid
+ *
+ * This value indicates the ID of the node that the logical CPU is a part
+ * of. All logical CPUs that are in the same node must have the same value
+ * here. This value must be unique across all of the packages in the
+ * system. On Intel based systems, this is currently set to the value in
+ * cpi_chipid because there is only one node.
+ *
+ * cpi_cores_per_compunit
+ *
+ * This value indicates the number of cores that are part of a compute
+ * unit. See the AMD topology section for this. This member only has real
+ * meaning currently for AMD Bulldozer family processors. For all other
+ * processors, this should currently be set to 1.
+ *
+ * cpi_compunitid
+ *
+ * This indicates the compute unit that the logical CPU belongs to. For
+ * processors without AMD Bulldozer-style compute units this should be set
+ * to the value of cpi_coreid.
+ *
+ * cpi_ncpu_shr_last_cache
+ *
+ * This indicates the number of logical CPUs that are sharing the same last
+ * level cache. This value should be the same for all CPUs that are sharing
+ * that cache. The last cache refers to the cache that is closest to memory
+ * and furthest away from the CPU.
+ *
+ * cpi_last_lvl_cacheid
+ *
+ * This indicates the ID of the last cache that the logical CPU uses. This
+ * cache is often shared between multiple logical CPUs and is the cache
+ * that is closest to memory and furthest away from the CPU. This value
+ * should be the same for a group of logical CPUs only if they actually
+ * share the same last level cache. IDs should not overlap between
+ * packages.
+ *
+ * -----------
+ * Hypervisors
+ * -----------
+ *
+ * If trying to manage the differences between vendors wasn't bad enough, it can
+ * get worse thanks to our friend hardware virtualization. Hypervisors are given
+ * the ability to interpose on all cpuid instructions and change them to suit
+ * their purposes. In general, this is necessary as the hypervisor wants to be
+ * able to present a more uniform set of features or not necessarily give the
+ * guest operating system kernel knowledge of all features so it can be
+ * more easily migrated between systems.
+ *
+ * When it comes to trying to determine topology information, this can be a
+ * double edged sword. When a hypervisor doesn't actually implement a cpuid
+ * leaf, it'll often return all zeros. Because of that, you'll often see various
+ * checks scattered about fields being non-zero before we assume we can use
+ * them.
+ *
+ * When it comes to topology information, the hypervisor is often incentivized
+ * to lie to you about topology. This is because it doesn't always actually
+ * guarantee that topology at all. The topology path we take in the system
+ * depends on how the CPU advertises itself. If it advertises itself as an Intel
+ * or AMD CPU, then we basically do our normal path. However, when they don't
+ * use an actual vendor, then that usually turns into multiple one-core CPUs
+ * that we enumerate that are often on different sockets. The actual behavior
+ * depends greatly on what the hypervisor actually exposes to us.
+ *
+ * --------------------
+ * Exposing Information
+ * --------------------
+ *
+ * We expose CPUID information in three different forms in the system.
+ *
+ * The first is through the x86_featureset variable. This is used in conjunction
+ * with the is_x86_feature() function. This is queried by x86-specific functions
+ * to determine which features are or aren't present in the system and to make
+ * decisions based upon them. For example, users of this include everything from
+ * parts of the system dedicated to reliability, availability, and
+ * serviceability (RAS), to making decisions about how to handle security
+ * mitigations, to various x86-specific drivers. General purpose or
+ * architecture independent drivers should never be calling this function.
+ *
+ * The second means is through the auxiliary vector. The auxiliary vector is a
+ * series of tagged data that the kernel passes down to a user program when it
+ * begins executing. This information is used to indicate to programs what
+ * instruction set extensions are present. For example, information about the
+ * CPU supporting the machine check architecture (MCA) wouldn't be passed down
+ * since user programs cannot make use of it. However, things like the AVX
+ * instruction sets are. Programs use this information to make run-time
+ * decisions about what features they should use. As an example, the run-time
+ * link-editor (rtld) can relocate different functions depending on the hardware
+ * support available.
+ *
+ * The final form is through a series of accessor functions that all have the
+ * form cpuid_get*. This is used by a number of different subsystems in the
+ * kernel to determine more detailed information about what we're running on,
+ * topology information, etc. Some of these subsystems include processor groups
+ * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
+ * microcode, and performance monitoring. These functions all ASSERT that the
+ * CPU they're being called on has reached a certain cpuid pass. If the passes
+ * are rearranged, then this needs to be adjusted.
*/
#include <sys/types.h>
@@ -68,58 +911,6 @@
#include <sys/ontrap.h>
#endif
-/*
- * Pass 0 of cpuid feature analysis happens in locore. It contains special code
- * to recognize Cyrix processors that are not cpuid-compliant, and to deal with
- * them accordingly. For most modern processors, feature detection occurs here
- * in pass 1.
- *
- * Pass 1 of cpuid feature analysis happens just at the beginning of mlsetup()
- * for the boot CPU and does the basic analysis that the early kernel needs.
- * x86_featureset is set based on the return value of cpuid_pass1() of the boot
- * CPU.
- *
- * Pass 1 includes:
- *
- * o Determining vendor/model/family/stepping and setting x86_type and
- * x86_vendor accordingly.
- * o Processing the feature flags returned by the cpuid instruction while
- * applying any workarounds or tricks for the specific processor.
- * o Mapping the feature flags into illumos feature bits (X86_*).
- * o Processing extended feature flags if supported by the processor,
- * again while applying specific processor knowledge.
- * o Determining the CMT characteristics of the system.
- *
- * Pass 1 is done on non-boot CPUs during their initialization and the results
- * are used only as a meager attempt at ensuring that all processors within the
- * system support the same features.
- *
- * Pass 2 of cpuid feature analysis happens just at the beginning
- * of startup(). It just copies in and corrects the remainder
- * of the cpuid data we depend on: standard cpuid functions that we didn't
- * need for pass1 feature analysis, and extended cpuid functions beyond the
- * simple feature processing done in pass1.
- *
- * Pass 3 of cpuid analysis is invoked after basic kernel services; in
- * particular kernel memory allocation has been made available. It creates a
- * readable brand string based on the data collected in the first two passes.
- *
- * Pass 4 of cpuid analysis is invoked after post_startup() when all
- * the support infrastructure for various hardware features has been
- * initialized. It determines which processor features will be reported
- * to userland via the aux vector.
- *
- * All passes are executed on all CPUs, but only the boot CPU determines what
- * features the kernel will use.
- *
- * Much of the worst junk in this file is for the support of processors
- * that didn't really implement the cpuid instruction properly.
- *
- * NOTE: The accessor functions (cpuid_get*) are aware of, and ASSERT upon,
- * the pass numbers. Accordingly, changes to the pass code may require changes
- * to the accessor code.
- */
-
uint_t x86_vendor = X86_VENDOR_IntelClone;
uint_t x86_type = X86_TYPE_OTHER;
uint_t x86_clflush_size = 0;
@@ -351,21 +1142,9 @@ struct xsave_info {
#define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */
/*
- * Some terminology needs to be explained:
- * - Socket: Something that can be plugged into a motherboard.
- * - Package: Same as socket
- * - Chip: Same as socket. Note that AMD's documentation uses term "chip"
- * differently: there, chip is the same as processor node (below)
- * - Processor node: Some AMD processors have more than one
- * "subprocessor" embedded in a package. These subprocessors (nodes)
- * are fully-functional processors themselves with cores, caches,
- * memory controllers, PCI configuration spaces. They are connected
- * inside the package with Hypertransport links. On single-node
- * processors, processor node is equivalent to chip/socket/package.
- * - Compute Unit: Some AMD processors pair cores in "compute units" that
- * share the FPU and the I$ and L2 caches.
+ * See the big theory statement for a more detailed explanation of what some of
+ * these members mean.
*/
-
struct cpuid_info {
uint_t cpi_pass; /* last pass completed */
/*
@@ -387,8 +1166,9 @@ struct cpuid_info {
uint_t cpi_ncache; /* fn 2: number of elements */
uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */
id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */
- uint_t cpi_std_4_size; /* fn 4: number of fn 4 elements */
- struct cpuid_regs **cpi_std_4; /* fn 4: %ecx == 0 .. fn4_size */
+ uint_t cpi_cache_leaf_size; /* Number of cache elements */
+ /* Intel fn: 4, AMD fn: 8000001d */
+ struct cpuid_regs **cpi_cache_leaves; /* Actual leaves from above */
struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */
/*
* extended function information
@@ -540,6 +1320,14 @@ static struct cpuid_info cpuid_info0;
#define CPUID_LEAFD_2_YMM_SIZE 256
/*
+ * Common extended leaf names to cut down on typos.
+ */
+#define CPUID_LEAF_EXT_0 0x80000000
+#define CPUID_LEAF_EXT_8 0x80000008
+#define CPUID_LEAF_EXT_1d 0x8000001d
+#define CPUID_LEAF_EXT_1e 0x8000001e
+
+/*
 * Functions we consume from cpuid_subr.c; don't publish these in a header
* file to try and keep people using the expected cpuid_* interfaces.
*/
@@ -607,7 +1395,7 @@ platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
break;
- case 0x80000008:
+ case CPUID_LEAF_EXT_8:
/*
* Zero out the (ncores-per-chip - 1) field
*/
@@ -664,13 +1452,14 @@ cpuid_free_space(cpu_t *cpu)
ASSERT(cpi != &cpuid_info0);
/*
- * Free up any function 4 related dynamic storage
+ * Free up any cache leaf related dynamic storage. The first entry was
+ * cached from the standard cpuid storage, so we should not free it.
*/
- for (i = 1; i < cpi->cpi_std_4_size; i++)
- kmem_free(cpi->cpi_std_4[i], sizeof (struct cpuid_regs));
- if (cpi->cpi_std_4_size > 0)
- kmem_free(cpi->cpi_std_4,
- cpi->cpi_std_4_size * sizeof (struct cpuid_regs *));
+ for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
+ kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
+ if (cpi->cpi_cache_leaf_size > 0)
+ kmem_free(cpi->cpi_cache_leaves,
+ cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
kmem_free(cpi, sizeof (*cpi));
cpu->cpu_m.mcpu_cpi = NULL;
@@ -804,6 +1593,198 @@ is_controldom(void)
#endif /* __xpv */
+/*
+ * Make sure that we have gathered all of the CPUID leaves that we might need to
+ * determine topology. We assume that the standard leaf 1 has already been done
+ * and that xmaxeax has already been calculated.
+ */
+static void
+cpuid_gather_amd_topology_leaves(cpu_t *cpu)
+{
+ struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
+
+ if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
+ struct cpuid_regs *cp;
+
+ cp = &cpi->cpi_extd[8];
+ cp->cp_eax = CPUID_LEAF_EXT_8;
+ (void) __cpuid_insn(cp);
+ platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
+ }
+
+ if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
+ struct cpuid_regs *cp;
+
+ cp = &cpi->cpi_extd[0x1e];
+ cp->cp_eax = CPUID_LEAF_EXT_1e;
+ (void) __cpuid_insn(cp);
+ }
+}
+
+/*
+ * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
+ * it to everything else. If not, and we're on an AMD system where 8000001e is
+ * valid, then we use that. Otherwise, we fall back to the default value for the
+ * APIC ID in leaf 1.
+ */
+static uint32_t
+cpuid_gather_apicid(struct cpuid_info *cpi)
+{
+ /*
+ * Leaf B changes based on the arguments to it. Because we don't cache
+ * it, we need to gather it again.
+ */
+ if (cpi->cpi_maxeax >= 0xB) {
+ struct cpuid_regs regs;
+ struct cpuid_regs *cp;
+
+ cp = &regs;
+ cp->cp_eax = 0xB;
+ cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
+ (void) __cpuid_insn(cp);
+
+ if (cp->cp_ebx != 0) {
+ return (cp->cp_edx);
+ }
+ }
+
+ if (cpi->cpi_vendor == X86_VENDOR_AMD &&
+ is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
+ return (cpi->cpi_extd[0x1e].cp_eax);
+ }
+
+ return (CPI_APIC_ID(cpi));
+}
+
+/*
+ * For AMD processors, attempt to calculate the number of chips and cores that
+ * exist. The way that we do this varies based on the generation, because the
+ * generations themselves have changed dramatically.
+ *
+ * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
+ * However, with the advent of family 17h (Zen) it actually tells us the number
+ * of threads, so we need to look at leaf 0x8000001e if available to determine
+ * its value. Otherwise, for all prior families, the number of enabled cores is
+ * the same as threads.
+ *
+ * If we do not have leaf 0x80000008, then we assume that this processor does
+ * not have anything. AMD's older CPUID specification says there's no reason to
+ * fall back to leaf 1.
+ *
+ * In some virtualization cases we will not have leaf 8000001e or it will be
+ * zero. When that happens we assume the number of threads is one.
+ */
+static void
+cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
+{
+ uint_t nthreads, nthread_per_core;
+
+ nthreads = nthread_per_core = 1;
+
+ if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
+ nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
+ } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
+ nthreads = CPI_CPU_COUNT(cpi);
+ }
+
+ /*
+ * For us to have threads, and know about it, we have to be at least at
+ * family 17h and have the cpuid bit that says we have extended
+ * topology.
+ */
+ if (cpi->cpi_family >= 0x17 &&
+ is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
+ nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
+ }
+
+ *ncpus = nthreads;
+ *ncores = nthreads / nthread_per_core;
+}
+
+static void
+cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
+{
+ if (cpi->cpi_maxeax >= 4) {
+ *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
+ *ncpus = BITX(cpi->cpi_std[4].cp_eax, 25, 14) + 1;
+ } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
+ *ncores = 1;
+ *ncpus = CPI_CPU_COUNT(cpi);
+ } else {
+ *ncpus = *ncores = 1;
+ }
+}
+
+static boolean_t
+cpuid_leafB_getids(cpu_t *cpu)
+{
+ struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
+ struct cpuid_regs regs;
+ struct cpuid_regs *cp;
+
+ if (cpi->cpi_maxeax < 0xB)
+ return (B_FALSE);
+
+ cp = &regs;
+ cp->cp_eax = 0xB;
+ cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
+
+ (void) __cpuid_insn(cp);
+
+ /*
+ * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
+ * indicates that the extended topology enumeration leaf is
+ * available.
+ */
+ if (cp->cp_ebx != 0) {
+ uint32_t x2apic_id = 0;
+ uint_t coreid_shift = 0;
+ uint_t ncpu_per_core = 1;
+ uint_t chipid_shift = 0;
+ uint_t ncpu_per_chip = 1;
+ uint_t i;
+ uint_t level;
+
+ for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
+ cp->cp_eax = 0xB;
+ cp->cp_ecx = i;
+
+ (void) __cpuid_insn(cp);
+ level = CPI_CPU_LEVEL_TYPE(cp);
+
+ if (level == 1) {
+ x2apic_id = cp->cp_edx;
+ coreid_shift = BITX(cp->cp_eax, 4, 0);
+ ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
+ } else if (level == 2) {
+ x2apic_id = cp->cp_edx;
+ chipid_shift = BITX(cp->cp_eax, 4, 0);
+ ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
+ }
+ }
+
+ /*
+ * cpi_apicid is taken care of in cpuid_gather_apicid.
+ */
+ cpi->cpi_ncpu_per_chip = ncpu_per_chip;
+ cpi->cpi_ncore_per_chip = ncpu_per_chip /
+ ncpu_per_core;
+ cpi->cpi_chipid = x2apic_id >> chipid_shift;
+ cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
+ cpi->cpi_coreid = x2apic_id >> coreid_shift;
+ cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
+ cpi->cpi_procnodeid = cpi->cpi_chipid;
+ cpi->cpi_compunitid = cpi->cpi_coreid;
+
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
+}
+
static void
cpuid_intel_getids(cpu_t *cpu, void *feature)
{
@@ -812,6 +1793,20 @@ cpuid_intel_getids(cpu_t *cpu, void *feature)
uint_t coreid_shift = 0;
struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
+ /*
+ * There are no compute units or processor nodes currently on Intel.
+ * Always set these to one.
+ */
+ cpi->cpi_procnodes_per_pkg = 1;
+ cpi->cpi_cores_per_compunit = 1;
+
+ /*
+ * If cpuid Leaf B is present, use that to try and get this information.
+ * It will be the most accurate for Intel CPUs.
+ */
+ if (cpuid_leafB_getids(cpu))
+ return;
+
for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
chipid_shift++;
@@ -860,13 +1855,69 @@ cpuid_intel_getids(cpu_t *cpu, void *feature)
*/
cpi->cpi_coreid = cpi->cpi_chipid;
cpi->cpi_pkgcoreid = 0;
+ } else {
+ /*
+ * Single-core single-thread processors.
+ */
+ cpi->cpi_coreid = cpu->cpu_id;
+ cpi->cpi_pkgcoreid = 0;
}
cpi->cpi_procnodeid = cpi->cpi_chipid;
cpi->cpi_compunitid = cpi->cpi_coreid;
}
+/*
+ * Historically, AMD has had CMP chips with only a single thread per core.
+ * However, starting in family 17h (Zen), this has changed and they now have
+ * multiple threads. Our internal core id needs to be a unique value.
+ *
+ * To determine the core id of an AMD system, if we're from a family before 17h,
+ * then we just use the cpu id, as that gives us a good value that will be
+ * unique for each core. If instead, we're on family 17h or later, then we need
+ * to do something more complicated. CPUID leaf 0x8000001e can tell us
+ * how many threads are in the system. Based on that, we'll shift the APIC ID.
+ * We can't use the normal core id in that leaf as it's only unique within the
+ * socket, which is perfect for cpi_pkgcoreid, but not us.
+ */
+static id_t
+cpuid_amd_get_coreid(cpu_t *cpu)
+{
+ struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
+
+ if (cpi->cpi_family >= 0x17 &&
+ is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
+ uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
+ if (nthreads > 1) {
+ VERIFY3U(nthreads, ==, 2);
+ return (cpi->cpi_apicid >> 1);
+ }
+ }
+
+ return (cpu->cpu_id);
+}
+
+/*
+ * Determining IDs on AMD is a more challenging task. This is notable because of the
+ * following two facts:
+ *
+ * 1. Before family 0x17 (Zen), there was no support for SMT and there was
+ * also no way to get an actual unique core id from the system. As such, we
+ * synthesize this case by using cpu->cpu_id. This scheme does not,
+ * however, guarantee that sibling cores of a chip will have sequential
+ * coreids starting at a multiple of the number of cores per chip - that is
+ * usually the case, but if the ACPI MADT table is presented in a different
+ * order then we need to perform a few more gymnastics for the pkgcoreid.
+ *
+ * 2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
+ * called compute units. These compute units share the L1I cache, L2 cache,
+ * and the FPU. To deal with this, a new topology leaf was added in
+ * 0x8000001e. However, parts of this leaf have different meanings
+ * once we get to family 0x17.
+ */
+
static void
-cpuid_amd_getids(cpu_t *cpu)
+cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
{
int i, first_half, coreidsz;
uint32_t nb_caps_reg;
@@ -875,41 +1926,31 @@ cpuid_amd_getids(cpu_t *cpu)
struct cpuid_regs *cp;
/*
- * AMD CMP chips currently have a single thread per core.
- *
- * Since no two cpus share a core we must assign a distinct coreid
- * per cpu, and we do this by using the cpu_id. This scheme does not,
- * however, guarantee that sibling cores of a chip will have sequential
- * coreids starting at a multiple of the number of cores per chip -
- * that is usually the case, but if the ACPI MADT table is presented
- * in a different order then we need to perform a few more gymnastics
- * for the pkgcoreid.
- *
- * All processors in the system have the same number of enabled
- * cores. Cores within a processor are always numbered sequentially
- * from 0 regardless of how many or which are disabled, and there
- * is no way for operating system to discover the real core id when some
- * are disabled.
- *
- * In family 0x15, the cores come in pairs called compute units. They
- * share I$ and L2 caches and the FPU. Enumeration of this feature is
- * simplified by the new topology extensions CPUID leaf, indicated by
- * the X86 feature X86FSET_TOPOEXT.
+ * Calculate the core id (this comes from hardware in family 0x17 if it
+ * hasn't been stripped by virtualization). We always set the compute
+ * unit id to the same value. Also, initialize the default number of
+ * cores per compute unit and nodes per package. This will be
+ * overwritten when we know information about a particular family.
*/
+ cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
+ cpi->cpi_compunitid = cpi->cpi_coreid;
+ cpi->cpi_cores_per_compunit = 1;
+ cpi->cpi_procnodes_per_pkg = 1;
- cpi->cpi_coreid = cpu->cpu_id;
- cpi->cpi_compunitid = cpu->cpu_id;
-
- if (cpi->cpi_xmaxeax >= 0x80000008) {
-
+ /*
+ * To construct the logical ID, we need to determine how many APIC IDs
+ * are dedicated to the cores and threads. This is provided for us in
+ * 0x80000008. However, if it's not present (say due to virtualization),
+ * then we assume it's one. This should be present on all 64-bit AMD
+ * processors. It was added in family 0xf (Hammer).
+ */
+ if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
/*
- * In AMD parlance chip is really a node while Solaris
- * sees chip as equivalent to socket/package.
+ * In AMD parlance chip is really a node while illumos
+ * uses chip as equivalent to socket/package.
*/
- cpi->cpi_ncore_per_chip =
- BITX((cpi)->cpi_extd[8].cp_ecx, 7, 0) + 1;
if (coreidsz == 0) {
/* Use legacy method */
for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
@@ -919,27 +1960,52 @@ cpuid_amd_getids(cpu_t *cpu)
}
} else {
/* Assume single-core part */
- cpi->cpi_ncore_per_chip = 1;
coreidsz = 1;
}
+ cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
- cpi->cpi_clogid = cpi->cpi_pkgcoreid =
- cpi->cpi_apicid & ((1<<coreidsz) - 1);
- cpi->cpi_ncpu_per_chip = cpi->cpi_ncore_per_chip;
+ /*
+ * The package core ID varies depending on the family. For family 17h,
+ * we can get this directly from leaf CPUID_LEAF_EXT_1e. Otherwise, we
+ * can use the clogid as is. When family 17h is virtualized, the clogid
+ * should be sufficient as if we don't have valid data in the leaf, then
+ * we won't think we have SMT, in which case the cpi_clogid should be
+ * sufficient.
+ */
+ if (cpi->cpi_family >= 0x17 &&
+ is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
+ cpi->cpi_extd[0x1e].cp_ebx != 0) {
+ cpi->cpi_pkgcoreid = BITX(cpi->cpi_extd[0x1e].cp_ebx, 7, 0);
+ } else {
+ cpi->cpi_pkgcoreid = cpi->cpi_clogid;
+ }
- /* Get node ID, compute unit ID */
+ /*
+ * Obtain the node ID and compute unit IDs. If we're on family 0x15
+ * (bulldozer) or newer, then we can derive all of this from leaf
+ * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
+ */
if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
- cpi->cpi_xmaxeax >= 0x8000001e) {
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
cp = &cpi->cpi_extd[0x1e];
- cp->cp_eax = 0x8000001e;
- (void) __cpuid_insn(cp);
cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
- cpi->cpi_cores_per_compunit = BITX(cp->cp_ebx, 15, 8) + 1;
- cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0)
- + (cpi->cpi_ncore_per_chip / cpi->cpi_cores_per_compunit)
- * (cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg);
+
+ /*
+ * For Bulldozer-era CPUs, recalculate the compute unit
+ * information.
+ */
+ if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
+ cpi->cpi_cores_per_compunit =
+ BITX(cp->cp_ebx, 15, 8) + 1;
+ cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
+ (cpi->cpi_ncore_per_chip /
+ cpi->cpi_cores_per_compunit) *
+ (cpi->cpi_procnodeid /
+ cpi->cpi_procnodes_per_pkg);
+ }
} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
} else if (cpi->cpi_family == 0x10) {
@@ -1014,7 +2080,7 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
if (cpi->cpi_vendor == X86_VENDOR_AMD &&
- cpi->cpi_xmaxeax >= 0x80000008) {
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
add_x86_feature(featureset, X86FSET_IBPB);
if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
@@ -1156,6 +2222,117 @@ setup_xfem(void)
xsave_bv_all = flags;
}
+static void
+cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
+{
+ struct cpuid_info *cpi;
+
+ cpi = cpu->cpu_m.mcpu_cpi;
+
+ if (cpi->cpi_vendor == X86_VENDOR_AMD) {
+ cpuid_gather_amd_topology_leaves(cpu);
+ }
+
+ cpi->cpi_apicid = cpuid_gather_apicid(cpi);
+
+ /*
+ * Before we can calculate the IDs that we should assign to this
+ * processor, we need to understand how many cores and threads it has.
+ */
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
+ &cpi->cpi_ncore_per_chip);
+ break;
+ case X86_VENDOR_AMD:
+ cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
+ &cpi->cpi_ncore_per_chip);
+ break;
+ default:
+ /*
+ * If we have some other x86 compatible chip, it's not clear how
+ * they would behave. The most common case is virtualization
+ * today, though there are also 64-bit VIA chips. Assume that
+ * all we can get is the basic Leaf 1 HTT information.
+ */
+ if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
+ cpi->cpi_ncore_per_chip = 1;
+ cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
+ }
+ break;
+ }
+
+ /*
+ * Based on the calculated number of threads and cores, potentially
+	 * assign the HTT and CMP features.
+ */
+ if (cpi->cpi_ncore_per_chip > 1) {
+ add_x86_feature(featureset, X86FSET_CMP);
+ }
+
+ if (cpi->cpi_ncpu_per_chip > 1 &&
+ cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
+ add_x86_feature(featureset, X86FSET_HTT);
+ }
+
+ /*
+	 * Now that this has been set up, we need to go through and calculate all of
+ * the rest of the parameters that exist. If we think the CPU doesn't
+ * have either SMT (HTT) or CMP, then we basically go through and fake
+ * up information in some way. The most likely case for this is
+ * virtualization where we have a lot of partial topology information.
+ */
+ if (!is_x86_feature(featureset, X86FSET_HTT) &&
+ !is_x86_feature(featureset, X86FSET_CMP)) {
+ /*
+ * This is a single core, single-threaded processor.
+ */
+ cpi->cpi_procnodes_per_pkg = 1;
+ cpi->cpi_cores_per_compunit = 1;
+ cpi->cpi_compunitid = 0;
+ cpi->cpi_chipid = -1;
+ cpi->cpi_clogid = 0;
+ cpi->cpi_coreid = cpu->cpu_id;
+ cpi->cpi_pkgcoreid = 0;
+ if (cpi->cpi_vendor == X86_VENDOR_AMD) {
+ cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
+ } else {
+ cpi->cpi_procnodeid = cpi->cpi_chipid;
+ }
+ } else {
+ switch (cpi->cpi_vendor) {
+ case X86_VENDOR_Intel:
+ cpuid_intel_getids(cpu, featureset);
+ break;
+ case X86_VENDOR_AMD:
+ cpuid_amd_getids(cpu, featureset);
+ break;
+ default:
+ /*
+ * In this case, it's hard to say what we should do.
+ * We're going to model them to the OS as single core
+ * threads. We don't have a good identifier for them, so
+ * we're just going to use the cpu id all on a single
+ * chip.
+ *
+ * This case has historically been different from the
+ * case above where we don't have HTT or CMP. While they
+ * could be combined, we've opted to keep it separate to
+ * minimize the risk of topology changes in weird cases.
+ */
+ cpi->cpi_procnodes_per_pkg = 1;
+ cpi->cpi_cores_per_compunit = 1;
+ cpi->cpi_chipid = 0;
+ cpi->cpi_coreid = cpu->cpu_id;
+ cpi->cpi_clogid = cpu->cpu_id;
+ cpi->cpi_pkgcoreid = cpu->cpu_id;
+ cpi->cpi_procnodeid = cpi->cpi_chipid;
+ cpi->cpi_compunitid = cpi->cpi_coreid;
+ break;
+ }
+ }
+}
+
void
cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
{
@@ -1743,23 +2920,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
if (is_x86_feature(featureset, X86FSET_PAE))
cpi->cpi_pabits = 36;
- /*
- * Hyperthreading configuration is slightly tricky on Intel
- * and pure clones, and even trickier on AMD.
- *
- * (AMD chose to set the HTT bit on their CMP processors,
- * even though they're not actually hyperthreaded. Thus it
- * takes a bit more work to figure out what's really going
- * on ... see the handling of the CMP_LGCY bit below)
- */
- if (cp->cp_edx & CPUID_INTC_EDX_HTT) {
- cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
- if (cpi->cpi_ncpu_per_chip > 1)
- add_x86_feature(featureset, X86FSET_HTT);
- } else {
- cpi->cpi_ncpu_per_chip = 1;
- }
-
if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
struct cpuid_regs r, *ecp;
@@ -1816,11 +2976,11 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
if (xcpuid) {
cp = &cpi->cpi_extd[0];
- cp->cp_eax = 0x80000000;
+ cp->cp_eax = CPUID_LEAF_EXT_0;
cpi->cpi_xmaxeax = __cpuid_insn(cp);
}
- if (cpi->cpi_xmaxeax & 0x80000000) {
+ if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
@@ -1878,18 +3038,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
}
/*
- * If both the HTT and CMP_LGCY bits are set,
- * then we're not actually HyperThreaded. Read
- * "AMD CPUID Specification" for more details.
- */
- if (cpi->cpi_vendor == X86_VENDOR_AMD &&
- is_x86_feature(featureset, X86FSET_HTT) &&
- (cp->cp_ecx & CPUID_AMD_ECX_CMP_LGCY)) {
- remove_x86_feature(featureset, X86FSET_HTT);
- add_x86_feature(featureset, X86FSET_CMP);
- }
-
- /*
* It's really tricky to support syscall/sysret in
* the i386 kernel; we rely on sysenter/sysexit
* instead. In the amd64 kernel, things are -way-
@@ -1954,12 +3102,13 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
}
/*FALLTHROUGH*/
case X86_VENDOR_AMD:
- if (cpi->cpi_xmaxeax < 0x80000008)
+ if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
break;
cp = &cpi->cpi_extd[8];
- cp->cp_eax = 0x80000008;
+ cp->cp_eax = CPUID_LEAF_EXT_8;
(void) __cpuid_insn(cp);
- platform_cpuid_mangle(cpi->cpi_vendor, 0x80000008, cp);
+ platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
+ cp);
/*
* AMD uses ebx for some extended functions.
@@ -1995,41 +3144,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
}
/*
- * Derive the number of cores per chip
- */
- switch (cpi->cpi_vendor) {
- case X86_VENDOR_Intel:
- if (cpi->cpi_maxeax < 4) {
- cpi->cpi_ncore_per_chip = 1;
- break;
- } else {
- cpi->cpi_ncore_per_chip =
- BITX((cpi)->cpi_std[4].cp_eax, 31, 26) + 1;
- }
- break;
- case X86_VENDOR_AMD:
- if (cpi->cpi_xmaxeax < 0x80000008) {
- cpi->cpi_ncore_per_chip = 1;
- break;
- } else {
- /*
- * On family 0xf cpuid fn 2 ECX[7:0] "NC" is
- * 1 less than the number of physical cores on
- * the chip. In family 0x10 this value can
- * be affected by "downcoring" - it reflects
- * 1 less than the number of cores actually
- * enabled on this node.
- */
- cpi->cpi_ncore_per_chip =
- BITX((cpi)->cpi_extd[8].cp_ecx, 7, 0) + 1;
- }
- break;
- default:
- cpi->cpi_ncore_per_chip = 1;
- break;
- }
-
- /*
* Get CPUID data about TSC Invariance in Deep C-State.
*/
switch (cpi->cpi_vendor) {
@@ -2045,57 +3159,9 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
default:
break;
}
- } else {
- cpi->cpi_ncore_per_chip = 1;
}
- /*
- * If more than one core, then this processor is CMP.
- */
- if (cpi->cpi_ncore_per_chip > 1) {
- add_x86_feature(featureset, X86FSET_CMP);
- }
-
- /*
- * If the number of cores is the same as the number
- * of CPUs, then we cannot have HyperThreading.
- */
- if (cpi->cpi_ncpu_per_chip == cpi->cpi_ncore_per_chip) {
- remove_x86_feature(featureset, X86FSET_HTT);
- }
-
- cpi->cpi_apicid = CPI_APIC_ID(cpi);
- cpi->cpi_procnodes_per_pkg = 1;
- cpi->cpi_cores_per_compunit = 1;
- if (is_x86_feature(featureset, X86FSET_HTT) == B_FALSE &&
- is_x86_feature(featureset, X86FSET_CMP) == B_FALSE) {
- /*
- * Single-core single-threaded processors.
- */
- cpi->cpi_chipid = -1;
- cpi->cpi_clogid = 0;
- cpi->cpi_coreid = cpu->cpu_id;
- cpi->cpi_pkgcoreid = 0;
- if (cpi->cpi_vendor == X86_VENDOR_AMD)
- cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
- else
- cpi->cpi_procnodeid = cpi->cpi_chipid;
- } else if (cpi->cpi_ncpu_per_chip > 1) {
- if (cpi->cpi_vendor == X86_VENDOR_Intel)
- cpuid_intel_getids(cpu, featureset);
- else if (cpi->cpi_vendor == X86_VENDOR_AMD)
- cpuid_amd_getids(cpu);
- else {
- /*
- * All other processors are currently
- * assumed to have single cores.
- */
- cpi->cpi_coreid = cpi->cpi_chipid;
- cpi->cpi_pkgcoreid = 0;
- cpi->cpi_procnodeid = cpi->cpi_chipid;
- cpi->cpi_compunitid = cpi->cpi_chipid;
- }
- }
+ cpuid_pass1_topology(cpu, featureset);
/*
* Synthesize chip "revision" and socket type
@@ -2108,7 +3174,7 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
cpi->cpi_model, cpi->cpi_step);
if (cpi->cpi_vendor == X86_VENDOR_AMD) {
- if (cpi->cpi_xmaxeax >= 0x80000008 &&
+ if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
/* Special handling for AMD FP not necessary. */
cpi->cpi_fp_amd_save = 0;
@@ -2281,61 +3347,6 @@ cpuid_pass2(cpu_t *cpu)
}
}
- if (cpi->cpi_maxeax >= 0xB && cpi->cpi_vendor == X86_VENDOR_Intel) {
- struct cpuid_regs regs;
-
- cp = &regs;
- cp->cp_eax = 0xB;
- cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
-
- (void) __cpuid_insn(cp);
-
- /*
- * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
- * indicates that the extended topology enumeration leaf is
- * available.
- */
- if (cp->cp_ebx) {
- uint32_t x2apic_id;
- uint_t coreid_shift = 0;
- uint_t ncpu_per_core = 1;
- uint_t chipid_shift = 0;
- uint_t ncpu_per_chip = 1;
- uint_t i;
- uint_t level;
-
- for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
- cp->cp_eax = 0xB;
- cp->cp_ecx = i;
-
- (void) __cpuid_insn(cp);
- level = CPI_CPU_LEVEL_TYPE(cp);
-
- if (level == 1) {
- x2apic_id = cp->cp_edx;
- coreid_shift = BITX(cp->cp_eax, 4, 0);
- ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
- } else if (level == 2) {
- x2apic_id = cp->cp_edx;
- chipid_shift = BITX(cp->cp_eax, 4, 0);
- ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
- }
- }
-
- cpi->cpi_apicid = x2apic_id;
- cpi->cpi_ncpu_per_chip = ncpu_per_chip;
- cpi->cpi_ncore_per_chip = ncpu_per_chip /
- ncpu_per_core;
- cpi->cpi_chipid = x2apic_id >> chipid_shift;
- cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
- cpi->cpi_coreid = x2apic_id >> coreid_shift;
- cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
- }
-
- /* Make cp NULL so that we don't stumble on others */
- cp = NULL;
- }
-
/*
* XSAVE enumeration
*/
@@ -2548,10 +3559,10 @@ cpuid_pass2(cpu_t *cpu)
}
- if ((cpi->cpi_xmaxeax & 0x80000000) == 0)
+ if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
goto pass2_done;
- if ((nmax = cpi->cpi_xmaxeax - 0x80000000 + 1) > NMAX_CPI_EXTD)
+ if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
nmax = NMAX_CPI_EXTD;
/*
* Copy the extended properties, fixing them as we go.
@@ -2559,9 +3570,10 @@ cpuid_pass2(cpu_t *cpu)
*/
iptr = (void *)cpi->cpi_brandstr;
for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
- cp->cp_eax = 0x80000000 + n;
+ cp->cp_eax = CPUID_LEAF_EXT_0 + n;
(void) __cpuid_insn(cp);
- platform_cpuid_mangle(cpi->cpi_vendor, 0x80000000 + n, cp);
+ platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
+ cp);
switch (n) {
case 2:
case 3:
@@ -3013,26 +4025,42 @@ cpuid_pass3(cpu_t *cpu)
ASSERT(cpi->cpi_pass == 2);
/*
- * Function 4: Deterministic cache parameters
+ * Deterministic cache parameters
*
- * Take this opportunity to detect the number of threads
- * sharing the last level cache, and construct a corresponding
- * cache id. The respective cpuid_info members are initialized
- * to the default case of "no last level cache sharing".
+ * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
+ * values that are present are currently defined to be the same. This
+ * means we can use the same logic to parse it as long as we use the
+ * appropriate leaf to get the data. If you're updating this, make sure
+ * you're careful about which vendor supports which aspect.
+ *
+ * Take this opportunity to detect the number of threads sharing the
+ * last level cache, and construct a corresponding cache id. The
+ * respective cpuid_info members are initialized to the default case of
+ * "no last level cache sharing".
*/
cpi->cpi_ncpu_shr_last_cache = 1;
cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
- if (cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) {
+ if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
+ (cpi->cpi_vendor == X86_VENDOR_AMD &&
+ cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
+ is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
+ uint32_t leaf;
+
+ if (cpi->cpi_vendor == X86_VENDOR_Intel) {
+ leaf = 4;
+ } else {
+ leaf = CPUID_LEAF_EXT_1d;
+ }
/*
- * Find the # of elements (size) returned by fn 4, and along
+ * Find the # of elements (size) returned by the leaf and along
* the way detect last level cache sharing details.
*/
bzero(&regs, sizeof (regs));
cp = &regs;
for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
- cp->cp_eax = 4;
+ cp->cp_eax = leaf;
cp->cp_ecx = i;
(void) __cpuid_insn(cp);
@@ -3046,29 +4074,33 @@ cpuid_pass3(cpu_t *cpu)
CPI_NTHR_SHR_CACHE(cp) + 1;
}
}
- cpi->cpi_std_4_size = size = i;
+ cpi->cpi_cache_leaf_size = size = i;
/*
- * Allocate the cpi_std_4 array. The first element
- * references the regs for fn 4, %ecx == 0, which
- * cpuid_pass2() stashed in cpi->cpi_std[4].
+ * Allocate the cpi_cache_leaves array. The first element
+ * references the regs for the corresponding leaf with %ecx set
+ * to 0. This was gathered in cpuid_pass2().
*/
if (size > 0) {
- cpi->cpi_std_4 =
+ cpi->cpi_cache_leaves =
kmem_alloc(size * sizeof (cp), KM_SLEEP);
- cpi->cpi_std_4[0] = &cpi->cpi_std[4];
+ if (cpi->cpi_vendor == X86_VENDOR_Intel) {
+ cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
+ } else {
+ cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
+ }
/*
* Allocate storage to hold the additional regs
- * for function 4, %ecx == 1 .. cpi_std_4_size.
+ * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
*
- * The regs for fn 4, %ecx == 0 has already
+ * The regs for the leaf, %ecx == 0 has already
* been allocated as indicated above.
*/
for (i = 1; i < size; i++) {
- cp = cpi->cpi_std_4[i] =
+ cp = cpi->cpi_cache_leaves[i] =
kmem_zalloc(sizeof (regs), KM_SLEEP);
- cp->cp_eax = 4;
+ cp->cp_eax = leaf;
cp->cp_ecx = i;
(void) __cpuid_insn(cp);
@@ -3090,7 +4122,7 @@ cpuid_pass3(cpu_t *cpu)
/*
* Now fixup the brand string
*/
- if ((cpi->cpi_xmaxeax & 0x80000000) == 0) {
+ if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
fabricate_brandstr(cpi);
} else {
@@ -3497,20 +4529,22 @@ cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
/*
* CPUID data is cached in two separate places: cpi_std for standard
- * CPUID functions, and cpi_extd for extended CPUID functions.
+ * CPUID leaves, and cpi_extd for extended CPUID leaves.
*/
- if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD)
+ if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
xcp = &cpi->cpi_std[cp->cp_eax];
- else if (cp->cp_eax >= 0x80000000 && cp->cp_eax <= cpi->cpi_xmaxeax &&
- cp->cp_eax < 0x80000000 + NMAX_CPI_EXTD)
- xcp = &cpi->cpi_extd[cp->cp_eax - 0x80000000];
- else
+ } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
+ cp->cp_eax <= cpi->cpi_xmaxeax &&
+ cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
+ xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
+ } else {
/*
* The caller is asking for data from an input parameter which
* the kernel has not cached. In this case we go fetch from
* the hardware and return the data directly to the user.
*/
return (__cpuid_insn(cp));
+ }
cp->cp_eax = xcp->cp_eax;
cp->cp_ebx = xcp->cp_ebx;
@@ -4410,17 +5444,18 @@ intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
uint32_t level, i;
int ret = 0;
- for (i = 0; i < cpi->cpi_std_4_size; i++) {
- level = CPI_CACHE_LVL(cpi->cpi_std_4[i]);
+ for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
+ level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
if (level == 2 || level == 3) {
- ct->ct_assoc = CPI_CACHE_WAYS(cpi->cpi_std_4[i]) + 1;
+ ct->ct_assoc =
+ CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
ct->ct_line_size =
- CPI_CACHE_COH_LN_SZ(cpi->cpi_std_4[i]) + 1;
+ CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
ct->ct_size = ct->ct_assoc *
- (CPI_CACHE_PARTS(cpi->cpi_std_4[i]) + 1) *
+ (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
ct->ct_line_size *
- (cpi->cpi_std_4[i]->cp_ecx + 1);
+ (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
if (level == 2) {
ct->ct_label = l2_cache_str;
@@ -5429,69 +6464,21 @@ patch_memops(uint_t vendor)
#endif /* __amd64 && !__xpv */
/*
- * This function finds the number of bits to represent the number of cores per
- * chip and the number of strands per core for the Intel platforms.
- * It re-uses the x2APIC cpuid code of the cpuid_pass2().
+ * We're being asked to tell the system how many bits are required to represent
+ * the various core and strand IDs.
*/
void
-cpuid_get_ext_topo(uint_t vendor, uint_t *core_nbits, uint_t *strand_nbits)
+cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
{
- struct cpuid_regs regs;
- struct cpuid_regs *cp = &regs;
-
- if (vendor != X86_VENDOR_Intel) {
- return;
- }
-
- /* if the cpuid level is 0xB, extended topo is available. */
- cp->cp_eax = 0;
- if (__cpuid_insn(cp) >= 0xB) {
-
- cp->cp_eax = 0xB;
- cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
- (void) __cpuid_insn(cp);
-
- /*
- * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
- * indicates that the extended topology enumeration leaf is
- * available.
- */
- if (cp->cp_ebx) {
- uint_t coreid_shift = 0;
- uint_t chipid_shift = 0;
- uint_t i;
- uint_t level;
-
- for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
- cp->cp_eax = 0xB;
- cp->cp_ecx = i;
+ struct cpuid_info *cpi;
+ uint_t nthreads;
- (void) __cpuid_insn(cp);
- level = CPI_CPU_LEVEL_TYPE(cp);
-
- if (level == 1) {
- /*
- * Thread level processor topology
- * Number of bits shift right APIC ID
- * to get the coreid.
- */
- coreid_shift = BITX(cp->cp_eax, 4, 0);
- } else if (level == 2) {
- /*
- * Core level processor topology
- * Number of bits shift right APIC ID
- * to get the chipid.
- */
- chipid_shift = BITX(cp->cp_eax, 4, 0);
- }
- }
+ VERIFY(cpuid_checkpass(CPU, 1));
+ cpi = cpu->cpu_m.mcpu_cpi;
- if (coreid_shift > 0 && chipid_shift > coreid_shift) {
- *strand_nbits = coreid_shift;
- *core_nbits = chipid_shift - coreid_shift;
- }
- }
- }
+ nthreads = cpi->cpi_ncpu_per_chip / cpi->cpi_ncore_per_chip;
+ *core_nbits = ddi_fls(cpi->cpi_ncore_per_chip);
+ *strand_nbits = ddi_fls(nthreads);
}
void
@@ -5527,19 +6514,19 @@ cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
(cpi->cpi_family == 5 && cpi->cpi_model < 1))
return;
- if (cpi->cpi_xmaxeax < 0x80000008) {
+ if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
bzero(&cp, sizeof (cp));
- cp.cp_eax = 0x80000000;
+ cp.cp_eax = CPUID_LEAF_EXT_0;
cpi->cpi_xmaxeax = __cpuid_insn(&cp);
- if (cpi->cpi_xmaxeax < 0x80000008) {
+ if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
return;
}
}
bzero(&cp, sizeof (cp));
- cp.cp_eax = 0x80000008;
+ cp.cp_eax = CPUID_LEAF_EXT_8;
(void) __cpuid_insn(&cp);
- platform_cpuid_mangle(cpi->cpi_vendor, 0x80000008, &cp);
+ platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
cpi->cpi_extd[8] = cp;
} else {
/*