Diffstat (limited to 'usr/src/uts/i86pc/os/cpuid.c')
-rw-r--r-- | usr/src/uts/i86pc/os/cpuid.c | 1771
1 files changed, 1379 insertions, 392 deletions
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 1291b6180d..ddc09a4951 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -34,9 +34,852 @@ /* * Copyright (c) 2019, Joyent, Inc. */ + /* - * Various routines to handle identification - * and classification of x86 processors. + * CPU Identification logic + * + * The purpose of this file and its companion, cpuid_subr.c, is to help deal + * with the identification of CPUs, their features, and their topologies. More + * specifically, this file helps drive the following: + * + * 1. Enumeration of features of the processor which are used by the kernel to + * determine what features to enable or disable. These may be instruction set + * enhancements or features that we use. + * + * 2. Enumeration of instruction set architecture (ISA) additions that userland + * will be told about through the auxiliary vector. + * + * 3. Understanding the physical topology of the CPU such as the number of + * caches, how many cores it has, whether or not it supports symmetric + * multi-processing (SMT), etc. + * + * ------------------------ + * CPUID History and Basics + * ------------------------ + * + * The cpuid instruction was added by Intel roughly around the time that the + * original Pentium was introduced. The purpose of cpuid was to tell in a + * programmatic fashion information about the CPU that previously was guessed + * at. For example, an important part of cpuid is that we can know what + * extensions to the ISA exist. If you use an invalid opcode you would get a + * #UD, so this method allows a program (whether a user program or the kernel) + * to determine what exists without crashing or getting a SIGILL. Of course, + * this was also during the era of the clones and the AMD Am5x86. The vendor + * name shows up first in cpuid for a reason. + * + * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts + * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has + * its own meaning. The different leaves are broken down into different regions: + * + * [ 0, 7fffffff ] This region is called the 'basic' + * region. This region is generally defined + * by Intel, though some of the original + * portions have different meanings based + * on the manufacturer. These days, Intel + * adds most new features to this region. + * AMD adds non-Intel compatible + * information in the third, extended + * region. Intel uses this for everything + * including ISA extensions, CPU + * features, cache information, topology, + * and more. + * + * There is a hole carved out of this + * region which is reserved for + * hypervisors. + * + * [ 40000000, 4fffffff ] This region, which is found in the + * middle of the previous region, is + * explicitly promised to never be used by + * CPUs. Instead, it is used by hypervisors + * to communicate information about + * themselves to the operating system. The + * values and details are unique for each + * hypervisor. + * + * [ 80000000, ffffffff ] This region is called the 'extended' + * region. Some of the low leaves mirror + * parts of the basic leaves. This region + * has generally been used by AMD for + * various extensions. For example, AMD- + * specific information about caches, + * features, and topology are found in this + * region. + * + * To specify a range, you place the desired leaf into %eax, zero %ebx, %ecx, + * and %edx, and then issue the cpuid instruction. 
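As a concrete illustration of the mechanics just described, here is a minimal userland sketch that issues cpuid for leaf 0 and reads back %eax (which, as described next, holds the maximum valid leaf for the range) along with the vendor string. It assumes the __get_cpuid() helper from GCC/Clang's <cpuid.h>; the kernel itself uses its own __cpuid_insn() wrapper instead.

#include <stdio.h>
#include <string.h>
#include <cpuid.h>	/* GCC/Clang helper; not the kernel interface */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	/*
	 * Leaf 0: %eax returns the maximum valid basic leaf, while
	 * %ebx, %edx, and %ecx hold the 12-character vendor string.
	 */
	if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) == 0)
		return (1);

	(void) memcpy(vendor, &ebx, sizeof (ebx));
	(void) memcpy(vendor + 4, &edx, sizeof (edx));
	(void) memcpy(vendor + 8, &ecx, sizeof (ecx));
	vendor[12] = '\0';

	(void) printf("max basic leaf 0x%x, vendor %s\n", eax, vendor);
	return (0);
}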
At the first leaf in each of + * the ranges, one of the primary things returned is the maximum valid leaf in + * that range. This allows for discovery of what range of CPUID is valid. + * + * The CPUs have potentially surprising behavior when using an invalid leaf or + * unimplemented leaf. If the requested leaf is within the valid basic or + * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be + * set to zero. However, if you specify a leaf that is outside of a valid range, + * then instead it will be filled with the last valid _basic_ leaf. For example, + * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or + * an invalid extended leaf will return the information for leaf 3. + * + * Some leaves are broken down into sub-leaves. This means that the value + * depends on both the leaf asked for in %eax and a secondary register. For + * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get + * additional information. Or when getting topology information in leaf 0xb, the + * initial value in %ecx changes which level of the topology that you are + * getting information about. + * + * cpuid values are always kept to 32 bits regardless of whether or not the + * program is in 64-bit mode. When executing in 64-bit mode, the upper + * 32 bits of the register are always set to zero so that way the values are the + * same regardless of execution mode. + * + * ---------------------- + * Identifying Processors + * ---------------------- + * + * We can identify a processor in two steps. The first step looks at cpuid leaf + * 0. Leaf 0 contains the processor's vendor information. This is done by + * putting a 12 character string in %ebx, %ecx, and %edx. On AMD, it is + * 'AuthenticAMD' and on Intel it is 'GenuineIntel'. + * + * From there, a processor is identified by a combination of three different + * values: + * + * 1. Family + * 2. Model + * 3. Stepping + * + * Each vendor uses the family and model to uniquely identify a processor. The + * way that family and model are changed depends on the vendor. For example, + * Intel has been using family 0x6 for almost all of their processor since the + * Pentium Pro/Pentium II era, often called the P6. The model is used to + * identify the exact processor. Different models are often used for the client + * (consumer) and server parts. Even though each processor often has major + * architectural differences, they still are considered the same family by + * Intel. + * + * On the other hand, each major AMD architecture generally has its own family. + * For example, the K8 is family 0x10, Bulldozer 0x15, and Zen 0x17. Within it + * the model number is used to help identify specific processors. + * + * The stepping is used to refer to a revision of a specific microprocessor. The + * term comes from equipment used to produce masks that are used to create + * integrated circuits. + * + * The information is present in leaf 1, %eax. In technical documentation you + * will see the terms extended model and extended family. The original family, + * model, and stepping fields were each 4 bits wide. If the values in either + * are 0xf, then one is to consult the extended model and extended family, which + * take previously reserved bits and allow for a larger number of models and add + * 0xf to them. + * + * When we process this information, we store the full family, model, and + * stepping in the struct cpuid_info members cpi_family, cpi_model, and + * cpi_step, respectively. 
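To make the extended family and model arithmetic concrete, the following hedged sketch decodes a raw leaf 1 %eax value into an effective family, model, and stepping. Bit positions follow the vendor manuals, and __get_cpuid() is again an assumption about the toolchain; the kernel performs the equivalent computation when it fills in cpi_family, cpi_model, and cpi_step.

#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>	/* assumed GCC/Clang toolchain, as in the earlier sketch */

static void
decode_signature(uint32_t eax, uint32_t *family, uint32_t *model,
    uint32_t *stepping)
{
	uint32_t base_family = (eax >> 8) & 0xf;
	uint32_t base_model = (eax >> 4) & 0xf;
	uint32_t ext_family = (eax >> 20) & 0xff;
	uint32_t ext_model = (eax >> 16) & 0xf;

	*stepping = eax & 0xf;
	*family = base_family;
	*model = base_model;

	/*
	 * The extended fields only apply when the base family saturates
	 * at 0xf; Intel additionally defines the extended model for
	 * family 0x6.
	 */
	if (base_family == 0xf)
		*family += ext_family;
	if (base_family == 0xf || base_family == 0x6)
		*model += ext_model << 4;
}

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	uint32_t family, model, stepping;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
		return (1);
	decode_signature(eax, &family, &model, &stepping);
	(void) printf("family 0x%x, model 0x%x, stepping 0x%x\n",
	    family, model, stepping);
	return (0);
}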
Whenever you are performing comparisons with the + * family, model, and stepping, you should use these members and not the raw + * values from cpuid. If you must use the raw values from cpuid directly, you + * must make sure that you add the extended model and family to the base model + * and family. + * + * In general, we do not use information about the family, model, and stepping + * to determine whether or not a feature is present; that is generally driven by + * specific leaves. However, when something we care about on the processor is + * not considered 'architectural' meaning that it is specific to a set of + * processors and not promised in the architecture model to be consistent from + * generation to generation, then we will fall back on this information. The + * most common cases where this comes up is when we have to workaround errata in + * the processor, are dealing with processor-specific features such as CPU + * performance counters, or we want to provide additional information for things + * such as fault management. + * + * While processors also do have a brand string, which is the name that people + * are familiar with when buying the processor, they are not meant for + * programmatic consumption. That is what the family, model, and stepping are + * for. + * + * ------------ + * CPUID Passes + * ------------ + * + * As part of performing feature detection, we break this into several different + * passes. The passes are as follows: + * + * Pass 0 This is a primordial pass done in locore.s to deal with + * Cyrix CPUs that don't support cpuid. The reality is that + * we likely don't run on them any more, but there is still + * logic for handling them. + * + * Pass 1 This is the primary pass and is responsible for doing a + * large number of different things: + * + * 1. Determine which vendor manufactured the CPU and + * determining the family, model, and stepping information. + * + * 2. Gathering a large number of feature flags to + * determine which features the CPU support and which + * indicate things that we need to do other work in the OS + * to enable. Features detected this way are added to the + * x86_featureset which can be queried to + * determine what we should do. This includes processing + * all of the basic and extended CPU features that we care + * about. + * + * 3. Determining the CPU's topology. This includes + * information about how many cores and threads are present + * in the package. It also is responsible for figuring out + * which logical CPUs are potentially part of the same core + * and what other resources they might share. For more + * information see the 'Topology' section. + * + * 4. Determining the set of CPU security-specific features + * that we need to worry about and determine the + * appropriate set of workarounds. + * + * Pass 1 on the boot CPU occurs before KMDB is started. + * + * Pass 2 The second pass is done after startup(). Here, we check + * other miscellaneous features. Most of this is gathering + * additional basic and extended features that we'll use in + * later passes or for debugging support. + * + * Pass 3 The third pass occurs after the kernel memory allocator + * has been fully initialized. This gathers information + * where we might need dynamic memory available for our + * uses. This includes several varying width leaves that + * have cache information and the processor's brand string. + * + * Pass 4 The fourth and final normal pass is performed after the + * kernel has brought most everything online. 
This is + * invoked from post_startup(). In this pass, we go through + * the set of features that we have enabled and turn that + * into the hardware auxiliary vector features that + * userland receives. This is used by userland, primarily + * by the run-time link-editor (RTLD), though userland + * software could also refer to it directly. + * + * Microcode After a microcode update, we do a selective rescan of + * the cpuid leaves to determine what features have + * changed. Microcode updates can provide more details + * about security related features to deal with issues like + * Spectre and L1TF. On occasion, vendors have violated + * their contract and removed bits. However, we don't try + * to detect that because that puts us in a situation that + * we really can't deal with. As such, the only thing we + * rescan are security related features today. See + * cpuid_pass_ucode(). + * + * All of the passes (except pass 0) are run on all CPUs. However, for the most + * part we only care about what the boot CPU says about this information and use + * the other CPUs as a rough guide to sanity check that we have the same feature + * set. + * + * We do not support running multiple logical CPUs with disjoint, let alone + * different, feature sets. + * + * ------------------ + * Processor Topology + * ------------------ + * + * One of the important things that we need to do is to understand the topology + * of the underlying processor. When we say topology in this case, we're trying + * to understand the relationship between the logical CPUs that the operating + * system sees and the underlying physical layout. Different logical CPUs may + * share different resources which can have important consequences for the + * performance of the system. For example, they may share caches, execution + * units, and more. + * + * The topology of the processor changes from generation to generation and + * vendor to vendor. Along with that, different vendors use different + * terminology, and the operating system itself uses occasionally overlapping + * terminology. It's important to understand what this topology looks like so + * one can understand the different things that we try to calculate and + * determine. + * + * To get started, let's talk about a little bit of terminology that we've used + * so far, is used throughout this file, and is fairly generic across multiple + * vendors: + * + * CPU + * A central processing unit (CPU) refers to a logical and/or virtual + * entity that the operating system can execute instructions on. The + * underlying resources for this CPU may be shared between multiple + * entities; however, to the operating system it is a discrete unit. + * + * PROCESSOR and PACKAGE + * + * Generally, when we use the term 'processor' on its own, we are referring + * to the physical entity that one buys and plugs into a board. However, + * because processor has been overloaded and one might see it used to mean + * multiple different levels, we will instead use the term 'package' for + * the rest of this file. The term package comes from the electrical + * engineering side and refers to the physical entity that encloses the + * electronics inside. Strictly speaking the package can contain more than + * just the CPU, for example, on many processors it may also have what's + * called an 'integrated graphical processing unit (GPU)'. Because the + * package can encapsulate multiple units, it is the largest physical unit + * that we refer to. 
+ * + * SOCKET + * + * A socket refers to unit on a system board (generally the motherboard) + * that can receive a package. A single package, or processor, is plugged + * into a single socket. A system may have multiple sockets. Often times, + * the term socket is used interchangeably with package and refers to the + * electrical component that has plugged in, and not the receptacle itself. + * + * CORE + * + * A core refers to the physical instantiation of a CPU, generally, with a + * full set of hardware resources available to it. A package may contain + * multiple cores inside of it or it may just have a single one. A + * processor with more than one core is often referred to as 'multi-core'. + * In illumos, we will use the feature X86FSET_CMP to refer to a system + * that has 'multi-core' processors. + * + * A core may expose a single logical CPU to the operating system, or it + * may expose multiple CPUs, which we call threads, defined below. + * + * Some resources may still be shared by cores in the same package. For + * example, many processors will share the level 3 cache between cores. + * Some AMD generations share hardware resources between cores. For more + * information on that see the section 'AMD Topology'. + * + * THREAD and STRAND + * + * In this file, generally a thread refers to a hardware resources and not + * the operating system's logical abstraction. A thread is always exposed + * as an independent logical CPU to the operating system. A thread belongs + * to a specific core. A core may have more than one thread. When that is + * the case, the threads that are part of the same core are often referred + * to as 'siblings'. + * + * When multiple threads exist, this is generally referred to as + * simultaneous multi-threading (SMT). When Intel introduced this in their + * processors they called it hyper-threading (HT). When multiple threads + * are active in a core, they split the resources of the core. For example, + * two threads may share the same set of hardware execution units. + * + * The operating system often uses the term 'strand' to refer to a thread. + * This helps disambiguate it from the software concept. + * + * CHIP + * + * Unfortunately, the term 'chip' is dramatically overloaded. At its most + * base meaning, it is used to refer to a single integrated circuit, which + * may or may not be the only thing in the package. In illumos, when you + * see the term 'chip' it is almost always referring to the same thing as + * the 'package'. However, many vendors may use chip to refer to one of + * many integrated circuits that have been placed in the package. As an + * example, see the subsequent definition. + * + * To try and keep things consistent, we will only use chip when referring + * to the entire integrated circuit package, with the exception of the + * definition of multi-chip module (because it is in the name) and use the + * term 'die' when we want the more general, potential sub-component + * definition. + * + * DIE + * + * A die refers to an integrated circuit. Inside of the package there may + * be a single die or multiple dies. This is sometimes called a 'chip' in + * vendor's parlance, but in this file, we use the term die to refer to a + * subcomponent. + * + * MULTI-CHIP MODULE + * + * A multi-chip module (MCM) refers to putting multiple distinct chips that + * are connected together in the same package. When a multi-chip design is + * used, generally each chip is manufactured independently and then joined + * together in the package. 
For example, on AMD's Zen microarchitecture + * (family 0x17), the package contains several dies (the second meaning of + * chip from above) that are connected together. + * + * CACHE + * + * A cache is a part of the processor that maintains copies of recently + * accessed memory. Caches are split into levels and then into types. + * Commonly there are one to three levels, called level one, two, and + * three. The lower the level, the smaller it is, the closer it is to the + * execution units of the CPU, and the faster it is to access. The layout + * and design of the cache come in many different flavors, consult other + * resources for a discussion of those. + * + * Caches are generally split into two types, the instruction and data + * cache. The caches contain what their names suggest, the instruction + * cache has executable program text, while the data cache has all other + * memory that the processor accesses. As of this writing, data is kept + * coherent between all of the caches on x86, so if one modifies program + * text before it is executed, that will be in the data cache, and the + * instruction cache will be synchronized with that change when the + * processor actually executes those instructions. This coherency also + * covers the fact that data could show up in multiple caches. + * + * Generally, the lowest level caches are specific to a core. However, the + * last layer cache is shared between some number of cores. The number of + * CPUs sharing this last level cache is important. This has implications + * for the choices that the scheduler makes, as accessing memory that might + * be in a remote cache after thread migration can be quite expensive. + * + * Sometimes, the word cache is abbreviated with a '$', because in US + * English the word cache is pronounced the same as cash. So L1D$ refers to + * the L1 data cache, and L2$ would be the L2 cache. This will not be used + * in the rest of this theory statement for clarity. + * + * MEMORY CONTROLLER + * + * The memory controller is a component that provides access to DRAM. Each + * memory controller can access a set number of DRAM channels. Each channel + * can have a number of DIMMs (sticks of memory) associated with it. A + * given package may have more than one memory controller. The association + * of the memory controller to a group of cores is important as it is + * cheaper to access memory on the controller that you are associated with. + * + * NUMA + * + * NUMA or non-uniform memory access, describes a way that systems are + * built. On x86, any processor core can address all of the memory in the + * system. However, When using multiple sockets or possibly within a + * multi-chip module, some of that memory is physically closer and some of + * it is further. Memory that is further away is more expensive to access. + * Consider the following image of multiple sockets with memory: + * + * +--------+ +--------+ + * | DIMM A | +----------+ +----------+ | DIMM D | + * +--------+-+ | | | | +-+------+-+ + * | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E | + * +--------+-+ | | | | +-+------+-+ + * | DIMM C | +----------+ +----------+ | DIMM F | + * +--------+ +--------+ + * + * In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is + * closer to DIMMs D-F. This means that it is cheaper for socket 0 to + * access DIMMs A-C and more expensive to access D-F as it has to go + * through Socket 1 to get there. The inverse is true for Socket 1. DIMMs + * D-F are cheaper than A-C. 
While the socket form is the most common, when + * using multi-chip modules, this can also sometimes occur. For another + * example of this that's more involved, see the AMD topology section. + * + * + * Intel Topology + * -------------- + * + * Most Intel processors since Nehalem, (as of this writing the current gen + * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of + * the package is a single monolithic die. MCMs currently aren't used. Most + * parts have three levels of caches, with the L3 cache being shared between + * all of the cores on the package. The L1/L2 cache is generally specific to + * an individual core. The following image shows at a simplified level what + * this looks like. The memory controller is commonly part of something called + * the 'Uncore', that used to be separate physical chips that were not a part of + * the package, but are now part of the same chip. + * + * +-----------------------------------------------------------------------+ + * | Package | + * | +-------------------+ +-------------------+ +-------------------+ | + * | | Core | | Core | | Core | | + * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | + * | | | Thread | | L | | | | Thread | | L | | | | Thread | | L | | | + * | | +--------+ | 1 | | | +--------+ | 1 | | | +--------+ | 1 | | | + * | | +--------+ | | | | +--------+ | | | | +--------+ | | | | + * | | | Thread | | | | | | Thread | | | | | | Thread | | | | | + * | | +--------+ +---+ | | +--------+ +---+ | | +--------+ +---+ | | + * | | +--------------+ | | +--------------+ | | +--------------+ | | + * | | | L2 Cache | | | | L2 Cache | | | | L2 Cache | | | + * | | +--------------+ | | +--------------+ | | +--------------+ | | + * | +-------------------+ +-------------------+ +-------------------+ | + * | +-------------------------------------------------------------------+ | + * | | Shared L3 Cache | | + * | +-------------------------------------------------------------------+ | + * | +-------------------------------------------------------------------+ | + * | | Memory Controller | | + * | +-------------------------------------------------------------------+ | + * +-----------------------------------------------------------------------+ + * + * A side effect of this current architecture is that what we care about from a + * scheduling and topology perspective, is simplified. In general we care about + * understanding which logical CPUs are part of the same core and socket. + * + * To determine the relationship between threads and cores, Intel initially used + * the identifier in the advanced programmable interrupt controller (APIC). They + * also added cpuid leaf 4 to give additional information about the number of + * threads and CPUs in the processor. With the addition of x2apic (which + * increased the number of addressable logical CPUs from 8-bits to 32-bits), an + * additional cpuid topology leaf 0xB was added. + * + * AMD Topology + * ------------ + * + * When discussing AMD topology, we want to break this into three distinct + * generations of topology. There's the basic topology that has been used in + * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced + * with family 0x15 (Bulldozer), and there's the topology that was introduced + * with family 0x17 (Zen). AMD also has some additional terminology that's worth + * talking about. + * + * Until the introduction of family 0x17 (Zen), AMD did not implement something + * that they considered SMT. 
Whether or not the AMD processors have SMT + * influences many things including scheduling and reliability, availability, + * and serviceability (RAS) features. + * + * NODE + * + * AMD uses the term node to refer to a die that contains a number of cores + * and I/O resources. Depending on the processor family and model, more + * than one node can be present in the package. When there is more than one + * node this indicates a multi-chip module. Usually each node has its own + * access to memory and I/O devices. This is important and generally + * different from the corresponding Intel Nehalem-Skylake+ processors. As a + * result, we track this relationship in the operating system. + * + * In processors with an L3 cache, the L3 cache is generally shared across + * the entire node, though the way this is carved up varies from generation + * to generation. + * + * BULLDOZER + * + * Starting with the Bulldozer family (0x15) and continuing until the + * introduction of the Zen microarchitecture, AMD introduced the idea of a + * compute unit. In a compute unit, two traditional cores share a number of + * hardware resources. Critically, they share the FPU, L1 instruction + * cache, and the L2 cache. Several compute units were then combined inside + * of a single node. Because the integer execution units, L1 data cache, + * and some other resources were not shared between the cores, AMD never + * considered this to be SMT. + * + * ZEN + * + * The Zen family (0x17) uses a multi-chip module (MCM) design, the module + * is called Zeppelin. These modules are similar to the idea of nodes used + * previously. Each of these nodes has two DRAM channels which all of the + * cores in the node can access uniformly. These nodes are linked together + * in the package, creating a NUMA environment. + * + * The Zeppelin die itself contains two different 'core complexes'. Each + * core complex consists of four cores which each have two threads, for a + * total of 8 logical CPUs per complex. Unlike other generations, + * where all the logical CPUs in a given node share the L3 cache, here each + * core complex has its own shared L3 cache. + * + * A further thing that we need to consider is that in some configurations, + * particularly with the Threadripper line of processors, not every die + * actually has its memory controllers wired up to actual memory channels. + * This means that some cores have memory attached to them and others + * don't. 
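To make the counting concrete before the diagrams that follow, here is a small sketch that tallies up the example configuration pictured below (four Zeppelin dies per package; the per-complex and per-core counts come from the description above and are illustrative of that example rather than of every Zen SKU).

#include <stdio.h>

int
main(void)
{
	/* Example first-generation Zen package, per the discussion above. */
	const unsigned int threads_per_core = 2;
	const unsigned int cores_per_complex = 4;
	const unsigned int complexes_per_die = 2;
	const unsigned int dies_per_package = 4;

	unsigned int l3_caches = complexes_per_die * dies_per_package;
	unsigned int logical_cpus = threads_per_core * cores_per_complex *
	    complexes_per_die * dies_per_package;

	/*
	 * Each core complex has its own L3, so the last-level cache is
	 * shared only by the logical CPUs of a single complex, while each
	 * die forms its own NUMA node.
	 */
	(void) printf("logical CPUs per package: %u\n", logical_cpus);
	(void) printf("last-level (L3) caches per package: %u\n", l3_caches);
	(void) printf("logical CPUs sharing each L3: %u\n",
	    threads_per_core * cores_per_complex);
	(void) printf("NUMA nodes (dies) per package: %u\n", dies_per_package);
	return (0);
}

For this example configuration that works out to 64 logical CPUs, eight L3 caches each shared by eight logical CPUs, and four NUMA nodes in a single package, which is why the last-level cache and node identifiers described later are tracked separately from the package.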
+ * + * To put Zen in perspective, consider the following images: + * + * +--------------------------------------------------------+ + * | Core Complex | + * | +-------------------+ +-------------------+ +---+ | + * | | Core +----+ | | Core +----+ | | | | + * | | +--------+ | L2 | | | +--------+ | L2 | | | | | + * | | | Thread | +----+ | | | Thread | +----+ | | | | + * | | +--------+-+ +--+ | | +--------+-+ +--+ | | L | | + * | | | Thread | |L1| | | | Thread | |L1| | | 3 | | + * | | +--------+ +--+ | | +--------+ +--+ | | | | + * | +-------------------+ +-------------------+ | C | | + * | +-------------------+ +-------------------+ | a | | + * | | Core +----+ | | Core +----+ | | c | | + * | | +--------+ | L2 | | | +--------+ | L2 | | | h | | + * | | | Thread | +----+ | | | Thread | +----+ | | e | | + * | | +--------+-+ +--+ | | +--------+-+ +--+ | | | | + * | | | Thread | |L1| | | | Thread | |L1| | | | | + * | | +--------+ +--+ | | +--------+ +--+ | | | | + * | +-------------------+ +-------------------+ +---+ | + * | | + * +--------------------------------------------------------+ + * + * This first image represents a single Zen core complex that consists of four + * cores. + * + * + * +--------------------------------------------------------+ + * | Zeppelin Die | + * | +--------------------------------------------------+ | + * | | I/O Units (PCIe, SATA, USB, etc.) | | + * | +--------------------------------------------------+ | + * | HH | + * | +-----------+ HH +-----------+ | + * | | | HH | | | + * | | Core |==========| Core | | + * | | Complex |==========| Complex | | + * | | | HH | | | + * | +-----------+ HH +-----------+ | + * | HH | + * | +--------------------------------------------------+ | + * | | Memory Controller | | + * | +--------------------------------------------------+ | + * | | + * +--------------------------------------------------------+ + * + * This image represents a single Zeppelin Die. Note how both cores are + * connected to the same memory controller and I/O units. While each core + * complex has its own L3 cache as seen in the first image, they both have + * uniform access to memory. + * + * + * PP PP + * PP PP + * +----------PP---------------------PP---------+ + * | PP PP | + * | +-----------+ +-----------+ | + * | | | | | | + * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM + * MMMMMMMMM| Die |==========| Die |MMMMMMMMM + * | | | | | | + * | +-----------+ooo ...+-----------+ | + * | HH ooo ... HH | + * | HH oo.. HH | + * | HH ..oo HH | + * | HH ... ooo HH | + * | +-----------+... ooo+-----------+ | + * | | | | | | + * MMMMMMMMM| Zeppelin |==========| Zeppelin |MMMMMMMMM + * MMMMMMMMM| Die |==========| Die |MMMMMMMMM + * | | | | | | + * | +-----------+ +-----------+ | + * | PP PP | + * +----------PP---------------------PP---------+ + * PP PP + * PP PP + * + * This image represents a single Zen package. In this example, it has four + * Zeppelin dies, though some configurations only have a single one. In this + * example, each die is directly connected to the next. Also, each die is + * represented as being connected to memory by the 'M' character and connected + * to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin + * die is made up of two core complexes, we have multiple different NUMA + * domains that we care about for these systems. + * + * CPUID LEAVES + * + * There are a few different CPUID leaves that we can use to try and understand + * the actual state of the world. 
As part of the introduction of family 0xf, AMD + * added CPUID leaf 0x80000008. This leaf tells us the number of logical + * processors that are in the system. Because families before Zen didn't have + * SMT, this was always the number of cores that were in the system. However, it + * should always be thought of as the number of logical threads to be consistent + * between generations. In addition we also get the size of the APIC ID that is + * used to represent the number of logical processors. This is important for + * deriving topology information. + * + * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a + * bit between Bulldozer and later families, but it is quite useful in + * determining the topology information. Because this information has changed + * across family generations, it's worth calling out what these mean + * explicitly. The registers have the following meanings: + * + * %eax The APIC ID. The entire register is defined to have a 32-bit + * APIC ID, even though on systems without x2apic support, it will + * be limited to 8 bits. + * + * %ebx On Bulldozer-era systems this contains information about the + * number of cores that are in a compute unit (cores that share + * resources). It also contains a per-package compute unit ID that + * identifies which compute unit the logical CPU is a part of. + * + * On Zen-era systems this instead contains the number of threads + * per core and the ID of the core that the logical CPU is a part + * of. Note, this ID is unique only to the package, it is not + * globally unique across the entire system. + * + * %ecx This contains the number of nodes that exist in the package. It + * also contains an ID that identifies which node the logical CPU + * is a part of. + * + * Finally, we also use cpuid leaf 0x8000001D to determine information about the + * cache layout to determine which logical CPUs are sharing which caches. + * + * illumos Topology + * ---------------- + * + * Based on the above we synthesize the information into several different + * variables that we store in the 'struct cpuid_info'. We'll go into the details + * of what each member is supposed to represent and their uniqueness. In + * general, there are two levels of uniqueness that we care about. We care about + * an ID that is globally unique. That means that it will be unique across all + * entities in the system. For example, the default logical CPU ID is globally + * unique. On the other hand, there is some information that we only care about + * being unique within the context of a single package / socket. Here are the + * variables that we keep track of and their meaning. + * + * Several of the values that are asking for an identifier, with the exception + * of cpi_apicid, are allowed to be synthetic. + * + * + * cpi_apicid + * + * This is the value of the CPU's APIC id. This should be the full 32-bit + * ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit + * APIC ID. This value is globally unique between all logical CPUs across + * all packages. This is usually required by the APIC. + * + * cpi_chipid + * + * This value indicates the ID of the package that the logical CPU is a + * part of. This value is allowed to be synthetic. It is usually derived by + * taking the CPU's APIC ID and determining how many bits are used to + * represent CPU cores in the package. All logical CPUs that are part of + * the same package must have the same value. + * + * cpi_coreid + * + * This represents the ID of a CPU core. 
Two logical CPUs should only have + * the same cpi_coreid value if they are part of the same core. These + * values may be synthetic. On systems that support SMT, this value is + * usually derived from the APIC ID, otherwise it is often synthetic and + * just set to the value of the cpu_id in the cpu_t. + * + * cpi_pkgcoreid + * + * This is similar to the cpi_coreid in that logical CPUs that are part of + * the same core should have the same ID. The main difference is that these + * values are only required to be unique to a given socket. + * + * cpi_clogid + * + * This represents the logical ID of a logical CPU. This value should be + * unique within a given socket for each logical CPU. This is allowed to be + * synthetic, though it is usually based off of the CPU's apic ID. The + * broader system expects that logical CPUs that have are part of the same + * core have contiguous numbers. For example, if there were two threads per + * core, then the core IDs divided by two should be the same and the first + * modulus two should be zero and the second one. For example, IDs 4 and 5 + * indicate two logical CPUs that are part of the same core. But IDs 5 and + * 6 represent two logical CPUs that are part of different cores. + * + * While it is common for the cpi_coreid and the cpi_clogid to be derived + * from the same source, strictly speaking, they don't have to be and the + * two values should be considered logically independent. One should not + * try to compare a logical CPU's cpi_coreid and cpi_clogid to determine + * some kind of relationship. While this is tempting, we've seen cases on + * AMD family 0xf where the system's cpu id is not related to its APIC ID. + * + * cpi_ncpu_per_chip + * + * This value indicates the total number of logical CPUs that exist in the + * physical package. Critically, this is not the number of logical CPUs + * that exist for just the single core. + * + * This value should be the same for all logical CPUs in the same package. + * + * cpi_ncore_per_chip + * + * This value indicates the total number of physical CPU cores that exist + * in the package. The system compares this value with cpi_ncpu_per_chip to + * determine if simultaneous multi-threading (SMT) is enabled. When + * cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and + * the X86FSET_HTT feature is not set. If this value is greater than one, + * than we consider the processor to have the feature X86FSET_CMP, to + * indicate that there is support for more than one core. + * + * This value should be the same for all logical CPUs in the same package. + * + * cpi_procnodes_per_pkg + * + * This value indicates the number of 'nodes' that exist in the package. + * When processors are actually a multi-chip module, this represents the + * number of such modules that exist in the package. Currently, on Intel + * based systems this member is always set to 1. + * + * This value should be the same for all logical CPUs in the same package. + * + * cpi_procnodeid + * + * This value indicates the ID of the node that the logical CPU is a part + * of. All logical CPUs that are in the same node must have the same value + * here. This value must be unique across all of the packages in the + * system. On Intel based systems, this is currently set to the value in + * cpi_chipid because there is only one node. + * + * cpi_cores_per_compunit + * + * This value indicates the number of cores that are part of a compute + * unit. See the AMD topology section for this. 
This member only has real + * meaning currently for AMD Bulldozer family processors. For all other + * processors, this should currently be set to 1. + * + * cpi_compunitid + * + * This indicates the compute unit that the logical CPU belongs to. For + * processors without AMD Bulldozer-style compute units this should be set + * to the value of cpi_coreid. + * + * cpi_ncpu_shr_last_cache + * + * This indicates the number of logical CPUs that are sharing the same last + * level cache. This value should be the same for all CPUs that are sharing + * that cache. The last cache refers to the cache that is closest to memory + * and furthest away from the CPU. + * + * cpi_last_lvl_cacheid + * + * This indicates the ID of the last cache that the logical CPU uses. This + * cache is often shared between multiple logical CPUs and is the cache + * that is closest to memory and furthest away from the CPU. This value + * should be the same for a group of logical CPUs only if they actually + * share the same last level cache. IDs should not overlap between + * packages. + * + * ----------- + * Hypervisors + * ----------- + * + * If trying to manage the differences between vendors wasn't bad enough, it can + * get worse thanks to our friend hardware virtualization. Hypervisors are given + * the ability to interpose on all cpuid instructions and change them to suit + * their purposes. In general, this is necessary as the hypervisor wants to be + * able to present a more uniform set of features or not necessarily give the + * guest operating system kernel knowledge of all features so it can be + * more easily migrated between systems. + * + * When it comes to trying to determine topology information, this can be a + * double edged sword. When a hypervisor doesn't actually implement a cpuid + * leaf, it'll often return all zeros. Because of that, you'll often see various + * checks scattered about fields being non-zero before we assume we can use + * them. + * + * When it comes to topology information, the hypervisor is often incentivized + * to lie to you about topology. This is because it doesn't always actually + * guarantee that topology at all. The topology path we take in the system + * depends on how the CPU advertises itself. If it advertises itself as an Intel + * or AMD CPU, then we basically do our normal path. However, when they don't + * use an actual vendor, then that usually turns into multiple one-core CPUs + * that we enumerate that are often on different sockets. The actual behavior + * depends greatly on what the hypervisor actually exposes to us. + * + * -------------------- + * Exposing Information + * -------------------- + * + * We expose CPUID information in three different forms in the system. + * + * The first is through the x86_featureset variable. This is used in conjunction + * with the is_x86_feature() function. This is queried by x86-specific functions + * to determine which features are or aren't present in the system and to make + * decisions based upon them. For example, users of this include everything from + * parts of the system dedicated to reliability, availability, and + * serviceability (RAS), to making decisions about how to handle security + * mitigations, to various x86-specific drivers. General purpose or + * architecture independent drivers should never be calling this function. + * + * The second means is through the auxiliary vector. The auxiliary vector is a + * series of tagged data that the kernel passes down to a user program when it + * begins executing. 
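As a hedged illustration of how a user program consumes this information on illumos, the sketch below checks for AVX support at run time. It assumes the getisax(2) interface and the AV_386_AVX bit from <sys/auxv_386.h>, which libc derives from the hardware capability entries the kernel places in the aux vector during pass 4.

#include <stdio.h>
#include <sys/types.h>
#include <sys/auxv.h>
#include <sys/auxv_386.h>

int
main(void)
{
	uint32_t hwcap[2] = { 0, 0 };

	/*
	 * getisax() copies up to two 32-bit hardware capability words,
	 * derived from the aux vector, into hwcap.
	 */
	(void) getisax(hwcap, 2);

	if (hwcap[0] & AV_386_AVX)
		(void) printf("AVX advertised; use the AVX code path\n");
	else
		(void) printf("AVX not advertised; fall back\n");
	return (0);
}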
This information is used to indicate to programs what + * instruction set extensions are present. For example, information about the + * CPU supporting the machine check architecture (MCA) wouldn't be passed down + * since user programs cannot make use of it. However, things like the AVX + * instruction sets are. Programs use this information to make run-time + * decisions about what features they should use. As an example, the run-time + * link-editor (rtld) can relocate different functions depending on the hardware + * support available. + * + * The final form is through a series of accessor functions that all have the + * form cpuid_get*. This is used by a number of different subsystems in the + * kernel to determine more detailed information about what we're running on, + * topology information, etc. Some of these subsystems include processor groups + * (uts/common/os/pg.c.), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI, + * microcode, and performance monitoring. These functions all ASSERT that the + * CPU they're being called on has reached a certain cpuid pass. If the passes + * are rearranged, then this needs to be adjusted. */ #include <sys/types.h> @@ -68,58 +911,6 @@ #include <sys/ontrap.h> #endif -/* - * Pass 0 of cpuid feature analysis happens in locore. It contains special code - * to recognize Cyrix processors that are not cpuid-compliant, and to deal with - * them accordingly. For most modern processors, feature detection occurs here - * in pass 1. - * - * Pass 1 of cpuid feature analysis happens just at the beginning of mlsetup() - * for the boot CPU and does the basic analysis that the early kernel needs. - * x86_featureset is set based on the return value of cpuid_pass1() of the boot - * CPU. - * - * Pass 1 includes: - * - * o Determining vendor/model/family/stepping and setting x86_type and - * x86_vendor accordingly. - * o Processing the feature flags returned by the cpuid instruction while - * applying any workarounds or tricks for the specific processor. - * o Mapping the feature flags into illumos feature bits (X86_*). - * o Processing extended feature flags if supported by the processor, - * again while applying specific processor knowledge. - * o Determining the CMT characteristics of the system. - * - * Pass 1 is done on non-boot CPUs during their initialization and the results - * are used only as a meager attempt at ensuring that all processors within the - * system support the same features. - * - * Pass 2 of cpuid feature analysis happens just at the beginning - * of startup(). It just copies in and corrects the remainder - * of the cpuid data we depend on: standard cpuid functions that we didn't - * need for pass1 feature analysis, and extended cpuid functions beyond the - * simple feature processing done in pass1. - * - * Pass 3 of cpuid analysis is invoked after basic kernel services; in - * particular kernel memory allocation has been made available. It creates a - * readable brand string based on the data collected in the first two passes. - * - * Pass 4 of cpuid analysis is invoked after post_startup() when all - * the support infrastructure for various hardware features has been - * initialized. It determines which processor features will be reported - * to userland via the aux vector. - * - * All passes are executed on all CPUs, but only the boot CPU determines what - * features the kernel will use. - * - * Much of the worst junk in this file is for the support of processors - * that didn't really implement the cpuid instruction properly. 
- * - * NOTE: The accessor functions (cpuid_get*) are aware of, and ASSERT upon, - * the pass numbers. Accordingly, changes to the pass code may require changes - * to the accessor code. - */ - uint_t x86_vendor = X86_VENDOR_IntelClone; uint_t x86_type = X86_TYPE_OTHER; uint_t x86_clflush_size = 0; @@ -351,21 +1142,9 @@ struct xsave_info { #define NMAX_CPI_EXTD 0x1f /* eax = 0x80000000 .. 0x8000001e */ /* - * Some terminology needs to be explained: - * - Socket: Something that can be plugged into a motherboard. - * - Package: Same as socket - * - Chip: Same as socket. Note that AMD's documentation uses term "chip" - * differently: there, chip is the same as processor node (below) - * - Processor node: Some AMD processors have more than one - * "subprocessor" embedded in a package. These subprocessors (nodes) - * are fully-functional processors themselves with cores, caches, - * memory controllers, PCI configuration spaces. They are connected - * inside the package with Hypertransport links. On single-node - * processors, processor node is equivalent to chip/socket/package. - * - Compute Unit: Some AMD processors pair cores in "compute units" that - * share the FPU and the I$ and L2 caches. + * See the big theory statement for a more detailed explanation of what some of + * these members mean. */ - struct cpuid_info { uint_t cpi_pass; /* last pass completed */ /* @@ -387,8 +1166,9 @@ struct cpuid_info { uint_t cpi_ncache; /* fn 2: number of elements */ uint_t cpi_ncpu_shr_last_cache; /* fn 4: %eax: ncpus sharing cache */ id_t cpi_last_lvl_cacheid; /* fn 4: %eax: derived cache id */ - uint_t cpi_std_4_size; /* fn 4: number of fn 4 elements */ - struct cpuid_regs **cpi_std_4; /* fn 4: %ecx == 0 .. fn4_size */ + uint_t cpi_cache_leaf_size; /* Number of cache elements */ + /* Intel fn: 4, AMD fn: 8000001d */ + struct cpuid_regs **cpi_cache_leaves; /* Acual leaves from above */ struct cpuid_regs cpi_std[NMAX_CPI_STD]; /* 0 .. 7 */ /* * extended function information @@ -540,6 +1320,14 @@ static struct cpuid_info cpuid_info0; #define CPUID_LEAFD_2_YMM_SIZE 256 /* + * Common extended leaf names to cut down on typos. + */ +#define CPUID_LEAF_EXT_0 0x80000000 +#define CPUID_LEAF_EXT_8 0x80000008 +#define CPUID_LEAF_EXT_1d 0x8000001d +#define CPUID_LEAF_EXT_1e 0x8000001e + +/* * Functions we consune from cpuid_subr.c; don't publish these in a header * file to try and keep people using the expected cpuid_* interfaces. */ @@ -607,7 +1395,7 @@ platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp) cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D; break; - case 0x80000008: + case CPUID_LEAF_EXT_8: /* * Zero out the (ncores-per-chip - 1) field */ @@ -664,13 +1452,14 @@ cpuid_free_space(cpu_t *cpu) ASSERT(cpi != &cpuid_info0); /* - * Free up any function 4 related dynamic storage + * Free up any cache leaf related dynamic storage. The first entry was + * cached from the standard cpuid storage, so we should not free it. 
*/ - for (i = 1; i < cpi->cpi_std_4_size; i++) - kmem_free(cpi->cpi_std_4[i], sizeof (struct cpuid_regs)); - if (cpi->cpi_std_4_size > 0) - kmem_free(cpi->cpi_std_4, - cpi->cpi_std_4_size * sizeof (struct cpuid_regs *)); + for (i = 1; i < cpi->cpi_cache_leaf_size; i++) + kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs)); + if (cpi->cpi_cache_leaf_size > 0) + kmem_free(cpi->cpi_cache_leaves, + cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *)); kmem_free(cpi, sizeof (*cpi)); cpu->cpu_m.mcpu_cpi = NULL; @@ -804,6 +1593,198 @@ is_controldom(void) #endif /* __xpv */ +/* + * Make sure that we have gathered all of the CPUID leaves that we might need to + * determine topology. We assume that the standard leaf 1 has already been done + * and that xmaxeax has already been calculated. + */ +static void +cpuid_gather_amd_topology_leaves(cpu_t *cpu) +{ + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; + + if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { + struct cpuid_regs *cp; + + cp = &cpi->cpi_extd[8]; + cp->cp_eax = CPUID_LEAF_EXT_8; + (void) __cpuid_insn(cp); + platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp); + } + + if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { + struct cpuid_regs *cp; + + cp = &cpi->cpi_extd[0x1e]; + cp->cp_eax = CPUID_LEAF_EXT_1e; + (void) __cpuid_insn(cp); + } +} + +/* + * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer + * it to everything else. If not, and we're on an AMD system where 8000001e is + * valid, then we use that. Othewrise, we fall back to the default value for the + * APIC ID in leaf 1. + */ +static uint32_t +cpuid_gather_apicid(struct cpuid_info *cpi) +{ + /* + * Leaf B changes based on the arguments to it. Beacuse we don't cache + * it, we need to gather it again. + */ + if (cpi->cpi_maxeax >= 0xB) { + struct cpuid_regs regs; + struct cpuid_regs *cp; + + cp = ®s; + cp->cp_eax = 0xB; + cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; + (void) __cpuid_insn(cp); + + if (cp->cp_ebx != 0) { + return (cp->cp_edx); + } + } + + if (cpi->cpi_vendor == X86_VENDOR_AMD && + is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { + return (cpi->cpi_extd[0x1e].cp_eax); + } + + return (CPI_APIC_ID(cpi)); +} + +/* + * For AMD processors, attempt to calculate the number of chips and cores that + * exist. The way that we do this varies based on the generation, because the + * generations themselves have changed dramatically. + * + * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores. + * However, with the advent of family 17h (Zen) it actually tells us the number + * of threads, so we need to look at leaf 0x8000001e if available to determine + * its value. Otherwise, for all prior families, the number of enabled cores is + * the same as threads. + * + * If we do not have leaf 0x80000008, then we assume that this processor does + * not have anything. AMD's older CPUID specification says there's no reason to + * fall back to leaf 1. + * + * In some virtualization cases we will not have leaf 8000001e or it will be + * zero. When that happens we assume the number of threads is one. 
+ */ +static void +cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) +{ + uint_t nthreads, nthread_per_core; + + nthreads = nthread_per_core = 1; + + if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { + nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1; + } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { + nthreads = CPI_CPU_COUNT(cpi); + } + + /* + * For us to have threads, and know about it, we have to be at least at + * family 17h and have the cpuid bit that says we have extended + * topology. + */ + if (cpi->cpi_family >= 0x17 && + is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { + nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; + } + + *ncpus = nthreads; + *ncores = nthreads / nthread_per_core; +} + +static void +cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores) +{ + if (cpi->cpi_maxeax >= 4) { + *ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1; + *ncpus = BITX(cpi->cpi_std[4].cp_eax, 25, 14) + 1; + } else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { + *ncores = 1; + *ncpus = CPI_CPU_COUNT(cpi); + } else { + *ncpus = *ncores = 1; + } +} + +static boolean_t +cpuid_leafB_getids(cpu_t *cpu) +{ + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; + struct cpuid_regs regs; + struct cpuid_regs *cp; + + if (cpi->cpi_maxeax < 0xB) + return (B_FALSE); + + cp = ®s; + cp->cp_eax = 0xB; + cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; + + (void) __cpuid_insn(cp); + + /* + * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which + * indicates that the extended topology enumeration leaf is + * available. + */ + if (cp->cp_ebx != 0) { + uint32_t x2apic_id = 0; + uint_t coreid_shift = 0; + uint_t ncpu_per_core = 1; + uint_t chipid_shift = 0; + uint_t ncpu_per_chip = 1; + uint_t i; + uint_t level; + + for (i = 0; i < CPI_FNB_ECX_MAX; i++) { + cp->cp_eax = 0xB; + cp->cp_ecx = i; + + (void) __cpuid_insn(cp); + level = CPI_CPU_LEVEL_TYPE(cp); + + if (level == 1) { + x2apic_id = cp->cp_edx; + coreid_shift = BITX(cp->cp_eax, 4, 0); + ncpu_per_core = BITX(cp->cp_ebx, 15, 0); + } else if (level == 2) { + x2apic_id = cp->cp_edx; + chipid_shift = BITX(cp->cp_eax, 4, 0); + ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); + } + } + + /* + * cpi_apicid is taken care of in cpuid_gather_apicid. + */ + cpi->cpi_ncpu_per_chip = ncpu_per_chip; + cpi->cpi_ncore_per_chip = ncpu_per_chip / + ncpu_per_core; + cpi->cpi_chipid = x2apic_id >> chipid_shift; + cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); + cpi->cpi_coreid = x2apic_id >> coreid_shift; + cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; + cpi->cpi_procnodeid = cpi->cpi_chipid; + cpi->cpi_compunitid = cpi->cpi_coreid; + + return (B_TRUE); + } else { + return (B_FALSE); + } +} + static void cpuid_intel_getids(cpu_t *cpu, void *feature) { @@ -812,6 +1793,20 @@ cpuid_intel_getids(cpu_t *cpu, void *feature) uint_t coreid_shift = 0; struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; + /* + * There are no compute units or processor nodes currently on Intel. + * Always set these to one. + */ + cpi->cpi_procnodes_per_pkg = 1; + cpi->cpi_cores_per_compunit = 1; + + /* + * If cpuid Leaf B is present, use that to try and get this information. + * It will be the most accurate for Intel CPUs. 
+ */ + if (cpuid_leafB_getids(cpu)) + return; + for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1) chipid_shift++; @@ -860,13 +1855,69 @@ cpuid_intel_getids(cpu_t *cpu, void *feature) */ cpi->cpi_coreid = cpi->cpi_chipid; cpi->cpi_pkgcoreid = 0; + } else { + /* + * Single-core single-thread processors. + */ + cpi->cpi_coreid = cpu->cpu_id; + cpi->cpi_pkgcoreid = 0; } cpi->cpi_procnodeid = cpi->cpi_chipid; cpi->cpi_compunitid = cpi->cpi_coreid; } +/* + * Historically, AMD has had CMP chips with only a single thread per core. + * However, starting in family 17h (Zen), this has changed and they now have + * multiple threads. Our internal core id needs to be a unique value. + * + * To determine the core id of an AMD system, if we're from a family before 17h, + * then we just use the cpu id, as that gives us a good value that will be + * unique for each core. If instead, we're on family 17h or later, then we need + * to do something more complicated. CPUID leaf 0x8000001e can tell us + * how many threads are in the system. Based on that, we'll shift the APIC ID. + * We can't use the normal core id in that leaf as it's only unique within the + * socket, which is perfect for cpi_pkgcoreid, but not us. + */ +static id_t +cpuid_amd_get_coreid(cpu_t *cpu) +{ + struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; + + if (cpi->cpi_family >= 0x17 && + is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { + uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1; + if (nthreads > 1) { + VERIFY3U(nthreads, ==, 2); + return (cpi->cpi_apicid >> 1); + } + } + + return (cpu->cpu_id); +} + +/* + * IDs on AMD is a more challenging task. This is notable because of the + * following two facts: + * + * 1. Before family 0x17 (Zen), there was no support for SMT and there was + * also no way to get an actual unique core id from the system. As such, we + * synthesize this case by using cpu->cpu_id. This scheme does not, + * however, guarantee that sibling cores of a chip will have sequential + * coreids starting at a multiple of the number of cores per chip - that is + * usually the case, but if the ACPI MADT table is presented in a different + * order then we need to perform a few more gymnastics for the pkgcoreid. + * + * 2. In families 0x15 and 16x (Bulldozer and co.) the cores came in groups + * called compute units. These compute units share the L1I cache, L2 cache, + * and the FPU. To deal with this, a new topology leaf was added in + * 0x8000001e. However, parts of this leaf have different meanings + * once we get to family 0x17. + */ + static void -cpuid_amd_getids(cpu_t *cpu) +cpuid_amd_getids(cpu_t *cpu, uchar_t *features) { int i, first_half, coreidsz; uint32_t nb_caps_reg; @@ -875,41 +1926,31 @@ cpuid_amd_getids(cpu_t *cpu) struct cpuid_regs *cp; /* - * AMD CMP chips currently have a single thread per core. - * - * Since no two cpus share a core we must assign a distinct coreid - * per cpu, and we do this by using the cpu_id. This scheme does not, - * however, guarantee that sibling cores of a chip will have sequential - * coreids starting at a multiple of the number of cores per chip - - * that is usually the case, but if the ACPI MADT table is presented - * in a different order then we need to perform a few more gymnastics - * for the pkgcoreid. - * - * All processors in the system have the same number of enabled - * cores. 
-	 * from 0 regardless of how many or which are disabled, and there
-	 * is no way for operating system to discover the real core id when some
-	 * are disabled.
-	 *
-	 * In family 0x15, the cores come in pairs called compute units. They
-	 * share I$ and L2 caches and the FPU. Enumeration of this feature is
-	 * simplified by the new topology extensions CPUID leaf, indicated by
-	 * the X86 feature X86FSET_TOPOEXT.
+	 * Calculate the core id (this comes from hardware in family 0x17 if it
+	 * hasn't been stripped by virtualization). We always set the compute
+	 * unit id to the same value. Also, initialize the default number of
+	 * cores per compute unit and nodes per package. This will be
+	 * overwritten when we know information about a particular family.
 	 */
+	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
+	cpi->cpi_compunitid = cpi->cpi_coreid;
+	cpi->cpi_cores_per_compunit = 1;
+	cpi->cpi_procnodes_per_pkg = 1;
 
-	cpi->cpi_coreid = cpu->cpu_id;
-	cpi->cpi_compunitid = cpu->cpu_id;
-
-	if (cpi->cpi_xmaxeax >= 0x80000008) {
-
+	/*
+	 * To construct the logical ID, we need to determine how many bits of
+	 * the APIC ID are dedicated to the cores and threads. This is provided
+	 * for us in leaf 0x80000008. However, if that leaf is not present (say
+	 * due to virtualization), then we assume a width of a single bit. The
+	 * leaf should be present on all 64-bit AMD processors; it was added in
+	 * family 0xf (Hammer).
+	 */
+	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
 		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
 
 		/*
-		 * In AMD parlance chip is really a node while Solaris
-		 * sees chip as equivalent to socket/package.
+		 * In AMD parlance chip is really a node while illumos
+		 * uses chip as equivalent to socket/package.
 		 */
-		cpi->cpi_ncore_per_chip =
-		    BITX((cpi)->cpi_extd[8].cp_ecx, 7, 0) + 1;
 		if (coreidsz == 0) {
 			/* Use legacy method */
 			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
@@ -919,27 +1960,52 @@ cpuid_amd_getids(cpu_t *cpu)
 		}
 	} else {
 		/* Assume single-core part */
-		cpi->cpi_ncore_per_chip = 1;
 		coreidsz = 1;
 	}
+	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
 
-	cpi->cpi_clogid = cpi->cpi_pkgcoreid =
-	    cpi->cpi_apicid & ((1<<coreidsz) - 1);
-	cpi->cpi_ncpu_per_chip = cpi->cpi_ncore_per_chip;
+	/*
+	 * The package core ID varies depending on the family. For family 17h,
+	 * we can get this directly from leaf CPUID_LEAF_EXT_1e. Otherwise, we
+	 * can use the clogid as is. When family 17h is virtualized and the leaf
+	 * does not contain valid data, we won't think we have SMT, in which
+	 * case the clogid is still sufficient.
+	 */
+	if (cpi->cpi_family >= 0x17 &&
+	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
+	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
+	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
+		cpi->cpi_pkgcoreid = BITX(cpi->cpi_extd[0x1e].cp_ebx, 7, 0);
+	} else {
+		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
+	}
 
-	/* Get node ID, compute unit ID */
+	/*
+	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
+	 * (Bulldozer) or newer, then we can derive all of this from leaf
+	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
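+	 *
+	 * As a hypothetical illustration of the compute unit math below (all
+	 * values here are assumed for the example): on a single-node family
+	 * 0x15 package (procnodeid = 0, procnodes_per_pkg = 1) with 8 cores
+	 * per chip and 2 cores per compute unit, a core whose leaf 0x8000001e
+	 * reports EBX[7:0] = 3 would end up with cpi_compunitid =
+	 * 3 + (8 / 2) * (0 / 1) = 3.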
+ */ if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) && - cpi->cpi_xmaxeax >= 0x8000001e) { + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) { cp = &cpi->cpi_extd[0x1e]; - cp->cp_eax = 0x8000001e; - (void) __cpuid_insn(cp); cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1; cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0); - cpi->cpi_cores_per_compunit = BITX(cp->cp_ebx, 15, 8) + 1; - cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) - + (cpi->cpi_ncore_per_chip / cpi->cpi_cores_per_compunit) - * (cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg); + + /* + * For Bulldozer-era CPUs, recalculate the compute unit + * information. + */ + if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) { + cpi->cpi_cores_per_compunit = + BITX(cp->cp_ebx, 15, 8) + 1; + cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) + + (cpi->cpi_ncore_per_chip / + cpi->cpi_cores_per_compunit) * + (cpi->cpi_procnodeid / + cpi->cpi_procnodes_per_pkg); + } } else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) { cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7; } else if (cpi->cpi_family == 0x10) { @@ -1014,7 +2080,7 @@ cpuid_scan_security(cpu_t *cpu, uchar_t *featureset) struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi; if (cpi->cpi_vendor == X86_VENDOR_AMD && - cpi->cpi_xmaxeax >= 0x80000008) { + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) { if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB) add_x86_feature(featureset, X86FSET_IBPB); if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS) @@ -1156,6 +2222,117 @@ setup_xfem(void) xsave_bv_all = flags; } +static void +cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset) +{ + struct cpuid_info *cpi; + + cpi = cpu->cpu_m.mcpu_cpi; + + if (cpi->cpi_vendor == X86_VENDOR_AMD) { + cpuid_gather_amd_topology_leaves(cpu); + } + + cpi->cpi_apicid = cpuid_gather_apicid(cpi); + + /* + * Before we can calculate the IDs that we should assign to this + * processor, we need to understand how many cores and threads it has. + */ + switch (cpi->cpi_vendor) { + case X86_VENDOR_Intel: + cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip, + &cpi->cpi_ncore_per_chip); + break; + case X86_VENDOR_AMD: + cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip, + &cpi->cpi_ncore_per_chip); + break; + default: + /* + * If we have some other x86 compatible chip, it's not clear how + * they would behave. The most common case is virtualization + * today, though there are also 64-bit VIA chips. Assume that + * all we can get is the basic Leaf 1 HTT information. + */ + if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) { + cpi->cpi_ncore_per_chip = 1; + cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); + } + break; + } + + /* + * Based on the calculated number of threads and cores, potentially + * assign the HTT and CMT features. + */ + if (cpi->cpi_ncore_per_chip > 1) { + add_x86_feature(featureset, X86FSET_CMP); + } + + if (cpi->cpi_ncpu_per_chip > 1 && + cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) { + add_x86_feature(featureset, X86FSET_HTT); + } + + /* + * Now that has been set up, we need to go through and calculate all of + * the rest of the parameters that exist. If we think the CPU doesn't + * have either SMT (HTT) or CMP, then we basically go through and fake + * up information in some way. The most likely case for this is + * virtualization where we have a lot of partial topology information. + */ + if (!is_x86_feature(featureset, X86FSET_HTT) && + !is_x86_feature(featureset, X86FSET_CMP)) { + /* + * This is a single core, single-threaded processor. 
+ */ + cpi->cpi_procnodes_per_pkg = 1; + cpi->cpi_cores_per_compunit = 1; + cpi->cpi_compunitid = 0; + cpi->cpi_chipid = -1; + cpi->cpi_clogid = 0; + cpi->cpi_coreid = cpu->cpu_id; + cpi->cpi_pkgcoreid = 0; + if (cpi->cpi_vendor == X86_VENDOR_AMD) { + cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); + } else { + cpi->cpi_procnodeid = cpi->cpi_chipid; + } + } else { + switch (cpi->cpi_vendor) { + case X86_VENDOR_Intel: + cpuid_intel_getids(cpu, featureset); + break; + case X86_VENDOR_AMD: + cpuid_amd_getids(cpu, featureset); + break; + default: + /* + * In this case, it's hard to say what we should do. + * We're going to model them to the OS as single core + * threads. We don't have a good identifier for them, so + * we're just going to use the cpu id all on a single + * chip. + * + * This case has historically been different from the + * case above where we don't have HTT or CMP. While they + * could be combined, we've opted to keep it separate to + * minimize the risk of topology changes in weird cases. + */ + cpi->cpi_procnodes_per_pkg = 1; + cpi->cpi_cores_per_compunit = 1; + cpi->cpi_chipid = 0; + cpi->cpi_coreid = cpu->cpu_id; + cpi->cpi_clogid = cpu->cpu_id; + cpi->cpi_pkgcoreid = cpu->cpu_id; + cpi->cpi_procnodeid = cpi->cpi_chipid; + cpi->cpi_compunitid = cpi->cpi_coreid; + break; + } + } +} + void cpuid_pass1(cpu_t *cpu, uchar_t *featureset) { @@ -1743,23 +2920,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) if (is_x86_feature(featureset, X86FSET_PAE)) cpi->cpi_pabits = 36; - /* - * Hyperthreading configuration is slightly tricky on Intel - * and pure clones, and even trickier on AMD. - * - * (AMD chose to set the HTT bit on their CMP processors, - * even though they're not actually hyperthreaded. Thus it - * takes a bit more work to figure out what's really going - * on ... see the handling of the CMP_LGCY bit below) - */ - if (cp->cp_edx & CPUID_INTC_EDX_HTT) { - cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi); - if (cpi->cpi_ncpu_per_chip > 1) - add_x86_feature(featureset, X86FSET_HTT); - } else { - cpi->cpi_ncpu_per_chip = 1; - } - if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) { struct cpuid_regs r, *ecp; @@ -1816,11 +2976,11 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) if (xcpuid) { cp = &cpi->cpi_extd[0]; - cp->cp_eax = 0x80000000; + cp->cp_eax = CPUID_LEAF_EXT_0; cpi->cpi_xmaxeax = __cpuid_insn(cp); } - if (cpi->cpi_xmaxeax & 0x80000000) { + if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) { if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX) cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX; @@ -1878,18 +3038,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) } /* - * If both the HTT and CMP_LGCY bits are set, - * then we're not actually HyperThreaded. Read - * "AMD CPUID Specification" for more details. - */ - if (cpi->cpi_vendor == X86_VENDOR_AMD && - is_x86_feature(featureset, X86FSET_HTT) && - (cp->cp_ecx & CPUID_AMD_ECX_CMP_LGCY)) { - remove_x86_feature(featureset, X86FSET_HTT); - add_x86_feature(featureset, X86FSET_CMP); - } - - /* * It's really tricky to support syscall/sysret in * the i386 kernel; we rely on sysenter/sysexit * instead. 
In the amd64 kernel, things are -way- @@ -1954,12 +3102,13 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) } /*FALLTHROUGH*/ case X86_VENDOR_AMD: - if (cpi->cpi_xmaxeax < 0x80000008) + if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) break; cp = &cpi->cpi_extd[8]; - cp->cp_eax = 0x80000008; + cp->cp_eax = CPUID_LEAF_EXT_8; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000008, cp); + platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, + cp); /* * AMD uses ebx for some extended functions. @@ -1995,41 +3144,6 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) } /* - * Derive the number of cores per chip - */ - switch (cpi->cpi_vendor) { - case X86_VENDOR_Intel: - if (cpi->cpi_maxeax < 4) { - cpi->cpi_ncore_per_chip = 1; - break; - } else { - cpi->cpi_ncore_per_chip = - BITX((cpi)->cpi_std[4].cp_eax, 31, 26) + 1; - } - break; - case X86_VENDOR_AMD: - if (cpi->cpi_xmaxeax < 0x80000008) { - cpi->cpi_ncore_per_chip = 1; - break; - } else { - /* - * On family 0xf cpuid fn 2 ECX[7:0] "NC" is - * 1 less than the number of physical cores on - * the chip. In family 0x10 this value can - * be affected by "downcoring" - it reflects - * 1 less than the number of cores actually - * enabled on this node. - */ - cpi->cpi_ncore_per_chip = - BITX((cpi)->cpi_extd[8].cp_ecx, 7, 0) + 1; - } - break; - default: - cpi->cpi_ncore_per_chip = 1; - break; - } - - /* * Get CPUID data about TSC Invariance in Deep C-State. */ switch (cpi->cpi_vendor) { @@ -2045,57 +3159,9 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) default: break; } - } else { - cpi->cpi_ncore_per_chip = 1; } - /* - * If more than one core, then this processor is CMP. - */ - if (cpi->cpi_ncore_per_chip > 1) { - add_x86_feature(featureset, X86FSET_CMP); - } - - /* - * If the number of cores is the same as the number - * of CPUs, then we cannot have HyperThreading. - */ - if (cpi->cpi_ncpu_per_chip == cpi->cpi_ncore_per_chip) { - remove_x86_feature(featureset, X86FSET_HTT); - } - - cpi->cpi_apicid = CPI_APIC_ID(cpi); - cpi->cpi_procnodes_per_pkg = 1; - cpi->cpi_cores_per_compunit = 1; - if (is_x86_feature(featureset, X86FSET_HTT) == B_FALSE && - is_x86_feature(featureset, X86FSET_CMP) == B_FALSE) { - /* - * Single-core single-threaded processors. - */ - cpi->cpi_chipid = -1; - cpi->cpi_clogid = 0; - cpi->cpi_coreid = cpu->cpu_id; - cpi->cpi_pkgcoreid = 0; - if (cpi->cpi_vendor == X86_VENDOR_AMD) - cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0); - else - cpi->cpi_procnodeid = cpi->cpi_chipid; - } else if (cpi->cpi_ncpu_per_chip > 1) { - if (cpi->cpi_vendor == X86_VENDOR_Intel) - cpuid_intel_getids(cpu, featureset); - else if (cpi->cpi_vendor == X86_VENDOR_AMD) - cpuid_amd_getids(cpu); - else { - /* - * All other processors are currently - * assumed to have single cores. - */ - cpi->cpi_coreid = cpi->cpi_chipid; - cpi->cpi_pkgcoreid = 0; - cpi->cpi_procnodeid = cpi->cpi_chipid; - cpi->cpi_compunitid = cpi->cpi_chipid; - } - } + cpuid_pass1_topology(cpu, featureset); /* * Synthesize chip "revision" and socket type @@ -2108,7 +3174,7 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) cpi->cpi_model, cpi->cpi_step); if (cpi->cpi_vendor == X86_VENDOR_AMD) { - if (cpi->cpi_xmaxeax >= 0x80000008 && + if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 && cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) { /* Special handling for AMD FP not necessary. 
*/ cpi->cpi_fp_amd_save = 0; @@ -2281,61 +3347,6 @@ cpuid_pass2(cpu_t *cpu) } } - if (cpi->cpi_maxeax >= 0xB && cpi->cpi_vendor == X86_VENDOR_Intel) { - struct cpuid_regs regs; - - cp = ®s; - cp->cp_eax = 0xB; - cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; - - (void) __cpuid_insn(cp); - - /* - * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which - * indicates that the extended topology enumeration leaf is - * available. - */ - if (cp->cp_ebx) { - uint32_t x2apic_id; - uint_t coreid_shift = 0; - uint_t ncpu_per_core = 1; - uint_t chipid_shift = 0; - uint_t ncpu_per_chip = 1; - uint_t i; - uint_t level; - - for (i = 0; i < CPI_FNB_ECX_MAX; i++) { - cp->cp_eax = 0xB; - cp->cp_ecx = i; - - (void) __cpuid_insn(cp); - level = CPI_CPU_LEVEL_TYPE(cp); - - if (level == 1) { - x2apic_id = cp->cp_edx; - coreid_shift = BITX(cp->cp_eax, 4, 0); - ncpu_per_core = BITX(cp->cp_ebx, 15, 0); - } else if (level == 2) { - x2apic_id = cp->cp_edx; - chipid_shift = BITX(cp->cp_eax, 4, 0); - ncpu_per_chip = BITX(cp->cp_ebx, 15, 0); - } - } - - cpi->cpi_apicid = x2apic_id; - cpi->cpi_ncpu_per_chip = ncpu_per_chip; - cpi->cpi_ncore_per_chip = ncpu_per_chip / - ncpu_per_core; - cpi->cpi_chipid = x2apic_id >> chipid_shift; - cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1); - cpi->cpi_coreid = x2apic_id >> coreid_shift; - cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift; - } - - /* Make cp NULL so that we don't stumble on others */ - cp = NULL; - } - /* * XSAVE enumeration */ @@ -2548,10 +3559,10 @@ cpuid_pass2(cpu_t *cpu) } - if ((cpi->cpi_xmaxeax & 0x80000000) == 0) + if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) goto pass2_done; - if ((nmax = cpi->cpi_xmaxeax - 0x80000000 + 1) > NMAX_CPI_EXTD) + if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD) nmax = NMAX_CPI_EXTD; /* * Copy the extended properties, fixing them as we go. @@ -2559,9 +3570,10 @@ cpuid_pass2(cpu_t *cpu) */ iptr = (void *)cpi->cpi_brandstr; for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) { - cp->cp_eax = 0x80000000 + n; + cp->cp_eax = CPUID_LEAF_EXT_0 + n; (void) __cpuid_insn(cp); - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000000 + n, cp); + platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n, + cp); switch (n) { case 2: case 3: @@ -3013,26 +4025,42 @@ cpuid_pass3(cpu_t *cpu) ASSERT(cpi->cpi_pass == 2); /* - * Function 4: Deterministic cache parameters + * Deterministic cache parameters * - * Take this opportunity to detect the number of threads - * sharing the last level cache, and construct a corresponding - * cache id. The respective cpuid_info members are initialized - * to the default case of "no last level cache sharing". + * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The + * values that are present are currently defined to be the same. This + * means we can use the same logic to parse it as long as we use the + * appropriate leaf to get the data. If you're updating this, make sure + * you're careful about which vendor supports which aspect. + * + * Take this opportunity to detect the number of threads sharing the + * last level cache, and construct a corresponding cache id. The + * respective cpuid_info members are initialized to the default case of + * "no last level cache sharing". 
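+	 *
+	 * As a hypothetical example (values assumed for illustration): if the
+	 * highest-level cache enumerated by the leaf reports that 16 logical
+	 * CPUs share it (EAX bits 25:14 equal to 15), then
+	 * cpi_ncpu_shr_last_cache ends up as 16 below.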
*/ cpi->cpi_ncpu_shr_last_cache = 1; cpi->cpi_last_lvl_cacheid = cpu->cpu_id; - if (cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) { + if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) || + (cpi->cpi_vendor == X86_VENDOR_AMD && + cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d && + is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) { + uint32_t leaf; + + if (cpi->cpi_vendor == X86_VENDOR_Intel) { + leaf = 4; + } else { + leaf = CPUID_LEAF_EXT_1d; + } /* - * Find the # of elements (size) returned by fn 4, and along + * Find the # of elements (size) returned by the leaf and along * the way detect last level cache sharing details. */ bzero(®s, sizeof (regs)); cp = ®s; for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) { - cp->cp_eax = 4; + cp->cp_eax = leaf; cp->cp_ecx = i; (void) __cpuid_insn(cp); @@ -3046,29 +4074,33 @@ cpuid_pass3(cpu_t *cpu) CPI_NTHR_SHR_CACHE(cp) + 1; } } - cpi->cpi_std_4_size = size = i; + cpi->cpi_cache_leaf_size = size = i; /* - * Allocate the cpi_std_4 array. The first element - * references the regs for fn 4, %ecx == 0, which - * cpuid_pass2() stashed in cpi->cpi_std[4]. + * Allocate the cpi_cache_leaves array. The first element + * references the regs for the corresponding leaf with %ecx set + * to 0. This was gathered in cpuid_pass2(). */ if (size > 0) { - cpi->cpi_std_4 = + cpi->cpi_cache_leaves = kmem_alloc(size * sizeof (cp), KM_SLEEP); - cpi->cpi_std_4[0] = &cpi->cpi_std[4]; + if (cpi->cpi_vendor == X86_VENDOR_Intel) { + cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4]; + } else { + cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d]; + } /* * Allocate storage to hold the additional regs - * for function 4, %ecx == 1 .. cpi_std_4_size. + * for the leaf, %ecx == 1 .. cpi_cache_leaf_size. * - * The regs for fn 4, %ecx == 0 has already + * The regs for the leaf, %ecx == 0 has already * been allocated as indicated above. */ for (i = 1; i < size; i++) { - cp = cpi->cpi_std_4[i] = + cp = cpi->cpi_cache_leaves[i] = kmem_zalloc(sizeof (regs), KM_SLEEP); - cp->cp_eax = 4; + cp->cp_eax = leaf; cp->cp_ecx = i; (void) __cpuid_insn(cp); @@ -3090,7 +4122,7 @@ cpuid_pass3(cpu_t *cpu) /* * Now fixup the brand string */ - if ((cpi->cpi_xmaxeax & 0x80000000) == 0) { + if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) { fabricate_brandstr(cpi); } else { @@ -3497,20 +4529,22 @@ cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp) /* * CPUID data is cached in two separate places: cpi_std for standard - * CPUID functions, and cpi_extd for extended CPUID functions. + * CPUID leaves , and cpi_extd for extended CPUID leaves. */ - if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) + if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) { xcp = &cpi->cpi_std[cp->cp_eax]; - else if (cp->cp_eax >= 0x80000000 && cp->cp_eax <= cpi->cpi_xmaxeax && - cp->cp_eax < 0x80000000 + NMAX_CPI_EXTD) - xcp = &cpi->cpi_extd[cp->cp_eax - 0x80000000]; - else + } else if (cp->cp_eax >= CPUID_LEAF_EXT_0 && + cp->cp_eax <= cpi->cpi_xmaxeax && + cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) { + xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0]; + } else { /* * The caller is asking for data from an input parameter which * the kernel has not cached. In this case we go fetch from * the hardware and return the data directly to the user. 
*/ return (__cpuid_insn(cp)); + } cp->cp_eax = xcp->cp_eax; cp->cp_ebx = xcp->cp_ebx; @@ -4410,17 +5444,18 @@ intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi) uint32_t level, i; int ret = 0; - for (i = 0; i < cpi->cpi_std_4_size; i++) { - level = CPI_CACHE_LVL(cpi->cpi_std_4[i]); + for (i = 0; i < cpi->cpi_cache_leaf_size; i++) { + level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]); if (level == 2 || level == 3) { - ct->ct_assoc = CPI_CACHE_WAYS(cpi->cpi_std_4[i]) + 1; + ct->ct_assoc = + CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1; ct->ct_line_size = - CPI_CACHE_COH_LN_SZ(cpi->cpi_std_4[i]) + 1; + CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1; ct->ct_size = ct->ct_assoc * - (CPI_CACHE_PARTS(cpi->cpi_std_4[i]) + 1) * + (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) * ct->ct_line_size * - (cpi->cpi_std_4[i]->cp_ecx + 1); + (cpi->cpi_cache_leaves[i]->cp_ecx + 1); if (level == 2) { ct->ct_label = l2_cache_str; @@ -5429,69 +6464,21 @@ patch_memops(uint_t vendor) #endif /* __amd64 && !__xpv */ /* - * This function finds the number of bits to represent the number of cores per - * chip and the number of strands per core for the Intel platforms. - * It re-uses the x2APIC cpuid code of the cpuid_pass2(). + * We're being asked to tell the system how many bits are required to represent + * the various thread and strand IDs. */ void -cpuid_get_ext_topo(uint_t vendor, uint_t *core_nbits, uint_t *strand_nbits) +cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits) { - struct cpuid_regs regs; - struct cpuid_regs *cp = ®s; - - if (vendor != X86_VENDOR_Intel) { - return; - } - - /* if the cpuid level is 0xB, extended topo is available. */ - cp->cp_eax = 0; - if (__cpuid_insn(cp) >= 0xB) { - - cp->cp_eax = 0xB; - cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0; - (void) __cpuid_insn(cp); - - /* - * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which - * indicates that the extended topology enumeration leaf is - * available. - */ - if (cp->cp_ebx) { - uint_t coreid_shift = 0; - uint_t chipid_shift = 0; - uint_t i; - uint_t level; - - for (i = 0; i < CPI_FNB_ECX_MAX; i++) { - cp->cp_eax = 0xB; - cp->cp_ecx = i; + struct cpuid_info *cpi; + uint_t nthreads; - (void) __cpuid_insn(cp); - level = CPI_CPU_LEVEL_TYPE(cp); - - if (level == 1) { - /* - * Thread level processor topology - * Number of bits shift right APIC ID - * to get the coreid. - */ - coreid_shift = BITX(cp->cp_eax, 4, 0); - } else if (level == 2) { - /* - * Core level processor topology - * Number of bits shift right APIC ID - * to get the chipid. 
- */ - chipid_shift = BITX(cp->cp_eax, 4, 0); - } - } + VERIFY(cpuid_checkpass(CPU, 1)); + cpi = cpu->cpu_m.mcpu_cpi; - if (coreid_shift > 0 && chipid_shift > coreid_shift) { - *strand_nbits = coreid_shift; - *core_nbits = chipid_shift - coreid_shift; - } - } - } + nthreads = cpi->cpi_ncpu_per_chip / cpi->cpi_ncore_per_chip; + *core_nbits = ddi_fls(cpi->cpi_ncore_per_chip); + *strand_nbits = ddi_fls(nthreads); } void @@ -5527,19 +6514,19 @@ cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset) (cpi->cpi_family == 5 && cpi->cpi_model < 1)) return; - if (cpi->cpi_xmaxeax < 0x80000008) { + if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) { bzero(&cp, sizeof (cp)); - cp.cp_eax = 0x80000000; + cp.cp_eax = CPUID_LEAF_EXT_0; cpi->cpi_xmaxeax = __cpuid_insn(&cp); - if (cpi->cpi_xmaxeax < 0x80000008) { + if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) { return; } } bzero(&cp, sizeof (cp)); - cp.cp_eax = 0x80000008; + cp.cp_eax = CPUID_LEAF_EXT_8; (void) __cpuid_insn(&cp); - platform_cpuid_mangle(cpi->cpi_vendor, 0x80000008, &cp); + platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp); cpi->cpi_extd[8] = cp; } else { /* |