Diffstat (limited to 'usr/src/uts/i86pc')
24 files changed, 833 insertions, 152 deletions
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 9829939b16..ef7a36d09c 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -23,6 +23,7 @@
 # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 # Copyright (c) 2010, Intel Corporation.
+# Copyright 2016 Joyent, Inc.
 #
 # This Makefile defines file modules in the directory uts/i86pc
 # and its children. These are the source files which are i86pc
@@ -40,6 +41,8 @@ CORE_OBJS += \
     cmi.o \
     cmi_hw.o \
     cms.o \
+    comm_page.o \
+    comm_page_util.o \
     confunix.o \
     cpu_idle.o \
     cpuid.o \
diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c
index 7fc43e27bd..6abb7c6349 100644
--- a/usr/src/uts/i86pc/dboot/dboot_startkern.c
+++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c
@@ -63,6 +63,15 @@ extern int have_cpuid(void);
 #define SHA1_ASCII_LENGTH   (SHA1_DIGEST_LENGTH * 2)
 
 /*
+ * Region of memory that may be corrupted by external actors. This can go away
+ * once the firmware bug RICHMOND-16 is fixed and all systems with the bug are
+ * upgraded.
+ */
+#define CORRUPT_REGION_START    0xc700000
+#define CORRUPT_REGION_SIZE     0x100000
+#define CORRUPT_REGION_END      (CORRUPT_REGION_START + CORRUPT_REGION_SIZE)
+
+/*
  * This file contains code that runs to transition us from either a multiboot
  * compliant loader (32 bit non-paging) or a XPV domain loader to
  * regular kernel execution. Its task is to setup the kernel memory image
@@ -1155,6 +1164,38 @@ init_mem_alloc(void)
             case 1:
                 if (end > max_mem)
                     max_mem = end;
+
+                /*
+                 * Well, this is sad. On some systems, there
+                 * is a region of memory that can be corrupted
+                 * until some number of seconds after we have
+                 * booted. And the BIOS doesn't tell us that
+                 * this memory is unsafe to use. And we don't
+                 * know how long it's dangerous. So we'll
+                 * chop out this range from any memory list
+                 * that would otherwise be usable. Note that
+                 * any system of this type will give us the
+                 * new-style (0x40) memlist, so we need not
+                 * fix up the other path below.
+                 */
+                if (start < CORRUPT_REGION_START &&
+                    end > CORRUPT_REGION_START) {
+                    memlists[memlists_used].addr = start;
+                    memlists[memlists_used].size =
+                        CORRUPT_REGION_START - start;
+                    ++memlists_used;
+                    if (end > CORRUPT_REGION_END)
+                        start = CORRUPT_REGION_END;
+                    else
+                        continue;
+                }
+                if (start >= CORRUPT_REGION_START &&
+                    start < CORRUPT_REGION_END) {
+                    if (end <= CORRUPT_REGION_END)
+                        continue;
+                    start = CORRUPT_REGION_END;
+                }
+
                 memlists[memlists_used].addr = start;
                 memlists[memlists_used].size = end - start;
                 ++memlists_used;
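For illustration, the carve-out above can be exercised in isolation. The following is a minimal standalone sketch of the same interval logic; the struct and function names are hypothetical and not part of the change:

#include <stdint.h>
#include <stdio.h>

#define CORRUPT_REGION_START    0xc700000ULL
#define CORRUPT_REGION_SIZE     0x100000ULL
#define CORRUPT_REGION_END      (CORRUPT_REGION_START + CORRUPT_REGION_SIZE)

/* Hypothetical memlist entry, mirroring the shape dboot uses. */
struct ml { uint64_t addr, size; };

/*
 * Append the usable portions of [start, end) to 'out', skipping the
 * corrupt window.  Returns the number of entries written (0, 1, or 2).
 */
static int
carve(uint64_t start, uint64_t end, struct ml *out)
{
    int n = 0;

    /* Piece below the corrupt window survives. */
    if (start < CORRUPT_REGION_START && end > CORRUPT_REGION_START) {
        out[n].addr = start;
        out[n].size = CORRUPT_REGION_START - start;
        n++;
        if (end <= CORRUPT_REGION_END)
            return (n);
        start = CORRUPT_REGION_END;
    }
    /* Range beginning inside the window: keep only the tail, if any. */
    if (start >= CORRUPT_REGION_START && start < CORRUPT_REGION_END) {
        if (end <= CORRUPT_REGION_END)
            return (n);
        start = CORRUPT_REGION_END;
    }
    out[n].addr = start;
    out[n].size = end - start;
    return (n + 1);
}

int
main(void)
{
    struct ml out[2];
    int i, n = carve(0xc600000, 0xc900000, out);

    for (i = 0; i < n; i++)   /* prints the two surviving pieces */
        printf("[%llx, +%llx)\n", (unsigned long long)out[i].addr,
            (unsigned long long)out[i].size);
    return (0);
}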
diff --git a/usr/src/uts/i86pc/io/ppm/acpisleep.c b/usr/src/uts/i86pc/io/ppm/acpisleep.c
index 78328170e6..6b94e50909 100644
--- a/usr/src/uts/i86pc/io/ppm/acpisleep.c
+++ b/usr/src/uts/i86pc/io/ppm/acpisleep.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -56,6 +57,19 @@
 
 int acpi_rtc_wake = 0x0;        /* wake in N seconds */
 
+/*
+ * Execute optional ACPI methods for suspend/resume.
+ * The value can be ACPI_EXECUTE_GTS and/or ACPI_EXECUTE_BFS.
+ * Global so it can be set in /etc/system.
+ * From usr/src/uts/intel/io/acpica/changes.txt:
+ *     It has been seen on some systems where the execution of these
+ *     methods causes errors and also prevents the machine from entering S5.
+ *     It is therefore suggested that host operating systems do not execute
+ *     these methods by default. In the future, perhaps these methods can be
+ *     optionally executed based on the age of the system...
+ */
+int acpi_sleep_flags = ACPI_NO_OPTIONAL_METHODS;
+
 #if 0   /* debug */
 static uint8_t  branchbuf[64 * 1024];   /* for the HDT branch trace stuff */
 #endif  /* debug */
@@ -142,8 +156,9 @@ acpi_enter_sleepstate(s3a_t *s3ap)
      * Tell the hardware to sleep.
      */
     PT(PT_SXE);
-    PMD(PMD_SX, ("Calling AcpiEnterSleepState(%d) ...\n", Sx))
-    if (AcpiEnterSleepState(Sx) != AE_OK) {
+    PMD(PMD_SX, ("Calling AcpiEnterSleepState(%d, %d) ...\n", Sx,
+        acpi_sleep_flags))
+    if (AcpiEnterSleepState(Sx, acpi_sleep_flags) != AE_OK) {
         PT(PT_SXE_FAIL);
         PMD(PMD_SX, ("... failed!\n"))
     }
@@ -163,6 +178,11 @@ acpi_exit_sleepstate(s3a_t *s3ap)
     PMD(PMD_SX, ("!We woke up!\n"))
     PT(PT_LSS);
 
+    if (AcpiLeaveSleepStatePrep(Sx, acpi_sleep_flags) != AE_OK) {
+        PT(PT_LSS_FAIL);
+        PMD(PMD_SX, ("Problem with LeaveSleepState!\n"))
+    }
+
     if (AcpiLeaveSleepState(Sx) != AE_OK) {
         PT(PT_LSS_FAIL);
         PMD(PMD_SX, ("Problem with LeaveSleepState!\n"))
diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c
index 7a3dd8a733..9c17d85228 100644
--- a/usr/src/uts/i86pc/io/psm/psm_common.c
+++ b/usr/src/uts/i86pc/io/psm/psm_common.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -979,7 +980,7 @@ acpi_poweroff(void)
         return (1);
     }
     ACPI_DISABLE_IRQS();
-    status = AcpiEnterSleepState(5);
+    status = AcpiEnterSleepState(5, ACPI_NO_OPTIONAL_METHODS);
     ACPI_ENABLE_IRQS();
 
     /* we should be off; if we get here it's an error */
diff --git a/usr/src/uts/i86pc/ml/comm_page.s b/usr/src/uts/i86pc/ml/comm_page.s
new file mode 100644
index 0000000000..7ff803ea93
--- /dev/null
+++ b/usr/src/uts/i86pc/ml/comm_page.s
@@ -0,0 +1,88 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/asm_linkage.h>
+#include <sys/asm_misc.h>
+#include <sys/param.h>
+#include <sys/comm_page.h>
+#include <sys/tsc.h>
+
+#if defined(__lint)
+
+hrtime_t tsc_last;
+hrtime_t tsc_resume_cap;
+hrtime_t tsc_hrtime_base;
+uint32_t tsc_max_delta;
+volatile uint32_t hres_lock;
+uint32_t tsc_type;
+uint32_t nsec_scale;
+int64_t hrestime_adj;
+hrtime_t hres_last_tick;
+uint32_t tsc_ncpu;
+volatile timestruc_t hrestime;
+hrtime_t tsc_sync_tick_delta[NCPU];
+
+comm_page_t comm_page;
+
+#else /* defined(__lint) */
+
+#include "assym.h"
+
+/*
+ * x86 Comm Page
+ *
+ * This is the definition for the comm page on x86.  The purpose of this
+ * struct is to consolidate certain pieces of kernel state into one
+ * contiguous section of memory in order for it to be exposed (read-only) to
+ * userspace.  The struct contents are defined by hand so that member
+ * variables will maintain their original symbols for use throughout the rest
+ * of the kernel.  This layout must exactly match the C definition of
+ * comm_page_t.
+ * See: "uts/i86pc/sys/comm_page.h"
+ */
+
+	.data
+	DGDEF3(comm_page, COMM_PAGE_S_SIZE, 4096)
+	DGDEF2(tsc_last, 8)
+	.fill	1, 8, 0
+	DGDEF2(tsc_hrtime_base, 8)
+	.fill	1, 8, 0
+	DGDEF2(tsc_resume_cap, 8)
+	.fill	1, 8, 0
+	DGDEF2(tsc_type, 4);
+	.fill	1, 4, _CONST(TSC_RDTSC_CPUID)
+	DGDEF2(tsc_max_delta, 4);
+	.fill	1, 4, 0
+	DGDEF2(hres_lock, 4);
+	.fill	1, 4, 0
+	DGDEF2(nsec_scale, 4);
+	.fill	1, 4, 0
+	DGDEF2(hrestime_adj, 8)
+	.fill	1, 8, 0
+	DGDEF2(hres_last_tick, 8)
+	.fill	1, 8, 0
+	DGDEF2(tsc_ncpu, 4)
+	.fill	1, 4, 0
+	/* _cp_pad */
+	.fill	1, 4, 0
+	DGDEF2(hrestime, _MUL(2, 8))
+	.fill	2, 8, 0
+	DGDEF2(tsc_sync_tick_delta, _MUL(NCPU, 8))
+	.fill	_CONST(NCPU), 8, 0
+
+	/* pad out the rest of the page from the struct end */
+	.fill	_CONST(COMM_PAGE_SIZE - COMM_PAGE_S_SIZE), 1, 0
+
+#endif /* defined(__lint) */
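Because the assembly above hand-builds what C code sees as comm_page_t, drift between the two definitions would corrupt the page silently. Below is a hedged sketch of the kind of compile-time check that could guard this; CTASSERT is illumos's compile-time assertion, and the offsets follow from the member sizes emitted above but are not asserted anywhere in this diff:

/*
 * Sketch, not part of the change: tie the hand-maintained assembly layout
 * to the C struct at compile time.
 */
#include <sys/sysmacros.h>	/* offsetof */
#include <sys/debug.h>		/* CTASSERT */
#include <sys/comm_page.h>

CTASSERT(offsetof(comm_page_t, cp_tsc_last) == 0);
CTASSERT(offsetof(comm_page_t, cp_tsc_hrtime_base) == 8);
CTASSERT(offsetof(comm_page_t, cp_tsc_resume_cap) == 16);
CTASSERT(offsetof(comm_page_t, cp_tsc_type) == 24);
CTASSERT(offsetof(comm_page_t, cp_hres_lock) == 32);	/* 8-byte aligned */
CTASSERT(offsetof(comm_page_t, cp_hrestime) == 64);
CTASSERT(offsetof(comm_page_t, cp_tsc_sync_tick_delta) == 80);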
+ * See: "uts/i86pc/sys/comm_page.h" + */ + + .data + DGDEF3(comm_page, COMM_PAGE_S_SIZE, 4096) + DGDEF2(tsc_last, 8) + .fill 1, 8, 0 + DGDEF2(tsc_hrtime_base, 8) + .fill 1, 8, 0 + DGDEF2(tsc_resume_cap, 8) + .fill 1, 8, 0 + DGDEF2(tsc_type, 4); + .fill 1, 4, _CONST(TSC_RDTSC_CPUID) + DGDEF2(tsc_max_delta, 4); + .fill 1, 4, 0 + DGDEF2(hres_lock, 4); + .fill 1, 4, 0 + DGDEF2(nsec_scale, 4); + .fill 1, 4, 0 + DGDEF2(hrestime_adj, 8) + .fill 1, 8, 0 + DGDEF2(hres_last_tick, 8) + .fill 1, 8, 0 + DGDEF2(tsc_ncpu, 4) + .fill 1, 4, 0 + /* _cp_pad */ + .fill 1, 4, 0 + DGDEF2(hrestime, _MUL(2, 8)) + .fill 2, 8, 0 + DGDEF2(tsc_sync_tick_delta, _MUL(NCPU, 8)) + .fill _CONST(NCPU), 8, 0 + + /* pad out the rest of the page from the struct end */ + .fill _CONST(COMM_PAGE_SIZE - COMM_PAGE_S_SIZE), 1, 0 + +#endif /* defined(__lint) */ diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 721d32fa3a..a1f83d3cf8 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,6 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. +\ Copyright 2016 Joyent, Inc. \ \ CDDL HEADER START \ @@ -61,6 +62,7 @@ #include <sys/brand.h> #include <sys/fastboot.h> #include <sys/cpr_wakecode.h> +#include <sys/comm_page.h> proc PROCSIZE p_link @@ -150,6 +152,8 @@ _klwp lwp_thread lwp_procp lwp_brand + lwp_brand_syscall + lwp_brand_syscall_fast lwp_eosys lwp_regs lwp_arg @@ -467,3 +471,5 @@ wc_cpu WC_CPU_SIZE wc_wakecode wc_cpu + +comm_page_s COMM_PAGE_S_SIZE diff --git a/usr/src/uts/i86pc/ml/syscall_asm.s b/usr/src/uts/i86pc/ml/syscall_asm.s index 61ef4ac6c3..68181be28a 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm.s +++ b/usr/src/uts/i86pc/ml/syscall_asm.s @@ -631,6 +631,36 @@ _sysenter_done: sysexit SET_SIZE(sys_sysenter) SET_SIZE(brand_sys_sysenter) +#endif /* __lint */ + +#if defined(__lint) +/* + * System call via an int80. This entry point is only used by the Linux + * application environment. Unlike the sysenter path, there is no default + * action to take if no callback is registered for this process. + */ +void +sys_int80() +{} + +#else /* __lint */ + + ENTRY_NP(brand_sys_int80) + BRAND_CALLBACK(BRAND_CB_INT80) + + ALTENTRY(sys_int80) + /* + * We hit an int80, but this process isn't of a brand with an int80 + * handler. Bad process! Make it look as if the INT failed. + * Modify %eip to point before the INT, push the expected error + * code and fake a GP fault. + * + */ + subl $2, (%esp) /* int insn 2-bytes */ + pushl $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2) + jmp gptrap / GP fault + SET_SIZE(sys_int80) + SET_SIZE(brand_sys_int80) /* * Declare a uintptr_t which covers the entire pc range of syscall diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index f26468c8cc..bc901e3e42 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -503,8 +503,25 @@ noprod_sys_syscall: movq T_LWP(%r15), %r14 ASSERT_NO_RUPDATE_PENDING(%r14) + ENABLE_INTR_FLAGS + /* + * If our LWP has a branded syscall_fast handler, execute it. A return + * code of zero indicates that the handler completely processed the syscall + * and we can return directly to userspace. 
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index f26468c8cc..bc901e3e42 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -503,8 +503,25 @@ noprod_sys_syscall:
 	movq	T_LWP(%r15), %r14
 	ASSERT_NO_RUPDATE_PENDING(%r14)
 
 	ENABLE_INTR_FLAGS
 
+	/*
+	 * If our LWP has a branded syscall_fast handler, execute it.  A
+	 * return code of zero indicates that the handler completely
+	 * processed the syscall and we can return directly to userspace.
+	 */
+	movq	LWP_BRAND_SYSCALL_FAST(%r14), %rdi
+	testq	%rdi, %rdi
+	jz	_syscall_no_brand_fast
+	call	*%rdi
+	testl	%eax, %eax
+	jnz	_syscall_no_brand_fast
+	incq	LWP_RU_SYSC(%r14)
+	incq	%gs:CPU_STATS_SYS_SYSCALL
+	jmp	_sys_rtt
+
+_syscall_no_brand_fast:
 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
@@ -516,6 +533,28 @@ noprod_sys_syscall:
 
 	incq	%gs:CPU_STATS_SYS_SYSCALL
 
+	/*
+	 * If our LWP has an alternate system call handler, run that instead
+	 * of the regular system call path.
+	 */
+	movq	LWP_BRAND_SYSCALL(%r14), %rdi
+	testq	%rdi, %rdi
+	jz	_syscall_no_brand
+
+	pushq	%rax
+	subq	$8, %rsp	/* align stack for call to C */
+	call	*%rdi
+	addq	$8, %rsp
+
+	/*
+	 * If the alternate handler returns 0, we skip straight to the return
+	 * to usermode.  Otherwise, we resume regular system call processing.
+	 */
+	testl	%eax, %eax
+	popq	%rax
+	jz	_syscall_after_brand
+
+_syscall_no_brand:
 	movw	%ax, T_SYSNUM(%r15)
 	movzbl	T_PRE_SYS(%r15), %ebx
 	ORL_SYSCALLTRACE(%ebx)
@@ -550,6 +589,8 @@ _syscall_invoke:
 	shrq	$32, %r13	/* upper 32-bits into %edx */
 	movl	%r12d, %r12d	/* lower 32-bits into %eax */
 5:
+
+_syscall_after_brand:
 	/*
 	 * Optimistically assume that there's no post-syscall
 	 * work to do.  (This is to avoid having to call syscall_mstate()
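Rendered as C, the dispatch the assembly above implements looks roughly like this. It is an illustrative sketch, not code from the commit; the stub type and helpers are hypothetical:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for the kernel lwp; illustration only. */
typedef struct klwp {
    int (*lwp_brand_syscall)(void);
    int (*lwp_brand_syscall_fast)(void);
} klwp_t;

static int native_syscall(void) { printf("native path\n"); return (1); }

/*
 * Control flow of the assembly above: a "fast" handler runs before the
 * mstate transition and, by returning 0, sends the thread straight back
 * to userspace; the regular brand handler runs after syscall accounting
 * and may likewise consume the syscall or defer to the native path.
 */
static int
brand_dispatch(klwp_t *lwp)
{
    if (lwp->lwp_brand_syscall_fast != NULL &&
        lwp->lwp_brand_syscall_fast() == 0)
        return (0);     /* counted, then direct return to userspace */

    /* MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) and accounting occur here */

    if (lwp->lwp_brand_syscall != NULL &&
        lwp->lwp_brand_syscall() == 0)
        return (0);     /* skip to the post-syscall return path */

    return (native_syscall());
}

int
main(void)
{
    klwp_t lwp = { NULL, NULL };        /* unbranded lwp takes native path */

    return (brand_dispatch(&lwp) == 1 ? 0 : 1);
}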
@@ -787,6 +828,22 @@ _syscall32_save:
 
 	ENABLE_INTR_FLAGS
 
+	/*
+	 * If our LWP has a branded syscall_fast handler, execute it.  A
+	 * return code of zero indicates that the handler completely
+	 * processed the syscall and we can return directly to userspace.
+	 */
+	movq	LWP_BRAND_SYSCALL_FAST(%r14), %rdi
+	testq	%rdi, %rdi
+	jz	_syscall32_no_brand_fast
+	call	*%rdi
+	testl	%eax, %eax
+	jnz	_syscall32_no_brand_fast
+	incq	LWP_RU_SYSC(%r14)
+	incq	%gs:CPU_STATS_SYS_SYSCALL
+	jmp	_sys_rtt
+
+_syscall32_no_brand_fast:
 	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
@@ -795,11 +852,37 @@ _syscall32_save:
 
 	incq	%gs:CPU_STATS_SYS_SYSCALL
 
+	/*
+	 * If our lwp has an alternate system call handler, run that instead
+	 * of the regular system call path.
+	 */
+	movq	LWP_BRAND_SYSCALL(%r14), %rax
+	testq	%rax, %rax
+	jz	_syscall32_no_brand
+
+	movb	$LWP_SYS, LWP_STATE(%r14)
+	call	*%rax
+
+	/*
+	 * If the alternate handler returns 0, we skip straight to the return
+	 * to usermode.  Otherwise, we resume regular system call processing.
+	 */
+	testl	%eax, %eax
+	jz	_syscall32_after_brand
+
+_syscall32_no_brand:
 	/*
 	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
 	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
 	 * more succinctly:
 	 *
 	 * SA(MAXSYSARGS * sizeof (long)) == 64
+	 *
+	 * Note, this space is used both to copy in the arguments from
+	 * userland and as part of the old UNIX-style syscall_ap() method.
+	 * syscall_entry expects that we do not change the values of this
+	 * space that we give it.  However, this means that when we end up in
+	 * the more recent model of passing the arguments based on the
+	 * calling conventions, we'll need to save an additional 16 bytes of
+	 * stack.
 	 */
 #define	SYS_DROP	64	/* drop for args */
 	subq	$SYS_DROP, %rsp
@@ -827,12 +910,16 @@ _syscall32_save:
 	 */
 	movq	%rax, %rbx
-	movl	0(%rsp), %edi
-	movl	8(%rsp), %esi
-	movl	0x10(%rsp), %edx
-	movl	0x18(%rsp), %ecx
-	movl	0x20(%rsp), %r8d
-	movl	0x28(%rsp), %r9d
+	movl	0x0(%rsp), %edi		/* arg0 */
+	movl	0x8(%rsp), %esi		/* arg1 */
+	movl	0x10(%rsp), %edx	/* arg2 */
+	movl	0x38(%rsp), %eax	/* arg7 load */
+	movl	0x18(%rsp), %ecx	/* arg3 */
+	pushq	%rax			/* arg7 saved to stack */
+	movl	0x28(%rsp), %r8d	/* arg4 */
+	movl	0x38(%rsp), %eax	/* arg6 load */
+	movl	0x30(%rsp), %r9d	/* arg5 */
+	pushq	%rax			/* arg6 saved to stack */
 
 	call	*SY_CALLC(%rbx)
 
@@ -850,6 +937,8 @@ _syscall32_save:
 	shrq	$32, %r13	/* upper 32-bits into %edx */
 	movl	%eax, %r12d	/* lower 32-bits into %eax */
 
+_syscall32_after_brand:
+
 	/*
 	 * Optimistically assume that there's no post-syscall
 	 * work to do.  (This is to avoid having to call syscall_mstate()
@@ -1079,15 +1168,20 @@ sys_sysenter()
 	/*
 	 * Fetch the arguments copied onto the kernel stack and put
 	 * them in the right registers to invoke a C-style syscall handler.
-	 * %rax contains the handler address.
+	 * %rax contains the handler address.  For the last two arguments, we
+	 * push them onto the stack -- we can't clobber the old arguments.
 	 */
 	movq	%rax, %rbx
-	movl	0(%rsp), %edi
-	movl	8(%rsp), %esi
-	movl	0x10(%rsp), %edx
-	movl	0x18(%rsp), %ecx
-	movl	0x20(%rsp), %r8d
-	movl	0x28(%rsp), %r9d
+	movl	0x0(%rsp), %edi		/* arg0 */
+	movl	0x8(%rsp), %esi		/* arg1 */
+	movl	0x10(%rsp), %edx	/* arg2 */
+	movl	0x38(%rsp), %eax	/* arg7 load */
+	movl	0x18(%rsp), %ecx	/* arg3 */
+	pushq	%rax			/* arg7 saved to stack */
+	movl	0x28(%rsp), %r8d	/* arg4 */
+	movl	0x38(%rsp), %eax	/* arg6 load */
+	movl	0x30(%rsp), %r9d	/* arg5 */
+	pushq	%rax			/* arg6 saved to stack */
 
 	call	*SY_CALLC(%rbx)
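The reworked register shuffle exists because the System V AMD64 calling convention passes only the first six integer arguments in registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9); a C handler declared with eight arguments expects the seventh and eighth on the caller's stack, which is what the two pushq instructions arrange. For illustration:

#include <stdio.h>

/*
 * Under the System V AMD64 ABI, a0..a5 arrive in registers; a6 and a7 are
 * read from the stack.  The assembly above pushes the 7th and 8th syscall
 * arguments so a handler declared like this finds them where the ABI says
 * they must be.
 */
static long
handler8(long a0, long a1, long a2, long a3,
    long a4, long a5, long a6, long a7)
{
    return (a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7);
}

int
main(void)
{
    printf("%ld\n", handler8(1, 2, 3, 4, 5, 6, 7, 8));  /* 36 */
    return (0);
}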
@@ -1159,6 +1253,74 @@ sys_sysenter()
 	SET_SIZE(sys_sysenter)
 	SET_SIZE(brand_sys_sysenter)
 #endif	/* __lint */
+
+#if defined(__lint)
+/*
+ * System call via an int80.  This entry point is only used by the Linux
+ * application environment.  Unlike the other entry points, there is no
+ * default action to take if no callback is registered for this process.
+ */
+void
+sys_int80()
+{}
+
+#else	/* __lint */
+
+	ENTRY_NP(brand_sys_int80)
+	SWAPGS				/* kernel gsbase */
+	XPV_TRAP_POP
+	call	smap_enable
+
+	/*
+	 * We first attempt to call the "b_int80" handler from the "struct
+	 * brand_mach_ops" for this brand.  If no handler function is
+	 * installed for this brand, the BRAND_CALLBACK() macro returns here
+	 * and we check the lwp for a "lwp_brand_syscall" handler.
+	 */
+	BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
+
+	/*
+	 * Check to see if this lwp provides "lwp_brand_syscall".  If so, we
+	 * will route this int80 through the regular system call handling
+	 * path.
+	 */
+	movq	%r15, %gs:CPU_RTMP_R15
+	movq	%gs:CPU_THREAD, %r15
+	movq	T_LWP(%r15), %r15
+	movq	LWP_BRAND_SYSCALL(%r15), %r15
+	testq	%r15, %r15
+	movq	%gs:CPU_RTMP_R15, %r15
+	jnz	nopop_syscall_int
+
+	/*
+	 * The brand provided neither a "b_int80", nor a "lwp_brand_syscall"
+	 * function, and has thus opted out of handling this trap.
+	 */
+	SWAPGS				/* user gsbase */
+	jmp	nopop_int80
+
+	ENTRY_NP(sys_int80)
+	/*
+	 * We hit an int80, but this process isn't of a brand with an int80
+	 * handler.  Bad process!  Make it look as if the INT failed.
+	 * Modify %rip to point before the INT, push the expected error
+	 * code and fake a GP fault.  Note on 64-bit hypervisor we need
+	 * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
+	 * because gptrap will pop them again with its own XPV_TRAP_POP.
+	 */
+	XPV_TRAP_POP
+	call	smap_enable
+nopop_int80:
+	subq	$2, (%rsp)	/* int insn 2-bytes */
+	pushq	$_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+#if defined(__xpv)
+	push	%r11
+	push	%rcx
+#endif
+	jmp	gptrap			/ GP fault
+	SET_SIZE(sys_int80)
+	SET_SIZE(brand_sys_int80)
+#endif	/* __lint */
+
 /*
  * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
diff --git a/usr/src/uts/i86pc/os/comm_page_util.c b/usr/src/uts/i86pc/os/comm_page_util.c
new file mode 100644
index 0000000000..1c8c9f8afd
--- /dev/null
+++ b/usr/src/uts/i86pc/os/comm_page_util.c
@@ -0,0 +1,62 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <vm/as.h>
+#include <vm/seg_umap.h>
+
+#if defined(__x86) && !defined(__xpv)
+#include <sys/comm_page.h>
+#endif /* defined(__x86) && !defined(__xpv) */
+
+/*
+ * Map in the comm page.
+ *
+ * The contents of the comm page are only defined on non-xpv x86 at this
+ * time.  Furthermore, the data is only valid in userspace (32-bit or 64-bit)
+ * when mapped from a 64-bit kernel.
+ * See: "uts/i86pc/sys/comm_page.h"
+ */
+caddr_t
+comm_page_mapin()
+{
+#if defined(__amd64) && !defined(__xpv)
+	proc_t *p = curproc;
+	caddr_t addr = (caddr_t)COMM_PAGE_ALIGN;
+	size_t len = COMM_PAGE_SIZE;
+	uint_t prot = PROT_USER | PROT_READ;
+	segumap_crargs_t suarg;
+
+	map_addr(&addr, len, (offset_t)0, 1, MAP_ALIGN);
+	if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as,
+	    p->p_as->a_userlimit) != RANGE_OKAY) {
+		return (NULL);
+	}
+
+	suarg.kaddr = (caddr_t)&comm_page;
+	suarg.prot = suarg.maxprot = prot;
+	if (as_map(p->p_as, addr, len, segumap_create, &suarg) != 0) {
+		return (NULL);
+	}
+	return (addr);
+#else /* defined(__amd64) && !defined(__xpv) */
+	return (NULL);
+#endif /* defined(__amd64) && !defined(__xpv) */
+}
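No consumer of comm_page_mapin() is added in this diff; a hypothetical caller might look like the sketch below. It assumes, as with other map_addr() callers in the VM code, that the address-space range lock is held around the mapping:

#include <sys/types.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <vm/as.h>
#include <sys/comm_page.h>

/*
 * Hypothetical caller, for illustration only: expose the comm page to the
 * current process, e.g. from exec or a brand's process-init hook.
 */
static int
example_expose_comm_page(proc_t *p, caddr_t *uaddrp)
{
	caddr_t addr;

	ASSERT(p == curproc);	/* comm_page_mapin() acts on curproc */

	as_rangelock(p->p_as);
	addr = comm_page_mapin();
	as_rangeunlock(p->p_as);

	if (addr == NULL)
		return (ENOMEM);	/* unsupported platform or no VA */
	*uaddrp = addr;
	return (0);
}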
diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c
index 91fb583a01..f173a1dc57 100644
--- a/usr/src/uts/i86pc/os/cpr_impl.c
+++ b/usr/src/uts/i86pc/os/cpr_impl.c
@@ -753,6 +753,20 @@ i_cpr_is_supported(int sleeptype)
 	if (sleeptype != CPR_TORAM)
 		return (0);
 
+	/*
+	 * Unfortunately, the x86 resume code was never implemented for GAS.
+	 * The only obvious problem is that a trick necessary to appease Sun
+	 * Studio does the wrong thing for GAS.  Doubly unfortunate is that
+	 * the condition used to detect GAS is incorrect, so we do in fact
+	 * compile the Studio path; it just immediately fails in resume.
+	 *
+	 * Given that, if we were built using GCC, never allow CPR to be
+	 * attempted.
+	 */
+#ifdef __GNUC__
+	return (0);
+#endif
+
 	/*
 	 * The next statement tests if a specific platform has turned off
 	 * cpr support.
 	 */
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index 44e475f328..027ed29c3d 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -32,7 +32,7 @@
  * Portions Copyright 2009 Advanced Micro Devices, Inc.
  */
 /*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
 */
 /*
  * Various routines to handle identification
@@ -57,6 +57,8 @@
 #include <sys/auxv_386.h>
 #include <sys/memnode.h>
 #include <sys/pci_cfgspace.h>
+#include <sys/comm_page.h>
+#include <sys/tsc.h>
 
 #ifdef __xpv
 #include <sys/hypervisor.h>
@@ -171,7 +173,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = {
 	"bmi2",
 	"fma",
 	"smep",
-	"smap"
+	"smap",
+	"adx",
+	"rdseed"
 };
 
 boolean_t
@@ -1264,6 +1268,11 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
 		    disable_smap == 0)
 			add_x86_feature(featureset, X86FSET_SMAP);
 #endif
+		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
+			add_x86_feature(featureset, X86FSET_RDSEED);
+
+		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
+			add_x86_feature(featureset, X86FSET_ADX);
 	}
 
 	/*
@@ -2739,6 +2748,10 @@ cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
 			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
 		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
 			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
+		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
+			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
+		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
+			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
 
 		/*
 		 * [no explicit support required beyond x87 fp context]
@@ -2808,8 +2821,20 @@ cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
 
 		if (*ecx & CPUID_INTC_ECX_RDRAND)
 			hwcap_flags_2 |= AV_386_2_RDRAND;
+		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
+			hwcap_flags_2 |= AV_386_2_ADX;
+		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
+			hwcap_flags_2 |= AV_386_2_RDSEED;
+
 	}
+
+	/* Detect systems with a potential CPUID limit */
+	if (cpi->cpi_vendor == X86_VENDOR_Intel && cpi->cpi_maxeax < 4) {
+		cmn_err(CE_NOTE, "CPUID limit detected, "
+		    "see the CPUID(7D) man page for details\n");
+	}
+
 	if (cpi->cpi_xmaxeax < 0x80000001)
 		goto pass4_done;
@@ -4591,27 +4616,30 @@ patch_tsc_read(int flag)
 	size_t cnt;
 
 	switch (flag) {
-	case X86_NO_TSC:
+	case TSC_NONE:
 		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
 		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
 		break;
-	case X86_HAVE_TSCP:
-		cnt = &_tscp_end - &_tscp_start;
-		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
-		break;
-	case X86_TSC_MFENCE:
+	case TSC_RDTSC_MFENCE:
 		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
 		(void) memcpy((void *)tsc_read,
 		    (void *)&_tsc_mfence_start, cnt);
 		break;
-	case X86_TSC_LFENCE:
+	case TSC_RDTSC_LFENCE:
 		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
 		(void) memcpy((void *)tsc_read,
 		    (void *)&_tsc_lfence_start, cnt);
 		break;
+	case TSC_TSCP:
+		cnt = &_tscp_end - &_tscp_start;
+		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
+		break;
 	default:
+		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
+		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
 		break;
 	}
+	tsc_type = flag;
 }
 
 int
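The TSC_* values name fencing strategies for reading the time-stamp counter. As a userspace illustration of the variants being patched in (a sketch using GCC inline assembly, not the kernel's patch mechanism):

#include <stdint.h>
#include <stdio.h>

/*
 * Userspace renderings of the tsc_read() variants selected above.  LFENCE
 * suffices to serialize RDTSC on Intel, MFENCE is used on older AMD parts,
 * and RDTSCP serializes itself (and also returns a CPU id in %ecx).
 */
static inline uint64_t
rdtsc_lfence(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__("lfence; rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32 | lo);
}

static inline uint64_t
rdtsc_mfence(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__("mfence; rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32 | lo);
}

static inline uint64_t
rdtscp(void)
{
    uint32_t lo, hi, aux;
    __asm__ __volatile__("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
    return ((uint64_t)hi << 32 | lo);
}

int
main(void)
{
    printf("lfence: %llu\n", (unsigned long long)rdtsc_lfence());
    printf("mfence: %llu\n", (unsigned long long)rdtsc_mfence());
    printf("rdtscp: %llu\n", (unsigned long long)rdtscp());
    return (0);
}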
diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c
index d9ed882705..fab1324787 100644
--- a/usr/src/uts/i86pc/os/ibft.c
+++ b/usr/src/uts/i86pc/os/ibft.c
@@ -39,6 +39,7 @@
 #include <sys/kmem.h>
 #include <sys/psm.h>
 #include <sys/bootconf.h>
+#include <sys/reboot.h>
 
 typedef enum ibft_structure_type {
 	Reserved = 0,
@@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp);
 static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft,
     iscsi_ibft_tgt_t *tgtp);
 
+extern int boothowto;
 
 /*
  * Return value:
@@ -759,7 +761,9 @@ ld_ib_prop()
 		 * 1) pass "-B ibft-noprobe=1" on kernel command line
		 * 2) add line "set ibft_noprobe=1" in /etc/system
		 */
-		cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+		if (boothowto & RB_VERBOSE) {
+			cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+		}
 		return;
 	}
diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c
index b46bbf849d..981398970e 100644
--- a/usr/src/uts/i86pc/os/lgrpplat.c
+++ b/usr/src/uts/i86pc/os/lgrpplat.c
@@ -2799,7 +2799,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
 /*
  * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
  * and memory are local to each other in the same NUMA node and return number
- * of nodes
+ * of nodes.
+ *
+ * The SRAT table pointer is populated during bootup by
+ * build_firmware_properties() in fakebop.c.  Several motherboard and BIOS
+ * manufacturers are guilty of not having a SRAT table.
  */
 static int
 lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
@@ -2816,9 +2820,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
 	/*
 	 * Nothing to do when no SRAT or disabled
 	 */
-	if (tp == NULL || !lgrp_plat_srat_enable)
+	if (!lgrp_plat_srat_enable)
 		return (-1);
 
+	if (tp == NULL) {
+		cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. "
+		    "lgrp support will be limited to one group.\n");
+		return (-1);
+	}
+
 	/*
 	 * Try to get domain information from MSCT table.
 	 * ACPI4.0: OSPM will use information provided by the MSCT only
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 045adbcb7b..438f83b6e9 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -23,6 +23,7 @@
  *
  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 /*
  * Copyright (c) 2010, Intel Corporation.
@@ -61,6 +62,7 @@
 #include <sys/promif.h>
 #include <sys/pci_cfgspace.h>
 #include <sys/bootvfs.h>
+#include <sys/tsc.h>
 #ifdef __xpv
 #include <sys/hypervisor.h>
 #else
@@ -227,15 +229,15 @@ mlsetup(struct regs *rp)
 	 */
 	if ((get_hwenv() & HW_XEN_HVM) == 0 &&
 	    is_x86_feature(x86_featureset, X86FSET_TSCP))
-		patch_tsc_read(X86_HAVE_TSCP);
+		patch_tsc_read(TSC_TSCP);
 	else if (cpuid_getvendor(CPU) == X86_VENDOR_AMD &&
 	    cpuid_getfamily(CPU) <= 0xf &&
 	    is_x86_feature(x86_featureset, X86FSET_SSE2))
-		patch_tsc_read(X86_TSC_MFENCE);
+		patch_tsc_read(TSC_RDTSC_MFENCE);
 	else if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
 	    cpuid_getfamily(CPU) <= 6 &&
 	    is_x86_feature(x86_featureset, X86FSET_SSE2))
-		patch_tsc_read(X86_TSC_LFENCE);
+		patch_tsc_read(TSC_RDTSC_LFENCE);
 
 #endif	/* !__xpv */
 
@@ -246,7 +248,7 @@ mlsetup(struct regs *rp)
 	 * return 0.
 	 */
 	if (!is_x86_feature(x86_featureset, X86FSET_TSC))
-		patch_tsc_read(X86_NO_TSC);
+		patch_tsc_read(TSC_NONE);
 #endif	/* __i386 && !__xpv */
 
 #if defined(__amd64) && !defined(__xpv)
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index 3c7b453949..829c631096 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -27,7 +27,7 @@
  *	All rights reserved.
  */
 /*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */
 
@@ -166,6 +166,8 @@ init_cpu_info(struct cpu *cp)
 void
 init_cpu_syscall(struct cpu *cp)
 {
+	uint64_t flags;
+
 	kpreempt_disable();
 
 #if defined(__amd64)
@@ -247,6 +249,24 @@ init_cpu_syscall(struct cpu *cp)
 	kpreempt_enable();
 }
 
+#if !defined(__xpv)
+/*
+ * Configure per-cpu ID GDT
+ */
+static void
+init_cpu_id_gdt(struct cpu *cp)
+{
+	/* Write cpu_id into limit field of GDT for usermode retrieval */
+#if defined(__amd64)
+	set_usegd(&cp->cpu_gdt[GDT_CPUID], SDP_SHORT, NULL, cp->cpu_id,
+	    SDT_MEMRODA, SEL_UPL, SDP_BYTES, SDP_OP32);
+#elif defined(__i386)
+	set_usegd(&cp->cpu_gdt[GDT_CPUID], NULL, cp->cpu_id, SDT_MEMRODA,
+	    SEL_UPL, SDP_BYTES, SDP_OP32);
+#endif
+}
+#endif /* !defined(__xpv) */
+
 /*
  * Multiprocessor initialization.
  *
@@ -430,6 +450,10 @@ mp_cpu_configure_common(int cpun, boolean_t boot)
 
 	init_cpu_info(cp);
 
+#if !defined(__xpv)
+	init_cpu_id_gdt(cp);
+#endif
+
 	/*
 	 * alloc space for ucode_info
 	 */
@@ -1486,6 +1510,10 @@ start_other_cpus(int cprboot)
 	 */
 	init_cpu_info(CPU);
 
+#if !defined(__xpv)
+	init_cpu_id_gdt(CPU);
+#endif
+
 	cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_idstr);
 	cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_brandstr);
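init_cpu_id_gdt() stores the CPU id in the segment limit of a user-visible descriptor, so unprivileged code can recover it with the lsl instruction. A sketch of the userspace side follows; the selector value is a placeholder (it would be derived from GDT_CPUID's index with user RPL, which this diff does not spell out):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical selector for the GDT_CPUID slot: (index << 3) | RPL 3. */
#define CPUID_SELECTOR  ((24 << 3) | 3)

/*
 * LSL loads a segment limit without trapping, so the cpu_id written into
 * the limit field of the descriptor can be read directly from usermode.
 * ZF is set on success; setz captures it.
 */
static inline long
get_cpu_id(void)
{
    uintptr_t cpu_id = 0;
    uint8_t ok;

    __asm__ __volatile__("lsl %2, %0; setz %1"
        : "=r" (cpu_id), "=q" (ok)
        : "r" ((uintptr_t)CPUID_SELECTOR));
    return (ok ? (long)cpu_id : -1);
}

int
main(void)
{
    printf("running on cpu %ld\n", get_cpu_id());
    return (0);
}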
diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c
index c40159018c..7344e1a492 100644
--- a/usr/src/uts/i86pc/os/timestamp.c
+++ b/usr/src/uts/i86pc/os/timestamp.c
@@ -25,6 +25,7 @@
  *
  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -47,6 +48,7 @@
 #include <sys/panic.h>
 #include <sys/cpu.h>
 #include <sys/sdt.h>
+#include <sys/comm_page.h>
 
 /*
  * Using the Pentium's TSC register for gethrtime()
@@ -99,7 +101,6 @@
 
 #define	NSEC_SHIFT 5
 
-static uint_t nsec_scale;
 static uint_t nsec_unscale;
 
 /*
@@ -140,18 +141,12 @@ static volatile int tsc_sync_go;
 
 int tsc_master_slave_sync_needed = 1;
 
-static int	tsc_max_delta;
-static hrtime_t	tsc_sync_tick_delta[NCPU];
 typedef struct tsc_sync {
 	volatile hrtime_t master_tsc, slave_tsc;
 } tsc_sync_t;
 static tsc_sync_t *tscp;
-static hrtime_t largest_tsc_delta = 0;
-static ulong_t shortest_write_time = ~0UL;
 
-static hrtime_t	tsc_last = 0;
 static hrtime_t	tsc_last_jumped = 0;
-static hrtime_t	tsc_hrtime_base = 0;
 static int	tsc_jumped = 0;
 static uint32_t	tsc_wayback = 0;
 /*
@@ -159,7 +154,6 @@ static uint32_t tsc_wayback = 0;
  * tsc_tick() function runs which means that when gethrtime() is called it
  * should never be more than 1 second since tsc_last was updated.
  */
-static hrtime_t	tsc_resume_cap;
 static hrtime_t	tsc_resume_cap_ns = NANOSEC;	/* 1s */
 
 static hrtime_t	shadow_tsc_hrtime_base;
@@ -451,25 +445,27 @@ tsc_gethrtimeunscaled_delta(void)
 }
 
 /*
- * Called by the master in the TSC sync operation (usually the boot CPU).
- * If the slave is discovered to have a skew, gethrtimef will be changed to
- * point to tsc_gethrtime_delta(). Calculating skews is precise only when
- * the master and slave TSCs are read simultaneously; however, there is no
- * algorithm that can read both CPUs in perfect simultaneity. The proposed
- * algorithm is an approximate method based on the behaviour of cache
- * management. The slave CPU continuously reads TSC and then reads a global
- * variable which the master CPU updates. The moment the master's update
- * reaches the slave's visibility (being forced by an mfence operation) we
- * use the TSC reading taken on the slave. A corresponding TSC read will be
- * taken on the master as soon as possible after finishing the mfence
- * operation. But the delay between causing the slave to notice the invalid
- * cache line and the competion of mfence is not repeatable. This error is
- * heuristically assumed to be 1/4th of the total write time as being
- * measured by the two TSC reads on the master sandwiching the mfence.
- * Furthermore, due to the nature of bus arbitration, contention on memory
- * bus, etc., the time taken for the write to reflect globally can vary a
- * lot. So instead of taking a single reading, a set of readings are taken
- * and the one with least write time is chosen to calculate the final skew.
+ * TSC Sync Master
+ *
+ * Typically called on the boot CPU, this attempts to quantify TSC skew
+ * between different CPUs.  If an appreciable difference is found, gethrtimef
+ * will be changed to point to tsc_gethrtime_delta().
+ *
+ * Calculating skews is precise only when the master and slave TSCs are read
+ * simultaneously; however, there is no algorithm that can read both CPUs in
+ * perfect simultaneity.  The proposed algorithm is an approximate method
+ * based on the behaviour of cache management.  The slave CPU continuously
+ * polls the TSC while reading a global variable updated by the master CPU.
+ * The latest TSC reading is saved when the master's update (forced via
+ * mfence) reaches visibility on the slave.  The master will also take a TSC
+ * reading immediately following the mfence.
+ *
+ * While the delay between cache line invalidation on the slave and mfence
+ * completion on the master is not repeatable, the error is heuristically
+ * assumed to be 1/4th of the write time recorded by the master.  Multiple
+ * samples are taken to control for the variance caused by external factors
+ * such as bus contention.  Each sample set is independent per-CPU to control
+ * for differing memory latency on NUMA systems.
  *
  * TSC sync is disabled in the context of virtualization because the CPUs
  * assigned to the guest are virtual CPUs which means the real CPUs on which
@@ -482,7 +478,7 @@ void
 tsc_sync_master(processorid_t slave)
 {
 	ulong_t flags, source, min_write_time = ~0UL;
-	hrtime_t write_time, x, mtsc_after, tdelta;
+	hrtime_t write_time, mtsc_after, last_delta = 0;
 	tsc_sync_t *tsc = tscp;
 	int cnt;
 	int hwtype;
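A worked example of the estimate described above, with made-up numbers:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * Illustrative numbers, not measurements: the master read
 * mtsc_after = 10000 right after its mfence completed and measured
 * write_time = 40; the slave captured slave_tsc = 10030.  The raw delta
 * (30) exceeds write_time/4 (10), so mtsc_after is assumed to be about
 * 10 ticks late and is backed up, yielding an estimated skew of 40 ticks.
 */
int
main(void)
{
    int64_t mtsc_after = 10000, write_time = 40, slave_tsc = 10030;
    int64_t tdelta = slave_tsc - mtsc_after;

    if ((tdelta < 0 ? -tdelta : tdelta) > write_time / 4)
        tdelta = slave_tsc - (mtsc_after - write_time / 4);

    printf("estimated skew: %" PRId64 " ticks\n", tdelta);  /* 40 */
    return (0);
}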
@@ -505,57 +501,53 @@ tsc_sync_master(processorid_t slave)
 			SMT_PAUSE();
 		write_time = mtsc_after - tsc->master_tsc;
 		if (write_time <= min_write_time) {
-			min_write_time = write_time;
+			hrtime_t tdelta;
+
+			tdelta = tsc->slave_tsc - mtsc_after;
+			if (tdelta < 0)
+				tdelta = -tdelta;
 			/*
-			 * Apply heuristic adjustment only if the calculated
-			 * delta is > 1/4th of the write time.
+			 * If the margin exists, subtract 1/4th of the
+			 * measured write time from the master's TSC value.
+			 * This is an estimate of how late the mfence
+			 * completion came after the slave noticed the cache
+			 * line change.
 			 */
-			x = tsc->slave_tsc - mtsc_after;
-			if (x < 0)
-				x = -x;
-			if (x > (min_write_time/4))
-				/*
-				 * Subtract 1/4th of the measured write time
-				 * from the master's TSC value, as an estimate
-				 * of how late the mfence completion came
-				 * after the slave noticed the cache line
-				 * change.
-				 */
+			if (tdelta > (write_time/4)) {
 				tdelta = tsc->slave_tsc -
-				    (mtsc_after - (min_write_time/4));
-			else
+				    (mtsc_after - (write_time/4));
+			} else {
 				tdelta = tsc->slave_tsc - mtsc_after;
-			tsc_sync_tick_delta[slave] =
-			    tsc_sync_tick_delta[source] - tdelta;
+			}
+			last_delta = tsc_sync_tick_delta[source] - tdelta;
+			tsc_sync_tick_delta[slave] = last_delta;
+			min_write_time = write_time;
 		}
 
 		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
 		membar_enter();
 		tsc_sync_go = TSC_SYNC_STOP;
 	}
-	if (tdelta < 0)
-		tdelta = -tdelta;
-	if (tdelta > largest_tsc_delta)
-		largest_tsc_delta = tdelta;
-	if (min_write_time < shortest_write_time)
-		shortest_write_time = min_write_time;
+
 	/*
-	 * Enable delta variants of tsc functions if the largest of all chosen
-	 * deltas is > smallest of the write time.
+	 * Only enable the delta variants of the TSC functions if the
+	 * measured skew is greater than the fastest write time.
 	 */
-	if (largest_tsc_delta > shortest_write_time) {
+	last_delta = (last_delta < 0) ? -last_delta : last_delta;
+	if (last_delta > min_write_time) {
 		gethrtimef = tsc_gethrtime_delta;
 		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
+		tsc_ncpu = NCPU;
 	}
 	restore_int_flag(flags);
 }
 
 /*
+ * TSC Sync Slave
+ *
  * Called by a CPU which has just been onlined.  It is expected that the CPU
  * performing the online operation will call tsc_sync_master().
  *
- * TSC sync is disabled in the context of virtualization. See comments
- * above tsc_sync_master.
+ * Like tsc_sync_master, this logic is skipped on virtualized platforms.
 */
 void
 tsc_sync_slave(void)
@@ -579,11 +571,9 @@ tsc_sync_slave(void)
 		tsc_sync_go = TSC_SYNC_GO;
 		do {
 			/*
-			 * Do not put an SMT_PAUSE here. For instance,
-			 * if the master and slave are really the same
-			 * hyper-threaded CPU, then you want the master
-			 * to yield to the slave as quickly as possible here,
-			 * but not the other way.
+			 * Do not put an SMT_PAUSE here.  If the master and
+			 * slave are the same hyper-threaded CPU, we want the
+			 * master to yield as quickly as possible to the
+			 * slave.
 			 */
 			s1 = tsc_read();
 		} while (tsc->master_tsc == 0);
@@ -688,6 +678,12 @@ tsc_hrtimeinit(uint64_t cpu_freq_hz)
 	hrtime_tick = tsc_tick;
 	gethrtime_hires = 1;
 	/*
+	 * Being part of the comm page, tsc_ncpu communicates the published
+	 * length of the tsc_sync_tick_delta array.  This is kept zeroed to
+	 * ignore the absent delta data while the TSCs are synced.
+	 */
+	tsc_ncpu = 0;
+	/*
 	 * Allocate memory for the structure used in the tsc sync logic.
 	 * This structure should be aligned on a multiple of cache line size.
 	 */
@@ -708,12 +704,10 @@ get_tsc_ready()
 }
 
 /*
- * Adjust all the deltas by adding the passed value to the array.
- * Then use the "delt" versions of the the gethrtime functions.
- * Note that 'tdelta' _could_ be a negative number, which should
- * reduce the values in the array (used, for example, if the Solaris
- * instance was moved by a virtual manager to a machine with a higher
- * value of tsc).
+ * Adjust all the deltas by adding the passed value to the array and activate
+ * the "delta" versions of the gethrtime functions.  It is possible that the
+ * adjustment could be negative.  Such may occur if the SunOS instance was
+ * moved by a virtual manager to a machine with a higher value of TSC.
 */
 void
 tsc_adjust_delta(hrtime_t tdelta)
@@ -726,19 +720,16 @@ tsc_adjust_delta(hrtime_t tdelta)
 
 	gethrtimef = tsc_gethrtime_delta;
 	gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
+	tsc_ncpu = NCPU;
 }
 
 /*
  * Functions to manage TSC and high-res time on suspend and resume.
 */
 
-/*
- * declarations needed for time adjustment
- */
-extern void	rtcsync(void);
+/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
 extern tod_ops_t *tod_ops;
-/* There must be a better way than exposing nsec_scale! */
-extern uint_t	nsec_scale;
+
 static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
 static timestruc_t tsc_saved_ts;
 static int	tsc_needs_resume = 0;	/* We only want to do this once. */
@@ -748,23 +739,20 @@ int	tsc_suspend_count = 0;
 int	tsc_resume_in_cyclic = 0;
 
 /*
- * Let timestamp.c know that we are suspending. It needs to take
- * snapshots of the current time, and do any pre-suspend work.
+ * Take snapshots of the current time and do any other pre-suspend work.
 */
 void
 tsc_suspend(void)
 {
-/*
- * What we need to do here, is to get the time we suspended, so that we
- * know how much we should add to the resume.
- * This routine is called by each CPU, so we need to handle reentry.
- */
+	/*
+	 * We need to collect the time at which we suspended here so we know
+	 * how much should be added during the resume.  This is called by
+	 * each CPU, so reentry must be properly handled.
+	 */
 	if (tsc_gethrtime_enable) {
 		/*
-		 * We put the tsc_read() inside the lock as it
-		 * as no locking constraints, and it puts the
-		 * aquired value closer to the time stamp (in
-		 * case we delay getting the lock).
+		 * Perform the tsc_read after acquiring the lock to make it
+		 * as accurate as possible in the face of contention.
 		 */
 		mutex_enter(&tod_lock);
 		tsc_saved_tsc = tsc_read();
@@ -786,8 +774,7 @@ tsc_suspend(void)
 }
 
 /*
- * Restore all timestamp state based on the snapshots taken at
- * suspend time.
+ * Restore all timestamp state based on the snapshots taken at suspend time.
 */
 void
 tsc_resume(void)
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 9390690e95..c88fec6fbe 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -32,7 +32,7 @@
 /*								*/
 
 /*
- * Copyright 2012 Joyent, Inc.  All rights reserved.
+ * Copyright 2015 Joyent, Inc.
 */
 
 #include <sys/types.h>
@@ -101,6 +101,7 @@
 #include <sys/hypervisor.h>
 #endif
 #include <sys/contract/process_impl.h>
+#include <sys/brand.h>
 
 #define	USER	0x10000		/* user-mode flag added to trap type */
 
@@ -862,6 +863,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 			fault_type = F_INVAL;
 		}
 
+		/*
+		 * Allow the brand to interpose on invalid memory accesses
+		 * prior to running the native pagefault handler.  If this
+		 * brand hook returns zero, it was able to handle the fault
+		 * completely.  Otherwise, drive on and call pagefault().
+		 */
+		if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL &&
+		    BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) {
+			goto out;
+		}
+
 		res = pagefault(addr, fault_type, rw, 0);
 
 		/*
@@ -1468,12 +1480,23 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 
 	ct->t_sig_check = 0;
 
-	mutex_enter(&p->p_lock);
+	/*
+	 * As in other code paths that check against TP_CHANGEBIND, we
+	 * perform the check first without p_lock held -- only acquiring
+	 * p_lock in the unlikely event that it is indeed set.  This is safe
+	 * because we are doing this after the astoff(); if we are racing
+	 * another thread setting TP_CHANGEBIND on us, we will pick it up on
+	 * a subsequent lap through.
+	 */
 	if (curthread->t_proc_flag & TP_CHANGEBIND) {
-		timer_lwpbind();
-		curthread->t_proc_flag &= ~TP_CHANGEBIND;
+		mutex_enter(&p->p_lock);
+		if (curthread->t_proc_flag & TP_CHANGEBIND) {
+			timer_lwpbind();
+			curthread->t_proc_flag &= ~TP_CHANGEBIND;
+		}
+		mutex_exit(&p->p_lock);
 	}
-	mutex_exit(&p->p_lock);
 
 	/*
	 * for kaio requests that are on the per-process poll queue,
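The same check-then-lock-then-recheck shape, reduced to a standalone userland sketch (pthread stand-ins for p_lock; the flag value is illustrative):

#include <pthread.h>
#include <stdio.h>

#define TP_CHANGEBIND   0x1     /* illustrative value */

static volatile unsigned t_proc_flag;
static pthread_mutex_t p_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Double-checked pattern from the trap() change above: test the flag
 * without the lock first (cheap, and usually clear), and only when it
 * appears set take the lock and test again before acting.  A racing
 * setter is not lost; it is simply handled on a later pass.
 */
static void
check_changebind(void)
{
    if (t_proc_flag & TP_CHANGEBIND) {
        pthread_mutex_lock(&p_lock);
        if (t_proc_flag & TP_CHANGEBIND) {
            /* timer_lwpbind() would run here */
            t_proc_flag &= ~TP_CHANGEBIND;
        }
        pthread_mutex_unlock(&p_lock);
    }
}

int
main(void)
{
    t_proc_flag = TP_CHANGEBIND;
    check_changebind();
    printf("flag now %u\n", t_proc_flag);   /* 0 */
    return (0);
}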
diff --git a/usr/src/uts/i86pc/sys/acpidev.h b/usr/src/uts/i86pc/sys/acpidev.h
index 6d11277aaf..a3bd54d4e3 100644
--- a/usr/src/uts/i86pc/sys/acpidev.h
+++ b/usr/src/uts/i86pc/sys/acpidev.h
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2009-2010, Intel Corporation.
  * All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */
 
 #ifndef	_SYS_ACPIDEV_H
@@ -128,7 +129,7 @@ typedef enum acpidev_class_id {
 #ifdef	_KERNEL
 
 /* Common ACPI object names. */
-#define	ACPIDEV_OBJECT_NAME_SB	ACPI_NS_SYSTEM_BUS
+#define	ACPIDEV_OBJECT_NAME_SB	METHOD_NAME__SB_
 #define	ACPIDEV_OBJECT_NAME_PR	"_PR_"
 
 /* Common ACPI method names. */
diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h
index 8a87760456..11ae48340a 100644
--- a/usr/src/uts/i86pc/sys/apic.h
+++ b/usr/src/uts/i86pc/sys/apic.h
@@ -382,7 +382,7 @@ struct apic_io_intr {
 /* special or reserve vectors */
 #define	APIC_CHECK_RESERVE_VECTORS(v)	\
 	(((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) ||	\
-	((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET))
+	((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80))
 
 /* cmos shutdown code for BIOS */
 #define	BIOS_SHUTDOWN	0x0a
diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h
new file mode 100644
index 0000000000..dbf00bc7a7
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/comm_page.h
@@ -0,0 +1,102 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _COMM_PAGE_H
+#define	_COMM_PAGE_H
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#endif /* _ASM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	COMM_PAGE_SIZE	PAGESIZE
+#define	COMM_PAGE_ALIGN	0x4000
+
+#ifndef _ASM
+
+/*
+ * x86 comm page
+ *
+ * This struct defines the data format for the "comm page": kernel data made
+ * directly available to userspace for read-only operations.  This enables
+ * facilities such as clock_gettime to operate entirely in userspace without
+ * the need for a trap or fasttrap.
+ *
+ * A note about 32-bit/64-bit compatibility:
+ * The current format of the comm page is designed to be consistent for both
+ * 32-bit and 64-bit programs running in a 64-bit kernel.  On 32-bit kernels,
+ * the comm page is not exposed to userspace due to the difference in
+ * timespec_t sizing.
+ *
+ * This struct is instantiated "by hand" in assembly to preserve the global
+ * symbols it contains.  That layout must be kept in sync with the structure
+ * defined here.
+ * See: "uts/i86pc/ml/comm_page.s"
+ */
+typedef struct comm_page_s {
+	hrtime_t		cp_tsc_last;
+	hrtime_t		cp_tsc_hrtime_base;
+	hrtime_t		cp_tsc_resume_cap;
+	uint32_t		cp_tsc_type;
+	uint32_t		cp_tsc_max_delta;
+
+	volatile uint32_t	cp_hres_lock;	/* must be 8-byte aligned */
+	uint32_t		cp_nsec_scale;
+	int64_t			cp_hrestime_adj;
+	hrtime_t		cp_hres_last_tick;
+	uint32_t		cp_tsc_ncpu;
+	uint32_t		_cp_pad;
+	volatile int64_t	cp_hrestime[2];
+#if defined(_MACHDEP)
+	hrtime_t		cp_tsc_sync_tick_delta[NCPU];
+#else
+	/* length resides in cp_tsc_ncpu */
+	hrtime_t		cp_tsc_sync_tick_delta[];
+#endif /* defined(_MACHDEP) */
+} comm_page_t;
+
+#if defined(_KERNEL)
+extern comm_page_t comm_page;
+
+extern caddr_t comm_page_mapin();
+
+#if defined(_MACHDEP)
+extern hrtime_t tsc_last;
+extern hrtime_t tsc_hrtime_base;
+extern hrtime_t tsc_resume_cap;
+extern uint32_t tsc_type;
+extern uint32_t tsc_max_delta;
+extern volatile uint32_t hres_lock;
+extern uint32_t nsec_scale;
+extern int64_t hrestime_adj;
+extern hrtime_t hres_last_tick;
+extern uint32_t tsc_ncpu;
+extern volatile timestruc_t hrestime;
+extern hrtime_t tsc_sync_tick_delta[NCPU];
+#endif /* defined(_MACHDEP) */
+#endif /* defined(_KERNEL) */
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMM_PAGE_H */
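For a sense of how userspace would consume the page: cp_hres_lock behaves like a sequence lock, so a consistent snapshot is one taken while the lock value was even and unchanged. The sketch below is hedged; the exact lock encoding and memory-ordering details are simplified and not spelled out by this header:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Pared-down stand-in for comm_page_t; only the fields this sketch reads. */
typedef struct cp_lite {
    volatile uint32_t cp_hres_lock;
    volatile int64_t cp_hrestime[2];    /* seconds, nanoseconds */
} cp_lite_t;

/*
 * Sketch of a userspace hrestime reader (not part of this diff).  The
 * kernel bumps cp_hres_lock around updates, so retry whenever the value
 * was odd or changed while the fields were being copied.
 */
static void
read_hrestime(const cp_lite_t *cp, struct timespec *ts)
{
    uint32_t seq;

    do {
        seq = cp->cp_hres_lock & ~1u;   /* round down to "unlocked" */
        ts->tv_sec = (time_t)cp->cp_hrestime[0];
        ts->tv_nsec = (long)cp->cp_hrestime[1];
    } while (cp->cp_hres_lock != seq);
}

int
main(void)
{
    cp_lite_t fake = { 0, { 1234, 567 } };  /* stands in for the mapping */
    struct timespec ts;

    read_hrestime(&fake, &ts);
    printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    return (0);
}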
+ * See: "uts/i86pc/ml/comm_page.s" + */ +typedef struct comm_page_s { + hrtime_t cp_tsc_last; + hrtime_t cp_tsc_hrtime_base; + hrtime_t cp_tsc_resume_cap; + uint32_t cp_tsc_type; + uint32_t cp_tsc_max_delta; + + volatile uint32_t cp_hres_lock; /* must be 8-byte aligned */ + uint32_t cp_nsec_scale; + int64_t cp_hrestime_adj; + hrtime_t cp_hres_last_tick; + uint32_t cp_tsc_ncpu; + uint32_t _cp_pad; + volatile int64_t cp_hrestime[2]; +#if defined(_MACHDEP) + hrtime_t cp_tsc_sync_tick_delta[NCPU]; +#else + /* length resides in cp_ncpu */ + hrtime_t cp_tsc_sync_tick_delta[]; +#endif /* defined(_MACHDEP) */ +} comm_page_t; + +#if defined(_KERNEL) +extern comm_page_t comm_page; + +extern caddr_t comm_page_mapin(); + +#if defined(_MACHDEP) +extern hrtime_t tsc_last; +extern hrtime_t tsc_hrtime_base; +extern hrtime_t tsc_resume_cap; +extern uint32_t tsc_type; +extern uint32_t tsc_max_delta; +extern volatile uint32_t hres_lock; +extern uint32_t nsec_scale; +extern int64_t hrestime_adj; +extern hrtime_t hres_last_tick; +extern uint32_t tsc_ncpu; +extern volatile timestruc_t hrestime; +extern hrtime_t tsc_sync_tick_delta[NCPU]; +#endif /* defined(_MACHDEP) */ +#endif /* defined(_KERNEL) */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _COMM_PAGE_H */ diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 99ae0d4d3b..fc34522307 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -54,6 +55,10 @@ extern "C" { */ #if defined(__amd64) +/* + * If NCPU grows beyond 256, sizing for the x86 comm page will require + * adjustment. + */ #define NCPU 256 #define NCPU_LOG2 8 #elif defined(__i386) diff --git a/usr/src/uts/i86pc/sys/tsc.h b/usr/src/uts/i86pc/sys/tsc.h new file mode 100644 index 0000000000..d4090381c4 --- /dev/null +++ b/usr/src/uts/i86pc/sys/tsc.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _TSC_H +#define _TSC_H + +/* + * flags to patch tsc_read routine. + */ +#define TSC_NONE 0x0 +#define TSC_RDTSC_CPUID 0x1 +#define TSC_RDTSC_MFENCE 0x2 +#define TSC_RDTSC_LFENCE 0x3 +#define TSC_TSCP 0x4 + +#endif /* _TSC_H */ diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index ea2a83b2bd..a8b4e6edfc 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -27,6 +27,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014, 2015 by Delphix. All rights reserved. 
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index ea2a83b2bd..a8b4e6edfc 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -27,6 +27,7 @@
 */
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */
 
@@ -3323,7 +3324,7 @@ hat_page_getattr(struct page *pp, uint_t flag)
 
 
 /*
- * common code used by hat_pageunload() and hment_steal()
+ * common code used by hat_page_inval() and hment_steal()
 */
 hment_t *
 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
@@ -3379,15 +3380,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
 extern int	vpm_enable;
 /*
- * Unload all translations to a page. If the page is a subpage of a large
+ * Unload translations to a page.  If the page is a subpage of a large
  * page, the large page mappings are also removed.
- *
- * The forceflags are unused.
+ * If curhat is not NULL, then we only unload the translation
+ * for the given process, otherwise all translations are unloaded.
 */
-
-/*ARGSUSED*/
-static int
-hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
+void
+hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat)
 {
 	page_t		*cur_pp = pp;
 	hment_t		*hm;
 	hment_t		*prev;
 	htable_t	*ht;
 	uint_t		entry;
 	level_t		level;
+	ulong_t		cnt;
 
 	XPV_DISALLOW_MIGRATE();
 
-	/*
-	 * prevent recursion due to kmem_free()
-	 */
-	++curthread->t_hatdepth;
-	ASSERT(curthread->t_hatdepth < 16);
-
 #if defined(__amd64)
 	/*
 	 * clear the vpm ref.
@@ -3416,6 +3410,8 @@ hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat)
 	 * The loop with next_size handles pages with multiple pagesize mappings
 	 */
 next_size:
+	if (curhat != NULL)
+		cnt = hat_page_getshare(cur_pp);
 	for (;;) {
 
 		/*
@@ -3427,14 +3423,13 @@ next_size:
 		if (hm == NULL) {
 			x86_hm_exit(cur_pp);
 
+curproc_done:
 			/*
 			 * If not part of a larger page, we're done.
 			 */
 			if (cur_pp->p_szc <= pg_szcd) {
-				ASSERT(curthread->t_hatdepth > 0);
-				--curthread->t_hatdepth;
 				XPV_ALLOW_MIGRATE();
-				return (0);
+				return;
 			}
 
 			/*
@@ -3453,8 +3448,20 @@ next_size:
 		 * If this mapping size matches, remove it.
 		 */
 		level = ht->ht_level;
-		if (level == pg_szcd)
-			break;
+		if (level == pg_szcd) {
+			if (curhat == NULL || ht->ht_hat == curhat)
+				break;
+			/*
+			 * Unloading only the given process but it's not the
+			 * hat for the current process.  Leave the entry in
+			 * place.  Also do a safety check to ensure we don't
+			 * get in an infinite loop.
+			 */
+			if (cnt-- == 0) {
+				x86_hm_exit(cur_pp);
+				goto curproc_done;
+			}
+		}
 	}
 
 	/*
@@ -3464,14 +3471,44 @@ next_size:
 		hm = hati_page_unmap(cur_pp, ht, entry);
 		if (hm != NULL)
 			hment_free(hm);
+
+		/* Perform check above for being part of a larger page. */
+		if (curhat != NULL)
+			goto curproc_done;
 	}
 }
 
+/*
+ * Unload translations to a page.  If unloadflag is HAT_CURPROC_PGUNLOAD,
+ * then we only unload the translation for the current process, otherwise
+ * all translations are unloaded.
+ */
+static int
+hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
+{
+	struct hat *curhat = NULL;
+
+	/*
+	 * prevent recursion due to kmem_free()
+	 */
+	++curthread->t_hatdepth;
+	ASSERT(curthread->t_hatdepth < 16);
+
+	if (unloadflag == HAT_CURPROC_PGUNLOAD)
+		curhat = curthread->t_procp->p_as->a_hat;
+
+	hat_page_inval(pp, pg_szcd, curhat);
+
+	ASSERT(curthread->t_hatdepth > 0);
+	--curthread->t_hatdepth;
+	return (0);
+}
+
 int
-hat_pageunload(struct page *pp, uint_t forceflag)
+hat_pageunload(struct page *pp, uint_t unloadflag)
 {
 	ASSERT(PAGE_EXCL(pp));
-	return (hati_pageunload(pp, 0, forceflag));
+	return (hati_pageunload(pp, 0, unloadflag));
 }
 
 /*
diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c
index 2212202a01..1c2bd3e0ec 100644
--- a/usr/src/uts/i86pc/vm/vm_machdep.c
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c
@@ -24,6 +24,7 @@
 /*
  * Copyright (c) 2010, Intel Corporation.
  * All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
@@ -623,10 +624,8 @@ void
 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
 {
 	struct proc *p = curproc;
-	caddr_t userlimit = (flags & _MAP_LOW32) ?
-	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;
-
-	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
+	map_addr_proc(addrp, len, off, vacalign,
+	    map_userlimit(p, p->p_as, flags), curproc, flags);
 }
 
 /*ARGSUSED*/