Diffstat (limited to 'usr/src/uts/i86pc')
-rw-r--r--  usr/src/uts/i86pc/Makefile.files             |    3
-rw-r--r--  usr/src/uts/i86pc/dboot/dboot_startkern.c    |   41
-rw-r--r--  usr/src/uts/i86pc/io/ppm/acpisleep.c         |   24
-rw-r--r--  usr/src/uts/i86pc/io/psm/psm_common.c        |    3
-rw-r--r--  usr/src/uts/i86pc/ml/comm_page.s             |   88
-rw-r--r--  usr/src/uts/i86pc/ml/offsets.in              |    6
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm.s           |   30
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm_amd64.s     |  188
-rw-r--r--  usr/src/uts/i86pc/os/comm_page_util.c        |   62
-rw-r--r--  usr/src/uts/i86pc/os/cpr_impl.c              |   14
-rw-r--r--  usr/src/uts/i86pc/os/cpuid.c                 |   46
-rw-r--r--  usr/src/uts/i86pc/os/ibft.c                  |    6
-rw-r--r--  usr/src/uts/i86pc/os/lgrpplat.c              |   14
-rw-r--r--  usr/src/uts/i86pc/os/mlsetup.c               |   10
-rw-r--r--  usr/src/uts/i86pc/os/mp_startup.c            |   30
-rw-r--r--  usr/src/uts/i86pc/os/timestamp.c             |  161
-rw-r--r--  usr/src/uts/i86pc/os/trap.c                  |   33
-rw-r--r--  usr/src/uts/i86pc/sys/acpidev.h              |    3
-rw-r--r--  usr/src/uts/i86pc/sys/apic.h                 |    2
-rw-r--r--  usr/src/uts/i86pc/sys/comm_page.h            |  102
-rw-r--r--  usr/src/uts/i86pc/sys/machparam.h            |    5
-rw-r--r--  usr/src/uts/i86pc/sys/tsc.h                  |   28
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.c               |   79
-rw-r--r--  usr/src/uts/i86pc/vm/vm_machdep.c            |    7
24 files changed, 833 insertions(+), 152 deletions(-)
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 9829939b16..ef7a36d09c 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -23,6 +23,7 @@
# Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
#
# Copyright (c) 2010, Intel Corporation.
+# Copyright 2016 Joyent, Inc.
#
# This Makefile defines file modules in the directory uts/i86pc
# and its children. These are the source files which are i86pc
@@ -40,6 +41,8 @@ CORE_OBJS += \
cmi.o \
cmi_hw.o \
cms.o \
+ comm_page.o \
+ comm_page_util.o \
confunix.o \
cpu_idle.o \
cpuid.o \
diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c
index 7fc43e27bd..6abb7c6349 100644
--- a/usr/src/uts/i86pc/dboot/dboot_startkern.c
+++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c
@@ -63,6 +63,15 @@ extern int have_cpuid(void);
#define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
/*
+ * Region of memory that may be corrupted by external actors. This can go away
+ * once the firmware bug RICHMOND-16 is fixed and all systems with the bug are
+ * upgraded.
+ */
+#define CORRUPT_REGION_START 0xc700000
+#define CORRUPT_REGION_SIZE 0x100000
+#define CORRUPT_REGION_END (CORRUPT_REGION_START + CORRUPT_REGION_SIZE)
+
+/*
* This file contains code that runs to transition us from either a multiboot
* compliant loader (32 bit non-paging) or a XPV domain loader to
* regular kernel execution. Its task is to setup the kernel memory image
@@ -1155,6 +1164,38 @@ init_mem_alloc(void)
case 1:
if (end > max_mem)
max_mem = end;
+
+ /*
+ * Well, this is sad. On some systems, there
+ * is a region of memory that can be corrupted
+ * until some number of seconds after we have
+ * booted. And the BIOS doesn't tell us that
+ * this memory is unsafe to use. And we don't
+ * know how long it's dangerous. So we'll
+ * chop out this range from any memory list
+ * that would otherwise be usable. Note that
+ * any system of this type will give us the
+ * new-style (0x40) memlist, so we need not
+ * fix up the other path below.
+ */
+ if (start < CORRUPT_REGION_START &&
+ end > CORRUPT_REGION_START) {
+ memlists[memlists_used].addr = start;
+ memlists[memlists_used].size =
+ CORRUPT_REGION_START - start;
+ ++memlists_used;
+ if (end > CORRUPT_REGION_END)
+ start = CORRUPT_REGION_END;
+ else
+ continue;
+ }
+ if (start >= CORRUPT_REGION_START &&
+ start < CORRUPT_REGION_END) {
+ if (end <= CORRUPT_REGION_END)
+ continue;
+ start = CORRUPT_REGION_END;
+ }
+
memlists[memlists_used].addr = start;
memlists[memlists_used].size = end - start;
++memlists_used;
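
For illustration, the clipping above is equivalent to this standalone C
sketch, where memlist_add() is a hypothetical stand-in for the memlists[]
bookkeeping:

	/* Emit the pieces of [start, end) lying outside the corrupt window. */
	static void
	add_usable_range(uint64_t start, uint64_t end)
	{
		if (start < CORRUPT_REGION_START) {
			uint64_t e = (end < CORRUPT_REGION_START) ?
			    end : CORRUPT_REGION_START;
			memlist_add(start, e - start);
		}
		if (end > CORRUPT_REGION_END) {
			uint64_t s = (start > CORRUPT_REGION_END) ?
			    start : CORRUPT_REGION_END;
			memlist_add(s, end - s);
		}
	}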
diff --git a/usr/src/uts/i86pc/io/ppm/acpisleep.c b/usr/src/uts/i86pc/io/ppm/acpisleep.c
index 78328170e6..6b94e50909 100644
--- a/usr/src/uts/i86pc/io/ppm/acpisleep.c
+++ b/usr/src/uts/i86pc/io/ppm/acpisleep.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -56,6 +57,19 @@
int acpi_rtc_wake = 0x0; /* wake in N seconds */
+/*
+ * Execute optional ACPI methods for suspend/resume.
+ * The value can be ACPI_EXECUTE_GTS and/or ACPI_EXECUTE_BFS.
+ * Global so it can be set in /etc/system.
+ * From usr/src/uts/intel/io/acpica/changes.txt:
+ * It has been seen on some systems where the execution of these
+ * methods causes errors and also prevents the machine from entering S5.
+ * It is therefore suggested that host operating systems do not execute
+ * these methods by default. In the future, perhaps these methods can be
+ * optionally executed based on the age of the system...
+ */
+int acpi_sleep_flags = ACPI_NO_OPTIONAL_METHODS;
+
#if 0 /* debug */
static uint8_t branchbuf[64 * 1024]; /* for the HDT branch trace stuff */
#endif /* debug */
@@ -142,8 +156,9 @@ acpi_enter_sleepstate(s3a_t *s3ap)
* Tell the hardware to sleep.
*/
PT(PT_SXE);
- PMD(PMD_SX, ("Calling AcpiEnterSleepState(%d) ...\n", Sx))
- if (AcpiEnterSleepState(Sx) != AE_OK) {
+ PMD(PMD_SX, ("Calling AcpiEnterSleepState(%d, %d) ...\n", Sx,
+ acpi_sleep_flags))
+ if (AcpiEnterSleepState(Sx, acpi_sleep_flags) != AE_OK) {
PT(PT_SXE_FAIL);
PMD(PMD_SX, ("... failed!\n"))
}
@@ -163,6 +178,11 @@ acpi_exit_sleepstate(s3a_t *s3ap)
PMD(PMD_SX, ("!We woke up!\n"))
PT(PT_LSS);
+ if (AcpiLeaveSleepStatePrep(Sx, acpi_sleep_flags) != AE_OK) {
+ PT(PT_LSS_FAIL);
+ PMD(PMD_SX, ("Problem with LeaveSleepState!\n"))
+ }
+
if (AcpiLeaveSleepState(Sx) != AE_OK) {
PT(PT_LSS_FAIL);
PMD(PMD_SX, ("Problem with LeaveSleepState!\n"))
diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c
index 7a3dd8a733..9c17d85228 100644
--- a/usr/src/uts/i86pc/io/psm/psm_common.c
+++ b/usr/src/uts/i86pc/io/psm/psm_common.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -979,7 +980,7 @@ acpi_poweroff(void)
return (1);
}
ACPI_DISABLE_IRQS();
- status = AcpiEnterSleepState(5);
+ status = AcpiEnterSleepState(5, ACPI_NO_OPTIONAL_METHODS);
ACPI_ENABLE_IRQS();
/* we should be off; if we get here it's an error */
diff --git a/usr/src/uts/i86pc/ml/comm_page.s b/usr/src/uts/i86pc/ml/comm_page.s
new file mode 100644
index 0000000000..7ff803ea93
--- /dev/null
+++ b/usr/src/uts/i86pc/ml/comm_page.s
@@ -0,0 +1,88 @@
+
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/asm_linkage.h>
+#include <sys/asm_misc.h>
+#include <sys/param.h>
+#include <sys/comm_page.h>
+#include <sys/tsc.h>
+
+#if defined(__lint)
+
+hrtime_t tsc_last;
+hrtime_t tsc_resume_cap;
+hrtime_t tsc_hrtime_base;
+uint32_t tsc_max_delta;
+volatile uint32_t hres_lock;
+uint32_t tsc_type;
+uint32_t nsec_scale;
+int64_t hrestime_adj;
+hrtime_t hres_last_tick;
+uint32_t tsc_ncpu;
+volatile timestruc_t hrestime;
+hrtime_t tsc_sync_tick_delta[NCPU];
+
+comm_page_t comm_page;
+
+#else /* defined(__lint) */
+
+#include "assym.h"
+
+/*
+ * x86 Comm Page
+ *
+ * This is the definition for the comm page on x86. The purpose of this struct
+ * is to consolidate certain pieces of kernel state into one contiguous section
+ * of memory in order for it to be exposed (read-only) to userspace. The
+ * struct contents are defined by hand so that member variables will maintain
+ * their original symbols for use throughout the rest of the kernel. This
+ * layout must exactly match the C definition of comm_page_t.
+ * See: "uts/i86pc/sys/comm_page.h"
+ */
+
+ .data
+ DGDEF3(comm_page, COMM_PAGE_S_SIZE, 4096)
+ DGDEF2(tsc_last, 8)
+ .fill 1, 8, 0
+ DGDEF2(tsc_hrtime_base, 8)
+ .fill 1, 8, 0
+ DGDEF2(tsc_resume_cap, 8)
+ .fill 1, 8, 0
+ DGDEF2(tsc_type, 4);
+ .fill 1, 4, _CONST(TSC_RDTSC_CPUID)
+ DGDEF2(tsc_max_delta, 4);
+ .fill 1, 4, 0
+ DGDEF2(hres_lock, 4);
+ .fill 1, 4, 0
+ DGDEF2(nsec_scale, 4);
+ .fill 1, 4, 0
+ DGDEF2(hrestime_adj, 8)
+ .fill 1, 8, 0
+ DGDEF2(hres_last_tick, 8)
+ .fill 1, 8, 0
+ DGDEF2(tsc_ncpu, 4)
+ .fill 1, 4, 0
+ /* _cp_pad */
+ .fill 1, 4, 0
+ DGDEF2(hrestime, _MUL(2, 8))
+ .fill 2, 8, 0
+ DGDEF2(tsc_sync_tick_delta, _MUL(NCPU, 8))
+ .fill _CONST(NCPU), 8, 0
+
+ /* pad out the rest of the page from the struct end */
+ .fill _CONST(COMM_PAGE_SIZE - COMM_PAGE_S_SIZE), 1, 0
+
+#endif /* defined(__lint) */
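
Because the hand-built assembly layout must match comm_page_t exactly, a
compile-time guard along these lines (hypothetical, not part of this change)
would catch drift between the two definitions:

	#include <sys/debug.h>		/* CTASSERT */
	#include <sys/comm_page.h>

	/* Hypothetical layout guards for the .s/.h pairing. */
	CTASSERT(sizeof (comm_page_t) <= COMM_PAGE_SIZE);
	CTASSERT((offsetof(comm_page_t, cp_hres_lock) & 7) == 0);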
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index 721d32fa3a..a1f83d3cf8 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -1,6 +1,7 @@
\
\ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
\ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+\ Copyright 2016 Joyent, Inc.
\
\ CDDL HEADER START
\
@@ -61,6 +62,7 @@
#include <sys/brand.h>
#include <sys/fastboot.h>
#include <sys/cpr_wakecode.h>
+#include <sys/comm_page.h>
proc PROCSIZE
p_link
@@ -150,6 +152,8 @@ _klwp
lwp_thread
lwp_procp
lwp_brand
+ lwp_brand_syscall
+ lwp_brand_syscall_fast
lwp_eosys
lwp_regs
lwp_arg
@@ -467,3 +471,5 @@ wc_cpu WC_CPU_SIZE
wc_wakecode
wc_cpu
+
+comm_page_s COMM_PAGE_S_SIZE
diff --git a/usr/src/uts/i86pc/ml/syscall_asm.s b/usr/src/uts/i86pc/ml/syscall_asm.s
index 61ef4ac6c3..68181be28a 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm.s
@@ -631,6 +631,36 @@ _sysenter_done:
sysexit
SET_SIZE(sys_sysenter)
SET_SIZE(brand_sys_sysenter)
+#endif /* __lint */
+
+#if defined(__lint)
+/*
+ * System call via an int80. This entry point is only used by the Linux
+ * application environment. Unlike the sysenter path, there is no default
+ * action to take if no callback is registered for this process.
+ */
+void
+sys_int80()
+{}
+
+#else /* __lint */
+
+ ENTRY_NP(brand_sys_int80)
+ BRAND_CALLBACK(BRAND_CB_INT80)
+
+ ALTENTRY(sys_int80)
+ /*
+ * We hit an int80, but this process isn't of a brand with an int80
+ * handler. Bad process! Make it look as if the INT failed.
+ * Modify %eip to point before the INT, push the expected error
+ * code and fake a GP fault. The error code is what the CPU would
+ * push for a #GP on this gate: the IDT index scaled by the
+ * descriptor size, with bit 1 set to mark the index as IDT-relative.
+ */
+ subl $2, (%esp) /* int insn 2-bytes */
+ pushl $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+ jmp gptrap / GP fault
+ SET_SIZE(sys_int80)
+ SET_SIZE(brand_sys_int80)
/*
* Declare a uintptr_t which covers the entire pc range of syscall
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index f26468c8cc..bc901e3e42 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -503,8 +503,25 @@ noprod_sys_syscall:
movq T_LWP(%r15), %r14
ASSERT_NO_RUPDATE_PENDING(%r14)
+
ENABLE_INTR_FLAGS
+ /*
+ * If our LWP has a branded syscall_fast handler, execute it. A return
+ * code of zero indicates that the handler completely processed the syscall
+ * and we can return directly to userspace.
+ */
+ movq LWP_BRAND_SYSCALL_FAST(%r14), %rdi
+ testq %rdi, %rdi
+ jz _syscall_no_brand_fast
+ call *%rdi
+ testl %eax, %eax
+ jnz _syscall_no_brand_fast
+ incq LWP_RU_SYSC(%r14)
+ incq %gs:CPU_STATS_SYS_SYSCALL
+ jmp _sys_rtt
+
+_syscall_no_brand_fast:
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
@@ -516,6 +533,28 @@ noprod_sys_syscall:
incq %gs:CPU_STATS_SYS_SYSCALL
+ /*
+ * If our LWP has an alternate system call handler, run that instead of
+ * the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rdi
+ testq %rdi, %rdi
+ jz _syscall_no_brand
+
+ pushq %rax
+ subq $8, %rsp /* align stack for call to C */
+ call *%rdi
+ addq $8, %rsp
+
+ /*
+ * If the alternate handler returns 0, we skip straight to the return to
+ * usermode. Otherwise, we resume regular system call processing.
+ */
+ testl %eax, %eax
+ popq %rax
+ jz _syscall_after_brand
+
+_syscall_no_brand:
movw %ax, T_SYSNUM(%r15)
movzbl T_PRE_SYS(%r15), %ebx
ORL_SYSCALLTRACE(%ebx)
@@ -550,6 +589,8 @@ _syscall_invoke:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %r12d, %r12d /* lower 32-bits into %eax */
5:
+
+_syscall_after_brand:
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -787,6 +828,22 @@ _syscall32_save:
ENABLE_INTR_FLAGS
+ /*
+ * If our LWP has a branded syscall_fast handler, execute it. A return
+ * code of zero indicates that the handler completely processed the syscall
+ * and we can return directly to userspace.
+ */
+ movq LWP_BRAND_SYSCALL_FAST(%r14), %rdi
+ testq %rdi, %rdi
+ jz _syscall32_no_brand_fast
+ call *%rdi
+ testl %eax, %eax
+ jnz _syscall32_no_brand_fast
+ incq LWP_RU_SYSC(%r14)
+ incq %gs:CPU_STATS_SYS_SYSCALL
+ jmp _sys_rtt
+
+_syscall32_no_brand_fast:
MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
@@ -795,11 +852,37 @@ _syscall32_save:
incq %gs:CPU_STATS_SYS_SYSCALL
/*
+ * If our lwp has an alternate system call handler, run that instead
+ * of the regular system call path.
+ */
+ movq LWP_BRAND_SYSCALL(%r14), %rax
+ testq %rax, %rax
+ jz _syscall32_no_brand
+
+ movb $LWP_SYS, LWP_STATE(%r14)
+ call *%rax
+
+ /*
+ * If the alternate handler returns 0, we skip straight to the return
+ * to usermode. Otherwise, we resume regular system call processing.
+ */
+ testl %eax, %eax
+ jz _syscall32_after_brand
+
+_syscall32_no_brand:
+ /*
* Make some space for MAXSYSARGS (currently 8) 32-bit args placed
* into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
* more succinctly:
*
* SA(MAXSYSARGS * sizeof (long)) == 64
+ *
+ * Note, this space is used both to copy in the arguments from
+ * userland and as part of the old UNIX-style syscall_ap() method.
+ * syscall_entry expects the values we place here to remain
+ * unchanged. This means that when we pass the arguments via the
+ * newer calling-convention model, we need to save an additional
+ * 16 bytes of stack for the seventh and eighth arguments.
*/
#define SYS_DROP 64 /* drop for args */
subq $SYS_DROP, %rsp
@@ -827,12 +910,16 @@ _syscall32_save:
*/
movq %rax, %rbx
- movl 0(%rsp), %edi
- movl 8(%rsp), %esi
- movl 0x10(%rsp), %edx
- movl 0x18(%rsp), %ecx
- movl 0x20(%rsp), %r8d
- movl 0x28(%rsp), %r9d
+ movl 0x0(%rsp), %edi /* arg0 */
+ movl 0x8(%rsp), %esi /* arg1 */
+ movl 0x10(%rsp), %edx /* arg2 */
+ movl 0x38(%rsp), %eax /* arg7 load */
+ movl 0x18(%rsp), %ecx /* arg3 */
+ pushq %rax /* arg7 saved to stack */
+ movl 0x28(%rsp), %r8d /* arg4 */
+ movl 0x38(%rsp), %eax /* arg6 load */
+ movl 0x30(%rsp), %r9d /* arg5 */
+ pushq %rax /* arg6 saved to stack */
call *SY_CALLC(%rbx)
@@ -850,6 +937,8 @@ _syscall32_save:
shrq $32, %r13 /* upper 32-bits into %edx */
movl %eax, %r12d /* lower 32-bits into %eax */
+_syscall32_after_brand:
+
/*
* Optimistically assume that there's no post-syscall
* work to do. (This is to avoid having to call syscall_mstate()
@@ -1079,15 +1168,20 @@ sys_sysenter()
/*
* Fetch the arguments copied onto the kernel stack and put
* them in the right registers to invoke a C-style syscall handler.
- * %rax contains the handler address.
+ * %rax contains the handler address. For the last two arguments, we
+ * push them onto the stack -- we can't clobber the old arguments.
*/
movq %rax, %rbx
- movl 0(%rsp), %edi
- movl 8(%rsp), %esi
- movl 0x10(%rsp), %edx
- movl 0x18(%rsp), %ecx
- movl 0x20(%rsp), %r8d
- movl 0x28(%rsp), %r9d
+ movl 0x0(%rsp), %edi /* arg0 */
+ movl 0x8(%rsp), %esi /* arg1 */
+ movl 0x10(%rsp), %edx /* arg2 */
+ movl 0x38(%rsp), %eax /* arg7 load */
+ movl 0x18(%rsp), %ecx /* arg3 */
+ pushq %rax /* arg7 saved to stack */
+ movl 0x28(%rsp), %r8d /* arg4 */
+ movl 0x38(%rsp), %eax /* arg6 load */
+ movl 0x30(%rsp), %r9d /* arg5 */
+ pushq %rax /* arg6 saved to stack */
call *SY_CALLC(%rbx)
@@ -1159,6 +1253,74 @@ sys_sysenter()
SET_SIZE(brand_sys_sysenter)
#endif /* __lint */
+
+#if defined(__lint)
+/*
+ * System call via an int80. This entry point is only used by the Linux
+ * application environment. Unlike the other entry points, there is no
+ * default action to take if no callback is registered for this process.
+ */
+void
+sys_int80()
+{}
+
+#else /* __lint */
+
+ ENTRY_NP(brand_sys_int80)
+ SWAPGS /* kernel gsbase */
+ XPV_TRAP_POP
+ call smap_enable
+
+ /*
+ * We first attempt to call the "b_int80" handler from the "struct
+ * brand_mach_ops" for this brand. If no handler function is installed
+ * for this brand, the BRAND_CALLBACK() macro returns here and we
+ * check the lwp for a "lwp_brand_syscall" handler.
+ */
+ BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
+
+ /*
+ * Check to see if this lwp provides "lwp_brand_syscall". If so, we
+ * will route this int80 through the regular system call handling path.
+ */
+ movq %r15, %gs:CPU_RTMP_R15
+ movq %gs:CPU_THREAD, %r15
+ movq T_LWP(%r15), %r15
+ movq LWP_BRAND_SYSCALL(%r15), %r15
+ testq %r15, %r15
+ movq %gs:CPU_RTMP_R15, %r15
+ jnz nopop_syscall_int
+
+ /*
+ * The brand provided neither a "b_int80", nor a "lwp_brand_syscall"
+ * function, and has thus opted out of handling this trap.
+ */
+ SWAPGS /* user gsbase */
+ jmp nopop_int80
+
+ ENTRY_NP(sys_int80)
+ /*
+ * We hit an int80, but this process isn't of a brand with an int80
+ * handler. Bad process! Make it look as if the INT failed.
+ * Modify %rip to point before the INT, push the expected error
+ * code and fake a GP fault. Note that on the 64-bit hypervisor we
+ * need to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
+ * because gptrap will pop them again with its own XPV_TRAP_POP.
+ */
+ XPV_TRAP_POP
+ call smap_enable
+nopop_int80:
+ subq $2, (%rsp) /* int insn 2-bytes */
+ pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+#if defined(__xpv)
+ push %r11
+ push %rcx
+#endif
+ jmp gptrap / GP fault
+ SET_SIZE(sys_int80)
+ SET_SIZE(brand_sys_int80)
+#endif /* __lint */
+
/*
* This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
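
Assuming the contract encoded above -- a zero return from the per-lwp hook
means the syscall was fully handled, nonzero resumes the regular path -- a
brand module might install its handlers as in this sketch (names
illustrative, not the actual brand code):

	/* Zero = handled, nonzero = fall through to normal processing. */
	static int
	mybrand_syscall(void)
	{
		/* interpose on the system call here ... */
		return (1);
	}

	static void
	mybrand_initlwp(klwp_t *lwp)
	{
		lwp->lwp_brand_syscall = mybrand_syscall;
		lwp->lwp_brand_syscall_fast = NULL;	/* no fast path */
	}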
diff --git a/usr/src/uts/i86pc/os/comm_page_util.c b/usr/src/uts/i86pc/os/comm_page_util.c
new file mode 100644
index 0000000000..1c8c9f8afd
--- /dev/null
+++ b/usr/src/uts/i86pc/os/comm_page_util.c
@@ -0,0 +1,62 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <vm/as.h>
+#include <vm/seg_umap.h>
+
+#if defined(__x86) && !defined(__xpv)
+#include <sys/comm_page.h>
+#endif /* defined(__x86) && !defined(__xpv) */
+
+/*
+ * Map in the comm page.
+ *
+ * The contents of the comm page are only defined on non-xpv x86 at this time.
+ * Furthermore, the data is only valid in userspace (32-bit or 64-bit) when
+ * mapped from a 64-bit kernel.
+ * See: "uts/i86pc/sys/comm_page.h"
+ */
+caddr_t
+comm_page_mapin()
+{
+#if defined(__amd64) && !defined(__xpv)
+ proc_t *p = curproc;
+ caddr_t addr = (caddr_t)COMM_PAGE_ALIGN;
+ size_t len = COMM_PAGE_SIZE;
+ uint_t prot = PROT_USER | PROT_READ;
+ segumap_crargs_t suarg;
+
+ map_addr(&addr, len, (offset_t)0, 1, MAP_ALIGN);
+ if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as,
+ p->p_as->a_userlimit) != RANGE_OKAY) {
+ return (NULL);
+ }
+
+ suarg.kaddr = (caddr_t)&comm_page;
+ suarg.prot = suarg.maxprot = prot;
+ if (as_map(p->p_as, addr, len, segumap_create, &suarg) != 0) {
+ return (NULL);
+ }
+ return (addr);
+#else /* defined(__amd64) && !defined(__xpv) */
+ return (NULL);
+#endif /* defined(__amd64) && !defined(__xpv) */
+}
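
A hypothetical caller (e.g. exec or brand code arranging for a process to
see the page):

	caddr_t cp = comm_page_mapin();

	if (cp == NULL) {
		/* no comm page here (32-bit kernel or xpv); fall back */
	} else {
		/* publish cp to userspace, e.g. via an aux vector entry */
	}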
diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c
index 91fb583a01..f173a1dc57 100644
--- a/usr/src/uts/i86pc/os/cpr_impl.c
+++ b/usr/src/uts/i86pc/os/cpr_impl.c
@@ -753,6 +753,20 @@ i_cpr_is_supported(int sleeptype)
if (sleeptype != CPR_TORAM)
return (0);
+ /*
+ * Unfortunately, the x86 resume code was never implemented for GAS.
+ * The only obvious problem is that a trick necessary to appease Sun
+ * Studio does the wrong thing for GAS. Doubly unfortunate is that
+ * the condition used to detect GAS is incorrect, so we do in fact
+ * compile the Studio path; it just immediately fails in resume.
+ *
+ * Given that, if we were built using GCC, never allow CPR to be
+ * attempted.
+ */
+#ifdef __GNUC__
+ return (0);
+#endif
+
/*
* The next statement tests if a specific platform has turned off
* cpr support.
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index 44e475f328..027ed29c3d 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -32,7 +32,7 @@
* Portions Copyright 2009 Advanced Micro Devices, Inc.
*/
/*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
* Various routines to handle identification
@@ -57,6 +57,8 @@
#include <sys/auxv_386.h>
#include <sys/memnode.h>
#include <sys/pci_cfgspace.h>
+#include <sys/comm_page.h>
+#include <sys/tsc.h>
#ifdef __xpv
#include <sys/hypervisor.h>
@@ -171,7 +173,9 @@ static char *x86_feature_names[NUM_X86_FEATURES] = {
"bmi2",
"fma",
"smep",
- "smap"
+ "smap",
+ "adx",
+ "rdseed"
};
boolean_t
@@ -1264,6 +1268,11 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
disable_smap == 0)
add_x86_feature(featureset, X86FSET_SMAP);
#endif
+ if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
+ add_x86_feature(featureset, X86FSET_RDSEED);
+
+ if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
+ add_x86_feature(featureset, X86FSET_ADX);
}
/*
@@ -2739,6 +2748,10 @@ cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
+ if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
+ *ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
+ if (!is_x86_feature(x86_featureset, X86FSET_ADX))
+ *ebx &= ~CPUID_INTC_EBX_7_0_ADX;
/*
* [no explicit support required beyond x87 fp context]
@@ -2808,8 +2821,20 @@ cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
if (*ecx & CPUID_INTC_ECX_RDRAND)
hwcap_flags_2 |= AV_386_2_RDRAND;
+ if (*ebx & CPUID_INTC_EBX_7_0_ADX)
+ hwcap_flags_2 |= AV_386_2_ADX;
+ if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
+ hwcap_flags_2 |= AV_386_2_RDSEED;
+
+ }
+
+ /* Detect systems with a potential CPUID limit */
+ if (cpi->cpi_vendor == X86_VENDOR_Intel && cpi->cpi_maxeax < 4) {
+ cmn_err(CE_NOTE, "CPUID limit detected, "
+ "see the CPUID(7D) man page for details\n");
}
+
if (cpi->cpi_xmaxeax < 0x80000001)
goto pass4_done;
@@ -4591,27 +4616,30 @@ patch_tsc_read(int flag)
size_t cnt;
switch (flag) {
- case X86_NO_TSC:
+ case TSC_NONE:
cnt = &_no_rdtsc_end - &_no_rdtsc_start;
(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
break;
- case X86_HAVE_TSCP:
- cnt = &_tscp_end - &_tscp_start;
- (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
- break;
- case X86_TSC_MFENCE:
+ case TSC_RDTSC_MFENCE:
cnt = &_tsc_mfence_end - &_tsc_mfence_start;
(void) memcpy((void *)tsc_read,
(void *)&_tsc_mfence_start, cnt);
break;
- case X86_TSC_LFENCE:
+ case TSC_RDTSC_LFENCE:
cnt = &_tsc_lfence_end - &_tsc_lfence_start;
(void) memcpy((void *)tsc_read,
(void *)&_tsc_lfence_start, cnt);
break;
+ case TSC_TSCP:
+ cnt = &_tscp_end - &_tscp_start;
+ (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
+ break;
default:
+ /* Bail for unexpected TSC types. (TSC_NONE covers 0) */
+ cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
break;
}
+ tsc_type = flag;
}
int
diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c
index d9ed882705..fab1324787 100644
--- a/usr/src/uts/i86pc/os/ibft.c
+++ b/usr/src/uts/i86pc/os/ibft.c
@@ -39,6 +39,7 @@
#include <sys/kmem.h>
#include <sys/psm.h>
#include <sys/bootconf.h>
+#include <sys/reboot.h>
typedef enum ibft_structure_type {
Reserved = 0,
@@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp);
static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft,
iscsi_ibft_tgt_t *tgtp);
+extern int boothowto;
/*
* Return value:
@@ -759,7 +761,9 @@ ld_ib_prop()
* 1) pass "-B ibft-noprobe=1" on kernel command line
* 2) add line "set ibft_noprobe=1" in /etc/system
*/
- cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+ if (boothowto & RB_VERBOSE) {
+ cmn_err(CE_NOTE, IBFT_NOPROBE_MSG);
+ }
return;
}
diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c
index b46bbf849d..981398970e 100644
--- a/usr/src/uts/i86pc/os/lgrpplat.c
+++ b/usr/src/uts/i86pc/os/lgrpplat.c
@@ -2799,7 +2799,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
/*
* Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
* and memory are local to each other in the same NUMA node and return number
- * of nodes
+ * of nodes.
+ *
+ * The SRAT table pointer is populated during bootup by
+ * build_firmware_properties() in fakebop.c. Several motherboard and BIOS
+ * manufacturers are guilty of shipping without an SRAT table.
*/
static int
lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
@@ -2816,9 +2820,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
/*
* Nothing to do when no SRAT or disabled
*/
- if (tp == NULL || !lgrp_plat_srat_enable)
+ if (!lgrp_plat_srat_enable)
return (-1);
+ if (tp == NULL) {
+ cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. "
+ "lgrp support will be limited to one group.\n");
+ return (-1);
+ }
+
/*
* Try to get domain information from MSCT table.
* ACPI4.0: OSPM will use information provided by the MSCT only
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 045adbcb7b..438f83b6e9 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -23,6 +23,7 @@
*
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -61,6 +62,7 @@
#include <sys/promif.h>
#include <sys/pci_cfgspace.h>
#include <sys/bootvfs.h>
+#include <sys/tsc.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#else
@@ -227,15 +229,15 @@ mlsetup(struct regs *rp)
*/
if ((get_hwenv() & HW_XEN_HVM) == 0 &&
is_x86_feature(x86_featureset, X86FSET_TSCP))
- patch_tsc_read(X86_HAVE_TSCP);
+ patch_tsc_read(TSC_TSCP);
else if (cpuid_getvendor(CPU) == X86_VENDOR_AMD &&
cpuid_getfamily(CPU) <= 0xf &&
is_x86_feature(x86_featureset, X86FSET_SSE2))
- patch_tsc_read(X86_TSC_MFENCE);
+ patch_tsc_read(TSC_RDTSC_MFENCE);
else if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
cpuid_getfamily(CPU) <= 6 &&
is_x86_feature(x86_featureset, X86FSET_SSE2))
- patch_tsc_read(X86_TSC_LFENCE);
+ patch_tsc_read(TSC_RDTSC_LFENCE);
#endif /* !__xpv */
@@ -246,7 +248,7 @@ mlsetup(struct regs *rp)
* return 0.
*/
if (!is_x86_feature(x86_featureset, X86FSET_TSC))
- patch_tsc_read(X86_NO_TSC);
+ patch_tsc_read(TSC_NONE);
#endif /* __i386 && !__xpv */
#if defined(__amd64) && !defined(__xpv)
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index 3c7b453949..829c631096 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -27,7 +27,7 @@
* All rights reserved.
*/
/*
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
@@ -166,6 +166,8 @@ init_cpu_info(struct cpu *cp)
void
init_cpu_syscall(struct cpu *cp)
{
+ uint64_t flags;
+
kpreempt_disable();
#if defined(__amd64)
@@ -247,6 +249,24 @@ init_cpu_syscall(struct cpu *cp)
kpreempt_enable();
}
+#if !defined(__xpv)
+/*
+ * Configure per-cpu ID GDT
+ */
+static void
+init_cpu_id_gdt(struct cpu *cp)
+{
+ /* Write cpu_id into limit field of GDT for usermode retrieval */
+#if defined(__amd64)
+ set_usegd(&cp->cpu_gdt[GDT_CPUID], SDP_SHORT, NULL, cp->cpu_id,
+ SDT_MEMRODA, SEL_UPL, SDP_BYTES, SDP_OP32);
+#elif defined(__i386)
+ set_usegd(&cp->cpu_gdt[GDT_CPUID], NULL, cp->cpu_id, SDT_MEMRODA,
+ SEL_UPL, SDP_BYTES, SDP_OP32);
+#endif
+}
+#endif /* !defined(__xpv) */
+
/*
* Multiprocessor initialization.
*
@@ -430,6 +450,10 @@ mp_cpu_configure_common(int cpun, boolean_t boot)
init_cpu_info(cp);
+#if !defined(__xpv)
+ init_cpu_id_gdt(cp);
+#endif
+
/*
* alloc space for ucode_info
*/
@@ -1486,6 +1510,10 @@ start_other_cpus(int cprboot)
*/
init_cpu_info(CPU);
+#if !defined(__xpv)
+ init_cpu_id_gdt(CPU);
+#endif
+
cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_idstr);
cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_brandstr);
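
For illustration, usermode can recover the cpu_id stored in the descriptor's
limit field with the unprivileged lsl instruction. GDT_CPUID_SEL below is a
hypothetical stand-in for the real selector value
((GDT_CPUID << 3) | SEL_UPL), and a real consumer would also check ZF for a
valid load:

	static inline uint32_t
	cpu_id_from_gdt(void)
	{
		uint32_t sel = GDT_CPUID_SEL;
		uint32_t id;

		/* lsl loads the segment limit -- here, the cpu_id. */
		__asm__ volatile ("lsl %1, %0" : "=r" (id) : "r" (sel));
		return (id);
	}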
diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c
index c40159018c..7344e1a492 100644
--- a/usr/src/uts/i86pc/os/timestamp.c
+++ b/usr/src/uts/i86pc/os/timestamp.c
@@ -25,6 +25,7 @@
*
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -47,6 +48,7 @@
#include <sys/panic.h>
#include <sys/cpu.h>
#include <sys/sdt.h>
+#include <sys/comm_page.h>
/*
* Using the Pentium's TSC register for gethrtime()
@@ -99,7 +101,6 @@
#define NSEC_SHIFT 5
-static uint_t nsec_scale;
static uint_t nsec_unscale;
/*
@@ -140,18 +141,12 @@ static volatile int tsc_sync_go;
int tsc_master_slave_sync_needed = 1;
-static int tsc_max_delta;
-static hrtime_t tsc_sync_tick_delta[NCPU];
typedef struct tsc_sync {
volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;
-static hrtime_t largest_tsc_delta = 0;
-static ulong_t shortest_write_time = ~0UL;
-static hrtime_t tsc_last = 0;
static hrtime_t tsc_last_jumped = 0;
-static hrtime_t tsc_hrtime_base = 0;
static int tsc_jumped = 0;
static uint32_t tsc_wayback = 0;
/*
@@ -159,7 +154,6 @@ static uint32_t tsc_wayback = 0;
* tsc_tick() function runs which means that when gethrtime() is called it
* should never be more than 1 second since tsc_last was updated.
*/
-static hrtime_t tsc_resume_cap;
static hrtime_t tsc_resume_cap_ns = NANOSEC; /* 1s */
static hrtime_t shadow_tsc_hrtime_base;
@@ -451,25 +445,27 @@ tsc_gethrtimeunscaled_delta(void)
}
/*
- * Called by the master in the TSC sync operation (usually the boot CPU).
- * If the slave is discovered to have a skew, gethrtimef will be changed to
- * point to tsc_gethrtime_delta(). Calculating skews is precise only when
- * the master and slave TSCs are read simultaneously; however, there is no
- * algorithm that can read both CPUs in perfect simultaneity. The proposed
- * algorithm is an approximate method based on the behaviour of cache
- * management. The slave CPU continuously reads TSC and then reads a global
- * variable which the master CPU updates. The moment the master's update reaches
- * the slave's visibility (being forced by an mfence operation) we use the TSC
- * reading taken on the slave. A corresponding TSC read will be taken on the
- * master as soon as possible after finishing the mfence operation. But the
- * delay between causing the slave to notice the invalid cache line and the
- * competion of mfence is not repeatable. This error is heuristically assumed
- * to be 1/4th of the total write time as being measured by the two TSC reads
- * on the master sandwiching the mfence. Furthermore, due to the nature of
- * bus arbitration, contention on memory bus, etc., the time taken for the write
- * to reflect globally can vary a lot. So instead of taking a single reading,
- * a set of readings are taken and the one with least write time is chosen
- * to calculate the final skew.
+ * TSC Sync Master
+ *
+ * Typically called on the boot CPU, this attempts to quantify TSC skew between
+ * different CPUs. If an appreciable difference is found, gethrtimef will be
+ * changed to point to tsc_gethrtime_delta().
+ *
+ * Calculating skews is precise only when the master and slave TSCs are read
+ * simultaneously; however, there is no algorithm that can read both CPUs in
+ * perfect simultaneity. The proposed algorithm is an approximate method based
+ * on the behaviour of cache management. The slave CPU continuously polls the
+ * TSC while reading a global variable updated by the master CPU. The latest
+ * TSC reading is saved when the master's update (forced via mfence) reaches
+ * visibility on the slave. The master will also take a TSC reading
+ * immediately following the mfence.
+ *
+ * While the delay between cache line invalidation on the slave and mfence
+ * completion on the master is not repeatable, the error is heuristically
+ * assumed to be 1/4th of the write time recorded by the master. Multiple
+ * samples are taken to control for the variance caused by external factors
+ * such as bus contention. Each sample set is independent per-CPU to control
+ * for differing memory latency on NUMA systems.
*
* TSC sync is disabled in the context of virtualization because the CPUs
* assigned to the guest are virtual CPUs which means the real CPUs on which
@@ -482,7 +478,7 @@ void
tsc_sync_master(processorid_t slave)
{
ulong_t flags, source, min_write_time = ~0UL;
- hrtime_t write_time, x, mtsc_after, tdelta;
+ hrtime_t write_time, mtsc_after, last_delta = 0;
tsc_sync_t *tsc = tscp;
int cnt;
int hwtype;
@@ -505,57 +501,53 @@ tsc_sync_master(processorid_t slave)
SMT_PAUSE();
write_time = mtsc_after - tsc->master_tsc;
if (write_time <= min_write_time) {
- min_write_time = write_time;
+ hrtime_t tdelta;
+
+ tdelta = tsc->slave_tsc - mtsc_after;
+ if (tdelta < 0)
+ tdelta = -tdelta;
/*
- * Apply heuristic adjustment only if the calculated
- * delta is > 1/4th of the write time.
+ * If the delta is larger than that 1/4th margin, subtract
+ * 1/4th of the measured write time from the master's TSC
+ * value, as an estimate of how late the mfence completion
+ * came after the slave noticed the cache line change.
*/
- x = tsc->slave_tsc - mtsc_after;
- if (x < 0)
- x = -x;
- if (x > (min_write_time/4))
- /*
- * Subtract 1/4th of the measured write time
- * from the master's TSC value, as an estimate
- * of how late the mfence completion came
- * after the slave noticed the cache line
- * change.
- */
+ if (tdelta > (write_time/4)) {
tdelta = tsc->slave_tsc -
- (mtsc_after - (min_write_time/4));
- else
+ (mtsc_after - (write_time/4));
+ } else {
tdelta = tsc->slave_tsc - mtsc_after;
- tsc_sync_tick_delta[slave] =
- tsc_sync_tick_delta[source] - tdelta;
+ }
+ last_delta = tsc_sync_tick_delta[source] - tdelta;
+ tsc_sync_tick_delta[slave] = last_delta;
+ min_write_time = write_time;
}
tsc->master_tsc = tsc->slave_tsc = write_time = 0;
membar_enter();
tsc_sync_go = TSC_SYNC_STOP;
}
- if (tdelta < 0)
- tdelta = -tdelta;
- if (tdelta > largest_tsc_delta)
- largest_tsc_delta = tdelta;
- if (min_write_time < shortest_write_time)
- shortest_write_time = min_write_time;
+
/*
- * Enable delta variants of tsc functions if the largest of all chosen
- * deltas is > smallest of the write time.
+ * Only enable the delta variants of the TSC functions if the measured
+ * skew is greater than the fastest write time.
*/
- if (largest_tsc_delta > shortest_write_time) {
+ last_delta = (last_delta < 0) ? -last_delta : last_delta;
+ if (last_delta > min_write_time) {
gethrtimef = tsc_gethrtime_delta;
gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
+ tsc_ncpu = NCPU;
}
restore_int_flag(flags);
}
/*
+ * TSC Sync Slave
+ *
* Called by a CPU which has just been onlined. It is expected that the CPU
* performing the online operation will call tsc_sync_master().
*
- * TSC sync is disabled in the context of virtualization. See comments
- * above tsc_sync_master.
+ * Like tsc_sync_master, this logic is skipped on virtualized platforms.
*/
void
tsc_sync_slave(void)
@@ -579,11 +571,9 @@ tsc_sync_slave(void)
tsc_sync_go = TSC_SYNC_GO;
do {
/*
- * Do not put an SMT_PAUSE here. For instance,
- * if the master and slave are really the same
- * hyper-threaded CPU, then you want the master
- * to yield to the slave as quickly as possible here,
- * but not the other way.
+ * Do not put an SMT_PAUSE here. If the master and
+ * slave are the same hyper-threaded CPU, we want the
+ * master to yield as quickly as possible to the slave.
*/
s1 = tsc_read();
} while (tsc->master_tsc == 0);
@@ -688,6 +678,12 @@ tsc_hrtimeinit(uint64_t cpu_freq_hz)
hrtime_tick = tsc_tick;
gethrtime_hires = 1;
/*
+ * Being part of the comm page, tsc_ncpu communicates the published
+ * length of the tsc_sync_tick_delta array. It is kept at zero until
+ * TSC sync completes, so consumers ignore the not-yet-populated
+ * delta data.
+ */
+ tsc_ncpu = 0;
+ /*
* Allocate memory for the structure used in the tsc sync logic.
* This structure should be aligned on a multiple of cache line size.
*/
@@ -708,12 +704,10 @@ get_tsc_ready()
}
/*
- * Adjust all the deltas by adding the passed value to the array.
- * Then use the "delt" versions of the the gethrtime functions.
- * Note that 'tdelta' _could_ be a negative number, which should
- * reduce the values in the array (used, for example, if the Solaris
- * instance was moved by a virtual manager to a machine with a higher
- * value of tsc).
+ * Adjust all the deltas by adding the passed value to the array and activate
+ * the "delta" versions of the gethrtime functions. It is possible that the
+ * adjustment could be negative. Such may occur if the SunOS instance was
+ * moved by a virtual manager to a machine with a higher value of TSC.
*/
void
tsc_adjust_delta(hrtime_t tdelta)
@@ -726,19 +720,16 @@ tsc_adjust_delta(hrtime_t tdelta)
gethrtimef = tsc_gethrtime_delta;
gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
+ tsc_ncpu = NCPU;
}
/*
* Functions to manage TSC and high-res time on suspend and resume.
*/
-/*
- * declarations needed for time adjustment
- */
-extern void rtcsync(void);
+/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
extern tod_ops_t *tod_ops;
-/* There must be a better way than exposing nsec_scale! */
-extern uint_t nsec_scale;
+
static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int tsc_needs_resume = 0; /* We only want to do this once. */
@@ -748,23 +739,20 @@ int tsc_suspend_count = 0;
int tsc_resume_in_cyclic = 0;
/*
- * Let timestamp.c know that we are suspending. It needs to take
- * snapshots of the current time, and do any pre-suspend work.
+ * Take snapshots of the current time and do any other pre-suspend work.
*/
void
tsc_suspend(void)
{
-/*
- * What we need to do here, is to get the time we suspended, so that we
- * know how much we should add to the resume.
- * This routine is called by each CPU, so we need to handle reentry.
- */
+ /*
+ * We need to collect the time at which we suspended here so we know
+ * how much should be added during the resume. This is called by each
+ * CPU, so reentry must be properly handled.
+ */
if (tsc_gethrtime_enable) {
/*
- * We put the tsc_read() inside the lock as it
- * as no locking constraints, and it puts the
- * aquired value closer to the time stamp (in
- * case we delay getting the lock).
+ * Perform the tsc_read after acquiring the lock to make it as
+ * accurate as possible in the face of contention.
*/
mutex_enter(&tod_lock);
tsc_saved_tsc = tsc_read();
@@ -786,8 +774,7 @@ tsc_suspend(void)
}
/*
- * Restore all timestamp state based on the snapshots taken at
- * suspend time.
+ * Restore all timestamp state based on the snapshots taken at suspend time.
*/
void
tsc_resume(void)
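
To make the sync heuristic concrete with round numbers: suppose the master
observes mtsc_after = 1000 with write_time = 40 cycles, and the slave
recorded slave_tsc = 1030. The raw difference (30) exceeds write_time/4
(10), so the master's read is backdated by 10 cycles and tdelta becomes
1030 - 990 = 40. With tsc_sync_tick_delta[source] = 0, the slave's entry is
set to -40: its TSC is judged to run about 40 cycles ahead, and every
reading taken there is adjusted down by that amount.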
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 9390690e95..c88fec6fbe 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -32,7 +32,7 @@
/* */
/*
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -101,6 +101,7 @@
#include <sys/hypervisor.h>
#endif
#include <sys/contract/process_impl.h>
+#include <sys/brand.h>
#define USER 0x10000 /* user-mode flag added to trap type */
@@ -862,6 +863,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
fault_type = F_INVAL;
}
+ /*
+ * Allow the brand to interpose on invalid memory accesses
+ * prior to running the native pagefault handler. If this
+ * brand hook returns zero, it was able to handle the fault
+ * completely. Otherwise, drive on and call pagefault().
+ */
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL &&
+ BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) {
+ goto out;
+ }
+
res = pagefault(addr, fault_type, rw, 0);
/*
@@ -1468,12 +1480,23 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
ct->t_sig_check = 0;
- mutex_enter(&p->p_lock);
+ /*
+ * As in other code paths that check against TP_CHANGEBIND,
+ * we perform the check first without p_lock held -- only
+ * acquiring p_lock in the unlikely event that it is indeed
+ * set. This is safe because we are doing this after the
+ * astoff(); if we are racing another thread setting
+ * TP_CHANGEBIND on us, we will pick it up on a subsequent
+ * lap through.
+ */
if (curthread->t_proc_flag & TP_CHANGEBIND) {
- timer_lwpbind();
- curthread->t_proc_flag &= ~TP_CHANGEBIND;
+ mutex_enter(&p->p_lock);
+ if (curthread->t_proc_flag & TP_CHANGEBIND) {
+ timer_lwpbind();
+ curthread->t_proc_flag &= ~TP_CHANGEBIND;
+ }
+ mutex_exit(&p->p_lock);
}
- mutex_exit(&p->p_lock);
/*
* for kaio requests that are on the per-process poll queue,
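
A sketch of the b_pagefault hook contract assumed at the call site above
(signature inferred from that call; brand and function names are
illustrative):

	static int
	mybrand_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr,
	    enum fault_type type, enum seg_rw rw)
	{
		/*
		 * Return 0 only when the brand fully handled the fault;
		 * any other value falls through to pagefault().
		 */
		return (-1);
	}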
diff --git a/usr/src/uts/i86pc/sys/acpidev.h b/usr/src/uts/i86pc/sys/acpidev.h
index 6d11277aaf..a3bd54d4e3 100644
--- a/usr/src/uts/i86pc/sys/acpidev.h
+++ b/usr/src/uts/i86pc/sys/acpidev.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009-2010, Intel Corporation.
* All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_ACPIDEV_H
@@ -128,7 +129,7 @@ typedef enum acpidev_class_id {
#ifdef _KERNEL
/* Common ACPI object names. */
-#define ACPIDEV_OBJECT_NAME_SB ACPI_NS_SYSTEM_BUS
+#define ACPIDEV_OBJECT_NAME_SB METHOD_NAME__SB_
#define ACPIDEV_OBJECT_NAME_PR "_PR_"
/* Common ACPI method names. */
diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h
index 8a87760456..11ae48340a 100644
--- a/usr/src/uts/i86pc/sys/apic.h
+++ b/usr/src/uts/i86pc/sys/apic.h
@@ -382,7 +382,7 @@ struct apic_io_intr {
/* special or reserve vectors */
#define APIC_CHECK_RESERVE_VECTORS(v) \
(((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \
- ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET))
+ ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80))
/* cmos shutdown code for BIOS */
#define BIOS_SHUTDOWN 0x0a
diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h
new file mode 100644
index 0000000000..dbf00bc7a7
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/comm_page.h
@@ -0,0 +1,102 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _COMM_PAGE_H
+#define _COMM_PAGE_H
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#endif /* _ASM */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMM_PAGE_SIZE PAGESIZE
+#define COMM_PAGE_ALIGN 0x4000
+
+#ifndef _ASM
+
+/*
+ * x86 comm page
+ *
+ * This struct defines the data format for the "comm page": kernel data made
+ * directly available to userspace for read-only operations. This enables
+ * facilities such as clock_gettime to operate entirely in userspace without
+ * the need for a trap or fasttrap.
+ *
+ * A note about 32-bit/64-bit compatibility:
+ * The current format of the comm page is designed to be consistent for both
+ * 32-bit and 64-bit programs running in a 64-bit kernel. On 32-bit kernels,
+ * the comm page is not exposed to userspace due to the difference in
+ * timespec_t sizing.
+ *
+ * This struct is instantiated "by hand" in assembly to preserve the global
+ * symbols it contains. That layout must be kept in sync with the structure
+ * defined here.
+ * See: "uts/i86pc/ml/comm_page.s"
+ */
+typedef struct comm_page_s {
+ hrtime_t cp_tsc_last;
+ hrtime_t cp_tsc_hrtime_base;
+ hrtime_t cp_tsc_resume_cap;
+ uint32_t cp_tsc_type;
+ uint32_t cp_tsc_max_delta;
+
+ volatile uint32_t cp_hres_lock; /* must be 8-byte aligned */
+ uint32_t cp_nsec_scale;
+ int64_t cp_hrestime_adj;
+ hrtime_t cp_hres_last_tick;
+ uint32_t cp_tsc_ncpu;
+ uint32_t _cp_pad;
+ volatile int64_t cp_hrestime[2];
+#if defined(_MACHDEP)
+ hrtime_t cp_tsc_sync_tick_delta[NCPU];
+#else
+ /* length resides in cp_tsc_ncpu */
+ hrtime_t cp_tsc_sync_tick_delta[];
+#endif /* defined(_MACHDEP) */
+} comm_page_t;
+
+#if defined(_KERNEL)
+extern comm_page_t comm_page;
+
+extern caddr_t comm_page_mapin();
+
+#if defined(_MACHDEP)
+extern hrtime_t tsc_last;
+extern hrtime_t tsc_hrtime_base;
+extern hrtime_t tsc_resume_cap;
+extern uint32_t tsc_type;
+extern uint32_t tsc_max_delta;
+extern volatile uint32_t hres_lock;
+extern uint32_t nsec_scale;
+extern int64_t hrestime_adj;
+extern hrtime_t hres_last_tick;
+extern uint32_t tsc_ncpu;
+extern volatile timestruc_t hrestime;
+extern hrtime_t tsc_sync_tick_delta[NCPU];
+#endif /* defined(_MACHDEP) */
+#endif /* defined(_KERNEL) */
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMM_PAGE_H */
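
A hedged userspace sketch of consuming the page: assuming a process has
discovered the read-only mapping address ('cp' below), a consistent
hrestime snapshot follows the usual hres_lock protocol -- retry until the
lock word is even and unchanged across the reads:

	#include <sys/comm_page.h>

	static void
	read_hrestime(const comm_page_t *cp, int64_t *sec, int64_t *nsec)
	{
		uint32_t lock;

		do {
			lock = cp->cp_hres_lock & ~1U;	/* force even */
			*sec = cp->cp_hrestime[0];
			*nsec = cp->cp_hrestime[1];
		} while (cp->cp_hres_lock != lock);
	}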
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index 99ae0d4d3b..fc34522307 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1988 AT&T */
@@ -54,6 +55,10 @@ extern "C" {
*/
#if defined(__amd64)
+/*
+ * If NCPU grows beyond 256, sizing for the x86 comm page will require
+ * adjustment.
+ */
#define NCPU 256
#define NCPU_LOG2 8
#elif defined(__i386)
diff --git a/usr/src/uts/i86pc/sys/tsc.h b/usr/src/uts/i86pc/sys/tsc.h
new file mode 100644
index 0000000000..d4090381c4
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/tsc.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _TSC_H
+#define _TSC_H
+
+/*
+ * flags to patch tsc_read routine.
+ */
+#define TSC_NONE 0x0
+#define TSC_RDTSC_CPUID 0x1
+#define TSC_RDTSC_MFENCE 0x2
+#define TSC_RDTSC_LFENCE 0x3
+#define TSC_TSCP 0x4
+
+#endif /* _TSC_H */
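
Illustrative C equivalents of what the patched tsc_read() variants do (the
kernel actually patches in assembly stubs; this is only a sketch):

	/* TSC_RDTSC_LFENCE: order rdtsc with a preceding lfence. */
	static inline uint64_t
	tsc_read_lfence(void)
	{
		uint32_t lo, hi;

		__asm__ volatile ("lfence; rdtsc" : "=a" (lo), "=d" (hi));
		return ((uint64_t)hi << 32 | lo);
	}

	/* TSC_TSCP: rdtscp waits for prior instructions, clobbers %ecx. */
	static inline uint64_t
	tsc_read_tscp(void)
	{
		uint32_t lo, hi;

		__asm__ volatile ("rdtscp" : "=a" (lo), "=d" (hi) : : "ecx");
		return ((uint64_t)hi << 32 | lo);
	}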
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index ea2a83b2bd..a8b4e6edfc 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -27,6 +27,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014, 2015 by Delphix. All rights reserved.
*/
@@ -3323,7 +3324,7 @@ hat_page_getattr(struct page *pp, uint_t flag)
/*
- * common code used by hat_pageunload() and hment_steal()
+ * common code used by hat_page_inval() and hment_steal()
*/
hment_t *
hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
@@ -3379,15 +3380,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
extern int vpm_enable;
/*
- * Unload all translations to a page. If the page is a subpage of a large
+ * Unload translations to a page. If the page is a subpage of a large
* page, the large page mappings are also removed.
- *
- * The forceflags are unused.
+ * If curhat is not NULL, then we only unload the translation
+ * for the given process, otherwise all translations are unloaded.
*/
-
-/*ARGSUSED*/
-static int
-hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
+void
+hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat)
{
page_t *cur_pp = pp;
hment_t *hm;
@@ -3395,15 +3394,10 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
htable_t *ht;
uint_t entry;
level_t level;
+ ulong_t cnt;
XPV_DISALLOW_MIGRATE();
- /*
- * prevent recursion due to kmem_free()
- */
- ++curthread->t_hatdepth;
- ASSERT(curthread->t_hatdepth < 16);
-
#if defined(__amd64)
/*
* clear the vpm ref.
@@ -3416,6 +3410,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
* The loop with next_size handles pages with multiple pagesize mappings
*/
next_size:
+ if (curhat != NULL)
+ cnt = hat_page_getshare(cur_pp);
for (;;) {
/*
@@ -3427,14 +3423,13 @@ next_size:
if (hm == NULL) {
x86_hm_exit(cur_pp);
+curproc_done:
/*
* If not part of a larger page, we're done.
*/
if (cur_pp->p_szc <= pg_szcd) {
- ASSERT(curthread->t_hatdepth > 0);
- --curthread->t_hatdepth;
XPV_ALLOW_MIGRATE();
- return (0);
+ return;
}
/*
@@ -3453,8 +3448,20 @@ next_size:
* If this mapping size matches, remove it.
*/
level = ht->ht_level;
- if (level == pg_szcd)
- break;
+ if (level == pg_szcd) {
+ if (curhat == NULL || ht->ht_hat == curhat)
+ break;
+ /*
+ * We are unloading for a specific hat, and this
+ * mapping belongs to a different one. Leave the
+ * entry in place, with a counter as a safety
+ * check so we cannot loop here forever.
+ */
+ if (cnt-- == 0) {
+ x86_hm_exit(cur_pp);
+ goto curproc_done;
+ }
+ }
}
/*
@@ -3464,14 +3471,44 @@ next_size:
hm = hati_page_unmap(cur_pp, ht, entry);
if (hm != NULL)
hment_free(hm);
+
+ /* Branch to the larger-page check at curproc_done above. */
+ if (curhat != NULL)
+ goto curproc_done;
}
}
+/*
+ * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then
+ * we only unload the translation for the current process, otherwise all
+ * translations are unloaded.
+ */
+static int
+hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
+{
+ struct hat *curhat = NULL;
+
+ /*
+ * prevent recursion due to kmem_free()
+ */
+ ++curthread->t_hatdepth;
+ ASSERT(curthread->t_hatdepth < 16);
+
+ if (unloadflag == HAT_CURPROC_PGUNLOAD)
+ curhat = curthread->t_procp->p_as->a_hat;
+
+ hat_page_inval(pp, pg_szcd, curhat);
+
+ ASSERT(curthread->t_hatdepth > 0);
+ --curthread->t_hatdepth;
+ return (0);
+}
+
int
-hat_pageunload(struct page *pp, uint_t forceflag)
+hat_pageunload(struct page *pp, uint_t unloadflag)
{
ASSERT(PAGE_EXCL(pp));
- return (hati_pageunload(pp, 0, forceflag));
+ return (hati_pageunload(pp, 0, unloadflag));
}
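
Usage note: existing callers are unaffected, while a caller wanting the
narrower behaviour passes the new flag (HAT_CURPROC_PGUNLOAD is defined in
the common hat headers, outside this diff):

	/* Unload only the current process's mapping of pp. */
	(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);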
/*
diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c
index 2212202a01..1c2bd3e0ec 100644
--- a/usr/src/uts/i86pc/vm/vm_machdep.c
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c
@@ -24,6 +24,7 @@
/*
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -623,10 +624,8 @@ void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
struct proc *p = curproc;
- caddr_t userlimit = (flags & _MAP_LOW32) ?
- (caddr_t)_userlimit32 : p->p_as->a_userlimit;
-
- map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
+ map_addr_proc(addrp, len, off, vacalign,
+ map_userlimit(p, p->p_as, flags), curproc, flags);
}
/*ARGSUSED*/