diff options
author | Patrick Mooney <pmooney@pfmooney.com> | 2016-05-16 20:49:35 +0000 |
---|---|---|
committer | Patrick Mooney <pmooney@pfmooney.com> | 2016-05-16 20:49:35 +0000 |
commit | d19cb1300ec66e8552d605a713e7b6dd6ba255f5 (patch) | |
tree | a3bbb3f8d5bcee582c50fbf4f5ef425fd9a8ed2d /usr/src | |
parent | a257e301376666442c2b655cf573c9d3e34b1ed5 (diff) | |
download | illumos-joyent-d19cb1300ec66e8552d605a713e7b6dd6ba255f5.tar.gz |
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Diffstat (limited to 'usr/src')
45 files changed, 1616 insertions, 92 deletions
diff --git a/usr/src/cmd/ptools/pargs/pargs.c b/usr/src/cmd/ptools/pargs/pargs.c index d8072b56a1..fd2be024f3 100644 --- a/usr/src/cmd/ptools/pargs/pargs.c +++ b/usr/src/cmd/ptools/pargs/pargs.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -828,7 +828,8 @@ static struct aux_id aux_arr[] = { { AT_SUN_BRAND_AUX1, "AT_SUN_BRAND_AUX1", at_null }, { AT_SUN_BRAND_AUX2, "AT_SUN_BRAND_AUX2", at_null }, { AT_SUN_BRAND_AUX3, "AT_SUN_BRAND_AUX3", at_null }, - { AT_SUN_BRAND_AUX4, "AT_SUN_BRAND_AUX4", at_null } + { AT_SUN_BRAND_AUX4, "AT_SUN_BRAND_AUX4", at_null }, + { AT_SUN_COMMPAGE, "AT_SUN_COMMPAGE", at_null } }; #define N_AT_ENTS (sizeof (aux_arr) / sizeof (struct aux_id)) diff --git a/usr/src/cmd/sgs/libconv/common/corenote.c b/usr/src/cmd/sgs/libconv/common/corenote.c index 863c3bc917..1e579415f6 100644 --- a/usr/src/cmd/sgs/libconv/common/corenote.c +++ b/usr/src/cmd/sgs/libconv/common/corenote.c @@ -25,7 +25,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* @@ -105,20 +105,20 @@ conv_cnote_auxv_type(Word type, Conv_fmt_flags_t fmt_flags, static const conv_ds_msg_t ds_types_2000_2011 = { CONV_DS_MSG_INIT(2000, types_2000_2011) }; - static const Msg types_2014_2024[] = { + static const Msg types_2014_2025[] = { MSG_AUXV_AT_SUN_EXECNAME, MSG_AUXV_AT_SUN_MMU, MSG_AUXV_AT_SUN_LDDATA, MSG_AUXV_AT_SUN_AUXFLAGS, MSG_AUXV_AT_SUN_EMULATOR, MSG_AUXV_AT_SUN_BRANDNAME, MSG_AUXV_AT_SUN_BRAND_AUX1, MSG_AUXV_AT_SUN_BRAND_AUX2, MSG_AUXV_AT_SUN_BRAND_AUX3, MSG_AUXV_AT_SUN_HWCAP2, - MSG_AUXV_AT_SUN_BRAND_NROOT + MSG_AUXV_AT_SUN_BRAND_NROOT, MSG_AUXV_AT_SUN_COMMPAGE }; - static const conv_ds_msg_t ds_types_2014_2024 = { - CONV_DS_MSG_INIT(2014, types_2014_2024) }; + static const conv_ds_msg_t ds_types_2014_2025 = { + CONV_DS_MSG_INIT(2014, types_2014_2025) }; static const conv_ds_t *ds[] = { CONV_DS_ADDR(ds_types_0_25), CONV_DS_ADDR(ds_types_2000_2011), - CONV_DS_ADDR(ds_types_2014_2024), NULL }; + CONV_DS_ADDR(ds_types_2014_2025), NULL }; return (conv_map_ds(ELFOSABI_NONE, EM_NONE, type, ds, fmt_flags, inv_buf)); diff --git a/usr/src/cmd/sgs/libconv/common/corenote.msg b/usr/src/cmd/sgs/libconv/common/corenote.msg index 6372b6f714..2dcd2a697f 100644 --- a/usr/src/cmd/sgs/libconv/common/corenote.msg +++ b/usr/src/cmd/sgs/libconv/common/corenote.msg @@ -24,7 +24,7 @@ # Use is subject to license terms. # # Copyright 2012 DEY Storage Systems, Inc. All rights reserved. -# Copyright (c) 2014, Joyent, Inc. All rights reserved. +# Copyright 2016 Joyent, Inc. 
# @ MSG_NT_PRSTATUS "[ NT_PRSTATUS ]" @@ -104,6 +104,7 @@ @ MSG_AUXV_AT_SUN_BRAND_AUX3 "SUN_BRAND_AUX3" @ MSG_AUXV_AT_SUN_HWCAP2 "SUN_HWCAP2" @ MSG_AUXV_AT_SUN_BRAND_NROOT "SUN_BRAND_NROOT" +@ MSG_AUXV_AT_SUN_COMMPAGE "SUN_COMMPAGE" @ MSG_CC_CONTENT_STACK "STACK" diff --git a/usr/src/cmd/sgs/rtld/common/external.c b/usr/src/cmd/sgs/rtld/common/external.c index 4a16ffcf9b..cd7365c524 100644 --- a/usr/src/cmd/sgs/rtld/common/external.c +++ b/usr/src/cmd/sgs/rtld/common/external.c @@ -22,7 +22,7 @@ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Garrett D'Amore <garrett@damore.org> - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ /* @@ -721,6 +721,21 @@ isalnum(int c) return ((isalpha(c) || isdigit(c)) ? 1 : 0); } +#if defined(__i386) || defined(__amd64) +/* + * Instead of utilizing the comm page for clock_gettime, rtld uses the raw + * syscall instead. Doing so decreases the surface of symbols needed from libc + * for a modest performance cost. + */ +extern int __clock_gettime_sys(clockid_t, struct timespec *); + +int +__clock_gettime(clockid_t clock_id, struct timespec *tp) +{ + return (__clock_gettime_sys(clock_id, tp)); +} +#endif /* defined(__i386) || defined(__amd64) */ + /* * In a similar vein to the is* functions above, we also have to define our own * version of strerror, as it is implemented in terms of the locale aware diff --git a/usr/src/lib/commpage/Makefile.shared.com b/usr/src/lib/commpage/Makefile.shared.com new file mode 100644 index 0000000000..29cd826706 --- /dev/null +++ b/usr/src/lib/commpage/Makefile.shared.com @@ -0,0 +1,27 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# This Makefile is shared between both libc and other consumers +# + +COMMPAGE_OBJS = \ + cp_subr.o \ + cp_main.o + +pics/cp_main.o := CPPFLAGS += -I$(SRC)/uts/i86pc +pics/cp_subr.o := ASFLAGS += -I$(SRC)/uts/i86pc + +COMMPAGE_CPPFLAGS = -I$(SRC)/lib/commpage/common diff --git a/usr/src/lib/commpage/Makefile.shared.targ b/usr/src/lib/commpage/Makefile.shared.targ new file mode 100644 index 0000000000..667634cafa --- /dev/null +++ b/usr/src/lib/commpage/Makefile.shared.targ @@ -0,0 +1,26 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +# +# This Makefile is shared between both libc and other consumers +# + +pics/%.o: $(SRC)/lib/commpage/common/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +pics/%.o: $(SRC)/lib/commpage/$(TARGET_ARCH)/%.s + $(COMPILE.s) -o $@ $< + $(POST_PROCESS_O) diff --git a/usr/src/lib/commpage/amd64/cp_subr.s b/usr/src/lib/commpage/amd64/cp_subr.s new file mode 100644 index 0000000000..ceb946acf0 --- /dev/null +++ b/usr/src/lib/commpage/amd64/cp_subr.s @@ -0,0 +1,140 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> +#include <sys/time_impl.h> +#include <sys/tsc.h> + +#define GETCPU_GDT_OFFSET SEL_GDT(GDT_CPUID, SEL_UPL) + + .file "cp_subr.s" + +/* + * hrtime_t + * __cp_tsc_read(uint_t cp_tsc_type) + */ + ENTRY_NP(__cp_tsc_read) + cmpl $TSC_TSCP, %edi + je 1f + cmpl $TSC_RDTSC_MFENCE, %edi + je 2f + cmpl $TSC_RDTSC_LFENCE, %edi + je 3f + cmpl $TSC_RDTSC_CPUID, %edi + je 4f + ud2a /* abort with SIGILL */ +1: + rdtscp + jmp 5f +2: + mfence + rdtsc + jmp 5f +3: + lfence + rdtsc + jmp 5f +4: + movq %rbx, %r11 + xorl %eax, %eax + cpuid + rdtsc + movq %r11, %rbx +5: + shlq $0x20, %rdx + orq %rdx, %rax + ret + SET_SIZE(__cp_tsc_read) + + +/* + * hrtime_t + * __cp_tsc_readcpu(uint_t cp_tsc_type, uint_t *cpu_id) + */ + ENTRY_NP(__cp_tsc_readcpu) + /* + * Both time and cpu_id can be queried quickly (using few registers) on + * systems which support RDTSCP. On each cpu, the cpu_id is stored in + * the TSC_AUX MSR by the kernel. + */ + cmpl $TSC_TSCP, %edi + jne 1f + rdtscp + movl %ecx, (%rsi) + shlq $0x20, %rdx + orq %rdx, %rax + ret +1: + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + movq %rax, %r11 + cmpl $TSC_RDTSC_MFENCE, %edi + je 2f + cmpl $TSC_RDTSC_LFENCE, %edi + je 3f + cmpl $TSC_RDTSC_CPUID, %edi + je 4f + ud2a /* abort with SIGILL */ +2: + mfence + rdtsc + jmp 5f +3: + lfence + rdtsc + jmp 5f +4: + movq %rbx, %r10 + xorl %eax, %eax + cpuid + rdtsc + movq %r10, %rbx +5: + shlq $0x20, %rdx + orq %rax, %rdx + /* + * With a TSC reading in-hand, confirm that the thread has not migrated + * since the cpu_id was first checked.
+ */ + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + cmpq %rax, %r11 + jne 1b + movl %eax, (%rsi) + movq %rdx, %rax + ret + SET_SIZE(__cp_tsc_readcpu) + + +/* + * uint_t + * __cp_do_getcpu(uint_t cp_tsc_type) + */ + ENTRY_NP(__cp_do_getcpu) + /* + * If RDTSCP is available, it is a quick way to grab the cpu_id which + * is stored in the TSC_AUX MSR by the kernel. + */ + cmpl $TSC_TSCP, %edi + jne 1f + rdtscp + movl %ecx, %eax + ret +1: + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + ret + SET_SIZE(__cp_do_getcpu) diff --git a/usr/src/lib/commpage/common/cp_defs.h b/usr/src/lib/commpage/common/cp_defs.h new file mode 100644 index 0000000000..e2a6cbca45 --- /dev/null +++ b/usr/src/lib/commpage/common/cp_defs.h @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _CP_DEFS_H_ +#define _CP_DEFS_H_ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct comm_page; +typedef struct comm_page comm_page_t; + +extern int __cp_can_gettime(comm_page_t *); +extern hrtime_t __cp_gethrtime(comm_page_t *); +extern void __cp_clock_gettime_realtime(comm_page_t *, timespec_t *); +extern uint_t __cp_getcpu(comm_page_t *cp); + +#ifdef __cplusplus +} +#endif +#endif /* _CP_DEFS_H_ */ diff --git a/usr/src/lib/commpage/common/cp_main.c b/usr/src/lib/commpage/common/cp_main.c new file mode 100644 index 0000000000..8386767695 --- /dev/null +++ b/usr/src/lib/commpage/common/cp_main.c @@ -0,0 +1,185 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + + +#include <sys/comm_page.h> +#include <sys/tsc.h> + +/* + * ASM-defined functions. + */ +extern hrtime_t __cp_tsc_read(uint_t); +extern hrtime_t __cp_tsc_readcpu(uint_t, uint_t *); +extern uint_t __cp_do_getcpu(uint_t); + +/* + * These are cloned from TSC and time related code in the kernel. They should + * be kept in sync in the case that the source values are changed. + */ +#define NSEC_SHIFT 5 +#define ADJ_SHIFT 4 +#define NANOSEC 1000000000LL + +#define TSC_CONVERT_AND_ADD(tsc, hrt, scale) do { \ + uint32_t *_l = (uint32_t *)&(tsc); \ + uint64_t sc = (uint32_t)(scale); \ + (hrt) += (uint64_t)(_l[1] * sc) << NSEC_SHIFT; \ + (hrt) += (uint64_t)(_l[0] * sc) >> (32 - NSEC_SHIFT); \ +} while (0) + + +/* + * Userspace version of tsc_gethrtime.
+ * See: uts/i86pc/os/timestamp.c + */ +hrtime_t +__cp_gethrtime(comm_page_t *cp) +{ + uint32_t old_hres_lock; + hrtime_t tsc, hrt, tsc_last; + + /* + * Several precautions must be taken when collecting the data necessary + * to perform an accurate gethrtime calculation. + * + * While much of the TSC state stored in the comm page is unchanging + * after boot, portions of it are periodically updated during OS ticks. + * Changes to hres_lock during the course of the copy indicates a + * potentially inconsistent snapshot, necessitating a loop. + * + * Even more complicated is the handling for TSCs which require sync + * offsets between different CPUs. Since userspace lacks the luxury of + * disabling interrupts, a validation loop checking for CPU migrations + * is used. Pathological scheduling could, in theory, "outwit" + * this check. Such a possibility is considered an acceptable risk. + * + */ + if (cp->cp_tsc_ncpu == 0) { + /* + * No per-CPU offset data, use the simple hres_lock loop. + */ + do { + old_hres_lock = cp->cp_hres_lock; + tsc_last = cp->cp_tsc_last; + hrt = cp->cp_tsc_hrtime_base; + tsc = __cp_tsc_read(cp->cp_tsc_type); + } while ((old_hres_lock & ~1) != cp->cp_hres_lock); + } else { + /* + * Per-CPU offset data is needed for an accurate TSC reading. + */ + do { + uint_t cpu_id; + + old_hres_lock = cp->cp_hres_lock; + tsc_last = cp->cp_tsc_last; + hrt = cp->cp_tsc_hrtime_base; + /* + * When collecting the TSC and cpu_id, cp_tsc_readcpu + * will accurately detect CPU migrations in all but + * the most pathological scheduling conditions. 
+ */ + tsc = __cp_tsc_readcpu(cp->cp_tsc_type, &cpu_id); + tsc += cp->cp_tsc_sync_tick_delta[cpu_id]; + } while ((old_hres_lock & ~1) != cp->cp_hres_lock); + } + + if (tsc >= tsc_last) { + tsc -= tsc_last; + } else if (tsc >= tsc_last - (2 * cp->cp_tsc_max_delta)) { + tsc = 0; + } else if (tsc > cp->cp_tsc_resume_cap) { + tsc = cp->cp_tsc_resume_cap; + } + TSC_CONVERT_AND_ADD(tsc, hrt, cp->cp_nsec_scale); + + return (hrt); +} + +/* + * Userspace version of pc_gethrestime. + * See: uts/i86pc/os/machdep.c + */ +void +__cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tp) +{ + int lock_prev, nslt, adj; + timespec_t now; + int64_t hres_adj; + +loop: + lock_prev = cp->cp_hres_lock; + now.tv_sec = cp->cp_hrestime[0]; + now.tv_nsec = cp->cp_hrestime[1]; + nslt = (int)(__cp_gethrtime(cp) - cp->cp_hres_last_tick); + hres_adj = cp->cp_hrestime_adj; + if (nslt < 0) { + /* + * Tick came between sampling hrtime and hres_last_tick; + */ + goto loop; + } + now.tv_nsec += nslt; + if (hres_adj != 0) { + if (hres_adj > 0) { + adj = (nslt >> ADJ_SHIFT); + if (adj > hres_adj) + adj = (int)hres_adj; + } else { + adj = -(nslt >> ADJ_SHIFT); + if (adj < hres_adj) + adj = (int)hres_adj; + } + now.tv_nsec += adj; + } + while ((unsigned long)now.tv_nsec >= NANOSEC) { + /* + * Rope in tv_nsec from any excessive adjustments. + */ + now.tv_nsec -= NANOSEC; + now.tv_sec++; + } + if ((cp->cp_hres_lock & ~1) != lock_prev) + goto loop; + + *tp = now; +} + +/* + * Interrogate if querying the clock via the comm page is possible. + */ +int +__cp_can_gettime(comm_page_t *cp) +{ + switch (cp->cp_tsc_type) { + case TSC_TSCP: + case TSC_RDTSC_MFENCE: + case TSC_RDTSC_LFENCE: + case TSC_RDTSC_CPUID: + return (0); + default: + break; + } + return (1); +} + +/* + * Query which CPU this LWP is running on. 
+ */ +uint_t +__cp_getcpu(comm_page_t *cp) +{ + return (__cp_do_getcpu(cp->cp_tsc_type)); +} diff --git a/usr/src/lib/commpage/i386/cp_subr.s b/usr/src/lib/commpage/i386/cp_subr.s new file mode 100644 index 0000000000..3dda195c12 --- /dev/null +++ b/usr/src/lib/commpage/i386/cp_subr.s @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> +#include <sys/time_impl.h> +#include <sys/tsc.h> + +#define GETCPU_GDT_OFFSET SEL_GDT(GDT_CPUID, SEL_UPL) + + .file "cp_subr.s" + +/* + * hrtime_t + * __cp_tsc_read(uint_t cp_tsc_type) + */ + ENTRY_NP(__cp_tsc_read) + movl 4(%esp), %eax + cmpl $TSC_TSCP, %eax + je 1f + cmpl $TSC_RDTSC_MFENCE, %eax + je 2f + cmpl $TSC_RDTSC_LFENCE, %eax + je 3f + cmpl $TSC_RDTSC_CPUID, %eax + je 4f + ud2a /* abort with SIGILL */ +1: + rdtscp + ret +2: + mfence + rdtsc + ret +3: + lfence + rdtsc + ret +4: + pushl %ebx + xorl %eax, %eax + cpuid + rdtsc + popl %ebx + ret + SET_SIZE(__cp_tsc_read) + +/* + * hrtime_t + * __cp_tsc_readcpu(uint_t cp_tsc_type, uint_t *cpu_id) + */ + ENTRY_NP(__cp_tsc_readcpu) + /* + * Both time and cpu_id can be queried quickly (using few registers) on + * systems which support RDTSCP. On each cpu, the cpu_id is stored in + * the TSC_AUX MSR by the kernel. 
+ */ + movl 4(%esp), %eax + cmpl $TSC_TSCP, %eax + jne 1f + rdtscp + pushl %eax + movl 0xc(%esp), %eax + movl %ecx, (%eax) + popl %eax + ret +1: + /* + * Since the other methods of querying the TSC and cpu_id are + * vulnerable to CPU migrations, build a proper stack frame so a more + * complicated and thorough check can be performed. + */ + pushl %ebp + movl %esp, %ebp + pushl %edi + pushl %esi + movl %eax, %edi +2: + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + movl %eax, %esi + cmpl $TSC_RDTSC_MFENCE, %edi + je 3f + cmpl $TSC_RDTSC_LFENCE, %edi + je 4f + cmpl $TSC_RDTSC_CPUID, %edi + je 5f + ud2a /* abort with SIGILL */ +3: + mfence + rdtsc + jmp 6f +4: + lfence + rdtsc + jmp 6f +5: + pushl %ebx + xorl %eax, %eax + cpuid + rdtsc + popl %ebx +6: + /* + * With a TSC reading in-hand, confirm that the thread has not migrated + * since the cpu_id was first checked. + */ + movl %eax, %ecx + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + cmpl %eax, %esi + jne 2b + movl 0xc(%ebp), %edi + mov %eax, (%edi) + movl %ecx, %eax + popl %esi + popl %edi + leave + ret + SET_SIZE(__cp_tsc_readcpu) + +/* + * uint_t + * __cp_do_getcpu(uint_t cp_tsc_type) + */ + ENTRY_NP(__cp_do_getcpu) + /* + * If RDTSCP is available, it is a quick way to grab the cpu_id which + * is stored in the TSC_AUX MSR by the kernel. + */ + movl 4(%esp), %eax + cmpl $TSC_TSCP, %eax + jne 1f + rdtscp + movl %ecx, %eax + ret +1: + mov $GETCPU_GDT_OFFSET, %eax + lsl %ax, %eax + ret + SET_SIZE(__cp_do_getcpu) diff --git a/usr/src/lib/libc/Makefile.targ b/usr/src/lib/libc/Makefile.targ index 9f9f6d2dfb..bb9ccf467d 100644 --- a/usr/src/lib/libc/Makefile.targ +++ b/usr/src/lib/libc/Makefile.targ @@ -21,6 +21,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2016 Joyent, Inc.
# # @@ -93,6 +94,10 @@ pics/%.o: $(LIBCBASE)/../$(MACH)/sys/%.s $(BUILD.s) $(POST_PROCESS_O) +pics/%.o: $(LIBCBASE)/../$(MACH)/sys/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + pics/%.o: $(LIBCBASE)/../$(TARGET_ARCH)/sys/%.c $(COMPILE.c) -o $@ $< $(POST_PROCESS_O) diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index 12c0d96168..be55c90da9 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -20,7 +20,7 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2015, Joyent, Inc. All rights reserved. +# Copyright 2016 Joyent, Inc. # # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> @@ -36,6 +36,10 @@ VERS= .1 CPP= /usr/lib/cpp TARGET_ARCH= amd64 +# include comm page definitions +include $(SRC)/lib/commpage/Makefile.shared.targ +include $(SRC)/lib/commpage/Makefile.shared.com + # objects are grouped by source directory # local objects @@ -106,6 +110,7 @@ COMOBJS= \ strtoull.o GENOBJS= \ + $(COMMPAGE_OBJS) \ _getsp.o \ abs.o \ alloca.o \ @@ -280,6 +285,7 @@ COMSYSOBJS= \ SYSOBJS= \ __clock_gettime.o \ + __clock_gettime_sys.o \ __getcontext.o \ __uadmin.o \ _lwp_mutex_unlock.o \ @@ -1181,6 +1187,8 @@ $(PORTI18N_COND:%=pics/%) := \ pics/arc4random.o := CPPFLAGS += -I$(SRC)/common/crypto/chacha +pics/__clock_gettime.o := CPPFLAGS += $(COMMPAGE_CPPFLAGS) + .KEEP_STATE: all: $(LIBS) $(LIB_PIC) diff --git a/usr/src/lib/libc/amd64/sys/__clock_gettime.s b/usr/src/lib/libc/amd64/sys/__clock_gettime_sys.s index 0a98de28a9..bef188c18e 100644 --- a/usr/src/lib/libc/amd64/sys/__clock_gettime.s +++ b/usr/src/lib/libc/amd64/sys/__clock_gettime_sys.s @@ -21,25 +21,28 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ - .file "__clock_gettime.s" + .file "__clock_gettime_sys.s" +#include <sys/asm_linkage.h> #include <sys/time_impl.h> #include "SYS.h" + /* * int - * __clock_gettime(clockid_t clock_id, timespec_t *tp) + * __clock_gettime_sys(clockid_t clock_id, timespec_t *tp) */ - ENTRY(__clock_gettime) + ENTRY(__clock_gettime_sys) cmpl $__CLOCK_REALTIME0, %edi /* if (clock_id) */ - je 2f /* equal to __CLOCK_REALTIME0 */ + je 2f /* equals __CLOCK_REALTIME0 */ cmpl $CLOCK_REALTIME, %edi /* or if (clock_id) */ - jne 1f /* equal to CLOCK_REALTIME */ + jne 1f /* equals CLOCK_REALTIME */ 2: - pushq %rsi /* preserve the timespec_t pointer */ + pushq %rsi /* preserve timespec_t ptr */ SYSFASTTRAP(GETHRESTIME) popq %rsi movq %rax, (%rsi) @@ -49,4 +52,4 @@ SYSTRAP_RVAL1(clock_gettime) SYSCERROR RETC - SET_SIZE(__clock_gettime) + SET_SIZE(__clock_gettime_sys) diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index c03b4e7468..bf3c62519e 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -20,7 +20,7 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2015, Joyent, Inc. All rights reserved. +# Copyright 2016 Joyent, Inc. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. 
# Copyright 2013 Garrett D'Amore <garrett@damore.org> # @@ -34,6 +34,10 @@ VERS= .1 CPP= /usr/lib/cpp TARGET_ARCH= i386 +# include comm page definitions +include $(SRC)/lib/commpage/Makefile.shared.targ +include $(SRC)/lib/commpage/Makefile.shared.com + VALUES= values-Xa.o # objects are grouped by source directory @@ -110,6 +114,7 @@ DTRACEOBJS= \ dtrace_data.o GENOBJS= \ + $(COMMPAGE_OBJS) \ _div64.o \ _divdi3.o \ _getsp.o \ @@ -304,6 +309,7 @@ COMSYSOBJS= \ SYSOBJS= \ __clock_gettime.o \ + __clock_gettime_sys.o \ __getcontext.o \ __uadmin.o \ _lwp_mutex_unlock.o \ @@ -1249,6 +1255,8 @@ $(PORTI18N_COND:%=pics/%) := \ pics/arc4random.o := CPPFLAGS += -I$(SRC)/common/crypto/chacha +pics/__clock_gettime.o := CPPFLAGS += $(COMMPAGE_CPPFLAGS) + .KEEP_STATE: all: $(LIBS) $(LIB_PIC) diff --git a/usr/src/lib/libc/i386/sys/__clock_gettime.c b/usr/src/lib/libc/i386/sys/__clock_gettime.c new file mode 100644 index 0000000000..cdc40f7747 --- /dev/null +++ b/usr/src/lib/libc/i386/sys/__clock_gettime.c @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + + + +#include "thr_uberdata.h" +#include <cp_defs.h> + +extern int __clock_gettime_sys(clockid_t, timespec_t *); + +int +__clock_gettime(clockid_t clock_id, timespec_t *tp) +{ + comm_page_t *cp = (comm_page_t *)__uberdata.ub_comm_page; + + if (cp != NULL && __cp_can_gettime(cp) == 0) { + switch (clock_id) { + case __CLOCK_REALTIME0: + case CLOCK_REALTIME: + __cp_clock_gettime_realtime(cp, tp); + return (0); + + case CLOCK_MONOTONIC: + hrt2ts(__cp_gethrtime(cp), tp); + return (0); + + default: + /* Fallback */ + break; + } + } + return (__clock_gettime_sys(clock_id, tp)); +} diff --git a/usr/src/lib/libc/i386/sys/__clock_gettime.s b/usr/src/lib/libc/i386/sys/__clock_gettime_sys.s index cf9ee306ff..57c831843a 100644 --- a/usr/src/lib/libc/i386/sys/__clock_gettime.s +++ b/usr/src/lib/libc/i386/sys/__clock_gettime_sys.s @@ -21,24 +21,26 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ - .file "__clock_gettime.s" + .file "__clock_gettime_sys.s" +#include <sys/asm_linkage.h> #include <sys/time_impl.h> #include "SYS.h" /* * int - * __clock_gettime(clockid_t clock_id, timespec_t *tp) + * __sys_clock_gettime(clockid_t clock_id, timespec_t *tp) */ - ENTRY(__clock_gettime) + ENTRY(__clock_gettime_sys) movl 4(%esp), %eax - cmpl $__CLOCK_REALTIME0, %eax / if (clock_id) - je 2f / equal to __CLOCK_REALTIME0 - cmpl $CLOCK_REALTIME, %eax / or if (clock_id) - jne 1f / equal to CLOCK_REALTIME + cmpl $__CLOCK_REALTIME0, %eax /* if (clock_id) */ + je 2f /* equal to __CLOCK_REALTIME0 */ + cmpl $CLOCK_REALTIME, %eax /* or if (clock_id) */ + jne 1f /* equal to CLOCK_REALTIME */ 2: SYSFASTTRAP(GETHRESTIME) movl 8(%esp), %ecx @@ -49,4 +51,4 @@ SYSTRAP_RVAL1(clock_gettime) SYSCERROR RETC - SET_SIZE(__clock_gettime) + SET_SIZE(__clock_gettime_sys) diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h index 4c5dcb19ff..27bb892620 100644 --- 
a/usr/src/lib/libc/inc/thr_uberdata.h +++ b/usr/src/lib/libc/inc/thr_uberdata.h @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2015, Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #ifndef _THR_UBERDATA_H @@ -946,6 +946,7 @@ typedef struct uberdata { robust_t *robustlist; /* list of registered robust locks */ char *progname; /* the basename of the program, from argv[0] */ char *ub_broot; /* the root of the native code in the brand */ + void *ub_comm_page; /* arch-specific comm page of kernel data */ struct uberdata **tdb_bootstrap; tdb_t tdb; /* thread debug interfaces (for libc_db) */ } uberdata_t; @@ -1159,6 +1160,7 @@ typedef struct uberdata32 { caddr32_t robustlocks; caddr32_t robustlist; caddr32_t progname; + caddr32_t ub_comm_page; caddr32_t tdb_bootstrap; tdb32_t tdb; } uberdata32_t; diff --git a/usr/src/lib/libc/port/threads/thr.c b/usr/src/lib/libc/port/threads/thr.c index 485d7f8edf..81a07ea21b 100644 --- a/usr/src/lib/libc/port/threads/thr.c +++ b/usr/src/lib/libc/port/threads/thr.c @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #include "lint.h" @@ -138,6 +138,7 @@ uberdata_t __uberdata = { NULL, /* robustlist */ NULL, /* progname */ NULL, /* ub_broot */ + NULL, /* ub_comm_page */ NULL, /* __tdb_bootstrap */ { /* tdb */ NULL, /* tdb_sync_addr_hash */ @@ -1222,18 +1223,23 @@ extern void __proc64id(void); #endif static void -init_brandroot(uberdata_t *udp) +init_auxv_data(uberdata_t *udp) { Dl_argsinfo_t args; udp->ub_broot = NULL; + udp->ub_comm_page = NULL; if (dlinfo(RTLD_SELF, RTLD_DI_ARGSINFO, &args) < 0) return; while (args.dla_auxv->a_type != AT_NULL) { - if (args.dla_auxv->a_type == AT_SUN_BRAND_NROOT) { + switch (args.dla_auxv->a_type) { + case AT_SUN_BRAND_NROOT: udp->ub_broot = args.dla_auxv->a_un.a_ptr; - return; + break; + case AT_SUN_COMMPAGE: + udp->ub_comm_page = args.dla_auxv->a_un.a_ptr; + break; } args.dla_auxv++; } @@ -1275,11 +1281,12 @@ libc_init(void) (void) _atexit(__cleanup); /* - * Every libc, regardless of link map, needs to go through and check its - * aux vectors so as to indicate whether or not this has been given a - * brand root with which we use to qualify various other data. + * Every libc, regardless of link map, needs to go through and check + * its aux vectors. Doing so will indicate whether or not this has + * been given a brand root (used to qualify various other data) or a + * comm page (to optimize certain system actions). 
*/ - init_brandroot(udp); + init_auxv_data(udp); /* * We keep our uberdata on one of (a) the first alternate link map diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 445c4f269e..fb317edf68 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -304,6 +304,7 @@ GENUNIX_OBJS += \ seg_map.o \ seg_vn.o \ seg_spt.o \ + seg_umap.o \ semaphore.o \ sendfile.o \ session.o \ diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index f1b4473260..14f309a0c3 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -66,6 +66,11 @@ #include <sys/sdt.h> #include <sys/siginfo.h> +#if defined(__x86) && !defined(__xpv) +#include <sys/comm_page.h> +#endif /* defined(__x86) && !defined(__xpv) */ + + extern int at_flags; #define ORIGIN_STR "ORIGIN" @@ -580,6 +585,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += sizeof (aux_entry_t); } + + /* + * On supported kernels (64-bit, non-xpv) make room in the auxv for the + * AT_SUN_COMMPAGE entry. + */ +#if defined(__amd64) && !defined(__xpv) + args->auxsize += sizeof (aux_entry_t); +#endif /* defined(__amd64) && !defined(__xpv) */ + /* * If we have user credentials, we'll supply the following entries: * AT_SUN_UID @@ -859,6 +873,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (hasauxv) { int auxf = AF_SUN_HWCAPVERIFY; + /* * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were * filled in via exec_args() @@ -946,6 +961,16 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } + /* + * Add the comm page auxv entry, mapping it in if needed. 
+ */ +#if defined(__amd64) && !defined(__xpv) + if (args->commpage != NULL || + (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) { + ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage) + } +#endif /* defined(__amd64) && !defined(__xpv) */ + ADDAUX(aux, AT_NULL, 0) postfixsize = (char *)aux - (char *)bigwad->elfargs; @@ -985,6 +1010,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, } bzero(up->u_auxv, sizeof (up->u_auxv)); + up->u_commpagep = args->commpage; if (postfixsize) { int num_auxv; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index a917211988..e86fe138e3 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -4331,6 +4331,7 @@ zsched(void *arg) PTOU(pp)->u_argc = 0; PTOU(pp)->u_argv = NULL; PTOU(pp)->u_envp = NULL; + PTOU(pp)->u_commpagep = NULL; closeall(P_FINFO(pp)); /* diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index f4f3416946..48b94e2951 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -29,7 +29,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_AUXV_H @@ -202,6 +202,11 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_NROOT 2024 /* + * Aux vector for comm page + */ +#define AT_SUN_COMMPAGE 2026 + +/* * Note that 2023 is reserved for the AT_SUN_HWCAP2 word defined above. 
*/ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index 3044f865e6..b2db3f2987 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -113,6 +113,7 @@ typedef struct uarg { boolean_t scrubenv; uintptr_t maxstack; boolean_t stk_prot_override; + uintptr_t commpage; } uarg_t; /* diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index 02e06256eb..66250a3f2b 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2012 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ @@ -211,6 +211,7 @@ typedef struct user { int u_argc; /* value of argc passed to main() */ uintptr_t u_argv; /* value of argv passed to main() */ uintptr_t u_envp; /* value of envp passed to main() */ + uintptr_t u_commpagep; /* address of mapped comm page */ /* * These fields are protected by p_lock: diff --git a/usr/src/uts/common/vm/seg_kmem.c b/usr/src/uts/common/vm/seg_kmem.c index 90e1b73b70..439c859d96 100644 --- a/usr/src/uts/common/vm/seg_kmem.c +++ b/usr/src/uts/common/vm/seg_kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -773,7 +774,7 @@ segkmem_capable(struct seg *seg, segcapability_t capability) return (0); } -static struct seg_ops segkmem_ops = { +struct seg_ops segkmem_ops = { SEGKMEM_BADOP(int), /* dup */ SEGKMEM_BADOP(int), /* unmap */ SEGKMEM_BADOP(void), /* free */ diff --git a/usr/src/uts/common/vm/seg_kmem.h b/usr/src/uts/common/vm/seg_kmem.h index 2a4ed3b2aa..3ad4202e91 100644 --- a/usr/src/uts/common/vm/seg_kmem.h +++ b/usr/src/uts/common/vm/seg_kmem.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _VM_SEG_KMEM_H @@ -136,6 +137,8 @@ extern size_t segkmem_kmemlp_max; #define IS_KMEM_VA_LARGEPAGE(vaddr) \ (((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) +extern struct seg_ops segkmem_ops; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/vm/seg_umap.c b/usr/src/uts/common/vm/seg_umap.c new file mode 100644 index 0000000000..ccad71c5d6 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.c @@ -0,0 +1,466 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * VM - Kernel-to-user mapping segment + * + * The umap segment driver was primarily designed to facilitate the comm page: + * a portion of kernel memory shared with userspace so that certain (namely + * clock-related) actions could operate without making an expensive trip into + * the kernel. + * + * Since the initial requirements for the comm page are slim, advanced features + * of the segment driver such as per-page protection have been left + * unimplemented at this time. 
+ */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/lgrp.h> +#include <sys/mman.h> + +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_umap.h> + + +static boolean_t segumap_verify_safe(caddr_t, size_t); +static int segumap_dup(struct seg *, struct seg *); +static int segumap_unmap(struct seg *, caddr_t, size_t); +static void segumap_free(struct seg *); +static faultcode_t segumap_fault(struct hat *, struct seg *, caddr_t, size_t, + enum fault_type, enum seg_rw); +static faultcode_t segumap_faulta(struct seg *, caddr_t); +static int segumap_setprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_checkprot(struct seg *, caddr_t, size_t, uint_t); +static int segumap_sync(struct seg *, caddr_t, size_t, int, uint_t); +static size_t segumap_incore(struct seg *, caddr_t, size_t, char *); +static int segumap_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *, + size_t); +static int segumap_getprot(struct seg *, caddr_t, size_t, uint_t *); +static u_offset_t segumap_getoffset(struct seg *, caddr_t); +static int segumap_gettype(struct seg *, caddr_t); +static int segumap_getvp(struct seg *, caddr_t, struct vnode **); +static int segumap_advise(struct seg *, caddr_t, size_t, uint_t); +static void segumap_dump(struct seg *); +static int segumap_pagelock(struct seg *, caddr_t, size_t, struct page ***, + enum lock_type, enum seg_rw); +static int segumap_setpagesize(struct seg *, caddr_t, size_t, uint_t); +static int segumap_getmemid(struct seg *, caddr_t, memid_t *); +static int segumap_capable(struct seg *, segcapability_t); + +static struct seg_ops segumap_ops = { + segumap_dup, + segumap_unmap, + segumap_free, + segumap_fault, + segumap_faulta, + segumap_setprot, + segumap_checkprot, + NULL, /* kluster: disabled */ + NULL, /* swapout: disabled */ + segumap_sync, + segumap_incore, + segumap_lockop, + 
segumap_getprot, + segumap_getoffset, + segumap_gettype, + segumap_getvp, + segumap_advise, + segumap_dump, + segumap_pagelock, + segumap_setpagesize, + segumap_getmemid, + NULL, /* getpolicy: disabled */ + segumap_capable, + seg_inherit_notsup +}; + + +/* + * Create a kernel/user-mapped segment. + */ +int +segumap_create(struct seg *seg, void *argsp) +{ + segumap_crargs_t *a = (struct segumap_crargs *)argsp; + segumap_data_t *data; + + ASSERT((uintptr_t)a->kaddr > _userlimit); + + /* + * Check several aspects of the mapping request to ensure validity: + * - kernel pages must reside entirely in kernel space + * - target protection must be user-accessible + * - kernel address must be page-aligned + * - kernel address must reside inside a "safe" segment + */ + if ((uintptr_t)a->kaddr <= _userlimit || + ((uintptr_t)a->kaddr + seg->s_size) < (uintptr_t)a->kaddr || + (a->prot & PROT_USER) == 0 || + ((uintptr_t)a->kaddr & PAGEOFFSET) != 0 || + !segumap_verify_safe(a->kaddr, seg->s_size)) { + return (EINVAL); + } + + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + rw_init(&data->sud_lock, NULL, RW_DEFAULT, NULL); + data->sud_kaddr = a->kaddr; + data->sud_prot = a->prot; + data->sud_loaded = B_FALSE; + + seg->s_ops = &segumap_ops; + seg->s_data = data; + return (0); +} + +static boolean_t +segumap_verify_safe(caddr_t kaddr, size_t len) +{ + struct seg *seg; + + /* + * Presently, only pages which are backed by segkmem are allowed to be + * shared with userspace. This prevents nasty paging behavior with + * other drivers such as seg_kp. Furthermore, the backing kernel + * segment must completely contain the region to be mapped. + * + * Failing these checks is fatal for now since such mappings are done + * in a very limited context from the kernel. 
+ */ + AS_LOCK_ENTER(&kas, RW_READER); + seg = as_segat(&kas, kaddr); + VERIFY(seg != NULL); + VERIFY(seg->s_base + seg->s_size >= kaddr + len); + VERIFY(seg->s_ops == &segkmem_ops); + AS_LOCK_EXIT(&kas); + + return (B_TRUE); +} + +static int +segumap_dup(struct seg *seg, struct seg *newseg) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + segumap_data_t *newsud; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + newsud = kmem_zalloc(sizeof (segumap_data_t), KM_SLEEP); + rw_init(&newsud->sud_lock, NULL, RW_DEFAULT, NULL); + newsud->sud_kaddr = sud->sud_kaddr; + newsud->sud_prot = sud->sud_prot; + newsud->sud_loaded = B_FALSE; + + newseg->s_ops = seg->s_ops; + newseg->s_data = newsud; + return (0); +} + +static int +segumap_unmap(struct seg *seg, caddr_t addr, size_t len) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); + + /* Only allow unmap of entire segment */ + if (addr != seg->s_base || len != seg->s_size) { + return (EINVAL); + } + if (sud->sud_softlockcnt != 0) { + return (EAGAIN); + } + + hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP); + /* + * While setting this field before immediately freeing the segment is + * not necessary, it is done for the sake of completeness. Doing so + * outside sud_lock is safe with the AS write-locked. 
+ */ + sud->sud_loaded = B_FALSE; + + seg_free(seg); + return (0); +} + +static void +segumap_free(struct seg *seg) +{ + segumap_data_t *data = (segumap_data_t *)seg->s_data; + + ASSERT(data != NULL); + + rw_destroy(&data->sud_lock); + VERIFY(data->sud_loaded == B_FALSE); + VERIFY(data->sud_softlockcnt == 0); + kmem_free(data, sizeof (*data)); + seg->s_data = NULL; +} + +/* ARGSUSED */ +static faultcode_t +segumap_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, + enum fault_type type, enum seg_rw tw) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + if (type == F_PROT) { + /* + * Since protection on the segment is fixed, there is nothing + * to do but report an error for protection faults. + */ + return (FC_PROT); + } else if (type == F_SOFTUNLOCK) { + size_t plen = btop(len); + + rw_enter(&sud->sud_lock, RW_WRITER); + VERIFY(sud->sud_softlockcnt >= plen); + sud->sud_softlockcnt -= plen; + rw_exit(&sud->sud_lock); + return (0); + } + + ASSERT(type == F_INVAL || type == F_SOFTLOCK); + rw_enter(&sud->sud_lock, RW_WRITER); + + if (type == F_INVAL && sud->sud_loaded) { + rw_exit(&sud->sud_lock); + return (FC_NOMAP); + } + + /* + * Load the (entire) segment into the HAT if it has not been done so. + */ + if (!sud->sud_loaded) { + for (uintptr_t i = 0; i < seg->s_size; i += PAGESIZE) { + pfn_t pfn; + + pfn = hat_getpfnum(kas.a_hat, sud->sud_kaddr + i); + VERIFY(pfn != PFN_INVALID); + hat_devload(seg->s_as->a_hat, seg->s_base + i, + PAGESIZE, pfn, sud->sud_prot, HAT_LOAD); + } + sud->sud_loaded = B_TRUE; + } else { + /* + * If the segment has already been loaded, there is no + * reason to take an F_INVAL fault.
+ */ + VERIFY(type != F_INVAL); + } + + if (type == F_SOFTLOCK) { + size_t nval = sud->sud_softlockcnt + btop(len); + + if (sud->sud_softlockcnt >= nval) { + rw_exit(&sud->sud_lock); + return (FC_MAKE_ERR(EOVERFLOW)); + } + sud->sud_softlockcnt = nval; + } + rw_exit(&sud->sud_lock); + return (0); +} + +/* ARGSUSED */ +static faultcode_t +segumap_faulta(struct seg *seg, caddr_t addr) +{ + /* Do nothing since asynch pagefault should not load translation. */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + /* + * The seg_umap driver does not yet allow protection to be changed. + */ + return (EACCES); +} + +/* ARGSUSED */ +static int +segumap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + int error = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + if ((sud->sud_prot & prot) != prot) { + error = EACCES; + } + rw_exit(&sud->sud_lock); + return (error); +} + +/* ARGSUSED */ +static int +segumap_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) +{ + /* Always succeed since there is no backing store to sync */ + return (0); +} + +/* ARGSUSED */ +static size_t +segumap_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) +{ + size_t sz = 0; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + len = (len + PAGEOFFSET) & PAGEMASK; + while (len > 0) { + *vec = 1; + sz += PAGESIZE; + vec++; + len -= PAGESIZE; + } + return (sz); +} + +/* ARGSUSED */ +static int +segumap_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op, + ulong_t *lockmap, size_t pos) +{ + /* Report success since kernel pages are always in memory.
*/ + return (0); +} + +static int +segumap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + size_t pgno; + uint_t prot; + + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + rw_enter(&sud->sud_lock, RW_READER); + prot = sud->sud_prot; + rw_exit(&sud->sud_lock); + + /* + * Reporting protection is simple since it is not tracked per-page. + */ + pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; + while (pgno > 0) { + protv[--pgno] = prot; + } + return (0); +} + +/* ARGSUSED */ +static u_offset_t +segumap_getoffset(struct seg *seg, caddr_t addr) +{ + /* + * To avoid leaking information about the layout of the kernel address + * space, always report '0' as the offset. + */ + return (0); +} + +/* ARGSUSED */ +static int +segumap_gettype(struct seg *seg, caddr_t addr) +{ + /* + * Since already-existing kernel pages are being mapped into userspace, + * always report the segment type as shared. + */ + return (MAP_SHARED); +} + +/* ARGSUSED */ +static int +segumap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) +{ + ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); + + *vpp = NULL; + return (0); +} + +/* ARGSUSED */ +static int +segumap_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) +{ + if (behav == MADV_PURGE) { + /* Purge does not make sense for this mapping */ + return (EINVAL); + } + /* Indicate success for everything else. */ + return (0); +} + +/* ARGSUSED */ +static void +segumap_dump(struct seg *seg) +{ + /* + * Since this is a mapping to share kernel data with userspace, nothing + * additional should be dumped. 
+ */ +} + +/* ARGSUSED */ +static int +segumap_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, + enum lock_type type, enum seg_rw rw) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +segumap_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) +{ + return (ENOTSUP); +} + +static int +segumap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) +{ + segumap_data_t *sud = (segumap_data_t *)seg->s_data; + + memidp->val[0] = (uintptr_t)sud->sud_kaddr; + memidp->val[1] = (uintptr_t)(addr - seg->s_base); + return (0); +} + +/* ARGSUSED */ +static int +segumap_capable(struct seg *seg, segcapability_t capability) +{ + /* no special capabilities */ + return (0); +} diff --git a/usr/src/uts/common/vm/seg_umap.h b/usr/src/uts/common/vm/seg_umap.h new file mode 100644 index 0000000000..bcf7447509 --- /dev/null +++ b/usr/src/uts/common/vm/seg_umap.h @@ -0,0 +1,43 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc.
+ */ + +#ifndef _VM_SEG_UMAP_H +#define _VM_SEG_UMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct segumap_crargs { + caddr_t kaddr; + uchar_t prot; /* protection */ + uchar_t maxprot; /* maximum protection */ +} segumap_crargs_t; + +typedef struct segumap_data { + krwlock_t sud_lock; + caddr_t sud_kaddr; + uchar_t sud_prot; + size_t sud_softlockcnt; + boolean_t sud_loaded; +} segumap_data_t; + +extern int segumap_create(struct seg *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_UMAP_H */ diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 9829939b16..ef7a36d09c 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -23,6 +23,7 @@ # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # # Copyright (c) 2010, Intel Corporation. +# Copyright 2016 Joyent, Inc. # # This Makefile defines file modules in the directory uts/i86pc # and its children. These are the source files which are i86pc @@ -40,6 +41,8 @@ CORE_OBJS += \ cmi.o \ cmi_hw.o \ cms.o \ + comm_page.o \ + comm_page_util.o \ confunix.o \ cpu_idle.o \ cpuid.o \ diff --git a/usr/src/uts/i86pc/ml/comm_page.s b/usr/src/uts/i86pc/ml/comm_page.s new file mode 100644 index 0000000000..7ff803ea93 --- /dev/null +++ b/usr/src/uts/i86pc/ml/comm_page.s @@ -0,0 +1,88 @@ + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/param.h> +#include <sys/comm_page.h> +#include <sys/tsc.h> + +#if defined(__lint) + +hrtime_t tsc_last; +hrtime_t tsc_resume_cap; +hrtime_t tsc_hrtime_base; +uint32_t tsc_max_delta; +volatile uint32_t hres_lock; +uint32_t tsc_type; +uint32_t nsec_scale; +int64_t hrestime_adj; +hrtime_t hres_last_tick; +uint32_t tsc_ncpu; +volatile timestruc_t hrestime; +hrtime_t tsc_sync_tick_delta[NCPU]; + +comm_page_t comm_page; + +#else /* defined(__lint) */ + +#include "assym.h" + +/* + * x86 Comm Page + * + * This is the definition for the comm page on x86. The purpose of this struct + * is to consolidate certain pieces of kernel state into one contiguous section + * of memory in order for it to be exposed (read-only) to userspace. The + * struct contents are defined by hand so that member variables will maintain + * their original symbols for use throughout the rest of the kernel. This + * layout must exactly match the C definition of comm_page_t. 
+ * See: "uts/i86pc/sys/comm_page.h" + */ + + .data + DGDEF3(comm_page, COMM_PAGE_S_SIZE, 4096) + DGDEF2(tsc_last, 8) + .fill 1, 8, 0 + DGDEF2(tsc_hrtime_base, 8) + .fill 1, 8, 0 + DGDEF2(tsc_resume_cap, 8) + .fill 1, 8, 0 + DGDEF2(tsc_type, 4); + .fill 1, 4, _CONST(TSC_RDTSC_CPUID) + DGDEF2(tsc_max_delta, 4); + .fill 1, 4, 0 + DGDEF2(hres_lock, 4); + .fill 1, 4, 0 + DGDEF2(nsec_scale, 4); + .fill 1, 4, 0 + DGDEF2(hrestime_adj, 8) + .fill 1, 8, 0 + DGDEF2(hres_last_tick, 8) + .fill 1, 8, 0 + DGDEF2(tsc_ncpu, 4) + .fill 1, 4, 0 + /* _cp_pad */ + .fill 1, 4, 0 + DGDEF2(hrestime, _MUL(2, 8)) + .fill 2, 8, 0 + DGDEF2(tsc_sync_tick_delta, _MUL(NCPU, 8)) + .fill _CONST(NCPU), 8, 0 + + /* pad out the rest of the page from the struct end */ + .fill _CONST(COMM_PAGE_SIZE - COMM_PAGE_S_SIZE), 1, 0 + +#endif /* defined(__lint) */ diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 4253c644c1..a1f83d3cf8 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,7 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. -\ Copyright 2015 Joyent, Inc. +\ Copyright 2016 Joyent, Inc. \ \ CDDL HEADER START \ @@ -62,6 +62,7 @@ #include <sys/brand.h> #include <sys/fastboot.h> #include <sys/cpr_wakecode.h> +#include <sys/comm_page.h> proc PROCSIZE p_link @@ -470,3 +471,5 @@ wc_cpu WC_CPU_SIZE wc_wakecode wc_cpu + +comm_page_s COMM_PAGE_S_SIZE diff --git a/usr/src/uts/i86pc/os/comm_page_util.c b/usr/src/uts/i86pc/os/comm_page_util.c new file mode 100644 index 0000000000..3c635fe79b --- /dev/null +++ b/usr/src/uts/i86pc/os/comm_page_util.c @@ -0,0 +1,62 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + + +#include <sys/types.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <vm/as.h> +#include <vm/seg_umap.h> + +#if defined(__x86) && !defined(__xpv) +#include <sys/comm_page.h> +#endif /* defined(__x86) && !defined(__xpv) */ + +/* + * Map in the comm page. + * + * The contents of the comm page are only defined on non-xpv x86 at this time. + * Furthermore, the data is only valid in userspace (32-bit or 64-bit) when + * mapped from a 64-bit kernel. + * See: "uts/i86pc/sys/comm_page.h" + */ +caddr_t +comm_page_mapin() +{ +#if defined(__amd64) && !defined(__xpv) + proc_t *p = curproc; + caddr_t addr = NULL; + size_t len = COMM_PAGE_SIZE; + uint_t prot = PROT_USER | PROT_READ; + segumap_crargs_t suarg; + + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as, + p->p_as->a_userlimit) != RANGE_OKAY) { + return (NULL); + } + + suarg.kaddr = (caddr_t)&comm_page; + suarg.prot = suarg.maxprot = prot; + if (as_map(p->p_as, addr, len, segumap_create, &suarg) != 0) { + return (NULL); + } + return (addr); +#else /* defined(__amd64) && !defined(__xpv) */ + return (NULL); +#endif /* defined(__amd64) && !defined(__xpv) */ +} diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 9b55ba9553..027ed29c3d 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -32,7 +32,7 @@ * Portions Copyright 2009 Advanced Micro Devices, Inc. */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* * Various routines to handle identification @@ -57,6 +57,8 @@ #include <sys/auxv_386.h> #include <sys/memnode.h> #include <sys/pci_cfgspace.h> +#include <sys/comm_page.h> +#include <sys/tsc.h> #ifdef __xpv #include <sys/hypervisor.h> @@ -4614,27 +4616,30 @@ patch_tsc_read(int flag) size_t cnt; switch (flag) { - case X86_NO_TSC: + case TSC_NONE: cnt = &_no_rdtsc_end - &_no_rdtsc_start; (void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt); break; - case X86_HAVE_TSCP: - cnt = &_tscp_end - &_tscp_start; - (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt); - break; - case X86_TSC_MFENCE: + case TSC_RDTSC_MFENCE: cnt = &_tsc_mfence_end - &_tsc_mfence_start; (void) memcpy((void *)tsc_read, (void *)&_tsc_mfence_start, cnt); break; - case X86_TSC_LFENCE: + case TSC_RDTSC_LFENCE: cnt = &_tsc_lfence_end - &_tsc_lfence_start; (void) memcpy((void *)tsc_read, (void *)&_tsc_lfence_start, cnt); break; + case TSC_TSCP: + cnt = &_tscp_end - &_tscp_start; + (void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt); + break; default: + /* Bail for unexpected TSC types. (TSC_NONE covers 0) */ + cmn_err(CE_PANIC, "Unrecogized TSC type: %d", flag); break; } + tsc_type = flag; } int diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 045adbcb7b..438f83b6e9 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -23,6 +23,7 @@ * * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -61,6 +62,7 @@ #include <sys/promif.h> #include <sys/pci_cfgspace.h> #include <sys/bootvfs.h> +#include <sys/tsc.h> #ifdef __xpv #include <sys/hypervisor.h> #else @@ -227,15 +229,15 @@ mlsetup(struct regs *rp) */ if ((get_hwenv() & HW_XEN_HVM) == 0 && is_x86_feature(x86_featureset, X86FSET_TSCP)) - patch_tsc_read(X86_HAVE_TSCP); + patch_tsc_read(TSC_TSCP); else if (cpuid_getvendor(CPU) == X86_VENDOR_AMD && cpuid_getfamily(CPU) <= 0xf && is_x86_feature(x86_featureset, X86FSET_SSE2)) - patch_tsc_read(X86_TSC_MFENCE); + patch_tsc_read(TSC_RDTSC_MFENCE); else if (cpuid_getvendor(CPU) == X86_VENDOR_Intel && cpuid_getfamily(CPU) <= 6 && is_x86_feature(x86_featureset, X86FSET_SSE2)) - patch_tsc_read(X86_TSC_LFENCE); + patch_tsc_read(TSC_RDTSC_LFENCE); #endif /* !__xpv */ @@ -246,7 +248,7 @@ mlsetup(struct regs *rp) * return 0. */ if (!is_x86_feature(x86_featureset, X86FSET_TSC)) - patch_tsc_read(X86_NO_TSC); + patch_tsc_read(TSC_NONE); #endif /* __i386 && !__xpv */ #if defined(__amd64) && !defined(__xpv) diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 6ded04c1b5..829c631096 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -27,7 +27,7 @@ * All rights reserved. */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -249,6 +249,24 @@ init_cpu_syscall(struct cpu *cp) kpreempt_enable(); } +#if !defined(__xpv) +/* + * Configure per-cpu ID GDT + */ +static void +init_cpu_id_gdt(struct cpu *cp) +{ + /* Write cpu_id into limit field of GDT for usermode retrieval */ +#if defined(__amd64) + set_usegd(&cp->cpu_gdt[GDT_CPUID], SDP_SHORT, NULL, cp->cpu_id, + SDT_MEMRODA, SEL_UPL, SDP_BYTES, SDP_OP32); +#elif defined(__i386) + set_usegd(&cp->cpu_gdt[GDT_CPUID], NULL, cp->cpu_id, SDT_MEMRODA, + SEL_UPL, SDP_BYTES, SDP_OP32); +#endif +} +#endif /* !defined(__xpv) */ + /* * Multiprocessor initialization. 
* @@ -432,6 +450,10 @@ mp_cpu_configure_common(int cpun, boolean_t boot) init_cpu_info(cp); +#if !defined(__xpv) + init_cpu_id_gdt(cp); +#endif + /* * alloc space for ucode_info */ @@ -1488,6 +1510,10 @@ start_other_cpus(int cprboot) */ init_cpu_info(CPU); +#if !defined(__xpv) + init_cpu_id_gdt(CPU); +#endif + cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_idstr); cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_brandstr); diff --git a/usr/src/uts/i86pc/os/timestamp.c b/usr/src/uts/i86pc/os/timestamp.c index 3b478853ee..7344e1a492 100644 --- a/usr/src/uts/i86pc/os/timestamp.c +++ b/usr/src/uts/i86pc/os/timestamp.c @@ -25,7 +25,7 @@ * * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -48,6 +48,7 @@ #include <sys/panic.h> #include <sys/cpu.h> #include <sys/sdt.h> +#include <sys/comm_page.h> /* * Using the Pentium's TSC register for gethrtime() @@ -100,7 +101,6 @@ #define NSEC_SHIFT 5 -static uint_t nsec_scale; static uint_t nsec_unscale; /* @@ -141,16 +141,12 @@ static volatile int tsc_sync_go; int tsc_master_slave_sync_needed = 1; -static int tsc_max_delta; -static hrtime_t tsc_sync_tick_delta[NCPU]; typedef struct tsc_sync { volatile hrtime_t master_tsc, slave_tsc; } tsc_sync_t; static tsc_sync_t *tscp; -static hrtime_t tsc_last = 0; static hrtime_t tsc_last_jumped = 0; -static hrtime_t tsc_hrtime_base = 0; static int tsc_jumped = 0; static uint32_t tsc_wayback = 0; /* @@ -158,7 +154,6 @@ static uint32_t tsc_wayback = 0; * tsc_tick() function runs which means that when gethrtime() is called it * should never be more than 1 second since tsc_last was updated. 
*/ -static hrtime_t tsc_resume_cap; static hrtime_t tsc_resume_cap_ns = NANOSEC; /* 1s */ static hrtime_t shadow_tsc_hrtime_base; @@ -541,6 +536,7 @@ tsc_sync_master(processorid_t slave) if (last_delta > min_write_time) { gethrtimef = tsc_gethrtime_delta; gethrtimeunscaledf = tsc_gethrtimeunscaled_delta; + tsc_ncpu = NCPU; } restore_int_flag(flags); } @@ -682,6 +678,12 @@ tsc_hrtimeinit(uint64_t cpu_freq_hz) hrtime_tick = tsc_tick; gethrtime_hires = 1; /* + * Being part of the comm page, tsc_ncpu communicates the published + * length of the tsc_sync_tick_delta array. This is kept zeroed to + * ignore the absent delta data while the TSCs are synced. + */ + tsc_ncpu = 0; + /* * Allocate memory for the structure used in the tsc sync logic. * This structure should be aligned on a multiple of cache line size. */ @@ -718,6 +720,7 @@ tsc_adjust_delta(hrtime_t tdelta) gethrtimef = tsc_gethrtime_delta; gethrtimeunscaledf = tsc_gethrtimeunscaled_delta; + tsc_ncpu = NCPU; } /* diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h new file mode 100644 index 0000000000..9d94a27763 --- /dev/null +++ b/usr/src/uts/i86pc/sys/comm_page.h @@ -0,0 +1,101 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _COMM_PAGE_H +#define _COMM_PAGE_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#endif /* _ASM */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define COMM_PAGE_SIZE PAGESIZE + +#ifndef _ASM + +/* + * x86 comm page + * + * This struct defines the data format for the "comm page": kernel data made + * directly available to userspace for read-only operations. This enables + * facilities such as clock_gettime to operate entirely in userspace without + * the need for a trap or fasttrap. + * + * A note about 32-bit/64-bit compatibility: + * The current format of the comm page is designed to be consistent for both + * 32-bit and 64-bit programs running in a 64-bit kernel. On 32-bit kernels, + * the comm page is not exposed to userspace due to the difference in + * timespec_t sizing. + * + * This struct is instantiated "by hand" in assembly to preserve the global + * symbols it contains. That layout must be kept in sync with the structure + * defined here. 
+ * See: "uts/i86pc/ml/comm_page.s" + */ +typedef struct comm_page_s { + hrtime_t cp_tsc_last; + hrtime_t cp_tsc_hrtime_base; + hrtime_t cp_tsc_resume_cap; + uint32_t cp_tsc_type; + uint32_t cp_tsc_max_delta; + + volatile uint32_t cp_hres_lock; /* must be 8-byte aligned */ + uint32_t cp_nsec_scale; + int64_t cp_hrestime_adj; + hrtime_t cp_hres_last_tick; + uint32_t cp_tsc_ncpu; + uint32_t _cp_pad; + volatile int64_t cp_hrestime[2]; +#if defined(_MACHDEP) + hrtime_t cp_tsc_sync_tick_delta[NCPU]; +#else + /* length resides in cp_tsc_ncpu */ + hrtime_t cp_tsc_sync_tick_delta[]; +#endif /* defined(_MACHDEP) */ +} comm_page_t; + +#if defined(_KERNEL) +extern comm_page_t comm_page; + +extern caddr_t comm_page_mapin(); + +#if defined(_MACHDEP) +extern hrtime_t tsc_last; +extern hrtime_t tsc_hrtime_base; +extern hrtime_t tsc_resume_cap; +extern uint32_t tsc_type; +extern uint32_t tsc_max_delta; +extern volatile uint32_t hres_lock; +extern uint32_t nsec_scale; +extern int64_t hrestime_adj; +extern hrtime_t hres_last_tick; +extern uint32_t tsc_ncpu; +extern volatile timestruc_t hrestime; +extern hrtime_t tsc_sync_tick_delta[NCPU]; +#endif /* defined(_MACHDEP) */ +#endif /* defined(_KERNEL) */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _COMM_PAGE_H */ diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 99ae0d4d3b..fc34522307 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -54,6 +55,10 @@ extern "C" { */ #if defined(__amd64) +/* + * If NCPU grows beyond 256, sizing for the x86 comm page will require + * adjustment.
+ */ #define NCPU 256 #define NCPU_LOG2 8 #elif defined(__i386) diff --git a/usr/src/uts/i86pc/sys/tsc.h b/usr/src/uts/i86pc/sys/tsc.h new file mode 100644 index 0000000000..d4090381c4 --- /dev/null +++ b/usr/src/uts/i86pc/sys/tsc.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _TSC_H +#define _TSC_H + +/* + * flags to patch tsc_read routine. + */ +#define TSC_NONE 0x0 +#define TSC_RDTSC_CPUID 0x1 +#define TSC_RDTSC_MFENCE 0x2 +#define TSC_RDTSC_LFENCE 0x3 +#define TSC_TSCP 0x4 + +#endif /* _TSC_H */ diff --git a/usr/src/uts/i86xpv/os/xpv_timestamp.c b/usr/src/uts/i86xpv/os/xpv_timestamp.c index d7d13f984b..8b7d226305 100644 --- a/usr/src/uts/i86xpv/os/xpv_timestamp.c +++ b/usr/src/uts/i86xpv/os/xpv_timestamp.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -71,6 +72,11 @@ static volatile hrtime_t hrtime_last; static hrtime_t hrtime_suspend_time; static hrtime_t hrtime_addend; +volatile uint32_t hres_lock; +hrtime_t hres_last_tick; +int64_t hrestime_adj; +volatile timestruc_t hrestime; + /* * These functions are used in DTrace probe context, and must be removed from * fbt consideration. Currently fbt ignores all weak symbols, so this will diff --git a/usr/src/uts/intel/elfexec/Makefile b/usr/src/uts/intel/elfexec/Makefile index ce0433391c..975377a538 100644 --- a/usr/src/uts/intel/elfexec/Makefile +++ b/usr/src/uts/intel/elfexec/Makefile @@ -23,6 +23,7 @@ # # Copyright 2006 Sun Microsystems, Inc. 
All rights reserved. # Use is subject to license terms. +# Copyright 2016 Joyent, Inc. # # @@ -60,6 +61,8 @@ ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) +INC_PATH += -I$(UTSBASE)/i86pc + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/intel/ia32/ml/i86_subr.s b/usr/src/uts/intel/ia32/ml/i86_subr.s index e9d0b8128f..d4ba6589bc 100644 --- a/usr/src/uts/intel/ia32/ml/i86_subr.s +++ b/usr/src/uts/intel/ia32/ml/i86_subr.s @@ -23,6 +23,7 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -3590,29 +3591,13 @@ hres_tick(void) {} int64_t timedelta; -hrtime_t hres_last_tick; -volatile timestruc_t hrestime; -int64_t hrestime_adj; -volatile int hres_lock; hrtime_t hrtime_base; #else /* __lint */ - DGDEF3(hrestime, _MUL(2, CLONGSIZE), 8) - .NWORD 0, 0 - - DGDEF3(hrestime_adj, 8, 8) - .long 0, 0 - - DGDEF3(hres_last_tick, 8, 8) - .long 0, 0 - DGDEF3(timedelta, 8, 8) .long 0, 0 - DGDEF3(hres_lock, 4, 8) - .long 0 - /* * initialized to a non zero value to make pc_gethrtime() * work correctly even before clock is initialized diff --git a/usr/src/uts/intel/sys/machlock.h b/usr/src/uts/intel/sys/machlock.h index d740bc6212..e362ca9aba 100644 --- a/usr/src/uts/intel/sys/machlock.h +++ b/usr/src/uts/intel/sys/machlock.h @@ -21,13 +21,12 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _SYS_MACHLOCK_H #define _SYS_MACHLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifndef _ASM #include <sys/types.h> #include <sys/time.h> @@ -76,7 +75,7 @@ typedef lock_t disp_lock_t; /* dispatcher lock type */ /* * Externs for CLOCK_LOCK and clock resolution */ -extern volatile int hres_lock; +extern volatile uint32_t hres_lock; extern hrtime_t hrtime_base; extern int clock_res; diff --git a/usr/src/uts/intel/sys/segments.h b/usr/src/uts/intel/sys/segments.h index 8a6e398eec..5368f80735 100644 --- a/usr/src/uts/intel/sys/segments.h +++ b/usr/src/uts/intel/sys/segments.h @@ -2,7 +2,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SEGMENTS_H @@ -522,6 +522,7 @@ void init_boot_gdt(user_desc_t *); #define GDT_B16DATA 4 /* bios call 16 bit data descriptor */ #define GDT_B64CODE 5 /* dboot 64 bit code descriptor */ #define GDT_BGSTMP 7 /* kmdb descriptor only used early in boot */ +#define GDT_CPUID 16 /* store numeric id of current CPU */ #if defined(__amd64) @@ -530,8 +531,8 @@ void init_boot_gdt(user_desc_t *); #define GDT_U32CODE 8 /* 32-bit process on 64-bit kernel %cs */ #define GDT_UDATA 9 /* user data seg %ds (32 and 64 bit) */ #define GDT_UCODE 10 /* native user code seg %cs */ -#define GDT_LDT 12 /* LDT for current process */ -#define GDT_KTSS 14 /* kernel tss */ +#define GDT_LDT 12 /* (12-13) LDT for current process */ +#define GDT_KTSS 14 /* (14-15) kernel tss */ #define GDT_FS GDT_NULL /* kernel %fs segment selector */ #define GDT_GS GDT_NULL /* kernel %gs segment selector */ #define GDT_LWPFS 55 /* lwp private %fs segment selector (32-bit) */ diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 749746036c..1ea68e0184 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -28,7 +28,7 @@ * All rights reserved. 
*/ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. * Copyright 2012 Jens Elkner <jel+illumos@cs.uni-magdeburg.de> * Copyright 2012 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> * Copyright 2014 Josef 'Jeff' Sipek <jeffpc@josefsipek.net> @@ -387,14 +387,6 @@ extern "C" { #define X86FSET_RDSEED 48 /* - * flags to patch tsc_read routine. - */ -#define X86_NO_TSC 0x0 -#define X86_HAVE_TSCP 0x1 -#define X86_TSC_MFENCE 0x2 -#define X86_TSC_LFENCE 0x4 - -/* * Intel Deep C-State invariant TSC in leaf 0x80000007. */ #define CPUID_TSC_CSTATE_INVARIANCE (0x100) |