Diffstat (limited to 'usr/src/uts/common/os')
68 files changed, 3670 insertions, 1438 deletions
diff --git a/usr/src/uts/common/os/autoconf.c b/usr/src/uts/common/os/autoconf.c index 71af31ba2b..44ec3353fc 100644 --- a/usr/src/uts/common/os/autoconf.c +++ b/usr/src/uts/common/os/autoconf.c @@ -53,6 +53,7 @@ #include <sys/fm/util.h> #include <sys/ddifm_impl.h> #include <sys/ddi_ufm_impl.h> +#include <sys/ksensor_impl.h> extern dev_info_t *top_devinfo; extern dev_info_t *scsi_vhci_dip; @@ -96,6 +97,7 @@ setup_ddi(void) ndi_fm_init(); irm_init(); ufm_init(); + ksensor_init(); (void) i_ddi_load_drvconf(DDI_MAJOR_T_NONE); diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index abaaef1b4a..daf3b638a6 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -1488,7 +1488,6 @@ bio_getfreeblk(long bsize) */ bio_mem_get(bsize); /* Account for our memory request */ -again: bp = bio_bhdr_alloc(); /* Get a buf hdr */ sema_p(&bp->b_sem); /* Should never fail */ diff --git a/usr/src/uts/common/os/bitmap.c b/usr/src/uts/common/os/bitmap.c index 46fae44adb..06dd326f4a 100644 --- a/usr/src/uts/common/os/bitmap.c +++ b/usr/src/uts/common/os/bitmap.c @@ -19,17 +19,16 @@ * * CDDL HEADER END */ -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2022 Oxide Computer Company */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Operations on bitmaps of arbitrary size * A bitmap is a vector of 1 or more ulongs. @@ -39,7 +38,7 @@ #include <sys/types.h> #include <sys/bitmap.h> -#include <sys/debug.h> /* ASSERT */ +#include <sys/debug.h> /* * Return index of first available bit in denoted bitmap, or -1 for @@ -49,7 +48,7 @@ * Caller is responsible for range checks. */ index_t -bt_availbit(ulong_t *bitmap, size_t nbits) +bt_availbit(const ulong_t *bitmap, size_t nbits) { index_t maxword; /* index of last in map */ index_t wx; /* word index in map */ @@ -92,7 +91,7 @@ bt_availbit(ulong_t *bitmap, size_t nbits) * the word specified by wx. */ int -bt_gethighbit(ulong_t *mapp, int wx) +bt_gethighbit(const ulong_t *mapp, int wx) { ulong_t word; @@ -115,7 +114,7 @@ bt_gethighbit(ulong_t *mapp, int wx) * and one past the last bit (pos2) in the pattern. */ int -bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos) +bt_range(const ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos) { size_t pos; @@ -169,7 +168,7 @@ odd_parity(ulong_t i) * a -1 is returned. */ int -bt_getlowbit(ulong_t *map, size_t start, size_t stop) +bt_getlowbit(const ulong_t *map, size_t start, size_t stop) { ulong_t word; int counter = start >> BT_ULSHIFT; @@ -236,7 +235,7 @@ bt_getlowbit(ulong_t *map, size_t start, size_t stop) * Copy the bitmap. 
*/ void -bt_copy(ulong_t *from, ulong_t *to, ulong_t size) +bt_copy(const ulong_t *from, ulong_t *to, ulong_t size) { ulong_t i; for (i = 0; i < size; i++) diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c index 4f9b9f5985..7647302cfe 100644 --- a/usr/src/uts/common/os/cap_util.c +++ b/usr/src/uts/common/os/cap_util.c @@ -693,7 +693,7 @@ cu_cpc_program(cpu_t *cp, int *err) * * Context is marked with KCPC_CTX_INVALID_STOPPED when context is * unprogrammed and may be marked with KCPC_CTX_INVALID when - * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to + * kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to * invalidate all CPC contexts before they take over all the counters. * * This isn't necessary since these flags are only used for thread bound @@ -1258,7 +1258,7 @@ cu_cpu_fini(cpu_t *cp) ctx = cpu_ctx->ctx_ptr_array[i]; if (ctx == NULL) continue; - kcpc_free(ctx, 0); + kcpc_free_cpu(ctx); } /* diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 75c3b000db..93f12d7b96 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -318,7 +318,9 @@ time_t boot_time = 0; /* Boot time in seconds since 1970 */ cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */ cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */ -extern void clock_tick_schedule(int); +extern void clock_tick_schedule(int); +extern void set_freemem(void); +extern void pageout_deadman(void); static int lgrp_ticks; /* counter to schedule lgrp load calcs */ @@ -400,7 +402,6 @@ clock(void) uint_t w_io; cpu_t *cp; cpupart_t *cpupart; - extern void set_freemem(); void (*funcp)(); int32_t ltemp; int64_t lltemp; @@ -477,6 +478,7 @@ clock(void) if (one_sec) { loadavg_update(); deadman_counter++; + pageout_deadman(); } /* diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 1280c8a1b6..27bc319ee6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -93,7 +93,7 @@ clock_highres_fire(void *arg) static int clock_highres_timer_settime(itimer_t *it, int flags, - const struct itimerspec *when) + const struct itimerspec *when) { cyclic_id_t cyc, *cycp = it->it_arg; proc_t *p = curproc; diff --git a/usr/src/uts/common/os/clock_process.c b/usr/src/uts/common/os/clock_process.c new file mode 100644 index 0000000000..a3c1641c9c --- /dev/null +++ b/usr/src/uts/common/os/clock_process.c @@ -0,0 +1,130 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This clock backend implements basic support for the CLOCK_PROCESS_CPUTIME_ID + * clock. This clock is weakly defined by POSIX as "The identifier of the + * CPU-time clock associated with the process making a clock() or timer*() + * function call". We interpret that as including LMS_USER, LMS_SYSTEM, and + * LMS_TRAP microstates. This is similar to what we do in proc(5) for the + * lwpstatus_t and the prstatus_t. + * + * At this time, we only provide the ability to read the current time (e.g. + * through a call to clock_gettime(3C)). 
There is never a case where being able + * to set the time makes sense today and even if so, the privileges required for + * that are circumspect. Today, we do not support the ability to create interval + * timers based on this backend (e.g. timer_create(3C) and timer_settime(3C)). + * However, there is no reason that couldn't be added. + * + * To implement this, we leverage the existing microstate aggregation time that + * is done in /proc. + */ + +#include <sys/timer.h> +#include <sys/cyclic.h> +#include <sys/msacct.h> + +static clock_backend_t clock_process; + +static int +clock_process_settime(timespec_t *ts) +{ + return (EINVAL); +} + +static int +clock_process_gettime(timespec_t *ts) +{ + hrtime_t hrt; + proc_t *p = curproc; + + /* + * mstate_aggr_state() automatically includes LMS_TRAP when we ask for + * LMS_SYSTEM below. + */ + mutex_enter(&p->p_lock); + hrt = mstate_aggr_state(p, LMS_USER); + hrt += mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(hrt, ts); + + return (0); +} + +/* + * See the discussion in clock_thread_getres() for the why of using + * cyclic_getres() here. + */ +static int +clock_process_getres(timespec_t *ts) +{ + hrt2ts(cyclic_getres(), (timestruc_t *)ts); + + return (0); +} + +static int +clock_process_timer_create(itimer_t *it, void (*fire)(itimer_t *)) +{ + return (EINVAL); +} + +static int +clock_process_timer_settime(itimer_t *it, int flags, + const struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_process_timer_gettime(itimer_t *it, struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_process_timer_delete(itimer_t *it) +{ + return (EINVAL); +} + +static void +clock_process_timer_lwpbind(itimer_t *it) +{ +} + +void +clock_process_init(void) +{ + /* + * While this clock backend doesn't support notifications right now, we + * still fill out the default for what it would be. + */ + clock_process.clk_default.sigev_signo = SIGALRM; + clock_process.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_process.clk_default.sigev_value.sival_ptr = NULL; + + clock_process.clk_clock_settime = clock_process_settime; + clock_process.clk_clock_gettime = clock_process_gettime; + clock_process.clk_clock_getres = clock_process_getres; + clock_process.clk_timer_create = clock_process_timer_create; + clock_process.clk_timer_settime = clock_process_timer_settime; + clock_process.clk_timer_gettime = clock_process_timer_gettime; + clock_process.clk_timer_delete = clock_process_timer_delete; + clock_process.clk_timer_lwpbind = clock_process_timer_lwpbind; + + clock_add_backend(CLOCK_PROCESS_CPUTIME_ID, &clock_process); +} diff --git a/usr/src/uts/common/os/clock_thread.c b/usr/src/uts/common/os/clock_thread.c new file mode 100644 index 0000000000..96dd36fa08 --- /dev/null +++ b/usr/src/uts/common/os/clock_thread.c @@ -0,0 +1,191 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This clock backend implements basic support for the following two clocks: + * + * o CLOCK_VIRTUAL This provides the ability to read the amount of + * user CPU time that the calling thread has spent + * on CPU. 
This is the LMS_USER cpu microstate. + * + * o CLOCK_THREAD_CPUTIME_ID This clock is similar to the above; however, it + * also includes system time. This is the LMS_USER, + * LMS_SYSTEM, and LMS_TRAP microstates combined + * together. We include LMS_TRAP here because that + * is what you see in a thread's lwpstatus file. + * + * At this time, we only provide the ability to read the current time (e.g. + * through a call to clock_gettime(3C)). There is never a case where being able + * to set the time makes sense today and truthfully, lying about a process's + * runtime should be left to mdb -kw. Today, we do not support the ability to + * create interval timers based on this backend (e.g. timer_create(3C) and + * timer_settime(3C)). However, there is no reason that couldn't be added. + * + * A nice simplification here is that this clock is always about reading from + * the current thread. This means that one can always access it. Because the + * calling thread exists and is in this code, it means that we know it is here. + * Any other privilege information is left to the broader kernel. + * + * Because the only difference between these is the question of whether or not + * we include LMS_SYSTEM time in the value, we generally use the same actual + * clock backend functions except for the one that implements + * clk_clock_gettime(). + */ + +#include <sys/timer.h> +#include <sys/cyclic.h> +#include <sys/msacct.h> + +static clock_backend_t clock_thread_usr; +static clock_backend_t clock_thread_usrsys; + +static int +clock_thread_settime(timespec_t *ts) +{ + return (EINVAL); +} + +static int +clock_thread_usr_gettime(timespec_t *ts) +{ + hrtime_t hrt; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + + hrt = lwp->lwp_mstate.ms_acct[LMS_USER]; + scalehrtime(&hrt); + hrt2ts(hrt, ts); + + return (0); +} + +static int +clock_thread_usrsys_gettime(timespec_t *ts) +{ + hrtime_t hrt; + kthread_t *t = curthread; + + /* + * mstate_thread_onproc_time() takes care of doing the following: + * + * o Combining LMS_USER, LMS_SYSTEM, and LMS_TRAP. + * o Ensuring that the result is scaled + * o Ensuring that the time that's elapsed to the point of our asking + * is included. By definition the kernel is executing in LMS_SYSTEM + * so this ensures that we add that time which isn't currently in the + * microstate to this. + */ + thread_lock(t); + hrt = mstate_thread_onproc_time(t); + thread_unlock(t); + + hrt2ts(hrt, ts); + return (0); +} + +/* + * The question of the resolution here is a thorny one. Technically this would + * really be based upon the resolution of gethrtime_unscaled(), as we can + * actually tell that much due to our use of CPU microstate accounting. However, + * from a timer resolution perspective it's actually quite different and would + * in theory be based on the system tick rate. + * + * This basically leaves us with two options: + * + * 1) Use 'nsec_per_tick' to go down the Hz path. + * 2) Use the cyclic resolution, which basically is kind of the resolution of + * that timer. + * + * POSIX is unclear as to the effect of the resolution in the case of timer_*() + * functions and only really says it is used to impact the implementation of + * clock_settime() which of course isn't actually supported here. As a result, + * we opt to prefer the cyclic resolution, which is closer to the actual + * resolution of this subsystem. Strictly speaking, this might not be completely + * accurate, but should be on current platforms. 
+ */ +static int +clock_thread_getres(timespec_t *ts) +{ + hrt2ts(cyclic_getres(), (timestruc_t *)ts); + + return (0); +} + +static int +clock_thread_timer_create(itimer_t *it, void (*fire)(itimer_t *)) +{ + return (EINVAL); +} + +static int +clock_thread_timer_settime(itimer_t *it, int flags, + const struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_thread_timer_gettime(itimer_t *it, struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_thread_timer_delete(itimer_t *it) +{ + return (EINVAL); +} + +static void +clock_thread_timer_lwpbind(itimer_t *it) +{ +} + +void +clock_thread_init(void) +{ + /* + * While this clock backends don't support notifications right now, we + * still fill out the default for what it would be. + */ + clock_thread_usr.clk_default.sigev_signo = SIGALRM; + clock_thread_usr.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_thread_usr.clk_default.sigev_value.sival_ptr = NULL; + + clock_thread_usr.clk_clock_settime = clock_thread_settime; + clock_thread_usr.clk_clock_gettime = clock_thread_usr_gettime; + clock_thread_usr.clk_clock_getres = clock_thread_getres; + clock_thread_usr.clk_timer_create = clock_thread_timer_create; + clock_thread_usr.clk_timer_settime = clock_thread_timer_settime; + clock_thread_usr.clk_timer_gettime = clock_thread_timer_gettime; + clock_thread_usr.clk_timer_delete = clock_thread_timer_delete; + clock_thread_usr.clk_timer_lwpbind = clock_thread_timer_lwpbind; + + clock_thread_usrsys.clk_default.sigev_signo = SIGALRM; + clock_thread_usrsys.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_thread_usrsys.clk_default.sigev_value.sival_ptr = NULL; + + clock_thread_usrsys.clk_clock_settime = clock_thread_settime; + clock_thread_usrsys.clk_clock_gettime = clock_thread_usrsys_gettime; + clock_thread_usrsys.clk_clock_getres = clock_thread_getres; + clock_thread_usrsys.clk_timer_create = clock_thread_timer_create; + clock_thread_usrsys.clk_timer_settime = clock_thread_timer_settime; + clock_thread_usrsys.clk_timer_gettime = clock_thread_timer_gettime; + clock_thread_usrsys.clk_timer_delete = clock_thread_timer_delete; + clock_thread_usrsys.clk_timer_lwpbind = clock_thread_timer_lwpbind; + + clock_add_backend(CLOCK_VIRTUAL, &clock_thread_usr); + clock_add_backend(CLOCK_THREAD_CPUTIME_ID, &clock_thread_usrsys); +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index e53c75b64e..6a86dbb8cb 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ /* @@ -60,7 +61,7 @@ #include <sys/archsystm.h> #include <sys/sdt.h> #include <sys/smt.h> -#if defined(__x86) || defined(__amd64) +#if defined(__x86) #include <sys/x86_archext.h> #endif #include <sys/callo.h> @@ -613,7 +614,7 @@ again: * requests will continue to be satisfied in the same way, * even if weak bindings have recommenced. 
*/ - if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) { + if (t->t_nomigrate < 0 || (weakbindingbarrier && t->t_nomigrate == 0)) { --t->t_nomigrate; thread_unlock(curthread); return; /* with kpreempt_disable still active */ @@ -2909,7 +2910,7 @@ cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) } void -cpuset_or(cpuset_t *dst, cpuset_t *src) +cpuset_or(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] |= src->cpub[i]; @@ -2917,7 +2918,7 @@ cpuset_or(cpuset_t *dst, cpuset_t *src) } void -cpuset_xor(cpuset_t *dst, cpuset_t *src) +cpuset_xor(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] ^= src->cpub[i]; @@ -2925,7 +2926,7 @@ cpuset_xor(cpuset_t *dst, cpuset_t *src) } void -cpuset_and(cpuset_t *dst, cpuset_t *src) +cpuset_and(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] &= src->cpub[i]; diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 0bd6cfd44f..5e909667de 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -20,13 +20,14 @@ */ /* * Copyright (c) 2013, Ira Cooper. All rights reserved. + * Copyright 2020 Nexenta by DDN, Inc. All rights reserved. */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -288,7 +289,7 @@ crget(void) { cred_t *cr = kmem_cache_alloc(cred_cache, KM_SLEEP); - bcopy(kcred, cr, crsize); + bcopy(zone_kcred(), cr, crsize); cr->cr_ref = 1; zone_cred_hold(cr->cr_zone); if (cr->cr_label) @@ -377,7 +378,7 @@ crfree(cred_t *cr) /* * Copy a cred structure to a new one and free the old one. * The new cred will have two references. One for the calling process, - * and one for the thread. + * and one for the thread. */ cred_t * crcopy(cred_t *cr) @@ -404,7 +405,7 @@ crcopy(cred_t *cr) /* * Copy a cred structure to a new one and free the old one. * The new cred will have two references. One for the calling process, - * and one for the thread. + * and one for the thread. * This variation on crcopy uses a pre-allocated structure for the * "new" cred. */ diff --git a/usr/src/uts/common/os/dacf.c b/usr/src/uts/common/os/dacf.c index 8d4cd486d8..592b1cd570 100644 --- a/usr/src/uts/common/os/dacf.c +++ b/usr/src/uts/common/os/dacf.c @@ -642,7 +642,7 @@ dacf_arglist_delete(dacf_arg_t **list) * Match a device-spec to a rule. */ dacf_rule_t * -dacf_match(dacf_opid_t op, dacf_devspec_t ds, void *match_info) +dacf_match(dacf_opid_t op, dacf_devspec_t ds, const void *match_info) { dacf_rule_t *rule; diff --git a/usr/src/uts/common/os/dacf_clnt.c b/usr/src/uts/common/os/dacf_clnt.c index e40509d33b..fdb1696fb2 100644 --- a/usr/src/uts/common/os/dacf_clnt.c +++ b/usr/src/uts/common/os/dacf_clnt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DACF (Device Autoconfiguration Framework) client code. * @@ -67,8 +65,8 @@ * for the operation to be invoked at post-attach and/or pre-detach time. 
*/ void -dacfc_match_create_minor(char *name, char *node_type, dev_info_t *dip, - struct ddi_minor_data *dmdp, int flag) +dacfc_match_create_minor(const char *name, const char *node_type, + dev_info_t *dip, struct ddi_minor_data *dmdp, int flag) { dacf_rule_t *r; char *dev_path, *dev_pathp, *drv_mname = NULL; diff --git a/usr/src/uts/common/os/ddi.c b/usr/src/uts/common/os/ddi.c index a37d91e92a..c348ee474c 100644 --- a/usr/src/uts/common/os/ddi.c +++ b/usr/src/uts/common/os/ddi.c @@ -1136,8 +1136,8 @@ qunbufcall(queue_t *q, bufcall_id_t id) * Associate the stream with an instance of the bottom driver. This * function is called by APIs that establish or modify the hardware * association (ppa) of an open stream. Two examples of such - * post-open(9E) APIs are the dlpi(7p) DL_ATTACH_REQ message, and the - * ndd(1M) "instance=" ioctl(2). This interface may be called from a + * post-open(9E) APIs are the dlpi(4P) DL_ATTACH_REQ message, and the + * ndd(8) "instance=" ioctl(2). This interface may be called from a * stream driver's wput procedure and from within syncq perimeters, * so it can't block. * diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c index 38e575dbfd..8f0890fc2b 100644 --- a/usr/src/uts/common/os/ddi_hp_impl.c +++ b/usr/src/uts/common/os/ddi_hp_impl.c @@ -92,8 +92,8 @@ * - Through the nexus driver interface, ndi_hp_state_change_req. PCIe * nexus drivers that pass a hotplug interrupt through to pciehpc will kick * off state changes in this way. - * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and - * hotplug(1M) pass state change requests through hotplugd, which uses + * - Through coordinated removal, ddihp_modctl. Both cfgadm(8) and + * hotplug(8) pass state change requests through hotplugd, which uses * modctl to request state changes to the DDI hotplug framework. That * interface is ultimately implemented by ddihp_modctl. * @@ -131,7 +131,7 @@ * of some key components are below. * * +------------+ - * | cfgadm(1M) | + * | cfgadm(8) | * +------------+ * | * +-------------------+ @@ -139,7 +139,7 @@ * +-------------------+ * | * +-------------+ +------------+ - * | hotplug(1M) |----------| libhotplug | + * | hotplug(8) |----------| libhotplug | * +-------------+ +------------+ * | * +----------+ @@ -193,14 +193,14 @@ * * KEY HOTPLUG SOFTWARE COMPONENTS * - * CFGADM(1M) + * cfgadm(8) * * cfgadm is the canonical tool for hotplug operations. It can be used to * list connections on the system and change their state in a coordinated * fashion. For more information, see its manual page. * * - * HOTPLUG(1M) + * hotplug(8) * * hotplug is a command line tool for managing hotplug connections for * connectors. For more information, see its manual page. diff --git a/usr/src/uts/common/os/ddi_intr_impl.c b/usr/src/uts/common/os/ddi_intr_impl.c index 215be73722..22f4548607 100644 --- a/usr/src/uts/common/os/ddi_intr_impl.c +++ b/usr/src/uts/common/os/ddi_intr_impl.c @@ -35,7 +35,7 @@ #include <sys/sunndi.h> #include <sys/ndi_impldefs.h> /* include prototypes */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * MSI-X allocation limit. 
*/ @@ -294,7 +294,7 @@ i_ddi_intr_get_limit(dev_info_t *dip, int type, ddi_irm_pool_t *pool_p) limit = MIN(limit, nintrs); /* Impose a global MSI-X limit on x86 */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) if (type == DDI_INTR_TYPE_MSIX) limit = MIN(limit, ddi_msix_alloc_limit); #endif @@ -539,7 +539,7 @@ set_intr_affinity(ddi_intr_handle_t h, processorid_t tgt) return (ret); } -#if defined(__i386) || defined(__amd64) +#if defined(__x86) ddi_acc_handle_t i_ddi_get_pci_config_handle(dev_info_t *dip) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index a4b35dcb5b..2433c504fc 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -34,7 +34,7 @@ #include <sys/sunndi.h> #include <sys/ndi_impldefs.h> /* include prototypes */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * MSI-X allocation limit. */ @@ -767,7 +767,7 @@ i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag) /* Determine new request size */ nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* Use the default static limit for non-IRM drivers */ if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) nreq = MIN(nreq, ddi_msix_alloc_limit); diff --git a/usr/src/uts/common/os/ddi_ufm.c b/usr/src/uts/common/os/ddi_ufm.c index ffb04eddec..940ebf82bf 100644 --- a/usr/src/uts/common/os/ddi_ufm.c +++ b/usr/src/uts/common/os/ddi_ufm.c @@ -11,6 +11,7 @@ /* * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include <sys/avl.h> @@ -20,13 +21,16 @@ #include <sys/kmem.h> #include <sys/sunddi.h> #include <sys/stddef.h> +#include <sys/sunndi.h> +#include <sys/file.h> +#include <sys/sysmacros.h> /* * The UFM subsystem tracks its internal state with respect to device * drivers that participate in the DDI UFM subsystem on a per-instance basis * via ddi_ufm_handle_t structures (see ddi_ufm_impl.h). This is known as the * UFM handle. The UFM handle contains a pointer to the driver's UFM ops, - * which the ufm(7D) pseudo driver uses to invoke the UFM entry points in + * which the ufm(4D) pseudo driver uses to invoke the UFM entry points in * response to DDI UFM ioctls. Additionally, the DDI UFM subsystem uses the * handle to maintain cached UFM image and slot data. * @@ -65,6 +69,12 @@ * These tests should be run whenever changes are made to the DDI UFM * subsystem or the ufm driver. */ + +/* + * Amount of data to read in one go (1 MiB). 
+ */ +#define UFM_READ_STRIDE (1024 * 1024) + static avl_tree_t ufm_handles; static kmutex_t ufm_lock; @@ -171,7 +181,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) */ ufmh->ufmh_images = kmem_zalloc((sizeof (ddi_ufm_image_t) * ufmh->ufmh_nimages), - KM_NOSLEEP | KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (ufmh->ufmh_images == NULL) return (ENOMEM); @@ -191,7 +201,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) img->ufmi_slots = kmem_zalloc((sizeof (ddi_ufm_slot_t) * img->ufmi_nslots), - KM_NOSLEEP | KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (img->ufmi_slots == NULL) { ret = ENOMEM; goto cache_fail; @@ -234,6 +244,12 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) if (slot->ufms_attrs & DDI_UFM_ATTR_EMPTY) continue; + if (slot->ufms_imgsize != 0) { + fnvlist_add_uint64(slots[s], + DDI_UFM_NV_SLOT_IMGSIZE, + slot->ufms_imgsize); + } + fnvlist_add_string(slots[s], DDI_UFM_NV_SLOT_VERSION, slot->ufms_version); if (slot->ufms_misc != NULL) { @@ -257,6 +273,56 @@ cache_fail: return (ret); } +int +ufm_read_img(ddi_ufm_handle_t *ufmh, uint_t img, uint_t slot, uint64_t len, + uint64_t off, uintptr_t uaddr, uint64_t *nreadp, int copyflags) +{ + int ret = 0; + ddi_ufm_cap_t caps; + void *buf; + uint64_t nread; + + ret = ufmh->ufmh_ops->ddi_ufm_op_getcaps(ufmh, ufmh->ufmh_arg, &caps); + if (ret != 0) { + return (ret); + } + + if ((caps & DDI_UFM_CAP_READIMG) == 0 || + ufmh->ufmh_ops->ddi_ufm_op_readimg == NULL) { + return (ENOTSUP); + } + + if (off + len < MAX(off, len)) { + return (EOVERFLOW); + } + + buf = kmem_zalloc(UFM_READ_STRIDE, KM_SLEEP); + nread = 0; + while (len > 0) { + uint64_t toread = MIN(len, UFM_READ_STRIDE); + uint64_t iter; + + ret = ufmh->ufmh_ops->ddi_ufm_op_readimg(ufmh, ufmh->ufmh_arg, + img, slot, toread, off + nread, buf, &iter); + if (ret != 0) { + break; + } + + if (ddi_copyout(buf, (void *)(uintptr_t)(uaddr + nread), iter, + copyflags & FKIOCTL) != 0) { + ret = EFAULT; + break; + } + + nread += iter; + len -= iter; + } + + *nreadp = nread; + kmem_free(buf, UFM_READ_STRIDE); + return (ret); +} + /* * This gets called early in boot by setup_ddi(). */ @@ -375,6 +441,12 @@ ddi_ufm_init(dev_info_t *dip, uint_t version, ddi_ufm_ops_t *ufmops, mutex_exit(&old_ufmh->ufmh_lock); } + /* + * Give a hint in the devinfo tree that this device supports UFM + * capabilities. + */ + (void) ndi_prop_create_boolean(DDI_DEV_T_NONE, dip, "ddi-ufm-capable"); + return (DDI_SUCCESS); } @@ -453,3 +525,10 @@ ddi_ufm_slot_set_misc(ddi_ufm_slot_t *usp, nvlist_t *misc) nvlist_free(usp->ufms_misc); usp->ufms_misc = misc; } + +void +ddi_ufm_slot_set_imgsize(ddi_ufm_slot_t *usp, uint64_t size) +{ + VERIFY3P(usp, !=, NULL); + usp->ufms_imgsize = size; +} diff --git a/usr/src/uts/common/os/ddifm.c b/usr/src/uts/common/os/ddifm.c index 533fa15aed..dc39ba49ab 100644 --- a/usr/src/uts/common/os/ddifm.c +++ b/usr/src/uts/common/os/ddifm.c @@ -56,7 +56,7 @@ * * Error reports resulting from hardware component specific and common IO * fault and driver defects must be accompanied by an Eversholt fault - * tree (.eft) by the Solaris fault manager (fmd(1M)) for + * tree (.eft) by the Solaris fault manager (fmd(8)) for * diagnosis. * * DDI_FM_ERRCB_CAPABLE @@ -466,7 +466,7 @@ out: if (ereport && (nva == NULL)) /* * Generate an error report for consumption by the Solaris Fault Manager, - * fmd(1M). Valid ereport classes are defined in /usr/include/sys/fm/io. + * fmd(8). Valid ereport classes are defined in /usr/include/sys/fm/io. 
* * The ENA should be set if this error is a result of an error status * returned from ddi_dma_err_check() or ddi_acc_err_check(). Otherwise, diff --git a/usr/src/uts/common/os/devcfg.c b/usr/src/uts/common/os/devcfg.c index cbcc4db3d8..d61525be9c 100644 --- a/usr/src/uts/common/os/devcfg.c +++ b/usr/src/uts/common/os/devcfg.c @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> */ #include <sys/note.h> @@ -62,6 +63,7 @@ #include <sys/varargs.h> #include <sys/modhash.h> #include <sys/instance.h> +#include <sys/sysevent/eventdefs.h> #if defined(__amd64) && !defined(__xpv) #include <sys/iommulib.h> @@ -253,7 +255,7 @@ i_ddi_node_cache_init() * The allocated node has a reference count of 0. */ dev_info_t * -i_ddi_alloc_node(dev_info_t *pdip, char *node_name, pnode_t nodeid, +i_ddi_alloc_node(dev_info_t *pdip, const char *node_name, pnode_t nodeid, int instance, ddi_prop_t *sys_prop, int flag) { struct dev_info *devi; @@ -395,6 +397,9 @@ sid: devi->devi_node_attributes |= DDI_PERSISTENT; devi->devi_ct_count = -1; /* counter not in use if -1 */ list_create(&(devi->devi_ct), sizeof (cont_device_t), offsetof(cont_device_t, cond_next)); + list_create(&devi->devi_unbind_cbs, sizeof (ddi_unbind_callback_t), + offsetof(ddi_unbind_callback_t, ddiub_next)); + mutex_init(&devi->devi_unbind_lock, NULL, MUTEX_DEFAULT, NULL); i_ddi_set_node_state((dev_info_t *)devi, DS_PROTO); da_log_enter((dev_info_t *)devi); @@ -491,6 +496,9 @@ i_ddi_free_node(dev_info_t *dip) if (devi->devi_ev_path) kmem_free(devi->devi_ev_path, MAXPATHLEN); + mutex_destroy(&devi->devi_unbind_lock); + list_destroy(&devi->devi_unbind_cbs); + kmem_cache_free(ddi_node_cache, devi); } @@ -828,6 +836,7 @@ bind_node(dev_info_t *dip) static int unbind_node(dev_info_t *dip) { + ddi_unbind_callback_t *cb; ASSERT(DEVI(dip)->devi_node_state == DS_BOUND); ASSERT(DEVI(dip)->devi_major != DDI_MAJOR_T_NONE); @@ -842,6 +851,11 @@ unbind_node(dev_info_t *dip) DEVI(dip)->devi_major = DDI_MAJOR_T_NONE; DEVI(dip)->devi_binding_name = DEVI(dip)->devi_node_name; + + while ((cb = list_remove_head(&DEVI(dip)->devi_unbind_cbs)) != NULL) { + cb->ddiub_cb(cb->ddiub_arg, dip); + } + return (DDI_SUCCESS); } @@ -1486,12 +1500,12 @@ postattach_node(dev_info_t *dip) /* * Plumbing during postattach may fail because of the * underlying device is not ready. This will fail ndi_devi_config() - * in dv_filldir() and a warning message is issued. The message - * from here will explain what happened + * in dv_filldir(). */ if (rval != DACF_SUCCESS) { - cmn_err(CE_WARN, "Postattach failed for %s%d\n", - ddi_driver_name(dip), ddi_get_instance(dip)); + NDI_CONFIG_DEBUG((CE_CONT, "postattach_node: %s%d (%p) " + "postattach failed\n", ddi_driver_name(dip), + ddi_get_instance(dip), (void *)dip)); return (DDI_FAILURE); } @@ -2044,7 +2058,7 @@ ndi_devi_tryenter(dev_info_t *dip, int *circular) * not allowed to sleep. 
*/ int -ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid, +ndi_devi_alloc(dev_info_t *parent, const char *node_name, pnode_t nodeid, dev_info_t **ret_dip) { ASSERT(node_name != NULL); @@ -2064,7 +2078,7 @@ ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid, * This routine may sleep and should not be called at interrupt time */ void -ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, pnode_t nodeid, +ndi_devi_alloc_sleep(dev_info_t *parent, const char *node_name, pnode_t nodeid, dev_info_t **ret_dip) { ASSERT(node_name != NULL); @@ -2534,7 +2548,7 @@ i_ddi_get_exported_classes(dev_info_t *dip, char ***classes) * Helper functions, returns NULL if no memory. */ char * -i_ddi_strdup(char *str, uint_t flag) +i_ddi_strdup(const char *str, uint_t flag) { char *copy; @@ -3560,7 +3574,6 @@ walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg, * They include, but not limited to, _init(9e), _fini(9e), probe(9e), * attach(9e), and detach(9e). */ - void ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg) { @@ -3580,7 +3593,6 @@ ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg) * * N.B. The same restrictions from ddi_walk_devs() apply. */ - void e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg) { @@ -3609,6 +3621,91 @@ e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg) UNLOCK_DEV_OPS(&dnp->dn_lock); } +struct preroot_walk_block_devices_arg { + int (*prwb_func)(const char *, void *); + void *prwb_arg; +}; + +static int +preroot_walk_block_devices_walker(dev_info_t *dip, void *arg) +{ + struct preroot_walk_block_devices_arg *prwb = arg; + + if (i_ddi_devi_class(dip) == NULL || + strcmp(i_ddi_devi_class(dip), ESC_DISK) != 0) { + /* + * We do not think that this is a disk. + */ + return (DDI_WALK_CONTINUE); + } + + for (struct ddi_minor_data *md = DEVI(dip)->devi_minor; md != NULL; + md = md->next) { + if (md->ddm_spec_type != S_IFBLK) { + /* + * We don't want the raw version of any block device. + */ + continue; + } + + /* + * The node type taxonomy is hierarchical, with each level + * separated by colons. Nodes of interest are either of the + * BLOCK type, or are prefixed with that type. + */ + if (strcmp(md->ddm_node_type, DDI_NT_BLOCK) != 0 && + strncmp(md->ddm_node_type, DDI_NT_BLOCK ":", + strlen(DDI_NT_BLOCK ":")) != 0) { + /* + * This minor node does not represent a block device. + */ + continue; + } + + char buf[MAXPATHLEN]; + int r; + if ((r = prwb->prwb_func(ddi_pathname_minor(md, buf), + prwb->prwb_arg)) == PREROOT_WALK_BLOCK_DEVICES_CANCEL) { + /* + * The consumer does not need any more minor nodes. + */ + return (DDI_WALK_TERMINATE); + } + VERIFY3S(r, ==, PREROOT_WALK_BLOCK_DEVICES_NEXT); + } + + return (DDI_WALK_CONTINUE); +} + +/* + * Private routine for ZFS when it needs to attach and scan all of the block + * device minors in the system while looking for vdev labels. + * + * The callback function accepts a physical device path and the context + * argument (arg) passed to this function; it should return + * PREROOT_WALK_BLOCK_DEVICES_NEXT when more devices are required and + * PREROOT_WALK_BLOCK_DEVICES_CANCEL to stop the walk. + */ +void +preroot_walk_block_devices(int (*callback)(const char *, void *), void *arg) +{ + /* + * First, force everything which can attach to do so. 
The device class + * is not derived until at least one minor mode is created, so we + * cannot walk the device tree looking for a device class of ESC_DISK + * until everything is attached. + */ + (void) ndi_devi_config(ddi_root_node(), NDI_CONFIG | NDI_DEVI_PERSIST | + NDI_NO_EVENT | NDI_DRV_CONF_REPROBE); + + struct preroot_walk_block_devices_arg prwb; + prwb.prwb_func = callback; + prwb.prwb_arg = arg; + + ddi_walk_devs(ddi_root_node(), preroot_walk_block_devices_walker, + &prwb); +} + /* * argument to i_find_devi, a devinfo node search callback function. */ @@ -3823,8 +3920,8 @@ ddi_is_pci_dip(dev_info_t *dip) * to ioc's bus_config entry point. */ int -resolve_pathname(char *pathname, - dev_info_t **dipp, dev_t *devtp, int *spectypep) +resolve_pathname(char *pathname, dev_info_t **dipp, dev_t *devtp, + int *spectypep) { int error; dev_info_t *parent, *child; @@ -9055,7 +9152,7 @@ out: char * ddi_curr_redirect(char *curr) { - char *alias; + char *alias; int i; if (ddi_aliases_present == B_FALSE) @@ -9196,3 +9293,13 @@ ddi_mem_update(uint64_t addr, uint64_t size) ; #endif } + +void +e_ddi_register_unbind_callback(dev_info_t *dip, ddi_unbind_callback_t *cb) +{ + struct dev_info *devi = DEVI(dip); + + mutex_enter(&devi->devi_unbind_lock); + list_insert_tail(&devi->devi_unbind_cbs, cb); + mutex_exit(&devi->devi_unbind_lock); +} diff --git a/usr/src/uts/common/os/devid_cache.c b/usr/src/uts/common/os/devid_cache.c index 3e1a06a844..2a780eebe2 100644 --- a/usr/src/uts/common/os/devid_cache.c +++ b/usr/src/uts/common/os/devid_cache.c @@ -47,7 +47,7 @@ * involves walking the entire device tree attaching all possible disk * instances, to search for the device referenced by a devid. Obviously, * full device discovery is something to be avoided where possible. - * Note that simply invoking devfsadm(1M) is equivalent to running full + * Note that simply invoking devfsadm(8) is equivalent to running full * discovery at the devid cache level. * * Reasons why a disk may not be accessible: @@ -61,7 +61,7 @@ * When discovery may succeed: * Discovery will result in success when a device has been moved * to a different address. Note that it's recommended that - * devfsadm(1M) be invoked (no arguments required) whenever a system's + * devfsadm(8) be invoked (no arguments required) whenever a system's * h/w configuration has been updated. Alternatively, a * reconfiguration boot can be used to accomplish the same result. * @@ -69,7 +69,7 @@ * failure for a device which was powered off. Assuming the cache has an * entry for such a device, simply powering it on should permit the system * to access it. If problems persist after powering it on, invoke - * devfsadm(1M). + * devfsadm(8). * * Discovery prior to mounting root is only of interest when booting * from a filesystem which accesses devices by device id, which of diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c index 85470f7e28..4bf1f54ca4 100644 --- a/usr/src/uts/common/os/dkioc_free_util.c +++ b/usr/src/uts/common/os/dkioc_free_util.c @@ -10,7 +10,8 @@ */ /* - * Copyright 2017 Nexenta Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. + * Copyright 2020 Joyent, Inc. 
*/ /* needed when building libzpool */ @@ -25,6 +26,13 @@ #include <sys/file.h> #include <sys/sdt.h> +static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *, + uint64_t len_blk); +static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *, + uint64_t, dfl_iter_fn_t, void *, int); +static int process_range(dkioc_free_list_t *, uint64_t, uint64_t, + dfl_iter_fn_t, void *, int); + /* * Copy-in convenience function for variable-length dkioc_free_list_t * structures. The pointer to be copied from is in `arg' (may be a pointer @@ -78,3 +86,435 @@ dfl_free(dkioc_free_list_t *dfl) { kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); } + +/* + * Convenience function to resize and segment the array of extents in + * a DKIOCFREE request as required by a driver. + * + * Some devices that implement DKIOCFREE (e.g. vioblk) have limits + * on either the number of extents that can be submitted in a single request, + * or the total number of blocks that can be submitted in a single request. + * In addition, devices may have alignment requirements on the starting + * address stricter than the device block size. + * + * Since there is currently no mechanism for callers of DKIOCFREE to discover + * such restrictions, instead of rejecting any requests that do not conform to + * some undiscoverable (to the caller) set of requirements, a driver can use + * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as + * required to conform to its requirements. + * + * The original request is passed as 'dfl' and the alignment requirements + * are passed in 'dfi'. Additionally the maximum offset of the device allowed + * in bytes) is passed as max_off -- this allows a driver with + * multiple instances of different sizes but similar requirements (e.g. + * a partitioned blkdev device) to not construct a separate dkioc_free_info_t + * struct for each device. + * + * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of + * arg passed to it as needed. If the extents in the dkioc_free_list_t passed + * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will + * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter() + * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t + * instances and populate them with the adjusted extents that do conform to the + * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t + * passed to it when this occurs. The net result is that 'func' can always + * assume it will be called with a dkioc_free_list_t with extents that + * comply with the requirements in 'dfi'. 'func' is also responsible for + * freeing the dkioc_free_list_t passed to it (likely via a completion + * callback). + * + * Combined with the behavior described above, dfl_iter() can be viewed as + * consuming the dkioc_free_list_t passed to it. Either it will pass it along + * to 'func' (and let 'func' handle freeing it), or it will free it and + * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still + * letting 'func' handle freeing the new instances). This way neither the + * dfl_iter() caller nor nor the driver need to worry about treating + * conforming and non-conforming requests differently. + * + * Unfortunately, the DKIOCFREE ioctl provides no method for communicating + * any notion of partial completion -- either it returns success (0) or + * an error. 
It's not clear if such a notion would even be possible while + * supporting multiple types of devices (NVMe, SCSI, etc.) with the same + * interface. As such, there's little benefit to providing more detailed error + * semantics beyond what DKIOCFREE can handle. + * + * Due to this, a somewhat simplistic approach is taken to error handling. The + * original list of extents is first checked to make sure they all appear + * valid -- that is they do not start or extend beyond the end of the device. + * Any request that contains such extents is always rejected in it's entirety. + * It is possible after applying any needed adjustments to the original list + * of extents that the result is not acceptable to the driver. For example, + * a device with a 512 byte block size that tries to free the range 513-1023 + * (bytes) would not be able to be processed. Such extents will be silently + * ignored. If the original request consists of nothing but such requests, + * dfl_iter() will never call 'func' and will merely return 0. + */ +int +dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + dkioc_free_list_ext_t *ext; + uint64_t n_bytes, n_segs, start_idx, i; + uint_t bsize = 1U << dfi->dfi_bshift; + int r = 0; + boolean_t need_copy = B_FALSE; + + /* + * Make sure the block size derived from dfi_bshift is at least 512 + * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is + * to prevent any problems with other parts of the system that might + * assume a minimum block size of 512, and the upper bound is just + * to prevent overflow when creating the block size from dfi_bshift + * (though it seems unlikely we'll have _block_ sizes near a GiB + * any time soon). + */ + if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Max bytes must be a multiple of the block size */ + if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Start offset alignment must also be a multiple of the block size */ + if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Max bytes in an extent must be a multiple of the block size */ + if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* + * It makes no sense to allow a single extent to be larger than the + * total allowed for an entire request. + */ + if (dfi->dfi_max_ext_bytes > 0 && + dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* + * The first pass, align everything as needed and make sure all the + * extents look valid. + */ + if ((r = adjust_exts(dfl, dfi, max_off)) != 0) { + goto done; + } + + /* + * Go through and split things up as needed. The general idea is to + * split along the original extent boundaries when needed. We only + * split an extent from the original request into multiple extents + * if the original extent is by itself too big for the device to + * process in a single request. 
+ */ + start_idx = 0; + n_bytes = n_segs = 0; + ext = dfl->dfl_exts; + for (i = 0; i < dfl->dfl_num_exts; i++, ext++) { + uint64_t start = dfl->dfl_offset + ext->dfle_start; + uint64_t len = ext->dfle_length; + + if (len == 0) { + /* + * If we encounter a zero length extent, we're going + * to create a new copy of dfl no matter what -- + * the size of dfl is determined by dfl_num_exts so + * we cannot do things like shift the contents and + * reduce dfl_num_exts to get a contiguous array + * of non-zero length extents. + */ + need_copy = B_TRUE; + continue; + } + + if (dfi->dfi_max_ext_bytes > 0 && + len > dfi->dfi_max_ext_bytes) { + /* + * An extent that's too large. Dispatch what we've + * accumulated, and then split this extent into + * smaller ones the device can accept. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + if ((r = split_extent(dfl, dfi, i, func, arg, + kmflag)) != 0) { + goto done; + } + start_idx = i + 1; + n_segs = 0; + n_bytes = 0; + continue; + } + + if (dfi->dfi_max_bytes > 0 && + n_bytes + len > dfi->dfi_max_bytes) { + /* + * This extent would put us over the limit for total + * bytes that can be trimmed in one request. + * Dispatch what we've accumulated. Then deal + * with this extent. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + if (len < dfi->dfi_max_bytes) { + /* + * After dispatching what we've accumulated, + * this extent can fit in a new request + * Just add it to the accumulated list of + * extents and move on. + */ + start_idx = i; + n_segs = 1; + n_bytes = len; + continue; + } + + /* + * Even after starting a new request, this extent + * is too big. Split it until it fits. + */ + if ((r = split_extent(dfl, dfi, i, func, arg, + kmflag)) != 0) { + goto done; + } + + start_idx = i + 1; + n_segs = 0; + n_bytes = 0; + continue; + } + + if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) { + /* + * This extent will put us over the limit on the number + * of extents the device can accept. Dispatch what + * we've accumulated so far. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + start_idx = i; + n_segs = 1; + n_bytes = len; + continue; + } + + n_segs++; + n_bytes += len; + } + + /* + * If a copy wasn't required, and we haven't processed a subset of + * the extents already, we can just use the original request. + */ + if (!need_copy && start_idx == 0) { + return (func(dfl, arg, kmflag)); + } + + r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag); + +done: + dfl_free(dfl); + return (r); +} + +/* + * Adjust the start and length of each extent in dfl so that it conforms to + * the requirements in dfi. It also verifies that no extent extends beyond + * the end of the device (given by len_blk). + * + * Returns 0 on success, or an error value. + */ +static int +adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, + uint64_t max_off) +{ + dkioc_free_list_ext_t *exts = dfl->dfl_exts; + /* + * These must be uint64_t to prevent the P2 macros from truncating + * the result. 
+ */ + const uint64_t align = dfi->dfi_align; + const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift; + + for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) { + /* + * Since there are no known requirements on the value of + * dfl_offset, it's possible (though odd) to have a scenario + * where dfl_offset == 1, and dfle_start == 511 (resulting + * in an actual start offset of 512). As such, we always + * apply the offset and find the resulting starting offset + * and length (in bytes) first, then apply any rounding + * and alignment. + */ + uint64_t start = exts->dfle_start + dfl->dfl_offset; + uint64_t end = start + exts->dfle_length; + + /* + * Make sure after applying dfl->dfl_offset and any alignment + * adjustments that the results don't overflow. + */ + if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) { + return (SET_ERROR(EOVERFLOW)); + } + + if (end < start) { + return (SET_ERROR(EOVERFLOW)); + } + + /* + * Make sure we don't extend past the end of the device + */ + if (end > max_off) { + return (SET_ERROR(EINVAL)); + } + + start = P2ROUNDUP(start, align); + end = P2ALIGN(end, bsize); + + /* + * Remove the offset so that when it's later applied again, + * the correct start value is obtained. + */ + exts->dfle_start = start - dfl->dfl_offset; + + /* + * If the original length was less than the block size + * of the device, we can end up with end < start. If that + * happens we just set the length to zero. + */ + exts->dfle_length = (end < start) ? 0 : end - start; + } + + return (0); +} + +/* + * Take a subset of extents from dfl (starting at start_idx, with n entries) + * and create a new dkioc_free_list_t, passing that to func. + */ +static int +process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + dkioc_free_list_t *new_dfl = NULL; + dkioc_free_list_ext_t *new_exts = NULL; + dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx; + size_t actual_n = n; + int r = 0; + + if (n == 0) { + return (0); + } + + /* + * Ignore any zero length extents. No known devices attach any + * semantic meaning to such extents, and are likely just a result of + * narrowing the range of the extent to fit the device alignment + * requirements. It is possible the original caller submitted a + * zero length extent, but we ignore those as well. Since we can't + * communicate partial results back to the caller anyway, it's + * unclear whether reporting that one of potentially many exents was + * too small (without being able to identify which one) to the caller + * of the DKIOCFREE request would be useful. + */ + for (uint64_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0 && --actual_n == 0) { + return (0); + } + } + + new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag); + if (new_dfl == NULL) { + return (SET_ERROR(ENOMEM)); + } + + new_dfl->dfl_flags = dfl->dfl_flags; + new_dfl->dfl_num_exts = actual_n; + new_dfl->dfl_offset = dfl->dfl_offset; + new_exts = new_dfl->dfl_exts; + + for (uint64_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0) { + continue; + } + + *new_exts++ = exts[i]; + } + + return (func(new_dfl, arg, kmflag)); +} + +/* + * If dfi_max_ext_bytes is set, use as the max segment length, + * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX + */ +#define MAX_SEGLEN(dfi) \ + (((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \ + ((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX) + +/* + * Split the extent at idx into multiple lists (calling func for each one). 
+ */ +static int +split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + ASSERT3U(idx, <, dfl->dfl_num_exts); + + const uint64_t maxlen = MAX_SEGLEN(dfi); + dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx; + uint64_t remain = ext->dfle_length; + int r; + + /* + * Break the extent into as many single requests as needed. While it + * would be possible in some circumstances to combine the final chunk + * of the extent (after splitting) with the remaining extents in the + * original request, it's not clear there's much benefit from the + * added complexity. Such behavior could be added in the future if + * it's determined to be worthwhile. + */ + while (remain > 0) { + uint64_t start = dfl->dfl_offset + ext->dfle_start; + uint64_t len = remain; + + /* + * If we know we have at least one more segment left after + * the current iteration of this loop, split it so that + * the next segment starts on an aligned boundary. + */ + if (len > maxlen) { + uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align); + len = end - start; + } + + ext->dfle_length = len; + + if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) { + return (r); + } + + ext->dfle_start += len; + remain -= len; + } + + return (0); +} diff --git a/usr/src/uts/common/os/driver_lyr.c b/usr/src/uts/common/os/driver_lyr.c index 9e5eb33dd6..d64342738b 100644 --- a/usr/src/uts/common/os/driver_lyr.c +++ b/usr/src/uts/common/os/driver_lyr.c @@ -1131,7 +1131,7 @@ ldi_usage_walker_helper(struct ldi_ident *lip, vnode_t *vp, else major = lip->li_major; - ASSERT((major >= 0) && (major < devcnt)); + ASSERT3U(major, <, devcnt); dnp = &devnamesp[major]; LOCK_DEV_OPS(&dnp->dn_lock); @@ -1258,7 +1258,7 @@ ldi_mlink_lh(vnode_t *vp, int cmd, intptr_t arg, cred_t *crp, int *rvalp) * in its internal state so that the devinfo snapshot code has some * observability into streams device linkage information. */ -void +int ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) { vnode_t *vp = fpdown->f_vnode; @@ -1267,9 +1267,13 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) major_t major; int ret; - /* if the lower stream is not a device then return */ + /* + * If the lower stream is not a device then return but claim to have + * succeeded, which matches our historical behaviour of just not + * setting up LDI in this case. + */ if (!vn_matchops(vp, spec_getvnodeops())) - return; + return (0); ASSERT(!servicing_interrupt()); @@ -1280,6 +1284,41 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) sp = VTOS(vp); csp = VTOS(sp->s_commonvp); + /* get a layered ident for the upper stream */ + if (type == LINKNORMAL) { + /* + * if the link is not persistant then we can associate + * the upper stream with a dev_t. this is because the + * upper stream is associated with a vnode, which is + * associated with a dev_t and this binding can't change + * during the life of the stream. since the link isn't + * persistant once the stream is destroyed the link is + * destroyed. so the dev_t will be valid for the life + * of the link. + */ + ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); + } else { + /* + * if the link is persistant we can only associate the + * link with a driver (and not a dev_t.) this is + * because subsequent opens of the upper device may result + * in a different stream (and dev_t) having access to + * the lower stream. 
+ * + * for example, if the upper stream is closed after the + * persistant link operation is completed, a subsequent + * open of the upper device will create a new stream which + * may have a different dev_t and an unlink operation + * can be performed using this new upper stream. + */ + VERIFY3S(type, ==, LINKPERSIST); + major = getmajor(stp->sd_vnode->v_rdev); + ret = ldi_ident_from_major(major, &li); + } + + if (ret != 0) + return (ret); + /* check if this was a plink via a layered handle */ if (lhlink) { /* @@ -1303,8 +1342,10 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) * while there may still be valid layered handles * pointing to it. */ + VERIFY3S(type, ==, LINKPERSIST); + mutex_enter(&csp->s_lock); - ASSERT(csp->s_count >= 1); + VERIFY(csp->s_count >= 1); csp->s_count++; mutex_exit(&csp->s_lock); @@ -1330,48 +1371,17 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) * mark the snode/stream as multiplexed */ mutex_enter(&sp->s_lock); - ASSERT(!(sp->s_flag & SMUXED)); + VERIFY(!(sp->s_flag & SMUXED)); sp->s_flag |= SMUXED; mutex_exit(&sp->s_lock); - /* get a layered ident for the upper stream */ - if (type == LINKNORMAL) { - /* - * if the link is not persistant then we can associate - * the upper stream with a dev_t. this is because the - * upper stream is associated with a vnode, which is - * associated with a dev_t and this binding can't change - * during the life of the stream. since the link isn't - * persistant once the stream is destroyed the link is - * destroyed. so the dev_t will be valid for the life - * of the link. - */ - ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); - } else { - /* - * if the link is persistant we can only associate the - * link with a driver (and not a dev_t.) this is - * because subsequent opens of the upper device may result - * in a different stream (and dev_t) having access to - * the lower stream. - * - * for example, if the upper stream is closed after the - * persistant link operation is compleated, a subsequent - * open of the upper device will create a new stream which - * may have a different dev_t and an unlink operation - * can be performed using this new upper stream. - */ - ASSERT(type == LINKPERSIST); - major = getmajor(stp->sd_vnode->v_rdev); - ret = ldi_ident_from_major(major, &li); - } - - ASSERT(ret == 0); (void) handle_alloc(vp, (struct ldi_ident *)li); ldi_ident_release(li); + + return (0); } -void +int ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) { struct ldi_handle *lhp; @@ -1381,31 +1391,21 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) major_t major; int ret; - /* if the lower stream is not a device then return */ + /* + * If the lower stream is not a device then return but claim to have + * succeeded, which matches our historical behaviour of just not + * setting up LDI in this case. + */ if (!vn_matchops(vp, spec_getvnodeops())) - return; + return (0); ASSERT(!servicing_interrupt()); - ASSERT((type == LINKNORMAL) || (type == LINKPERSIST)); LDI_STREAMS_LNK((CE_NOTE, "%s: unlinking streams " "stp=0x%p, fpdown=0x%p", "ldi_munlink_fp", (void *)stp, (void *)fpdown)); /* - * NOTE: here we rely on the streams subsystem not allowing - * a stream to be multiplexed more than once. if this - * changes, we break. 
- * - * mark the snode/stream as not multiplexed - */ - sp = VTOS(vp); - mutex_enter(&sp->s_lock); - ASSERT(sp->s_flag & SMUXED); - sp->s_flag &= ~SMUXED; - mutex_exit(&sp->s_lock); - - /* * clear the owner for this snode * see the comment in ldi_mlink_fp() for information about how * the ident is allocated @@ -1413,15 +1413,32 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) if (type == LINKNORMAL) { ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); } else { - ASSERT(type == LINKPERSIST); + VERIFY3S(type, ==, LINKPERSIST); major = getmajor(stp->sd_vnode->v_rdev); ret = ldi_ident_from_major(major, &li); } - ASSERT(ret == 0); + if (ret != 0) + return (ret); + + /* + * NOTE: here we rely on the streams subsystem not allowing + * a stream to be multiplexed more than once. if this + * changes, we break. + * + * mark the snode/stream as not multiplexed + */ + sp = VTOS(vp); + mutex_enter(&sp->s_lock); + VERIFY(sp->s_flag & SMUXED); + sp->s_flag &= ~SMUXED; + mutex_exit(&sp->s_lock); + lhp = handle_find(vp, (struct ldi_ident *)li); handle_release(lhp); ldi_ident_release(li); + + return (0); } /* diff --git a/usr/src/uts/common/os/errorq.c b/usr/src/uts/common/os/errorq.c index 8b41e7e8c1..cd71b9be08 100644 --- a/usr/src/uts/common/os/errorq.c +++ b/usr/src/uts/common/os/errorq.c @@ -946,7 +946,7 @@ errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep) /* * Write elements on the dump list of each nvlist errorq to the dump device. - * Upon reboot, fmd(1M) will extract and replay them for diagnosis. + * Upon reboot, fmd(8) will extract and replay them for diagnosis. */ void errorq_dump(void) diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c index c9214cec84..1051c037fa 100644 --- a/usr/src/uts/common/os/exacct.c +++ b/usr/src/uts/common/os/exacct.c @@ -1508,10 +1508,8 @@ exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res) } break; case AC_FLOW_UID: - if (fu->fu_userid >= 0) { - (void) ea_attach_item(record, &fu->fu_userid, - sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID); - } + (void) ea_attach_item(record, &fu->fu_userid, + sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID); break; case AC_FLOW_ANAME: (void) ea_attach_item(record, fu->fu_aname, diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 06e0117cd6..7ccf9b3221 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -22,6 +22,8 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -71,6 +73,7 @@ #include <sys/pool.h> #include <sys/sdt.h> #include <sys/corectl.h> +#include <sys/core.h> #include <sys/brand.h> #include <sys/libc_kernel.h> @@ -163,7 +166,7 @@ restart_init_notify(zone_t *zone) * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's - * assumption that it is being exec()'d from a virgin process. Most + * assumption that it is being exec()'d from a virgin process. 
Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which @@ -176,6 +179,7 @@ restart_init(int what, int why) kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); + proc_t *pp = p->p_zone->zone_zsched; user_t *up = PTOU(p); vnode_t *oldcd, *oldrd; @@ -187,11 +191,11 @@ restart_init(int what, int why) * zone) know that init has failed and will be restarted. */ zcmn_err(p->p_zone->zone_id, CE_WARN, - "init(1M) %s: restarting automatically", + "init(8) %s: restarting automatically", exit_reason(reason_buf, sizeof (reason_buf), what, why)); if (!INGLOBALZONE(p)) { - cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: " + cmn_err(CE_WARN, "init(8) for zone %s (pid %d) %s: " "restarting automatically", p->p_zone->zone_name, p->p_pid, reason_buf); } @@ -206,7 +210,7 @@ restart_init(int what, int why) /* * Grab p_lock and begin clearing miscellaneous global process - * state that needs to be reset before we exec the new init(1M). + * state that needs to be reset before we exec the new init(8). */ mutex_enter(&p->p_lock); @@ -270,6 +274,11 @@ restart_init(int what, int why) up->u_cwd = NULL; } + /* Reset security flags */ + mutex_enter(&pp->p_lock); + p->p_secflags = pp->p_secflags; + mutex_exit(&pp->p_lock); + mutex_exit(&p->p_lock); if (oldrd != NULL) @@ -277,6 +286,23 @@ restart_init(int what, int why) if (oldcd != NULL) VN_RELE(oldcd); + /* + * It's possible that a zone's init will have become privilege aware + * and modified privilege sets; reset them. + */ + cred_t *oldcr, *newcr; + + mutex_enter(&p->p_crlock); + oldcr = p->p_cred; + mutex_enter(&pp->p_crlock); + crhold(newcr = p->p_cred = pp->p_cred); + mutex_exit(&pp->p_crlock); + mutex_exit(&p->p_crlock); + crfree(oldcr); + /* Additional hold for the current thread - expected by crset() */ + crhold(newcr); + crset(p, newcr); + /* Free the controlling tty. (freectty() always assumes curproc.) */ ASSERT(p == curproc); (void) freectty(B_TRUE); @@ -284,7 +310,7 @@ restart_init(int what, int why) restart_init_notify(p->p_zone); /* - * Now exec() the new init(1M) on top of the current process. If we + * Now exec() the new init(8) on top of the current process. If we * succeed, the caller will treat this like a successful system call. * If we fail, we issue messages and the caller will proceed with exit. */ @@ -294,11 +320,11 @@ restart_init(int what, int why) return (0); zcmn_err(p->p_zone->zone_id, CE_WARN, - "failed to restart init(1M) (err=%d): system reboot required", err); + "failed to restart init(8) (err=%d): system reboot required", err); if (!INGLOBALZONE(p)) { - cmn_err(CE_WARN, "failed to restart init(1M) for zone %s " - "(pid %d, err=%d): zoneadm(1M) boot required", + cmn_err(CE_WARN, "failed to restart init(8) for zone %s " + "(pid %d, err=%d): zoneadm(8) boot required", p->p_zone->zone_name, p->p_pid, err); } @@ -317,7 +343,7 @@ exit(int why, int what) /* * If proc_exit() fails, then some other lwp in the process * got there first. We just have to call lwp_exit() to allow - * the other lwp to finish exiting the process. Otherwise we're + * the other lwp to finish exiting the process. Otherwise we're * restarting init, and should return. */ if (proc_exit(why, what) != 0) { @@ -330,7 +356,7 @@ exit(int why, int what) /* * Set the SEXITING flag on the process, after making sure /proc does - * not have it locked. 
This is done in more places than proc_exit(), + * not have it locked. This is done in more places than proc_exit(), * so it is a separate function. */ void @@ -445,9 +471,9 @@ zone_init_exit(zone_t *z, int why, int what) } } - /* - * The restart failed, the zone will shut down. + * The restart failed, or the criteria for a restart are not met; + * the zone will shut down. */ z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); @@ -484,7 +510,7 @@ proc_exit(int why, int what) /* * Stop and discard the process's lwps except for the current one, - * unless some other lwp beat us to it. If exitlwps() fails then + * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). */ proc_is_exiting(p); @@ -502,6 +528,13 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); + /* + * Don't let init exit unless zone_start_init() failed its exec, or + * we are shutting down the zone or the machine. + * + * Since we are single threaded, we don't need to lock the + * following accesses to zone_proc_initpid. + */ if (p->p_pid == z->zone_proc_initpid) { /* If zone's init restarts, we're done here. */ if (zone_init_exit(z, why, what)) @@ -600,6 +633,14 @@ proc_exit(int why, int what) } /* + * If we had generated any upanic(2) state, free that now. + */ + if (p->p_upanic != NULL) { + kmem_free(p->p_upanic, PRUPANIC_BUFLEN); + p->p_upanic = NULL; + } + + /* * Remove any fpollinfo_t's for this (last) thread from our file * descriptors so closeall() can ASSERT() that they're all gone. */ @@ -971,7 +1012,7 @@ proc_exit(int why, int what) * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which - * case the proc pointer is set to p0. We do so, so that the + * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * @@ -1055,7 +1096,7 @@ proc_exit(int why, int what) /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which - * signals zsched to go away). So prior to this call, we must + * signals zsched to go away). So prior to this call, we must * no longer point at zsched. */ t->t_procp = &p0; diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index ec89cb0657..f6179cf301 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017, Joyent Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -958,7 +959,22 @@ closef(file_t *fp) vp = fp->f_vnode; - error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. 
+ * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() will not have been called when + * this file descriptor was opened, and VOP_CLOSE() should not be + * called here (for a symlink, most filesystems would return ENOSYS + * anyway) + */ + if (fp->f_flag2 & (__FLXPATH >> 16)) + error = 0; + else + error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); if (count > 1) { mutex_exit(&fp->f_tlock); @@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) mutex_enter(&fp->f_tlock); fp->f_count = 1; fp->f_flag = (ushort_t)flag; - fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; + fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16; fp->f_vnode = vp; fp->f_offset = 0; fp->f_audit_data = 0; @@ -1585,7 +1601,9 @@ fsetattrat(int fd, char *path, int flags, struct vattr *vap) VN_HOLD(vp); } - if (vn_is_readonly(vp)) { + if (vp->v_type == VLNK && (vap->va_mask & AT_MODE) != 0) { + error = EOPNOTSUPP; + } else if (vn_is_readonly(vp)) { error = EROFS; } else { error = VOP_SETATTR(vp, vap, 0, CRED(), NULL); diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c index 78907db25c..2d7849e30d 100644 --- a/usr/src/uts/common/os/flock.c +++ b/usr/src/uts/common/os/flock.c @@ -28,7 +28,7 @@ /* All Rights Reserved */ /* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright 2015 Joyent, Inc. */ @@ -1122,8 +1122,8 @@ flk_process_request(lock_descriptor_t *request) } if (!request_blocked_by_active) { - lock_descriptor_t *lk[1]; - lock_descriptor_t *first_glock = NULL; + lock_descriptor_t *lk[1]; + lock_descriptor_t *first_glock = NULL; /* * Shall we grant this?! NO!! * What about those locks that were just granted and still @@ -2093,12 +2093,12 @@ flk_graph_uncolor(graph_t *gp) if (gp->mark == UINT_MAX) { gp->mark = 1; - for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); - lock = lock->l_next) + for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); + lock = lock->l_next) lock->l_color = 0; - for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp); - lock = lock->l_next) + for (lock = SLEEPING_HEAD(gp)->l_next; + lock != SLEEPING_HEAD(gp); lock = lock->l_next) lock->l_color = 0; } else { gp->mark++; @@ -4318,6 +4318,8 @@ nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset, lock->l_flock.l_pid != pid) && lock_blocks_io(op, offset, length, lock->l_type, lock->l_start, lock->l_end)) { + DTRACE_PROBE1(conflict_lock, + lock_descriptor_t *, lock); conflict = 1; break; } @@ -4467,34 +4469,34 @@ check_sleeping_locks(graph_t *gp) edge_t *ep; for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp); lock1 = lock1->l_next) { - ASSERT(!IS_BARRIER(lock1)); - for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp); - lock2 = lock2->l_next) { - if (lock1->l_vnode == lock2->l_vnode) { - if (BLOCKS(lock2, lock1)) { - ASSERT(!IS_GRANTED(lock1)); - ASSERT(!NOT_BLOCKED(lock1)); - path(lock1, lock2); + ASSERT(!IS_BARRIER(lock1)); + for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp); + lock2 = lock2->l_next) { + if (lock1->l_vnode == lock2->l_vnode) { + if (BLOCKS(lock2, lock1)) { + ASSERT(!IS_GRANTED(lock1)); + ASSERT(!NOT_BLOCKED(lock1)); + path(lock1, lock2); + } } } - } - for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp); - lock2 = lock2->l_next) { - ASSERT(!IS_BARRIER(lock1)); - if (lock1->l_vnode == lock2->l_vnode) { - if (BLOCKS(lock2, lock1)) { - ASSERT(!IS_GRANTED(lock1)); - ASSERT(!NOT_BLOCKED(lock1)); - 
path(lock1, lock2); + for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp); + lock2 = lock2->l_next) { + ASSERT(!IS_BARRIER(lock1)); + if (lock1->l_vnode == lock2->l_vnode) { + if (BLOCKS(lock2, lock1)) { + ASSERT(!IS_GRANTED(lock1)); + ASSERT(!NOT_BLOCKED(lock1)); + path(lock1, lock2); + } } } - } - ep = FIRST_ADJ(lock1); - while (ep != HEAD(lock1)) { - ASSERT(BLOCKS(ep->to_vertex, lock1)); - ep = NEXT_ADJ(ep); - } + ep = FIRST_ADJ(lock1); + while (ep != HEAD(lock1)) { + ASSERT(BLOCKS(ep->to_vertex, lock1)); + ep = NEXT_ADJ(ep); + } } } diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c index 66fe699366..bd3e5dceac 100644 --- a/usr/src/uts/common/os/fm.c +++ b/usr/src/uts/common/os/fm.c @@ -336,6 +336,7 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) c = fm_printf(d + 1, c, cols, "[...]"); break; case DATA_TYPE_UNKNOWN: + case DATA_TYPE_DONTCARE: c = fm_printf(d + 1, c, cols, "<unknown>"); break; } @@ -363,7 +364,7 @@ fm_nvprint(nvlist_t *nvl) /* * Wrapper for panic() that first produces an FMA-style message for admins. - * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this + * Normally such messages are generated by fmd(8)'s syslog-msgs agent: this * is the one exception to that rule and the only error that gets messaged. * This function is intended for use by subsystems that have detected a fatal * error and enqueued appropriate ereports and wish to then force a panic. @@ -375,9 +376,9 @@ fm_panic(const char *format, ...) va_list ap; (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) fastreboot_disable_highpil(); -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ va_start(ap, format); vpanic(format, ap); va_end(ap); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index 07fd623a95..6e2d3c403c 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -30,7 +30,7 @@ */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ #include <sys/types.h> #include <sys/inttypes.h> @@ -770,20 +770,11 @@ smmap_common(caddr_t *addrp, size_t len, } /* - * XXX - Do we also adjust maxprot based on protections - * of the vnode? E.g. if no execute permission is given - * on the vnode for the current user, maxprot probably - * should disallow PROT_EXEC also? This is different - * from the write access as this would be a per vnode - * test as opposed to a per fd test for writability. - */ - - /* - * Verify that the specified protections are not greater than - * the maximum allowable protections. Also test to make sure - * that the file descriptor does allows for read access since - * "write only" mappings are hard to do since normally we do - * the read from the file before the page can be written. + * Verify that the specified protections are not greater than the + * maximum allowable protections. Also test to make sure that the + * file descriptor allows for read access since "write only" mappings + * are hard to do since normally we do the read from the file before + * the page can be written. */ if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0) return (EACCES); diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 0a237e86ec..51a93dfa24 100644 --- a/usr/src/uts/common/os/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. - * Copyright 2019 Joyent, Inc. + * Copyright 2021 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -40,8 +40,7 @@ #include <sys/multidata.h> #include <sys/multidata_impl.h> -extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, - unsigned int sum); +extern unsigned int ip_ocsum(ushort_t *, int, unsigned int); /* * Checksum routine for Internet Protocol family headers. @@ -587,7 +586,8 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr, ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION) + return (B_FALSE); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c index 977d243400..27e30a5725 100644 --- a/usr/src/uts/common/os/kcpc.c +++ b/usr/src/uts/common/os/kcpc.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ #include <sys/param.h> @@ -74,9 +76,10 @@ static uint32_t kcpc_nullctx_count; /* # overflows in a thread with no ctx */ */ static int kcpc_nullctx_panic = 0; -static void kcpc_lwp_create(kthread_t *t, kthread_t *ct); -static void kcpc_restore(kcpc_ctx_t *ctx); -static void kcpc_save(kcpc_ctx_t *ctx); +static void kcpc_save(void *); +static void kcpc_restore(void *); +static void kcpc_lwp_create(void *, void *); +static void kcpc_free(void *, int); static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx); static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch); static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set); @@ -111,6 +114,14 @@ extern int kcpc_hw_load_pcbe(void); */ static int kcpc_pcbe_error = 0; +static const struct ctxop_template kcpc_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = kcpc_save, + .ct_restore = kcpc_restore, + .ct_lwp_create = kcpc_lwp_create, + .ct_free = kcpc_free, +}; + /* * Perform one-time initialization of kcpc framework. * This function performs the initialization only the first time it is called. @@ -317,8 +328,7 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) /* * Add a device context to the subject thread. */ - installctx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free); + ctxop_install(t, &kcpc_ctxop_tpl, ctx); /* * Ask the backend to program the hardware. @@ -546,7 +556,7 @@ kcpc_unbind(kcpc_set_t *set) t = ctx->kc_thread; /* * The context is thread-bound and therefore has a device - * context. It will be freed via removectx() calling + * context. It will be freed via ctxop_remove() calling * freectx() calling kcpc_free(). */ if (t == curthread) { @@ -559,15 +569,7 @@ kcpc_unbind(kcpc_set_t *set) splx(save_spl); kpreempt_enable(); } -#ifdef DEBUG - if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free) == 0) - panic("kcpc_unbind: context %p not preset on thread %p", - (void *)ctx, (void *)t); -#else - (void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free); -#endif /* DEBUG */ + VERIFY3U(ctxop_remove(t, &kcpc_ctxop_tpl, ctx), !=, 0); t->t_cpc_set = NULL; t->t_cpc_ctx = NULL; } else { @@ -1214,8 +1216,9 @@ kcpc_overflow_ast() * Called when switching away from current thread. 
*/ static void -kcpc_save(kcpc_ctx_t *ctx) +kcpc_save(void *arg) { + kcpc_ctx_t *ctx = arg; int err; int save_spl; @@ -1263,8 +1266,9 @@ kcpc_save(kcpc_ctx_t *ctx) } static void -kcpc_restore(kcpc_ctx_t *ctx) +kcpc_restore(void *arg) { + kcpc_ctx_t *ctx = arg; int save_spl; mutex_enter(&ctx->kc_lock); @@ -1323,9 +1327,11 @@ kcpc_restore(kcpc_ctx_t *ctx) * it is switched off. */ /*ARGSUSED*/ -void -kcpc_idle_save(struct cpu *cp) +static void +kcpc_idle_save(void *arg) { + struct cpu *cp = arg; + /* * The idle thread shouldn't be run anywhere else. */ @@ -1347,9 +1353,11 @@ kcpc_idle_save(struct cpu *cp) mutex_exit(&cp->cpu_cpc_ctxlock); } -void -kcpc_idle_restore(struct cpu *cp) +static void +kcpc_idle_restore(void *arg) { + struct cpu *cp = arg; + /* * The idle thread shouldn't be run anywhere else. */ @@ -1371,10 +1379,23 @@ kcpc_idle_restore(struct cpu *cp) mutex_exit(&cp->cpu_cpc_ctxlock); } +static const struct ctxop_template kcpc_idle_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = kcpc_idle_save, + .ct_restore = kcpc_idle_restore, +}; + +void +kcpc_idle_ctxop_install(kthread_t *t, struct cpu *cp) +{ + ctxop_install(t, &kcpc_idle_ctxop_tpl, cp); +} + /*ARGSUSED*/ static void -kcpc_lwp_create(kthread_t *t, kthread_t *ct) +kcpc_lwp_create(void *parent, void *child) { + kthread_t *t = parent, *ct = child; kcpc_ctx_t *ctx = t->t_cpc_ctx, *cctx; int i; @@ -1423,8 +1444,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) aston(ct); } - installctx(ct, cctx, kcpc_save, kcpc_restore, - NULL, kcpc_lwp_create, NULL, kcpc_free); + ctxop_install(ct, &kcpc_ctxop_tpl, cctx); } /* @@ -1461,8 +1481,9 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) /*ARGSUSED*/ void -kcpc_free(kcpc_ctx_t *ctx, int isexec) +kcpc_free(void *arg, int isexec) { + kcpc_ctx_t *ctx = arg; int i; kcpc_set_t *set = ctx->kc_set; @@ -1543,6 +1564,12 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) kcpc_free_set(set); } +void +kcpc_free_cpu(kcpc_ctx_t *ctx) +{ + kcpc_free(ctx, 0); +} + /* * Free the memory associated with a request set. */ diff --git a/usr/src/uts/common/os/klpd.c b/usr/src/uts/common/os/klpd.c index 8592b47021..0879f791b5 100644 --- a/usr/src/uts/common/os/klpd.c +++ b/usr/src/uts/common/os/klpd.c @@ -1150,7 +1150,7 @@ check_user_privs(const cred_t *cr, const priv_set_t *set) out: if (da.rbuf != (char *)&res) kmem_free(da.rbuf, da.rsize); -out1: + kmem_free(pap, pasize); klpd_rele(pfd); return (err); diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index d12928acc3..4d2c1e6c10 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ /* @@ -2250,7 +2251,7 @@ kmem_dumppr(char **pp, char *e, const char *format, ...) } /* - * Called when dumpadm(1M) configures dump parameters. + * Called when dumpadm(8) configures dump parameters. */ void kmem_dump_init(size_t size) @@ -4462,8 +4463,7 @@ kmem_init(void) if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS | KMF_LITE)) != 0) || ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE)) - cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. 
" - "See the Solaris Tunable Parameters Reference Manual.", + cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x.", kmem_flags); #ifdef DEBUG @@ -4481,8 +4481,7 @@ kmem_init(void) (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0) cmn_err(CE_WARN, "High-overhead kmem debugging features " "enabled (kmem_flags = 0x%x). Performance degradation " - "and large memory overhead possible. See the Solaris " - "Tunable Parameters Reference Manual.", kmem_flags); + "and large memory overhead possible.", kmem_flags); #endif /* not DEBUG */ kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP); @@ -4530,8 +4529,21 @@ void kmem_thread_init(void) { kmem_move_init(); + + /* + * This taskq is used for various kmem maintenance functions, including + * kmem_reap(). When maintenance is required on every cache, + * kmem_cache_applyall() dispatches one task per cache onto this queue. + * + * In the case of kmem_reap(), the system may be under increasingly + * dire memory pressure and may not be able to allocate a new task + * entry. The count of entries to prepopulate (below) should cover at + * least as many caches as we generally expect to exist on the system + * so that they may all be scheduled for reaping under those + * conditions. + */ kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri, - 300, INT_MAX, TASKQ_PREPOPULATE); + 600, INT_MAX, TASKQ_PREPOPULATE); } void @@ -5351,7 +5363,7 @@ kmem_cache_scan(kmem_cache_t *cp) } if (kmem_cache_is_fragmented(cp, &reap)) { - size_t slabs_found; + int slabs_found; /* * Consolidate reclaimable slabs from the end of the partial diff --git a/usr/src/uts/common/os/ksensor.c b/usr/src/uts/common/os/ksensor.c new file mode 100644 index 0000000000..7dd4a22c8a --- /dev/null +++ b/usr/src/uts/common/os/ksensor.c @@ -0,0 +1,871 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Oxide Computer Company + */ + +/* + * Kernel Sensor Framework + * + * The kernel sensor framework exists to provide a simple and straightforward + * means for various parts of the system to declare and instantiate sensor + * information. Between this and the ksensor character device + * (uts/common/io/ksensor/ksensor_drv.c) this exposes per-device sensors and + * character devices. + * + * -------------------------- + * Driver and User Interfaces + * -------------------------- + * + * Each sensor that is registered with the framework is exposed as a character + * device under /dev/sensors. The device class and node name are often ':' + * delineated and must begin with 'ddi_sensor'. Everything after 'ddi_sensor' + * will be created in a directory under /dev/sensors. So for example the Intel + * PCH driver uses a class "ddi_sensor:temperature:pch" and a node name of + * 'ts.%d'. This creates the node /dev/sensors/temperature/pch/ts.0. The + * devfsadm plugin automatically handles the creation of directories which makes + * the addition of additional sensor types easy to create. + * + * Strictly speaking, any device can manage their own sensors and minor nodes by + * using the appropriate class and implementing the corresponding ioctls. 
That
+ * was how the first kernel sensors were written; however, there are a number
+ * of issues with that approach which led to this design:
+ *
+ * 1. Every driver had to actually implement character devices.
+ *
+ * 2. Every driver had to duplicate a lot of the logic around open(9E),
+ *    close(9E), and ioctl(9E).
+ *
+ * 3. Drivers that tied into frameworks like mac(9E) or SCSAv3 needed a lot
+ *    more work to fit into this model. For example, because the minor state
+ *    is shared between all the instances and the frameworks, they would have
+ *    required shared, global state that they don't have today.
+ *
+ * Ultimately, having an operations vector and a callback argument makes work a
+ * lot simpler for the producers of sensor data, and that simplicity makes it
+ * worthwhile to take on the additional effort here.
+ *
+ * ----------
+ * Components
+ * ----------
+ *
+ * The ksensor framework is made up of a couple of different pieces:
+ *
+ * 1. This glue that is a part of genunix.
+ * 2. The ksensor character device driver.
+ * 3. Sensor providers, which are generally drivers that register with the
+ *    ksensor framework.
+ *
+ * The implementation of (1) is all in this file. The implementation of (2) is
+ * in uts/common/io/ksensor/ksensor_drv.c. The implementation of (3) is found in
+ * all of the different leaf devices. Examples of (3) include pchtemp(4D) and
+ * igb(4D).
+ *
+ * We separate numbers one and two into two different components for a few
+ * reasons. The most important one is that drivers that provide sensors should
+ * not depend on some other part of the system having been loaded. This makes a
+ * compelling argument for it being a part of the core kernel. However, like
+ * other subsystems (e.g. kstats, smbios, etc.), it's useful to separate the
+ * thing that provides the interface to users from the thing that is used to
+ * glue together providers in the kernel. There's the added benefit that it's
+ * practically simpler to spin up a pseudo-device through a module.
+ *
+ * The ksensor character device driver (2) registers with the main genunix
+ * ksensor code (1) when it attaches and when it detaches. The kernel only
+ * allows a single driver to be attached to it. When that character device
+ * driver attaches, the ksensor framework will walk through all of the currently
+ * registered sensors and inform the character device driver of the nodes that
+ * it needs to create. While the character device driver is attached, the
+ * ksensor framework will also call back into it when a sensor needs to be
+ * removed.
+ *
+ * Generally speaking, this division of responsibilities allows the kernel
+ * sensor character device driver to attach and detach without impacting the
+ * sensor providers or even notifying them; it's all transparent to them.
+ *
+ * ------------------------------
+ * Sensor Lifetime and detach(9E)
+ * ------------------------------
+ *
+ * Traditionally, a device driver may be detached by the broader kernel whenever
+ * the kernel desires it. On debug builds this is done by a dedicated thread. On
+ * a non-debug build this may happen due to memory pressure or as an attempt to
+ * reclaim idle resources (though this is much less common). However, when the
+ * module is detached, the system remembers that minor nodes previously existed
+ * and that entries in /devices had been created. When something accesses an
+ * entry in /devices again, the system will use that to bring the driver
+ * back to life.
It doesn't matter whether it's a pseudo-device driver or + * something else, this can happen. + * + * One downside to the sensor framework, is that we need to emulate this + * behavior which leads to some amount of complexity here. But this is a + * worthwhile tradeoff as it makes things much simpler for providers and it's + * not too hard for us to emulate this behavior. + * + * When a sensor provider registers the sensor, the sensor becomes available to + * the system. When the sensor provider unregisters with the system, which + * happens during its detach routine, then we note that it has been detached; + * however, we don't delete its minor node and if something accesses it, we + * attempt to load the driver again, the same way that devfs (the file system + * behind /devices) does. + * + * For each dev_info_t that registers a sensor we register a callback such that + * when the device is removed, e.g. someone called rem_drv or physically pulls + * the device, then we'll be able to finally clean up the device. This lifetime + * can be represented in the following image: + * + * | + * | + * +-----<-------------------------------------+ + * | | + * | . . call ksensor_create() | + * v | + * +-------+ | + * | Valid | | + * +-------+ | + * | ^ + * | . . call ksensor_remove() | + * v | + * +---------+ | + * | Invalid | | + * +---------+ | + * | | | + * | | . . user uses sensor again | + * | | | + * | +-------------------+ | + * | | | + * | v | + * | +---------------+ | + * | | Attatching... |-->---------+ + * | +---------------+ + * | . . ddi unbind cb | + * | | + * v | . . attatch fails or + * +---------+ | no call to ksensor_create() + * | Deleted |--<---------------+ again + * +---------+ + * + * When the DDI unbind callback is called, we know that the device is going to + * be removed. However, this happens within a subtle context with a majority of + * the device tree held (at least the dip's parent). In particular, another + * thread may be trying to obtain a hold on it and be blocked in + * ndi_devi_enter(). As the callback thread holds that, that could lead to a + * deadlock. As a result, we clean things up in two phases. One during the + * synchronous callback and the other via a taskq. In the first phase we + * logically do the following: + * + * o Remove the dip from the list of ksensor dips and set the flag that + * indicates that it's been removed. + * o Remove all of the sensors from the global avl to make sure that new + * threads cannot look it up. + * + * Then, after the taskq is dispatched, we do the following in taskq context: + * + * o Tell the ksensor driver that it should remove the minor node. + * o Block on each sensor until it is no-longer busy and then clean it up. + * o Clean up the ksensor_dip_t. + * + * ------------------ + * Accessing a Sensor + * ------------------ + * + * Access to a particular sensor is serialized in the system. In addition to + * that, a number of steps are required to access one that is not unlike + * accessing a character device. When a given sensor is held the KSENSOR_F_BUSY + * flag is set in the ksensor_flags member. In addition, as part of taking a + * hold a number of side effects occur that ensure that the sensor provider's + * dev_info_t is considered busy and can't be detached. + * + * To obtain a hold on a sensor the following logical steps are required (see + * ksensor_hold_by_id() for the implementation): + * + * 1. Map the minor to the ksensor_t via the avl tree + * 2. Check that the ksensor's dip is valid + * 3. 
If the sensor is busy, wait until it is no longer so, and restart from
+ *    the top. Otherwise, mark the sensor as busy.
+ * 4. Enter the parent and place a hold on the sensor provider's dip.
+ * 5. Once again check if the dip is removed or not because we have to drop
+ *    locks during that operation.
+ * 6. Check if the ksensor has the valid flag set. If not, attempt to configure
+ *    the dip.
+ * 7. Assuming the sensor is now valid, we can return it.
+ *
+ * After this point, the sensor is considered valid for use. Once the consumer
+ * is finished with the sensor, it should be released by calling
+ * ksensor_release().
+ *
+ * An important aspect of the above scheme is that the KSENSOR_F_BUSY flag is
+ * required to progress through the validation and holding of the device. This
+ * makes sure that only one thread is attempting to attach it at a given time. A
+ * reasonable future optimization would be to amortize this cost in open(9E)
+ * and close(9E) of the minor and to bump a count while it is being referenced,
+ * for as long as it is open.
+ *
+ * -----------------------------
+ * Character Device Registration
+ * -----------------------------
+ *
+ * The 'ksensor' character device driver can come and go. To support this, the
+ * ksensor framework communicates with the ksensor character device through a
+ * well-defined set of callbacks, used to indicate sensor addition and removal.
+ * The ksensor character device is found in uts/common/io/ksensor/ksensor_drv.c
+ * and is responsible for creating and destroying minor nodes.
+ *
+ * Each ksensor_t has a flag, KSENSOR_F_NOTIFIED, that is used to indicate
+ * whether or not the registered driver has been notified of the sensor. When a
+ * callback is first registered, we'll walk through the entire list of nodes to
+ * make sure that its minor has been created. When unregistering, the minor node
+ * remove callback will not be called; however, this can generally be dealt with
+ * by calling something like ddi_remove_minor_node(dip, NULL).
+ *
+ * -------
+ * Locking
+ * -------
+ *
+ * The following rules apply to dealing with lock ordering:
+ *
+ * 1. The global ksensor_g_mutex protects all global data and must be taken
+ *    before a ksensor_t's individual mutex.
+ *
+ * 2. A thread should not hold any two ksensor_t mutexes at any time.
+ *
+ * 3. No locks should be held when attempting to grab or manipulate a
+ *    dev_info_t, e.g. ndi_devi_enter().
+ *
+ * 4. Unless the ksensor is actively being held, whenever a ksensor is found,
+ *    one must check whether the ksensor_dip_t flag KSENSOR_DIP_F_REMOVED is
+ *    set or not and whether the ksensor_t's KSENSOR_F_VALID flag is set.
+ */
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/esunddi.h>
+#include <sys/ksensor_impl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/pci.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/stddef.h>
+#include <sys/sysmacros.h>
+#include <sys/fs/dv_node.h>
+
+typedef enum {
+	/*
+	 * This flag indicates that the subscribing ksensor character device has
+	 * been notified about this sensor.
+	 */
+	KSENSOR_F_NOTIFIED = 1 << 0,
+	/*
+	 * This indicates that the sensor is currently valid, meaning that the
+	 * ops vector and argument are safe to use. This is removed when a
+	 * driver with a sensor is detached.
+ */ + KSENSOR_F_VALID = 1 << 1, + /* + * Indicates that a client has a hold on the sensor for some purpose. + * This must be set before trying to get an NDI hold. Once this is set + * and a NDI hold is in place, it is safe to use the operations vector + * and argument. + */ + KSENSOR_F_BUSY = 1 << 2, +} ksensor_flags_t; + +typedef enum { + KSENSOR_DIP_F_REMOVED = 1 << 0 +} ksensor_dip_flags_t; + +typedef struct { + list_node_t ksdip_link; + ksensor_dip_flags_t ksdip_flags; + dev_info_t *ksdip_dip; + ddi_unbind_callback_t ksdip_cb; + list_t ksdip_sensors; +} ksensor_dip_t; + +typedef struct { + kmutex_t ksensor_mutex; + kcondvar_t ksensor_cv; + ksensor_flags_t ksensor_flags; + list_node_t ksensor_dip_list; + avl_node_t ksensor_id_avl; + uint_t ksensor_nwaiters; + ksensor_dip_t *ksensor_ksdip; + char *ksensor_name; + char *ksensor_class; + id_t ksensor_id; + const ksensor_ops_t *ksensor_ops; + void *ksensor_arg; +} ksensor_t; + +static kmutex_t ksensor_g_mutex; +static id_space_t *ksensor_ids; +static list_t ksensor_dips; +static avl_tree_t ksensor_avl; +static dev_info_t *ksensor_cb_dip; +static ksensor_create_f ksensor_cb_create; +static ksensor_remove_f ksensor_cb_remove; + +static int +ksensor_avl_compare(const void *l, const void *r) +{ + const ksensor_t *kl = l; + const ksensor_t *kr = r; + + if (kl->ksensor_id > kr->ksensor_id) { + return (1); + } else if (kl->ksensor_id < kr->ksensor_id) { + return (-1); + } else { + return (0); + } +} + +static ksensor_t * +ksensor_find_by_id(id_t id) +{ + ksensor_t k, *ret; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + + k.ksensor_id = id; + return (avl_find(&ksensor_avl, &k, NULL)); + +} + +static ksensor_t * +ksensor_search_ksdip(ksensor_dip_t *ksdip, const char *name, const char *class) +{ + ksensor_t *s; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + + for (s = list_head(&ksdip->ksdip_sensors); s != NULL; + s = list_next(&ksdip->ksdip_sensors, s)) { + if (strcmp(s->ksensor_name, name) == 0 && + strcmp(s->ksensor_class, class) == 0) { + return (s); + } + } + + return (NULL); +} + +static void +ksensor_free_sensor(ksensor_t *sensor) +{ + strfree(sensor->ksensor_name); + strfree(sensor->ksensor_class); + id_free(ksensor_ids, sensor->ksensor_id); + mutex_destroy(&sensor->ksensor_mutex); + kmem_free(sensor, sizeof (ksensor_t)); +} + +static void +ksensor_free_dip(ksensor_dip_t *ksdip) +{ + list_destroy(&ksdip->ksdip_sensors); + kmem_free(ksdip, sizeof (ksensor_dip_t)); +} + +static void +ksensor_dip_unbind_taskq(void *arg) +{ + ksensor_dip_t *k = arg; + ksensor_t *sensor; + + /* + * First notify an attached driver that the nodes are going away + * before we block and wait on them. + */ + mutex_enter(&ksensor_g_mutex); + for (sensor = list_head(&k->ksdip_sensors); sensor != NULL; + sensor = list_next(&k->ksdip_sensors, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + if (sensor->ksensor_flags & KSENSOR_F_NOTIFIED) { + ksensor_cb_remove(sensor->ksensor_id, + sensor->ksensor_name); + sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED; + } + mutex_exit(&sensor->ksensor_mutex); + } + mutex_exit(&ksensor_g_mutex); + + /* + * Now that the driver has destroyed its minor, wait for anything that's + * still there. 
+ */ + while ((sensor = list_remove_head(&k->ksdip_sensors)) != NULL) { + mutex_enter(&sensor->ksensor_mutex); + while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0 || + sensor->ksensor_nwaiters > 0) { + cv_wait(&sensor->ksensor_cv, &sensor->ksensor_mutex); + } + mutex_exit(&sensor->ksensor_mutex); + ksensor_free_sensor(sensor); + } + ksensor_free_dip(k); +} + +static void +ksensor_dip_unbind_cb(void *arg, dev_info_t *dip) +{ + ksensor_dip_t *k = arg; + ksensor_t *sensor; + + /* + * Remove the dip and the associated sensors from global visibility. + * This will ensure that no new clients can find this; however, others + * may have extent attempts to grab it (but lost the race in an NDI + * hold). + */ + mutex_enter(&ksensor_g_mutex); + list_remove(&ksensor_dips, k); + k->ksdip_flags |= KSENSOR_DIP_F_REMOVED; + for (sensor = list_head(&k->ksdip_sensors); sensor != NULL; + sensor = list_next(&k->ksdip_sensors, sensor)) { + avl_remove(&ksensor_avl, sensor); + } + mutex_exit(&ksensor_g_mutex); + + (void) taskq_dispatch(system_taskq, ksensor_dip_unbind_taskq, k, + TQ_SLEEP); +} + +static ksensor_dip_t * +ksensor_dip_create(dev_info_t *dip) +{ + ksensor_dip_t *k; + + k = kmem_zalloc(sizeof (ksensor_dip_t), KM_SLEEP); + k->ksdip_dip = dip; + k->ksdip_cb.ddiub_cb = ksensor_dip_unbind_cb; + k->ksdip_cb.ddiub_arg = k; + list_create(&k->ksdip_sensors, sizeof (ksensor_t), + offsetof(ksensor_t, ksensor_dip_list)); + e_ddi_register_unbind_callback(dip, &k->ksdip_cb); + + return (k); +} + +static ksensor_dip_t * +ksensor_dip_find(dev_info_t *dip) +{ + ksensor_dip_t *k; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + for (k = list_head(&ksensor_dips); k != NULL; + k = list_next(&ksensor_dips, k)) { + if (dip == k->ksdip_dip) { + return (k); + } + } + + return (NULL); +} + +int +ksensor_create(dev_info_t *dip, const ksensor_ops_t *ops, void *arg, + const char *name, const char *class, id_t *idp) +{ + ksensor_dip_t *ksdip; + ksensor_t *sensor; + + if (dip == NULL || ops == NULL || name == NULL || class == NULL || + idp == NULL) { + return (EINVAL); + } + + if (!DEVI_IS_ATTACHING(dip)) { + return (EAGAIN); + } + + mutex_enter(&ksensor_g_mutex); + ksdip = ksensor_dip_find(dip); + if (ksdip == NULL) { + ksdip = ksensor_dip_create(dip); + list_insert_tail(&ksensor_dips, ksdip); + } + + sensor = ksensor_search_ksdip(ksdip, name, class); + if (sensor != NULL) { + ASSERT3P(sensor->ksensor_ksdip, ==, ksdip); + if ((sensor->ksensor_flags & KSENSOR_F_VALID) != 0) { + mutex_exit(&ksensor_g_mutex); + dev_err(dip, CE_WARN, "tried to create sensor %s:%s " + "which is currently active", class, name); + return (EEXIST); + } + + sensor->ksensor_ops = ops; + sensor->ksensor_arg = arg; + } else { + sensor = kmem_zalloc(sizeof (ksensor_t), KM_SLEEP); + sensor->ksensor_ksdip = ksdip; + sensor->ksensor_name = ddi_strdup(name, KM_SLEEP); + sensor->ksensor_class = ddi_strdup(class, KM_SLEEP); + sensor->ksensor_id = id_alloc(ksensor_ids); + sensor->ksensor_ops = ops; + sensor->ksensor_arg = arg; + list_insert_tail(&ksdip->ksdip_sensors, sensor); + avl_add(&ksensor_avl, sensor); + } + + sensor->ksensor_flags |= KSENSOR_F_VALID; + + if (ksensor_cb_create != NULL) { + + if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class, + sensor->ksensor_name) == 0) { + sensor->ksensor_flags |= KSENSOR_F_NOTIFIED; + } + } + + *idp = sensor->ksensor_id; + mutex_exit(&ksensor_g_mutex); + + return (0); +} + +int +ksensor_create_scalar_pcidev(dev_info_t *dip, uint_t kind, + const ksensor_ops_t *ops, void *arg, const char *name, id_t *idp) +{ + 
char *pci_name, *type; + const char *class; + int *regs, ret; + uint_t nregs; + uint16_t bus, dev; + + switch (kind) { + case SENSOR_KIND_TEMPERATURE: + class = "ddi_sensor:temperature:pci"; + break; + case SENSOR_KIND_VOLTAGE: + class = "ddi_sensor:voltage:pci"; + break; + case SENSOR_KIND_CURRENT: + class = "ddi_sensor:current:pci"; + break; + default: + return (ENOTSUP); + } + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, "device_type", + &type) != DDI_PROP_SUCCESS) { + return (EINVAL); + } + + if (strcmp(type, "pciex") != 0 && strcmp(type, "pci") != 0) { + ddi_prop_free(type); + return (EINVAL); + } + ddi_prop_free(type); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, "reg", + ®s, &nregs) != DDI_PROP_SUCCESS) { + return (EINVAL); + } + + if (nregs < 1) { + ddi_prop_free(regs); + return (EIO); + } + + bus = PCI_REG_BUS_G(regs[0]); + dev = PCI_REG_DEV_G(regs[0]); + ddi_prop_free(regs); + + pci_name = kmem_asprintf("%x.%x:%s", bus, dev, name); + + ret = ksensor_create(dip, ops, arg, pci_name, class, idp); + strfree(pci_name); + return (ret); +} + +/* + * When a driver removes a sensor, we basically mark it as invalid. This happens + * because drivers can detach and we will need to reattach them when the sensor + * is used again. + */ +int +ksensor_remove(dev_info_t *dip, id_t id) +{ + ksensor_dip_t *kdip; + ksensor_t *sensor; + + if (!DEVI_IS_ATTACHING(dip) && !DEVI_IS_DETACHING(dip)) { + return (EAGAIN); + } + + mutex_enter(&ksensor_g_mutex); + kdip = ksensor_dip_find(dip); + if (kdip == NULL) { + mutex_exit(&ksensor_g_mutex); + return (ENOENT); + } + + for (sensor = list_head(&kdip->ksdip_sensors); sensor != NULL; + sensor = list_next(&kdip->ksdip_sensors, sensor)) { + if (sensor->ksensor_id == id || id == KSENSOR_ALL_IDS) { + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_VALID; + sensor->ksensor_ops = NULL; + sensor->ksensor_arg = NULL; + mutex_exit(&sensor->ksensor_mutex); + } + } + mutex_exit(&ksensor_g_mutex); + return (0); +} + +static void +ksensor_release(ksensor_t *sensor) +{ + int circ; + dev_info_t *pdip; + + ddi_release_devi(sensor->ksensor_ksdip->ksdip_dip); + + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_BUSY; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); +} + +static int +ksensor_hold_by_id(id_t id, ksensor_t **outp) +{ + int circ; + ksensor_t *sensor; + dev_info_t *pdip; + +restart: + mutex_enter(&ksensor_g_mutex); + sensor = ksensor_find_by_id(id); + if (sensor == NULL) { + mutex_exit(&ksensor_g_mutex); + *outp = NULL; + return (ESTALE); + } + + if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) { + mutex_exit(&ksensor_g_mutex); + *outp = NULL; + return (ESTALE); + } + + mutex_enter(&sensor->ksensor_mutex); + if ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) { + mutex_exit(&ksensor_g_mutex); + sensor->ksensor_nwaiters++; + while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) { + int cv = cv_wait_sig(&sensor->ksensor_cv, + &sensor->ksensor_mutex); + if (cv == 0) { + sensor->ksensor_nwaiters--; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); + *outp = NULL; + return (EINTR); + } + } + sensor->ksensor_nwaiters--; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); + goto restart; + } + + /* + * We have obtained ownership of the sensor. At this point, we should + * check to see if it's valid or not. 
+ */ + sensor->ksensor_flags |= KSENSOR_F_BUSY; + pdip = ddi_get_parent(sensor->ksensor_ksdip->ksdip_dip); + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + + /* + * Grab a reference on the device node to ensure that it won't go away. + */ + ndi_devi_enter(pdip, &circ); + e_ddi_hold_devi(sensor->ksensor_ksdip->ksdip_dip); + ndi_devi_exit(pdip, circ); + + /* + * Now that we have an NDI hold, check if it's valid or not. It may have + * become invalid while we were waiting due to a race. + */ + mutex_enter(&ksensor_g_mutex); + if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) { + mutex_exit(&ksensor_g_mutex); + ksensor_release(sensor); + return (ESTALE); + } + + mutex_enter(&sensor->ksensor_mutex); + if ((sensor->ksensor_flags & KSENSOR_F_VALID) == 0) { + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + (void) ndi_devi_config(pdip, NDI_NO_EVENT); + mutex_enter(&ksensor_g_mutex); + mutex_enter(&sensor->ksensor_mutex); + + /* + * If we attempted to reattach it and it isn't now valid, fail + * this request. + */ + if ((sensor->ksensor_ksdip->ksdip_flags & + KSENSOR_DIP_F_REMOVED) != 0 || + (sensor->ksensor_flags & KSENSOR_F_VALID) == 0) { + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + ksensor_release(sensor); + return (ESTALE); + } + } + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + *outp = sensor; + + return (0); +} + +int +ksensor_op_kind(id_t id, sensor_ioctl_kind_t *kind) +{ + int ret; + ksensor_t *sensor; + + if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) { + return (ret); + } + + ret = sensor->ksensor_ops->kso_kind(sensor->ksensor_arg, kind); + ksensor_release(sensor); + + return (ret); +} + +int +ksensor_op_scalar(id_t id, sensor_ioctl_scalar_t *scalar) +{ + int ret; + ksensor_t *sensor; + + if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) { + return (ret); + } + + ret = sensor->ksensor_ops->kso_scalar(sensor->ksensor_arg, scalar); + ksensor_release(sensor); + + return (ret); +} + +void +ksensor_unregister(dev_info_t *reg_dip) +{ + ksensor_t *sensor; + + mutex_enter(&ksensor_g_mutex); + if (ksensor_cb_dip != reg_dip) { + dev_err(reg_dip, CE_PANIC, "asked to unregister illegal dip"); + } + + for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor = + AVL_NEXT(&ksensor_avl, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED; + mutex_exit(&sensor->ksensor_mutex); + } + + ksensor_cb_dip = NULL; + ksensor_cb_create = NULL; + ksensor_cb_remove = NULL; + mutex_exit(&ksensor_g_mutex); +} + +int +ksensor_register(dev_info_t *reg_dip, ksensor_create_f create, + ksensor_remove_f remove) +{ + ksensor_t *sensor; + + mutex_enter(&ksensor_g_mutex); + if (ksensor_cb_dip != NULL) { + dev_err(reg_dip, CE_WARN, "kernel sensors are already " + "registered"); + mutex_exit(&ksensor_g_mutex); + return (EEXIST); + } + + ksensor_cb_dip = reg_dip; + ksensor_cb_create = create; + ksensor_cb_remove = remove; + + for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor = + AVL_NEXT(&ksensor_avl, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + ASSERT0(sensor->ksensor_flags & KSENSOR_F_NOTIFIED); + + if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class, + sensor->ksensor_name) == 0) { + sensor->ksensor_flags |= KSENSOR_F_NOTIFIED; + } + + mutex_exit(&sensor->ksensor_mutex); + } + + mutex_exit(&ksensor_g_mutex); + + return (0); +} + +int +ksensor_kind_temperature(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = 
SENSOR_KIND_TEMPERATURE; + return (0); +} + +int +ksensor_kind_current(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = SENSOR_KIND_CURRENT; + return (0); +} + +int +ksensor_kind_voltage(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = SENSOR_KIND_VOLTAGE; + return (0); +} + +void +ksensor_init(void) +{ + mutex_init(&ksensor_g_mutex, NULL, MUTEX_DRIVER, NULL); + list_create(&ksensor_dips, sizeof (ksensor_dip_t), + offsetof(ksensor_dip_t, ksdip_link)); + ksensor_ids = id_space_create("ksensor", 1, L_MAXMIN32); + avl_create(&ksensor_avl, ksensor_avl_compare, sizeof (ksensor_t), + offsetof(ksensor_t, ksensor_id_avl)); +} diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index f3404a1cdf..31b0cf7e0d 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -1449,8 +1449,8 @@ lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) * Remove memory node from lgroup. */ lgrp->lgrp_mnodes &= ~mnodes_mask; + ASSERT(lgrp->lgrp_nmnodes > 0); lgrp->lgrp_nmnodes--; - ASSERT(lgrp->lgrp_nmnodes >= 0); } ASSERT(lgrp_root->lgrp_nmnodes > 0); @@ -2160,8 +2160,8 @@ lpl_topo_verify(cpupart_t *cpupart) /* do the parent lgroups exist and do they match? */ if (lgrp->lgrp_parent) { - ASSERT(lpl->lpl_parent); - ASSERT(lgrp->lgrp_parent->lgrp_id == + ASSERT(lpl->lpl_parent != NULL && + lgrp->lgrp_parent->lgrp_id == lpl->lpl_parent->lpl_lgrpid); if (!lpl->lpl_parent) { @@ -4100,12 +4100,13 @@ lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, lgrp_shm_policy_seg_t *newseg; avl_index_t where; - ASSERT(seg != NULL); - ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); + ASSERT(seg != NULL && (off >= seg->shm_off && + off <= seg->shm_off + seg->shm_size)); - if (!seg || off < seg->shm_off || off > seg->shm_off + - seg->shm_size) + if (!seg || off < seg->shm_off || + off > seg->shm_off + seg->shm_size) { return (NULL); + } if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) return (seg); diff --git a/usr/src/uts/common/os/log_sysevent.c b/usr/src/uts/common/os/log_sysevent.c index 35e0048ee7..50dc5dfd82 100644 --- a/usr/src/uts/common/os/log_sysevent.c +++ b/usr/src/uts/common/os/log_sysevent.c @@ -1277,7 +1277,7 @@ get_registration(sysevent_channel_descriptor_t *chan, char *databuf, class_lst_t *clist; subclass_lst_t *sc_list; - if (class_index < 0 || class_index > CLASS_HASH_SZ) + if (class_index > CLASS_HASH_SZ) return (EINVAL); if ((clist = chan->scd_class_list_tbl[class_index]) == NULL) { @@ -1395,10 +1395,15 @@ log_sysevent_register(char *channel_name, char *udatabuf, se_pubsub_t *udata) case SE_CLOSE_REGISTRATION: close_channel(kchannel); break; - case SE_BIND_REGISTRATION: - if ((kdata.ps_id = bind_common(chan, kdata.ps_type)) <= 0) + case SE_BIND_REGISTRATION: { + id_t id; + + id = bind_common(chan, kdata.ps_type); + kdata.ps_id = (uint32_t)id; + if (id <= 0) error = EBUSY; break; + } case SE_UNBIND_REGISTRATION: (void) unbind_common(chan, kdata.ps_type, (id_t)kdata.ps_id); break; diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 9e58a7bb56..6a922343e7 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -20,9 +20,11 @@ */ /* + * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2020 Joyent, Inc. + * Copyright 2022 Joyent, Inc. + * Copyright 2022 MNX Cloud, Inc. 
*/ #include <sys/types.h> @@ -43,6 +45,7 @@ #include <sys/utsname.h> #include <sys/id_space.h> #include <sys/zone.h> +#include <sys/bootbanner.h> log_zone_t log_global; queue_t *log_consq; @@ -182,6 +185,14 @@ log_zonefree(zoneid_t zoneid, void *arg) kmem_free(lzp, sizeof (log_zone_t)); } +static void +log_bootbanner_print(const char *line, uint_t num) +{ + const char *pfx = (num == 0) ? "\r" : ""; + + printf("%s%s\n", pfx, line); +} + void log_init(void) { @@ -189,7 +200,7 @@ log_init(void) /* * Create a backlog queue to consume console messages during periods - * when there is no console reader (e.g. before syslogd(1M) starts). + * when there is no console reader (e.g. before syslogd(8) starts). */ log_backlogq = log_consq = log_makeq(0, LOG_HIWAT, NULL); @@ -207,7 +218,7 @@ log_init(void) log_intrq = log_makeq(0, LOG_HIWAT, (void *)ipltospl(SPL8)); /* - * Create a queue to hold the most recent 8K of console messages. + * Create a queue to hold the most recent 64K of console messages. * Useful for debugging. Required by the "$<msgbuf" adb macro. */ log_recentq = log_makeq(0, LOG_RECENTSIZE, NULL); @@ -246,11 +257,19 @@ log_init(void) log_update(&log_backlog, log_backlogq, SL_CONSOLE, log_console); /* - * Now that logging is enabled, emit the SunOS banner. + * Now that logging is enabled, emit the boot banner. */ +#ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright 2010-2020 Joyent, Inc.\n"); + /* + * Note: In the future this should be 2022-20XX, and delete this + * comment when we don't need it anymore + */ + printf("Copyright 2022 MNX Cloud, Inc.\n"); +#else + bootbanner_print(log_bootbanner_print, KM_SLEEP); +#endif #ifdef DEBUG printf("DEBUG enabled\n"); #endif @@ -667,7 +686,7 @@ log_sendmsg(mblk_t *mp, zoneid_t zoneid) if (lp->log_q == log_consq) { console_printf(log_overflow_msg, lp->log_minor, - " -- is syslogd(1M) running?"); + " -- is syslogd(8) running?"); } else { printf(log_overflow_msg, lp->log_minor, ""); diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 6961a2ff4f..c57f8a7d2c 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -565,7 +565,7 @@ main(void) /* * Set the scan rate and other parameters of the paging subsystem. */ - setupclock(0); + setupclock(); /* * Initialize process 0's lwp directory and lwpid hash table. diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 285b76347b..fd74dd3092 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -509,7 +509,7 @@ mapalloc: * Recalculate the paging parameters now total_pages has changed. * This will also cause the clock hands to be reset before next use. */ - setupclock(1); + setupclock(); memsegs_unlock(1); @@ -2700,7 +2700,7 @@ kphysm_del_cleanup(struct mem_handle *mhp) * Recalculate the paging parameters now total_pages has changed. * This will also cause the clock hands to be reset before next use. 
*/ - setupclock(1); + setupclock(); memsegs_unlock(1); diff --git a/usr/src/uts/common/os/memlist_new.c b/usr/src/uts/common/os/memlist_new.c index adef7cb015..eaa23ed24e 100644 --- a/usr/src/uts/common/os/memlist_new.c +++ b/usr/src/uts/common/os/memlist_new.c @@ -143,13 +143,17 @@ memlist_insert( } new->ml_next = NULL; new->ml_prev = last; - if (last != NULL) + if (last != NULL) { last->ml_next = new; + } else { + ASSERT3P(*curmemlistp, ==, NULL); + *curmemlistp = new; + } } void memlist_del(struct memlist *memlistp, - struct memlist **curmemlistp) + struct memlist **curmemlistp) { #ifdef DEBUG /* diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 0410e6f47b..d14a4ef005 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -213,8 +213,6 @@ struct mobj_stats { #define OVERLAPS_STACK(addr, p) \ ((p->p_model == DATAMODEL_LP64) && \ (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))) -#elif defined(__i386) -#define OVERLAPS_STACK(addr, p) 0 #endif /* lv_flags values - bitmap */ @@ -1010,8 +1008,8 @@ mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding, * fcred - credentials for the file associated with vp at open time. */ static int -mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, - off_t offset, int prot, cred_t *fcred) +mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, + volatile size_t zfodlen, off_t offset, int prot, cred_t *fcred) { int error = 0; caddr_t zfodbase, oldaddr; @@ -1060,8 +1058,8 @@ mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, * maxprot is passed as PROT_ALL so that mdb can * write to this segment. */ - if (error = VOP_MAP(vp, (offset_t)offset, as, &addr, - len, prot, PROT_ALL, mflag, fcred, NULL)) { + if ((error = VOP_MAP(vp, (offset_t)offset, as, &addr, + len, prot, PROT_ALL, mflag, fcred, NULL)) != 0) { return (error); } diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index d8782b320e..f141fb4bf0 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -2694,7 +2694,7 @@ modrload(const char *subdir, const char *filename, struct modctl **rmodp) CPU_STATS_ADDQ(CPU, sys, modload, 1); } -done: if (subdir != NULL) + if (subdir != NULL) kmem_free(fullname, size); return (rmodp ? retval : id); } diff --git a/usr/src/uts/common/os/modsubr.c b/usr/src/uts/common/os/modsubr.c index e980516b10..53c4195e48 100644 --- a/usr/src/uts/common/os/modsubr.c +++ b/usr/src/uts/common/os/modsubr.c @@ -74,8 +74,7 @@ static void hwc_unhash(struct hwc_spec *); int major_valid(major_t major) { - return (major != DDI_MAJOR_T_NONE && - (major >= 0 && major < devcnt)); + return (major != DDI_MAJOR_T_NONE && major < devcnt); } int diff --git a/usr/src/uts/common/os/ndifm.c b/usr/src/uts/common/os/ndifm.c index 16613a9203..54640971fd 100644 --- a/usr/src/uts/common/os/ndifm.c +++ b/usr/src/uts/common/os/ndifm.c @@ -669,7 +669,7 @@ ndi_fm_dma_err_set(ddi_dma_handle_t handle, ddi_fm_error_t *dfe) /* * Call parent busop fm initialization routine. * - * Called during driver attach(1M) + * Called during driver attach(9E) */ int i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc) @@ -696,7 +696,7 @@ i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc) /* * Call parent busop fm clean-up routine. 
* - * Called during driver detach(1M) + * Called during driver detach(9E) */ void i_ndi_busop_fm_fini(dev_info_t *dip) diff --git a/usr/src/uts/common/os/panic.c b/usr/src/uts/common/os/panic.c index 62be47e843..addb8b79cb 100644 --- a/usr/src/uts/common/os/panic.c +++ b/usr/src/uts/common/os/panic.c @@ -213,7 +213,7 @@ panicsys(const char *format, va_list alist, struct regs *rp, int on_panic_stack) cpu_t *cp = CPU; caddr_t intr_stack = NULL; - uint_t intr_actv; + volatile uint_t intr_actv; ushort_t schedflag = t->t_schedflag; cpu_t *bound_cpu = t->t_bound_cpu; diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 861c748cff..b3f01cfab2 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -22,6 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2022 Oxide Computer Company */ #include <sys/types.h> @@ -69,6 +70,19 @@ int priv_debug = 0; int priv_basic_test = -1; /* + * Unlinking or creating new hard links to directories was historically allowed + * in some file systems; e.g., UFS allows root users to do it, at the cost of + * almost certain file system corruption that will require fsck to fix. + * + * Most modern operating systems and file systems (e.g., ZFS) do not allow this + * behaviour anymore, and we have elected to stamp it out entirely for + * compatibility and safety reasons. An attempt to unlink a directory will + * fail with EPERM, as described in the standard. During this transition, one + * can turn the behaviour back on, at their own risk, with this tuneable: + */ +int priv_allow_linkdir = 0; + +/* * This file contains the majority of the policy routines. * Since the policy routines are defined by function and not * by privilege, there is quite a bit of duplication of @@ -896,6 +910,23 @@ secpolicy_fs_config(const cred_t *cr, const vfs_t *vfsp) int secpolicy_fs_linkdir(const cred_t *cr, const vfs_t *vfsp) { + if (priv_allow_linkdir == 0) { + /* + * By default, this policy check will now always return EPERM + * unless overridden. + * + * We do so without triggering auditing or allowing privilege + * debugging for two reasons: first, we intend eventually to + * deprecate the PRIV_SYS_LINKDIR privilege entirely and remove + * the use of this policy check from the file systems; second, + * for privilege debugging in particular, because it would be + * confusing to report an unlink() failure as the result of a + * missing privilege when in fact we are simply no longer + * allowing the operation at all. + */ + return (EPERM); + } + return (PRIV_POLICY(cr, PRIV_SYS_LINKDIR, B_FALSE, EPERM, NULL)); } @@ -1381,7 +1412,7 @@ secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) * this is required because vop_access function should lock the * node for reading. 
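
On the priv_allow_linkdir tuneable introduced in policy.c above: it is an ordinary kernel global, so an operator who still depends on the historical link/unlink-of-directories behaviour during the transition would typically flip it with an /etc/system entry along these lines (illustrative only; as the block comment warns, re-enabling it risks file system corruption, and a reboot is needed for /etc/system changes to take effect):

        set priv_allow_linkdir = 1
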
A three argument function should be defined * which accepts the following argument: - * A pointer to the internal "node" type (inode *) + * A pointer to the internal "node" type (inode *) * vnode access bits (VREAD|VWRITE|VEXEC) * a pointer to the credential * @@ -1453,8 +1484,8 @@ secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, struct vattr *vap, * * If you are the file owner: * chown to other uid FILE_CHOWN_SELF - * chown to gid (non-member) FILE_CHOWN_SELF - * chown to gid (member) <none> + * chown to gid (non-member) FILE_CHOWN_SELF + * chown to gid (member) <none> * * Instead of PRIV_FILE_CHOWN_SELF, FILE_CHOWN is also * acceptable but the first one is reported when debugging. @@ -2433,13 +2464,14 @@ secpolicy_gart_map(const cred_t *cr) } /* - * secpolicy_xhci + * secpolicy_hwmanip * - * Determine if the subject can observe and manipulate the xhci driver with a - * dangerous blunt hammer. Requires all privileges. + * Determine if the subject can observe and manipulate a hardware device with a + * dangerous blunt hammer, often suggests they can do something destructive. + * Requires all privileges. */ int -secpolicy_xhci(const cred_t *cr) +secpolicy_hwmanip(const cred_t *cr) { return (secpolicy_require_set(cr, PRIV_FULLSET, NULL, KLPDARG_NONE)); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index f9fe8649c0..57bd2241fd 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -1441,9 +1441,13 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) switch (idtype) { case P_PID: case P_TASKID: + default: + /* - * Can't bind processes or tasks - * in local zones to pools. + * Can't bind processes or tasks in local zones + * to pools. Also catch all remaining types of + * idtype_t that should already have been + * filtered out. */ mutex_exit(&p->p_lock); mutex_exit(&pidlock); @@ -1715,6 +1719,8 @@ out: switch (idtype) { zone->zone_pool_mod = gethrtime(); zone_rele(zone); break; + default: + break; } kmem_free(procs, procs_size * sizeof (proc_t *)); diff --git a/usr/src/uts/common/os/priv.c b/usr/src/uts/common/os/priv.c index ccde6e5af5..388ccd8918 100644 --- a/usr/src/uts/common/os/priv.c +++ b/usr/src/uts/common/os/priv.c @@ -182,8 +182,7 @@ priv_pr_spriv(proc_t *p, prpriv_t *prpriv, const cred_t *cr) if (prpriv->pr_nsets != PRIV_NSET || prpriv->pr_setsize != PRIV_SETSIZE || (prpriv->pr_infosize & (sizeof (uint32_t) - 1)) != 0 || - prpriv->pr_infosize > priv_info->priv_infosize || - prpriv->pr_infosize < 0) + prpriv->pr_infosize > priv_info->priv_infosize) return (EINVAL); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index 854fb602da..05979dd236 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -217,7 +217,7 @@ privilege PRIV_NET_BINDMLP Allow a process to bind to a port that is configured as a multi-level port(MLP) for the process's zone. This privilege applies to both shared address and zone-specific address MLPs. - See tnzonecfg(4) from the Trusted Extensions manual pages for + See tnzonecfg(5) from the Trusted Extensions manual pages for information on configuring MLP ports. This privilege is interpreted only if the system is configured with Trusted Extensions. @@ -507,7 +507,7 @@ privilege PRIV_SYS_TRANS_LABEL privilege PRIV_VIRT_MANAGE Allows a process to manage virtualized environments such as - xVM(5). + xVM(7). 
privilege PRIV_WIN_COLORMAP @@ -613,7 +613,7 @@ privilege PRIV_WIN_UPGRADE_SL privilege PRIV_XVM_CONTROL - Allows a process access to the xVM(5) control devices for + Allows a process access to the xVM(7) control devices for managing guest domains and the hypervisor. This privilege is used only if booted into xVM on x86 platforms. diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index e0a1126567..8f52f4ef3a 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -149,7 +149,7 @@ * The locking subsequence of interest is: p_lock, rctl_dict_lock, * rctl_lists_lock, entity->rcs_lock. * - * The projects(4) database and project entity resource controls + * The project(5) database and project entity resource controls * A special case is made for RCENTITY_PROJECT values set through the * setproject(3PROJECT) interface. setproject() makes use of a private * interface, setprojrctl(), which passes through an array of resource control @@ -170,7 +170,7 @@ * * rctl->rc_values - a linked list of rctl_val_t. These are the active * resource values associated with this rctl, and may have been set by - * setrctl() - via prctl(1M), or by setprojrctl() - via + * setrctl() - via prctl(1), or by setprojrctl() - via * setproject(3PROJECT). * * rctl->rc_projdb - a linked list of rctl_val_t. These reflect the @@ -1570,8 +1570,6 @@ rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval, int ret = 0; rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl); -local_op_retry: - ASSERT(MUTEX_HELD(&p->p_lock)); rset = rctl_entity_obtain_rset(rde, p); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 18b396a765..d500bf7468 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,7 +22,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2021 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ #include <sys/types.h> @@ -81,9 +82,9 @@ static size_t sc_bitmap_len; /* # of bits in allocation bitmap */ static size_t sc_bitmap_words; /* # of words in allocation bitmap */ /* Context ops */ -static void schedctl_save(sc_shared_t *); -static void schedctl_restore(sc_shared_t *); -static void schedctl_fork(kthread_t *, kthread_t *); +static void schedctl_save(void *); +static void schedctl_restore(void *); +static void schedctl_fork(void *, void *); /* Functions for handling shared pages */ static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *); @@ -92,6 +93,13 @@ static int schedctl_map(struct anon_map *, caddr_t *, caddr_t); static int schedctl_getpage(struct anon_map **, caddr_t *); static void schedctl_freepage(struct anon_map *, caddr_t); +static const struct ctxop_template schedctl_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = schedctl_save, + .ct_restore = schedctl_restore, + .ct_fork = schedctl_fork, +}; + /* * System call interface to scheduler activations. * This always operates on the current lwp. @@ -112,8 +120,7 @@ schedctl(void) return ((caddr_t)(uintptr_t)set_errno(error)); bzero(ssp, sizeof (*ssp)); - installctx(t, ssp, schedctl_save, schedctl_restore, - schedctl_fork, NULL, NULL, NULL); + ctxop_install(t, &schedctl_ctxop_tpl, ssp); thread_lock(t); /* protect against ts_tick and ts_update */ t->t_schedctl = ssp; @@ -151,8 +158,7 @@ schedctl_lwp_cleanup(kthread_t *t) * Remove the context op to avoid the final call to * schedctl_save when switching away from this lwp. 
*/ - (void) removectx(t, ssp, schedctl_save, schedctl_restore, - schedctl_fork, NULL, NULL, NULL); + (void) ctxop_remove(t, &schedctl_ctxop_tpl, ssp); /* * Do not unmap the shared page until the process exits. @@ -207,8 +213,10 @@ schedctl_proc_cleanup(void) * Save new thread state. */ static void -schedctl_save(sc_shared_t *ssp) +schedctl_save(void *arg) { + sc_shared_t *ssp = arg; + ssp->sc_state = curthread->t_state; } @@ -218,8 +226,10 @@ schedctl_save(sc_shared_t *ssp) * Save new thread state and CPU. */ static void -schedctl_restore(sc_shared_t *ssp) +schedctl_restore(void *arg) { + sc_shared_t *ssp = arg; + ssp->sc_state = SC_ONPROC; ssp->sc_cpu = CPU->cpu_id; } @@ -230,8 +240,9 @@ schedctl_restore(sc_shared_t *ssp) * The child's threads must call schedctl() to get new shared mappings. */ static void -schedctl_fork(kthread_t *pt, kthread_t *ct) +schedctl_fork(void *parent, void *child) { + kthread_t *pt = parent, *ct = child; proc_t *pp = ttoproc(pt); proc_t *cp = ttoproc(ct); sc_page_ctl_t *pagep; diff --git a/usr/src/uts/common/os/share.c b/usr/src/uts/common/os/share.c index 55a7422868..6a06be2d9c 100644 --- a/usr/src/uts/common/os/share.c +++ b/usr/src/uts/common/os/share.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. */ #include <sys/types.h> @@ -125,6 +125,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shr->s_deny & F_RDDNY) || (shrl->shr->s_access & F_WRACC)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -135,6 +137,8 @@ add_share(struct vnode *vp, struct shrlock *shr) if (isreadonly(vp)) break; mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -147,6 +151,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shrl->shr->s_access == F_RDACC)) break; mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } @@ -171,6 +177,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shrl->shr->s_deny & F_RDDNY) || (shrl->shr->s_access & F_WRACC)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -183,6 +191,8 @@ add_share(struct vnode *vp, struct shrlock *shr) break; } mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -199,6 +209,8 @@ add_share(struct vnode *vp, struct shrlock *shr) if ((shr->s_access & shrl->shr->s_deny) || (shr->s_deny & shrl->shr->s_access)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } } @@ -609,8 +621,11 @@ nbl_share_conflict(vnode_t *vp, nbl_op_t op, caller_context_t *ct) break; #endif } - if (conflict) + if (conflict) { + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); break; + } } mutex_exit(&vp->v_lock); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index 74f1649a07..d0611eb9bb 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -348,7 +348,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size = P2ROUNDUP(size, share_size); align_hint = share_size; -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * For x86, we want to share as much of the page table tree * as possible. 
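
The DTRACE_PROBE1() calls added through add_share() and nbl_share_conflict() above surface share-reservation conflicts as statically defined tracing (SDT) probes, so they should be observable on a live system without any rebuild; something along the lines of

        dtrace -n 'sdt:::conflict_shrlock { trace(pid); }'

ought to fire once per rejected request. The module and function components of the probe name depend on where the call sites land, so the wildcarded form above is the safe one; arg0 carries the conflicting struct shrlock pointer.
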
We use a large align_hint at first, but @@ -366,7 +366,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) while (size >= ptes_per_table * (uint64_t)align_hint) align_hint *= ptes_per_table; } -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ #if defined(__sparcv9) if (addr == 0 && diff --git a/usr/src/uts/common/os/softint.c b/usr/src/uts/common/os/softint.c index ecdb038c79..8801340cf9 100644 --- a/usr/src/uts/common/os/softint.c +++ b/usr/src/uts/common/os/softint.c @@ -58,29 +58,29 @@ * * Starting state is IDLE. * - * softint() + * softint() * * * (c) - * ____________________________________________________ - * | ^ ^ - * v (a) | (b) | - * IDLE--------------------->PEND--------------------->DRAIN - * ^ | | - * | | | - * | | | - * | | | - * | | | - * | d d - * | | | - * | v v - * | PEND DRAIN - * | (e) & & - * |<-----------------------STEAL STEAL - * ^ | - * | | - * | (e) v - * |_________________________<__________________________| + * ____________________________________________________ + * | ^ ^ + * v (a) | (b) | + * IDLE--------------------->PEND--------------------->DRAIN + * ^ | | + * | | | + * | | | + * | | | + * | | | + * | d d + * | | | + * | v v + * | PEND DRAIN + * | (e) & & + * |<-----------------------STEAL STEAL + * ^ | + * | | + * | (e) v + * |_________________________<__________________________| * * * @@ -146,9 +146,9 @@ uint_t softcall_pokemax = 10; /* * This ensures that softcall entries don't get stuck for long. It's expressed - * in 10 milliseconds as 1 unit. When hires_tick is set or other clock frequency - * is used, softcall_init() ensures that it's still expressed as 1 = 10 milli - * seconds. + * in 10 milliseconds as 1 unit. Regardless of the value of hires_tick or + * clock frequency, softcall_init() ensures that it's still expressed as 1 = + * 10 milliseconds. */ unsigned int softcall_delay = 1; diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c index 3fd8275df0..37792b7254 100644 --- a/usr/src/uts/common/os/space.c +++ b/usr/src/uts/common/os/space.c @@ -23,6 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2020 Joyent, Inc. 
*/ /* @@ -93,8 +94,6 @@ int __lintzero; /* Alway zero for shutting up lint */ pfn_t physmax; pgcnt_t physinstalled; -struct var v; - #include <sys/systm.h> #include <sys/conf.h> #include <sys/kmem.h> @@ -142,53 +141,6 @@ char dhcifname[IFNAMSIZ]; ether_addr_t etherbroadcastaddr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -/* - * Data from timod that must be resident - */ - -/* - * state transition table for TI interface - */ -#include <sys/tihdr.h> - -#define nr 127 /* not reachable */ - -char ti_statetbl[TE_NOEVENTS][TS_NOSTATES] = { - /* STATES */ - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */ - - { 1, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 2, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 4, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, 0, 3, nr, 3, 3, nr, nr, 7, nr, nr, nr, 6, 7, 9, 10, 11}, - {nr, nr, 0, nr, nr, 6, nr, nr, nr, nr, nr, nr, 3, nr, 3, 3, 3}, - {nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, 3, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, 3, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, 7, nr, nr, nr}, - {nr, nr, nr, 5, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 8, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 12, 13, nr, 14, 15, 16, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 10, nr, 3, nr, nr, nr, nr, nr}, - {nr, nr, nr, 7, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 11, 3, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 3, nr, nr, 3, 3, 3, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, -}; - - #include <sys/tty.h> #include <sys/ptyvar.h> diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 975b2f3d2e..90a9ea6f0f 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -3629,7 +3629,7 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, /* * The I_STR facility provides a trap door for malicious - * code to send down bogus streamio(7I) ioctl commands to + * code to send down bogus streamio(4I) ioctl commands to * unsuspecting STREAMS modules and drivers which expect to * only get these messages from the stream head. 
* Explicitly prohibit any streamio ioctls which can be diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index ac1ee2d1ce..796f89dca2 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* @@ -28,6 +28,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -1901,36 +1902,9 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, */ error = strdoioctl(stp, &strioc, FNATIVE, K_TO_K | STR_NOERROR | STR_NOSIG, crp, rvalp); - if (error != 0) { - lbfree(linkp); - - if (!(passyncq->sq_flags & SQ_BLOCKED)) - blocksq(passyncq, SQ_BLOCKED, 0); - /* - * Restore the stream head queue and then remove - * the passq. Turn off STPLEX before we turn on - * the stream by removing the passq. - */ - rq->q_ptr = _WR(rq)->q_ptr = stpdown; - setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, - B_TRUE); - - mutex_enter(&stpdown->sd_lock); - stpdown->sd_flag &= ~STPLEX; - mutex_exit(&stpdown->sd_lock); - - link_rempassthru(passq); - - mutex_enter(&stpdown->sd_lock); - stpdown->sd_flag &= ~STRPLUMB; - /* Wakeup anyone waiting for STRPLUMB to clear. */ - cv_broadcast(&stpdown->sd_monitor); - mutex_exit(&stpdown->sd_lock); + if (error != 0) + goto cleanup; - mutex_exit(&muxifier); - netstack_rele(ss->ss_netstack); - return (error); - } mutex_enter(&fpdown->f_tlock); fpdown->f_count++; mutex_exit(&fpdown->f_tlock); @@ -1942,9 +1916,16 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, ASSERT((cmd == I_LINK) || (cmd == I_PLINK)); if (cmd == I_LINK) { - ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL); + error = ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL); } else { - ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST); + error = ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST); + } + + if (error != 0) { + mutex_enter(&fpdown->f_tlock); + fpdown->f_count--; + mutex_exit(&fpdown->f_tlock); + goto cleanup; } link_rempassthru(passq); @@ -1976,6 +1957,36 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, *rvalp = linkp->li_lblk.l_index; netstack_rele(ss->ss_netstack); return (0); + +cleanup: + lbfree(linkp); + + if (!(passyncq->sq_flags & SQ_BLOCKED)) + blocksq(passyncq, SQ_BLOCKED, 0); + /* + * Restore the stream head queue and then remove + * the passq. Turn off STPLEX before we turn on + * the stream by removing the passq. + */ + rq->q_ptr = _WR(rq)->q_ptr = stpdown; + setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, + B_TRUE); + + mutex_enter(&stpdown->sd_lock); + stpdown->sd_flag &= ~STPLEX; + mutex_exit(&stpdown->sd_lock); + + link_rempassthru(passq); + + mutex_enter(&stpdown->sd_lock); + stpdown->sd_flag &= ~STRPLUMB; + /* Wakeup anyone waiting for STRPLUMB to clear. 
*/ + cv_broadcast(&stpdown->sd_monitor); + mutex_exit(&stpdown->sd_lock); + + mutex_exit(&muxifier); + netstack_rele(ss->ss_netstack); + return (error); } int @@ -2232,9 +2243,9 @@ munlink(stdata_t *stp, linkinfo_t *linkp, int flag, cred_t *crp, int *rvalp, /* clean up the layered driver linkages */ if ((flag & LINKTYPEMASK) == LINKNORMAL) { - ldi_munlink_fp(stp, fpdown, LINKNORMAL); + VERIFY0(ldi_munlink_fp(stp, fpdown, LINKNORMAL)); } else { - ldi_munlink_fp(stp, fpdown, LINKPERSIST); + VERIFY0(ldi_munlink_fp(stp, fpdown, LINKPERSIST)); } link_rempassthru(passq); @@ -3006,7 +3017,7 @@ strwaitbuf(size_t size, int pri) * GETWAIT Check for read side errors, no M_READ * WRITEWAIT Check for write side errors. * NOINTR Do not return error if nonblocking or timeout. - * STR_NOERROR Ignore all errors except STPLEX. + * STR_NOERROR Ignore all errors except STPLEX. * STR_NOSIG Ignore/hold signals during the duration of the call. * STR_PEEK Pass through the strgeterr(). */ @@ -6630,9 +6641,9 @@ drain_syncq(syncq_t *sq) * * qdrain_syncq can be called (currently) from only one of two places: * drain_syncq - * putnext (or some variation of it). + * putnext (or some variation of it). * and eventually - * qwait(_sig) + * qwait(_sig) * * If called from drain_syncq, we found it in the list of queues needing * service, so there is work to be done (or it wouldn't be in the list). @@ -6652,8 +6663,8 @@ drain_syncq(syncq_t *sq) * * ASSUMES: * One claim - * QLOCK held - * SQLOCK not held + * QLOCK held + * SQLOCK not held * Will release QLOCK before returning */ void @@ -7107,11 +7118,11 @@ static int propagate_syncq(queue_t *qp) { mblk_t *bp, *head, *tail, *prev, *next; - syncq_t *sq; + syncq_t *sq; queue_t *nqp; syncq_t *nsq; boolean_t isdriver; - int moved = 0; + int moved = 0; uint16_t flags; pri_t priority = curthread->t_pri; #ifdef DEBUG @@ -7144,7 +7155,7 @@ propagate_syncq(queue_t *qp) /* debug macro */ SQ_PUTLOCKS_HELD(nsq); #ifdef DEBUG - func = (void (*)())nqp->q_qinfo->qi_putp; + func = (void (*)())(uintptr_t)nqp->q_qinfo->qi_putp; #endif } diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 0dde96307b..ac48bf31b7 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -250,7 +250,7 @@ ddi_unmap_regs(dev_info_t *dip, uint_t rnumber, caddr_t *kaddrp, off_t offset, int ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, - off_t offset, off_t len, caddr_t *vaddrp) + off_t offset, off_t len, caddr_t *vaddrp) { return (i_ddi_bus_map(dip, rdip, mp, offset, len, vaddrp)); } @@ -265,7 +265,7 @@ ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, */ int nullbusmap(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, - off_t offset, off_t len, caddr_t *vaddrp) + off_t offset, off_t len, caddr_t *vaddrp) { _NOTE(ARGUNUSED(rdip)) if (mp->map_type == DDI_MT_RNUMBER) @@ -433,45 +433,6 @@ ddi_peek64(dev_info_t *dip, int64_t *addr, int64_t *val_p) val_p)); } - -/* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using ddi_peek8 and built against S10 - * or earlier will actually have a reference to ddi_peekc in the binary. 
- */ -#ifdef _ILP32 -int -ddi_peekc(dev_info_t *dip, int8_t *addr, int8_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peeks(dev_info_t *dip, int16_t *addr, int16_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peekl(dev_info_t *dip, int32_t *addr, int32_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peekd(dev_info_t *dip, int64_t *addr, int64_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} -#endif /* _ILP32 */ - int ddi_poke8(dev_info_t *dip, int8_t *addr, int8_t val) { @@ -497,40 +458,6 @@ ddi_poke64(dev_info_t *dip, int64_t *addr, int64_t val) } /* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using ddi_poke8 and built against S10 - * or earlier will actually have a reference to ddi_pokec in the binary. - */ -#ifdef _ILP32 -int -ddi_pokec(dev_info_t *dip, int8_t *addr, int8_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_pokes(dev_info_t *dip, int16_t *addr, int16_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_pokel(dev_info_t *dip, int32_t *addr, int32_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_poked(dev_info_t *dip, int64_t *addr, int64_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} -#endif /* _ILP32 */ - -/* * ddi_peekpokeio() is used primarily by the mem drivers for moving * data to and from uio structures via peek and poke. Note that we * use "internal" routines ddi_peek and ddi_poke to make this go @@ -2886,7 +2813,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data) */ ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos + sizeof (int64_t); - return (DDI_PROP_RESULT_OK); + return (DDI_PROP_RESULT_OK); case DDI_PROP_CMD_ENCODE: /* @@ -2934,7 +2861,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data) */ ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos + sizeof (int64_t); - return (DDI_PROP_RESULT_OK); + return (DDI_PROP_RESULT_OK); case DDI_PROP_CMD_GET_ESIZE: /* @@ -3115,7 +3042,7 @@ ddi_prop_1275_string(prop_handle_t *ph, uint_t cmd, char *data) */ int ddi_prop_1275_bytes(prop_handle_t *ph, uint_t cmd, uchar_t *data, - uint_t nelements) + uint_t nelements) { switch (cmd) { case DDI_PROP_CMD_DECODE: @@ -4922,7 +4849,7 @@ impl_ddi_callback_init(void) static void callback_insert(int (*funcp)(caddr_t), caddr_t arg, uintptr_t *listid, - int count) + int count) { struct ddi_callback *list, *marker, *new; size_t size = sizeof (struct ddi_callback); @@ -5614,7 +5541,7 @@ fail: * devfs event subclass names as device class names. */ static int -derive_devi_class(dev_info_t *dip, char *node_type, int flag) +derive_devi_class(dev_info_t *dip, const char *node_type, int flag) { int rv = DDI_SUCCESS; @@ -5659,10 +5586,10 @@ derive_devi_class(dev_info_t *dip, char *node_type, int flag) * exceed IFNAMSIZ (16) characters in length. 
*/ static boolean_t -verify_name(char *name) +verify_name(const char *name) { - size_t len = strlen(name); - char *cp; + size_t len = strlen(name); + const char *cp; if (len == 0 || len > IFNAMSIZ) return (B_FALSE); @@ -5680,9 +5607,9 @@ verify_name(char *name) * attach it to the given devinfo node. */ -int -ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag, ddi_minor_type mtype, +static int +ddi_create_minor_common(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag, ddi_minor_type mtype, const char *read_priv, const char *write_priv, mode_t priv_mode) { struct ddi_minor_data *dmdp; @@ -5793,7 +5720,7 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, */ if (!(DEVI_IS_ATTACHING(dip) || DEVI_IS_DETACHING(dip)) && mtype != DDM_INTERNAL_PATH) { - (void) i_log_devfs_minor_create(dip, name); + (void) i_log_devfs_minor_create(dip, dmdp->ddm_name); } /* @@ -5804,16 +5731,16 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, } int -ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag) +ddi_create_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, node_type, flag, DDM_MINOR, NULL, NULL, 0)); } int -ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag, +ddi_create_priv_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag, const char *rdpriv, const char *wrpriv, mode_t priv_mode) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, @@ -5821,8 +5748,8 @@ ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type, } int -ddi_create_default_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag) +ddi_create_default_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, node_type, flag, DDM_DEFAULT, NULL, NULL, 0)); @@ -5842,7 +5769,7 @@ ddi_create_internal_pathname(dev_info_t *dip, char *name, int spec_type, } void -ddi_remove_minor_node(dev_info_t *dip, char *name) +ddi_remove_minor_node(dev_info_t *dip, const char *name) { int circ; struct ddi_minor_data *dmdp, *dmdp1; @@ -6956,7 +6883,7 @@ ddi_set_console_bell(void (*bellfunc)(clock_t duration)) int ddi_dma_alloc_handle(dev_info_t *dip, ddi_dma_attr_t *attr, - int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep) + int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep) { int (*funcp)() = ddi_dma_allochdl; ddi_dma_attr_t dma_attr; @@ -6986,9 +6913,9 @@ static uintptr_t dma_mem_list_id = 0; int ddi_dma_mem_alloc(ddi_dma_handle_t handle, size_t length, - ddi_device_acc_attr_t *accattrp, uint_t flags, - int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp, - size_t *real_length, ddi_acc_handle_t *handlep) + ddi_device_acc_attr_t *accattrp, uint_t flags, + int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp, + size_t *real_length, ddi_acc_handle_t *handlep) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip = hp->dmai_rdip; @@ -7079,8 +7006,8 @@ ddi_dma_mem_free(ddi_acc_handle_t *handlep) int ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp, - 
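
With ddi_create_minor_node() and its relatives now taking const char * for both the minor name and the node type, callers that keep these strings in const storage no longer need to cast away const. A minimal attach(9E) sketch of the common usage (hypothetical driver and minor name, shown only to illustrate the constified prototypes):

        #include <sys/types.h>
        #include <sys/stat.h>
        #include <sys/ddi.h>
        #include <sys/sunddi.h>

        static int
        example_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
        {
                if (cmd != DDI_ATTACH)
                        return (DDI_FAILURE);

                /*
                 * Both the minor name and the DDI_PSEUDO node type are
                 * passed from const storage; no cast is required with the
                 * constified signatures.
                 */
                if (ddi_create_minor_node(dip, "example", S_IFCHR,
                    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) {
                        ddi_remove_minor_node(dip, NULL);
                        return (DDI_FAILURE);
                }

                return (DDI_SUCCESS);
        }
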
uint_t flags, int (*waitfp)(caddr_t), caddr_t arg, - ddi_dma_cookie_t *cookiep, uint_t *ccountp) + uint_t flags, int (*waitfp)(caddr_t), caddr_t arg, + ddi_dma_cookie_t *cookiep, uint_t *ccountp) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip, *rdip; @@ -7143,8 +7070,8 @@ ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp, int ddi_dma_addr_bind_handle(ddi_dma_handle_t handle, struct as *as, - caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t), - caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp) + caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t), + caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip, *rdip; @@ -7282,7 +7209,7 @@ ddi_dma_numwin(ddi_dma_handle_t handle, uint_t *nwinp) int ddi_dma_getwin(ddi_dma_handle_t h, uint_t win, off_t *offp, - size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp) + size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { int (*funcp)() = ddi_dma_win; struct bus_ops *bop; @@ -7358,8 +7285,8 @@ i_ddi_dma_clr_fault(ddi_dma_handle_t handle) */ int ddi_regs_map_setup(dev_info_t *dip, uint_t rnumber, caddr_t *addrp, - offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp, - ddi_acc_handle_t *handle) + offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp, + ddi_acc_handle_t *handle) { ddi_map_req_t mr; ddi_acc_hdl_t *hp; @@ -7433,7 +7360,7 @@ ddi_regs_map_free(ddi_acc_handle_t *handlep) int ddi_device_zero(ddi_acc_handle_t handle, caddr_t dev_addr, size_t bytecount, - ssize_t dev_advcnt, uint_t dev_datasz) + ssize_t dev_advcnt, uint_t dev_datasz) { uint8_t *b; uint16_t *w; @@ -7627,7 +7554,7 @@ i_ddi_devtspectype_to_minorname(dev_info_t *dip, dev_t dev, int spec_type) */ int i_ddi_minorname_to_devtspectype(dev_info_t *dip, char *minor_name, - dev_t *devtp, int *spectypep) + dev_t *devtp, int *spectypep) { int circ; struct ddi_minor_data *dmdp; @@ -8366,8 +8293,8 @@ umem_decr_devlockmem(struct ddi_umem_cookie *cookie) */ int umem_lockmemory(caddr_t addr, size_t len, int flags, ddi_umem_cookie_t *cookie, - struct umem_callback_ops *ops_vector, - proc_t *procp) + struct umem_callback_ops *ops_vector, + proc_t *procp) { int error; struct ddi_umem_cookie *p; @@ -8838,8 +8765,8 @@ ddi_umem_unlock(ddi_umem_cookie_t cookie) */ struct buf * ddi_umem_iosetup(ddi_umem_cookie_t cookie, off_t off, size_t len, - int direction, dev_t dev, daddr_t blkno, - int (*iodone)(struct buf *), int sleepflag) + int direction, dev_t dev, daddr_t blkno, + int (*iodone)(struct buf *), int sleepflag) { struct ddi_umem_cookie *p = (struct ddi_umem_cookie *)cookie; struct buf *bp; @@ -8919,7 +8846,7 @@ ddi_get_devstate(dev_info_t *dip) void ddi_dev_report_fault(dev_info_t *dip, ddi_fault_impact_t impact, - ddi_fault_location_t location, const char *message) + ddi_fault_location_t location, const char *message) { struct ddi_fault_event_data fd; ddi_eventcookie_t ec; @@ -8950,7 +8877,7 @@ i_ddi_devi_class(dev_info_t *dip) } int -i_ddi_set_devi_class(dev_info_t *dip, char *devi_class, int flag) +i_ddi_set_devi_class(dev_info_t *dip, const char *devi_class, int flag) { struct dev_info *devi = DEVI(dip); @@ -9912,7 +9839,7 @@ e_ddi_branch_unconfigure( /* The dip still exists, so do a hold */ e_ddi_branch_hold(rdip); } -out: + kmem_free(devnm, MAXNAMELEN + 1); ndi_devi_exit(pdip, circ); return (ndi2errno(rv)); diff --git a/usr/src/uts/common/os/sunmdi.c b/usr/src/uts/common/os/sunmdi.c index 0cdfd30392..6d1e10e0a4 100644 --- 
a/usr/src/uts/common/os/sunmdi.c +++ b/usr/src/uts/common/os/sunmdi.c @@ -3597,6 +3597,16 @@ i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag) MDI_PI_LOCK(pip); MDI_PI_SET_OFFLINING(pip); break; + + case MDI_PATHINFO_STATE_INIT: + /* + * Callers are not allowed to ask us to change the state to the + * initial state. + */ + rv = MDI_FAILURE; + MDI_PI_UNLOCK(pip); + goto state_change_exit; + } MDI_PI_UNLOCK(pip); MDI_CLIENT_UNSTABLE(ct); @@ -5722,6 +5732,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error) break; case DDI_RESUME: + case DDI_PM_RESUME: MDI_DEBUG(2, (MDI_NOTE, dip, "pHCI post_resume: called %p", (void *)ph)); if (error == DDI_SUCCESS) { @@ -5769,6 +5780,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error) break; case DDI_RESUME: + case DDI_PM_RESUME: MDI_DEBUG(2, (MDI_NOTE, dip, "client post_attach: called %p", (void *)ct)); if (error == DDI_SUCCESS) { @@ -6011,12 +6023,15 @@ i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error) break; case DDI_SUSPEND: + case DDI_PM_SUSPEND: MDI_DEBUG(2, (MDI_NOTE, dip, "pHCI post_suspend: called %p", (void *)ph)); if (error != DDI_SUCCESS) MDI_PHCI_SET_RESUME(ph); break; + case DDI_HOTPLUG_DETACH: + break; } MDI_PHCI_UNLOCK(ph); } @@ -6054,11 +6069,14 @@ i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error) break; case DDI_SUSPEND: + case DDI_PM_SUSPEND: MDI_DEBUG(2, (MDI_NOTE, dip, "called %p", (void *)ct)); if (error != DDI_SUCCESS) MDI_CLIENT_SET_RESUME(ct); break; + case DDI_HOTPLUG_DETACH: + break; } MDI_CLIENT_UNLOCK(ct); } @@ -6820,6 +6838,10 @@ mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op, i_mdi_pm_rele_client(ct, ct->ct_path_count); } break; + default: + dev_err(parent, CE_WARN, "!unhandled bus power operation: 0x%x", + op); + break; } MDI_CLIENT_UNLOCK(ct); diff --git a/usr/src/uts/common/os/sunpci.c b/usr/src/uts/common/os/sunpci.c index 209b269838..b1098b4fcc 100644 --- a/usr/src/uts/common/os/sunpci.c +++ b/usr/src/uts/common/os/sunpci.c @@ -145,104 +145,6 @@ pci_config_put64(ddi_acc_handle_t handle, off_t offset, uint64_t value) ddi_put64(handle, (uint64_t *)cfgaddr, value); } -/* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using pci_config_get8 and built against S10 - * or earlier will actually have a reference to pci_config_getb in the binary. 
- */ -#ifdef _ILP32 -uint8_t -pci_config_getb(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get8(handle, (uint8_t *)cfgaddr)); -} - -uint16_t -pci_config_getw(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get16(handle, (uint16_t *)cfgaddr)); -} - -uint32_t -pci_config_getl(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get32(handle, (uint32_t *)cfgaddr)); -} - -uint64_t -pci_config_getll(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get64(handle, (uint64_t *)cfgaddr)); -} - -void -pci_config_putb(ddi_acc_handle_t handle, off_t offset, uint8_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put8(handle, (uint8_t *)cfgaddr, value); -} - -void -pci_config_putw(ddi_acc_handle_t handle, off_t offset, uint16_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put16(handle, (uint16_t *)cfgaddr, value); -} - -void -pci_config_putl(ddi_acc_handle_t handle, off_t offset, uint32_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put32(handle, (uint32_t *)cfgaddr, value); -} - -void -pci_config_putll(ddi_acc_handle_t handle, off_t offset, uint64_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put64(handle, (uint64_t *)cfgaddr, value); -} -#endif /* _ILP32 */ - /*ARGSUSED*/ int pci_report_pmcap(dev_info_t *dip, int cap, void *arg) @@ -926,7 +828,7 @@ restoreconfig_err: /*ARGSUSED*/ static int pci_lookup_pmcap(dev_info_t *dip, ddi_acc_handle_t conf_hdl, - uint16_t *pmcap_offsetp) + uint16_t *pmcap_offsetp) { uint8_t cap_ptr; uint8_t cap_id; diff --git a/usr/src/uts/common/os/sunpm.c b/usr/src/uts/common/os/sunpm.c index 3ce7cc530d..7518c45cea 100644 --- a/usr/src/uts/common/os/sunpm.c +++ b/usr/src/uts/common/os/sunpm.c @@ -61,8 +61,8 @@ * tells what each component's power state values are, and provides human * readable strings (currently unused) for each component name and power state. * Devices which export pm-components(9P) are automatically power managed - * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(1M) - * after parsing power.conf(4)). The exception to this rule is that power + * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(8) + * after parsing power.conf(5)). The exception to this rule is that power * manageable CPU devices may be automatically managed independently of autopm * by either enabling or disabling (via PM_START_CPUPM and PM_STOP_CPUPM * ioctls) cpupm. If the CPU devices are not managed independently, then they @@ -72,13 +72,13 @@ * hardware state. * * Each device component also has a threshold time associated with each power - * transition (see power.conf(4)), and a busy/idle state maintained by the + * transition (see power.conf(5)), and a busy/idle state maintained by the * driver calling pm_idle_component(9F) and pm_busy_component(9F). * Components are created idle. 
* * The PM framework provides several functions: - * -implement PM policy as described in power.conf(4) - * Policy is set by pmconfig(1M) issuing pm ioctls based on power.conf(4). + * -implement PM policy as described in power.conf(5) + * Policy is set by pmconfig(8) issuing pm ioctls based on power.conf(5). * Policies consist of: * -set threshold values (defaults if none provided by pmconfig) * -set dependencies among devices @@ -122,7 +122,7 @@ * cdrom is always up whenever the console framebuffer is up, so that the user * can insert a cdrom and see a popup as a result. * - * The dependency terminology used in power.conf(4) is not easy to understand, + * The dependency terminology used in power.conf(5) is not easy to understand, * so we've adopted a different terminology in the implementation. We write * of a "keeps up" and a "kept up" device. A relationship can be established * where one device keeps up another. That means that if the keepsup device @@ -384,7 +384,7 @@ int cpr_platform_enable = 0; /* * pm_S3_enabled indicates that we believe the platform can support S3, - * which we get from pmconfig(1M) + * which we get from pmconfig(8) */ int pm_S3_enabled; @@ -1616,7 +1616,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level, (PM_CP(dip, comp)->pmc_flags & PM_PHC_WHILE_SET_POWER)); - resume_needed = suspended; + resume_needed = suspended; } } else { if (POWERING_OFF(old_level, level)) { @@ -1629,7 +1629,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level, (PM_CP(dip, comp)->pmc_flags & PM_PHC_WHILE_SET_POWER)); - resume_needed = suspended; + resume_needed = suspended; } } } @@ -2076,13 +2076,12 @@ e_pm_hold_rele_power(dev_info_t *dip, int cnt) return; PM_LOCK_POWER(dip, &circ); - ASSERT(cnt >= 0 && PM_KUC(dip) >= 0 || cnt < 0 && PM_KUC(dip) > 0); + ASSERT(cnt >= 0 || (cnt < 0 && PM_KUC(dip) > 0)); PMD(PMD_KIDSUP, ("%s: kidsupcnt for %s@%s(%s#%d) %d->%d\n", pmf, PM_DEVICE(dip), PM_KUC(dip), (PM_KUC(dip) + cnt))) PM_KUC(dip) += cnt; - ASSERT(PM_KUC(dip) >= 0); PM_UNLOCK_POWER(dip, circ); if (cnt < 0 && PM_KUC(dip) == 0) @@ -7647,7 +7646,7 @@ pm_cfb_setup(const char *stdout_path) */ } else { cmn_err(CE_WARN, "Kernel debugger present: see " - "kmdb(1M) for interaction with power management."); + "kmdb(1) for interaction with power management."); } } #ifdef DEBUG diff --git a/usr/src/uts/common/os/swapgeneric.c b/usr/src/uts/common/os/swapgeneric.c index 77167149fe..ce64aff89a 100644 --- a/usr/src/uts/common/os/swapgeneric.c +++ b/usr/src/uts/common/os/swapgeneric.c @@ -878,7 +878,7 @@ load_bootpath_drivers(char *bootpath) #endif dip = path_to_devinfo(pathcopy); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * i386 does not provide stub nodes for all boot devices, * but we should be able to find the node for the parent, @@ -910,7 +910,7 @@ load_bootpath_drivers(char *bootpath) rval = load_boot_driver(leaf, NULL); if (rval == -1) { kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } } } @@ -920,7 +920,7 @@ load_bootpath_drivers(char *bootpath) cmn_err(CE_WARN, "can't bind driver for boot path <%s>", bootpath); kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } /* @@ -936,7 +936,7 @@ load_bootpath_drivers(char *bootpath) modloadonly("drv", "ibp") == -1) { cmn_err(CE_CONT, "ibp: cannot load platform driver\n"); kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } /* diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index fb64000e4d..dca168b642 100644 --- 
a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -25,6 +25,7 @@ * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -88,9 +89,9 @@ int getloadavg(int *, int); int rusagesys(int, void *, void *, void *, void *); int getpagesizes(int, size_t *, int); int gtty(int, intptr_t); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) int hrtsys(struct hrtsysa *, rval_t *); -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ int ioctl(int, int, intptr_t); int kill(); int labelsys(int, void *, void *, void *, void *, void *); @@ -184,7 +185,7 @@ int statvfs(char *, struct statvfs *); int fstatvfs(int, struct statvfs *); offset_t llseek32(int32_t, uint32_t, uint32_t, int); -#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) int sysi86(short, uintptr_t, uintptr_t, uintptr_t); #endif @@ -330,6 +331,7 @@ int setsockopt(int, int, int, void *, socklen_t *, int); int sockconfig(int, void *, void *, void *, void *); ssize_t sendfilev(int, int, const struct sendfilevec *, int, size_t *); ssize_t getrandom(void *, size_t, unsigned int); +void upanic(void *, size_t); typedef int64_t (*llfcn_t)(); /* for casting one-word returns */ @@ -390,19 +392,15 @@ typedef int64_t (*llfcn_t)(); /* for casting one-word returns */ #define IF_sparc(true, false) false #endif -#if defined(__i386) && !defined(__amd64) -#define IF_i386(true, false) true -#else #define IF_i386(true, false) false -#endif -#if defined(__i386) || defined(__amd64) +#if defined(__x86) #define IF_x86(true, false) true #else #define IF_x86(true, false) false #endif -#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) #define IF_386_ABI(true, false) true #else #define IF_386_ABI(true, false) false @@ -583,7 +581,7 @@ struct sysent sysent[NSYSCALL] = /* 122 */ SYSENT_CL("writev", writev, 3), /* 123 */ SYSENT_CL("preadv", preadv, 5), /* 124 */ SYSENT_CL("pwritev", pwritev, 5), - /* 125 */ SYSENT_LOADABLE(), /* (was fxstat) */ + /* 125 */ SYSENT_CI("upanic", upanic, 2), /* 126 */ SYSENT_CL("getrandom", getrandom, 3), /* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5), /* 128 */ IF_LP64( @@ -948,7 +946,7 @@ struct sysent sysent32[NSYSCALL] = /* 122 */ SYSENT_CI("writev", writev32, 3), /* 123 */ SYSENT_CI("preadv", preadv, 5), /* 124 */ SYSENT_CI("pwritev", pwritev, 5), - /* 125 */ SYSENT_LOADABLE32(), /* was fxstat32 */ + /* 125 */ SYSENT_CI("upanic", upanic, 2), /* 126 */ SYSENT_CI("getrandom", getrandom, 3), /* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5), /* 128 */ SYSENT_CI("setrlimit", setrlimit32, 2), diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index c965db6737..f587430625 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ #include <sys/timer.h> @@ -179,7 +179,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) /* * timer_grab() and its companion routine, timer_release(), are wrappers - * around timer_lock()/_unlock() which allow the timer_*(3R) routines to + * around timer_lock()/_unlock() which allow the timer_*(3C) routines to * (a) share error handling code and (b) not grab p_lock themselves. Routines * which are called with p_lock held (e.g. 
timer_lwpbind(), timer_lwpexit()) * must call timer_lock()/_unlock() explictly. @@ -194,7 +194,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) * (a) The specified timer ID is out of range. * * (b) The specified timer ID does not correspond to a timer ID returned - * from timer_create(3R). + * from timer_create(3C). * * (c) The specified timer ID is currently being removed. * @@ -482,105 +482,106 @@ timer_fire(itimer_t *it) } /* - * Allocate an itimer_t and find and appropriate slot for it in p_itimer. - * Acquires p_lock and holds it on return, regardless of success. + * Find an unused (i.e. NULL) entry in p->p_itimer and set *id to the + * index of the unused entry, growing p->p_itimer as necessary (up to timer_max + * entries). Returns B_TRUE (with *id set) on success, B_FALSE on failure + * (e.g. the process already has the maximum number of allowed timers + * allocated). */ -static itimer_t * -timer_alloc(proc_t *p, timer_t *id) +static boolean_t +timer_get_id(proc_t *p, timer_t *id) { - itimer_t *it, **itp = NULL; + itimer_t **itp = NULL, **itp_new; + uint_t target_sz; uint_t i; - ASSERT(MUTEX_NOT_HELD(&p->p_lock)); - - it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); - bzero(it, sizeof (itimer_t)); - mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - - mutex_enter(&p->p_lock); -retry: - if (p->p_itimer != NULL) { - for (i = 0; i < p->p_itimer_sz; i++) { - if (p->p_itimer[i] == NULL) { - itp = &(p->p_itimer[i]); - break; - } - } - } - - /* - * A suitable slot was not found. If possible, allocate (or resize) - * the p_itimer array and try again. - */ - if (itp == NULL) { - uint_t target_sz = _TIMER_ALLOC_INIT; - itimer_t **itp_new; - - if (p->p_itimer != NULL) { - ASSERT(p->p_itimer_sz != 0); + ASSERT(MUTEX_HELD(&p->p_lock)); - target_sz = p->p_itimer_sz * 2; - } + if (p->p_itimer == NULL) { /* - * Protect against exceeding the max or overflow + * No timers have been allocated for this process, allocate + * the initial array. */ - if (target_sz > timer_max || target_sz > INT_MAX || - target_sz < p->p_itimer_sz) { - kmem_cache_free(clock_timer_cache, it); - return (NULL); - } + ASSERT0(p->p_itimer_sz); + target_sz = _TIMER_ALLOC_INIT; + mutex_exit(&p->p_lock); itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), KM_SLEEP); mutex_enter(&p->p_lock); - if (target_sz <= p->p_itimer_sz) { - /* - * A racing thread performed the resize while we were - * waiting outside p_lock. Discard our now-useless - * allocation and retry. - */ - kmem_free(itp_new, target_sz * sizeof (itimer_t *)); - goto retry; - } else { + + if (p->p_itimer == NULL) { /* - * Instantiate the larger allocation and select the - * first fresh entry for use. + * As long as no other thread beat us to allocating + * the initial p_itimer array, use what we allocated. + * Since we just allocated it, we know slot 0 is + * free. */ - if (p->p_itimer != NULL) { - uint_t old_sz; - - old_sz = p->p_itimer_sz; - bcopy(p->p_itimer, itp_new, - old_sz * sizeof (itimer_t *)); - kmem_free(p->p_itimer, - old_sz * sizeof (itimer_t *)); - - /* - * Short circuit to use the first free entry in - * the new allocation. It's possible that - * other lower-indexed timers were freed while - * p_lock was dropped, but skipping over them - * is not harmful at all. In the common case, - * we skip the need to walk over an array - * filled with timers before arriving at the - * slot we know is fresh from the allocation. 
- */ - i = old_sz; - } else { - /* - * For processes lacking any existing timers, - * we can simply select the first entry. - */ - i = 0; - } p->p_itimer = itp_new; p->p_itimer_sz = target_sz; + i = 0; + goto done; + } + + /* + * Another thread beat us to allocating the initial array. + * Proceed to searching for an empty slot and growing the + * array if needed. + */ + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + } + +retry: + /* Use the first empty slot (if any exist) */ + for (i = 0; i < p->p_itimer_sz; i++) { + if (p->p_itimer[i] == NULL) { + goto done; } } - ASSERT(i <= INT_MAX); + /* No empty slots, try to grow p->p_itimer and retry */ + target_sz = p->p_itimer_sz * 2; + if (target_sz > timer_max || target_sz > INT_MAX || + target_sz < p->p_itimer_sz) { + /* Protect against exceeding the max or overflow */ + return (B_FALSE); + } + + mutex_exit(&p->p_lock); + itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), KM_SLEEP); + mutex_enter(&p->p_lock); + + if (target_sz <= p->p_itimer_sz) { + /* + * A racing thread performed the resize while we were + * waiting outside p_lock. Discard our now-useless + * allocation and retry. + */ + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + goto retry; + } + + ASSERT3P(p->p_itimer, !=, NULL); + bcopy(p->p_itimer, itp_new, p->p_itimer_sz * sizeof (itimer_t *)); + kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *)); + + /* + * Short circuit to use the first free entry in the new allocation. + * It's possible that other lower-indexed timers were freed while + * p_lock was dropped, but skipping over them is not harmful at all. + * In the common case, we skip the need to walk over an array filled + * with timers before arriving at the slot we know is fresh from the + * allocation. + */ + i = p->p_itimer_sz; + + p->p_itimer = itp_new; + p->p_itimer_sz = target_sz; + +done: + ASSERT3U(i, <=, INT_MAX); *id = (timer_t)i; - return (it); + return (B_TRUE); } /* @@ -612,19 +613,20 @@ timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); /* - * Allocate a timer and choose a slot for it. This acquires p_lock. + * Allocate a timer and choose a slot for it. */ - it = timer_alloc(p, &tid); - ASSERT(MUTEX_HELD(&p->p_lock)); + it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); + bzero(it, sizeof (*it)); + mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - if (it == NULL) { + mutex_enter(&p->p_lock); + if (!timer_get_id(p, &tid)) { mutex_exit(&p->p_lock); kmem_free(sigq, sizeof (sigqueue_t)); - return (EAGAIN); + return (set_errno(EAGAIN)); } ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); - ASSERT(evp != NULL); /* * If we develop other notification mechanisms, this will need diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 53be806026..cb57b60758 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -1211,7 +1211,7 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp) hrtime_t ts2hrt(const timestruc_t *tsp) { -#if defined(__amd64) || defined(__i386) +#if defined(__x86) /* * On modern x86 CPUs, the simple version is faster. 
*/ @@ -1232,7 +1232,7 @@ ts2hrt(const timestruc_t *tsp) hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); -#endif /* defined(__amd64) || defined(__i386) */ +#endif /* defined(__x86) */ } /* diff --git a/usr/src/uts/common/os/upanic.c b/usr/src/uts/common/os/upanic.c new file mode 100644 index 0000000000..b4d23eeaff --- /dev/null +++ b/usr/src/uts/common/os/upanic.c @@ -0,0 +1,98 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/proc.h> +#include <c2/audit.h> +#include <sys/procfs.h> +#include <sys/core.h> + +/* + * This function is meant to be a guaranteed abort that generates a core file + * that allows up to 1k of data to enter into an elfnote in the process. This is + * meant to insure that even in the face of other problems, this can get out. + */ + +void +upanic(void *addr, size_t len) +{ + kthread_t *t = curthread; + proc_t *p = curproc; + klwp_t *lwp = ttolwp(t); + uint32_t auditing = AU_AUDITING(); + uint32_t upflag = P_UPF_PANICKED; + void *buf; + int code; + + /* + * Before we worry about the data that the user has as a message, go + * ahead and make sure we try and get all the other threads stopped. + * That'll help us make sure that nothing else is going on and we don't + * lose a race. + */ + mutex_enter(&p->p_lock); + lwp->lwp_cursig = SIGABRT; + mutex_exit(&p->p_lock); + + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + + /* + * Copy in the user data. We truncate it to PRUPANIC_BUFLEN no matter + * what and ensure that the last data was set to zero. + */ + if (addr != NULL && len > 0) { + size_t copylen; + + upflag |= P_UPF_HAVEMSG; + + if (len >= PRUPANIC_BUFLEN) { + copylen = PRUPANIC_BUFLEN; + upflag |= P_UPF_TRUNCMSG; + } else { + copylen = len; + } + + buf = kmem_zalloc(PRUPANIC_BUFLEN, KM_SLEEP); + if (copyin(addr, buf, copylen) != 0) { + upflag |= P_UPF_INVALMSG; + upflag &= ~P_UPF_HAVEMSG; + } else { + mutex_enter(&p->p_lock); + ASSERT3P(p->p_upanic, ==, NULL); + p->p_upanic = buf; + mutex_exit(&p->p_lock); + } + } + + mutex_enter(&p->p_lock); + p->p_upanicflag = upflag; + mutex_exit(&p->p_lock); + + /* + * If we're auditing we need to finish the system call itself and then + * begin the core dump. + */ + if (auditing) { + audit_finish(0, SYS_upanic, 0, NULL); + audit_core_start(SIGABRT); + } + code = core(SIGABRT, B_FALSE); + if (auditing) /* audit core dump */ + audit_core_finish(code ? CLD_KILLED : CLD_DUMPED); + exit(code ? CLD_KILLED : CLD_DUMPED, SIGABRT); +} diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index f5ee76a2cb..1df2f479a5 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -18,14 +18,20 @@ * * CDDL HEADER END */ + +/* + * Copyright 2021 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2018 Joyent, Inc. 
*/ -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -60,6 +66,7 @@ #include <sys/mem_cage.h> #include <sys/time.h> #include <sys/zone.h> +#include <sys/stdbool.h> #include <vm/hat.h> #include <vm/as.h> @@ -68,149 +75,275 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> -static int checkpage(page_t *, int); +/* + * FREE MEMORY MANAGEMENT + * + * Management of the pool of free pages is a tricky business. There are + * several critical threshold values which constrain our allocation of new + * pages and inform the rate of paging out of memory to swap. These threshold + * values, and the behaviour they induce, are described below in descending + * order of size -- and thus increasing order of severity! + * + * +---------------------------------------------------- physmem (all memory) + * | + * | Ordinarily there are no particular constraints placed on page + * v allocation. The page scanner is not running and page_create_va() + * | will effectively grant all page requests (whether from the kernel + * | or from user processes) without artificial delay. + * | + * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB) + * | + * | When we have less than "lotsfree" pages, pageout_scanner() is + * v signalled by schedpaging() to begin looking for pages that can + * | be evicted to disk to bring us back above lotsfree. At this + * | stage there is still no constraint on allocation of free pages. + * | + * | For small systems, we set a lower bound of 16MB for lotsfree; + * v this is the natural value for a system with 1GB memory. This is + * | to ensure that the pageout reserve pool contains at least 4MB + * | for use by ZFS. + * | + * | For systems with a large amount of memory, we constrain lotsfree + * | to be at most 2GB (with a pageout reserve of around 0.5GB), as + * v at some point the required slack relates more closely to the + * | rate at which paging can occur than to the total amount of memory. + * | + * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB) + * | + * | When we drop below desfree, a number of kernel facilities will + * v wait before allocating more memory, under the assumption that + * | pageout or reaping will make progress and free up some memory. + * | This behaviour is not especially coordinated; look for comparisons + * | of desfree and freemem. + * | + * | In addition to various attempts at advisory caution, clock() + * | will wake up the thread that is ordinarily parked in sched(). + * | This routine is responsible for the heavy-handed swapping out + * v of entire processes in an attempt to arrest the slide of free + * | memory. See comments in sched.c for more details. + * | + * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB) + * | + * | These two separate tunables have, by default, the same value. + * v Various parts of the kernel use minfree to signal the need for + * | more aggressive reclamation of memory, and sched() is more + * | aggressive at swapping processes out. + * | + * | If free memory falls below throttlefree, page_create_va() will + * | use page_create_throttle() to begin holding most requests for + * | new pages while pageout and reaping free up memory. Sleeping + * v allocations (e.g., KM_SLEEP) are held here while we wait for + * | more memory. 
Non-sleeping allocations are generally allowed to + * | proceed, unless their priority is explicitly lowered with + * | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).). + * | + * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB) + * | + * | When we hit throttlefree, the situation is already dire. The + * v system is generally paging out memory and swapping out entire + * | processes in order to free up memory for continued operation. + * | + * | Unfortunately, evicting memory to disk generally requires short + * | term use of additional memory; e.g., allocation of buffers for + * | storage drivers, updating maps of free and used blocks, etc. + * | As such, pageout_reserve is the number of pages that we keep in + * | special reserve for use by pageout() and sched() and by any + * v other parts of the kernel that need to be working for those to + * | make forward progress such as the ZFS I/O pipeline. + * | + * | When we are below pageout_reserve, we fail or hold any allocation + * | that has not explicitly requested access to the reserve pool. + * | Access to the reserve is generally granted via the KM_PUSHPAGE + * | flag, or by marking a thread T_PUSHPAGE such that all allocations + * | can implicitly tap the reserve. For more details, see the + * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE + * | and VM_PUSHPAGE allocation flags, and page_create_throttle(). + * | + * +---------------------------------------------------------- no free memory + * | + * | If we have arrived here, things are very bad indeed. It is + * v surprisingly difficult to tell if this condition is even fatal, + * | as enough memory may have been granted to pageout() and to the + * | ZFS I/O pipeline that requests for eviction that have already been + * | made will complete and free up memory some time soon. + * | + * | If free memory does not materialise, the system generally remains + * | deadlocked. The pageout_deadman() below is run once per second + * | from clock(), seeking to limit the amount of time a single request + * v to page out can be blocked before the system panics to get a crash + * | dump and return to service. + * | + * +------------------------------------------------------------------------- + */ /* * The following parameters control operation of the page replacement - * algorithm. They are initialized to 0, and then computed at boot time - * based on the size of the system. If they are patched non-zero in - * a loaded vmunix they are left alone and may thus be changed per system - * using mdb on the loaded system. + * algorithm. They are initialized to 0, and then computed at boot time based + * on the size of the system; see setupclock(). If they are patched non-zero + * in a loaded vmunix they are left alone and may thus be changed per system + * using "mdb -kw" on the loaded system. */ pgcnt_t slowscan = 0; pgcnt_t fastscan = 0; static pgcnt_t handspreadpages = 0; -static int loopfraction = 2; + +/* + * looppages: + * Cached copy of the total number of pages in the system (total_pages). + * + * loopfraction: + * Divisor used to relate fastscan to looppages in setupclock(). 
+ */ +static uint_t loopfraction = 2; static pgcnt_t looppages; -/* See comment below describing 4% and 80% */ -static int min_percent_cpu = 4; -static int max_percent_cpu = 80; + +static uint_t min_percent_cpu = 4; +static uint_t max_percent_cpu = 80; static pgcnt_t maxfastscan = 0; static pgcnt_t maxslowscan = 100; -pgcnt_t maxpgio = 0; -pgcnt_t minfree = 0; -pgcnt_t desfree = 0; -pgcnt_t lotsfree = 0; -pgcnt_t needfree = 0; -pgcnt_t throttlefree = 0; -pgcnt_t pageout_reserve = 0; +#define MEGABYTES (1024ULL * 1024ULL) + +/* + * pageout_threshold_style: + * set to 1 to use the previous default threshold size calculation; + * i.e., each threshold is half of the next largest value. + */ +uint_t pageout_threshold_style = 0; + +/* + * The operator may override these tunables to request a different minimum or + * maximum lotsfree value, or to change the divisor we use for automatic + * sizing. + * + * By default, we make lotsfree 1/64th of the total memory in the machine. The + * minimum and maximum are specified in bytes, rather than pages; a zero value + * means the default values (below) are used. + */ +uint_t lotsfree_fraction = 64; +pgcnt_t lotsfree_min = 0; +pgcnt_t lotsfree_max = 0; -pgcnt_t deficit; -pgcnt_t nscan; -pgcnt_t desscan; +#define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES) +#define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES) + +/* + * If these tunables are set to non-zero values in /etc/system, and provided + * the value is not larger than the threshold above, the specified value will + * be used directly without any additional calculation or adjustment. The boot + * time value of these overrides is preserved in the "clockinit" struct. More + * detail is available in the comment at the top of the file. + */ +pgcnt_t maxpgio = 0; +pgcnt_t minfree = 0; +pgcnt_t desfree = 0; +pgcnt_t lotsfree = 0; +pgcnt_t needfree = 0; +pgcnt_t throttlefree = 0; +pgcnt_t pageout_reserve = 0; +pri_t pageout_pri; + +pgcnt_t deficit; +pgcnt_t nscan; +pgcnt_t desscan; /* kstats */ uint64_t low_mem_scan; uint64_t zone_cap_scan; -uint64_t n_throttle; -clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */ +#define MAX_PSCAN_THREADS 16 /* - * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks - * are the number of ticks in each wakeup cycle that gives the - * equivalent of some underlying %CPU duty cycle. - * - * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() - * will run 4 times/sec to update pageout scanning parameters and kickoff - * the pageout_scanner() thread if necessary. + * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and + * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle + * that gives the equivalent of some underlying %CPU duty cycle. * - * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When - * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed - * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * min_pageout_nsec: + * nanoseconds/wakeup equivalent of min_percent_cpu. * - * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When - * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed - * by the scanner in a 1 second interval is 80% of a CPU - * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 - * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * max_pageout_nsec: + * nanoseconds/wakeup equivalent of max_percent_cpu. 
* - * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks - * will be 200, so the CPU percentages are the same as when hz is 100. - * - * min_pageout_ticks: - * ticks/wakeup equivalent of min_percent_cpu. - * - * max_pageout_ticks: - * ticks/wakeup equivalent of max_percent_cpu. - * - * pageout_ticks: - * Number of clock ticks budgeted for each wakeup cycle. + * pageout_nsec: + * Number of nanoseconds budgeted for each wakeup cycle. * Computed each time around by schedpaging(). - * Varies between min_pageout_ticks .. max_pageout_ticks, + * Varies between min_pageout_nsec and max_pageout_nsec, * depending on memory pressure or zones over their cap. + * + * zone_pageout_nsec: + * Number of nanoseconds budget for each cycle when a zone + * is over its memory cap. If this is zero, then the value + * of max_pageout_nsec is used instead. */ +static hrtime_t min_pageout_nsec; +static hrtime_t max_pageout_nsec; +static hrtime_t pageout_nsec; +static hrtime_t zone_pageout_nsec; -static clock_t min_pageout_ticks; -static clock_t max_pageout_ticks; -static clock_t pageout_ticks; +static boolean_t reset_hands[MAX_PSCAN_THREADS]; -#define MAX_PSCAN_THREADS 16 -static boolean_t reset_hands[MAX_PSCAN_THREADS]; +#define PAGES_POLL_MASK 1023 +#define SCHEDPAGING_HZ 4 /* - * These can be tuned in /etc/system or set with mdb. - * 'des_page_scanners' is the desired number of page scanner threads. The - * system will bring the actual number of threads into line with the desired - * number. If des_page_scanners is set to an invalid value, the system will - * correct the setting. + * despagescanners: + * The desired number of page scanner threads. The value can be set in + * /etc/system or tuned directly with 'mdb -kw'. The system will bring + * the actual number of threads into line with the desired number. If set + * to an invalid value, the system will correct the setting. */ -uint_t des_page_scanners; -uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */ - -uint_t n_page_scanners; -static pgcnt_t pscan_region_sz; /* informational only */ - - -#define PAGES_POLL_MASK 1023 +uint_t despagescanners = 0; /* * pageout_sample_lim: - * The limit on the number of samples needed to establish a value - * for new pageout parameters, fastscan, slowscan, and handspreadpages. + * The limit on the number of samples needed to establish a value for new + * pageout parameters: fastscan, slowscan, pageout_new_spread, and + * handspreadpages. * * pageout_sample_cnt: - * Current sample number. Once the sample gets large enough, - * set new values for handspreadpages, fastscan and slowscan. + * Current sample number. Once the sample gets large enough, set new + * values for handspreadpages, pageout_new_spread, fastscan and slowscan. * * pageout_sample_pages: * The accumulated number of pages scanned during sampling. * * pageout_sample_etime: - * The accumulated number of nanoseconds for the sample. + * The accumulated nanoseconds for the sample. * * pageout_rate: - * Rate in pages/second, computed at the end of sampling. + * Rate in pages/nanosecond, computed at the end of sampling. * * pageout_new_spread: - * The new value to use for maxfastscan and (perhaps) handspreadpages. - * Intended to be the number pages that can be scanned per sec using ~10% - * of a CPU. Calculated after enough samples have been taken. 
- * pageout_rate / 10 + * Initially zero while the system scan rate is measured by + * pageout_scanner(), which then sets this value once per system boot after + * enough samples have been recorded (pageout_sample_cnt). Once set, this + * new value is used for fastscan and handspreadpages. */ - typedef hrtime_t hrrate_t; -static uint_t pageout_sample_lim = 4; -static uint_t pageout_sample_cnt = 0; +static uint64_t pageout_sample_lim = 4; +static uint64_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; +static hrtime_t pageout_sample_etime = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static hrtime_t pageout_sample_etime = 0; - -/* True if page scanner is first starting up */ +/* True if the page scanner is first starting up */ #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) +/* The current number of page scanner threads */ +static uint_t n_page_scanners = 1; +/* The number of page scanner threads that are actively scanning. */ +static uint_t pageouts_running; + /* - * Record number of times a pageout_scanner wakeup cycle finished because it + * Record number of times a pageout_scanner() wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited * its budgeted number of pages. This is only done when scanning under low * free memory conditions, not when scanning for zones over their cap. */ -uint64_t pageout_timeouts = 0; +uint64_t pageout_timeouts = 0; #ifdef VM_STATS static struct pageoutvmstats_str { @@ -225,10 +358,57 @@ static struct pageoutvmstats_str { kmutex_t memavail_lock; kcondvar_t memavail_cv; -/* - * The size of the clock loop. - */ -#define LOOPPAGES total_pages +typedef enum pageout_hand { + POH_FRONT = 1, + POH_BACK, +} pageout_hand_t; + +typedef enum { + CKP_INELIGIBLE, + CKP_NOT_FREED, + CKP_FREED, +} checkpage_result_t; + +static checkpage_result_t checkpage(page_t *, pageout_hand_t); + +static struct clockinit { + bool ci_init; + pgcnt_t ci_lotsfree_min; + pgcnt_t ci_lotsfree_max; + pgcnt_t ci_lotsfree; + pgcnt_t ci_desfree; + pgcnt_t ci_minfree; + pgcnt_t ci_throttlefree; + pgcnt_t ci_pageout_reserve; + pgcnt_t ci_maxpgio; + pgcnt_t ci_maxfastscan; + pgcnt_t ci_fastscan; + pgcnt_t ci_slowscan; + pgcnt_t ci_handspreadpages; + uint_t ci_despagescanners; +} clockinit = { .ci_init = false }; + +static inline pgcnt_t +clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) +{ + if (value < minimum) { + return (minimum); + } else if (value > maximum) { + return (maximum); + } else { + return (value); + } +} + +static pgcnt_t +tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) +{ + if (initval == 0 || initval >= initval_ceiling) { + return (defval); + } else { + return (initval); + } +} /* * Local boolean to control scanning when zones are over their cap. Avoids @@ -242,108 +422,145 @@ kcondvar_t memavail_cv; static boolean_t zones_over = B_FALSE; /* - * Set up the paging constants for the page scanner clock-hand algorithm. - * Called at startup after the system is initialized and the amount of memory - * and number of paging devices is known (recalc will be 0). Called again once - * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples - * (recalc will be 1). - * - * Will also be called after a memory dynamic reconfiguration operation and - * recalc will be 1 in those cases too. + * On large memory systems, multiple instances of the page scanner are run, + * each responsible for a separate region of memory. 
This speeds up page + * invalidation under low memory conditions. * - * lotsfree is 1/64 of memory, but at least 512K (ha!). - * desfree is 1/2 of lotsfree. - * minfree is 1/2 of desfree. + * despagescanners can be set in /etc/system or via mdb and it will + * be used as a guide for how many page scanners to create; the value + * will be adjusted if it is not sensible. Otherwise, the number of + * page scanners is determined dynamically based on handspreadpages. */ -void -setupclock(int recalc) +static void +recalc_pagescanners(void) { - uint_t i; - pgcnt_t sz, tmp; + pgcnt_t sz; + uint_t des; - static spgcnt_t init_lfree, init_dfree, init_mfree; - static spgcnt_t init_tfree, init_preserve, init_mpgio; - static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages; - - looppages = LOOPPAGES; + /* If the initial calibration has not been done, take no action. */ + if (pageout_new_spread == 0) + return; /* - * setupclock can be called to recalculate the paging - * parameters in the case of dynamic reconfiguration of memory. - * So to make sure we make the proper calculations, if such a - * situation should arise, we save away the initial values - * of each parameter so we can recall them when needed. This - * way we don't lose the settings an admin might have made - * through the /etc/system file. + * If the desired number of scanners is set in /etc/system + * then try to use it. */ + if (despagescanners == 0 && clockinit.ci_despagescanners != 0) + despagescanners = clockinit.ci_despagescanners; - if (!recalc) { - init_lfree = lotsfree; - init_dfree = desfree; - init_mfree = minfree; - init_tfree = throttlefree; - init_preserve = pageout_reserve; - init_mpgio = maxpgio; - init_mfscan = maxfastscan; - init_fscan = fastscan; - init_sscan = slowscan; - init_hspages = handspreadpages; + if (despagescanners != 0) { + /* + * We have a desired number of page scanners, either from + * /etc/system or set via mdb. Try and use it (it will be + * clamped below). + */ + des = despagescanners; + } else { + /* + * Calculate the number of desired scanners based on the + * system's memory size. + * + * A 64GiB region size is used as the basis for calculating how + * many scanner threads should be created. For systems with up + * to 64GiB of RAM, a single thread is used; for very large + * memory systems the threads are limited to MAX_PSCAN_THREADS. + */ + sz = btop(64ULL << 30); + + if (sz > looppages) { + des = 1; + } else { + pgcnt_t tmp = sz; + + for (des = 1; tmp < looppages; des++) + tmp += sz; + } } /* - * Set up thresholds for paging: + * clamp the number of scanners so that we are under MAX_PSCAN_THREADS + * and so that each scanner covers at least 10% more than + * handspreadpages. */ + des = clamp(des, 1, + looppages / (handspreadpages + handspreadpages / 10)); + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS); +} - /* - * Lotsfree is threshold where paging daemon turns on. - */ - if (init_lfree == 0 || init_lfree >= looppages) - lotsfree = MAX(looppages / 64, btop(512 * 1024)); - else - lotsfree = init_lfree; +/* + * Set up the paging constants for the clock algorithm used by + * pageout_scanner(), and by the virtual memory system overall. See the + * comments at the top of this file for more information about the threshold + * values and system responses to memory pressure. + * + * This routine is called once by main() at startup, after the initial size of + * physical memory is determined. 
It may be called again later if memory is + * added to or removed from the system, or if new measurements of the page scan + * rate become available. + */ +void +setupclock(void) +{ + bool half = (pageout_threshold_style == 1); + bool recalc = true; - /* - * Desfree is amount of memory desired free. - * If less than this for extended period, start swapping. - */ - if (init_dfree == 0 || init_dfree >= lotsfree) - desfree = lotsfree / 2; - else - desfree = init_dfree; + looppages = total_pages; /* - * Minfree is minimal amount of free memory which is tolerable. + * The operator may have provided specific values for some of the + * tunables via /etc/system. On our first call, we preserve those + * values so that they can be used for subsequent recalculations. + * + * A value of zero for any tunable means we will use the default + * sizing. */ - if (init_mfree == 0 || init_mfree >= desfree) - minfree = desfree / 2; - else - minfree = init_mfree; + if (!clockinit.ci_init) { + clockinit.ci_init = true; + + clockinit.ci_lotsfree_min = lotsfree_min; + clockinit.ci_lotsfree_max = lotsfree_max; + clockinit.ci_lotsfree = lotsfree; + clockinit.ci_desfree = desfree; + clockinit.ci_minfree = minfree; + clockinit.ci_throttlefree = throttlefree; + clockinit.ci_pageout_reserve = pageout_reserve; + clockinit.ci_maxpgio = maxpgio; + clockinit.ci_maxfastscan = maxfastscan; + clockinit.ci_fastscan = fastscan; + clockinit.ci_slowscan = slowscan; + clockinit.ci_handspreadpages = handspreadpages; + clockinit.ci_despagescanners = despagescanners; - /* - * Throttlefree is the point at which we start throttling - * PG_WAIT requests until enough memory becomes available. - */ - if (init_tfree == 0 || init_tfree >= desfree) - throttlefree = minfree; - else - throttlefree = init_tfree; + /* + * The first call does not trigger a recalculation, only + * subsequent calls. + */ + recalc = false; + } /* - * Pageout_reserve is the number of pages that we keep in - * stock for pageout's own use. Having a few such pages - * provides insurance against system deadlock due to - * pageout needing pages. When freemem < pageout_reserve, - * non-blocking allocations are denied to any threads - * other than pageout and sched. (At some point we might - * want to consider a per-thread flag like T_PUSHING_PAGES - * to indicate that a thread is part of the page-pushing - * dance (e.g. an interrupt thread) and thus is entitled - * to the same special dispensation we accord pageout.) + * Configure paging threshold values. For more details on what each + * threshold signifies, see the comments at the top of this file. */ - if (init_preserve == 0 || init_preserve >= throttlefree) - pageout_reserve = throttlefree / 2; - else - pageout_reserve = init_preserve; + lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages, + btop(LOTSFREE_MAX_DEFAULT)); + lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max, + btop(LOTSFREE_MIN_DEFAULT)); + + lotsfree = tune(clockinit.ci_lotsfree, looppages, + clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max)); + + desfree = tune(clockinit.ci_desfree, lotsfree, + lotsfree / 2); + + minfree = tune(clockinit.ci_minfree, desfree, + half ? desfree / 2 : 3 * desfree / 4); + + throttlefree = tune(clockinit.ci_throttlefree, desfree, + minfree); + + pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree, + half ? throttlefree / 2 : 3 * throttlefree / 4); /* * Maxpgio thresholds how much paging is acceptable. 
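As a rough illustration of the default threshold sizing in the hunk above, the following standalone userland sketch (not part of the patch) mirrors the clamp()/tune() arithmetic. The 16 GiB memory size, the 4 KiB page size, and the little harness itself (MB, EX_PAGESIZE, BTOP, main) are assumptions chosen only for the example; the 1/64 fraction, the 16 MB/2 GB bounds, and the 1/2 and 3/4 ratios come from the code above (with pageout_threshold_style left at its default of 0).

#include <stdio.h>
#include <stdint.h>

#define	MB		(1024ULL * 1024ULL)
#define	EX_PAGESIZE	4096ULL			/* assumed page size */
#define	BTOP(b)		((b) / EX_PAGESIZE)	/* bytes to pages */

static uint64_t
clamp(uint64_t value, uint64_t minimum, uint64_t maximum)
{
	if (value < minimum)
		return (minimum);
	if (value > maximum)
		return (maximum);
	return (value);
}

int
main(void)
{
	uint64_t physmem = BTOP(16ULL * 1024 * MB);	/* pages in 16 GiB */
	uint64_t lotsfree_min = BTOP(16 * MB);		/* LOTSFREE_MIN_DEFAULT */
	uint64_t lotsfree_max = BTOP(2048 * MB);	/* LOTSFREE_MAX_DEFAULT */
	uint64_t lotsfree, desfree, minfree, throttlefree, pageout_reserve;

	/* lotsfree is 1/64 of memory, clamped to [16 MB, 2 GB]. */
	lotsfree = clamp(physmem / 64, lotsfree_min, lotsfree_max);
	desfree = lotsfree / 2;			/* 1/2 of lotsfree */
	minfree = 3 * desfree / 4;		/* 3/4 of desfree */
	throttlefree = minfree;			/* defaults to minfree */
	pageout_reserve = 3 * throttlefree / 4;	/* 3/4 of throttlefree */

	printf("lotsfree=%llu desfree=%llu minfree=%llu "
	    "throttlefree=%llu pageout_reserve=%llu (pages)\n",
	    (unsigned long long)lotsfree, (unsigned long long)desfree,
	    (unsigned long long)minfree, (unsigned long long)throttlefree,
	    (unsigned long long)pageout_reserve);
	return (0);
}

For the assumed 16 GiB, 4 KiB-page configuration this prints lotsfree=65536 (256 MB), desfree=32768, minfree=24576, throttlefree=24576 and pageout_reserve=18432 pages, which lines up with the approximate percentages (1.56%, 0.78%, 0.59%, 0.44% of physmem) quoted in the block comment at the top of the file.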
@@ -352,143 +569,160 @@ setupclock(int recalc) * * XXX - Does not account for multiple swap devices. */ - if (init_mpgio == 0) + if (clockinit.ci_maxpgio == 0) { maxpgio = (DISKRPM * 2) / 3; - else - maxpgio = init_mpgio; + } else { + maxpgio = clockinit.ci_maxpgio; + } /* - * When the system is in a low memory state, the page scan rate varies - * between fastscan and slowscan based on the amount of free memory - * available. When only zones are over their memory cap, the scan rate - * is always fastscan. - * - * The fastscan rate should be set based on the number pages that can - * be scanned per sec using ~10% of a CPU. Since this value depends on - * the processor, MMU, Ghz etc., it must be determined dynamically. - * - * When the scanner first starts up, fastscan will be set to 0 and - * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). - * However, once the scanner has collected enough samples, then fastscan - * is set to be the smaller of 1/2 of memory (looppages / loopfraction) - * or maxfastscan (which is set from pageout_new_spread). Thus, - * MAXHANDSPREADPAGES is irrelevant after the scanner is fully - * initialized. - * - * pageout_new_spread is calculated when the scanner first starts - * running. During this initial sampling period the nscan_limit - * is set to the total_pages of system memory. Thus, the scanner could - * theoretically scan all of memory in one pass. However, each sample - * is also limited by the %CPU budget. This is controlled by - * pageout_ticks which is set in schedpaging(). During the sampling - * period, pageout_ticks is set to max_pageout_ticks. This tick value - * is derived from the max_percent_cpu (80%) described above. On a - * system with more than a small amount of memory (~8GB), the scanner's - * %CPU will be the limiting factor in calculating pageout_new_spread. - * - * At the end of the sampling period, the pageout_rate indicates how - * many pages could be scanned per second. The pageout_new_spread is - * then set to be 1/10th of that (i.e. approximating 10% of a CPU). - * Of course, this value could still be more than the physical memory - * on the system. If so, fastscan is set to 1/2 of memory, as - * mentioned above. + * The clock scan rate varies between fastscan and slowscan + * based on the amount of free memory available. Fastscan + * rate should be set based on the number pages that can be + * scanned per sec using ~10% of processor time. Since this + * value depends on the processor, MMU, Mhz etc., it is + * difficult to determine it in a generic manner for all + * architectures. * - * All of this leads up to the setting of handspreadpages, which is - * set to fastscan. This is the distance, in pages, between the front - * and back hands during scanning. It will dictate which pages will - * be considered "hot" on the backhand and which pages will be "cold" - * and reclaimed + * Instead of trying to determine the number of pages scanned + * per sec for every processor, fastscan is set to be the smaller + * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling + * time is limited to ~4% of processor time. * - * If the scanner is limited by desscan, then at the highest rate it - * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the - * scanner is limited by the %CPU, then at the highest rate (20% of a - * CPU per cycle) the number of pages scanned could be much less. + * Setting fastscan to be 1/2 of memory allows pageout to scan + * all of memory in ~2 secs. 
This implies that user pages not + * accessed within 1 sec (assuming, handspreadpages == fastscan) + * can be reclaimed when free memory is very low. Stealing pages + * not accessed within 1 sec seems reasonable and ensures that + * active user processes don't thrash. * - * Thus, if the scanner is limited by desscan, then the handspreadpages - * setting means 1sec between the front and back hands, but if the - * scanner is limited by %CPU, it could be several seconds between the - * two hands. + * Smaller values of fastscan result in scanning fewer pages + * every second and consequently pageout may not be able to free + * sufficient memory to maintain the minimum threshold. Larger + * values of fastscan result in scanning a lot more pages which + * could lead to thrashing and higher CPU usage. * - * The basic assumption is that at the worst case, stealing pages - * not accessed within 1 sec seems reasonable and ensures that active - * user processes don't thrash. This is especially true when the system - * is in a low memory state. + * Fastscan needs to be limited to a maximum value and should not + * scale with memory to prevent pageout from consuming too much + * time for scanning on slow CPU's and avoid thrashing, as a + * result of scanning too many pages, on faster CPU's. + * The value of 64 Meg was chosen for MAXHANDSPREADPAGES + * (the upper bound for fastscan) based on the average number + * of pages that can potentially be scanned in ~1 sec (using ~4% + * of the CPU) on some of the following machines that currently + * run Solaris 2.x: * - * There are some additional factors to consider for the case of - * scanning when zones are over their cap. In this situation it is - * also likely that the machine will have a large physical memory which - * will take many seconds to fully scan (due to the %CPU and desscan - * limits per cycle). It is probable that there will be few (or 0) - * pages attributed to these zones in any single scanning cycle. The - * result is that reclaiming enough pages for these zones might take - * several additional seconds (this is generally not a problem since - * the zone physical cap is just a soft cap). + * average memory scanned in ~1 sec * - * This is similar to the typical multi-processor situation in which - * pageout is often unable to maintain the minimum paging thresholds - * under heavy load due to the fact that user processes running on - * other CPU's can be dirtying memory at a much faster pace than - * pageout can find pages to free. + * 25 Mhz SS1+: 23 Meg + * LX: 37 Meg + * 50 Mhz SC2000: 68 Meg * - * One potential approach to address both of these cases is to enable - * more than one CPU to run the page scanner, in such a manner that the - * various clock hands don't overlap. However, this also makes it more - * difficult to determine the values for fastscan, slowscan and - * handspreadpages. This is left as a future enhancement, if necessary. + * 40 Mhz 486: 26 Meg + * 66 Mhz 486: 42 Meg * - * When free memory falls just below lotsfree, the scan rate goes from - * 0 to slowscan (i.e., the page scanner starts running). This + * When free memory falls just below lotsfree, the scan rate + * goes from 0 to slowscan (i.e., pageout starts running). This * transition needs to be smooth and is achieved by ensuring that * pageout scans a small number of pages to satisfy the transient * memory demand. This is set to not exceed 100 pages/sec (25 per * wakeup) since scanning that many pages has no noticible impact * on system performance. 
* - * The swapper is currently used to free up memory when pageout is - * unable to meet memory demands. It does this by swapping out entire - * processes. In addition to freeing up memory, swapping also reduces - * the demand for memory because the swapped out processes cannot - * run, and thereby consume memory. However, this is a pathological - * state and performance will generally be considered unacceptable. + * In addition to setting fastscan and slowscan, pageout is + * limited to using ~4% of the CPU. This results in increasing + * the time taken to scan all of memory, which in turn means that + * user processes have a better opportunity of preventing their + * pages from being stolen. This has a positive effect on + * interactive and overall system performance when memory demand + * is high. + * + * Thus, the rate at which pages are scanned for replacement will + * vary linearly between slowscan and the number of pages that + * can be scanned using ~4% of processor time instead of varying + * linearly between slowscan and fastscan. + * + * Also, the processor time used by pageout will vary from ~1% + * at slowscan to ~4% at fastscan instead of varying between + * ~1% at slowscan and ~10% at fastscan. + * + * The values chosen for the various VM parameters (fastscan, + * handspreadpages, etc) are not universally true for all machines, + * but appear to be a good rule of thumb for the machines we've + * tested. They have the following ranges: + * + * cpu speed: 20 to 70 Mhz + * page size: 4K to 8K + * memory size: 16M to 5G + * page scan rate: 4000 - 17400 4K pages per sec + * + * The values need to be re-examined for machines which don't + * fall into the various ranges (e.g., slower or faster CPUs, + * smaller or larger pagesizes etc) shown above. + * + * On an MP machine, pageout is often unable to maintain the + * minimum paging thresholds under heavy load. This is due to + * the fact that user processes running on other CPU's can be + * dirtying memory at a much faster pace than pageout can find + * pages to free. The memory demands could be met by enabling + * more than one CPU to run the clock algorithm in such a manner + * that the various clock hands don't overlap. This also makes + * it more difficult to determine the values for fastscan, slowscan + * and handspreadpages. + * + * The swapper is currently used to free up memory when pageout + * is unable to meet memory demands by swapping out processes. + * In addition to freeing up memory, swapping also reduces the + * demand for memory by preventing user processes from running + * and thereby consuming memory. */ - if (init_mfscan == 0) { - if (pageout_new_spread != 0) + if (clockinit.ci_maxfastscan == 0) { + if (pageout_new_spread != 0) { maxfastscan = pageout_new_spread; - else + } else { maxfastscan = MAXHANDSPREADPAGES; + } } else { - maxfastscan = init_mfscan; + maxfastscan = clockinit.ci_maxfastscan; } - if (init_fscan == 0) { + + if (clockinit.ci_fastscan == 0) { fastscan = MIN(looppages / loopfraction, maxfastscan); } else { - fastscan = init_fscan; - if (fastscan > looppages / loopfraction) - fastscan = looppages / loopfraction; + fastscan = clockinit.ci_fastscan; + } + + if (fastscan > looppages / loopfraction) { + fastscan = looppages / loopfraction; } /* * Set slow scan time to 1/10 the fast scan time, but * not to exceed maxslowscan. 
*/ - if (init_sscan == 0) + if (clockinit.ci_slowscan == 0) { slowscan = MIN(fastscan / 10, maxslowscan); - else - slowscan = init_sscan; - if (slowscan > fastscan / 2) + } else { + slowscan = clockinit.ci_slowscan; + } + + if (slowscan > fastscan / 2) { slowscan = fastscan / 2; + } /* - * Handspreadpages is distance (in pages) between front and back + * Handspreadpages is the distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount * of pageable memory. * - * Since pageout is limited to the %CPU per cycle, setting - * handspreadpages to be "fastscan" results in the front hand being - * a few secs (varies based on the processor speed) ahead of the back - * hand at fastscan rates. + * Since pageout is limited to ~4% of the CPU, setting handspreadpages + * to be "fastscan" results in the front hand being a few secs + * (varies based on the processor speed) ahead of the back hand + * at fastscan rates. This distance can be further reduced, if + * necessary, by increasing the processor time used by pageout + * to be more than ~4% and preferably not more than ~10%. * * As a result, user processes have a much better chance of * referencing their pages before the back hand examines them. @@ -496,91 +730,62 @@ setupclock(int recalc) * the freelist since pageout does not end up freeing pages which * may be referenced a sec later. */ - if (init_hspages == 0) + if (clockinit.ci_handspreadpages == 0) { handspreadpages = fastscan; - else - handspreadpages = init_hspages; + } else { + handspreadpages = clockinit.ci_handspreadpages; + } /* * Make sure that back hand follows front hand by at least - * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible - * for the back hand to look at a page during the same wakeup of - * the pageout daemon in which the front hand cleared its ref bit. + * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the + * back hand to look at a page during the same wakeup of the pageout + * daemon in which the front hand cleared its ref bit. */ - if (handspreadpages >= looppages) + if (handspreadpages >= looppages) { handspreadpages = looppages - 1; - - if (recalc == 0) { - /* - * Setup basic values at initialization. - */ - pscan_region_sz = total_pages; - des_page_scanners = n_page_scanners = 1; - reset_hands[0] = B_TRUE; - return; } /* - * Recalculating - * - * We originally set the number of page scanners to 1. Now that we - * know what the handspreadpages is for a scanner, figure out how many - * scanners we should run. We want to ensure that the regions don't - * overlap and that they are not touching. - * - * A default 64GB region size is used as the initial value to calculate - * how many scanner threads we should create on lower memory systems. - * The idea is to limit the number of threads to a practical value - * (e.g. a 64GB machine really only needs one scanner thread). For very - * large memory systems, we limit ourselves to MAX_PSCAN_THREADS - * threads. - * - * The scanner threads themselves are evenly spread out around the - * memory "clock" in pageout_scanner when we reset the hands, and each - * thread will scan all of memory. + * Establish the minimum and maximum length of time to be spent + * scanning pages per wakeup, limiting the scanner duty cycle. The
The + * input percentage values (0-100) must be converted to a fraction of + * the number of nanoseconds in a second of wall time, then further + * scaled down by the number of scanner wakeups in a second. */ - sz = (btop(64ULL * 0x40000000ULL)); - if (sz < handspreadpages) { - /* - * 64GB is smaller than the separation between the front - * and back hands; use double handspreadpages. - */ - sz = handspreadpages << 1; - } - if (sz > total_pages) { - sz = total_pages; - } - /* Record region size for inspection with mdb, otherwise unused */ - pscan_region_sz = sz; + min_pageout_nsec = MAX(1, + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); + max_pageout_nsec = MAX(min_pageout_nsec, + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); - tmp = sz; - for (i = 1; tmp < total_pages; i++) { - tmp += sz; - } + /* + * If not called for recalculation, return and skip the remaining + * steps. + */ + if (!recalc) + return; - if (i > MAX_PSCAN_THREADS) - i = MAX_PSCAN_THREADS; + /* + * Set a flag to re-evaluate the clock hand positions. + */ + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; - des_page_scanners = i; + recalc_pagescanners(); } /* * Pageout scheduling. * * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING - * times a second. The pageout_ticks variable controls the percent of one - * CPU that each page scanner thread should consume (see min_percent_cpu - * and max_percent_cpu descriptions). The desscan variable records the number - * of pages pageout should examine in its next pass; schedpaging sets this - * value based on the amount of currently available memory. In addtition, the - * nscan variable records the number of pages pageout has examined in its - * current pass; schedpaging resets this value to zero each time it runs. + * setting the global variables nscan and desscan SCHEDPAGING_HZ + * times a second. Nscan records the number of pages pageout has examined + * in its current pass; schedpaging() resets this value to zero each time + * it runs. Desscan records the number of pages pageout should examine + * in its next pass; schedpaging() sets this value based on the amount of + * currently available memory. */ -#define RATETOSCHEDPAGING 4 /* times/second */ - -/* held while pageout_scanner or schedpaging are modifying shared data */ static kmutex_t pageout_mutex; /* @@ -592,7 +797,24 @@ static struct async_reqs *push_list; /* pending reqs */ static kmutex_t push_lock; /* protects req pool */ static kcondvar_t push_cv; -static int async_list_size = 256; /* number of async request structs */ +/* + * If pageout() is stuck on a single push for this many seconds, + * pageout_deadman() will assume the system has hit a memory deadlock. If set + * to 0, the deadman will have no effect. + * + * Note that we are only looking for stalls in the calls that pageout() makes + * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging + * I/O, which should not take long unless the underlying strategy call blocks + * indefinitely for memory. The actual I/O request happens (or fails) later. 
+ */ +uint_t pageout_deadman_seconds = 90; + +static uint_t pageout_stucktime = 0; +static bool pageout_pushing = false; +static uint64_t pageout_pushcount = 0; +static uint64_t pageout_pushcount_seen = 0; + +static int async_list_size = 8192; /* number of async request structs */ static void pageout_scanner(void *); @@ -623,153 +845,142 @@ schedpaging(void *arg) if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) kcage_cageout_wakeup(); - (void) atomic_swap_ulong(&nscan, 0); - vavail = freemem - deficit; - if (pageout_new_spread != 0) - vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + if (mutex_tryenter(&pageout_mutex)) { - /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. - */ - if ((needfree) && (pageout_new_spread == 0)) { - /* - * If we've not yet collected enough samples to - * calculate a spread, kick into high gear anytime - * needfree is non-zero. Note that desscan will not be - * the limiting factor for systems with larger memory; - * the %CPU will limit the scan. That will also be - * maxed out below. - */ - desscan = fastscan / RATETOSCHEDPAGING; - } else { - /* - * Once we've calculated a spread based on system - * memory and usage, just treat needfree as another - * form of deficit. - */ - spgcnt_t faststmp, slowstmp, result; + if (pageouts_running != 0) + goto out; - slowstmp = slowscan * vavail; - faststmp = fastscan * (lotsfree - vavail); - result = (slowstmp + faststmp) / - nz(lotsfree) / RATETOSCHEDPAGING; - desscan = (pgcnt_t)result; - } + /* No pageout scanner threads running. */ + nscan = 0; + vavail = freemem - deficit; + if (pageout_new_spread != 0) + vavail -= needfree; + vavail = clamp(vavail, 0, lotsfree); - /* - * If we've not yet collected enough samples to calculate a - * spread, also kick %CPU to the max. - */ - if (pageout_new_spread == 0) { - pageout_ticks = max_pageout_ticks; - } else { - pageout_ticks = min_pageout_ticks + - (lotsfree - vavail) * - (max_pageout_ticks - min_pageout_ticks) / - nz(lotsfree); - } + if (needfree > 0 && pageout_new_spread == 0) { + /* + * If we've not yet collected enough samples to + * calculate a spread, use the old logic of kicking + * into high gear anytime needfree is non-zero. + */ + desscan = fastscan / SCHEDPAGING_HZ; + } else { + /* + * Once we've calculated a spread based on system + * memory and usage, just treat needfree as another + * form of deficit. + */ + spgcnt_t faststmp, slowstmp, result; - if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { - /* - * We have finished the pagescan initialization and the desired - * number of page scanners has changed, either because - * initialization just finished, because of a memory DR, or - * because des_page_scanners has been modified on the fly (i.e. - * by mdb). If we need more scanners, start them now, otherwise - * the excess scanners will terminate on their own when they - * reset their hands. 
- */ - uint_t i; - uint_t curr_nscan = n_page_scanners; - pgcnt_t max = total_pages / handspreadpages; + slowstmp = slowscan * vavail; + faststmp = fastscan * (lotsfree - vavail); + result = (slowstmp + faststmp) / + nz(lotsfree) / SCHEDPAGING_HZ; + desscan = (pgcnt_t)result; + } - if (des_page_scanners > max) - des_page_scanners = max; + pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * + (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); - if (des_page_scanners > MAX_PSCAN_THREADS) { - des_page_scanners = MAX_PSCAN_THREADS; - } else if (des_page_scanners == 0) { - des_page_scanners = 1; - } + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t, + pageout_nsec); - /* - * Each thread has its own entry in the reset_hands array, so - * we don't need any locking in pageout_scanner to check the - * thread's reset_hands entry. Thus, we use a pre-allocated - * fixed size reset_hands array and upper limit on the number - * of pagescan threads. - * - * The reset_hands entries need to be true before we start new - * scanners, but if we're reducing, we don't want a race on the - * recalculation for the existing threads, so we set - * n_page_scanners first. - */ - n_page_scanners = des_page_scanners; - for (i = 0; i < MAX_PSCAN_THREADS; i++) { - reset_hands[i] = B_TRUE; - } + if (pageout_new_spread != 0 && despagescanners != 0 && + despagescanners != n_page_scanners) { + /* + * We have finished the pagescan initialisation and the + * desired number of page scanners has changed, either + * because initialisation just finished, because of a + * memory DR, or because despagescanners has been + * modified on the fly (i.e. by mdb). + */ + uint_t i, curr_nscan = n_page_scanners; + + /* Re-validate despagescanners */ + recalc_pagescanners(); + + n_page_scanners = despagescanners; + + for (i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + /* If we need more scanners, start them now. */ + if (n_page_scanners > curr_nscan) { + for (i = curr_nscan; i < n_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, + (void *)(uintptr_t)i, TS_RUN, + pageout_pri); + } + } - if (des_page_scanners > curr_nscan) { - /* Create additional pageout scanner threads. */ - for (i = curr_nscan; i < des_page_scanners; i++) { - (void) lwp_kernel_create(proc_pageout, - pageout_scanner, (void *)(uintptr_t)i, - TS_RUN, curthread->t_pri); + /* + * If the number of scanners has decreased, trigger a + * wakeup so that the excess threads will terminate. + */ + if (n_page_scanners < curr_nscan) { + WAKE_PAGEOUT_SCANNER(); } } - } - - zones_over = B_FALSE; - - if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { - if (!PAGE_SCAN_STARTUP) - low_mem_scan++; - DTRACE_PROBE(schedpage__wake__low); - WAKE_PAGEOUT_SCANNER(); - - } else if (zone_num_over_cap > 0) { - /* One or more zones are over their cap. */ - /* No page limit */ - desscan = total_pages; + zones_over = B_FALSE; - /* - * Increase the scanning CPU% to the max. This implies - * 80% of one CPU/sec if the scanner can run each - * opportunity. Can also be tuned via setting - * zone_pageout_ticks in /etc/system or with mdb. - */ - pageout_ticks = (zone_pageout_ticks != 0) ? - zone_pageout_ticks : max_pageout_ticks; + if (PAGE_SCAN_STARTUP) { + /* + * We still need to measure the rate at which the + * system is able to scan pages of memory. Each of + * these initial samples is a scan of as much system + * memory as practical, regardless of whether or not we + * are experiencing memory pressure. 
*/ + desscan = total_pages; + pageout_nsec = max_pageout_nsec; - zones_over = B_TRUE; - zone_cap_scan++; + DTRACE_PROBE(schedpage__wake__sample); + WAKE_PAGEOUT_SCANNER(); + } else if (freemem < lotsfree + needfree) { + /* + * We need more memory. + */ + low_mem_scan++; - DTRACE_PROBE(schedpage__wake__zone); - WAKE_PAGEOUT_SCANNER(); + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + } else if (zone_num_over_cap > 0) { + /* + * One or more zones are over their cap. + */ - } else { - /* - * There are enough free pages, no need to - * kick the scanner thread. And next time - * around, keep more of the `highly shared' - * pages. - */ - cv_signal_pageout(); + /* No page limit */ + desscan = total_pages; - mutex_enter(&pageout_mutex); - if (po_share > MIN_PO_SHARE) { - po_share >>= 1; + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_nsec in /etc/system or with mdb. + */ + pageout_nsec = (zone_pageout_nsec != 0) ? + zone_pageout_nsec : max_pageout_nsec; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); + } else { + /* + * There are enough free pages, no need to + * kick the scanner thread. And next time + * around, keep more of the `highly shared' + * pages. + */ + cv_signal_pageout(); + if (po_share > MIN_PO_SHARE) { + po_share >>= 1; + } } +out: mutex_exit(&pageout_mutex); } @@ -782,61 +993,55 @@ schedpaging(void *arg) if (kmem_avail() > 0) cv_broadcast(&memavail_cv); - (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING); + (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ); } pgcnt_t pushes; ulong_t push_list_size; /* # of requests on pageout queue */ -#define FRONT 1 -#define BACK 2 - -int dopageout = 1; /* /etc/system tunable to disable page reclamation */ +/* + * Paging out should always be enabled. This tunable exists to hold pageout + * for debugging purposes. If set to 0, pageout_scanner() will go back to + * sleep each time it is woken by schedpaging(). + */ +uint_t dopageout = 1; /* * The page out daemon, which runs as process 2. * - * Page out occurs when either: - * a) there is less than lotsfree pages, - * b) there are one or more zones over their physical memory cap. - * - * The daemon treats physical memory as a circular array of pages and scans the - * pages using a 'two-handed clock' algorithm. The front hand moves through - * the pages, clearing the reference bit. The back hand travels a distance - * (handspreadpages) behind the front hand, freeing the pages that have not - * been referenced in the time since the front hand passed. If modified, they - * are first written to their backing store before being freed. - * - * In order to make page invalidation more responsive on machines with larger - * memory, multiple pageout_scanner threads may be created. In this case, the - * threads are evenly distributed around the the memory "clock face" so that - * memory can be reclaimed more quickly (that is, there can be large regions in - * which no pages can be reclaimed by a single thread, leading to lag which - * causes undesirable behavior such as htable stealing). + * The daemon treats physical memory as a circular array of pages and scans + * the pages using a 'two-handed clock' algorithm. The front hand moves + * through the pages, clearing the reference bit.
The back hand travels a + * distance (handspreadpages) behind the front hand, freeing the pages that + * have not been referenced in the time since the front hand passed. If + * modified, they are first written to their backing store before being + * freed. * - * As long as there are at least lotsfree pages, or no zones over their cap, - * then pageout_scanner threads are not run. When pageout_scanner threads are - * running for case (a), all pages are considered for pageout. For case (b), - * only pages belonging to a zone over its cap will be considered for pageout. + * In order to make page invalidation more responsive on machines with + * larger memory, multiple pageout_scanner threads may be created. In this + * case, each thread is given a segment of the memory "clock face" so that + * memory can be reclaimed more quickly. * - * There are multiple threads that act on behalf of the pageout process. - * A set of threads scan pages (pageout_scanner) and frees them up if - * they don't require any VOP_PUTPAGE operation. If a page must be - * written back to its backing store, the request is put on a list - * and the other (pageout) thread is signaled. The pageout thread - * grabs VOP_PUTPAGE requests from the list, and processes them. - * Some filesystems may require resources for the VOP_PUTPAGE - * operations (like memory) and hence can block the pageout - * thread, but the pageout_scanner threads can still operate. There is still - * no guarantee that memory deadlocks cannot occur. + * As long as there are at least lotsfree pages, or no zones over their + * cap, then pageout_scanner threads are not run. When pageout_scanner + * threads are running for case (a), all pages are considered for pageout. + * For case (b), only pages belonging to a zone over its cap will be + * considered for pageout. * - * The pageout_scanner parameters are determined in schedpaging(). + * There are multiple threads that act on behalf of the pageout process. A + * set of threads scan pages (pageout_scanner) and frees them up if they + * don't require any VOP_PUTPAGE operation. If a page must be written back + * to its backing store, the request is put on a list and the other + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE + * requests from the list, and processes them. Some filesystems may require + * resources for the VOP_PUTPAGE operations (like memory) and hence can + * block the pageout thread, but the scanner thread can still operate. + * There is still no guarantee that memory deadlocks cannot occur. */ void pageout() { struct async_reqs *arg; - pri_t pageout_pri; int i; pgcnt_t max_pushes; callb_cpr_t cprinfo; @@ -863,14 +1068,16 @@ pageout() kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); req_freelist = push_req; - for (i = 0; i < async_list_size - 1; i++) + for (i = 0; i < async_list_size - 1; i++) { push_req[i].a_next = &push_req[i + 1]; + } - pageout_pri = curthread->t_pri; + pageout_pri = curthread->t_pri - 1; - /* Create the (first) pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, - TS_RUN, pageout_pri - 1); + /* Create the first pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, + (void *)0, /* this is instance 0, not NULL */ + TS_RUN, pageout_pri); /* * kick off pageout scheduler. @@ -888,7 +1095,7 @@ pageout() /* * Limit pushes to avoid saturating pageout devices. 
*/ - max_pushes = maxpgio / RATETOSCHEDPAGING; + max_pushes = maxpgio / SCHEDPAGING_HZ; CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); for (;;) { @@ -902,9 +1109,11 @@ pageout() } push_list = arg->a_next; arg->a_next = NULL; + pageout_pushing = true; mutex_exit(&push_lock); DTRACE_PROBE(pageout__push); + if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -914,6 +1123,8 @@ pageout() VN_RELE(arg->a_vp); mutex_enter(&push_lock); + pageout_pushing = false; + pageout_pushcount++; arg->a_next = req_freelist; /* back on freelist */ req_freelist = arg; push_list_size--; @@ -927,134 +1138,172 @@ pageout() static void pageout_scanner(void *a) { - struct page *fronthand, *backhand; - uint_t count, iter = 0; + struct page *fronthand, *backhand, *fronthandstart; + struct page *regionstart, *regionend; + uint_t laps; callb_cpr_t cprinfo; - pgcnt_t nscan_cnt, nscan_limit; + pgcnt_t nscan_cnt, tick; pgcnt_t pcount; - uint_t inst = (uint_t)(uintptr_t)a; + bool bhwrapping, fhwrapping; hrtime_t sample_start, sample_end; - clock_t pageout_lbolt; - kmutex_t pscan_mutex; + uint_t inst = (uint_t)(uintptr_t)a; VERIFY3U(inst, <, MAX_PSCAN_THREADS); - mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); + CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); + mutex_enter(&pageout_mutex); - CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); - mutex_enter(&pscan_mutex); + /* + * The restart case does not attempt to point the hands at roughly + * the right point on the assumption that after one circuit things + * will have settled down, and restarts shouldn't be that often. + */ + reset_hands[inst] = B_TRUE; - min_pageout_ticks = MAX(1, - ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); - max_pageout_ticks = MAX(min_pageout_ticks, - ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING); + pageouts_running++; + mutex_exit(&pageout_mutex); loop: cv_signal_pageout(); + mutex_enter(&pageout_mutex); + pageouts_running--; CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&proc_pageout->p_cv, &pscan_mutex); - CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex); + cv_wait(&proc_pageout->p_cv, &pageout_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + pageouts_running++; + mutex_exit(&pageout_mutex); - if (!dopageout) + /* + * Check if pageout has been disabled for debugging purposes. + */ + if (!dopageout) { goto loop; + } + /* + * One may reset the clock hands and scanned region for debugging + * purposes. Hands will also be reset on first thread startup, if + * the number of scanning threads (n_page_scanners) changes, or if + * memory is added to, or removed from, the system. + */ if (reset_hands[inst]) { struct page *first; - pgcnt_t offset = total_pages / n_page_scanners; reset_hands[inst] = B_FALSE; + if (inst >= n_page_scanners) { /* - * The desired number of page scanners has been - * reduced and this instance is no longer wanted. - * Exit the lwp. - */ + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. 
+ */ VERIFY3U(inst, !=, 0); - mutex_exit(&pscan_mutex); + DTRACE_PROBE1(pageout__exit, uint_t, inst); + mutex_enter(&pageout_mutex); + pageouts_running--; + mutex_exit(&pageout_mutex); mutex_enter(&curproc->p_lock); lwp_exit(); + /* NOTREACHED */ } + first = page_first(); + /* - * The reset case repositions the hands at the proper place - * on the memory clock face to prevent creep into another - * thread's active region or when the number of threads has - * changed. - * - * Set the two clock hands to be separated by a reasonable - * amount, but no more than 360 degrees apart. - * - * If inst == 0, backhand starts at first page, otherwise - * it is (inst * offset) around the memory "clock face" so that - * we spread out each scanner instance evenly. + * Each scanner thread gets its own sector of the memory + * clock face. */ - first = page_first(); - backhand = page_nextn(first, offset * inst); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); + pgcnt_t span, offset; + + span = looppages / n_page_scanners; + VERIFY3U(span, >, handspreadpages); + + offset = inst * span; + regionstart = page_nextn(first, offset); + if (inst == n_page_scanners - 1) { + /* The last instance goes up to the last page */ + regionend = page_nextn(first, looppages - 1); } else { - fronthand = page_nextn(backhand, handspreadpages); + regionend = page_nextn(regionstart, span - 1); } + + backhand = regionstart; + fronthand = page_nextn(backhand, handspreadpages); + tick = 1; + + bhwrapping = fhwrapping = B_FALSE; + + DTRACE_PROBE4(pageout__reset, uint_t, inst, + pgcnt_t, regionstart, pgcnt_t, regionend, + pgcnt_t, fronthand); } /* - * This CPU kstat is only incremented here and we're obviously on this - * CPU, so no lock. + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); - count = 0; - /* Kernel probe */ - TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); + /* + * Keep track of the number of times we have scanned all the way around + * the loop on this wakeup. + */ + laps = 0; - pcount = 0; + /* + * Track the number of pages visited during this scan so that we can + * periodically measure our duty cycle. + */ nscan_cnt = 0; - if (PAGE_SCAN_STARTUP) { - nscan_limit = total_pages; - } else { - nscan_limit = desscan; - } + pcount = 0; + + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan, + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand); - DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, - page_t *, backhand, page_t *, fronthand); + /* + * Record the initial position of the front hand for this cycle so + * that we can detect when the hand wraps around. + */ + fronthandstart = fronthand; - pageout_lbolt = ddi_get_lbolt(); sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. 
- * Only scan while at least one of these is true: - * 1) one or more zones is over its cap - * 2) there is not enough free memory - * 3) during page scan startup when determining sample data */ - while (nscan_cnt < nscan_limit && - (zones_over || - freemem < lotsfree + needfree || - PAGE_SCAN_STARTUP)) { - int rvfront, rvback; + while (nscan_cnt < desscan) { + checkpage_result_t rvfront, rvback; + + /* + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data + */ + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree && + !zones_over) { + /* + * We are not sampling and enough memory has become + * available that scanning is no longer required. + */ + DTRACE_PROBE1(pageout__memfree, uint_t, inst); + break; + } - DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount); /* - * Check to see if we have exceeded our %CPU budget - * for this wakeup, but not on every single page visited, - * just every once in a while. + * Periodically check to see if we have exceeded the CPU duty + * cycle for a single wakeup. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { - clock_t pageout_cycle_ticks; + hrtime_t pageout_cycle_nsec; - pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; - if (pageout_cycle_ticks >= pageout_ticks) { - /* - * This is where we normally break out of the - * loop when scanning zones or sampling. - */ - if (!zones_over) { + pageout_cycle_nsec = gethrtime() - sample_start; + if (pageout_cycle_nsec >= pageout_nsec) { + if (!zones_over) atomic_inc_64(&pageout_timeouts); - } DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } @@ -1062,12 +1311,14 @@ loop: /* * If checkpage manages to add a page to the free list, - * we give ourselves another couple of trips around memory. + * we give ourselves another couple of trips around the loop. */ - if ((rvfront = checkpage(fronthand, FRONT)) == 1) - count = 0; - if ((rvback = checkpage(backhand, BACK)) == 1) - count = 0; + if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) { + laps = 0; + } + if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) { + laps = 0; + } ++pcount; @@ -1080,25 +1331,35 @@ loop: /* * Don't include ineligible pages in the number scanned. */ - if (rvfront != -1 || rvback != -1) + if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { nscan_cnt++; + } + + if (bhwrapping) { + backhand = regionstart; + bhwrapping = B_FALSE; + } else { + backhand = page_nextn(backhand, tick); + if (backhand == regionend) + bhwrapping = B_TRUE; + } - backhand = page_next(backhand); + if (fhwrapping) { + fronthand = regionstart; + fhwrapping = B_FALSE; + } else { + fronthand = page_nextn(fronthand, tick); + if (fronthand == regionend) + fhwrapping = B_TRUE; + } /* - * backhand update and wraparound check are done separately - * because lint barks when it finds an empty "if" body + * The front hand has wrapped around during this wakeup. */ - - if ((fronthand = page_next(fronthand)) == page_first()) { - DTRACE_PROBE1(pageout__wrap__front, uint_t, inst); - - /* - * Every 64 wraps we reposition our hands within our - * region to prevent creep into another thread. 
- */ - if ((++iter % pageout_reset_cnt) == 0) - reset_hands[inst] = B_TRUE; + if (fronthand == fronthandstart) { + laps++; + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst, + uint_t, laps); /* * This CPU kstat is only incremented here and we're @@ -1107,96 +1368,134 @@ loop: CPU_STATS_ADDQ(CPU, vm, rev, 1); /* - * If scanning because the system is low on memory, * then when we wraparound memory we want to try to * reclaim more pages. * If scanning only because zones are over their cap, * then wrapping is common and we simply keep going. - */ - if (freemem < lotsfree + needfree && ++count > 1) { + */ + if (laps > 1 && freemem < lotsfree + needfree) { /* - * The system is low on memory. * Extremely unlikely, but it happens. - * We went around memory at least once - * and didn't reclaim enough. + * We went around the loop at least once + * and didn't get far enough. * If we are still skipping `highly shared' * pages, skip fewer of them. Otherwise, * give up till the next clock tick. */ - mutex_enter(&pageout_mutex); if (po_share < MAX_PO_SHARE) { po_share <<= 1; - mutex_exit(&pageout_mutex); } else { - /* - * Really a "goto loop", but if someone - * is tracing or TNF_PROBE_ing, hit - * those probes first. - */ - mutex_exit(&pageout_mutex); break; } } } } - atomic_add_long(&nscan, nscan_cnt); - sample_end = gethrtime(); + atomic_add_long(&nscan, nscan_cnt); - DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, - uint_t, inst); - - /* Kernel probe */ - TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, - freemem); + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps, + pgcnt_t, nscan_cnt, pgcnt_t, pcount) /* - * The following two blocks are only relevant when the scanner is - * first started up. After the scanner runs for a while, neither of - * the conditions will ever be true again. - * * The global variables used below are only modified by this thread and * only during initial scanning when there is a single page scanner - * thread running. Thus, we don't use any locking. + * thread running. */ - if (PAGE_SCAN_STARTUP) { + if (pageout_new_spread == 0) { VERIFY3U(inst, ==, 0); - pageout_sample_pages += pcount; - pageout_sample_etime += sample_end - sample_start; - ++pageout_sample_cnt; - } else if (pageout_new_spread == 0) { - uint_t i; + if (PAGE_SCAN_STARTUP) { + /* + * Continue accumulating samples until we have enough + * to get a reasonable value for average scan rate. + */ + pageout_sample_pages += pcount; + pageout_sample_etime += sample_end - sample_start; + ++pageout_sample_cnt; + } + if (!PAGE_SCAN_STARTUP) { + /* + * We have enough samples, set the spread. + */ + pageout_rate = (hrrate_t)pageout_sample_pages * + (hrrate_t)(NANOSEC) / pageout_sample_etime; + pageout_new_spread = pageout_rate / 10; + setupclock(); + } + } + + goto loop; +} + +/* + * The pageout deadman is run once per second by clock(). + */ +void +pageout_deadman(void) +{ + if (panicstr != NULL) { /* - * We have run enough samples, set the spread. + * There is no pageout after panic. */ - VERIFY3U(inst, ==, 0); - pageout_rate = (hrrate_t)pageout_sample_pages * - (hrrate_t)(NANOSEC) / pageout_sample_etime; - pageout_new_spread = pageout_rate / 10; - setupclock(1); + return; } - goto loop; + if (pageout_deadman_seconds == 0) { + /* + * The deadman is not enabled. + */ + return; + } + + if (!pageout_pushing) { + goto reset; + } + + /* + * We are pushing a page. 
Check to see if it is the same call we saw + * last time we looked: + */ + if (pageout_pushcount != pageout_pushcount_seen) { + /* + * It is a different call from the last check, so we are not + * stuck. + */ + goto reset; + } + + if (++pageout_stucktime >= pageout_deadman_seconds) { + panic("pageout_deadman: stuck pushing the same page for %d " + "seconds (freemem is %lu)", pageout_deadman_seconds, + freemem); + } + + return; + +reset: + /* + * Reset our tracking state to reflect that we are not stuck: + */ + pageout_stucktime = 0; + pageout_pushcount_seen = pageout_pushcount; } /* * Look at the page at hand. If it is locked (e.g., for physical i/o), * system (u., page table) or free, then leave it alone. Otherwise, * if we are running the front hand, turn off the page's reference bit. - * If running the back hand, check whether the page has been reclaimed. - * If not, free the page, pushing it to disk first if necessary. + * If the proc is over maxrss, we take it. If running the back hand, + * check whether the page has been reclaimed. If not, free the page, + * pushing it to disk first if necessary. * * Return values: - * -1 if the page is not a candidate at all, - * 0 if not freed, or - * 1 if we freed it. + * CKP_INELIGIBLE if the page is not a candidate at all, + * CKP_NOT_FREED if the page was not freed, or + * CKP_FREED if we freed it. */ -static int -checkpage(struct page *pp, int whichhand) +static checkpage_result_t +checkpage(struct page *pp, pageout_hand_t whichhand) { int ppattr; int isfs = 0; @@ -1206,7 +1505,7 @@ checkpage(struct page *pp, int whichhand) /* * Skip pages: - * - associated with the kernel vnode since + * - associated with the kernel vnode since * they are always "exclusively" locked. * - that are free * - that are shared more than po_share'd times @@ -1218,21 +1517,21 @@ checkpage(struct page *pp, int whichhand) if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || hat_page_checkshare(pp, po_share)) { - return (-1); + return (CKP_INELIGIBLE); } if (!page_trylock(pp, SE_EXCL)) { /* * Skip the page if we can't acquire the "exclusive" lock. */ - return (-1); + return (CKP_INELIGIBLE); } else if (PP_ISFREE(pp)) { /* * It became free between the above check and our actually - * locking the page. Oh, well there will be other pages. + * locking the page. Oh well, there will be other pages. */ page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } /* @@ -1242,7 +1541,7 @@ checkpage(struct page *pp, int whichhand) */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } if (zones_over) { @@ -1251,11 +1550,11 @@ checkpage(struct page *pp, int whichhand) if (pp->p_zoneid == ALL_ZONES || zone_pdata[pp->p_zoneid].zpers_over == 0) { /* - * Cross-zone shared page, or zone not over it's cap. - * Leave the page alone. - */ + * Cross-zone shared page, or zone not over it's cap. + * Leave the page alone. + */ page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } zid = pp->p_zoneid; } @@ -1263,7 +1562,6 @@ checkpage(struct page *pp, int whichhand) /* * Maintain statistics for what we are freeing */ - if (pp->p_vnode != NULL) { if (pp->p_vnode->v_flag & VVMEXEC) isexec = 1; @@ -1277,34 +1575,44 @@ checkpage(struct page *pp, int whichhand) * The back hand examines the REF bit and always considers * SHARED pages as referenced. 
*/ - if (whichhand == FRONT) + if (whichhand == POH_FRONT) { pagesync_flag = HAT_SYNC_ZERORM; - else + } else { pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_SHARED; + } ppattr = hat_pagesync(pp, pagesync_flag); recheck: /* - * If page is referenced; fronthand makes unreferenced and reclaimable. - * For the backhand, a process referenced the page since the front hand - * went by, so it's not a candidate for freeing up. + * If page is referenced; make unreferenced but reclaimable. + * If this page is not referenced, then it must be reclaimable + * and we can add it to the free list. */ if (ppattr & P_REF) { - DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand); - if (whichhand == FRONT) { + DTRACE_PROBE2(pageout__isref, page_t *, pp, + pageout_hand_t, whichhand); + + if (whichhand == POH_FRONT) { + /* + * Checking of rss or madvise flags needed here... + * + * If not "well-behaved", fall through into the code + * for not referenced. + */ hat_clrref(pp); } + + /* + * Somebody referenced the page since the front + * hand went by, so it's not a candidate for + * freeing up. + */ page_unlock(pp); - return (0); + return (CKP_NOT_FREED); } - /* - * This page is not referenced, so it must be reclaimable and we can - * add it to the free list. This can be done by either hand. - */ - VM_STAT_ADD(pageoutvmstats.checkpage[0]); /* @@ -1315,31 +1623,32 @@ recheck: if (!page_try_demote_pages(pp)) { VM_STAT_ADD(pageoutvmstats.checkpage[1]); page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } + ASSERT(pp->p_szc == 0); VM_STAT_ADD(pageoutvmstats.checkpage[2]); + /* - * since page_try_demote_pages() could have unloaded some + * Since page_try_demote_pages() could have unloaded some * mappings it makes sense to reload ppattr. */ ppattr = hat_page_getattr(pp, P_MOD | P_REF); } /* - * If the page is currently dirty, we have to arrange - * to have it cleaned before it can be freed. + * If the page is currently dirty, we have to arrange to have it + * cleaned before it can be freed. * * XXX - ASSERT(pp->p_vnode != NULL); */ - if ((ppattr & P_MOD) && pp->p_vnode) { + if ((ppattr & P_MOD) && pp->p_vnode != NULL) { struct vnode *vp = pp->p_vnode; u_offset_t offset = pp->p_offset; /* - * Note: There is no possibility to test for process being - * swapped out or about to exit since we can't get back to - * process(es) from the page. + * XXX - Test for process being swapped out or about to exit? + * [Can't get back to process(es) using the page.] */ /* @@ -1351,34 +1660,33 @@ recheck: page_unlock(pp); /* - * Queue i/o request for the pageout thread. + * Queue I/O request for the pageout thread. */ if (!queue_io_request(vp, offset)) { VN_RELE(vp); - return (0); + return (CKP_NOT_FREED); } if (isfs) { zone_pageout_stat(zid, ZPO_DIRTY); } else { zone_pageout_stat(zid, ZPO_ANONDIRTY); } - return (1); + return (CKP_FREED); } /* - * Now we unload all the translations, - * and put the page back on to the free list. - * If the page was used (referenced or modified) after - * the pagesync but before it was unloaded we catch it - * and handle the page properly. + * Now we unload all the translations and put the page back on to the + * free list. If the page was used (referenced or modified) after the + * pagesync but before it was unloaded we catch it and handle the page + * properly. 
*/ - DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand); + DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ppattr = hat_page_getattr(pp, P_MOD | P_REF); - if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) + if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) { goto recheck; + } - /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_FREE, 0, kcred); CPU_STATS_ADD_K(vm, dfree, 1); @@ -1395,7 +1703,7 @@ recheck: zone_pageout_stat(zid, ZPO_ANON); } - return (1); /* freed a page! */ + return (CKP_FREED); } /* diff --git a/usr/src/uts/common/os/watchpoint.c b/usr/src/uts/common/os/watchpoint.c index eee612ef93..24db9637d4 100644 --- a/usr/src/uts/common/os/watchpoint.c +++ b/usr/src/uts/common/os/watchpoint.c @@ -821,7 +821,6 @@ watch_xcopyin(const void *uaddr, void *kaddr, size_t count) count -= part; } -error: /* if we hit a watched address, do the watchpoint logic */ if (watchcode && (!sys_watchpoint(vaddr, watchcode, ta) || |