Diffstat (limited to 'usr/src/uts/common/os')
-rw-r--r--  usr/src/uts/common/os/autoconf.c          |    2
-rw-r--r--  usr/src/uts/common/os/bio.c               |    1
-rw-r--r--  usr/src/uts/common/os/bitmap.c            |   19
-rw-r--r--  usr/src/uts/common/os/cap_util.c          |    4
-rw-r--r--  usr/src/uts/common/os/clock.c             |    6
-rw-r--r--  usr/src/uts/common/os/clock_highres.c     |    2
-rw-r--r--  usr/src/uts/common/os/clock_process.c     |  130
-rw-r--r--  usr/src/uts/common/os/clock_thread.c      |  191
-rw-r--r--  usr/src/uts/common/os/cpu.c               |   11
-rw-r--r--  usr/src/uts/common/os/cred.c              |    9
-rw-r--r--  usr/src/uts/common/os/dacf.c              |    2
-rw-r--r--  usr/src/uts/common/os/dacf_clnt.c         |    6
-rw-r--r--  usr/src/uts/common/os/ddi.c               |    4
-rw-r--r--  usr/src/uts/common/os/ddi_hp_impl.c       |   12
-rw-r--r--  usr/src/uts/common/os/ddi_intr_impl.c     |    6
-rw-r--r--  usr/src/uts/common/os/ddi_intr_irm.c      |    4
-rw-r--r--  usr/src/uts/common/os/ddi_ufm.c           |   85
-rw-r--r--  usr/src/uts/common/os/ddifm.c             |    4
-rw-r--r--  usr/src/uts/common/os/devcfg.c            |  133
-rw-r--r--  usr/src/uts/common/os/devid_cache.c       |    6
-rw-r--r--  usr/src/uts/common/os/dkioc_free_util.c   |  442
-rw-r--r--  usr/src/uts/common/os/driver_lyr.c        |  133
-rw-r--r--  usr/src/uts/common/os/errorq.c            |    2
-rw-r--r--  usr/src/uts/common/os/exacct.c            |    6
-rw-r--r--  usr/src/uts/common/os/exit.c              |   71
-rw-r--r--  usr/src/uts/common/os/fio.c               |   24
-rw-r--r--  usr/src/uts/common/os/flock.c             |   62
-rw-r--r--  usr/src/uts/common/os/fm.c                |    7
-rw-r--r--  usr/src/uts/common/os/grow.c              |   21
-rw-r--r--  usr/src/uts/common/os/ip_cksum.c          |    8
-rw-r--r--  usr/src/uts/common/os/kcpc.c              |   77
-rw-r--r--  usr/src/uts/common/os/klpd.c              |    2
-rw-r--r--  usr/src/uts/common/os/kmem.c              |   26
-rw-r--r--  usr/src/uts/common/os/ksensor.c           |  871
-rw-r--r--  usr/src/uts/common/os/lgrp.c              |   15
-rw-r--r--  usr/src/uts/common/os/log_sysevent.c      |   11
-rw-r--r--  usr/src/uts/common/os/logsubr.c           |   31
-rw-r--r--  usr/src/uts/common/os/main.c              |    2
-rw-r--r--  usr/src/uts/common/os/mem_config.c        |    4
-rw-r--r--  usr/src/uts/common/os/memlist_new.c       |    8
-rw-r--r--  usr/src/uts/common/os/mmapobj.c           |   10
-rw-r--r--  usr/src/uts/common/os/modctl.c            |    2
-rw-r--r--  usr/src/uts/common/os/modsubr.c           |    3
-rw-r--r--  usr/src/uts/common/os/ndifm.c             |    4
-rw-r--r--  usr/src/uts/common/os/panic.c             |    2
-rw-r--r--  usr/src/uts/common/os/policy.c            |   46
-rw-r--r--  usr/src/uts/common/os/pool.c              |   10
-rw-r--r--  usr/src/uts/common/os/priv.c              |    3
-rw-r--r--  usr/src/uts/common/os/priv_defs           |    6
-rw-r--r--  usr/src/uts/common/os/rctl.c              |    6
-rw-r--r--  usr/src/uts/common/os/schedctl.c          |   33
-rw-r--r--  usr/src/uts/common/os/share.c             |   19
-rw-r--r--  usr/src/uts/common/os/shm.c               |    4
-rw-r--r--  usr/src/uts/common/os/softint.c           |   46
-rw-r--r--  usr/src/uts/common/os/space.c             |   50
-rw-r--r--  usr/src/uts/common/os/streamio.c          |    2
-rw-r--r--  usr/src/uts/common/os/strsubr.c           |   95
-rw-r--r--  usr/src/uts/common/os/sunddi.c            |  155
-rw-r--r--  usr/src/uts/common/os/sunmdi.c            |   22
-rw-r--r--  usr/src/uts/common/os/sunpci.c            |  100
-rw-r--r--  usr/src/uts/common/os/sunpm.c             |   23
-rw-r--r--  usr/src/uts/common/os/swapgeneric.c       |    8
-rw-r--r--  usr/src/uts/common/os/sysent.c            |   20
-rw-r--r--  usr/src/uts/common/os/timer.c             |  180
-rw-r--r--  usr/src/uts/common/os/timers.c            |    4
-rw-r--r--  usr/src/uts/common/os/upanic.c            |   98
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c        | 1696
-rw-r--r--  usr/src/uts/common/os/watchpoint.c        |    1
68 files changed, 3670 insertions(+), 1438 deletions(-)
diff --git a/usr/src/uts/common/os/autoconf.c b/usr/src/uts/common/os/autoconf.c
index 71af31ba2b..44ec3353fc 100644
--- a/usr/src/uts/common/os/autoconf.c
+++ b/usr/src/uts/common/os/autoconf.c
@@ -53,6 +53,7 @@
#include <sys/fm/util.h>
#include <sys/ddifm_impl.h>
#include <sys/ddi_ufm_impl.h>
+#include <sys/ksensor_impl.h>
extern dev_info_t *top_devinfo;
extern dev_info_t *scsi_vhci_dip;
@@ -96,6 +97,7 @@ setup_ddi(void)
ndi_fm_init();
irm_init();
ufm_init();
+ ksensor_init();
(void) i_ddi_load_drvconf(DDI_MAJOR_T_NONE);
diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c
index abaaef1b4a..daf3b638a6 100644
--- a/usr/src/uts/common/os/bio.c
+++ b/usr/src/uts/common/os/bio.c
@@ -1488,7 +1488,6 @@ bio_getfreeblk(long bsize)
*/
bio_mem_get(bsize); /* Account for our memory request */
-again:
bp = bio_bhdr_alloc(); /* Get a buf hdr */
sema_p(&bp->b_sem); /* Should never fail */
diff --git a/usr/src/uts/common/os/bitmap.c b/usr/src/uts/common/os/bitmap.c
index 46fae44adb..06dd326f4a 100644
--- a/usr/src/uts/common/os/bitmap.c
+++ b/usr/src/uts/common/os/bitmap.c
@@ -19,17 +19,16 @@
*
* CDDL HEADER END
*/
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2022 Oxide Computer Company
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Operations on bitmaps of arbitrary size
* A bitmap is a vector of 1 or more ulongs.
@@ -39,7 +38,7 @@
#include <sys/types.h>
#include <sys/bitmap.h>
-#include <sys/debug.h> /* ASSERT */
+#include <sys/debug.h>
/*
* Return index of first available bit in denoted bitmap, or -1 for
@@ -49,7 +48,7 @@
* Caller is responsible for range checks.
*/
index_t
-bt_availbit(ulong_t *bitmap, size_t nbits)
+bt_availbit(const ulong_t *bitmap, size_t nbits)
{
index_t maxword; /* index of last in map */
index_t wx; /* word index in map */
@@ -92,7 +91,7 @@ bt_availbit(ulong_t *bitmap, size_t nbits)
* the word specified by wx.
*/
int
-bt_gethighbit(ulong_t *mapp, int wx)
+bt_gethighbit(const ulong_t *mapp, int wx)
{
ulong_t word;
@@ -115,7 +114,7 @@ bt_gethighbit(ulong_t *mapp, int wx)
* and one past the last bit (pos2) in the pattern.
*/
int
-bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos)
+bt_range(const ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos)
{
size_t pos;
@@ -169,7 +168,7 @@ odd_parity(ulong_t i)
* a -1 is returned.
*/
int
-bt_getlowbit(ulong_t *map, size_t start, size_t stop)
+bt_getlowbit(const ulong_t *map, size_t start, size_t stop)
{
ulong_t word;
int counter = start >> BT_ULSHIFT;
@@ -236,7 +235,7 @@ bt_getlowbit(ulong_t *map, size_t start, size_t stop)
* Copy the bitmap.
*/
void
-bt_copy(ulong_t *from, ulong_t *to, ulong_t size)
+bt_copy(const ulong_t *from, ulong_t *to, ulong_t size)
{
ulong_t i;
for (i = 0; i < size; i++)
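As a quick illustration of the (now const-correct) bitmap interfaces above, here is a minimal in-kernel sketch of allocating an identifier from a small bitmap; the map size and helper name are illustrative only, and a real consumer would serialize access to the map:

#include <sys/bitmap.h>

#define	EX_NIDS	128

static ulong_t ex_map[BT_BITOUL(EX_NIDS)];

/*
 * Find the first clear bit with bt_availbit() and claim it, returning the
 * allocated id or -1 if the map is full.
 */
static index_t
ex_alloc_id(void)
{
	index_t id = bt_availbit(ex_map, EX_NIDS);

	if (id != -1)
		BT_SET(ex_map, id);
	return (id);
}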
diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c
index 4f9b9f5985..7647302cfe 100644
--- a/usr/src/uts/common/os/cap_util.c
+++ b/usr/src/uts/common/os/cap_util.c
@@ -693,7 +693,7 @@ cu_cpc_program(cpu_t *cp, int *err)
*
* Context is marked with KCPC_CTX_INVALID_STOPPED when context is
* unprogrammed and may be marked with KCPC_CTX_INVALID when
- * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to
+ * kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to
* invalidate all CPC contexts before they take over all the counters.
*
* This isn't necessary since these flags are only used for thread bound
@@ -1258,7 +1258,7 @@ cu_cpu_fini(cpu_t *cp)
ctx = cpu_ctx->ctx_ptr_array[i];
if (ctx == NULL)
continue;
- kcpc_free(ctx, 0);
+ kcpc_free_cpu(ctx);
}
/*
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 75c3b000db..93f12d7b96 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -318,7 +318,9 @@ time_t boot_time = 0; /* Boot time in seconds since 1970 */
cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */
cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */
-extern void clock_tick_schedule(int);
+extern void clock_tick_schedule(int);
+extern void set_freemem(void);
+extern void pageout_deadman(void);
static int lgrp_ticks; /* counter to schedule lgrp load calcs */
@@ -400,7 +402,6 @@ clock(void)
uint_t w_io;
cpu_t *cp;
cpupart_t *cpupart;
- extern void set_freemem();
void (*funcp)();
int32_t ltemp;
int64_t lltemp;
@@ -477,6 +478,7 @@ clock(void)
if (one_sec) {
loadavg_update();
deadman_counter++;
+ pageout_deadman();
}
/*
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index 1280c8a1b6..27bc319ee6 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -93,7 +93,7 @@ clock_highres_fire(void *arg)
static int
clock_highres_timer_settime(itimer_t *it, int flags,
- const struct itimerspec *when)
+ const struct itimerspec *when)
{
cyclic_id_t cyc, *cycp = it->it_arg;
proc_t *p = curproc;
diff --git a/usr/src/uts/common/os/clock_process.c b/usr/src/uts/common/os/clock_process.c
new file mode 100644
index 0000000000..a3c1641c9c
--- /dev/null
+++ b/usr/src/uts/common/os/clock_process.c
@@ -0,0 +1,130 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+/*
+ * This clock backend implements basic support for the CLOCK_PROCESS_CPUTIME_ID
+ * clock. This clock is weakly defined by POSIX as "The identifier of the
+ * CPU-time clock associated with the process making a clock() or timer*()
+ * function call". We interpret that as including LMS_USER, LMS_SYSTEM, and
+ * LMS_TRAP microstates. This is similar to what we do in proc(5) for the
+ * lwpstatus_t and the prstatus_t.
+ *
+ * At this time, we only provide the ability to read the current time (e.g.
+ * through a call to clock_gettime(3C)). There is never a case where being able
+ * to set the time makes sense today and even if so, the privileges required for
+ * that are circumspect. Today, we do not support the ability to create interval
+ * timers based on this backend (e.g. timer_create(3C) and timer_settime(3C)).
+ * However, there is no reason that couldn't be added.
+ *
+ * To implement this, we leverage the existing microstate aggregation time that
+ * is done in /proc.
+ */
+
+#include <sys/timer.h>
+#include <sys/cyclic.h>
+#include <sys/msacct.h>
+
+static clock_backend_t clock_process;
+
+static int
+clock_process_settime(timespec_t *ts)
+{
+ return (EINVAL);
+}
+
+static int
+clock_process_gettime(timespec_t *ts)
+{
+ hrtime_t hrt;
+ proc_t *p = curproc;
+
+ /*
+ * mstate_aggr_state() automatically includes LMS_TRAP when we ask for
+ * LMS_SYSTEM below.
+ */
+ mutex_enter(&p->p_lock);
+ hrt = mstate_aggr_state(p, LMS_USER);
+ hrt += mstate_aggr_state(p, LMS_SYSTEM);
+ mutex_exit(&p->p_lock);
+
+ hrt2ts(hrt, ts);
+
+ return (0);
+}
+
+/*
+ * See the discussion in clock_thread_getres() for the why of using
+ * cyclic_getres() here.
+ */
+static int
+clock_process_getres(timespec_t *ts)
+{
+ hrt2ts(cyclic_getres(), (timestruc_t *)ts);
+
+ return (0);
+}
+
+static int
+clock_process_timer_create(itimer_t *it, void (*fire)(itimer_t *))
+{
+ return (EINVAL);
+}
+
+static int
+clock_process_timer_settime(itimer_t *it, int flags,
+ const struct itimerspec *when)
+{
+ return (EINVAL);
+}
+
+static int
+clock_process_timer_gettime(itimer_t *it, struct itimerspec *when)
+{
+ return (EINVAL);
+}
+
+static int
+clock_process_timer_delete(itimer_t *it)
+{
+ return (EINVAL);
+}
+
+static void
+clock_process_timer_lwpbind(itimer_t *it)
+{
+}
+
+void
+clock_process_init(void)
+{
+ /*
+ * While this clock backend doesn't support notifications right now, we
+ * still fill out the default for what it would be.
+ */
+ clock_process.clk_default.sigev_signo = SIGALRM;
+ clock_process.clk_default.sigev_notify = SIGEV_SIGNAL;
+ clock_process.clk_default.sigev_value.sival_ptr = NULL;
+
+ clock_process.clk_clock_settime = clock_process_settime;
+ clock_process.clk_clock_gettime = clock_process_gettime;
+ clock_process.clk_clock_getres = clock_process_getres;
+ clock_process.clk_timer_create = clock_process_timer_create;
+ clock_process.clk_timer_settime = clock_process_timer_settime;
+ clock_process.clk_timer_gettime = clock_process_timer_gettime;
+ clock_process.clk_timer_delete = clock_process_timer_delete;
+ clock_process.clk_timer_lwpbind = clock_process_timer_lwpbind;
+
+ clock_add_backend(CLOCK_PROCESS_CPUTIME_ID, &clock_process);
+}
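As an aside, this backend is exercised from userland purely through the standard clock interfaces; a minimal sketch, assuming nothing beyond clock_gettime(3C):

#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct timespec ts;

	/* Total LMS_USER + LMS_SYSTEM (and LMS_TRAP) time for this process. */
	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) != 0) {
		perror("clock_gettime");
		return (1);
	}
	(void) printf("process cpu time: %lld.%09lds\n",
	    (long long)ts.tv_sec, ts.tv_nsec);
	return (0);
}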
diff --git a/usr/src/uts/common/os/clock_thread.c b/usr/src/uts/common/os/clock_thread.c
new file mode 100644
index 0000000000..96dd36fa08
--- /dev/null
+++ b/usr/src/uts/common/os/clock_thread.c
@@ -0,0 +1,191 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+/*
+ * This clock backend implements basic support for the following two clocks:
+ *
+ * o CLOCK_VIRTUAL This provides the ability to read the amount of
+ * user CPU time that the calling thread has spent
+ * on CPU. This is the LMS_USER cpu microstate.
+ *
+ * o CLOCK_THREAD_CPUTIME_ID This clock is similar to the above; however, it
+ * also includes system time. This is the LMS_USER,
+ * LMS_SYSTEM, and LMS_TRAP microstates combined
+ * together. We include LMS_TRAP here because that
+ * is what you see in a thread's lwpstatus file.
+ *
+ * At this time, we only provide the ability to read the current time (e.g.
+ * through a call to clock_gettime(3C)). There is never a case where being able
+ * to set the time makes sense today and truthfully, lying about a process's
+ * runtime should be left to mdb -kw. Today, we do not support the ability to
+ * create interval timers based on this backend (e.g. timer_create(3C) and
+ * timer_settime(3C)). However, there is no reason that couldn't be added.
+ *
+ * A nice simplification here is that this clock is always about reading from
+ * the current thread. This means that one can always access it. Because the
+ * calling thread exists and is in this code, it means that we know it is here.
+ * Any other privilege information is left to the broader kernel.
+ *
+ * Because the only difference between these is the question of whether or not
+ * we include LMS_SYSTEM time in the value, we generally use the same actual
+ * clock backend functions except for the one that implements
+ * clk_clock_gettime().
+ */
+
+#include <sys/timer.h>
+#include <sys/cyclic.h>
+#include <sys/msacct.h>
+
+static clock_backend_t clock_thread_usr;
+static clock_backend_t clock_thread_usrsys;
+
+static int
+clock_thread_settime(timespec_t *ts)
+{
+ return (EINVAL);
+}
+
+static int
+clock_thread_usr_gettime(timespec_t *ts)
+{
+ hrtime_t hrt;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+
+ hrt = lwp->lwp_mstate.ms_acct[LMS_USER];
+ scalehrtime(&hrt);
+ hrt2ts(hrt, ts);
+
+ return (0);
+}
+
+static int
+clock_thread_usrsys_gettime(timespec_t *ts)
+{
+ hrtime_t hrt;
+ kthread_t *t = curthread;
+
+ /*
+ * mstate_thread_onproc_time() takes care of doing the following:
+ *
+ * o Combining LMS_USER, LMS_SYSTEM, and LMS_TRAP.
+ * o Ensuring that the result is scaled
+ * o Ensuring that the time that's elapsed to the point of our asking
+ * is included. By definition the kernel is executing in LMS_SYSTEM
+ * so this ensures that we add that time which isn't currently in the
+ * microstate to this.
+ */
+ thread_lock(t);
+ hrt = mstate_thread_onproc_time(t);
+ thread_unlock(t);
+
+ hrt2ts(hrt, ts);
+ return (0);
+}
+
+/*
+ * The question of the resolution here is a thorny one. Technically this would
+ * really be based upon the resolution of gethrtime_unscaled(), as we can
+ * actually tell that much due to our use of CPU microstate accounting. However,
+ * from a timer resolution perspective it's actually quite different and would
+ * in theory be based on the system tick rate.
+ *
+ * This basically leaves us with two options:
+ *
+ * 1) Use 'nsec_per_tick' to go down the Hz path.
+ * 2) Use the cyclic resolution, which basically is kind of the resolution of
+ * that timer.
+ *
+ * POSIX is unclear as to the effect of the resolution in the case of timer_*()
+ * functions and only really says it is used to impact the implementation of
+ * clock_settime() which of course isn't actually supported here. As a result,
+ * we opt to prefer the cyclic resolution, which is closer to the actual
+ * resolution of this subsystem. Strictly speaking, this might not be completely
+ * accurate, but should be on current platforms.
+ */
+static int
+clock_thread_getres(timespec_t *ts)
+{
+ hrt2ts(cyclic_getres(), (timestruc_t *)ts);
+
+ return (0);
+}
+
+static int
+clock_thread_timer_create(itimer_t *it, void (*fire)(itimer_t *))
+{
+ return (EINVAL);
+}
+
+static int
+clock_thread_timer_settime(itimer_t *it, int flags,
+ const struct itimerspec *when)
+{
+ return (EINVAL);
+}
+
+static int
+clock_thread_timer_gettime(itimer_t *it, struct itimerspec *when)
+{
+ return (EINVAL);
+}
+
+static int
+clock_thread_timer_delete(itimer_t *it)
+{
+ return (EINVAL);
+}
+
+static void
+clock_thread_timer_lwpbind(itimer_t *it)
+{
+}
+
+void
+clock_thread_init(void)
+{
+ /*
+ * While these clock backends don't support notifications right now, we
+ * still fill out the default for what it would be.
+ */
+ clock_thread_usr.clk_default.sigev_signo = SIGALRM;
+ clock_thread_usr.clk_default.sigev_notify = SIGEV_SIGNAL;
+ clock_thread_usr.clk_default.sigev_value.sival_ptr = NULL;
+
+ clock_thread_usr.clk_clock_settime = clock_thread_settime;
+ clock_thread_usr.clk_clock_gettime = clock_thread_usr_gettime;
+ clock_thread_usr.clk_clock_getres = clock_thread_getres;
+ clock_thread_usr.clk_timer_create = clock_thread_timer_create;
+ clock_thread_usr.clk_timer_settime = clock_thread_timer_settime;
+ clock_thread_usr.clk_timer_gettime = clock_thread_timer_gettime;
+ clock_thread_usr.clk_timer_delete = clock_thread_timer_delete;
+ clock_thread_usr.clk_timer_lwpbind = clock_thread_timer_lwpbind;
+
+ clock_thread_usrsys.clk_default.sigev_signo = SIGALRM;
+ clock_thread_usrsys.clk_default.sigev_notify = SIGEV_SIGNAL;
+ clock_thread_usrsys.clk_default.sigev_value.sival_ptr = NULL;
+
+ clock_thread_usrsys.clk_clock_settime = clock_thread_settime;
+ clock_thread_usrsys.clk_clock_gettime = clock_thread_usrsys_gettime;
+ clock_thread_usrsys.clk_clock_getres = clock_thread_getres;
+ clock_thread_usrsys.clk_timer_create = clock_thread_timer_create;
+ clock_thread_usrsys.clk_timer_settime = clock_thread_timer_settime;
+ clock_thread_usrsys.clk_timer_gettime = clock_thread_timer_gettime;
+ clock_thread_usrsys.clk_timer_delete = clock_thread_timer_delete;
+ clock_thread_usrsys.clk_timer_lwpbind = clock_thread_timer_lwpbind;
+
+ clock_add_backend(CLOCK_VIRTUAL, &clock_thread_usr);
+ clock_add_backend(CLOCK_THREAD_CPUTIME_ID, &clock_thread_usrsys);
+}
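Similarly, the difference between the two thread clocks registered here can be observed from userland; a minimal sketch, again assuming only clock_gettime(3C):

#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct timespec usr, usrsys;

	/*
	 * CLOCK_VIRTUAL is LMS_USER only; CLOCK_THREAD_CPUTIME_ID adds
	 * system and trap time for the calling thread.
	 */
	if (clock_gettime(CLOCK_VIRTUAL, &usr) != 0 ||
	    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &usrsys) != 0) {
		perror("clock_gettime");
		return (1);
	}
	(void) printf("usr      %lld.%09ld\nusr+sys  %lld.%09ld\n",
	    (long long)usr.tv_sec, usr.tv_nsec,
	    (long long)usrsys.tv_sec, usrsys.tv_nsec);
	return (0);
}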
diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c
index e53c75b64e..6a86dbb8cb 100644
--- a/usr/src/uts/common/os/cpu.c
+++ b/usr/src/uts/common/os/cpu.c
@@ -22,6 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
/*
@@ -60,7 +61,7 @@
#include <sys/archsystm.h>
#include <sys/sdt.h>
#include <sys/smt.h>
-#if defined(__x86) || defined(__amd64)
+#if defined(__x86)
#include <sys/x86_archext.h>
#endif
#include <sys/callo.h>
@@ -613,7 +614,7 @@ again:
* requests will continue to be satisfied in the same way,
* even if weak bindings have recommenced.
*/
- if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) {
+ if (t->t_nomigrate < 0 || (weakbindingbarrier && t->t_nomigrate == 0)) {
--t->t_nomigrate;
thread_unlock(curthread);
return; /* with kpreempt_disable still active */
@@ -2909,7 +2910,7 @@ cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu)
}
void
-cpuset_or(cpuset_t *dst, cpuset_t *src)
+cpuset_or(cpuset_t *dst, const cpuset_t *src)
{
for (int i = 0; i < CPUSET_WORDS; i++) {
dst->cpub[i] |= src->cpub[i];
@@ -2917,7 +2918,7 @@ cpuset_or(cpuset_t *dst, cpuset_t *src)
}
void
-cpuset_xor(cpuset_t *dst, cpuset_t *src)
+cpuset_xor(cpuset_t *dst, const cpuset_t *src)
{
for (int i = 0; i < CPUSET_WORDS; i++) {
dst->cpub[i] ^= src->cpub[i];
@@ -2925,7 +2926,7 @@ cpuset_xor(cpuset_t *dst, cpuset_t *src)
}
void
-cpuset_and(cpuset_t *dst, cpuset_t *src)
+cpuset_and(cpuset_t *dst, const cpuset_t *src)
{
for (int i = 0; i < CPUSET_WORDS; i++) {
dst->cpub[i] &= src->cpub[i];
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 0bd6cfd44f..5e909667de 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -20,13 +20,14 @@
*/
/*
* Copyright (c) 2013, Ira Cooper. All rights reserved.
+ * Copyright 2020 Nexenta by DDN, Inc. All rights reserved.
*/
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
@@ -288,7 +289,7 @@ crget(void)
{
cred_t *cr = kmem_cache_alloc(cred_cache, KM_SLEEP);
- bcopy(kcred, cr, crsize);
+ bcopy(zone_kcred(), cr, crsize);
cr->cr_ref = 1;
zone_cred_hold(cr->cr_zone);
if (cr->cr_label)
@@ -377,7 +378,7 @@ crfree(cred_t *cr)
/*
* Copy a cred structure to a new one and free the old one.
* The new cred will have two references. One for the calling process,
- * and one for the thread.
+ * and one for the thread.
*/
cred_t *
crcopy(cred_t *cr)
@@ -404,7 +405,7 @@ crcopy(cred_t *cr)
/*
* Copy a cred structure to a new one and free the old one.
* The new cred will have two references. One for the calling process,
- * and one for the thread.
+ * and one for the thread.
* This variation on crcopy uses a pre-allocated structure for the
* "new" cred.
*/
diff --git a/usr/src/uts/common/os/dacf.c b/usr/src/uts/common/os/dacf.c
index 8d4cd486d8..592b1cd570 100644
--- a/usr/src/uts/common/os/dacf.c
+++ b/usr/src/uts/common/os/dacf.c
@@ -642,7 +642,7 @@ dacf_arglist_delete(dacf_arg_t **list)
* Match a device-spec to a rule.
*/
dacf_rule_t *
-dacf_match(dacf_opid_t op, dacf_devspec_t ds, void *match_info)
+dacf_match(dacf_opid_t op, dacf_devspec_t ds, const void *match_info)
{
dacf_rule_t *rule;
diff --git a/usr/src/uts/common/os/dacf_clnt.c b/usr/src/uts/common/os/dacf_clnt.c
index e40509d33b..fdb1696fb2 100644
--- a/usr/src/uts/common/os/dacf_clnt.c
+++ b/usr/src/uts/common/os/dacf_clnt.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DACF (Device Autoconfiguration Framework) client code.
*
@@ -67,8 +65,8 @@
* for the operation to be invoked at post-attach and/or pre-detach time.
*/
void
-dacfc_match_create_minor(char *name, char *node_type, dev_info_t *dip,
- struct ddi_minor_data *dmdp, int flag)
+dacfc_match_create_minor(const char *name, const char *node_type,
+ dev_info_t *dip, struct ddi_minor_data *dmdp, int flag)
{
dacf_rule_t *r;
char *dev_path, *dev_pathp, *drv_mname = NULL;
diff --git a/usr/src/uts/common/os/ddi.c b/usr/src/uts/common/os/ddi.c
index a37d91e92a..c348ee474c 100644
--- a/usr/src/uts/common/os/ddi.c
+++ b/usr/src/uts/common/os/ddi.c
@@ -1136,8 +1136,8 @@ qunbufcall(queue_t *q, bufcall_id_t id)
* Associate the stream with an instance of the bottom driver. This
* function is called by APIs that establish or modify the hardware
* association (ppa) of an open stream. Two examples of such
- * post-open(9E) APIs are the dlpi(7p) DL_ATTACH_REQ message, and the
- * ndd(1M) "instance=" ioctl(2). This interface may be called from a
+ * post-open(9E) APIs are the dlpi(4P) DL_ATTACH_REQ message, and the
+ * ndd(8) "instance=" ioctl(2). This interface may be called from a
* stream driver's wput procedure and from within syncq perimeters,
* so it can't block.
*
diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c
index 38e575dbfd..8f0890fc2b 100644
--- a/usr/src/uts/common/os/ddi_hp_impl.c
+++ b/usr/src/uts/common/os/ddi_hp_impl.c
@@ -92,8 +92,8 @@
* - Through the nexus driver interface, ndi_hp_state_change_req. PCIe
* nexus drivers that pass a hotplug interrupt through to pciehpc will kick
* off state changes in this way.
- * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and
- * hotplug(1M) pass state change requests through hotplugd, which uses
+ * - Through coordinated removal, ddihp_modctl. Both cfgadm(8) and
+ * hotplug(8) pass state change requests through hotplugd, which uses
* modctl to request state changes to the DDI hotplug framework. That
* interface is ultimately implemented by ddihp_modctl.
*
@@ -131,7 +131,7 @@
* of some key components are below.
*
* +------------+
- * | cfgadm(1M) |
+ * | cfgadm(8) |
* +------------+
* |
* +-------------------+
@@ -139,7 +139,7 @@
* +-------------------+
* |
* +-------------+ +------------+
- * | hotplug(1M) |----------| libhotplug |
+ * | hotplug(8) |----------| libhotplug |
* +-------------+ +------------+
* |
* +----------+
@@ -193,14 +193,14 @@
*
* KEY HOTPLUG SOFTWARE COMPONENTS
*
- * CFGADM(1M)
+ * cfgadm(8)
*
* cfgadm is the canonical tool for hotplug operations. It can be used to
* list connections on the system and change their state in a coordinated
* fashion. For more information, see its manual page.
*
*
- * HOTPLUG(1M)
+ * hotplug(8)
*
* hotplug is a command line tool for managing hotplug connections for
* connectors. For more information, see its manual page.
diff --git a/usr/src/uts/common/os/ddi_intr_impl.c b/usr/src/uts/common/os/ddi_intr_impl.c
index 215be73722..22f4548607 100644
--- a/usr/src/uts/common/os/ddi_intr_impl.c
+++ b/usr/src/uts/common/os/ddi_intr_impl.c
@@ -35,7 +35,7 @@
#include <sys/sunndi.h>
#include <sys/ndi_impldefs.h> /* include prototypes */
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
/*
* MSI-X allocation limit.
*/
@@ -294,7 +294,7 @@ i_ddi_intr_get_limit(dev_info_t *dip, int type, ddi_irm_pool_t *pool_p)
limit = MIN(limit, nintrs);
/* Impose a global MSI-X limit on x86 */
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
if (type == DDI_INTR_TYPE_MSIX)
limit = MIN(limit, ddi_msix_alloc_limit);
#endif
@@ -539,7 +539,7 @@ set_intr_affinity(ddi_intr_handle_t h, processorid_t tgt)
return (ret);
}
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
ddi_acc_handle_t
i_ddi_get_pci_config_handle(dev_info_t *dip)
{
diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c
index a4b35dcb5b..2433c504fc 100644
--- a/usr/src/uts/common/os/ddi_intr_irm.c
+++ b/usr/src/uts/common/os/ddi_intr_irm.c
@@ -34,7 +34,7 @@
#include <sys/sunndi.h>
#include <sys/ndi_impldefs.h> /* include prototypes */
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
/*
* MSI-X allocation limit.
*/
@@ -767,7 +767,7 @@ i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag)
/* Determine new request size */
nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz);
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
/* Use the default static limit for non-IRM drivers */
if (req_p->ireq_type == DDI_INTR_TYPE_MSIX)
nreq = MIN(nreq, ddi_msix_alloc_limit);
diff --git a/usr/src/uts/common/os/ddi_ufm.c b/usr/src/uts/common/os/ddi_ufm.c
index ffb04eddec..940ebf82bf 100644
--- a/usr/src/uts/common/os/ddi_ufm.c
+++ b/usr/src/uts/common/os/ddi_ufm.c
@@ -11,6 +11,7 @@
/*
* Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
*/
#include <sys/avl.h>
@@ -20,13 +21,16 @@
#include <sys/kmem.h>
#include <sys/sunddi.h>
#include <sys/stddef.h>
+#include <sys/sunndi.h>
+#include <sys/file.h>
+#include <sys/sysmacros.h>
/*
* The UFM subsystem tracks its internal state with respect to device
* drivers that participate in the DDI UFM subsystem on a per-instance basis
* via ddi_ufm_handle_t structures (see ddi_ufm_impl.h). This is known as the
* UFM handle. The UFM handle contains a pointer to the driver's UFM ops,
- * which the ufm(7D) pseudo driver uses to invoke the UFM entry points in
+ * which the ufm(4D) pseudo driver uses to invoke the UFM entry points in
* response to DDI UFM ioctls. Additionally, the DDI UFM subsystem uses the
* handle to maintain cached UFM image and slot data.
*
@@ -65,6 +69,12 @@
* These tests should be run whenever changes are made to the DDI UFM
* subsystem or the ufm driver.
*/
+
+/*
+ * Amount of data to read in one go (1 MiB).
+ */
+#define UFM_READ_STRIDE (1024 * 1024)
+
static avl_tree_t ufm_handles;
static kmutex_t ufm_lock;
@@ -171,7 +181,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh)
*/
ufmh->ufmh_images =
kmem_zalloc((sizeof (ddi_ufm_image_t) * ufmh->ufmh_nimages),
- KM_NOSLEEP | KM_NORMALPRI);
+ KM_NOSLEEP_LAZY);
if (ufmh->ufmh_images == NULL)
return (ENOMEM);
@@ -191,7 +201,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh)
img->ufmi_slots =
kmem_zalloc((sizeof (ddi_ufm_slot_t) * img->ufmi_nslots),
- KM_NOSLEEP | KM_NORMALPRI);
+ KM_NOSLEEP_LAZY);
if (img->ufmi_slots == NULL) {
ret = ENOMEM;
goto cache_fail;
@@ -234,6 +244,12 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh)
if (slot->ufms_attrs & DDI_UFM_ATTR_EMPTY)
continue;
+ if (slot->ufms_imgsize != 0) {
+ fnvlist_add_uint64(slots[s],
+ DDI_UFM_NV_SLOT_IMGSIZE,
+ slot->ufms_imgsize);
+ }
+
fnvlist_add_string(slots[s], DDI_UFM_NV_SLOT_VERSION,
slot->ufms_version);
if (slot->ufms_misc != NULL) {
@@ -257,6 +273,56 @@ cache_fail:
return (ret);
}
+int
+ufm_read_img(ddi_ufm_handle_t *ufmh, uint_t img, uint_t slot, uint64_t len,
+ uint64_t off, uintptr_t uaddr, uint64_t *nreadp, int copyflags)
+{
+ int ret = 0;
+ ddi_ufm_cap_t caps;
+ void *buf;
+ uint64_t nread;
+
+ ret = ufmh->ufmh_ops->ddi_ufm_op_getcaps(ufmh, ufmh->ufmh_arg, &caps);
+ if (ret != 0) {
+ return (ret);
+ }
+
+ if ((caps & DDI_UFM_CAP_READIMG) == 0 ||
+ ufmh->ufmh_ops->ddi_ufm_op_readimg == NULL) {
+ return (ENOTSUP);
+ }
+
+ if (off + len < MAX(off, len)) {
+ return (EOVERFLOW);
+ }
+
+ buf = kmem_zalloc(UFM_READ_STRIDE, KM_SLEEP);
+ nread = 0;
+ while (len > 0) {
+ uint64_t toread = MIN(len, UFM_READ_STRIDE);
+ uint64_t iter;
+
+ ret = ufmh->ufmh_ops->ddi_ufm_op_readimg(ufmh, ufmh->ufmh_arg,
+ img, slot, toread, off + nread, buf, &iter);
+ if (ret != 0) {
+ break;
+ }
+
+ if (ddi_copyout(buf, (void *)(uintptr_t)(uaddr + nread), iter,
+ copyflags & FKIOCTL) != 0) {
+ ret = EFAULT;
+ break;
+ }
+
+ nread += iter;
+ len -= iter;
+ }
+
+ *nreadp = nread;
+ kmem_free(buf, UFM_READ_STRIDE);
+ return (ret);
+}
+
/*
* This gets called early in boot by setup_ddi().
*/
@@ -375,6 +441,12 @@ ddi_ufm_init(dev_info_t *dip, uint_t version, ddi_ufm_ops_t *ufmops,
mutex_exit(&old_ufmh->ufmh_lock);
}
+ /*
+ * Give a hint in the devinfo tree that this device supports UFM
+ * capabilities.
+ */
+ (void) ndi_prop_create_boolean(DDI_DEV_T_NONE, dip, "ddi-ufm-capable");
+
return (DDI_SUCCESS);
}
@@ -453,3 +525,10 @@ ddi_ufm_slot_set_misc(ddi_ufm_slot_t *usp, nvlist_t *misc)
nvlist_free(usp->ufms_misc);
usp->ufms_misc = misc;
}
+
+void
+ddi_ufm_slot_set_imgsize(ddi_ufm_slot_t *usp, uint64_t size)
+{
+ VERIFY3P(usp, !=, NULL);
+ usp->ufms_imgsize = size;
+}
diff --git a/usr/src/uts/common/os/ddifm.c b/usr/src/uts/common/os/ddifm.c
index 533fa15aed..dc39ba49ab 100644
--- a/usr/src/uts/common/os/ddifm.c
+++ b/usr/src/uts/common/os/ddifm.c
@@ -56,7 +56,7 @@
*
* Error reports resulting from hardware component specific and common IO
* fault and driver defects must be accompanied by an Eversholt fault
- * tree (.eft) by the Solaris fault manager (fmd(1M)) for
+ * tree (.eft) by the Solaris fault manager (fmd(8)) for
* diagnosis.
*
* DDI_FM_ERRCB_CAPABLE
@@ -466,7 +466,7 @@ out: if (ereport && (nva == NULL))
/*
* Generate an error report for consumption by the Solaris Fault Manager,
- * fmd(1M). Valid ereport classes are defined in /usr/include/sys/fm/io.
+ * fmd(8). Valid ereport classes are defined in /usr/include/sys/fm/io.
*
* The ENA should be set if this error is a result of an error status
* returned from ddi_dma_err_check() or ddi_acc_err_check(). Otherwise,
diff --git a/usr/src/uts/common/os/devcfg.c b/usr/src/uts/common/os/devcfg.c
index cbcc4db3d8..d61525be9c 100644
--- a/usr/src/uts/common/os/devcfg.c
+++ b/usr/src/uts/common/os/devcfg.c
@@ -24,6 +24,7 @@
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
*/
#include <sys/note.h>
@@ -62,6 +63,7 @@
#include <sys/varargs.h>
#include <sys/modhash.h>
#include <sys/instance.h>
+#include <sys/sysevent/eventdefs.h>
#if defined(__amd64) && !defined(__xpv)
#include <sys/iommulib.h>
@@ -253,7 +255,7 @@ i_ddi_node_cache_init()
* The allocated node has a reference count of 0.
*/
dev_info_t *
-i_ddi_alloc_node(dev_info_t *pdip, char *node_name, pnode_t nodeid,
+i_ddi_alloc_node(dev_info_t *pdip, const char *node_name, pnode_t nodeid,
int instance, ddi_prop_t *sys_prop, int flag)
{
struct dev_info *devi;
@@ -395,6 +397,9 @@ sid: devi->devi_node_attributes |= DDI_PERSISTENT;
devi->devi_ct_count = -1; /* counter not in use if -1 */
list_create(&(devi->devi_ct), sizeof (cont_device_t),
offsetof(cont_device_t, cond_next));
+ list_create(&devi->devi_unbind_cbs, sizeof (ddi_unbind_callback_t),
+ offsetof(ddi_unbind_callback_t, ddiub_next));
+ mutex_init(&devi->devi_unbind_lock, NULL, MUTEX_DEFAULT, NULL);
i_ddi_set_node_state((dev_info_t *)devi, DS_PROTO);
da_log_enter((dev_info_t *)devi);
@@ -491,6 +496,9 @@ i_ddi_free_node(dev_info_t *dip)
if (devi->devi_ev_path)
kmem_free(devi->devi_ev_path, MAXPATHLEN);
+ mutex_destroy(&devi->devi_unbind_lock);
+ list_destroy(&devi->devi_unbind_cbs);
+
kmem_cache_free(ddi_node_cache, devi);
}
@@ -828,6 +836,7 @@ bind_node(dev_info_t *dip)
static int
unbind_node(dev_info_t *dip)
{
+ ddi_unbind_callback_t *cb;
ASSERT(DEVI(dip)->devi_node_state == DS_BOUND);
ASSERT(DEVI(dip)->devi_major != DDI_MAJOR_T_NONE);
@@ -842,6 +851,11 @@ unbind_node(dev_info_t *dip)
DEVI(dip)->devi_major = DDI_MAJOR_T_NONE;
DEVI(dip)->devi_binding_name = DEVI(dip)->devi_node_name;
+
+ while ((cb = list_remove_head(&DEVI(dip)->devi_unbind_cbs)) != NULL) {
+ cb->ddiub_cb(cb->ddiub_arg, dip);
+ }
+
return (DDI_SUCCESS);
}
@@ -1486,12 +1500,12 @@ postattach_node(dev_info_t *dip)
/*
* Plumbing during postattach may fail because of the
* underlying device is not ready. This will fail ndi_devi_config()
- * in dv_filldir() and a warning message is issued. The message
- * from here will explain what happened
+ * in dv_filldir().
*/
if (rval != DACF_SUCCESS) {
- cmn_err(CE_WARN, "Postattach failed for %s%d\n",
- ddi_driver_name(dip), ddi_get_instance(dip));
+ NDI_CONFIG_DEBUG((CE_CONT, "postattach_node: %s%d (%p) "
+ "postattach failed\n", ddi_driver_name(dip),
+ ddi_get_instance(dip), (void *)dip));
return (DDI_FAILURE);
}
@@ -2044,7 +2058,7 @@ ndi_devi_tryenter(dev_info_t *dip, int *circular)
* not allowed to sleep.
*/
int
-ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid,
+ndi_devi_alloc(dev_info_t *parent, const char *node_name, pnode_t nodeid,
dev_info_t **ret_dip)
{
ASSERT(node_name != NULL);
@@ -2064,7 +2078,7 @@ ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid,
* This routine may sleep and should not be called at interrupt time
*/
void
-ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, pnode_t nodeid,
+ndi_devi_alloc_sleep(dev_info_t *parent, const char *node_name, pnode_t nodeid,
dev_info_t **ret_dip)
{
ASSERT(node_name != NULL);
@@ -2534,7 +2548,7 @@ i_ddi_get_exported_classes(dev_info_t *dip, char ***classes)
* Helper functions, returns NULL if no memory.
*/
char *
-i_ddi_strdup(char *str, uint_t flag)
+i_ddi_strdup(const char *str, uint_t flag)
{
char *copy;
@@ -3560,7 +3574,6 @@ walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg,
* They include, but not limited to, _init(9e), _fini(9e), probe(9e),
* attach(9e), and detach(9e).
*/
-
void
ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg)
{
@@ -3580,7 +3593,6 @@ ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg)
*
* N.B. The same restrictions from ddi_walk_devs() apply.
*/
-
void
e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg)
{
@@ -3609,6 +3621,91 @@ e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg)
UNLOCK_DEV_OPS(&dnp->dn_lock);
}
+struct preroot_walk_block_devices_arg {
+ int (*prwb_func)(const char *, void *);
+ void *prwb_arg;
+};
+
+static int
+preroot_walk_block_devices_walker(dev_info_t *dip, void *arg)
+{
+ struct preroot_walk_block_devices_arg *prwb = arg;
+
+ if (i_ddi_devi_class(dip) == NULL ||
+ strcmp(i_ddi_devi_class(dip), ESC_DISK) != 0) {
+ /*
+ * We do not think that this is a disk.
+ */
+ return (DDI_WALK_CONTINUE);
+ }
+
+ for (struct ddi_minor_data *md = DEVI(dip)->devi_minor; md != NULL;
+ md = md->next) {
+ if (md->ddm_spec_type != S_IFBLK) {
+ /*
+ * We don't want the raw version of any block device.
+ */
+ continue;
+ }
+
+ /*
+ * The node type taxonomy is hierarchical, with each level
+ * separated by colons. Nodes of interest are either of the
+ * BLOCK type, or are prefixed with that type.
+ */
+ if (strcmp(md->ddm_node_type, DDI_NT_BLOCK) != 0 &&
+ strncmp(md->ddm_node_type, DDI_NT_BLOCK ":",
+ strlen(DDI_NT_BLOCK ":")) != 0) {
+ /*
+ * This minor node does not represent a block device.
+ */
+ continue;
+ }
+
+ char buf[MAXPATHLEN];
+ int r;
+ if ((r = prwb->prwb_func(ddi_pathname_minor(md, buf),
+ prwb->prwb_arg)) == PREROOT_WALK_BLOCK_DEVICES_CANCEL) {
+ /*
+ * The consumer does not need any more minor nodes.
+ */
+ return (DDI_WALK_TERMINATE);
+ }
+ VERIFY3S(r, ==, PREROOT_WALK_BLOCK_DEVICES_NEXT);
+ }
+
+ return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * Private routine for ZFS when it needs to attach and scan all of the block
+ * device minors in the system while looking for vdev labels.
+ *
+ * The callback function accepts a physical device path and the context
+ * argument (arg) passed to this function; it should return
+ * PREROOT_WALK_BLOCK_DEVICES_NEXT when more devices are required and
+ * PREROOT_WALK_BLOCK_DEVICES_CANCEL to stop the walk.
+ */
+void
+preroot_walk_block_devices(int (*callback)(const char *, void *), void *arg)
+{
+ /*
+ * First, force everything which can attach to do so. The device class
+ * is not derived until at least one minor node is created, so we
+ * cannot walk the device tree looking for a device class of ESC_DISK
+ * until everything is attached.
+ */
+ (void) ndi_devi_config(ddi_root_node(), NDI_CONFIG | NDI_DEVI_PERSIST |
+ NDI_NO_EVENT | NDI_DRV_CONF_REPROBE);
+
+ struct preroot_walk_block_devices_arg prwb;
+ prwb.prwb_func = callback;
+ prwb.prwb_arg = arg;
+
+ ddi_walk_devs(ddi_root_node(), preroot_walk_block_devices_walker,
+ &prwb);
+}
+
/*
* argument to i_find_devi, a devinfo node search callback function.
*/
@@ -3823,8 +3920,8 @@ ddi_is_pci_dip(dev_info_t *dip)
* to ioc's bus_config entry point.
*/
int
-resolve_pathname(char *pathname,
- dev_info_t **dipp, dev_t *devtp, int *spectypep)
+resolve_pathname(char *pathname, dev_info_t **dipp, dev_t *devtp,
+ int *spectypep)
{
int error;
dev_info_t *parent, *child;
@@ -9055,7 +9152,7 @@ out:
char *
ddi_curr_redirect(char *curr)
{
- char *alias;
+ char *alias;
int i;
if (ddi_aliases_present == B_FALSE)
@@ -9196,3 +9293,13 @@ ddi_mem_update(uint64_t addr, uint64_t size)
;
#endif
}
+
+void
+e_ddi_register_unbind_callback(dev_info_t *dip, ddi_unbind_callback_t *cb)
+{
+ struct dev_info *devi = DEVI(dip);
+
+ mutex_enter(&devi->devi_unbind_lock);
+ list_insert_tail(&devi->devi_unbind_cbs, cb);
+ mutex_exit(&devi->devi_unbind_lock);
+}
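To make the new walker's contract concrete, here is a hypothetical consumer sketch (the example names are illustrative, not part of this change): the callback receives each block-device minor path and decides whether the walk continues.

/*
 * Illustrative callback for preroot_walk_block_devices(): log each
 * block-device minor path and keep walking.
 */
static int
example_blkdev_cb(const char *minor_path, void *arg)
{
	uint_t *count = arg;

	cmn_err(CE_CONT, "?candidate block device: %s\n", minor_path);
	(*count)++;

	/* Return PREROOT_WALK_BLOCK_DEVICES_CANCEL to stop early. */
	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
}

static void
example_blkdev_scan(void)
{
	uint_t count = 0;

	preroot_walk_block_devices(example_blkdev_cb, &count);
	cmn_err(CE_CONT, "?found %u block device minors\n", count);
}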
diff --git a/usr/src/uts/common/os/devid_cache.c b/usr/src/uts/common/os/devid_cache.c
index 3e1a06a844..2a780eebe2 100644
--- a/usr/src/uts/common/os/devid_cache.c
+++ b/usr/src/uts/common/os/devid_cache.c
@@ -47,7 +47,7 @@
* involves walking the entire device tree attaching all possible disk
* instances, to search for the device referenced by a devid. Obviously,
* full device discovery is something to be avoided where possible.
- * Note that simply invoking devfsadm(1M) is equivalent to running full
+ * Note that simply invoking devfsadm(8) is equivalent to running full
* discovery at the devid cache level.
*
* Reasons why a disk may not be accessible:
@@ -61,7 +61,7 @@
* When discovery may succeed:
* Discovery will result in success when a device has been moved
* to a different address. Note that it's recommended that
- * devfsadm(1M) be invoked (no arguments required) whenever a system's
+ * devfsadm(8) be invoked (no arguments required) whenever a system's
* h/w configuration has been updated. Alternatively, a
* reconfiguration boot can be used to accomplish the same result.
*
@@ -69,7 +69,7 @@
* failure for a device which was powered off. Assuming the cache has an
* entry for such a device, simply powering it on should permit the system
* to access it. If problems persist after powering it on, invoke
- * devfsadm(1M).
+ * devfsadm(8).
*
* Discovery prior to mounting root is only of interest when booting
* from a filesystem which accesses devices by device id, which of
diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c
index 85470f7e28..4bf1f54ca4 100644
--- a/usr/src/uts/common/os/dkioc_free_util.c
+++ b/usr/src/uts/common/os/dkioc_free_util.c
@@ -10,7 +10,8 @@
*/
/*
- * Copyright 2017 Nexenta Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
*/
/* needed when building libzpool */
@@ -25,6 +26,13 @@
#include <sys/file.h>
#include <sys/sdt.h>
+static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
+ uint64_t len_blk);
+static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
+ uint64_t, dfl_iter_fn_t, void *, int);
+static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
+ dfl_iter_fn_t, void *, int);
+
/*
* Copy-in convenience function for variable-length dkioc_free_list_t
* structures. The pointer to be copied from is in `arg' (may be a pointer
@@ -78,3 +86,435 @@ dfl_free(dkioc_free_list_t *dfl)
{
kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
}
+
+/*
+ * Convenience function to resize and segment the array of extents in
+ * a DKIOCFREE request as required by a driver.
+ *
+ * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
+ * on either the number of extents that can be submitted in a single request,
+ * or the total number of blocks that can be submitted in a single request.
+ * In addition, devices may have alignment requirements on the starting
+ * address stricter than the device block size.
+ *
+ * Since there is currently no mechanism for callers of DKIOCFREE to discover
+ * such restrictions, instead of rejecting any requests that do not conform to
+ * some undiscoverable (to the caller) set of requirements, a driver can use
+ * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
+ * required to conform to its requirements.
+ *
+ * The original request is passed as 'dfl' and the alignment requirements
+ * are passed in 'dfi'. Additionally the maximum offset of the device allowed
+ * (in bytes) is passed as max_off -- this allows a driver with
+ * multiple instances of different sizes but similar requirements (e.g.
+ * a partitioned blkdev device) to not construct a separate dkioc_free_info_t
+ * struct for each device.
+ *
+ * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
+ * arg passed to it as needed. If the extents in the dkioc_free_list_t passed
+ * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
+ * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
+ * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
+ * instances and populate them with the adjusted extents that do conform to the
+ * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
+ * passed to it when this occurs. The net result is that 'func' can always
+ * assume it will be called with a dkioc_free_list_t with extents that
+ * comply with the requirements in 'dfi'. 'func' is also responsible for
+ * freeing the dkioc_free_list_t passed to it (likely via a completion
+ * callback).
+ *
+ * Combined with the behavior described above, dfl_iter() can be viewed as
+ * consuming the dkioc_free_list_t passed to it. Either it will pass it along
+ * to 'func' (and let 'func' handle freeing it), or it will free it and
+ * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
+ * letting 'func' handle freeing the new instances). This way neither the
+ * dfl_iter() caller nor the driver need to worry about treating
+ * conforming and non-conforming requests differently.
+ *
+ * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
+ * any notion of partial completion -- either it returns success (0) or
+ * an error. It's not clear if such a notion would even be possible while
+ * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
+ * interface. As such, there's little benefit to providing more detailed error
+ * semantics beyond what DKIOCFREE can handle.
+ *
+ * Due to this, a somewhat simplistic approach is taken to error handling. The
+ * original list of extents is first checked to make sure they all appear
+ * valid -- that is they do not start or extend beyond the end of the device.
+ * Any request that contains such extents is always rejected in it's entirety.
+ * It is possible after applying any needed adjustments to the original list
+ * of extents that the result is not acceptable to the driver. For example,
+ * a device with a 512 byte block size that tries to free the range 513-1023
+ * (bytes) would not be able to be processed. Such extents will be silently
+ * ignored. If the original request consists of nothing but such requests,
+ * dfl_iter() will never call 'func' and will merely return 0.
+ */
+int
+dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
+ dfl_iter_fn_t func, void *arg, int kmflag)
+{
+ dkioc_free_list_ext_t *ext;
+ uint64_t n_bytes, n_segs, start_idx, i;
+ uint_t bsize = 1U << dfi->dfi_bshift;
+ int r = 0;
+ boolean_t need_copy = B_FALSE;
+
+ /*
+ * Make sure the block size derived from dfi_bshift is at least 512
+ * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is
+ * to prevent any problems with other parts of the system that might
+ * assume a minimum block size of 512, and the upper bound is just
+ * to prevent overflow when creating the block size from dfi_bshift
+ * (though it seems unlikely we'll have _block_ sizes near a GiB
+ * any time soon).
+ */
+ if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
+
+ /* Max bytes must be a multiple of the block size */
+ if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
+
+ /* Start offset alignment must also be a multiple of the block size */
+ if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
+
+ /* Max bytes in an extent must be a multiple of the block size */
+ if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
+
+ /*
+ * It makes no sense to allow a single extent to be larger than the
+ * total allowed for an entire request.
+ */
+ if (dfi->dfi_max_ext_bytes > 0 &&
+ dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
+ r = SET_ERROR(EINVAL);
+ goto done;
+ }
+
+ /*
+ * The first pass, align everything as needed and make sure all the
+ * extents look valid.
+ */
+ if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
+ goto done;
+ }
+
+ /*
+ * Go through and split things up as needed. The general idea is to
+ * split along the original extent boundaries when needed. We only
+ * split an extent from the original request into multiple extents
+ * if the original extent is by itself too big for the device to
+ * process in a single request.
+ */
+ start_idx = 0;
+ n_bytes = n_segs = 0;
+ ext = dfl->dfl_exts;
+ for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
+ uint64_t start = dfl->dfl_offset + ext->dfle_start;
+ uint64_t len = ext->dfle_length;
+
+ if (len == 0) {
+ /*
+ * If we encounter a zero length extent, we're going
+ * to create a new copy of dfl no matter what --
+ * the size of dfl is determined by dfl_num_exts so
+ * we cannot do things like shift the contents and
+ * reduce dfl_num_exts to get a contiguous array
+ * of non-zero length extents.
+ */
+ need_copy = B_TRUE;
+ continue;
+ }
+
+ if (dfi->dfi_max_ext_bytes > 0 &&
+ len > dfi->dfi_max_ext_bytes) {
+ /*
+ * An extent that's too large. Dispatch what we've
+ * accumulated, and then split this extent into
+ * smaller ones the device can accept.
+ */
+ if ((r = process_range(dfl, start_idx, i - start_idx,
+ func, arg, kmflag)) != 0) {
+ goto done;
+ }
+
+ if ((r = split_extent(dfl, dfi, i, func, arg,
+ kmflag)) != 0) {
+ goto done;
+ }
+ start_idx = i + 1;
+ n_segs = 0;
+ n_bytes = 0;
+ continue;
+ }
+
+ if (dfi->dfi_max_bytes > 0 &&
+ n_bytes + len > dfi->dfi_max_bytes) {
+ /*
+ * This extent would put us over the limit for total
+ * bytes that can be trimmed in one request.
+ * Dispatch what we've accumulated. Then deal
+ * with this extent.
+ */
+ if ((r = process_range(dfl, start_idx, i - start_idx,
+ func, arg, kmflag)) != 0) {
+ goto done;
+ }
+
+ if (len < dfi->dfi_max_bytes) {
+ /*
+ * After dispatching what we've accumulated,
+ * this extent can fit in a new request
+ * Just add it to the accumulated list of
+ * extents and move on.
+ */
+ start_idx = i;
+ n_segs = 1;
+ n_bytes = len;
+ continue;
+ }
+
+ /*
+ * Even after starting a new request, this extent
+ * is too big. Split it until it fits.
+ */
+ if ((r = split_extent(dfl, dfi, i, func, arg,
+ kmflag)) != 0) {
+ goto done;
+ }
+
+ start_idx = i + 1;
+ n_segs = 0;
+ n_bytes = 0;
+ continue;
+ }
+
+ if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
+ /*
+ * This extent will put us over the limit on the number
+ * of extents the device can accept. Dispatch what
+ * we've accumulated so far.
+ */
+ if ((r = process_range(dfl, start_idx, i - start_idx,
+ func, arg, kmflag)) != 0) {
+ goto done;
+ }
+
+ start_idx = i;
+ n_segs = 1;
+ n_bytes = len;
+ continue;
+ }
+
+ n_segs++;
+ n_bytes += len;
+ }
+
+ /*
+ * If a copy wasn't required, and we haven't processed a subset of
+ * the extents already, we can just use the original request.
+ */
+ if (!need_copy && start_idx == 0) {
+ return (func(dfl, arg, kmflag));
+ }
+
+ r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);
+
+done:
+ dfl_free(dfl);
+ return (r);
+}
+
+/*
+ * Adjust the start and length of each extent in dfl so that it conforms to
+ * the requirements in dfi. It also verifies that no extent extends beyond
+ * the end of the device (given by len_blk).
+ *
+ * Returns 0 on success, or an error value.
+ */
+static int
+adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
+ uint64_t max_off)
+{
+ dkioc_free_list_ext_t *exts = dfl->dfl_exts;
+ /*
+ * These must be uint64_t to prevent the P2 macros from truncating
+ * the result.
+ */
+ const uint64_t align = dfi->dfi_align;
+ const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;
+
+ for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
+ /*
+ * Since there are no known requirements on the value of
+ * dfl_offset, it's possible (though odd) to have a scenario
+ * where dfl_offset == 1, and dfle_start == 511 (resulting
+ * in an actual start offset of 512). As such, we always
+ * apply the offset and find the resulting starting offset
+ * and length (in bytes) first, then apply any rounding
+ * and alignment.
+ */
+ uint64_t start = exts->dfle_start + dfl->dfl_offset;
+ uint64_t end = start + exts->dfle_length;
+
+ /*
+ * Make sure after applying dfl->dfl_offset and any alignment
+ * adjustments that the results don't overflow.
+ */
+ if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ if (end < start) {
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ /*
+ * Make sure we don't extend past the end of the device
+ */
+ if (end > max_off) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ start = P2ROUNDUP(start, align);
+ end = P2ALIGN(end, bsize);
+
+ /*
+ * Remove the offset so that when it's later applied again,
+ * the correct start value is obtained.
+ */
+ exts->dfle_start = start - dfl->dfl_offset;
+
+ /*
+ * If the original length was less than the block size
+ * of the device, we can end up with end < start. If that
+ * happens we just set the length to zero.
+ */
+ exts->dfle_length = (end < start) ? 0 : end - start;
+ }
+
+ return (0);
+}
+
+/*
+ * Take a subset of extents from dfl (starting at start_idx, with n entries)
+ * and create a new dkioc_free_list_t, passing that to func.
+ */
+static int
+process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
+ dfl_iter_fn_t func, void *arg, int kmflag)
+{
+ dkioc_free_list_t *new_dfl = NULL;
+ dkioc_free_list_ext_t *new_exts = NULL;
+ dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
+ size_t actual_n = n;
+ int r = 0;
+
+ if (n == 0) {
+ return (0);
+ }
+
+ /*
+ * Ignore any zero length extents. No known devices attach any
+ * semantic meaning to such extents, and are likely just a result of
+ * narrowing the range of the extent to fit the device alignment
+ * requirements. It is possible the original caller submitted a
+ * zero length extent, but we ignore those as well. Since we can't
+ * communicate partial results back to the caller anyway, it's
+ * unclear whether reporting that one of potentially many extents was
+ * too small (without being able to identify which one) to the caller
+ * of the DKIOCFREE request would be useful.
+ */
+ for (uint64_t i = 0; i < n; i++) {
+ if (exts[i].dfle_length == 0 && --actual_n == 0) {
+ return (0);
+ }
+ }
+
+ new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
+ if (new_dfl == NULL) {
+ return (SET_ERROR(ENOMEM));
+ }
+
+ new_dfl->dfl_flags = dfl->dfl_flags;
+ new_dfl->dfl_num_exts = actual_n;
+ new_dfl->dfl_offset = dfl->dfl_offset;
+ new_exts = new_dfl->dfl_exts;
+
+ for (uint64_t i = 0; i < n; i++) {
+ if (exts[i].dfle_length == 0) {
+ continue;
+ }
+
+ *new_exts++ = exts[i];
+ }
+
+ return (func(new_dfl, arg, kmflag));
+}
+
+/*
+ * If dfi_max_ext_bytes is set, use as the max segment length,
+ * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX
+ */
+#define MAX_SEGLEN(dfi) \
+ (((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \
+ ((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX)
+
+/*
+ * Split the extent at idx into multiple lists (calling func for each one).
+ */
+static int
+split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
+ dfl_iter_fn_t func, void *arg, int kmflag)
+{
+ ASSERT3U(idx, <, dfl->dfl_num_exts);
+
+ const uint64_t maxlen = MAX_SEGLEN(dfi);
+ dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx;
+ uint64_t remain = ext->dfle_length;
+ int r;
+
+ /*
+ * Break the extent into as many single requests as needed. While it
+ * would be possible in some circumstances to combine the final chunk
+ * of the extent (after splitting) with the remaining extents in the
+ * original request, it's not clear there's much benefit from the
+ * added complexity. Such behavior could be added in the future if
+ * it's determined to be worthwhile.
+ */
+ while (remain > 0) {
+ uint64_t start = dfl->dfl_offset + ext->dfle_start;
+ uint64_t len = remain;
+
+ /*
+ * If we know we have at least one more segment left after
+ * the current iteration of this loop, split it so that
+ * the next segment starts on an aligned boundary.
+ */
+ if (len > maxlen) {
+ uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
+ len = end - start;
+ }
+
+ ext->dfle_length = len;
+
+ if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
+ return (r);
+ }
+
+ ext->dfle_start += len;
+ remain -= len;
+ }
+
+ return (0);
+}
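To sketch how a driver would consume this (the driver name, fields, and limits below are hypothetical, not part of this change): the DKIOCFREE entry point fills in a dkioc_free_info_t describing its limits and hands the incoming list to dfl_iter(); the iterator callback then only ever sees conforming extent lists and owns freeing them.

/* Callback invoked by dfl_iter() with extents that already conform. */
static int
exdrv_issue_free(dkioc_free_list_t *dfl, void *arg, int kmflag)
{
	exdrv_t *xd = arg;

	/* Queue the UNMAP/TRIM; the completion path frees dfl. */
	return (exdrv_queue_unmap(xd, dfl, kmflag));
}

static int
exdrv_dkiocfree(exdrv_t *xd, dkioc_free_list_t *dfl)
{
	dkioc_free_info_t dfi = {
		.dfi_bshift = xd->xd_blkshift,		/* e.g. 9 for 512B blocks */
		.dfi_align = 1U << xd->xd_blkshift,
		.dfi_max_bytes = xd->xd_max_free_bytes,
		.dfi_max_ext = xd->xd_max_free_exts,
		.dfi_max_ext_bytes = 0,			/* no per-extent limit */
	};

	/* dfl_iter() consumes dfl one way or another; no cleanup here. */
	return (dfl_iter(dfl, &dfi, xd->xd_size_bytes, exdrv_issue_free,
	    xd, KM_SLEEP));
}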
diff --git a/usr/src/uts/common/os/driver_lyr.c b/usr/src/uts/common/os/driver_lyr.c
index 9e5eb33dd6..d64342738b 100644
--- a/usr/src/uts/common/os/driver_lyr.c
+++ b/usr/src/uts/common/os/driver_lyr.c
@@ -1131,7 +1131,7 @@ ldi_usage_walker_helper(struct ldi_ident *lip, vnode_t *vp,
else
major = lip->li_major;
- ASSERT((major >= 0) && (major < devcnt));
+ ASSERT3U(major, <, devcnt);
dnp = &devnamesp[major];
LOCK_DEV_OPS(&dnp->dn_lock);
@@ -1258,7 +1258,7 @@ ldi_mlink_lh(vnode_t *vp, int cmd, intptr_t arg, cred_t *crp, int *rvalp)
* in its internal state so that the devinfo snapshot code has some
* observability into streams device linkage information.
*/
-void
+int
ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type)
{
vnode_t *vp = fpdown->f_vnode;
@@ -1267,9 +1267,13 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type)
major_t major;
int ret;
- /* if the lower stream is not a device then return */
+ /*
+ * If the lower stream is not a device then return but claim to have
+ * succeeded, which matches our historical behaviour of just not
+ * setting up LDI in this case.
+ */
if (!vn_matchops(vp, spec_getvnodeops()))
- return;
+ return (0);
ASSERT(!servicing_interrupt());
@@ -1280,6 +1284,41 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type)
sp = VTOS(vp);
csp = VTOS(sp->s_commonvp);
+ /* get a layered ident for the upper stream */
+ if (type == LINKNORMAL) {
+ /*
+ * if the link is not persistent then we can associate
+ * the upper stream with a dev_t. this is because the
+ * upper stream is associated with a vnode, which is
+ * associated with a dev_t and this binding can't change
+ * during the life of the stream. since the link isn't
+ * persistent, once the stream is destroyed the link is
+ * destroyed. so the dev_t will be valid for the life
+ * of the link.
+ */
+ ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li);
+ } else {
+ /*
+ * if the link is persistent we can only associate the
+ * link with a driver (and not a dev_t.) this is
+ * because subsequent opens of the upper device may result
+ * in a different stream (and dev_t) having access to
+ * the lower stream.
+ *
+ * for example, if the upper stream is closed after the
+ * persistent link operation is completed, a subsequent
+ * open of the upper device will create a new stream which
+ * may have a different dev_t and an unlink operation
+ * can be performed using this new upper stream.
+ */
+ VERIFY3S(type, ==, LINKPERSIST);
+ major = getmajor(stp->sd_vnode->v_rdev);
+ ret = ldi_ident_from_major(major, &li);
+ }
+
+ if (ret != 0)
+ return (ret);
+
/* check if this was a plink via a layered handle */
if (lhlink) {
/*
@@ -1303,8 +1342,10 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type)
* while there may still be valid layered handles
* pointing to it.
*/
+ VERIFY3S(type, ==, LINKPERSIST);
+
mutex_enter(&csp->s_lock);
- ASSERT(csp->s_count >= 1);
+ VERIFY(csp->s_count >= 1);
csp->s_count++;
mutex_exit(&csp->s_lock);
@@ -1330,48 +1371,17 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type)
* mark the snode/stream as multiplexed
*/
mutex_enter(&sp->s_lock);
- ASSERT(!(sp->s_flag & SMUXED));
+ VERIFY(!(sp->s_flag & SMUXED));
sp->s_flag |= SMUXED;
mutex_exit(&sp->s_lock);
- /* get a layered ident for the upper stream */
- if (type == LINKNORMAL) {
- /*
- * if the link is not persistant then we can associate
- * the upper stream with a dev_t. this is because the
- * upper stream is associated with a vnode, which is
- * associated with a dev_t and this binding can't change
- * during the life of the stream. since the link isn't
- * persistant once the stream is destroyed the link is
- * destroyed. so the dev_t will be valid for the life
- * of the link.
- */
- ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li);
- } else {
- /*
- * if the link is persistant we can only associate the
- * link with a driver (and not a dev_t.) this is
- * because subsequent opens of the upper device may result
- * in a different stream (and dev_t) having access to
- * the lower stream.
- *
- * for example, if the upper stream is closed after the
- * persistant link operation is compleated, a subsequent
- * open of the upper device will create a new stream which
- * may have a different dev_t and an unlink operation
- * can be performed using this new upper stream.
- */
- ASSERT(type == LINKPERSIST);
- major = getmajor(stp->sd_vnode->v_rdev);
- ret = ldi_ident_from_major(major, &li);
- }
-
- ASSERT(ret == 0);
(void) handle_alloc(vp, (struct ldi_ident *)li);
ldi_ident_release(li);
+
+ return (0);
}
-void
+int
ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type)
{
struct ldi_handle *lhp;
@@ -1381,31 +1391,21 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type)
major_t major;
int ret;
- /* if the lower stream is not a device then return */
+ /*
+ * If the lower stream is not a device then return but claim to have
+ * succeeded, which matches our historical behaviour of just not
+ * setting up LDI in this case.
+ */
if (!vn_matchops(vp, spec_getvnodeops()))
- return;
+ return (0);
ASSERT(!servicing_interrupt());
- ASSERT((type == LINKNORMAL) || (type == LINKPERSIST));
LDI_STREAMS_LNK((CE_NOTE, "%s: unlinking streams "
"stp=0x%p, fpdown=0x%p", "ldi_munlink_fp",
(void *)stp, (void *)fpdown));
/*
- * NOTE: here we rely on the streams subsystem not allowing
- * a stream to be multiplexed more than once. if this
- * changes, we break.
- *
- * mark the snode/stream as not multiplexed
- */
- sp = VTOS(vp);
- mutex_enter(&sp->s_lock);
- ASSERT(sp->s_flag & SMUXED);
- sp->s_flag &= ~SMUXED;
- mutex_exit(&sp->s_lock);
-
- /*
* clear the owner for this snode
* see the comment in ldi_mlink_fp() for information about how
* the ident is allocated
@@ -1413,15 +1413,32 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type)
if (type == LINKNORMAL) {
ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li);
} else {
- ASSERT(type == LINKPERSIST);
+ VERIFY3S(type, ==, LINKPERSIST);
major = getmajor(stp->sd_vnode->v_rdev);
ret = ldi_ident_from_major(major, &li);
}
- ASSERT(ret == 0);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * NOTE: here we rely on the streams subsystem not allowing
+ * a stream to be multiplexed more than once. if this
+ * changes, we break.
+ *
+ * mark the snode/stream as not multiplexed
+ */
+ sp = VTOS(vp);
+ mutex_enter(&sp->s_lock);
+ VERIFY(sp->s_flag & SMUXED);
+ sp->s_flag &= ~SMUXED;
+ mutex_exit(&sp->s_lock);
+
lhp = handle_find(vp, (struct ldi_ident *)li);
handle_release(lhp);
ldi_ident_release(li);
+
+ return (0);
}
/*
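With ldi_mlink_fp() and ldi_munlink_fp() now returning int, callers can propagate an ldi_ident allocation failure instead of relying on the old ASSERT(ret == 0). A minimal calling sketch follows; example_link_lower() and its surroundings are hypothetical and do not reflect the actual streams link code.

/* Hypothetical caller -- not part of the patch; boilerplate omitted. */
static int
example_link_lower(struct stdata *stp, file_t *fpdown, int lhlink, int type)
{
	int err;

	if ((err = ldi_mlink_fp(stp, fpdown, lhlink, type)) != 0)
		return (err);	/* e.g. ldi_ident_from_stream() failed */

	/* ... complete the rest of the link operation ... */
	return (0);
}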
diff --git a/usr/src/uts/common/os/errorq.c b/usr/src/uts/common/os/errorq.c
index 8b41e7e8c1..cd71b9be08 100644
--- a/usr/src/uts/common/os/errorq.c
+++ b/usr/src/uts/common/os/errorq.c
@@ -946,7 +946,7 @@ errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
/*
* Write elements on the dump list of each nvlist errorq to the dump device.
- * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
+ * Upon reboot, fmd(8) will extract and replay them for diagnosis.
*/
void
errorq_dump(void)
diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c
index c9214cec84..1051c037fa 100644
--- a/usr/src/uts/common/os/exacct.c
+++ b/usr/src/uts/common/os/exacct.c
@@ -1508,10 +1508,8 @@ exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
}
break;
case AC_FLOW_UID:
- if (fu->fu_userid >= 0) {
- (void) ea_attach_item(record, &fu->fu_userid,
- sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
- }
+ (void) ea_attach_item(record, &fu->fu_userid,
+ sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
break;
case AC_FLOW_ANAME:
(void) ea_attach_item(record, fu->fu_aname,
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index 06e0117cd6..7ccf9b3221 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -22,6 +22,8 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -71,6 +73,7 @@
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/corectl.h>
+#include <sys/core.h>
#include <sys/brand.h>
#include <sys/libc_kernel.h>
@@ -163,7 +166,7 @@ restart_init_notify(zone_t *zone)
* it failed. As long as the given zone is still in the "running"
* state, we will re-exec() init, but first we need to reset things
* which are usually inherited across exec() but will break init's
- * assumption that it is being exec()'d from a virgin process. Most
+ * assumption that it is being exec()'d from a virgin process. Most
* importantly this includes closing all file descriptors (exec only
* closes those marked close-on-exec) and resetting signals (exec only
* resets handled signals, and we need to clear any signals which
@@ -176,6 +179,7 @@ restart_init(int what, int why)
kthread_t *t = curthread;
klwp_t *lwp = ttolwp(t);
proc_t *p = ttoproc(t);
+ proc_t *pp = p->p_zone->zone_zsched;
user_t *up = PTOU(p);
vnode_t *oldcd, *oldrd;
@@ -187,11 +191,11 @@ restart_init(int what, int why)
* zone) know that init has failed and will be restarted.
*/
zcmn_err(p->p_zone->zone_id, CE_WARN,
- "init(1M) %s: restarting automatically",
+ "init(8) %s: restarting automatically",
exit_reason(reason_buf, sizeof (reason_buf), what, why));
if (!INGLOBALZONE(p)) {
- cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
+ cmn_err(CE_WARN, "init(8) for zone %s (pid %d) %s: "
"restarting automatically",
p->p_zone->zone_name, p->p_pid, reason_buf);
}
@@ -206,7 +210,7 @@ restart_init(int what, int why)
/*
* Grab p_lock and begin clearing miscellaneous global process
- * state that needs to be reset before we exec the new init(1M).
+ * state that needs to be reset before we exec the new init(8).
*/
mutex_enter(&p->p_lock);
@@ -270,6 +274,11 @@ restart_init(int what, int why)
up->u_cwd = NULL;
}
+ /* Reset security flags */
+ mutex_enter(&pp->p_lock);
+ p->p_secflags = pp->p_secflags;
+ mutex_exit(&pp->p_lock);
+
mutex_exit(&p->p_lock);
if (oldrd != NULL)
@@ -277,6 +286,23 @@ restart_init(int what, int why)
if (oldcd != NULL)
VN_RELE(oldcd);
+ /*
+ * It's possible that a zone's init will have become privilege aware
+ * and modified privilege sets; reset them.
+ */
+ cred_t *oldcr, *newcr;
+
+ mutex_enter(&p->p_crlock);
+ oldcr = p->p_cred;
+ mutex_enter(&pp->p_crlock);
+ crhold(newcr = p->p_cred = pp->p_cred);
+ mutex_exit(&pp->p_crlock);
+ mutex_exit(&p->p_crlock);
+ crfree(oldcr);
+ /* Additional hold for the current thread - expected by crset() */
+ crhold(newcr);
+ crset(p, newcr);
+
/* Free the controlling tty. (freectty() always assumes curproc.) */
ASSERT(p == curproc);
(void) freectty(B_TRUE);
@@ -284,7 +310,7 @@ restart_init(int what, int why)
restart_init_notify(p->p_zone);
/*
- * Now exec() the new init(1M) on top of the current process. If we
+ * Now exec() the new init(8) on top of the current process. If we
* succeed, the caller will treat this like a successful system call.
* If we fail, we issue messages and the caller will proceed with exit.
*/
@@ -294,11 +320,11 @@ restart_init(int what, int why)
return (0);
zcmn_err(p->p_zone->zone_id, CE_WARN,
- "failed to restart init(1M) (err=%d): system reboot required", err);
+ "failed to restart init(8) (err=%d): system reboot required", err);
if (!INGLOBALZONE(p)) {
- cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
- "(pid %d, err=%d): zoneadm(1M) boot required",
+ cmn_err(CE_WARN, "failed to restart init(8) for zone %s "
+ "(pid %d, err=%d): zoneadm(8) boot required",
p->p_zone->zone_name, p->p_pid, err);
}
@@ -317,7 +343,7 @@ exit(int why, int what)
/*
* If proc_exit() fails, then some other lwp in the process
* got there first. We just have to call lwp_exit() to allow
- * the other lwp to finish exiting the process. Otherwise we're
+ * the other lwp to finish exiting the process. Otherwise we're
* restarting init, and should return.
*/
if (proc_exit(why, what) != 0) {
@@ -330,7 +356,7 @@ exit(int why, int what)
/*
* Set the SEXITING flag on the process, after making sure /proc does
- * not have it locked. This is done in more places than proc_exit(),
+ * not have it locked. This is done in more places than proc_exit(),
* so it is a separate function.
*/
void
@@ -445,9 +471,9 @@ zone_init_exit(zone_t *z, int why, int what)
}
}
-
/*
- * The restart failed, the zone will shut down.
+ * The restart failed, or the criteria for a restart are not met;
+ * the zone will shut down.
*/
z->zone_init_status = wstat(why, what);
(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred());
@@ -484,7 +510,7 @@ proc_exit(int why, int what)
/*
* Stop and discard the process's lwps except for the current one,
- * unless some other lwp beat us to it. If exitlwps() fails then
+ * unless some other lwp beat us to it. If exitlwps() fails then
* return and the calling lwp will call (or continue in) lwp_exit().
*/
proc_is_exiting(p);
@@ -502,6 +528,13 @@ proc_exit(int why, int what)
}
mutex_exit(&p->p_lock);
+ /*
+ * Don't let init exit unless zone_start_init() failed its exec, or
+ * we are shutting down the zone or the machine.
+ *
+ * Since we are single threaded, we don't need to lock the
+ * following accesses to zone_proc_initpid.
+ */
if (p->p_pid == z->zone_proc_initpid) {
/* If zone's init restarts, we're done here. */
if (zone_init_exit(z, why, what))
@@ -600,6 +633,14 @@ proc_exit(int why, int what)
}
/*
+ * If we had generated any upanic(2) state, free that now.
+ */
+ if (p->p_upanic != NULL) {
+ kmem_free(p->p_upanic, PRUPANIC_BUFLEN);
+ p->p_upanic = NULL;
+ }
+
+ /*
* Remove any fpollinfo_t's for this (last) thread from our file
* descriptors so closeall() can ASSERT() that they're all gone.
*/
@@ -971,7 +1012,7 @@ proc_exit(int why, int what)
* curthread's proc pointer is changed to point to the 'sched'
* process for the corresponding zone, except in the case when
* the exiting process is in fact a zsched instance, in which
- * case the proc pointer is set to p0. We do so, so that the
+ * case the proc pointer is set to p0. We do so, so that the
* process still points at the right zone when we call the VN_RELE()
* below.
*
@@ -1055,7 +1096,7 @@ proc_exit(int why, int what)
/*
* task_rele() may ultimately cause the zone to go away (or
* may cause the last user process in a zone to go away, which
- * signals zsched to go away). So prior to this call, we must
+ * signals zsched to go away). So prior to this call, we must
* no longer point at zsched.
*/
t->t_procp = &p0;
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index ec89cb0657..f6179cf301 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2017, Joyent Inc.
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -958,7 +959,22 @@ closef(file_t *fp)
vp = fp->f_vnode;
- error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);
+ /*
+ * The __FLXPATH flag is a private interface for use by the lx
+ * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
+ * when a symbolic link is encountered, returns a file
+ * descriptor which references it.
+ * See uts/common/brand/lx/syscall/lx_open.c
+ *
+ * When this flag is set, VOP_OPEN() will not have been called when
+ * this file descriptor was opened, and VOP_CLOSE() should not be
+ * called here (for a symlink, most filesystems would return ENOSYS
+ * anyway)
+ */
+ if (fp->f_flag2 & (__FLXPATH >> 16))
+ error = 0;
+ else
+ error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL);
if (count > 1) {
mutex_exit(&fp->f_tlock);
@@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp)
mutex_enter(&fp->f_tlock);
fp->f_count = 1;
fp->f_flag = (ushort_t)flag;
- fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16;
+ fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16;
fp->f_vnode = vp;
fp->f_offset = 0;
fp->f_audit_data = 0;
@@ -1585,7 +1601,9 @@ fsetattrat(int fd, char *path, int flags, struct vattr *vap)
VN_HOLD(vp);
}
- if (vn_is_readonly(vp)) {
+ if (vp->v_type == VLNK && (vap->va_mask & AT_MODE) != 0) {
+ error = EOPNOTSUPP;
+ } else if (vn_is_readonly(vp)) {
error = EROFS;
} else {
error = VOP_SETATTR(vp, vap, 0, CRED(), NULL);
diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c
index 78907db25c..2d7849e30d 100644
--- a/usr/src/uts/common/os/flock.c
+++ b/usr/src/uts/common/os/flock.c
@@ -28,7 +28,7 @@
/* All Rights Reserved */
/*
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
* Copyright 2015 Joyent, Inc.
*/
@@ -1122,8 +1122,8 @@ flk_process_request(lock_descriptor_t *request)
}
if (!request_blocked_by_active) {
- lock_descriptor_t *lk[1];
- lock_descriptor_t *first_glock = NULL;
+ lock_descriptor_t *lk[1];
+ lock_descriptor_t *first_glock = NULL;
/*
* Shall we grant this?! NO!!
* What about those locks that were just granted and still
@@ -2093,12 +2093,12 @@ flk_graph_uncolor(graph_t *gp)
if (gp->mark == UINT_MAX) {
gp->mark = 1;
- for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
- lock = lock->l_next)
+ for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
+ lock = lock->l_next)
lock->l_color = 0;
- for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
- lock = lock->l_next)
+ for (lock = SLEEPING_HEAD(gp)->l_next;
+ lock != SLEEPING_HEAD(gp); lock = lock->l_next)
lock->l_color = 0;
} else {
gp->mark++;
@@ -4318,6 +4318,8 @@ nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
lock->l_flock.l_pid != pid) &&
lock_blocks_io(op, offset, length,
lock->l_type, lock->l_start, lock->l_end)) {
+ DTRACE_PROBE1(conflict_lock,
+ lock_descriptor_t *, lock);
conflict = 1;
break;
}
@@ -4467,34 +4469,34 @@ check_sleeping_locks(graph_t *gp)
edge_t *ep;
for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
lock1 = lock1->l_next) {
- ASSERT(!IS_BARRIER(lock1));
- for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
- lock2 = lock2->l_next) {
- if (lock1->l_vnode == lock2->l_vnode) {
- if (BLOCKS(lock2, lock1)) {
- ASSERT(!IS_GRANTED(lock1));
- ASSERT(!NOT_BLOCKED(lock1));
- path(lock1, lock2);
+ ASSERT(!IS_BARRIER(lock1));
+ for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
+ lock2 = lock2->l_next) {
+ if (lock1->l_vnode == lock2->l_vnode) {
+ if (BLOCKS(lock2, lock1)) {
+ ASSERT(!IS_GRANTED(lock1));
+ ASSERT(!NOT_BLOCKED(lock1));
+ path(lock1, lock2);
+ }
}
}
- }
- for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
- lock2 = lock2->l_next) {
- ASSERT(!IS_BARRIER(lock1));
- if (lock1->l_vnode == lock2->l_vnode) {
- if (BLOCKS(lock2, lock1)) {
- ASSERT(!IS_GRANTED(lock1));
- ASSERT(!NOT_BLOCKED(lock1));
- path(lock1, lock2);
+ for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
+ lock2 = lock2->l_next) {
+ ASSERT(!IS_BARRIER(lock1));
+ if (lock1->l_vnode == lock2->l_vnode) {
+ if (BLOCKS(lock2, lock1)) {
+ ASSERT(!IS_GRANTED(lock1));
+ ASSERT(!NOT_BLOCKED(lock1));
+ path(lock1, lock2);
+ }
}
}
- }
- ep = FIRST_ADJ(lock1);
- while (ep != HEAD(lock1)) {
- ASSERT(BLOCKS(ep->to_vertex, lock1));
- ep = NEXT_ADJ(ep);
- }
+ ep = FIRST_ADJ(lock1);
+ while (ep != HEAD(lock1)) {
+ ASSERT(BLOCKS(ep->to_vertex, lock1));
+ ep = NEXT_ADJ(ep);
+ }
}
}
diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c
index 66fe699366..bd3e5dceac 100644
--- a/usr/src/uts/common/os/fm.c
+++ b/usr/src/uts/common/os/fm.c
@@ -336,6 +336,7 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
c = fm_printf(d + 1, c, cols, "[...]");
break;
case DATA_TYPE_UNKNOWN:
+ case DATA_TYPE_DONTCARE:
c = fm_printf(d + 1, c, cols, "<unknown>");
break;
}
@@ -363,7 +364,7 @@ fm_nvprint(nvlist_t *nvl)
/*
* Wrapper for panic() that first produces an FMA-style message for admins.
- * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this
+ * Normally such messages are generated by fmd(8)'s syslog-msgs agent: this
* is the one exception to that rule and the only error that gets messaged.
* This function is intended for use by subsystems that have detected a fatal
* error and enqueued appropriate ereports and wish to then force a panic.
@@ -375,9 +376,9 @@ fm_panic(const char *format, ...)
va_list ap;
(void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
fastreboot_disable_highpil();
-#endif /* __i386 || __amd64 */
+#endif /* __x86 */
va_start(ap, format);
vpanic(format, ap);
va_end(ap);
diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c
index 07fd623a95..6e2d3c403c 100644
--- a/usr/src/uts/common/os/grow.c
+++ b/usr/src/uts/common/os/grow.c
@@ -30,7 +30,7 @@
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
#include <sys/types.h>
#include <sys/inttypes.h>
@@ -770,20 +770,11 @@ smmap_common(caddr_t *addrp, size_t len,
}
/*
- * XXX - Do we also adjust maxprot based on protections
- * of the vnode? E.g. if no execute permission is given
- * on the vnode for the current user, maxprot probably
- * should disallow PROT_EXEC also? This is different
- * from the write access as this would be a per vnode
- * test as opposed to a per fd test for writability.
- */
-
- /*
- * Verify that the specified protections are not greater than
- * the maximum allowable protections. Also test to make sure
- * that the file descriptor does allows for read access since
- * "write only" mappings are hard to do since normally we do
- * the read from the file before the page can be written.
+ * Verify that the specified protections are not greater than the
+ * maximum allowable protections. Also test to make sure that the
+ * file descriptor allows for read access since "write only" mappings
+ * are hard to do since normally we do the read from the file before
+ * the page can be written.
*/
if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
return (EACCES);
diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c
index 0a237e86ec..51a93dfa24 100644
--- a/usr/src/uts/common/os/ip_cksum.c
+++ b/usr/src/uts/common/os/ip_cksum.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2019 Joyent, Inc.
+ * Copyright 2021 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -40,8 +40,7 @@
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
-extern unsigned int ip_ocsum(ushort_t *address, int halfword_count,
- unsigned int sum);
+extern unsigned int ip_ocsum(ushort_t *, int, unsigned int);
/*
* Checksum routine for Internet Protocol family headers.
@@ -587,7 +586,8 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
ip6_rthdr_t *rthdr;
ip6_frag_t *fraghdr;
- ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
+ if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION)
+ return (B_FALSE);
length = IPV6_HDR_LEN;
whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
endptr = mp->b_wptr;
diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c
index 977d243400..27e30a5725 100644
--- a/usr/src/uts/common/os/kcpc.c
+++ b/usr/src/uts/common/os/kcpc.c
@@ -21,6 +21,8 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
#include <sys/param.h>
@@ -74,9 +76,10 @@ static uint32_t kcpc_nullctx_count; /* # overflows in a thread with no ctx */
*/
static int kcpc_nullctx_panic = 0;
-static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
-static void kcpc_restore(kcpc_ctx_t *ctx);
-static void kcpc_save(kcpc_ctx_t *ctx);
+static void kcpc_save(void *);
+static void kcpc_restore(void *);
+static void kcpc_lwp_create(void *, void *);
+static void kcpc_free(void *, int);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
@@ -111,6 +114,14 @@ extern int kcpc_hw_load_pcbe(void);
*/
static int kcpc_pcbe_error = 0;
+static const struct ctxop_template kcpc_ctxop_tpl = {
+ .ct_rev = CTXOP_TPL_REV,
+ .ct_save = kcpc_save,
+ .ct_restore = kcpc_restore,
+ .ct_lwp_create = kcpc_lwp_create,
+ .ct_free = kcpc_free,
+};
+
/*
* Perform one-time initialization of kcpc framework.
* This function performs the initialization only the first time it is called.
@@ -317,8 +328,7 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
/*
* Add a device context to the subject thread.
*/
- installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
- kcpc_lwp_create, NULL, kcpc_free);
+ ctxop_install(t, &kcpc_ctxop_tpl, ctx);
/*
* Ask the backend to program the hardware.
@@ -546,7 +556,7 @@ kcpc_unbind(kcpc_set_t *set)
t = ctx->kc_thread;
/*
* The context is thread-bound and therefore has a device
- * context. It will be freed via removectx() calling
+ * context. It will be freed via ctxop_remove() calling
* freectx() calling kcpc_free().
*/
if (t == curthread) {
@@ -559,15 +569,7 @@ kcpc_unbind(kcpc_set_t *set)
splx(save_spl);
kpreempt_enable();
}
-#ifdef DEBUG
- if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
- kcpc_lwp_create, NULL, kcpc_free) == 0)
- panic("kcpc_unbind: context %p not preset on thread %p",
- (void *)ctx, (void *)t);
-#else
- (void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
- kcpc_lwp_create, NULL, kcpc_free);
-#endif /* DEBUG */
+ VERIFY3U(ctxop_remove(t, &kcpc_ctxop_tpl, ctx), !=, 0);
t->t_cpc_set = NULL;
t->t_cpc_ctx = NULL;
} else {
@@ -1214,8 +1216,9 @@ kcpc_overflow_ast()
* Called when switching away from current thread.
*/
static void
-kcpc_save(kcpc_ctx_t *ctx)
+kcpc_save(void *arg)
{
+ kcpc_ctx_t *ctx = arg;
int err;
int save_spl;
@@ -1263,8 +1266,9 @@ kcpc_save(kcpc_ctx_t *ctx)
}
static void
-kcpc_restore(kcpc_ctx_t *ctx)
+kcpc_restore(void *arg)
{
+ kcpc_ctx_t *ctx = arg;
int save_spl;
mutex_enter(&ctx->kc_lock);
@@ -1323,9 +1327,11 @@ kcpc_restore(kcpc_ctx_t *ctx)
* it is switched off.
*/
/*ARGSUSED*/
-void
-kcpc_idle_save(struct cpu *cp)
+static void
+kcpc_idle_save(void *arg)
{
+ struct cpu *cp = arg;
+
/*
* The idle thread shouldn't be run anywhere else.
*/
@@ -1347,9 +1353,11 @@ kcpc_idle_save(struct cpu *cp)
mutex_exit(&cp->cpu_cpc_ctxlock);
}
-void
-kcpc_idle_restore(struct cpu *cp)
+static void
+kcpc_idle_restore(void *arg)
{
+ struct cpu *cp = arg;
+
/*
* The idle thread shouldn't be run anywhere else.
*/
@@ -1371,10 +1379,23 @@ kcpc_idle_restore(struct cpu *cp)
mutex_exit(&cp->cpu_cpc_ctxlock);
}
+static const struct ctxop_template kcpc_idle_ctxop_tpl = {
+ .ct_rev = CTXOP_TPL_REV,
+ .ct_save = kcpc_idle_save,
+ .ct_restore = kcpc_idle_restore,
+};
+
+void
+kcpc_idle_ctxop_install(kthread_t *t, struct cpu *cp)
+{
+ ctxop_install(t, &kcpc_idle_ctxop_tpl, cp);
+}
+
/*ARGSUSED*/
static void
-kcpc_lwp_create(kthread_t *t, kthread_t *ct)
+kcpc_lwp_create(void *parent, void *child)
{
+ kthread_t *t = parent, *ct = child;
kcpc_ctx_t *ctx = t->t_cpc_ctx, *cctx;
int i;
@@ -1423,8 +1444,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct)
aston(ct);
}
- installctx(ct, cctx, kcpc_save, kcpc_restore,
- NULL, kcpc_lwp_create, NULL, kcpc_free);
+ ctxop_install(ct, &kcpc_ctxop_tpl, cctx);
}
/*
@@ -1461,8 +1481,9 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct)
/*ARGSUSED*/
void
-kcpc_free(kcpc_ctx_t *ctx, int isexec)
+kcpc_free(void *arg, int isexec)
{
+ kcpc_ctx_t *ctx = arg;
int i;
kcpc_set_t *set = ctx->kc_set;
@@ -1543,6 +1564,12 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec)
kcpc_free_set(set);
}
+void
+kcpc_free_cpu(kcpc_ctx_t *ctx)
+{
+ kcpc_free(ctx, 0);
+}
+
/*
* Free the memory associated with a request set.
*/
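The kcpc conversion above is an instance of the new ctxop_template pattern: a subsystem declares one static template and passes its per-thread state as the opaque argument to ctxop_install() and ctxop_remove(). The sketch below shows the shape of that pattern; the example_* names and callback bodies are invented for illustration, while the template fields and the non-zero return from ctxop_remove() on success follow the usage visible in this diff.

/* Hypothetical consumer of the ctxop_template interface; headers omitted. */
static void
example_save(void *arg)
{
	/* save the per-thread hardware state described by arg */
}

static void
example_restore(void *arg)
{
	/* restore the per-thread hardware state described by arg */
}

static const struct ctxop_template example_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = example_save,
	.ct_restore = example_restore,
};

static void
example_ctx_attach(kthread_t *t, void *state)
{
	ctxop_install(t, &example_tpl, state);
}

static void
example_ctx_detach(kthread_t *t, void *state)
{
	/* ctxop_remove() returns non-zero when the entry was found */
	VERIFY3U(ctxop_remove(t, &example_tpl, state), !=, 0);
}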
diff --git a/usr/src/uts/common/os/klpd.c b/usr/src/uts/common/os/klpd.c
index 8592b47021..0879f791b5 100644
--- a/usr/src/uts/common/os/klpd.c
+++ b/usr/src/uts/common/os/klpd.c
@@ -1150,7 +1150,7 @@ check_user_privs(const cred_t *cr, const priv_set_t *set)
out:
if (da.rbuf != (char *)&res)
kmem_free(da.rbuf, da.rsize);
-out1:
+
kmem_free(pap, pasize);
klpd_rele(pfd);
return (err);
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index d12928acc3..4d2c1e6c10 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -24,6 +24,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2018, Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
*/
/*
@@ -2250,7 +2251,7 @@ kmem_dumppr(char **pp, char *e, const char *format, ...)
}
/*
- * Called when dumpadm(1M) configures dump parameters.
+ * Called when dumpadm(8) configures dump parameters.
*/
void
kmem_dump_init(size_t size)
@@ -4462,8 +4463,7 @@ kmem_init(void)
if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE |
KMF_CONTENTS | KMF_LITE)) != 0) ||
((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE))
- cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. "
- "See the Solaris Tunable Parameters Reference Manual.",
+ cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x.",
kmem_flags);
#ifdef DEBUG
@@ -4481,8 +4481,7 @@ kmem_init(void)
(kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0)
cmn_err(CE_WARN, "High-overhead kmem debugging features "
"enabled (kmem_flags = 0x%x). Performance degradation "
- "and large memory overhead possible. See the Solaris "
- "Tunable Parameters Reference Manual.", kmem_flags);
+ "and large memory overhead possible.", kmem_flags);
#endif /* not DEBUG */
kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP);
@@ -4530,8 +4529,21 @@ void
kmem_thread_init(void)
{
kmem_move_init();
+
+ /*
+ * This taskq is used for various kmem maintenance functions, including
+ * kmem_reap(). When maintenance is required on every cache,
+ * kmem_cache_applyall() dispatches one task per cache onto this queue.
+ *
+ * In the case of kmem_reap(), the system may be under increasingly
+ * dire memory pressure and may not be able to allocate a new task
+ * entry. The count of entries to prepopulate (below) should cover at
+ * least as many caches as we generally expect to exist on the system
+ * so that they may all be scheduled for reaping under those
+ * conditions.
+ */
kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri,
- 300, INT_MAX, TASKQ_PREPOPULATE);
+ 600, INT_MAX, TASKQ_PREPOPULATE);
}
void
@@ -5351,7 +5363,7 @@ kmem_cache_scan(kmem_cache_t *cp)
}
if (kmem_cache_is_fragmented(cp, &reap)) {
- size_t slabs_found;
+ int slabs_found;
/*
* Consolidate reclaimable slabs from the end of the partial
diff --git a/usr/src/uts/common/os/ksensor.c b/usr/src/uts/common/os/ksensor.c
new file mode 100644
index 0000000000..7dd4a22c8a
--- /dev/null
+++ b/usr/src/uts/common/os/ksensor.c
@@ -0,0 +1,871 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2020 Oxide Computer Company
+ */
+
+/*
+ * Kernel Sensor Framework
+ *
+ * The kernel sensor framework exists to provide a simple and straightforward
+ * means for various parts of the system to declare and instantiate sensor
+ * information. Together with the ksensor character device driver
+ * (uts/common/io/ksensor/ksensor_drv.c), this exposes per-device sensors as
+ * character devices.
+ *
+ * --------------------------
+ * Driver and User Interfaces
+ * --------------------------
+ *
+ * Each sensor that is registered with the framework is exposed as a character
+ * device under /dev/sensors. The device class and node name are often ':'
+ * delimited and must begin with 'ddi_sensor'. Everything after 'ddi_sensor'
+ * will be created in a directory under /dev/sensors. So for example the Intel
+ * PCH driver uses a class "ddi_sensor:temperature:pch" and a node name of
+ * 'ts.%d'. This creates the node /dev/sensors/temperature/pch/ts.0. The
+ * devfsadm plugin automatically handles the creation of directories, which
+ * makes it straightforward to add new sensor types.
+ *
+ * Strictly speaking, any driver can manage its own sensors and minor nodes by
+ * using the appropriate class and implementing the corresponding ioctls. That
+ * was how the first kernel sensors were written; however, there are a lot of
+ * issues with that approach, which led to this framework:
+ *
+ * 1. Every driver had to actually implement character devices.
+ *
+ * 2. Every driver had to duplicate a lot of the logic around open(9E),
+ * close(9E), and ioctl(9E).
+ *
+ * 3. Drivers that tied into frameworks like mac(9E) or SCSAv3 needed a lot more
+ * work to fit into this model. For example, because the minor state is
+ * shared between all the instances and the frameworks, they would have
+ * required shared, global state that they don't have today.
+ *
+ * Ultimately, having an operations vector and a callback argument makes work a
+ * lot simpler for the producers of sensor data, and that simplicity makes it
+ * worthwhile to take on the additional work here.
+ *
+ * ----------
+ * Components
+ * ----------
+ *
+ * The ksensor framework is made of a couple of different pieces:
+ *
+ * 1. This glue that is a part of genunix.
+ * 2. The ksensor character device driver.
+ * 3. Sensor providers, which are generally drivers that register with the
+ * ksensor framework.
+ *
+ * The implementation of (1) is all in this file. The implementation of (2) is
+ * in uts/common/io/ksensor/ksensor_drv.c. The implementation of (3) is found in
+ * all of the different leaf devices. Examples of (3) include pchtemp(4D) and
+ * igb(4D).
+ *
+ * We separate numbers one and two into two different components for a few
+ * reasons. The most important thing is that drivers that provide sensors should
+ * not be dependent on some other part of the system having been loaded. This
+ * makes a compelling argument for it being a part of the core kernel. However,
+ * like other subsystems (e.g. kstats, smbios, etc.), it's useful to separate
+ * out the thing that provides the interface to users from the thing that is
+ * used to glue together providers in the kernel. There's the added benefit that
+ * it's practically simpler to spin up a pseudo-device through a module.
+ *
+ * The ksensor character device driver (2) registers with the main genunix
+ * ksensor code (1) when it attaches and when it detaches. The kernel only
+ * allows a single driver to be attached to it. When that character device
+ * driver attaches, the ksensor framework will walk through all of the currently
+ * registered sensors and inform the character device driver of the nodes that
+ * it needs to create. While the character device driver is attached, the
+ * ksensor framework will also call back into it when a sensor needs to be
+ * removed.
+ *
+ * Generally speaking, this distinction of responsibilities allows the kernel
+ * sensor character device driver to attach and detach without impacting the
+ * sensor providers or even notifying them at all; it's all transparent to
+ * them.
+ *
+ * ------------------------------
+ * Sensor Lifetime and detach(9E)
+ * ------------------------------
+ *
+ * Traditionally, a device driver may be detached by the broader kernel whenever
+ * the kernel desires it. On debug builds this is done by a dedicated thread. On
+ * a non-debug build this may happen due to memory pressure or as an attempt to
+ * reclaim idle resources (though this is much less common). However, when the
+ * module is detached, the system remembers that minor nodes previously existed
+ * and that entries in /devices had been created. When something proceeds to
+ * access an entry in /devices again, the system will use that to bring a driver
+ * back to life. It doesn't matter whether it's a pseudo-device driver or
+ * something else; this can happen.
+ *
+ * One downside to the sensor framework is that we need to emulate this
+ * behavior, which leads to some amount of complexity here. But this is a
+ * worthwhile tradeoff, as it makes things much simpler for providers and the
+ * emulation is not too hard for us to do.
+ *
+ * When a sensor provider registers the sensor, the sensor becomes available to
+ * the system. When the sensor provider unregisters with the system, which
+ * happens during its detach routine, then we note that it has been detached;
+ * however, we don't delete its minor node and if something accesses it, we
+ * attempt to load the driver again, the same way that devfs (the file system
+ * behind /devices) does.
+ *
+ * For each dev_info_t that registers a sensor we register a callback such that
+ * when the device is removed, e.g. someone called rem_drv or physically pulls
+ * the device, then we'll be able to finally clean up the device. This lifetime
+ * can be represented in the following image:
+ *
+ * |
+ * |
+ * +-----<-------------------------------------+
+ * | |
+ * | . . call ksensor_create() |
+ * v |
+ * +-------+ |
+ * | Valid | |
+ * +-------+ |
+ * | ^
+ * | . . call ksensor_remove() |
+ * v |
+ * +---------+ |
+ * | Invalid | |
+ * +---------+ |
+ * | | |
+ * | | . . user uses sensor again |
+ * | | |
+ * | +-------------------+ |
+ * | | |
+ * | v |
+ * | +---------------+ |
+ * | | Attaching... |-->---------+
+ * | +---------------+
+ * | . . ddi unbind cb |
+ * | |
+ * v | . . attach fails or
+ * +---------+ | no call to ksensor_create()
+ * | Deleted |--<---------------+ again
+ * +---------+
+ *
+ * When the DDI unbind callback is called, we know that the device is going to
+ * be removed. However, this happens within a subtle context with a majority of
+ * the device tree held (at least the dip's parent). In particular, another
+ * thread may be trying to obtain a hold on it and be blocked in
+ * ndi_devi_enter(). As the callback thread holds that, that could lead to a
+ * deadlock. As a result, we clean things up in two phases. One during the
+ * synchronous callback and the other via a taskq. In the first phase we
+ * logically do the following:
+ *
+ * o Remove the dip from the list of ksensor dips and set the flag that
+ * indicates that it's been removed.
+ * o Remove all of the sensors from the global avl to make sure that new
+ * threads cannot look it up.
+ *
+ * Then, after the taskq is dispatched, we do the following in taskq context:
+ *
+ * o Tell the ksensor driver that it should remove the minor node.
+ * o Block on each sensor until it is no longer busy and then clean it up.
+ * o Clean up the ksensor_dip_t.
+ *
+ * ------------------
+ * Accessing a Sensor
+ * ------------------
+ *
+ * Access to a particular sensor is serialized in the system. In addition to
+ * that, a number of steps, not unlike those for accessing a character device,
+ * are required to access one. When a given sensor is held, the KSENSOR_F_BUSY
+ * flag is set in the ksensor_flags member. In addition, as part of taking a
+ * hold a number of side effects occur that ensure that the sensor provider's
+ * dev_info_t is considered busy and can't be detached.
+ *
+ * To obtain a hold on a sensor the following logical steps are required (see
+ * ksensor_hold_by_id() for the implementation):
+ *
+ * 1. Map the minor to the ksensor_t via the avl tree
+ * 2. Check that the ksensor's dip is valid
+ * 3. If the sensor is busy, wait until it is no longer so, and restart from
+ * the top. Otherwise, mark the sensor as busy.
+ * 4. Enter the parent and place a hold on the sensor provider's dip.
+ * 5. Once again check if the dip is removed or not because we have to drop
+ * locks during that operation.
+ * 6. Check if the ksensor has the valid flag set. If not, attempt to configure
+ * the dip.
+ * 7. Assuming the sensor is now valid, we can return it.
+ *
+ * After this point, the sensor is considered valid for use. Once the consumer
+ * is finished with the sensor, it should be released by calling
+ * ksensor_release().
+ *
+ * An important aspect of the above scheme is that the KSENSOR_F_BUSY flag is
+ * required to progress through the validation and holding of the device. This
+ * makes sure that only one thread is attempting to attach it at a given time. A
+ * reasonable future optimization would be to amortize this cost in open(9E)
+ * and close(9E) of the minor and to bump a reference count that is held for as
+ * long as it is open.
+ *
+ * -----------------------------
+ * Character Device Registration
+ * -----------------------------
+ *
+ * The 'ksensor' character device driver can come and go. To support this, the
+ * ksensor framework communicates with the ksensor character device by a
+ * well-defined set of callbacks, used to indicate sensor addition and removal.
+ * The ksensor character device is found in uts/common/io/ksensor/ksensor_drv.c.
+ * The ksensor character device is responsible for creating and destroying minor
+ * nodes.
+ *
+ * Each ksensor_t has a flag, KSENSOR_F_NOTIFIED, that is used to indicate
+ * whether or not the registered driver has been notified of the sensor. When a
+ * callback is first registered, we'll walk through the entire list of nodes to
+ * make sure that its minor has been created. When unregistering, the minor node
+ * remove callback will not be called; however, this can generally be dealt with
+ * by calling something like ddi_remove_minor_node(dip, NULL).
+ *
+ * -------
+ * Locking
+ * -------
+ *
+ * The following rules apply to dealing with lock ordering:
+ *
+ * 1. The global ksensor_g_mutex protects all global data and must be taken
+ * before a ksensor_t's individual mutex.
+ *
+ * 2. A thread should not hold more than one ksensor_t's mutex at any time.
+ *
+ * 3. No locks should be held when attempting to grab or manipulate a
+ * dev_info_t, e.g. ndi_devi_enter().
+ *
+ * 4. Unless the ksensor is actively being held, whenever a ksensor is found,
+ * one must check whether the ksensor_dip_t flag KSENSOR_DIP_F_REMOVED is
+ * set or not and whether the ksensor_t's KSENSOR_F_VALID flag is set.
+ */
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/esunddi.h>
+#include <sys/ksensor_impl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/pci.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/stddef.h>
+#include <sys/sysmacros.h>
+#include <sys/fs/dv_node.h>
+
+typedef enum {
+ /*
+ * This flag indicates that the subscribing ksensor character device has
+ * been notified about this sensor.
+ */
+ KSENSOR_F_NOTIFIED = 1 << 0,
+ /*
+ * This indicates that the sensor is currently valid, meaning that the
+ * ops vector and argument are safe to use. This is removed when a
+ * driver with a sensor is detached.
+ */
+ KSENSOR_F_VALID = 1 << 1,
+ /*
+ * Indicates that a client has a hold on the sensor for some purpose.
+ * This must be set before trying to get an NDI hold. Once this is set
+ * and a NDI hold is in place, it is safe to use the operations vector
+ * and argument.
+ */
+ KSENSOR_F_BUSY = 1 << 2,
+} ksensor_flags_t;
+
+typedef enum {
+ KSENSOR_DIP_F_REMOVED = 1 << 0
+} ksensor_dip_flags_t;
+
+typedef struct {
+ list_node_t ksdip_link;
+ ksensor_dip_flags_t ksdip_flags;
+ dev_info_t *ksdip_dip;
+ ddi_unbind_callback_t ksdip_cb;
+ list_t ksdip_sensors;
+} ksensor_dip_t;
+
+typedef struct {
+ kmutex_t ksensor_mutex;
+ kcondvar_t ksensor_cv;
+ ksensor_flags_t ksensor_flags;
+ list_node_t ksensor_dip_list;
+ avl_node_t ksensor_id_avl;
+ uint_t ksensor_nwaiters;
+ ksensor_dip_t *ksensor_ksdip;
+ char *ksensor_name;
+ char *ksensor_class;
+ id_t ksensor_id;
+ const ksensor_ops_t *ksensor_ops;
+ void *ksensor_arg;
+} ksensor_t;
+
+static kmutex_t ksensor_g_mutex;
+static id_space_t *ksensor_ids;
+static list_t ksensor_dips;
+static avl_tree_t ksensor_avl;
+static dev_info_t *ksensor_cb_dip;
+static ksensor_create_f ksensor_cb_create;
+static ksensor_remove_f ksensor_cb_remove;
+
+static int
+ksensor_avl_compare(const void *l, const void *r)
+{
+ const ksensor_t *kl = l;
+ const ksensor_t *kr = r;
+
+ if (kl->ksensor_id > kr->ksensor_id) {
+ return (1);
+ } else if (kl->ksensor_id < kr->ksensor_id) {
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+static ksensor_t *
+ksensor_find_by_id(id_t id)
+{
+ ksensor_t k;
+
+ ASSERT(MUTEX_HELD(&ksensor_g_mutex));
+
+ k.ksensor_id = id;
+ return (avl_find(&ksensor_avl, &k, NULL));
+}
+
+static ksensor_t *
+ksensor_search_ksdip(ksensor_dip_t *ksdip, const char *name, const char *class)
+{
+ ksensor_t *s;
+
+ ASSERT(MUTEX_HELD(&ksensor_g_mutex));
+
+ for (s = list_head(&ksdip->ksdip_sensors); s != NULL;
+ s = list_next(&ksdip->ksdip_sensors, s)) {
+ if (strcmp(s->ksensor_name, name) == 0 &&
+ strcmp(s->ksensor_class, class) == 0) {
+ return (s);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+ksensor_free_sensor(ksensor_t *sensor)
+{
+ strfree(sensor->ksensor_name);
+ strfree(sensor->ksensor_class);
+ id_free(ksensor_ids, sensor->ksensor_id);
+ mutex_destroy(&sensor->ksensor_mutex);
+ kmem_free(sensor, sizeof (ksensor_t));
+}
+
+static void
+ksensor_free_dip(ksensor_dip_t *ksdip)
+{
+ list_destroy(&ksdip->ksdip_sensors);
+ kmem_free(ksdip, sizeof (ksensor_dip_t));
+}
+
+static void
+ksensor_dip_unbind_taskq(void *arg)
+{
+ ksensor_dip_t *k = arg;
+ ksensor_t *sensor;
+
+ /*
+ * First notify an attached driver that the nodes are going away
+ * before we block and wait on them.
+ */
+ mutex_enter(&ksensor_g_mutex);
+ for (sensor = list_head(&k->ksdip_sensors); sensor != NULL;
+ sensor = list_next(&k->ksdip_sensors, sensor)) {
+ mutex_enter(&sensor->ksensor_mutex);
+ if (sensor->ksensor_flags & KSENSOR_F_NOTIFIED) {
+ ksensor_cb_remove(sensor->ksensor_id,
+ sensor->ksensor_name);
+ sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED;
+ }
+ mutex_exit(&sensor->ksensor_mutex);
+ }
+ mutex_exit(&ksensor_g_mutex);
+
+ /*
+ * Now that the driver has destroyed its minor, wait for anything that's
+ * still there.
+ */
+ while ((sensor = list_remove_head(&k->ksdip_sensors)) != NULL) {
+ mutex_enter(&sensor->ksensor_mutex);
+ while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0 ||
+ sensor->ksensor_nwaiters > 0) {
+ cv_wait(&sensor->ksensor_cv, &sensor->ksensor_mutex);
+ }
+ mutex_exit(&sensor->ksensor_mutex);
+ ksensor_free_sensor(sensor);
+ }
+ ksensor_free_dip(k);
+}
+
+static void
+ksensor_dip_unbind_cb(void *arg, dev_info_t *dip)
+{
+ ksensor_dip_t *k = arg;
+ ksensor_t *sensor;
+
+ /*
+ * Remove the dip and the associated sensors from global visibility.
+ * This will ensure that no new clients can find this; however, others
+ * may have extant attempts to grab it (but lost the race in an NDI
+ * hold).
+ */
+ mutex_enter(&ksensor_g_mutex);
+ list_remove(&ksensor_dips, k);
+ k->ksdip_flags |= KSENSOR_DIP_F_REMOVED;
+ for (sensor = list_head(&k->ksdip_sensors); sensor != NULL;
+ sensor = list_next(&k->ksdip_sensors, sensor)) {
+ avl_remove(&ksensor_avl, sensor);
+ }
+ mutex_exit(&ksensor_g_mutex);
+
+ (void) taskq_dispatch(system_taskq, ksensor_dip_unbind_taskq, k,
+ TQ_SLEEP);
+}
+
+static ksensor_dip_t *
+ksensor_dip_create(dev_info_t *dip)
+{
+ ksensor_dip_t *k;
+
+ k = kmem_zalloc(sizeof (ksensor_dip_t), KM_SLEEP);
+ k->ksdip_dip = dip;
+ k->ksdip_cb.ddiub_cb = ksensor_dip_unbind_cb;
+ k->ksdip_cb.ddiub_arg = k;
+ list_create(&k->ksdip_sensors, sizeof (ksensor_t),
+ offsetof(ksensor_t, ksensor_dip_list));
+ e_ddi_register_unbind_callback(dip, &k->ksdip_cb);
+
+ return (k);
+}
+
+static ksensor_dip_t *
+ksensor_dip_find(dev_info_t *dip)
+{
+ ksensor_dip_t *k;
+
+ ASSERT(MUTEX_HELD(&ksensor_g_mutex));
+ for (k = list_head(&ksensor_dips); k != NULL;
+ k = list_next(&ksensor_dips, k)) {
+ if (dip == k->ksdip_dip) {
+ return (k);
+ }
+ }
+
+ return (NULL);
+}
+
+int
+ksensor_create(dev_info_t *dip, const ksensor_ops_t *ops, void *arg,
+ const char *name, const char *class, id_t *idp)
+{
+ ksensor_dip_t *ksdip;
+ ksensor_t *sensor;
+
+ if (dip == NULL || ops == NULL || name == NULL || class == NULL ||
+ idp == NULL) {
+ return (EINVAL);
+ }
+
+ if (!DEVI_IS_ATTACHING(dip)) {
+ return (EAGAIN);
+ }
+
+ mutex_enter(&ksensor_g_mutex);
+ ksdip = ksensor_dip_find(dip);
+ if (ksdip == NULL) {
+ ksdip = ksensor_dip_create(dip);
+ list_insert_tail(&ksensor_dips, ksdip);
+ }
+
+ sensor = ksensor_search_ksdip(ksdip, name, class);
+ if (sensor != NULL) {
+ ASSERT3P(sensor->ksensor_ksdip, ==, ksdip);
+ if ((sensor->ksensor_flags & KSENSOR_F_VALID) != 0) {
+ mutex_exit(&ksensor_g_mutex);
+ dev_err(dip, CE_WARN, "tried to create sensor %s:%s "
+ "which is currently active", class, name);
+ return (EEXIST);
+ }
+
+ sensor->ksensor_ops = ops;
+ sensor->ksensor_arg = arg;
+ } else {
+ sensor = kmem_zalloc(sizeof (ksensor_t), KM_SLEEP);
+ sensor->ksensor_ksdip = ksdip;
+ sensor->ksensor_name = ddi_strdup(name, KM_SLEEP);
+ sensor->ksensor_class = ddi_strdup(class, KM_SLEEP);
+ sensor->ksensor_id = id_alloc(ksensor_ids);
+ sensor->ksensor_ops = ops;
+ sensor->ksensor_arg = arg;
+ list_insert_tail(&ksdip->ksdip_sensors, sensor);
+ avl_add(&ksensor_avl, sensor);
+ }
+
+ sensor->ksensor_flags |= KSENSOR_F_VALID;
+
+ if (ksensor_cb_create != NULL) {
+
+ if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class,
+ sensor->ksensor_name) == 0) {
+ sensor->ksensor_flags |= KSENSOR_F_NOTIFIED;
+ }
+ }
+
+ *idp = sensor->ksensor_id;
+ mutex_exit(&ksensor_g_mutex);
+
+ return (0);
+}
+
+int
+ksensor_create_scalar_pcidev(dev_info_t *dip, uint_t kind,
+ const ksensor_ops_t *ops, void *arg, const char *name, id_t *idp)
+{
+ char *pci_name, *type;
+ const char *class;
+ int *regs, ret;
+ uint_t nregs;
+ uint16_t bus, dev;
+
+ switch (kind) {
+ case SENSOR_KIND_TEMPERATURE:
+ class = "ddi_sensor:temperature:pci";
+ break;
+ case SENSOR_KIND_VOLTAGE:
+ class = "ddi_sensor:voltage:pci";
+ break;
+ case SENSOR_KIND_CURRENT:
+ class = "ddi_sensor:current:pci";
+ break;
+ default:
+ return (ENOTSUP);
+ }
+
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, "device_type",
+ &type) != DDI_PROP_SUCCESS) {
+ return (EINVAL);
+ }
+
+ if (strcmp(type, "pciex") != 0 && strcmp(type, "pci") != 0) {
+ ddi_prop_free(type);
+ return (EINVAL);
+ }
+ ddi_prop_free(type);
+
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, "reg",
+ &regs, &nregs) != DDI_PROP_SUCCESS) {
+ return (EINVAL);
+ }
+
+ if (nregs < 1) {
+ ddi_prop_free(regs);
+ return (EIO);
+ }
+
+ bus = PCI_REG_BUS_G(regs[0]);
+ dev = PCI_REG_DEV_G(regs[0]);
+ ddi_prop_free(regs);
+
+ pci_name = kmem_asprintf("%x.%x:%s", bus, dev, name);
+
+ ret = ksensor_create(dip, ops, arg, pci_name, class, idp);
+ strfree(pci_name);
+ return (ret);
+}
+
+/*
+ * When a driver removes a sensor, we basically mark it as invalid. This happens
+ * because drivers can detach and we will need to reattach them when the sensor
+ * is used again.
+ */
+int
+ksensor_remove(dev_info_t *dip, id_t id)
+{
+ ksensor_dip_t *kdip;
+ ksensor_t *sensor;
+
+ if (!DEVI_IS_ATTACHING(dip) && !DEVI_IS_DETACHING(dip)) {
+ return (EAGAIN);
+ }
+
+ mutex_enter(&ksensor_g_mutex);
+ kdip = ksensor_dip_find(dip);
+ if (kdip == NULL) {
+ mutex_exit(&ksensor_g_mutex);
+ return (ENOENT);
+ }
+
+ for (sensor = list_head(&kdip->ksdip_sensors); sensor != NULL;
+ sensor = list_next(&kdip->ksdip_sensors, sensor)) {
+ if (sensor->ksensor_id == id || id == KSENSOR_ALL_IDS) {
+ mutex_enter(&sensor->ksensor_mutex);
+ sensor->ksensor_flags &= ~KSENSOR_F_VALID;
+ sensor->ksensor_ops = NULL;
+ sensor->ksensor_arg = NULL;
+ mutex_exit(&sensor->ksensor_mutex);
+ }
+ }
+ mutex_exit(&ksensor_g_mutex);
+ return (0);
+}
+
+static void
+ksensor_release(ksensor_t *sensor)
+{
+ ddi_release_devi(sensor->ksensor_ksdip->ksdip_dip);
+
+ mutex_enter(&sensor->ksensor_mutex);
+ sensor->ksensor_flags &= ~KSENSOR_F_BUSY;
+ cv_broadcast(&sensor->ksensor_cv);
+ mutex_exit(&sensor->ksensor_mutex);
+}
+
+static int
+ksensor_hold_by_id(id_t id, ksensor_t **outp)
+{
+ int circ;
+ ksensor_t *sensor;
+ dev_info_t *pdip;
+
+restart:
+ mutex_enter(&ksensor_g_mutex);
+ sensor = ksensor_find_by_id(id);
+ if (sensor == NULL) {
+ mutex_exit(&ksensor_g_mutex);
+ *outp = NULL;
+ return (ESTALE);
+ }
+
+ if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) {
+ mutex_exit(&ksensor_g_mutex);
+ *outp = NULL;
+ return (ESTALE);
+ }
+
+ mutex_enter(&sensor->ksensor_mutex);
+ if ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) {
+ mutex_exit(&ksensor_g_mutex);
+ sensor->ksensor_nwaiters++;
+ while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) {
+ int cv = cv_wait_sig(&sensor->ksensor_cv,
+ &sensor->ksensor_mutex);
+ if (cv == 0) {
+ sensor->ksensor_nwaiters--;
+ cv_broadcast(&sensor->ksensor_cv);
+ mutex_exit(&sensor->ksensor_mutex);
+ *outp = NULL;
+ return (EINTR);
+ }
+ }
+ sensor->ksensor_nwaiters--;
+ cv_broadcast(&sensor->ksensor_cv);
+ mutex_exit(&sensor->ksensor_mutex);
+ goto restart;
+ }
+
+ /*
+ * We have obtained ownership of the sensor. At this point, we should
+ * check to see if it's valid or not.
+ */
+ sensor->ksensor_flags |= KSENSOR_F_BUSY;
+ pdip = ddi_get_parent(sensor->ksensor_ksdip->ksdip_dip);
+ mutex_exit(&sensor->ksensor_mutex);
+ mutex_exit(&ksensor_g_mutex);
+
+ /*
+ * Grab a reference on the device node to ensure that it won't go away.
+ */
+ ndi_devi_enter(pdip, &circ);
+ e_ddi_hold_devi(sensor->ksensor_ksdip->ksdip_dip);
+ ndi_devi_exit(pdip, circ);
+
+ /*
+ * Now that we have an NDI hold, check if it's valid or not. It may have
+ * become invalid while we were waiting due to a race.
+ */
+ mutex_enter(&ksensor_g_mutex);
+ if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) {
+ mutex_exit(&ksensor_g_mutex);
+ ksensor_release(sensor);
+ return (ESTALE);
+ }
+
+ mutex_enter(&sensor->ksensor_mutex);
+ if ((sensor->ksensor_flags & KSENSOR_F_VALID) == 0) {
+ mutex_exit(&sensor->ksensor_mutex);
+ mutex_exit(&ksensor_g_mutex);
+ (void) ndi_devi_config(pdip, NDI_NO_EVENT);
+ mutex_enter(&ksensor_g_mutex);
+ mutex_enter(&sensor->ksensor_mutex);
+
+ /*
+ * If we attempted to reattach it and it isn't now valid, fail
+ * this request.
+ */
+ if ((sensor->ksensor_ksdip->ksdip_flags &
+ KSENSOR_DIP_F_REMOVED) != 0 ||
+ (sensor->ksensor_flags & KSENSOR_F_VALID) == 0) {
+ mutex_exit(&sensor->ksensor_mutex);
+ mutex_exit(&ksensor_g_mutex);
+ ksensor_release(sensor);
+ return (ESTALE);
+ }
+ }
+ mutex_exit(&sensor->ksensor_mutex);
+ mutex_exit(&ksensor_g_mutex);
+ *outp = sensor;
+
+ return (0);
+}
+
+int
+ksensor_op_kind(id_t id, sensor_ioctl_kind_t *kind)
+{
+ int ret;
+ ksensor_t *sensor;
+
+ if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) {
+ return (ret);
+ }
+
+ ret = sensor->ksensor_ops->kso_kind(sensor->ksensor_arg, kind);
+ ksensor_release(sensor);
+
+ return (ret);
+}
+
+int
+ksensor_op_scalar(id_t id, sensor_ioctl_scalar_t *scalar)
+{
+ int ret;
+ ksensor_t *sensor;
+
+ if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) {
+ return (ret);
+ }
+
+ ret = sensor->ksensor_ops->kso_scalar(sensor->ksensor_arg, scalar);
+ ksensor_release(sensor);
+
+ return (ret);
+}
+
+void
+ksensor_unregister(dev_info_t *reg_dip)
+{
+ ksensor_t *sensor;
+
+ mutex_enter(&ksensor_g_mutex);
+ if (ksensor_cb_dip != reg_dip) {
+ dev_err(reg_dip, CE_PANIC, "asked to unregister illegal dip");
+ }
+
+ for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor =
+ AVL_NEXT(&ksensor_avl, sensor)) {
+ mutex_enter(&sensor->ksensor_mutex);
+ sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED;
+ mutex_exit(&sensor->ksensor_mutex);
+ }
+
+ ksensor_cb_dip = NULL;
+ ksensor_cb_create = NULL;
+ ksensor_cb_remove = NULL;
+ mutex_exit(&ksensor_g_mutex);
+}
+
+int
+ksensor_register(dev_info_t *reg_dip, ksensor_create_f create,
+ ksensor_remove_f remove)
+{
+ ksensor_t *sensor;
+
+ mutex_enter(&ksensor_g_mutex);
+ if (ksensor_cb_dip != NULL) {
+ dev_err(reg_dip, CE_WARN, "kernel sensors are already "
+ "registered");
+ mutex_exit(&ksensor_g_mutex);
+ return (EEXIST);
+ }
+
+ ksensor_cb_dip = reg_dip;
+ ksensor_cb_create = create;
+ ksensor_cb_remove = remove;
+
+ for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor =
+ AVL_NEXT(&ksensor_avl, sensor)) {
+ mutex_enter(&sensor->ksensor_mutex);
+ ASSERT0(sensor->ksensor_flags & KSENSOR_F_NOTIFIED);
+
+ if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class,
+ sensor->ksensor_name) == 0) {
+ sensor->ksensor_flags |= KSENSOR_F_NOTIFIED;
+ }
+
+ mutex_exit(&sensor->ksensor_mutex);
+ }
+
+ mutex_exit(&ksensor_g_mutex);
+
+ return (0);
+}
+
+int
+ksensor_kind_temperature(void *unused, sensor_ioctl_kind_t *k)
+{
+ k->sik_kind = SENSOR_KIND_TEMPERATURE;
+ return (0);
+}
+
+int
+ksensor_kind_current(void *unused, sensor_ioctl_kind_t *k)
+{
+ k->sik_kind = SENSOR_KIND_CURRENT;
+ return (0);
+}
+
+int
+ksensor_kind_voltage(void *unused, sensor_ioctl_kind_t *k)
+{
+ k->sik_kind = SENSOR_KIND_VOLTAGE;
+ return (0);
+}
+
+void
+ksensor_init(void)
+{
+ mutex_init(&ksensor_g_mutex, NULL, MUTEX_DRIVER, NULL);
+ list_create(&ksensor_dips, sizeof (ksensor_dip_t),
+ offsetof(ksensor_dip_t, ksdip_link));
+ ksensor_ids = id_space_create("ksensor", 1, L_MAXMIN32);
+ avl_create(&ksensor_avl, ksensor_avl_compare, sizeof (ksensor_t),
+ offsetof(ksensor_t, ksensor_id_avl));
+}
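
[Editor's note: the ksensor_kind_temperature(), ksensor_kind_current(), and
ksensor_kind_voltage() helpers above exist so a sensor provider only has to
supply its own scalar read routine. A minimal provider-side sketch follows;
only the kso_kind/kso_scalar member names, the helper, and the argument types
come from the code above, while the ksensor_ops_t typedef name and
mydrv_temp_read() are assumptions for illustration.]

/*
 * Hypothetical temperature-sensor ops vector.  The framework invokes
 * kso_scalar via ksensor_op_scalar() above.
 */
static int
mydrv_temp_read(void *arg, sensor_ioctl_scalar_t *scalar)
{
	/* Read the device referenced by arg and populate *scalar. */
	return (0);
}

static const ksensor_ops_t mydrv_temp_ops = {
	.kso_kind = ksensor_kind_temperature,
	.kso_scalar = mydrv_temp_read
};
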
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index f3404a1cdf..31b0cf7e0d 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -1449,8 +1449,8 @@ lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
* Remove memory node from lgroup.
*/
lgrp->lgrp_mnodes &= ~mnodes_mask;
+ ASSERT(lgrp->lgrp_nmnodes > 0);
lgrp->lgrp_nmnodes--;
- ASSERT(lgrp->lgrp_nmnodes >= 0);
}
ASSERT(lgrp_root->lgrp_nmnodes > 0);
@@ -2160,8 +2160,8 @@ lpl_topo_verify(cpupart_t *cpupart)
/* do the parent lgroups exist and do they match? */
if (lgrp->lgrp_parent) {
- ASSERT(lpl->lpl_parent);
- ASSERT(lgrp->lgrp_parent->lgrp_id ==
+ ASSERT(lpl->lpl_parent != NULL &&
+ lgrp->lgrp_parent->lgrp_id ==
lpl->lpl_parent->lpl_lgrpid);
if (!lpl->lpl_parent) {
@@ -4100,12 +4100,13 @@ lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
lgrp_shm_policy_seg_t *newseg;
avl_index_t where;
- ASSERT(seg != NULL);
- ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
+ ASSERT(seg != NULL && (off >= seg->shm_off &&
+ off <= seg->shm_off + seg->shm_size));
- if (!seg || off < seg->shm_off || off > seg->shm_off +
- seg->shm_size)
+ if (!seg || off < seg->shm_off ||
+ off > seg->shm_off + seg->shm_size) {
return (NULL);
+ }
if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
return (seg);
diff --git a/usr/src/uts/common/os/log_sysevent.c b/usr/src/uts/common/os/log_sysevent.c
index 35e0048ee7..50dc5dfd82 100644
--- a/usr/src/uts/common/os/log_sysevent.c
+++ b/usr/src/uts/common/os/log_sysevent.c
@@ -1277,7 +1277,7 @@ get_registration(sysevent_channel_descriptor_t *chan, char *databuf,
class_lst_t *clist;
subclass_lst_t *sc_list;
- if (class_index < 0 || class_index > CLASS_HASH_SZ)
+ if (class_index > CLASS_HASH_SZ)
return (EINVAL);
if ((clist = chan->scd_class_list_tbl[class_index]) == NULL) {
@@ -1395,10 +1395,15 @@ log_sysevent_register(char *channel_name, char *udatabuf, se_pubsub_t *udata)
case SE_CLOSE_REGISTRATION:
close_channel(kchannel);
break;
- case SE_BIND_REGISTRATION:
- if ((kdata.ps_id = bind_common(chan, kdata.ps_type)) <= 0)
+ case SE_BIND_REGISTRATION: {
+ id_t id;
+
+ id = bind_common(chan, kdata.ps_type);
+ kdata.ps_id = (uint32_t)id;
+ if (id <= 0)
error = EBUSY;
break;
+ }
case SE_UNBIND_REGISTRATION:
(void) unbind_common(chan, kdata.ps_type, (id_t)kdata.ps_id);
break;
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 9e58a7bb56..6a922343e7 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -20,9 +20,11 @@
*/
/*
+ * Copyright 2020 Oxide Computer Company
* Copyright (c) 2013 Gary Mills
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2020 Joyent, Inc.
+ * Copyright 2022 Joyent, Inc.
+ * Copyright 2022 MNX Cloud, Inc.
*/
#include <sys/types.h>
@@ -43,6 +45,7 @@
#include <sys/utsname.h>
#include <sys/id_space.h>
#include <sys/zone.h>
+#include <sys/bootbanner.h>
log_zone_t log_global;
queue_t *log_consq;
@@ -182,6 +185,14 @@ log_zonefree(zoneid_t zoneid, void *arg)
kmem_free(lzp, sizeof (log_zone_t));
}
+static void
+log_bootbanner_print(const char *line, uint_t num)
+{
+ const char *pfx = (num == 0) ? "\r" : "";
+
+ printf("%s%s\n", pfx, line);
+}
+
void
log_init(void)
{
@@ -189,7 +200,7 @@ log_init(void)
/*
* Create a backlog queue to consume console messages during periods
- * when there is no console reader (e.g. before syslogd(1M) starts).
+ * when there is no console reader (e.g. before syslogd(8) starts).
*/
log_backlogq = log_consq = log_makeq(0, LOG_HIWAT, NULL);
@@ -207,7 +218,7 @@ log_init(void)
log_intrq = log_makeq(0, LOG_HIWAT, (void *)ipltospl(SPL8));
/*
- * Create a queue to hold the most recent 8K of console messages.
+ * Create a queue to hold the most recent 64K of console messages.
* Useful for debugging. Required by the "$<msgbuf" adb macro.
*/
log_recentq = log_makeq(0, LOG_RECENTSIZE, NULL);
@@ -246,11 +257,19 @@ log_init(void)
log_update(&log_backlog, log_backlogq, SL_CONSOLE, log_console);
/*
- * Now that logging is enabled, emit the SunOS banner.
+ * Now that logging is enabled, emit the boot banner.
*/
+#ifdef LEGACY_BANNER
printf("\rSunOS Release %s Version %s %u-bit\n",
utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
- printf("Copyright 2010-2020 Joyent, Inc.\n");
+ /*
+	 * Note: in the future this should be 2022-20XX; delete this
+	 * comment once it is no longer needed.
+ */
+ printf("Copyright 2022 MNX Cloud, Inc.\n");
+#else
+ bootbanner_print(log_bootbanner_print, KM_SLEEP);
+#endif
#ifdef DEBUG
printf("DEBUG enabled\n");
#endif
@@ -667,7 +686,7 @@ log_sendmsg(mblk_t *mp, zoneid_t zoneid)
if (lp->log_q == log_consq) {
console_printf(log_overflow_msg,
lp->log_minor,
- " -- is syslogd(1M) running?");
+ " -- is syslogd(8) running?");
} else {
printf(log_overflow_msg,
lp->log_minor, "");
diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c
index 6961a2ff4f..c57f8a7d2c 100644
--- a/usr/src/uts/common/os/main.c
+++ b/usr/src/uts/common/os/main.c
@@ -565,7 +565,7 @@ main(void)
/*
* Set the scan rate and other parameters of the paging subsystem.
*/
- setupclock(0);
+ setupclock();
/*
* Initialize process 0's lwp directory and lwpid hash table.
diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c
index 285b76347b..fd74dd3092 100644
--- a/usr/src/uts/common/os/mem_config.c
+++ b/usr/src/uts/common/os/mem_config.c
@@ -509,7 +509,7 @@ mapalloc:
* Recalculate the paging parameters now total_pages has changed.
* This will also cause the clock hands to be reset before next use.
*/
- setupclock(1);
+ setupclock();
memsegs_unlock(1);
@@ -2700,7 +2700,7 @@ kphysm_del_cleanup(struct mem_handle *mhp)
* Recalculate the paging parameters now total_pages has changed.
* This will also cause the clock hands to be reset before next use.
*/
- setupclock(1);
+ setupclock();
memsegs_unlock(1);
diff --git a/usr/src/uts/common/os/memlist_new.c b/usr/src/uts/common/os/memlist_new.c
index adef7cb015..eaa23ed24e 100644
--- a/usr/src/uts/common/os/memlist_new.c
+++ b/usr/src/uts/common/os/memlist_new.c
@@ -143,13 +143,17 @@ memlist_insert(
}
new->ml_next = NULL;
new->ml_prev = last;
- if (last != NULL)
+ if (last != NULL) {
last->ml_next = new;
+ } else {
+ ASSERT3P(*curmemlistp, ==, NULL);
+ *curmemlistp = new;
+ }
}
void
memlist_del(struct memlist *memlistp,
- struct memlist **curmemlistp)
+ struct memlist **curmemlistp)
{
#ifdef DEBUG
/*
diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c
index 0410e6f47b..d14a4ef005 100644
--- a/usr/src/uts/common/os/mmapobj.c
+++ b/usr/src/uts/common/os/mmapobj.c
@@ -213,8 +213,6 @@ struct mobj_stats {
#define OVERLAPS_STACK(addr, p) \
((p->p_model == DATAMODEL_LP64) && \
(addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK))))
-#elif defined(__i386)
-#define OVERLAPS_STACK(addr, p) 0
#endif
/* lv_flags values - bitmap */
@@ -1010,8 +1008,8 @@ mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding,
* fcred - credentials for the file associated with vp at open time.
*/
static int
-mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
- off_t offset, int prot, cred_t *fcred)
+mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len,
+ volatile size_t zfodlen, off_t offset, int prot, cred_t *fcred)
{
int error = 0;
caddr_t zfodbase, oldaddr;
@@ -1060,8 +1058,8 @@ mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
* maxprot is passed as PROT_ALL so that mdb can
* write to this segment.
*/
- if (error = VOP_MAP(vp, (offset_t)offset, as, &addr,
- len, prot, PROT_ALL, mflag, fcred, NULL)) {
+ if ((error = VOP_MAP(vp, (offset_t)offset, as, &addr,
+ len, prot, PROT_ALL, mflag, fcred, NULL)) != 0) {
return (error);
}
diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c
index d8782b320e..f141fb4bf0 100644
--- a/usr/src/uts/common/os/modctl.c
+++ b/usr/src/uts/common/os/modctl.c
@@ -2694,7 +2694,7 @@ modrload(const char *subdir, const char *filename, struct modctl **rmodp)
CPU_STATS_ADDQ(CPU, sys, modload, 1);
}
-done: if (subdir != NULL)
+ if (subdir != NULL)
kmem_free(fullname, size);
return (rmodp ? retval : id);
}
diff --git a/usr/src/uts/common/os/modsubr.c b/usr/src/uts/common/os/modsubr.c
index e980516b10..53c4195e48 100644
--- a/usr/src/uts/common/os/modsubr.c
+++ b/usr/src/uts/common/os/modsubr.c
@@ -74,8 +74,7 @@ static void hwc_unhash(struct hwc_spec *);
int
major_valid(major_t major)
{
- return (major != DDI_MAJOR_T_NONE &&
- (major >= 0 && major < devcnt));
+ return (major != DDI_MAJOR_T_NONE && major < devcnt);
}
int
diff --git a/usr/src/uts/common/os/ndifm.c b/usr/src/uts/common/os/ndifm.c
index 16613a9203..54640971fd 100644
--- a/usr/src/uts/common/os/ndifm.c
+++ b/usr/src/uts/common/os/ndifm.c
@@ -669,7 +669,7 @@ ndi_fm_dma_err_set(ddi_dma_handle_t handle, ddi_fm_error_t *dfe)
/*
* Call parent busop fm initialization routine.
*
- * Called during driver attach(1M)
+ * Called during driver attach(9E)
*/
int
i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc)
@@ -696,7 +696,7 @@ i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc)
/*
* Call parent busop fm clean-up routine.
*
- * Called during driver detach(1M)
+ * Called during driver detach(9E)
*/
void
i_ndi_busop_fm_fini(dev_info_t *dip)
diff --git a/usr/src/uts/common/os/panic.c b/usr/src/uts/common/os/panic.c
index 62be47e843..addb8b79cb 100644
--- a/usr/src/uts/common/os/panic.c
+++ b/usr/src/uts/common/os/panic.c
@@ -213,7 +213,7 @@ panicsys(const char *format, va_list alist, struct regs *rp, int on_panic_stack)
cpu_t *cp = CPU;
caddr_t intr_stack = NULL;
- uint_t intr_actv;
+ volatile uint_t intr_actv;
ushort_t schedflag = t->t_schedflag;
cpu_t *bound_cpu = t->t_bound_cpu;
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index 861c748cff..b3f01cfab2 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -22,6 +22,7 @@
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2022 Oxide Computer Company
*/
#include <sys/types.h>
@@ -69,6 +70,19 @@ int priv_debug = 0;
int priv_basic_test = -1;
/*
+ * Unlinking or creating new hard links to directories was historically allowed
+ * in some file systems; e.g., UFS allows root users to do it, at the cost of
+ * almost certain file system corruption that will require fsck to fix.
+ *
+ * Most modern operating systems and file systems (e.g., ZFS) do not allow this
+ * behaviour anymore, and we have elected to stamp it out entirely for
+ * compatibility and safety reasons. An attempt to unlink a directory will
+ * fail with EPERM, as described in the standard. During this transition, one
+ * can turn the behaviour back on, at their own risk, with this tuneable:
+ */
+int priv_allow_linkdir = 0;
+
+/*
* This file contains the majority of the policy routines.
* Since the policy routines are defined by function and not
* by privilege, there is quite a bit of duplication of
@@ -896,6 +910,23 @@ secpolicy_fs_config(const cred_t *cr, const vfs_t *vfsp)
int
secpolicy_fs_linkdir(const cred_t *cr, const vfs_t *vfsp)
{
+ if (priv_allow_linkdir == 0) {
+ /*
+ * By default, this policy check will now always return EPERM
+ * unless overridden.
+ *
+ * We do so without triggering auditing or allowing privilege
+ * debugging for two reasons: first, we intend eventually to
+ * deprecate the PRIV_SYS_LINKDIR privilege entirely and remove
+ * the use of this policy check from the file systems; second,
+ * for privilege debugging in particular, because it would be
+ * confusing to report an unlink() failure as the result of a
+ * missing privilege when in fact we are simply no longer
+ * allowing the operation at all.
+ */
+ return (EPERM);
+ }
+
return (PRIV_POLICY(cr, PRIV_SYS_LINKDIR, B_FALSE, EPERM, NULL));
}
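
[Editor's note: since priv_allow_linkdir is a plain kernel integer, an
administrator who accepts the corruption risk described above could, as a
sketch, re-enable the legacy behaviour at boot with an /etc/system entry.
Only the variable name comes from the change above; the entry itself is
illustrative and not recommended.]

* /etc/system: restore legacy directory link/unlink behaviour (not recommended)
set priv_allow_linkdir = 1
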
@@ -1381,7 +1412,7 @@ secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype)
* this is required because vop_access function should lock the
* node for reading. A three argument function should be defined
* which accepts the following argument:
- * A pointer to the internal "node" type (inode *)
+ * A pointer to the internal "node" type (inode *)
* vnode access bits (VREAD|VWRITE|VEXEC)
* a pointer to the credential
*
@@ -1453,8 +1484,8 @@ secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, struct vattr *vap,
*
* If you are the file owner:
* chown to other uid FILE_CHOWN_SELF
- * chown to gid (non-member) FILE_CHOWN_SELF
- * chown to gid (member) <none>
+ * chown to gid (non-member) FILE_CHOWN_SELF
+ * chown to gid (member) <none>
*
* Instead of PRIV_FILE_CHOWN_SELF, FILE_CHOWN is also
* acceptable but the first one is reported when debugging.
@@ -2433,13 +2464,14 @@ secpolicy_gart_map(const cred_t *cr)
}
/*
- * secpolicy_xhci
+ * secpolicy_hwmanip
*
- * Determine if the subject can observe and manipulate the xhci driver with a
- * dangerous blunt hammer. Requires all privileges.
+ * Determine if the subject can observe and manipulate a hardware device with a
+ * dangerous blunt hammer, which often means they can do something destructive.
+ * Requires all privileges.
*/
int
-secpolicy_xhci(const cred_t *cr)
+secpolicy_hwmanip(const cred_t *cr)
{
return (secpolicy_require_set(cr, PRIV_FULLSET, NULL, KLPDARG_NONE));
}
diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c
index f9fe8649c0..57bd2241fd 100644
--- a/usr/src/uts/common/os/pool.c
+++ b/usr/src/uts/common/os/pool.c
@@ -1441,9 +1441,13 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
switch (idtype) {
case P_PID:
case P_TASKID:
+ default:
+
/*
- * Can't bind processes or tasks
- * in local zones to pools.
+ * Can't bind processes or tasks in local zones
+ * to pools. Also catch all remaining types of
+ * idtype_t that should already have been
+ * filtered out.
*/
mutex_exit(&p->p_lock);
mutex_exit(&pidlock);
@@ -1715,6 +1719,8 @@ out: switch (idtype) {
zone->zone_pool_mod = gethrtime();
zone_rele(zone);
break;
+ default:
+ break;
}
kmem_free(procs, procs_size * sizeof (proc_t *));
diff --git a/usr/src/uts/common/os/priv.c b/usr/src/uts/common/os/priv.c
index ccde6e5af5..388ccd8918 100644
--- a/usr/src/uts/common/os/priv.c
+++ b/usr/src/uts/common/os/priv.c
@@ -182,8 +182,7 @@ priv_pr_spriv(proc_t *p, prpriv_t *prpriv, const cred_t *cr)
if (prpriv->pr_nsets != PRIV_NSET ||
prpriv->pr_setsize != PRIV_SETSIZE ||
(prpriv->pr_infosize & (sizeof (uint32_t) - 1)) != 0 ||
- prpriv->pr_infosize > priv_info->priv_infosize ||
- prpriv->pr_infosize < 0)
+ prpriv->pr_infosize > priv_info->priv_infosize)
return (EINVAL);
mutex_exit(&p->p_lock);
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index 854fb602da..05979dd236 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -217,7 +217,7 @@ privilege PRIV_NET_BINDMLP
Allow a process to bind to a port that is configured as a
multi-level port(MLP) for the process's zone. This privilege
applies to both shared address and zone-specific address MLPs.
- See tnzonecfg(4) from the Trusted Extensions manual pages for
+ See tnzonecfg(5) from the Trusted Extensions manual pages for
information on configuring MLP ports.
This privilege is interpreted only if the system is configured
with Trusted Extensions.
@@ -507,7 +507,7 @@ privilege PRIV_SYS_TRANS_LABEL
privilege PRIV_VIRT_MANAGE
Allows a process to manage virtualized environments such as
- xVM(5).
+ xVM(7).
privilege PRIV_WIN_COLORMAP
@@ -613,7 +613,7 @@ privilege PRIV_WIN_UPGRADE_SL
privilege PRIV_XVM_CONTROL
- Allows a process access to the xVM(5) control devices for
+ Allows a process access to the xVM(7) control devices for
managing guest domains and the hypervisor. This privilege is
used only if booted into xVM on x86 platforms.
diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c
index e0a1126567..8f52f4ef3a 100644
--- a/usr/src/uts/common/os/rctl.c
+++ b/usr/src/uts/common/os/rctl.c
@@ -149,7 +149,7 @@
* The locking subsequence of interest is: p_lock, rctl_dict_lock,
* rctl_lists_lock, entity->rcs_lock.
*
- * The projects(4) database and project entity resource controls
+ * The project(5) database and project entity resource controls
* A special case is made for RCENTITY_PROJECT values set through the
* setproject(3PROJECT) interface. setproject() makes use of a private
* interface, setprojrctl(), which passes through an array of resource control
@@ -170,7 +170,7 @@
*
* rctl->rc_values - a linked list of rctl_val_t. These are the active
* resource values associated with this rctl, and may have been set by
- * setrctl() - via prctl(1M), or by setprojrctl() - via
+ * setrctl() - via prctl(1), or by setprojrctl() - via
* setproject(3PROJECT).
*
* rctl->rc_projdb - a linked list of rctl_val_t. These reflect the
@@ -1570,8 +1570,6 @@ rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
int ret = 0;
rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
-local_op_retry:
-
ASSERT(MUTEX_HELD(&p->p_lock));
rset = rctl_entity_obtain_rset(rde, p);
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 18b396a765..d500bf7468 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -22,7 +22,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2021 Joyent, Inc.
+ * Copyright 2021 Oxide Computer Company
*/
#include <sys/types.h>
@@ -81,9 +82,9 @@ static size_t sc_bitmap_len; /* # of bits in allocation bitmap */
static size_t sc_bitmap_words; /* # of words in allocation bitmap */
/* Context ops */
-static void schedctl_save(sc_shared_t *);
-static void schedctl_restore(sc_shared_t *);
-static void schedctl_fork(kthread_t *, kthread_t *);
+static void schedctl_save(void *);
+static void schedctl_restore(void *);
+static void schedctl_fork(void *, void *);
/* Functions for handling shared pages */
static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
@@ -92,6 +93,13 @@ static int schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int schedctl_getpage(struct anon_map **, caddr_t *);
static void schedctl_freepage(struct anon_map *, caddr_t);
+static const struct ctxop_template schedctl_ctxop_tpl = {
+ .ct_rev = CTXOP_TPL_REV,
+ .ct_save = schedctl_save,
+ .ct_restore = schedctl_restore,
+ .ct_fork = schedctl_fork,
+};
+
/*
* System call interface to scheduler activations.
* This always operates on the current lwp.
@@ -112,8 +120,7 @@ schedctl(void)
return ((caddr_t)(uintptr_t)set_errno(error));
bzero(ssp, sizeof (*ssp));
- installctx(t, ssp, schedctl_save, schedctl_restore,
- schedctl_fork, NULL, NULL, NULL);
+ ctxop_install(t, &schedctl_ctxop_tpl, ssp);
thread_lock(t); /* protect against ts_tick and ts_update */
t->t_schedctl = ssp;
@@ -151,8 +158,7 @@ schedctl_lwp_cleanup(kthread_t *t)
* Remove the context op to avoid the final call to
* schedctl_save when switching away from this lwp.
*/
- (void) removectx(t, ssp, schedctl_save, schedctl_restore,
- schedctl_fork, NULL, NULL, NULL);
+ (void) ctxop_remove(t, &schedctl_ctxop_tpl, ssp);
/*
* Do not unmap the shared page until the process exits.
@@ -207,8 +213,10 @@ schedctl_proc_cleanup(void)
* Save new thread state.
*/
static void
-schedctl_save(sc_shared_t *ssp)
+schedctl_save(void *arg)
{
+ sc_shared_t *ssp = arg;
+
ssp->sc_state = curthread->t_state;
}
@@ -218,8 +226,10 @@ schedctl_save(sc_shared_t *ssp)
* Save new thread state and CPU.
*/
static void
-schedctl_restore(sc_shared_t *ssp)
+schedctl_restore(void *arg)
{
+ sc_shared_t *ssp = arg;
+
ssp->sc_state = SC_ONPROC;
ssp->sc_cpu = CPU->cpu_id;
}
@@ -230,8 +240,9 @@ schedctl_restore(sc_shared_t *ssp)
* The child's threads must call schedctl() to get new shared mappings.
*/
static void
-schedctl_fork(kthread_t *pt, kthread_t *ct)
+schedctl_fork(void *parent, void *child)
{
+ kthread_t *pt = parent, *ct = child;
proc_t *pp = ttoproc(pt);
proc_t *cp = ttoproc(ct);
sc_page_ctl_t *pagep;
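
[Editor's note: the ctxop_template conversion shown for schedctl generalises
to any installctx()/removectx() consumer. A minimal sketch for a hypothetical
subsystem follows; only the ctxop_install()/ctxop_remove() calling convention
and the CTXOP_TPL_REV/ct_* fields come from the diff above, and the mysub_*
names are invented for illustration.]

/* Hypothetical per-thread context consumer. */
static void
mysub_save(void *arg)
{
	/* Stash whatever per-thread state arg refers to. */
}

static const struct ctxop_template mysub_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = mysub_save,
};

void
mysub_thread_init(kthread_t *t, void *state)
{
	ctxop_install(t, &mysub_ctxop_tpl, state);
}

void
mysub_thread_fini(kthread_t *t, void *state)
{
	(void) ctxop_remove(t, &mysub_ctxop_tpl, state);
}
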
diff --git a/usr/src/uts/common/os/share.c b/usr/src/uts/common/os/share.c
index 55a7422868..6a06be2d9c 100644
--- a/usr/src/uts/common/os/share.c
+++ b/usr/src/uts/common/os/share.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -125,6 +125,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
(shr->s_deny & F_RDDNY) ||
(shrl->shr->s_access & F_WRACC)) {
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
/*
@@ -135,6 +137,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
if (isreadonly(vp))
break;
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
/*
@@ -147,6 +151,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
(shrl->shr->s_access == F_RDACC))
break;
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
@@ -171,6 +177,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
(shrl->shr->s_deny & F_RDDNY) ||
(shrl->shr->s_access & F_WRACC)) {
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
/*
@@ -183,6 +191,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
break;
}
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
/*
@@ -199,6 +209,8 @@ add_share(struct vnode *vp, struct shrlock *shr)
if ((shr->s_access & shrl->shr->s_deny) ||
(shr->s_deny & shrl->shr->s_access)) {
mutex_exit(&vp->v_lock);
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
return (EAGAIN);
}
}
@@ -609,8 +621,11 @@ nbl_share_conflict(vnode_t *vp, nbl_op_t op, caller_context_t *ct)
break;
#endif
}
- if (conflict)
+ if (conflict) {
+ DTRACE_PROBE1(conflict_shrlock,
+ struct shrlock *, shrl->shr);
break;
+ }
}
mutex_exit(&vp->v_lock);
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index 74f1649a07..d0611eb9bb 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -348,7 +348,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
size = P2ROUNDUP(size, share_size);
align_hint = share_size;
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
/*
* For x86, we want to share as much of the page table tree
* as possible. We use a large align_hint at first, but
@@ -366,7 +366,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
while (size >= ptes_per_table * (uint64_t)align_hint)
align_hint *= ptes_per_table;
}
-#endif /* __i386 || __amd64 */
+#endif /* __x86 */
#if defined(__sparcv9)
if (addr == 0 &&
diff --git a/usr/src/uts/common/os/softint.c b/usr/src/uts/common/os/softint.c
index ecdb038c79..8801340cf9 100644
--- a/usr/src/uts/common/os/softint.c
+++ b/usr/src/uts/common/os/softint.c
@@ -58,29 +58,29 @@
*
* Starting state is IDLE.
*
- * softint()
+ * softint()
*
*
* (c)
- * ____________________________________________________
- * | ^ ^
- * v (a) | (b) |
- * IDLE--------------------->PEND--------------------->DRAIN
- * ^ | |
- * | | |
- * | | |
- * | | |
- * | | |
- * | d d
- * | | |
- * | v v
- * | PEND DRAIN
- * | (e) & &
- * |<-----------------------STEAL STEAL
- * ^ |
- * | |
- * | (e) v
- * |_________________________<__________________________|
+ * ____________________________________________________
+ * | ^ ^
+ * v (a) | (b) |
+ * IDLE--------------------->PEND--------------------->DRAIN
+ * ^ | |
+ * | | |
+ * | | |
+ * | | |
+ * | | |
+ * | d d
+ * | | |
+ * | v v
+ * | PEND DRAIN
+ * | (e) & &
+ * |<-----------------------STEAL STEAL
+ * ^ |
+ * | |
+ * | (e) v
+ * |_________________________<__________________________|
*
*
*
@@ -146,9 +146,9 @@ uint_t softcall_pokemax = 10;
/*
* This ensures that softcall entries don't get stuck for long. It's expressed
- * in 10 milliseconds as 1 unit. When hires_tick is set or other clock frequency
- * is used, softcall_init() ensures that it's still expressed as 1 = 10 milli
- * seconds.
+ * in 10 milliseconds as 1 unit. Regardless of the value of hires_tick or
+ * clock frequency, softcall_init() ensures that it's still expressed as 1 =
+ * 10 milliseconds.
*/
unsigned int softcall_delay = 1;
diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c
index 3fd8275df0..37792b7254 100644
--- a/usr/src/uts/common/os/space.c
+++ b/usr/src/uts/common/os/space.c
@@ -23,6 +23,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
/*
@@ -93,8 +94,6 @@ int __lintzero; /* Alway zero for shutting up lint */
pfn_t physmax;
pgcnt_t physinstalled;
-struct var v;
-
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kmem.h>
@@ -142,53 +141,6 @@ char dhcifname[IFNAMSIZ];
ether_addr_t etherbroadcastaddr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
-/*
- * Data from timod that must be resident
- */
-
-/*
- * state transition table for TI interface
- */
-#include <sys/tihdr.h>
-
-#define nr 127 /* not reachable */
-
-char ti_statetbl[TE_NOEVENTS][TS_NOSTATES] = {
- /* STATES */
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */
-
- { 1, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 2, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 4, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, 0, 3, nr, 3, 3, nr, nr, 7, nr, nr, nr, 6, 7, 9, 10, 11},
- {nr, nr, 0, nr, nr, 6, nr, nr, nr, nr, nr, nr, 3, nr, 3, 3, 3},
- {nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, 3, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, 3, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, 7, nr, nr, nr},
- {nr, nr, nr, 5, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, 8, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, 12, 13, nr, 14, 15, 16, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 10, nr, 3, nr, nr, nr, nr, nr},
- {nr, nr, nr, 7, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, nr, nr, 11, 3, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, 3, nr, nr, 3, 3, 3, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
- {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr},
-};
-
-
#include <sys/tty.h>
#include <sys/ptyvar.h>
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 975b2f3d2e..90a9ea6f0f 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -3629,7 +3629,7 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag,
/*
* The I_STR facility provides a trap door for malicious
- * code to send down bogus streamio(7I) ioctl commands to
+ * code to send down bogus streamio(4I) ioctl commands to
* unsuspecting STREAMS modules and drivers which expect to
* only get these messages from the stream head.
* Explicitly prohibit any streamio ioctls which can be
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index ac1ee2d1ce..796f89dca2 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
/*
@@ -28,6 +28,7 @@
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2018 Joyent, Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -1901,36 +1902,9 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp,
*/
error = strdoioctl(stp, &strioc, FNATIVE,
K_TO_K | STR_NOERROR | STR_NOSIG, crp, rvalp);
- if (error != 0) {
- lbfree(linkp);
-
- if (!(passyncq->sq_flags & SQ_BLOCKED))
- blocksq(passyncq, SQ_BLOCKED, 0);
- /*
- * Restore the stream head queue and then remove
- * the passq. Turn off STPLEX before we turn on
- * the stream by removing the passq.
- */
- rq->q_ptr = _WR(rq)->q_ptr = stpdown;
- setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO,
- B_TRUE);
-
- mutex_enter(&stpdown->sd_lock);
- stpdown->sd_flag &= ~STPLEX;
- mutex_exit(&stpdown->sd_lock);
-
- link_rempassthru(passq);
-
- mutex_enter(&stpdown->sd_lock);
- stpdown->sd_flag &= ~STRPLUMB;
- /* Wakeup anyone waiting for STRPLUMB to clear. */
- cv_broadcast(&stpdown->sd_monitor);
- mutex_exit(&stpdown->sd_lock);
+ if (error != 0)
+ goto cleanup;
- mutex_exit(&muxifier);
- netstack_rele(ss->ss_netstack);
- return (error);
- }
mutex_enter(&fpdown->f_tlock);
fpdown->f_count++;
mutex_exit(&fpdown->f_tlock);
@@ -1942,9 +1916,16 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp,
ASSERT((cmd == I_LINK) || (cmd == I_PLINK));
if (cmd == I_LINK) {
- ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL);
+ error = ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL);
} else {
- ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST);
+ error = ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST);
+ }
+
+ if (error != 0) {
+ mutex_enter(&fpdown->f_tlock);
+ fpdown->f_count--;
+ mutex_exit(&fpdown->f_tlock);
+ goto cleanup;
}
link_rempassthru(passq);
@@ -1976,6 +1957,36 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp,
*rvalp = linkp->li_lblk.l_index;
netstack_rele(ss->ss_netstack);
return (0);
+
+cleanup:
+ lbfree(linkp);
+
+ if (!(passyncq->sq_flags & SQ_BLOCKED))
+ blocksq(passyncq, SQ_BLOCKED, 0);
+ /*
+ * Restore the stream head queue and then remove
+ * the passq. Turn off STPLEX before we turn on
+ * the stream by removing the passq.
+ */
+ rq->q_ptr = _WR(rq)->q_ptr = stpdown;
+ setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO,
+ B_TRUE);
+
+ mutex_enter(&stpdown->sd_lock);
+ stpdown->sd_flag &= ~STPLEX;
+ mutex_exit(&stpdown->sd_lock);
+
+ link_rempassthru(passq);
+
+ mutex_enter(&stpdown->sd_lock);
+ stpdown->sd_flag &= ~STRPLUMB;
+ /* Wakeup anyone waiting for STRPLUMB to clear. */
+ cv_broadcast(&stpdown->sd_monitor);
+ mutex_exit(&stpdown->sd_lock);
+
+ mutex_exit(&muxifier);
+ netstack_rele(ss->ss_netstack);
+ return (error);
}
int
@@ -2232,9 +2243,9 @@ munlink(stdata_t *stp, linkinfo_t *linkp, int flag, cred_t *crp, int *rvalp,
/* clean up the layered driver linkages */
if ((flag & LINKTYPEMASK) == LINKNORMAL) {
- ldi_munlink_fp(stp, fpdown, LINKNORMAL);
+ VERIFY0(ldi_munlink_fp(stp, fpdown, LINKNORMAL));
} else {
- ldi_munlink_fp(stp, fpdown, LINKPERSIST);
+ VERIFY0(ldi_munlink_fp(stp, fpdown, LINKPERSIST));
}
link_rempassthru(passq);
@@ -3006,7 +3017,7 @@ strwaitbuf(size_t size, int pri)
* GETWAIT Check for read side errors, no M_READ
* WRITEWAIT Check for write side errors.
* NOINTR Do not return error if nonblocking or timeout.
- * STR_NOERROR Ignore all errors except STPLEX.
+ * STR_NOERROR Ignore all errors except STPLEX.
* STR_NOSIG Ignore/hold signals during the duration of the call.
* STR_PEEK Pass through the strgeterr().
*/
@@ -6630,9 +6641,9 @@ drain_syncq(syncq_t *sq)
*
* qdrain_syncq can be called (currently) from only one of two places:
* drain_syncq
- * putnext (or some variation of it).
+ * putnext (or some variation of it).
* and eventually
- * qwait(_sig)
+ * qwait(_sig)
*
* If called from drain_syncq, we found it in the list of queues needing
* service, so there is work to be done (or it wouldn't be in the list).
@@ -6652,8 +6663,8 @@ drain_syncq(syncq_t *sq)
*
* ASSUMES:
* One claim
- * QLOCK held
- * SQLOCK not held
+ * QLOCK held
+ * SQLOCK not held
* Will release QLOCK before returning
*/
void
@@ -7107,11 +7118,11 @@ static int
propagate_syncq(queue_t *qp)
{
mblk_t *bp, *head, *tail, *prev, *next;
- syncq_t *sq;
+ syncq_t *sq;
queue_t *nqp;
syncq_t *nsq;
boolean_t isdriver;
- int moved = 0;
+ int moved = 0;
uint16_t flags;
pri_t priority = curthread->t_pri;
#ifdef DEBUG
@@ -7144,7 +7155,7 @@ propagate_syncq(queue_t *qp)
/* debug macro */
SQ_PUTLOCKS_HELD(nsq);
#ifdef DEBUG
- func = (void (*)())nqp->q_qinfo->qi_putp;
+ func = (void (*)())(uintptr_t)nqp->q_qinfo->qi_putp;
#endif
}
diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c
index 0dde96307b..ac48bf31b7 100644
--- a/usr/src/uts/common/os/sunddi.c
+++ b/usr/src/uts/common/os/sunddi.c
@@ -250,7 +250,7 @@ ddi_unmap_regs(dev_info_t *dip, uint_t rnumber, caddr_t *kaddrp, off_t offset,
int
ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp,
- off_t offset, off_t len, caddr_t *vaddrp)
+ off_t offset, off_t len, caddr_t *vaddrp)
{
return (i_ddi_bus_map(dip, rdip, mp, offset, len, vaddrp));
}
@@ -265,7 +265,7 @@ ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp,
*/
int
nullbusmap(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp,
- off_t offset, off_t len, caddr_t *vaddrp)
+ off_t offset, off_t len, caddr_t *vaddrp)
{
_NOTE(ARGUNUSED(rdip))
if (mp->map_type == DDI_MT_RNUMBER)
@@ -433,45 +433,6 @@ ddi_peek64(dev_info_t *dip, int64_t *addr, int64_t *val_p)
val_p));
}
-
-/*
- * We need to separate the old interfaces from the new ones and leave them
- * in here for a while. Previous versions of the OS defined the new interfaces
- * to the old interfaces. This way we can fix things up so that we can
- * eventually remove these interfaces.
- * e.g. A 3rd party module/driver using ddi_peek8 and built against S10
- * or earlier will actually have a reference to ddi_peekc in the binary.
- */
-#ifdef _ILP32
-int
-ddi_peekc(dev_info_t *dip, int8_t *addr, int8_t *val_p)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr,
- val_p));
-}
-
-int
-ddi_peeks(dev_info_t *dip, int16_t *addr, int16_t *val_p)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr,
- val_p));
-}
-
-int
-ddi_peekl(dev_info_t *dip, int32_t *addr, int32_t *val_p)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr,
- val_p));
-}
-
-int
-ddi_peekd(dev_info_t *dip, int64_t *addr, int64_t *val_p)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr,
- val_p));
-}
-#endif /* _ILP32 */
-
int
ddi_poke8(dev_info_t *dip, int8_t *addr, int8_t val)
{
@@ -497,40 +458,6 @@ ddi_poke64(dev_info_t *dip, int64_t *addr, int64_t val)
}
/*
- * We need to separate the old interfaces from the new ones and leave them
- * in here for a while. Previous versions of the OS defined the new interfaces
- * to the old interfaces. This way we can fix things up so that we can
- * eventually remove these interfaces.
- * e.g. A 3rd party module/driver using ddi_poke8 and built against S10
- * or earlier will actually have a reference to ddi_pokec in the binary.
- */
-#ifdef _ILP32
-int
-ddi_pokec(dev_info_t *dip, int8_t *addr, int8_t val)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val));
-}
-
-int
-ddi_pokes(dev_info_t *dip, int16_t *addr, int16_t val)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val));
-}
-
-int
-ddi_pokel(dev_info_t *dip, int32_t *addr, int32_t val)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val));
-}
-
-int
-ddi_poked(dev_info_t *dip, int64_t *addr, int64_t val)
-{
- return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val));
-}
-#endif /* _ILP32 */
-
-/*
* ddi_peekpokeio() is used primarily by the mem drivers for moving
* data to and from uio structures via peek and poke. Note that we
* use "internal" routines ddi_peek and ddi_poke to make this go
@@ -2886,7 +2813,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data)
*/
ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos +
sizeof (int64_t);
- return (DDI_PROP_RESULT_OK);
+ return (DDI_PROP_RESULT_OK);
case DDI_PROP_CMD_ENCODE:
/*
@@ -2934,7 +2861,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data)
*/
ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos +
sizeof (int64_t);
- return (DDI_PROP_RESULT_OK);
+ return (DDI_PROP_RESULT_OK);
case DDI_PROP_CMD_GET_ESIZE:
/*
@@ -3115,7 +3042,7 @@ ddi_prop_1275_string(prop_handle_t *ph, uint_t cmd, char *data)
*/
int
ddi_prop_1275_bytes(prop_handle_t *ph, uint_t cmd, uchar_t *data,
- uint_t nelements)
+ uint_t nelements)
{
switch (cmd) {
case DDI_PROP_CMD_DECODE:
@@ -4922,7 +4849,7 @@ impl_ddi_callback_init(void)
static void
callback_insert(int (*funcp)(caddr_t), caddr_t arg, uintptr_t *listid,
- int count)
+ int count)
{
struct ddi_callback *list, *marker, *new;
size_t size = sizeof (struct ddi_callback);
@@ -5614,7 +5541,7 @@ fail:
* devfs event subclass names as device class names.
*/
static int
-derive_devi_class(dev_info_t *dip, char *node_type, int flag)
+derive_devi_class(dev_info_t *dip, const char *node_type, int flag)
{
int rv = DDI_SUCCESS;
@@ -5659,10 +5586,10 @@ derive_devi_class(dev_info_t *dip, char *node_type, int flag)
* exceed IFNAMSIZ (16) characters in length.
*/
static boolean_t
-verify_name(char *name)
+verify_name(const char *name)
{
- size_t len = strlen(name);
- char *cp;
+ size_t len = strlen(name);
+ const char *cp;
if (len == 0 || len > IFNAMSIZ)
return (B_FALSE);
@@ -5680,9 +5607,9 @@ verify_name(char *name)
* attach it to the given devinfo node.
*/
-int
-ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type,
- minor_t minor_num, char *node_type, int flag, ddi_minor_type mtype,
+static int
+ddi_create_minor_common(dev_info_t *dip, const char *name, int spec_type,
+ minor_t minor_num, const char *node_type, int flag, ddi_minor_type mtype,
const char *read_priv, const char *write_priv, mode_t priv_mode)
{
struct ddi_minor_data *dmdp;
@@ -5793,7 +5720,7 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type,
*/
if (!(DEVI_IS_ATTACHING(dip) || DEVI_IS_DETACHING(dip)) &&
mtype != DDM_INTERNAL_PATH) {
- (void) i_log_devfs_minor_create(dip, name);
+ (void) i_log_devfs_minor_create(dip, dmdp->ddm_name);
}
/*
@@ -5804,16 +5731,16 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type,
}
int
-ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type,
- minor_t minor_num, char *node_type, int flag)
+ddi_create_minor_node(dev_info_t *dip, const char *name, int spec_type,
+ minor_t minor_num, const char *node_type, int flag)
{
return (ddi_create_minor_common(dip, name, spec_type, minor_num,
node_type, flag, DDM_MINOR, NULL, NULL, 0));
}
int
-ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type,
- minor_t minor_num, char *node_type, int flag,
+ddi_create_priv_minor_node(dev_info_t *dip, const char *name, int spec_type,
+ minor_t minor_num, const char *node_type, int flag,
const char *rdpriv, const char *wrpriv, mode_t priv_mode)
{
return (ddi_create_minor_common(dip, name, spec_type, minor_num,
@@ -5821,8 +5748,8 @@ ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type,
}
int
-ddi_create_default_minor_node(dev_info_t *dip, char *name, int spec_type,
- minor_t minor_num, char *node_type, int flag)
+ddi_create_default_minor_node(dev_info_t *dip, const char *name, int spec_type,
+ minor_t minor_num, const char *node_type, int flag)
{
return (ddi_create_minor_common(dip, name, spec_type, minor_num,
node_type, flag, DDM_DEFAULT, NULL, NULL, 0));
@@ -5842,7 +5769,7 @@ ddi_create_internal_pathname(dev_info_t *dip, char *name, int spec_type,
}
void
-ddi_remove_minor_node(dev_info_t *dip, char *name)
+ddi_remove_minor_node(dev_info_t *dip, const char *name)
{
int circ;
struct ddi_minor_data *dmdp, *dmdp1;
@@ -6956,7 +6883,7 @@ ddi_set_console_bell(void (*bellfunc)(clock_t duration))
int
ddi_dma_alloc_handle(dev_info_t *dip, ddi_dma_attr_t *attr,
- int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep)
+ int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep)
{
int (*funcp)() = ddi_dma_allochdl;
ddi_dma_attr_t dma_attr;
@@ -6986,9 +6913,9 @@ static uintptr_t dma_mem_list_id = 0;
int
ddi_dma_mem_alloc(ddi_dma_handle_t handle, size_t length,
- ddi_device_acc_attr_t *accattrp, uint_t flags,
- int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp,
- size_t *real_length, ddi_acc_handle_t *handlep)
+ ddi_device_acc_attr_t *accattrp, uint_t flags,
+ int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp,
+ size_t *real_length, ddi_acc_handle_t *handlep)
{
ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle;
dev_info_t *dip = hp->dmai_rdip;
@@ -7079,8 +7006,8 @@ ddi_dma_mem_free(ddi_acc_handle_t *handlep)
int
ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp,
- uint_t flags, int (*waitfp)(caddr_t), caddr_t arg,
- ddi_dma_cookie_t *cookiep, uint_t *ccountp)
+ uint_t flags, int (*waitfp)(caddr_t), caddr_t arg,
+ ddi_dma_cookie_t *cookiep, uint_t *ccountp)
{
ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle;
dev_info_t *dip, *rdip;
@@ -7143,8 +7070,8 @@ ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp,
int
ddi_dma_addr_bind_handle(ddi_dma_handle_t handle, struct as *as,
- caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t),
- caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp)
+ caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t),
+ caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp)
{
ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle;
dev_info_t *dip, *rdip;
@@ -7282,7 +7209,7 @@ ddi_dma_numwin(ddi_dma_handle_t handle, uint_t *nwinp)
int
ddi_dma_getwin(ddi_dma_handle_t h, uint_t win, off_t *offp,
- size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp)
+ size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp)
{
int (*funcp)() = ddi_dma_win;
struct bus_ops *bop;
@@ -7358,8 +7285,8 @@ i_ddi_dma_clr_fault(ddi_dma_handle_t handle)
*/
int
ddi_regs_map_setup(dev_info_t *dip, uint_t rnumber, caddr_t *addrp,
- offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp,
- ddi_acc_handle_t *handle)
+ offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp,
+ ddi_acc_handle_t *handle)
{
ddi_map_req_t mr;
ddi_acc_hdl_t *hp;
@@ -7433,7 +7360,7 @@ ddi_regs_map_free(ddi_acc_handle_t *handlep)
int
ddi_device_zero(ddi_acc_handle_t handle, caddr_t dev_addr, size_t bytecount,
- ssize_t dev_advcnt, uint_t dev_datasz)
+ ssize_t dev_advcnt, uint_t dev_datasz)
{
uint8_t *b;
uint16_t *w;
@@ -7627,7 +7554,7 @@ i_ddi_devtspectype_to_minorname(dev_info_t *dip, dev_t dev, int spec_type)
*/
int
i_ddi_minorname_to_devtspectype(dev_info_t *dip, char *minor_name,
- dev_t *devtp, int *spectypep)
+ dev_t *devtp, int *spectypep)
{
int circ;
struct ddi_minor_data *dmdp;
@@ -8366,8 +8293,8 @@ umem_decr_devlockmem(struct ddi_umem_cookie *cookie)
*/
int
umem_lockmemory(caddr_t addr, size_t len, int flags, ddi_umem_cookie_t *cookie,
- struct umem_callback_ops *ops_vector,
- proc_t *procp)
+ struct umem_callback_ops *ops_vector,
+ proc_t *procp)
{
int error;
struct ddi_umem_cookie *p;
@@ -8838,8 +8765,8 @@ ddi_umem_unlock(ddi_umem_cookie_t cookie)
*/
struct buf *
ddi_umem_iosetup(ddi_umem_cookie_t cookie, off_t off, size_t len,
- int direction, dev_t dev, daddr_t blkno,
- int (*iodone)(struct buf *), int sleepflag)
+ int direction, dev_t dev, daddr_t blkno,
+ int (*iodone)(struct buf *), int sleepflag)
{
struct ddi_umem_cookie *p = (struct ddi_umem_cookie *)cookie;
struct buf *bp;
@@ -8919,7 +8846,7 @@ ddi_get_devstate(dev_info_t *dip)
void
ddi_dev_report_fault(dev_info_t *dip, ddi_fault_impact_t impact,
- ddi_fault_location_t location, const char *message)
+ ddi_fault_location_t location, const char *message)
{
struct ddi_fault_event_data fd;
ddi_eventcookie_t ec;
@@ -8950,7 +8877,7 @@ i_ddi_devi_class(dev_info_t *dip)
}
int
-i_ddi_set_devi_class(dev_info_t *dip, char *devi_class, int flag)
+i_ddi_set_devi_class(dev_info_t *dip, const char *devi_class, int flag)
{
struct dev_info *devi = DEVI(dip);
@@ -9912,7 +9839,7 @@ e_ddi_branch_unconfigure(
/* The dip still exists, so do a hold */
e_ddi_branch_hold(rdip);
}
-out:
+
kmem_free(devnm, MAXNAMELEN + 1);
ndi_devi_exit(pdip, circ);
return (ndi2errno(rv));
diff --git a/usr/src/uts/common/os/sunmdi.c b/usr/src/uts/common/os/sunmdi.c
index 0cdfd30392..6d1e10e0a4 100644
--- a/usr/src/uts/common/os/sunmdi.c
+++ b/usr/src/uts/common/os/sunmdi.c
@@ -3597,6 +3597,16 @@ i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
MDI_PI_LOCK(pip);
MDI_PI_SET_OFFLINING(pip);
break;
+
+ case MDI_PATHINFO_STATE_INIT:
+ /*
+ * Callers are not allowed to ask us to change the state to the
+ * initial state.
+ */
+ rv = MDI_FAILURE;
+ MDI_PI_UNLOCK(pip);
+ goto state_change_exit;
+
}
MDI_PI_UNLOCK(pip);
MDI_CLIENT_UNSTABLE(ct);
@@ -5722,6 +5732,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
break;
case DDI_RESUME:
+ case DDI_PM_RESUME:
MDI_DEBUG(2, (MDI_NOTE, dip,
"pHCI post_resume: called %p", (void *)ph));
if (error == DDI_SUCCESS) {
@@ -5769,6 +5780,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
break;
case DDI_RESUME:
+ case DDI_PM_RESUME:
MDI_DEBUG(2, (MDI_NOTE, dip,
"client post_attach: called %p", (void *)ct));
if (error == DDI_SUCCESS) {
@@ -6011,12 +6023,15 @@ i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
break;
case DDI_SUSPEND:
+ case DDI_PM_SUSPEND:
MDI_DEBUG(2, (MDI_NOTE, dip,
"pHCI post_suspend: called %p",
(void *)ph));
if (error != DDI_SUCCESS)
MDI_PHCI_SET_RESUME(ph);
break;
+ case DDI_HOTPLUG_DETACH:
+ break;
}
MDI_PHCI_UNLOCK(ph);
}
@@ -6054,11 +6069,14 @@ i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
break;
case DDI_SUSPEND:
+ case DDI_PM_SUSPEND:
MDI_DEBUG(2, (MDI_NOTE, dip,
"called %p", (void *)ct));
if (error != DDI_SUCCESS)
MDI_CLIENT_SET_RESUME(ct);
break;
+ case DDI_HOTPLUG_DETACH:
+ break;
}
MDI_CLIENT_UNLOCK(ct);
}
@@ -6820,6 +6838,10 @@ mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
i_mdi_pm_rele_client(ct, ct->ct_path_count);
}
break;
+ default:
+ dev_err(parent, CE_WARN, "!unhandled bus power operation: 0x%x",
+ op);
+ break;
}
MDI_CLIENT_UNLOCK(ct);
diff --git a/usr/src/uts/common/os/sunpci.c b/usr/src/uts/common/os/sunpci.c
index 209b269838..b1098b4fcc 100644
--- a/usr/src/uts/common/os/sunpci.c
+++ b/usr/src/uts/common/os/sunpci.c
@@ -145,104 +145,6 @@ pci_config_put64(ddi_acc_handle_t handle, off_t offset, uint64_t value)
ddi_put64(handle, (uint64_t *)cfgaddr, value);
}
-/*
- * We need to separate the old interfaces from the new ones and leave them
- * in here for a while. Previous versions of the OS defined the new interfaces
- * to the old interfaces. This way we can fix things up so that we can
- * eventually remove these interfaces.
- * e.g. A 3rd party module/driver using pci_config_get8 and built against S10
- * or earlier will actually have a reference to pci_config_getb in the binary.
- */
-#ifdef _ILP32
-uint8_t
-pci_config_getb(ddi_acc_handle_t handle, off_t offset)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- return (ddi_get8(handle, (uint8_t *)cfgaddr));
-}
-
-uint16_t
-pci_config_getw(ddi_acc_handle_t handle, off_t offset)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- return (ddi_get16(handle, (uint16_t *)cfgaddr));
-}
-
-uint32_t
-pci_config_getl(ddi_acc_handle_t handle, off_t offset)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- return (ddi_get32(handle, (uint32_t *)cfgaddr));
-}
-
-uint64_t
-pci_config_getll(ddi_acc_handle_t handle, off_t offset)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- return (ddi_get64(handle, (uint64_t *)cfgaddr));
-}
-
-void
-pci_config_putb(ddi_acc_handle_t handle, off_t offset, uint8_t value)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- ddi_put8(handle, (uint8_t *)cfgaddr, value);
-}
-
-void
-pci_config_putw(ddi_acc_handle_t handle, off_t offset, uint16_t value)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- ddi_put16(handle, (uint16_t *)cfgaddr, value);
-}
-
-void
-pci_config_putl(ddi_acc_handle_t handle, off_t offset, uint32_t value)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- ddi_put32(handle, (uint32_t *)cfgaddr, value);
-}
-
-void
-pci_config_putll(ddi_acc_handle_t handle, off_t offset, uint64_t value)
-{
- caddr_t cfgaddr;
- ddi_acc_hdl_t *hp;
-
- hp = impl_acc_hdl_get(handle);
- cfgaddr = hp->ah_addr + offset;
- ddi_put64(handle, (uint64_t *)cfgaddr, value);
-}
-#endif /* _ILP32 */
-
/*ARGSUSED*/
int
pci_report_pmcap(dev_info_t *dip, int cap, void *arg)
@@ -926,7 +828,7 @@ restoreconfig_err:
/*ARGSUSED*/
static int
pci_lookup_pmcap(dev_info_t *dip, ddi_acc_handle_t conf_hdl,
- uint16_t *pmcap_offsetp)
+ uint16_t *pmcap_offsetp)
{
uint8_t cap_ptr;
uint8_t cap_id;
diff --git a/usr/src/uts/common/os/sunpm.c b/usr/src/uts/common/os/sunpm.c
index 3ce7cc530d..7518c45cea 100644
--- a/usr/src/uts/common/os/sunpm.c
+++ b/usr/src/uts/common/os/sunpm.c
@@ -61,8 +61,8 @@
* tells what each component's power state values are, and provides human
* readable strings (currently unused) for each component name and power state.
* Devices which export pm-components(9P) are automatically power managed
- * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(1M)
- * after parsing power.conf(4)). The exception to this rule is that power
+ * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(8)
+ * after parsing power.conf(5)). The exception to this rule is that power
* manageable CPU devices may be automatically managed independently of autopm
* by either enabling or disabling (via PM_START_CPUPM and PM_STOP_CPUPM
* ioctls) cpupm. If the CPU devices are not managed independently, then they
@@ -72,13 +72,13 @@
* hardware state.
*
* Each device component also has a threshold time associated with each power
- * transition (see power.conf(4)), and a busy/idle state maintained by the
+ * transition (see power.conf(5)), and a busy/idle state maintained by the
* driver calling pm_idle_component(9F) and pm_busy_component(9F).
* Components are created idle.
*
* The PM framework provides several functions:
- * -implement PM policy as described in power.conf(4)
- * Policy is set by pmconfig(1M) issuing pm ioctls based on power.conf(4).
+ * -implement PM policy as described in power.conf(5)
+ * Policy is set by pmconfig(8) issuing pm ioctls based on power.conf(5).
* Policies consist of:
* -set threshold values (defaults if none provided by pmconfig)
* -set dependencies among devices
@@ -122,7 +122,7 @@
* cdrom is always up whenever the console framebuffer is up, so that the user
* can insert a cdrom and see a popup as a result.
*
- * The dependency terminology used in power.conf(4) is not easy to understand,
+ * The dependency terminology used in power.conf(5) is not easy to understand,
* so we've adopted a different terminology in the implementation. We write
* of a "keeps up" and a "kept up" device. A relationship can be established
* where one device keeps up another. That means that if the keepsup device
@@ -384,7 +384,7 @@ int cpr_platform_enable = 0;
/*
* pm_S3_enabled indicates that we believe the platform can support S3,
- * which we get from pmconfig(1M)
+ * which we get from pmconfig(8)
*/
int pm_S3_enabled;
@@ -1616,7 +1616,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level,
(PM_CP(dip, comp)->pmc_flags &
PM_PHC_WHILE_SET_POWER));
- resume_needed = suspended;
+ resume_needed = suspended;
}
} else {
if (POWERING_OFF(old_level, level)) {
@@ -1629,7 +1629,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level,
(PM_CP(dip, comp)->pmc_flags &
PM_PHC_WHILE_SET_POWER));
- resume_needed = suspended;
+ resume_needed = suspended;
}
}
}
@@ -2076,13 +2076,12 @@ e_pm_hold_rele_power(dev_info_t *dip, int cnt)
return;
PM_LOCK_POWER(dip, &circ);
- ASSERT(cnt >= 0 && PM_KUC(dip) >= 0 || cnt < 0 && PM_KUC(dip) > 0);
+ ASSERT(cnt >= 0 || (cnt < 0 && PM_KUC(dip) > 0));
PMD(PMD_KIDSUP, ("%s: kidsupcnt for %s@%s(%s#%d) %d->%d\n", pmf,
PM_DEVICE(dip), PM_KUC(dip), (PM_KUC(dip) + cnt)))
PM_KUC(dip) += cnt;
- ASSERT(PM_KUC(dip) >= 0);
PM_UNLOCK_POWER(dip, circ);
if (cnt < 0 && PM_KUC(dip) == 0)
@@ -7647,7 +7646,7 @@ pm_cfb_setup(const char *stdout_path)
*/
} else {
cmn_err(CE_WARN, "Kernel debugger present: see "
- "kmdb(1M) for interaction with power management.");
+ "kmdb(1) for interaction with power management.");
}
}
#ifdef DEBUG
diff --git a/usr/src/uts/common/os/swapgeneric.c b/usr/src/uts/common/os/swapgeneric.c
index 77167149fe..ce64aff89a 100644
--- a/usr/src/uts/common/os/swapgeneric.c
+++ b/usr/src/uts/common/os/swapgeneric.c
@@ -878,7 +878,7 @@ load_bootpath_drivers(char *bootpath)
#endif
dip = path_to_devinfo(pathcopy);
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
/*
* i386 does not provide stub nodes for all boot devices,
* but we should be able to find the node for the parent,
@@ -910,7 +910,7 @@ load_bootpath_drivers(char *bootpath)
rval = load_boot_driver(leaf, NULL);
if (rval == -1) {
kmem_free(pathcopy, pathcopy_len);
- return (NULL);
+ return (0);
}
}
}
@@ -920,7 +920,7 @@ load_bootpath_drivers(char *bootpath)
cmn_err(CE_WARN, "can't bind driver for boot path <%s>",
bootpath);
kmem_free(pathcopy, pathcopy_len);
- return (NULL);
+ return (0);
}
/*
@@ -936,7 +936,7 @@ load_bootpath_drivers(char *bootpath)
modloadonly("drv", "ibp") == -1) {
cmn_err(CE_CONT, "ibp: cannot load platform driver\n");
kmem_free(pathcopy, pathcopy_len);
- return (NULL);
+ return (0);
}
/*
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index fb64000e4d..dca168b642 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -25,6 +25,7 @@
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2016 Joyent, Inc.
* Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -88,9 +89,9 @@ int getloadavg(int *, int);
int rusagesys(int, void *, void *, void *, void *);
int getpagesizes(int, size_t *, int);
int gtty(int, intptr_t);
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
int hrtsys(struct hrtsysa *, rval_t *);
-#endif /* __i386 || __amd64 */
+#endif /* __x86 */
int ioctl(int, int, intptr_t);
int kill();
int labelsys(int, void *, void *, void *, void *, void *);
@@ -184,7 +185,7 @@ int statvfs(char *, struct statvfs *);
int fstatvfs(int, struct statvfs *);
offset_t llseek32(int32_t, uint32_t, uint32_t, int);
-#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT)
+#if defined(__i386_COMPAT)
int sysi86(short, uintptr_t, uintptr_t, uintptr_t);
#endif
@@ -330,6 +331,7 @@ int setsockopt(int, int, int, void *, socklen_t *, int);
int sockconfig(int, void *, void *, void *, void *);
ssize_t sendfilev(int, int, const struct sendfilevec *, int, size_t *);
ssize_t getrandom(void *, size_t, unsigned int);
+void upanic(void *, size_t);
typedef int64_t (*llfcn_t)(); /* for casting one-word returns */
@@ -390,19 +392,15 @@ typedef int64_t (*llfcn_t)(); /* for casting one-word returns */
#define IF_sparc(true, false) false
#endif
-#if defined(__i386) && !defined(__amd64)
-#define IF_i386(true, false) true
-#else
#define IF_i386(true, false) false
-#endif
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
#define IF_x86(true, false) true
#else
#define IF_x86(true, false) false
#endif
-#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT)
+#if defined(__i386_COMPAT)
#define IF_386_ABI(true, false) true
#else
#define IF_386_ABI(true, false) false
@@ -583,7 +581,7 @@ struct sysent sysent[NSYSCALL] =
/* 122 */ SYSENT_CL("writev", writev, 3),
/* 123 */ SYSENT_CL("preadv", preadv, 5),
/* 124 */ SYSENT_CL("pwritev", pwritev, 5),
- /* 125 */ SYSENT_LOADABLE(), /* (was fxstat) */
+ /* 125 */ SYSENT_CI("upanic", upanic, 2),
/* 126 */ SYSENT_CL("getrandom", getrandom, 3),
/* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5),
/* 128 */ IF_LP64(
@@ -948,7 +946,7 @@ struct sysent sysent32[NSYSCALL] =
/* 122 */ SYSENT_CI("writev", writev32, 3),
/* 123 */ SYSENT_CI("preadv", preadv, 5),
/* 124 */ SYSENT_CI("pwritev", pwritev, 5),
- /* 125 */ SYSENT_LOADABLE32(), /* was fxstat32 */
+ /* 125 */ SYSENT_CI("upanic", upanic, 2),
/* 126 */ SYSENT_CI("getrandom", getrandom, 3),
/* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5),
/* 128 */ SYSENT_CI("setrlimit", setrlimit32, 2),
diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c
index c965db6737..f587430625 100644
--- a/usr/src/uts/common/os/timer.c
+++ b/usr/src/uts/common/os/timer.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
#include <sys/timer.h>
@@ -179,7 +179,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
/*
* timer_grab() and its companion routine, timer_release(), are wrappers
- * around timer_lock()/_unlock() which allow the timer_*(3R) routines to
+ * around timer_lock()/_unlock() which allow the timer_*(3C) routines to
* (a) share error handling code and (b) not grab p_lock themselves. Routines
* which are called with p_lock held (e.g. timer_lwpbind(), timer_lwpexit())
* must call timer_lock()/_unlock() explictly.
@@ -194,7 +194,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it)
* (a) The specified timer ID is out of range.
*
* (b) The specified timer ID does not correspond to a timer ID returned
- * from timer_create(3R).
+ * from timer_create(3C).
*
* (c) The specified timer ID is currently being removed.
*
@@ -482,105 +482,106 @@ timer_fire(itimer_t *it)
}
/*
- * Allocate an itimer_t and find and appropriate slot for it in p_itimer.
- * Acquires p_lock and holds it on return, regardless of success.
+ * Find an unused (i.e. NULL) entry in p->p_itimer and set *id to the
+ * index of the unused entry, growing p->p_itimer as necessary (up to timer_max
+ * entries). Returns B_TRUE (with *id set) on success, B_FALSE on failure
+ * (e.g. the process already has the maximum number of allowed timers
+ * allocated).
*/
-static itimer_t *
-timer_alloc(proc_t *p, timer_t *id)
+static boolean_t
+timer_get_id(proc_t *p, timer_t *id)
{
- itimer_t *it, **itp = NULL;
+ itimer_t **itp = NULL, **itp_new;
+ uint_t target_sz;
uint_t i;
- ASSERT(MUTEX_NOT_HELD(&p->p_lock));
-
- it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
- bzero(it, sizeof (itimer_t));
- mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
-
- mutex_enter(&p->p_lock);
-retry:
- if (p->p_itimer != NULL) {
- for (i = 0; i < p->p_itimer_sz; i++) {
- if (p->p_itimer[i] == NULL) {
- itp = &(p->p_itimer[i]);
- break;
- }
- }
- }
-
- /*
- * A suitable slot was not found. If possible, allocate (or resize)
- * the p_itimer array and try again.
- */
- if (itp == NULL) {
- uint_t target_sz = _TIMER_ALLOC_INIT;
- itimer_t **itp_new;
-
- if (p->p_itimer != NULL) {
- ASSERT(p->p_itimer_sz != 0);
+ ASSERT(MUTEX_HELD(&p->p_lock));
- target_sz = p->p_itimer_sz * 2;
- }
+ if (p->p_itimer == NULL) {
/*
- * Protect against exceeding the max or overflow
+		 * No timers have been allocated for this process; allocate
+ * the initial array.
*/
- if (target_sz > timer_max || target_sz > INT_MAX ||
- target_sz < p->p_itimer_sz) {
- kmem_cache_free(clock_timer_cache, it);
- return (NULL);
- }
+ ASSERT0(p->p_itimer_sz);
+ target_sz = _TIMER_ALLOC_INIT;
+
mutex_exit(&p->p_lock);
itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *),
KM_SLEEP);
mutex_enter(&p->p_lock);
- if (target_sz <= p->p_itimer_sz) {
- /*
- * A racing thread performed the resize while we were
- * waiting outside p_lock. Discard our now-useless
- * allocation and retry.
- */
- kmem_free(itp_new, target_sz * sizeof (itimer_t *));
- goto retry;
- } else {
+
+ if (p->p_itimer == NULL) {
/*
- * Instantiate the larger allocation and select the
- * first fresh entry for use.
+ * As long as no other thread beat us to allocating
+ * the initial p_itimer array, use what we allocated.
+ * Since we just allocated it, we know slot 0 is
+ * free.
*/
- if (p->p_itimer != NULL) {
- uint_t old_sz;
-
- old_sz = p->p_itimer_sz;
- bcopy(p->p_itimer, itp_new,
- old_sz * sizeof (itimer_t *));
- kmem_free(p->p_itimer,
- old_sz * sizeof (itimer_t *));
-
- /*
- * Short circuit to use the first free entry in
- * the new allocation. It's possible that
- * other lower-indexed timers were freed while
- * p_lock was dropped, but skipping over them
- * is not harmful at all. In the common case,
- * we skip the need to walk over an array
- * filled with timers before arriving at the
- * slot we know is fresh from the allocation.
- */
- i = old_sz;
- } else {
- /*
- * For processes lacking any existing timers,
- * we can simply select the first entry.
- */
- i = 0;
- }
p->p_itimer = itp_new;
p->p_itimer_sz = target_sz;
+ i = 0;
+ goto done;
+ }
+
+ /*
+ * Another thread beat us to allocating the initial array.
+ * Proceed to searching for an empty slot and growing the
+ * array if needed.
+ */
+ kmem_free(itp_new, target_sz * sizeof (itimer_t *));
+ }
+
+retry:
+ /* Use the first empty slot (if any exist) */
+ for (i = 0; i < p->p_itimer_sz; i++) {
+ if (p->p_itimer[i] == NULL) {
+ goto done;
}
}
- ASSERT(i <= INT_MAX);
+ /* No empty slots, try to grow p->p_itimer and retry */
+ target_sz = p->p_itimer_sz * 2;
+ if (target_sz > timer_max || target_sz > INT_MAX ||
+ target_sz < p->p_itimer_sz) {
+ /* Protect against exceeding the max or overflow */
+ return (B_FALSE);
+ }
+
+ mutex_exit(&p->p_lock);
+ itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), KM_SLEEP);
+ mutex_enter(&p->p_lock);
+
+ if (target_sz <= p->p_itimer_sz) {
+ /*
+ * A racing thread performed the resize while we were
+ * waiting outside p_lock. Discard our now-useless
+ * allocation and retry.
+ */
+ kmem_free(itp_new, target_sz * sizeof (itimer_t *));
+ goto retry;
+ }
+
+ ASSERT3P(p->p_itimer, !=, NULL);
+ bcopy(p->p_itimer, itp_new, p->p_itimer_sz * sizeof (itimer_t *));
+ kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *));
+
+ /*
+ * Short circuit to use the first free entry in the new allocation.
+ * It's possible that other lower-indexed timers were freed while
+ * p_lock was dropped, but skipping over them is not harmful at all.
+ * In the common case, we skip the need to walk over an array filled
+ * with timers before arriving at the slot we know is fresh from the
+ * allocation.
+ */
+ i = p->p_itimer_sz;
+
+ p->p_itimer = itp_new;
+ p->p_itimer_sz = target_sz;
+
+done:
+ ASSERT3U(i, <=, INT_MAX);
*id = (timer_t)i;
- return (it);
+ return (B_TRUE);
}
/*
@@ -612,19 +613,20 @@ timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp,
sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
/*
- * Allocate a timer and choose a slot for it. This acquires p_lock.
+ * Allocate a timer and choose a slot for it.
*/
- it = timer_alloc(p, &tid);
- ASSERT(MUTEX_HELD(&p->p_lock));
+ it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP);
+ bzero(it, sizeof (*it));
+ mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL);
- if (it == NULL) {
+ mutex_enter(&p->p_lock);
+ if (!timer_get_id(p, &tid)) {
mutex_exit(&p->p_lock);
kmem_free(sigq, sizeof (sigqueue_t));
- return (EAGAIN);
+ return (set_errno(EAGAIN));
}
ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL);
- ASSERT(evp != NULL);
/*
* If we develop other notification mechanisms, this will need
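
As a side note on the EAGAIN path above (illustration only, not part of this change): once
timer_get_id() refuses to grow p_itimer any further, the failure surfaces to userland as
timer_create(3C) returning -1 with errno set to EAGAIN. The sketch below only assumes the
standard POSIX timer interface and exercises that limit:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct sigevent ev = { 0 };
	timer_t id;
	unsigned long created = 0;

	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGALRM;

	/* Create timers until the kernel runs out of per-process slots. */
	while (timer_create(CLOCK_REALTIME, &ev, &id) == 0)
		created++;

	if (errno == EAGAIN) {
		(void) printf("per-process timer limit reached after %lu "
		    "timers\n", created);
	} else {
		perror("timer_create");
	}
	return (0);
}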
diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c
index 53be806026..cb57b60758 100644
--- a/usr/src/uts/common/os/timers.c
+++ b/usr/src/uts/common/os/timers.c
@@ -1211,7 +1211,7 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp)
hrtime_t
ts2hrt(const timestruc_t *tsp)
{
-#if defined(__amd64) || defined(__i386)
+#if defined(__x86)
/*
* On modern x86 CPUs, the simple version is faster.
*/
@@ -1232,7 +1232,7 @@ ts2hrt(const timestruc_t *tsp)
hrt = (hrt << 7) - hrt - hrt - hrt;
hrt = (hrt << 9) + tsp->tv_nsec;
return (hrt);
-#endif /* defined(__amd64) || defined(__i386) */
+#endif /* defined(__x86) */
}
/*
diff --git a/usr/src/uts/common/os/upanic.c b/usr/src/uts/common/os/upanic.c
new file mode 100644
index 0000000000..b4d23eeaff
--- /dev/null
+++ b/usr/src/uts/common/os/upanic.c
@@ -0,0 +1,98 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include <sys/proc.h>
+#include <c2/audit.h>
+#include <sys/procfs.h>
+#include <sys/core.h>
+
+/*
+ * This function is meant to be a guaranteed abort that generates a core file
+ * containing up to 1k of caller-provided data in an ELF note. This is meant
+ * to ensure that, even in the face of other problems, that data can get out.
+ */
+
+void
+upanic(void *addr, size_t len)
+{
+ kthread_t *t = curthread;
+ proc_t *p = curproc;
+ klwp_t *lwp = ttolwp(t);
+ uint32_t auditing = AU_AUDITING();
+ uint32_t upflag = P_UPF_PANICKED;
+ void *buf;
+ int code;
+
+ /*
+	 * Before we worry about the data that the user has as a message, go
+	 * ahead and try to get all the other threads stopped. That helps
+	 * ensure that nothing else is going on and that we don't lose a race.
+ */
+ mutex_enter(&p->p_lock);
+ lwp->lwp_cursig = SIGABRT;
+ mutex_exit(&p->p_lock);
+
+ proc_is_exiting(p);
+ if (exitlwps(1) != 0) {
+ mutex_enter(&p->p_lock);
+ lwp_exit();
+ }
+
+ /*
+	 * Copy in the user data. We truncate it to PRUPANIC_BUFLEN no matter
+	 * what; since the buffer is zero-allocated, any bytes beyond the
+	 * copied data remain zero.
+ */
+ if (addr != NULL && len > 0) {
+ size_t copylen;
+
+ upflag |= P_UPF_HAVEMSG;
+
+ if (len >= PRUPANIC_BUFLEN) {
+ copylen = PRUPANIC_BUFLEN;
+ upflag |= P_UPF_TRUNCMSG;
+ } else {
+ copylen = len;
+ }
+
+ buf = kmem_zalloc(PRUPANIC_BUFLEN, KM_SLEEP);
+ if (copyin(addr, buf, copylen) != 0) {
+ upflag |= P_UPF_INVALMSG;
+ upflag &= ~P_UPF_HAVEMSG;
+ } else {
+ mutex_enter(&p->p_lock);
+ ASSERT3P(p->p_upanic, ==, NULL);
+ p->p_upanic = buf;
+ mutex_exit(&p->p_lock);
+ }
+ }
+
+ mutex_enter(&p->p_lock);
+ p->p_upanicflag = upflag;
+ mutex_exit(&p->p_lock);
+
+ /*
+ * If we're auditing we need to finish the system call itself and then
+ * begin the core dump.
+ */
+ if (auditing) {
+ audit_finish(0, SYS_upanic, 0, NULL);
+ audit_core_start(SIGABRT);
+ }
+ code = core(SIGABRT, B_FALSE);
+ if (auditing) /* audit core dump */
+ audit_core_finish(code ? CLD_KILLED : CLD_DUMPED);
+ exit(code ? CLD_KILLED : CLD_DUMPED, SIGABRT);
+}
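
For illustration only (not part of the patch): userland reaches this entry point through a
thin libc wrapper with the same shape as the kernel function above. The header name below is
an assumption; the point is simply that the message passed in ends up in an ELF note of the
resulting core file.

#include <stdlib.h>
#include <upanic.h>	/* assumed header exposing the libc wrapper */

int
main(void)
{
	/* Abort and preserve this message in the core file's ELF note. */
	const char msg[] = "example: unrecoverable internal inconsistency";

	upanic(msg, sizeof (msg));

	/* upanic() does not return; this line is never reached. */
	return (EXIT_FAILURE);
}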
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index f5ee76a2cb..1df2f479a5 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -18,14 +18,20 @@
*
* CDDL HEADER END
*/
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+ */
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2018 Joyent, Inc.
*/
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
@@ -60,6 +66,7 @@
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/zone.h>
+#include <sys/stdbool.h>
#include <vm/hat.h>
#include <vm/as.h>
@@ -68,149 +75,275 @@
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
-static int checkpage(page_t *, int);
+/*
+ * FREE MEMORY MANAGEMENT
+ *
+ * Management of the pool of free pages is a tricky business. There are
+ * several critical threshold values which constrain our allocation of new
+ * pages and inform the rate of paging out of memory to swap. These threshold
+ * values, and the behaviour they induce, are described below in descending
+ * order of size -- and thus increasing order of severity!
+ *
+ * +---------------------------------------------------- physmem (all memory)
+ * |
+ * | Ordinarily there are no particular constraints placed on page
+ * v allocation. The page scanner is not running and page_create_va()
+ * | will effectively grant all page requests (whether from the kernel
+ * | or from user processes) without artificial delay.
+ * |
+ * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
+ * |
+ * | When we have less than "lotsfree" pages, pageout_scanner() is
+ * v signalled by schedpaging() to begin looking for pages that can
+ * | be evicted to disk to bring us back above lotsfree. At this
+ * | stage there is still no constraint on allocation of free pages.
+ * |
+ * | For small systems, we set a lower bound of 16MB for lotsfree;
+ * v this is the natural value for a system with 1GB memory. This is
+ * | to ensure that the pageout reserve pool contains at least 4MB
+ * | for use by ZFS.
+ * |
+ * | For systems with a large amount of memory, we constrain lotsfree
+ * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
+ * v at some point the required slack relates more closely to the
+ * | rate at which paging can occur than to the total amount of memory.
+ * |
+ * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
+ * |
+ * | When we drop below desfree, a number of kernel facilities will
+ * v wait before allocating more memory, under the assumption that
+ * | pageout or reaping will make progress and free up some memory.
+ * | This behaviour is not especially coordinated; look for comparisons
+ * | of desfree and freemem.
+ * |
+ * | In addition to various attempts at advisory caution, clock()
+ * | will wake up the thread that is ordinarily parked in sched().
+ * | This routine is responsible for the heavy-handed swapping out
+ * v of entire processes in an attempt to arrest the slide of free
+ * | memory. See comments in sched.c for more details.
+ * |
+ * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
+ * |
+ * | These two separate tunables have, by default, the same value.
+ * v Various parts of the kernel use minfree to signal the need for
+ * | more aggressive reclamation of memory, and sched() is more
+ * | aggressive at swapping processes out.
+ * |
+ * | If free memory falls below throttlefree, page_create_va() will
+ * | use page_create_throttle() to begin holding most requests for
+ * | new pages while pageout and reaping free up memory. Sleeping
+ * v allocations (e.g., KM_SLEEP) are held here while we wait for
+ * | more memory. Non-sleeping allocations are generally allowed to
+ * | proceed, unless their priority is explicitly lowered with
+ * | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
+ * |
+ * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
+ * |
+ * | When we hit throttlefree, the situation is already dire. The
+ * v system is generally paging out memory and swapping out entire
+ * | processes in order to free up memory for continued operation.
+ * |
+ * | Unfortunately, evicting memory to disk generally requires short
+ * | term use of additional memory; e.g., allocation of buffers for
+ * | storage drivers, updating maps of free and used blocks, etc.
+ * | As such, pageout_reserve is the number of pages that we keep in
+ * | special reserve for use by pageout() and sched() and by any
+ * v other parts of the kernel that need to be working for those to
+ * | make forward progress such as the ZFS I/O pipeline.
+ * |
+ * | When we are below pageout_reserve, we fail or hold any allocation
+ * | that has not explicitly requested access to the reserve pool.
+ * | Access to the reserve is generally granted via the KM_PUSHPAGE
+ * | flag, or by marking a thread T_PUSHPAGE such that all allocations
+ * | can implicitly tap the reserve. For more details, see the
+ * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
+ * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
+ * |
+ * +---------------------------------------------------------- no free memory
+ * |
+ * | If we have arrived here, things are very bad indeed. It is
+ * v surprisingly difficult to tell if this condition is even fatal,
+ * | as enough memory may have been granted to pageout() and to the
+ * | ZFS I/O pipeline that requests for eviction that have already been
+ * | made will complete and free up memory some time soon.
+ * |
+ * | If free memory does not materialise, the system generally remains
+ * | deadlocked. The pageout_deadman() below is run once per second
+ * | from clock(), seeking to limit the amount of time a single request
+ * v to page out can be blocked before the system panics to get a crash
+ * | dump and return to service.
+ * |
+ * +-------------------------------------------------------------------------
+ */
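
A rough worked example of the cascade above (not part of the original comment; it assumes the
default divisors applied later in setupclock(), with pageout_threshold_style left at 0): on a
machine with 128 GiB of physical memory and 4 KiB pages,

	physmem          = 33,554,432 pages (128 GiB)
	lotsfree         = physmem / 64        = 524,288 pages (2 GiB, right at the cap)
	desfree          = lotsfree / 2        = 262,144 pages (1 GiB)
	minfree          = 3/4 of desfree      = 196,608 pages (768 MiB)
	throttlefree     = minfree             = 196,608 pages (768 MiB)
	pageout_reserve  = 3/4 of throttlefree = 147,456 pages (576 MiB)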
/*
* The following parameters control operation of the page replacement
- * algorithm. They are initialized to 0, and then computed at boot time
- * based on the size of the system. If they are patched non-zero in
- * a loaded vmunix they are left alone and may thus be changed per system
- * using mdb on the loaded system.
+ * algorithm. They are initialized to 0, and then computed at boot time based
+ * on the size of the system; see setupclock(). If they are patched non-zero
+ * in a loaded vmunix they are left alone and may thus be changed per system
+ * using "mdb -kw" on the loaded system.
*/
pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;
static pgcnt_t handspreadpages = 0;
-static int loopfraction = 2;
+
+/*
+ * looppages:
+ * Cached copy of the total number of pages in the system (total_pages).
+ *
+ * loopfraction:
+ * Divisor used to relate fastscan to looppages in setupclock().
+ */
+static uint_t loopfraction = 2;
static pgcnt_t looppages;
-/* See comment below describing 4% and 80% */
-static int min_percent_cpu = 4;
-static int max_percent_cpu = 80;
+
+static uint_t min_percent_cpu = 4;
+static uint_t max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
static pgcnt_t maxslowscan = 100;
-pgcnt_t maxpgio = 0;
-pgcnt_t minfree = 0;
-pgcnt_t desfree = 0;
-pgcnt_t lotsfree = 0;
-pgcnt_t needfree = 0;
-pgcnt_t throttlefree = 0;
-pgcnt_t pageout_reserve = 0;
+#define MEGABYTES (1024ULL * 1024ULL)
+
+/*
+ * pageout_threshold_style:
+ * set to 1 to use the previous default threshold size calculation;
+ * i.e., each threshold is half of the next largest value.
+ */
+uint_t pageout_threshold_style = 0;
+
+/*
+ * The operator may override these tunables to request a different minimum or
+ * maximum lotsfree value, or to change the divisor we use for automatic
+ * sizing.
+ *
+ * By default, we make lotsfree 1/64th of the total memory in the machine. The
+ * minimum and maximum are specified in bytes, rather than pages; a zero value
+ * means the default values (below) are used.
+ */
+uint_t lotsfree_fraction = 64;
+pgcnt_t lotsfree_min = 0;
+pgcnt_t lotsfree_max = 0;
-pgcnt_t deficit;
-pgcnt_t nscan;
-pgcnt_t desscan;
+#define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
+#define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
+
+/*
+ * If these tunables are set to non-zero values in /etc/system, and provided
+ * the value is not larger than the threshold above, the specified value will
+ * be used directly without any additional calculation or adjustment. The boot
+ * time value of these overrides is preserved in the "clockinit" struct. More
+ * detail is available in the comment at the top of the file.
+ */
+pgcnt_t maxpgio = 0;
+pgcnt_t minfree = 0;
+pgcnt_t desfree = 0;
+pgcnt_t lotsfree = 0;
+pgcnt_t needfree = 0;
+pgcnt_t throttlefree = 0;
+pgcnt_t pageout_reserve = 0;
+pri_t pageout_pri;
+
+pgcnt_t deficit;
+pgcnt_t nscan;
+pgcnt_t desscan;
/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;
-uint64_t n_throttle;
-clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */
+#define MAX_PSCAN_THREADS 16
/*
- * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
- * are the number of ticks in each wakeup cycle that gives the
- * equivalent of some underlying %CPU duty cycle.
- *
- * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging()
- * will run 4 times/sec to update pageout scanning parameters and kickoff
- * the pageout_scanner() thread if necessary.
+ * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
+ * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
+ * that gives the equivalent of some underlying %CPU duty cycle.
*
- * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When
- * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed
- * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1).
+ * min_pageout_nsec:
+ * nanoseconds/wakeup equivalent of min_percent_cpu.
*
- * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When
- * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed
- * by the scanner in a 1 second interval is 80% of a CPU
- * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25
- * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec.
+ * max_pageout_nsec:
+ * nanoseconds/wakeup equivalent of max_percent_cpu.
*
- * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks
- * will be 200, so the CPU percentages are the same as when hz is 100.
- *
- * min_pageout_ticks:
- * ticks/wakeup equivalent of min_percent_cpu.
- *
- * max_pageout_ticks:
- * ticks/wakeup equivalent of max_percent_cpu.
- *
- * pageout_ticks:
- * Number of clock ticks budgeted for each wakeup cycle.
+ * pageout_nsec:
+ * Number of nanoseconds budgeted for each wakeup cycle.
* Computed each time around by schedpaging().
- * Varies between min_pageout_ticks .. max_pageout_ticks,
+ * Varies between min_pageout_nsec and max_pageout_nsec,
* depending on memory pressure or zones over their cap.
+ *
+ * zone_pageout_nsec:
+ *	Number of nanoseconds budgeted for each cycle when a zone
+ * is over its memory cap. If this is zero, then the value
+ * of max_pageout_nsec is used instead.
*/
+static hrtime_t min_pageout_nsec;
+static hrtime_t max_pageout_nsec;
+static hrtime_t pageout_nsec;
+static hrtime_t zone_pageout_nsec;
-static clock_t min_pageout_ticks;
-static clock_t max_pageout_ticks;
-static clock_t pageout_ticks;
+static boolean_t reset_hands[MAX_PSCAN_THREADS];
-#define MAX_PSCAN_THREADS 16
-static boolean_t reset_hands[MAX_PSCAN_THREADS];
+#define PAGES_POLL_MASK 1023
+#define SCHEDPAGING_HZ 4
/*
- * These can be tuned in /etc/system or set with mdb.
- * 'des_page_scanners' is the desired number of page scanner threads. The
- * system will bring the actual number of threads into line with the desired
- * number. If des_page_scanners is set to an invalid value, the system will
- * correct the setting.
+ * despagescanners:
+ * The desired number of page scanner threads. The value can be set in
+ * /etc/system or tuned directly with 'mdb -kw'. The system will bring
+ * the actual number of threads into line with the desired number. If set
+ * to an invalid value, the system will correct the setting.
*/
-uint_t des_page_scanners;
-uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
-
-uint_t n_page_scanners;
-static pgcnt_t pscan_region_sz; /* informational only */
-
-
-#define PAGES_POLL_MASK 1023
+uint_t despagescanners = 0;
/*
* pageout_sample_lim:
- * The limit on the number of samples needed to establish a value
- * for new pageout parameters, fastscan, slowscan, and handspreadpages.
+ * The limit on the number of samples needed to establish a value for new
+ * pageout parameters: fastscan, slowscan, pageout_new_spread, and
+ * handspreadpages.
*
* pageout_sample_cnt:
- * Current sample number. Once the sample gets large enough,
- * set new values for handspreadpages, fastscan and slowscan.
+ * Current sample number. Once the sample gets large enough, set new
+ * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
*
* pageout_sample_pages:
* The accumulated number of pages scanned during sampling.
*
* pageout_sample_etime:
- * The accumulated number of nanoseconds for the sample.
+ * The accumulated nanoseconds for the sample.
*
* pageout_rate:
- * Rate in pages/second, computed at the end of sampling.
+ * Rate in pages/nanosecond, computed at the end of sampling.
*
* pageout_new_spread:
- * The new value to use for maxfastscan and (perhaps) handspreadpages.
- * Intended to be the number pages that can be scanned per sec using ~10%
- * of a CPU. Calculated after enough samples have been taken.
- * pageout_rate / 10
+ * Initially zero while the system scan rate is measured by
+ * pageout_scanner(), which then sets this value once per system boot after
+ * enough samples have been recorded (pageout_sample_cnt). Once set, this
+ * new value is used for fastscan and handspreadpages.
*/
-
typedef hrtime_t hrrate_t;
-static uint_t pageout_sample_lim = 4;
-static uint_t pageout_sample_cnt = 0;
+static uint64_t pageout_sample_lim = 4;
+static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
+static hrtime_t pageout_sample_etime = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
-static hrtime_t pageout_sample_etime = 0;
-
-/* True if page scanner is first starting up */
+/* True if the page scanner is first starting up */
#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
+/* The current number of page scanner threads */
+static uint_t n_page_scanners = 1;
+/* The number of page scanner threads that are actively scanning. */
+static uint_t pageouts_running;
+
/*
- * Record number of times a pageout_scanner wakeup cycle finished because it
+ * Record number of times a pageout_scanner() wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
* its budgeted number of pages. This is only done when scanning under low
* free memory conditions, not when scanning for zones over their cap.
*/
-uint64_t pageout_timeouts = 0;
+uint64_t pageout_timeouts = 0;
#ifdef VM_STATS
static struct pageoutvmstats_str {
@@ -225,10 +358,57 @@ static struct pageoutvmstats_str {
kmutex_t memavail_lock;
kcondvar_t memavail_cv;
-/*
- * The size of the clock loop.
- */
-#define LOOPPAGES total_pages
+typedef enum pageout_hand {
+ POH_FRONT = 1,
+ POH_BACK,
+} pageout_hand_t;
+
+typedef enum {
+ CKP_INELIGIBLE,
+ CKP_NOT_FREED,
+ CKP_FREED,
+} checkpage_result_t;
+
+static checkpage_result_t checkpage(page_t *, pageout_hand_t);
+
+static struct clockinit {
+ bool ci_init;
+ pgcnt_t ci_lotsfree_min;
+ pgcnt_t ci_lotsfree_max;
+ pgcnt_t ci_lotsfree;
+ pgcnt_t ci_desfree;
+ pgcnt_t ci_minfree;
+ pgcnt_t ci_throttlefree;
+ pgcnt_t ci_pageout_reserve;
+ pgcnt_t ci_maxpgio;
+ pgcnt_t ci_maxfastscan;
+ pgcnt_t ci_fastscan;
+ pgcnt_t ci_slowscan;
+ pgcnt_t ci_handspreadpages;
+ uint_t ci_despagescanners;
+} clockinit = { .ci_init = false };
+
+static inline pgcnt_t
+clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
+{
+ if (value < minimum) {
+ return (minimum);
+ } else if (value > maximum) {
+ return (maximum);
+ } else {
+ return (value);
+ }
+}
+
+static pgcnt_t
+tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
+{
+ if (initval == 0 || initval >= initval_ceiling) {
+ return (defval);
+ } else {
+ return (initval);
+ }
+}
/*
* Local boolean to control scanning when zones are over their cap. Avoids
@@ -242,108 +422,145 @@ kcondvar_t memavail_cv;
static boolean_t zones_over = B_FALSE;
/*
- * Set up the paging constants for the page scanner clock-hand algorithm.
- * Called at startup after the system is initialized and the amount of memory
- * and number of paging devices is known (recalc will be 0). Called again once
- * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples
- * (recalc will be 1).
- *
- * Will also be called after a memory dynamic reconfiguration operation and
- * recalc will be 1 in those cases too.
+ * On large memory systems, multiple instances of the page scanner are run,
+ * each responsible for a separate region of memory. This speeds up page
+ * invalidation under low memory conditions.
*
- * lotsfree is 1/64 of memory, but at least 512K (ha!).
- * desfree is 1/2 of lotsfree.
- * minfree is 1/2 of desfree.
+ * despagescanners can be set in /etc/system or via mdb and it will
+ * be used as a guide for how many page scanners to create; the value
+ * will be adjusted if it is not sensible. Otherwise, the number of
+ * page scanners is determined dynamically based on handspreadpages.
*/
-void
-setupclock(int recalc)
+static void
+recalc_pagescanners(void)
{
- uint_t i;
- pgcnt_t sz, tmp;
+ pgcnt_t sz;
+ uint_t des;
- static spgcnt_t init_lfree, init_dfree, init_mfree;
- static spgcnt_t init_tfree, init_preserve, init_mpgio;
- static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
-
- looppages = LOOPPAGES;
+ /* If the initial calibration has not been done, take no action. */
+ if (pageout_new_spread == 0)
+ return;
/*
- * setupclock can be called to recalculate the paging
- * parameters in the case of dynamic reconfiguration of memory.
- * So to make sure we make the proper calculations, if such a
- * situation should arise, we save away the initial values
- * of each parameter so we can recall them when needed. This
- * way we don't lose the settings an admin might have made
- * through the /etc/system file.
+ * If the desired number of scanners is set in /etc/system
+ * then try to use it.
*/
+ if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
+ despagescanners = clockinit.ci_despagescanners;
- if (!recalc) {
- init_lfree = lotsfree;
- init_dfree = desfree;
- init_mfree = minfree;
- init_tfree = throttlefree;
- init_preserve = pageout_reserve;
- init_mpgio = maxpgio;
- init_mfscan = maxfastscan;
- init_fscan = fastscan;
- init_sscan = slowscan;
- init_hspages = handspreadpages;
+ if (despagescanners != 0) {
+ /*
+ * We have a desired number of page scanners, either from
+ * /etc/system or set via mdb. Try and use it (it will be
+ * clamped below).
+ */
+ des = despagescanners;
+ } else {
+ /*
+ * Calculate the number of desired scanners based on the
+ * system's memory size.
+ *
+ * A 64GiB region size is used as the basis for calculating how
+ * many scanner threads should be created. For systems with up
+ * to 64GiB of RAM, a single thread is used; for very large
+ * memory systems the threads are limited to MAX_PSCAN_THREADS.
+ */
+ sz = btop(64ULL << 30);
+
+ if (sz > looppages) {
+ des = 1;
+ } else {
+ pgcnt_t tmp = sz;
+
+ for (des = 1; tmp < looppages; des++)
+ tmp += sz;
+ }
}
/*
- * Set up thresholds for paging:
+	 * Clamp the number of scanners so that we do not exceed
+	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
+	 * than handspreadpages.
*/
+ des = clamp(des, 1,
+ looppages / (handspreadpages + handspreadpages / 10));
+ despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
+}
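
To make the automatic sizing concrete, a brief worked example (not part of the change): with
4 KiB pages, sz = btop(64 GiB) is 16,777,216 pages. On a 48 GiB system, looppages is smaller
than sz, so a single scanner is used. On a 256 GiB system, looppages is 67,108,864 pages and
the loop steps through 64 GiB regions until memory is covered, yielding des = 4; that value
is then clamped against the handspreadpages-derived ceiling and MAX_PSCAN_THREADS (16).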
- /*
- * Lotsfree is threshold where paging daemon turns on.
- */
- if (init_lfree == 0 || init_lfree >= looppages)
- lotsfree = MAX(looppages / 64, btop(512 * 1024));
- else
- lotsfree = init_lfree;
+/*
+ * Set up the paging constants for the clock algorithm used by
+ * pageout_scanner(), and by the virtual memory system overall. See the
+ * comments at the top of this file for more information about the threshold
+ * values and system responses to memory pressure.
+ *
+ * This routine is called once by main() at startup, after the initial size of
+ * physical memory is determined. It may be called again later if memory is
+ * added to or removed from the system, or if new measurements of the page scan
+ * rate become available.
+ */
+void
+setupclock(void)
+{
+ bool half = (pageout_threshold_style == 1);
+ bool recalc = true;
- /*
- * Desfree is amount of memory desired free.
- * If less than this for extended period, start swapping.
- */
- if (init_dfree == 0 || init_dfree >= lotsfree)
- desfree = lotsfree / 2;
- else
- desfree = init_dfree;
+ looppages = total_pages;
/*
- * Minfree is minimal amount of free memory which is tolerable.
+ * The operator may have provided specific values for some of the
+ * tunables via /etc/system. On our first call, we preserve those
+ * values so that they can be used for subsequent recalculations.
+ *
+ * A value of zero for any tunable means we will use the default
+ * sizing.
*/
- if (init_mfree == 0 || init_mfree >= desfree)
- minfree = desfree / 2;
- else
- minfree = init_mfree;
+ if (!clockinit.ci_init) {
+ clockinit.ci_init = true;
+
+ clockinit.ci_lotsfree_min = lotsfree_min;
+ clockinit.ci_lotsfree_max = lotsfree_max;
+ clockinit.ci_lotsfree = lotsfree;
+ clockinit.ci_desfree = desfree;
+ clockinit.ci_minfree = minfree;
+ clockinit.ci_throttlefree = throttlefree;
+ clockinit.ci_pageout_reserve = pageout_reserve;
+ clockinit.ci_maxpgio = maxpgio;
+ clockinit.ci_maxfastscan = maxfastscan;
+ clockinit.ci_fastscan = fastscan;
+ clockinit.ci_slowscan = slowscan;
+ clockinit.ci_handspreadpages = handspreadpages;
+ clockinit.ci_despagescanners = despagescanners;
- /*
- * Throttlefree is the point at which we start throttling
- * PG_WAIT requests until enough memory becomes available.
- */
- if (init_tfree == 0 || init_tfree >= desfree)
- throttlefree = minfree;
- else
- throttlefree = init_tfree;
+ /*
+ * The first call does not trigger a recalculation, only
+ * subsequent calls.
+ */
+ recalc = false;
+ }
/*
- * Pageout_reserve is the number of pages that we keep in
- * stock for pageout's own use. Having a few such pages
- * provides insurance against system deadlock due to
- * pageout needing pages. When freemem < pageout_reserve,
- * non-blocking allocations are denied to any threads
- * other than pageout and sched. (At some point we might
- * want to consider a per-thread flag like T_PUSHING_PAGES
- * to indicate that a thread is part of the page-pushing
- * dance (e.g. an interrupt thread) and thus is entitled
- * to the same special dispensation we accord pageout.)
+ * Configure paging threshold values. For more details on what each
+ * threshold signifies, see the comments at the top of this file.
*/
- if (init_preserve == 0 || init_preserve >= throttlefree)
- pageout_reserve = throttlefree / 2;
- else
- pageout_reserve = init_preserve;
+ lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
+ btop(LOTSFREE_MAX_DEFAULT));
+ lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
+ btop(LOTSFREE_MIN_DEFAULT));
+
+ lotsfree = tune(clockinit.ci_lotsfree, looppages,
+ clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
+
+ desfree = tune(clockinit.ci_desfree, lotsfree,
+ lotsfree / 2);
+
+ minfree = tune(clockinit.ci_minfree, desfree,
+ half ? desfree / 2 : 3 * desfree / 4);
+
+ throttlefree = tune(clockinit.ci_throttlefree, desfree,
+ minfree);
+
+ pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
+ half ? throttlefree / 2 : 3 * throttlefree / 4);
/*
* Maxpgio thresholds how much paging is acceptable.
@@ -352,143 +569,160 @@ setupclock(int recalc)
*
* XXX - Does not account for multiple swap devices.
*/
- if (init_mpgio == 0)
+ if (clockinit.ci_maxpgio == 0) {
maxpgio = (DISKRPM * 2) / 3;
- else
- maxpgio = init_mpgio;
+ } else {
+ maxpgio = clockinit.ci_maxpgio;
+ }
/*
- * When the system is in a low memory state, the page scan rate varies
- * between fastscan and slowscan based on the amount of free memory
- * available. When only zones are over their memory cap, the scan rate
- * is always fastscan.
- *
- * The fastscan rate should be set based on the number pages that can
- * be scanned per sec using ~10% of a CPU. Since this value depends on
- * the processor, MMU, Ghz etc., it must be determined dynamically.
- *
- * When the scanner first starts up, fastscan will be set to 0 and
- * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages).
- * However, once the scanner has collected enough samples, then fastscan
- * is set to be the smaller of 1/2 of memory (looppages / loopfraction)
- * or maxfastscan (which is set from pageout_new_spread). Thus,
- * MAXHANDSPREADPAGES is irrelevant after the scanner is fully
- * initialized.
- *
- * pageout_new_spread is calculated when the scanner first starts
- * running. During this initial sampling period the nscan_limit
- * is set to the total_pages of system memory. Thus, the scanner could
- * theoretically scan all of memory in one pass. However, each sample
- * is also limited by the %CPU budget. This is controlled by
- * pageout_ticks which is set in schedpaging(). During the sampling
- * period, pageout_ticks is set to max_pageout_ticks. This tick value
- * is derived from the max_percent_cpu (80%) described above. On a
- * system with more than a small amount of memory (~8GB), the scanner's
- * %CPU will be the limiting factor in calculating pageout_new_spread.
- *
- * At the end of the sampling period, the pageout_rate indicates how
- * many pages could be scanned per second. The pageout_new_spread is
- * then set to be 1/10th of that (i.e. approximating 10% of a CPU).
- * Of course, this value could still be more than the physical memory
- * on the system. If so, fastscan is set to 1/2 of memory, as
- * mentioned above.
+ * The clock scan rate varies between fastscan and slowscan
+ * based on the amount of free memory available. Fastscan
+ * rate should be set based on the number pages that can be
+	 * rate should be set based on the number of pages that can be
+ * value depends on the processor, MMU, Mhz etc., it is
+ * difficult to determine it in a generic manner for all
+ * architectures.
*
- * All of this leads up to the setting of handspreadpages, which is
- * set to fastscan. This is the distance, in pages, between the front
- * and back hands during scanning. It will dictate which pages will
- * be considered "hot" on the backhand and which pages will be "cold"
- * and reclaimed
+ * Instead of trying to determine the number of pages scanned
+ * per sec for every processor, fastscan is set to be the smaller
+ * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
+ * time is limited to ~4% of processor time.
*
- * If the scanner is limited by desscan, then at the highest rate it
- * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the
- * scanner is limited by the %CPU, then at the highest rate (20% of a
- * CPU per cycle) the number of pages scanned could be much less.
+ * Setting fastscan to be 1/2 of memory allows pageout to scan
+ * all of memory in ~2 secs. This implies that user pages not
+ * accessed within 1 sec (assuming, handspreadpages == fastscan)
+ * can be reclaimed when free memory is very low. Stealing pages
+ * not accessed within 1 sec seems reasonable and ensures that
+ * active user processes don't thrash.
*
- * Thus, if the scanner is limited by desscan, then the handspreadpages
- * setting means 1sec between the front and back hands, but if the
- * scanner is limited by %CPU, it could be several seconds between the
- * two hands.
+ * Smaller values of fastscan result in scanning fewer pages
+ * every second and consequently pageout may not be able to free
+ * sufficient memory to maintain the minimum threshold. Larger
+ * values of fastscan result in scanning a lot more pages which
+ * could lead to thrashing and higher CPU usage.
*
- * The basic assumption is that at the worst case, stealing pages
- * not accessed within 1 sec seems reasonable and ensures that active
- * user processes don't thrash. This is especially true when the system
- * is in a low memory state.
+ * Fastscan needs to be limited to a maximum value and should not
+ * scale with memory to prevent pageout from consuming too much
+ * time for scanning on slow CPU's and avoid thrashing, as a
+ * result of scanning too many pages, on faster CPU's.
+ * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
+ * (the upper bound for fastscan) based on the average number
+ * of pages that can potentially be scanned in ~1 sec (using ~4%
+ * of the CPU) on some of the following machines that currently
+ * run Solaris 2.x:
*
- * There are some additional factors to consider for the case of
- * scanning when zones are over their cap. In this situation it is
- * also likely that the machine will have a large physical memory which
- * will take many seconds to fully scan (due to the %CPU and desscan
- * limits per cycle). It is probable that there will be few (or 0)
- * pages attributed to these zones in any single scanning cycle. The
- * result is that reclaiming enough pages for these zones might take
- * several additional seconds (this is generally not a problem since
- * the zone physical cap is just a soft cap).
+ * average memory scanned in ~1 sec
*
- * This is similar to the typical multi-processor situation in which
- * pageout is often unable to maintain the minimum paging thresholds
- * under heavy load due to the fact that user processes running on
- * other CPU's can be dirtying memory at a much faster pace than
- * pageout can find pages to free.
+ * 25 Mhz SS1+: 23 Meg
+ * LX: 37 Meg
+ * 50 Mhz SC2000: 68 Meg
*
- * One potential approach to address both of these cases is to enable
- * more than one CPU to run the page scanner, in such a manner that the
- * various clock hands don't overlap. However, this also makes it more
- * difficult to determine the values for fastscan, slowscan and
- * handspreadpages. This is left as a future enhancement, if necessary.
+ * 40 Mhz 486: 26 Meg
+ * 66 Mhz 486: 42 Meg
*
- * When free memory falls just below lotsfree, the scan rate goes from
- * 0 to slowscan (i.e., the page scanner starts running). This
+ * When free memory falls just below lotsfree, the scan rate
+ * goes from 0 to slowscan (i.e., pageout starts running). This
* transition needs to be smooth and is achieved by ensuring that
* pageout scans a small number of pages to satisfy the transient
* memory demand. This is set to not exceed 100 pages/sec (25 per
* wakeup) since scanning that many pages has no noticible impact
* on system performance.
*
- * The swapper is currently used to free up memory when pageout is
- * unable to meet memory demands. It does this by swapping out entire
- * processes. In addition to freeing up memory, swapping also reduces
- * the demand for memory because the swapped out processes cannot
- * run, and thereby consume memory. However, this is a pathological
- * state and performance will generally be considered unacceptable.
+ * In addition to setting fastscan and slowscan, pageout is
+ * limited to using ~4% of the CPU. This results in increasing
+ * the time taken to scan all of memory, which in turn means that
+ * user processes have a better opportunity of preventing their
+ * pages from being stolen. This has a positive effect on
+ * interactive and overall system performance when memory demand
+ * is high.
+ *
+ * Thus, the rate at which pages are scanned for replacement will
+ * vary linearly between slowscan and the number of pages that
+ * can be scanned using ~4% of processor time instead of varying
+ * linearly between slowscan and fastscan.
+ *
+ * Also, the processor time used by pageout will vary from ~1%
+ * at slowscan to ~4% at fastscan instead of varying between
+ * ~1% at slowscan and ~10% at fastscan.
+ *
+ * The values chosen for the various VM parameters (fastscan,
+ * handspreadpages, etc) are not universally true for all machines,
+ * but appear to be a good rule of thumb for the machines we've
+ * tested. They have the following ranges:
+ *
+ * cpu speed: 20 to 70 Mhz
+ * page size: 4K to 8K
+ * memory size: 16M to 5G
+ * page scan rate: 4000 - 17400 4K pages per sec
+ *
+ * The values need to be re-examined for machines which don't
+ * fall into the various ranges (e.g., slower or faster CPUs,
+ * smaller or larger pagesizes etc) shown above.
+ *
+ * On an MP machine, pageout is often unable to maintain the
+ * minimum paging thresholds under heavy load. This is due to
+ * the fact that user processes running on other CPU's can be
+ * dirtying memory at a much faster pace than pageout can find
+ * pages to free. The memory demands could be met by enabling
+ * more than one CPU to run the clock algorithm in such a manner
+ * that the various clock hands don't overlap. This also makes
+ * it more difficult to determine the values for fastscan, slowscan
+ * and handspreadpages.
+ *
+ * The swapper is currently used to free up memory when pageout
+ * is unable to meet memory demands by swapping out processes.
+ * In addition to freeing up memory, swapping also reduces the
+ * demand for memory by preventing user processes from running
+ * and thereby consuming memory.
*/
- if (init_mfscan == 0) {
- if (pageout_new_spread != 0)
+ if (clockinit.ci_maxfastscan == 0) {
+ if (pageout_new_spread != 0) {
maxfastscan = pageout_new_spread;
- else
+ } else {
maxfastscan = MAXHANDSPREADPAGES;
+ }
} else {
- maxfastscan = init_mfscan;
+ maxfastscan = clockinit.ci_maxfastscan;
}
- if (init_fscan == 0) {
+
+ if (clockinit.ci_fastscan == 0) {
fastscan = MIN(looppages / loopfraction, maxfastscan);
} else {
- fastscan = init_fscan;
- if (fastscan > looppages / loopfraction)
- fastscan = looppages / loopfraction;
+ fastscan = clockinit.ci_fastscan;
+ }
+
+ if (fastscan > looppages / loopfraction) {
+ fastscan = looppages / loopfraction;
}
/*
* Set slow scan time to 1/10 the fast scan time, but
* not to exceed maxslowscan.
*/
- if (init_sscan == 0)
+ if (clockinit.ci_slowscan == 0) {
slowscan = MIN(fastscan / 10, maxslowscan);
- else
- slowscan = init_sscan;
- if (slowscan > fastscan / 2)
+ } else {
+ slowscan = clockinit.ci_slowscan;
+ }
+
+ if (slowscan > fastscan / 2) {
slowscan = fastscan / 2;
+ }
/*
- * Handspreadpages is distance (in pages) between front and back
+ * Handspreadpages is the distance (in pages) between front and back
* pageout daemon hands. The amount of time to reclaim a page
* once pageout examines it increases with this distance and
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
- * Since pageout is limited to the %CPU per cycle, setting
- * handspreadpages to be "fastscan" results in the front hand being
- * a few secs (varies based on the processor speed) ahead of the back
- * hand at fastscan rates.
+ * Since pageout is limited to ~4% of the CPU, setting handspreadpages
+ * to be "fastscan" results in the front hand being a few secs
+ * (varies based on the processor speed) ahead of the back hand
+ * at fastscan rates. This distance can be further reduced, if
+ * necessary, by increasing the processor time used by pageout
+ * to be more than ~4% and preferrably not more than ~10%.
*
* As a result, user processes have a much better chance of
* referencing their pages before the back hand examines them.
@@ -496,91 +730,62 @@ setupclock(int recalc)
* the freelist since pageout does not end up freeing pages which
* may be referenced a sec later.
*/
- if (init_hspages == 0)
+ if (clockinit.ci_handspreadpages == 0) {
handspreadpages = fastscan;
- else
- handspreadpages = init_hspages;
+ } else {
+ handspreadpages = clockinit.ci_handspreadpages;
+ }
/*
* Make sure that back hand follows front hand by at least
- * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
- * for the back hand to look at a page during the same wakeup of
- * the pageout daemon in which the front hand cleared its ref bit.
+ * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
+ * back hand to look at a page during the same wakeup of the pageout
+ * daemon in which the front hand cleared its ref bit.
*/
- if (handspreadpages >= looppages)
+ if (handspreadpages >= looppages) {
handspreadpages = looppages - 1;
-
- if (recalc == 0) {
- /*
- * Setup basic values at initialization.
- */
- pscan_region_sz = total_pages;
- des_page_scanners = n_page_scanners = 1;
- reset_hands[0] = B_TRUE;
- return;
}
/*
- * Recalculating
- *
- * We originally set the number of page scanners to 1. Now that we
- * know what the handspreadpages is for a scanner, figure out how many
- * scanners we should run. We want to ensure that the regions don't
- * overlap and that they are not touching.
- *
- * A default 64GB region size is used as the initial value to calculate
- * how many scanner threads we should create on lower memory systems.
- * The idea is to limit the number of threads to a practical value
- * (e.g. a 64GB machine really only needs one scanner thread). For very
- * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
- * threads.
- *
- * The scanner threads themselves are evenly spread out around the
- * memory "clock" in pageout_scanner when we reset the hands, and each
- * thread will scan all of memory.
+ * Establish the minimum and maximum length of time to be spent
+ * scanning pages per wakeup, limiting the scanner duty cycle. The
+ * input percentage values (0-100) must be converted to a fraction of
+ * the number of nanoseconds in a second of wall time, then further
+ * scaled down by the number of scanner wakeups in a second.
*/
- sz = (btop(64ULL * 0x40000000ULL));
- if (sz < handspreadpages) {
- /*
- * 64GB is smaller than the separation between the front
- * and back hands; use double handspreadpages.
- */
- sz = handspreadpages << 1;
- }
- if (sz > total_pages) {
- sz = total_pages;
- }
- /* Record region size for inspection with mdb, otherwise unused */
- pscan_region_sz = sz;
+ min_pageout_nsec = MAX(1,
+ NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
+ max_pageout_nsec = MAX(min_pageout_nsec,
+ NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
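
As a quick sanity check (not part of the change): with the defaults above (min_percent_cpu =
4, max_percent_cpu = 80, SCHEDPAGING_HZ = 4, and NANOSEC = 10^9), min_pageout_nsec works out
to 10 ms and max_pageout_nsec to 200 ms of scanning per quarter-second wakeup, i.e. roughly
4% and 80% of one CPU respectively.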
- tmp = sz;
- for (i = 1; tmp < total_pages; i++) {
- tmp += sz;
- }
+ /*
+ * If not called for recalculation, return and skip the remaining
+ * steps.
+ */
+ if (!recalc)
+ return;
- if (i > MAX_PSCAN_THREADS)
- i = MAX_PSCAN_THREADS;
+ /*
+ * Set a flag to re-evaluate the clock hand positions.
+ */
+ for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
+ reset_hands[i] = B_TRUE;
- des_page_scanners = i;
+ recalc_pagescanners();
}
/*
* Pageout scheduling.
*
* Schedpaging controls the rate at which the page out daemon runs by
- * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING
- * times a second. The pageout_ticks variable controls the percent of one
- * CPU that each page scanner thread should consume (see min_percent_cpu
- * and max_percent_cpu descriptions). The desscan variable records the number
- * of pages pageout should examine in its next pass; schedpaging sets this
- * value based on the amount of currently available memory. In addtition, the
- * nscan variable records the number of pages pageout has examined in its
- * current pass; schedpaging resets this value to zero each time it runs.
+ * setting the global variables nscan and desscan SCHEDPAGING_HZ
+ * times a second. Nscan records the number of pages pageout has examined
+ * in its current pass; schedpaging() resets this value to zero each time
+ * it runs. Desscan records the number of pages pageout should examine
+ * in its next pass; schedpaging() sets this value based on the amount of
+ * currently available memory.
*/
-#define RATETOSCHEDPAGING 4 /* times/second */
-
-/* held while pageout_scanner or schedpaging are modifying shared data */
static kmutex_t pageout_mutex;
/*
@@ -592,7 +797,24 @@ static struct async_reqs *push_list; /* pending reqs */
static kmutex_t push_lock; /* protects req pool */
static kcondvar_t push_cv;
-static int async_list_size = 256; /* number of async request structs */
+/*
+ * If pageout() is stuck on a single push for this many seconds,
+ * pageout_deadman() will assume the system has hit a memory deadlock. If set
+ * to 0, the deadman will have no effect.
+ *
+ * Note that we are only looking for stalls in the calls that pageout() makes
+ * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
+ * I/O, which should not take long unless the underlying strategy call blocks
+ * indefinitely for memory. The actual I/O request happens (or fails) later.
+ */
+uint_t pageout_deadman_seconds = 90;
+
+static uint_t pageout_stucktime = 0;
+static bool pageout_pushing = false;
+static uint64_t pageout_pushcount = 0;
+static uint64_t pageout_pushcount_seen = 0;
+
+static int async_list_size = 8192; /* number of async request structs */
static void pageout_scanner(void *);
@@ -623,153 +845,142 @@ schedpaging(void *arg)
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
- (void) atomic_swap_ulong(&nscan, 0);
- vavail = freemem - deficit;
- if (pageout_new_spread != 0)
- vavail -= needfree;
- if (vavail < 0)
- vavail = 0;
- if (vavail > lotsfree)
- vavail = lotsfree;
+ if (mutex_tryenter(&pageout_mutex)) {
- /*
- * Fix for 1161438 (CRS SPR# 73922). All variables
- * in the original calculation for desscan were 32 bit signed
- * ints. As freemem approaches 0x0 on a system with 1 Gig or
- * more of memory, the calculation can overflow. When this
- * happens, desscan becomes negative and pageout_scanner()
- * stops paging out.
- */
- if ((needfree) && (pageout_new_spread == 0)) {
- /*
- * If we've not yet collected enough samples to
- * calculate a spread, kick into high gear anytime
- * needfree is non-zero. Note that desscan will not be
- * the limiting factor for systems with larger memory;
- * the %CPU will limit the scan. That will also be
- * maxed out below.
- */
- desscan = fastscan / RATETOSCHEDPAGING;
- } else {
- /*
- * Once we've calculated a spread based on system
- * memory and usage, just treat needfree as another
- * form of deficit.
- */
- spgcnt_t faststmp, slowstmp, result;
+ if (pageouts_running != 0)
+ goto out;
- slowstmp = slowscan * vavail;
- faststmp = fastscan * (lotsfree - vavail);
- result = (slowstmp + faststmp) /
- nz(lotsfree) / RATETOSCHEDPAGING;
- desscan = (pgcnt_t)result;
- }
+ /* No pageout scanner threads running. */
+ nscan = 0;
+ vavail = freemem - deficit;
+ if (pageout_new_spread != 0)
+ vavail -= needfree;
+ vavail = clamp(vavail, 0, lotsfree);
- /*
- * If we've not yet collected enough samples to calculate a
- * spread, also kick %CPU to the max.
- */
- if (pageout_new_spread == 0) {
- pageout_ticks = max_pageout_ticks;
- } else {
- pageout_ticks = min_pageout_ticks +
- (lotsfree - vavail) *
- (max_pageout_ticks - min_pageout_ticks) /
- nz(lotsfree);
- }
+ if (needfree > 0 && pageout_new_spread == 0) {
+ /*
+ * If we've not yet collected enough samples to
+ * calculate a spread, use the old logic of kicking
+ * into high gear anytime needfree is non-zero.
+ */
+ desscan = fastscan / SCHEDPAGING_HZ;
+ } else {
+ /*
+ * Once we've calculated a spread based on system
+ * memory and usage, just treat needfree as another
+ * form of deficit.
+ */
+ spgcnt_t faststmp, slowstmp, result;
- if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
- /*
- * We have finished the pagescan initialization and the desired
- * number of page scanners has changed, either because
- * initialization just finished, because of a memory DR, or
- * because des_page_scanners has been modified on the fly (i.e.
- * by mdb). If we need more scanners, start them now, otherwise
- * the excess scanners will terminate on their own when they
- * reset their hands.
- */
- uint_t i;
- uint_t curr_nscan = n_page_scanners;
- pgcnt_t max = total_pages / handspreadpages;
+ slowstmp = slowscan * vavail;
+ faststmp = fastscan * (lotsfree - vavail);
+ result = (slowstmp + faststmp) /
+ nz(lotsfree) / SCHEDPAGING_HZ;
+ desscan = (pgcnt_t)result;
+ }
- if (des_page_scanners > max)
- des_page_scanners = max;
+ pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
+ (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
- if (des_page_scanners > MAX_PSCAN_THREADS) {
- des_page_scanners = MAX_PSCAN_THREADS;
- } else if (des_page_scanners == 0) {
- des_page_scanners = 1;
- }
+ DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
+ pageout_nsec);
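
These SDT probes are purely an observability aid; as an illustration (assuming the usual
convention that double underscores in SDT probe names appear as hyphens from DTrace), the
computed values can be watched with something like
dtrace -n 'sdt:::schedpage-calc { trace(arg0); trace(arg1); }'
where arg0 is desscan and arg1 is pageout_nsec.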
- /*
- * Each thread has its own entry in the reset_hands array, so
- * we don't need any locking in pageout_scanner to check the
- * thread's reset_hands entry. Thus, we use a pre-allocated
- * fixed size reset_hands array and upper limit on the number
- * of pagescan threads.
- *
- * The reset_hands entries need to be true before we start new
- * scanners, but if we're reducing, we don't want a race on the
- * recalculation for the existing threads, so we set
- * n_page_scanners first.
- */
- n_page_scanners = des_page_scanners;
- for (i = 0; i < MAX_PSCAN_THREADS; i++) {
- reset_hands[i] = B_TRUE;
- }
+ if (pageout_new_spread != 0 && despagescanners != 0 &&
+ despagescanners != n_page_scanners) {
+ /*
+		 * We have finished the pagescan initialization and the
+		 * desired number of page scanners has changed, either
+		 * because initialization just finished, because of a
+		 * memory DR, or because despagescanners has been
+		 * modified on the fly (i.e. by mdb).
+ */
+ uint_t i, curr_nscan = n_page_scanners;
+
+ /* Re-validate despagescanners */
+ recalc_pagescanners();
+
+ n_page_scanners = despagescanners;
+
+ for (i = 0; i < MAX_PSCAN_THREADS; i++)
+ reset_hands[i] = B_TRUE;
+
+ /* If we need more scanners, start them now. */
+ if (n_page_scanners > curr_nscan) {
+ for (i = curr_nscan; i < n_page_scanners; i++) {
+ (void) lwp_kernel_create(proc_pageout,
+ pageout_scanner,
+ (void *)(uintptr_t)i, TS_RUN,
+ pageout_pri);
+ }
+ }
- if (des_page_scanners > curr_nscan) {
- /* Create additional pageout scanner threads. */
- for (i = curr_nscan; i < des_page_scanners; i++) {
- (void) lwp_kernel_create(proc_pageout,
- pageout_scanner, (void *)(uintptr_t)i,
- TS_RUN, curthread->t_pri);
+ /*
+ * If the number of scanners has decreased, trigger a
+ * wakeup so that the excess threads will terminate.
+ */
+ if (n_page_scanners < curr_nscan) {
+ WAKE_PAGEOUT_SCANNER();
}
}
- }
-
- zones_over = B_FALSE;
-
- if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
- if (!PAGE_SCAN_STARTUP)
- low_mem_scan++;
- DTRACE_PROBE(schedpage__wake__low);
- WAKE_PAGEOUT_SCANNER();
-
- } else if (zone_num_over_cap > 0) {
- /* One or more zones are over their cap. */
- /* No page limit */
- desscan = total_pages;
+ zones_over = B_FALSE;
- /*
- * Increase the scanning CPU% to the max. This implies
- * 80% of one CPU/sec if the scanner can run each
- * opportunity. Can also be tuned via setting
- * zone_pageout_ticks in /etc/system or with mdb.
- */
- pageout_ticks = (zone_pageout_ticks != 0) ?
- zone_pageout_ticks : max_pageout_ticks;
+ if (PAGE_SCAN_STARTUP) {
+ /*
+ * We still need to measure the rate at which the
+ * system is able to scan pages of memory. Each of
+ * these initial samples is a scan of as much system
+ * memory as practical, regardless of whether or not we
+ * are experiencing memory pressure.
+ */
+ desscan = total_pages;
+ pageout_nsec = max_pageout_nsec;
- zones_over = B_TRUE;
- zone_cap_scan++;
+ DTRACE_PROBE(schedpage__wake__sample);
+ WAKE_PAGEOUT_SCANNER();
+ } else if (freemem < lotsfree + needfree) {
+ /*
+ * We need more memory.
+ */
+ low_mem_scan++;
- DTRACE_PROBE(schedpage__wake__zone);
- WAKE_PAGEOUT_SCANNER();
+ DTRACE_PROBE(schedpage__wake__low);
+ WAKE_PAGEOUT_SCANNER();
+ } else if (zone_num_over_cap > 0) {
+ /*
+			 * One or more zones are over their cap.
+ */
- } else {
- /*
- * There are enough free pages, no need to
- * kick the scanner thread. And next time
- * around, keep more of the `highly shared'
- * pages.
- */
- cv_signal_pageout();
+ /* No page limit */
+ desscan = total_pages;
- mutex_enter(&pageout_mutex);
- if (po_share > MIN_PO_SHARE) {
- po_share >>= 1;
+ /*
+ * Increase the scanning CPU% to the max. This implies
+ * 80% of one CPU/sec if the scanner can run each
+ * opportunity. Can also be tuned via setting
+ * zone_pageout_nsec in /etc/system or with mdb.
+ */
+ pageout_nsec = (zone_pageout_nsec != 0) ?
+ zone_pageout_nsec : max_pageout_nsec;
+
+ zones_over = B_TRUE;
+ zone_cap_scan++;
+
+ DTRACE_PROBE(schedpage__wake__zone);
+ WAKE_PAGEOUT_SCANNER();
+ } else {
+ /*
+ * There are enough free pages, no need to
+ * kick the scanner thread. And next time
+ * around, keep more of the `highly shared'
+ * pages.
+ */
+ cv_signal_pageout();
+ if (po_share > MIN_PO_SHARE) {
+ po_share >>= 1;
+ }
}
+out:
mutex_exit(&pageout_mutex);
}
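
As an aside on the arithmetic above (not part of the change itself): desscan is a
linear interpolation between fastscan and slowscan, keyed on how much of lotsfree
is still available, and pageout_nsec is interpolated the same way between
min_pageout_nsec and max_pageout_nsec.  A minimal user-space sketch with stand-in
values rather than the kernel tunables:

#include <stdio.h>

int
main(void)
{
	/* Stand-in values for slowscan, fastscan, lotsfree, SCHEDPAGING_HZ. */
	long lotsfree = 16384;			/* pages */
	long slowscan = 512, fastscan = 65536;	/* pages per second */
	long schedpaging_hz = 4;		/* schedpaging() wakeups per second */

	for (long vavail = 0; vavail <= lotsfree; vavail += lotsfree / 4) {
		long desscan = (slowscan * vavail +
		    fastscan * (lotsfree - vavail)) / lotsfree /
		    schedpaging_hz;
		(void) printf("vavail %5ld -> desscan %5ld pages per wakeup\n",
		    vavail, desscan);
	}
	return (0);
}
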
@@ -782,61 +993,55 @@ schedpaging(void *arg)
if (kmem_avail() > 0)
cv_broadcast(&memavail_cv);
- (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
+ (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}
pgcnt_t pushes;
ulong_t push_list_size; /* # of requests on pageout queue */
-#define FRONT 1
-#define BACK 2
-
-int dopageout = 1; /* /etc/system tunable to disable page reclamation */
+/*
+ * Paging out should always be enabled. This tunable exists to hold pageout
+ * for debugging purposes. If set to 0, pageout_scanner() will go back to
+ * sleep each time it is woken by schedpaging().
+ */
+uint_t dopageout = 1;
/*
* The page out daemon, which runs as process 2.
*
- * Page out occurs when either:
- * a) there is less than lotsfree pages,
- * b) there are one or more zones over their physical memory cap.
- *
- * The daemon treats physical memory as a circular array of pages and scans the
- * pages using a 'two-handed clock' algorithm. The front hand moves through
- * the pages, clearing the reference bit. The back hand travels a distance
- * (handspreadpages) behind the front hand, freeing the pages that have not
- * been referenced in the time since the front hand passed. If modified, they
- * are first written to their backing store before being freed.
- *
- * In order to make page invalidation more responsive on machines with larger
- * memory, multiple pageout_scanner threads may be created. In this case, the
- * threads are evenly distributed around the the memory "clock face" so that
- * memory can be reclaimed more quickly (that is, there can be large regions in
- * which no pages can be reclaimed by a single thread, leading to lag which
- * causes undesirable behavior such as htable stealing).
+ * The daemon treats physical memory as a circular array of pages and scans
+ * the pages using a 'two-handed clock' algorithm. The front hand moves
+ * through the pages, clearing the reference bit. The back hand travels a
+ * distance (handspreadpages) behind the front hand, freeing the pages that
+ * have not been referenced in the time since the front hand passed. If
+ * modified, they are first written to their backing store before being
+ * freed.
*
- * As long as there are at least lotsfree pages, or no zones over their cap,
- * then pageout_scanner threads are not run. When pageout_scanner threads are
- * running for case (a), all pages are considered for pageout. For case (b),
- * only pages belonging to a zone over its cap will be considered for pageout.
+ * In order to make page invalidation more responsive on machines with
+ * larger memory, multiple pageout_scanner threads may be created. In this
+ * case, each thread is given a segment of the memory "clock face" so that
+ * memory can be reclaimed more quickly.
*
- * There are multiple threads that act on behalf of the pageout process.
- * A set of threads scan pages (pageout_scanner) and frees them up if
- * they don't require any VOP_PUTPAGE operation. If a page must be
- * written back to its backing store, the request is put on a list
- * and the other (pageout) thread is signaled. The pageout thread
- * grabs VOP_PUTPAGE requests from the list, and processes them.
- * Some filesystems may require resources for the VOP_PUTPAGE
- * operations (like memory) and hence can block the pageout
- * thread, but the pageout_scanner threads can still operate. There is still
- * no guarantee that memory deadlocks cannot occur.
+ * Scanning occurs when either (a) there are fewer than lotsfree free
+ * pages, or (b) one or more zones are over their physical memory cap;
+ * otherwise the pageout_scanner threads are not run. When scanning for
+ * case (a), all pages are considered for pageout. For case (b), only
+ * pages belonging to a zone over its cap will be considered for pageout.
*
- * The pageout_scanner parameters are determined in schedpaging().
+ * There are multiple threads that act on behalf of the pageout process.
+ * A set of threads (pageout_scanner) scans pages and frees up those that
+ * do not require any VOP_PUTPAGE operation. If a page must be written back
+ * to its backing store, the request is put on a list and the other
+ * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
+ * requests from the list, and processes them. Some filesystems may require
+ * resources for the VOP_PUTPAGE operations (like memory) and hence can
+ * block the pageout thread, but the scanner threads can still operate.
+ * There is still no guarantee that memory deadlocks cannot occur.
*/
void
pageout()
{
struct async_reqs *arg;
- pri_t pageout_pri;
int i;
pgcnt_t max_pushes;
callb_cpr_t cprinfo;
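
The block comment above is the heart of the algorithm.  As a purely illustrative
sketch (stand-in data, not kernel code), the two-handed clock reduces to: the
front hand clears reference bits, and the back hand, a fixed distance behind,
frees whatever is still unreferenced when it arrives.

#include <stdbool.h>
#include <stdio.h>

#define	NPAGES		16
#define	HANDSPREAD	4	/* stand-in for handspreadpages */

int
main(void)
{
	bool referenced[NPAGES];
	int front = HANDSPREAD, back = 0, freed = 0;

	for (int i = 0; i < NPAGES; i++)
		referenced[i] = (i % 3 != 0);	/* arbitrary access pattern */

	for (int step = 0; step < NPAGES; step++) {
		referenced[front] = false;	/* front hand clears the ref bit */
		if (!referenced[back])		/* untouched since the front hand? */
			freed++;		/* back hand would free this page */
		front = (front + 1) % NPAGES;
		back = (back + 1) % NPAGES;
	}
	(void) printf("freed %d of %d pages in one lap\n", freed, NPAGES);
	return (0);
}
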
@@ -863,14 +1068,16 @@ pageout()
kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
req_freelist = push_req;
- for (i = 0; i < async_list_size - 1; i++)
+ for (i = 0; i < async_list_size - 1; i++) {
push_req[i].a_next = &push_req[i + 1];
+ }
- pageout_pri = curthread->t_pri;
+ pageout_pri = curthread->t_pri - 1;
- /* Create the (first) pageout scanner thread. */
- (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0,
- TS_RUN, pageout_pri - 1);
+ /* Create the first pageout scanner thread. */
+ (void) lwp_kernel_create(proc_pageout, pageout_scanner,
+ (void *)0, /* this is instance 0, not NULL */
+ TS_RUN, pageout_pri);
/*
* kick off pageout scheduler.
@@ -888,7 +1095,7 @@ pageout()
/*
* Limit pushes to avoid saturating pageout devices.
*/
- max_pushes = maxpgio / RATETOSCHEDPAGING;
+ max_pushes = maxpgio / SCHEDPAGING_HZ;
CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
for (;;) {
@@ -902,9 +1109,11 @@ pageout()
}
push_list = arg->a_next;
arg->a_next = NULL;
+ pageout_pushing = true;
mutex_exit(&push_lock);
DTRACE_PROBE(pageout__push);
+
if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
pushes++;
@@ -914,6 +1123,8 @@ pageout()
VN_RELE(arg->a_vp);
mutex_enter(&push_lock);
+ pageout_pushing = false;
+ pageout_pushcount++;
arg->a_next = req_freelist; /* back on freelist */
req_freelist = arg;
push_list_size--;
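
The pageout_pushing flag and pageout_pushcount counter added in this hunk exist so
that pageout_deadman(), later in this diff, can tell a pusher stuck inside a single
VOP_PUTPAGE call apart from one that is merely busy.  A minimal sketch of that
pattern, using hypothetical names rather than the kernel symbols:

#include <stdbool.h>

static volatile bool		pushing;	/* a push is in flight */
static volatile unsigned long	pushcount;	/* completed pushes */

static unsigned long	pushcount_seen;
static unsigned int	stuck_seconds;

/*
 * Called once per second.  Returns true once the same in-flight push has
 * been observed for 'limit' consecutive checks.
 */
static bool
push_watchdog(unsigned int limit)
{
	if (!pushing || pushcount != pushcount_seen) {
		/* Idle, or progress since the last check: reset. */
		stuck_seconds = 0;
		pushcount_seen = pushcount;
		return (false);
	}
	return (++stuck_seconds >= limit);
}

int
main(void)
{
	pushing = true;			/* simulate a push that never completes */
	for (int sec = 1; sec <= 5; sec++) {
		if (push_watchdog(5))
			return (1);	/* the kernel would panic here */
	}
	return (0);
}
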
@@ -927,134 +1138,172 @@ pageout()
static void
pageout_scanner(void *a)
{
- struct page *fronthand, *backhand;
- uint_t count, iter = 0;
+ struct page *fronthand, *backhand, *fronthandstart;
+ struct page *regionstart, *regionend;
+ uint_t laps;
callb_cpr_t cprinfo;
- pgcnt_t nscan_cnt, nscan_limit;
+ pgcnt_t nscan_cnt, tick;
pgcnt_t pcount;
- uint_t inst = (uint_t)(uintptr_t)a;
+ bool bhwrapping, fhwrapping;
hrtime_t sample_start, sample_end;
- clock_t pageout_lbolt;
- kmutex_t pscan_mutex;
+ uint_t inst = (uint_t)(uintptr_t)a;
VERIFY3U(inst, <, MAX_PSCAN_THREADS);
- mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
+ CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
+ mutex_enter(&pageout_mutex);
- CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
- mutex_enter(&pscan_mutex);
+ /*
+	 * The restart case does not attempt to point the hands at roughly
+	 * the right point, on the assumption that things will settle down
+	 * after one circuit and that restarts shouldn't happen very often.
+ */
+ reset_hands[inst] = B_TRUE;
- min_pageout_ticks = MAX(1,
- ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
- max_pageout_ticks = MAX(min_pageout_ticks,
- ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
+ pageouts_running++;
+ mutex_exit(&pageout_mutex);
loop:
cv_signal_pageout();
+ mutex_enter(&pageout_mutex);
+ pageouts_running--;
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&proc_pageout->p_cv, &pscan_mutex);
- CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
+ cv_wait(&proc_pageout->p_cv, &pageout_mutex);
+ CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
+ pageouts_running++;
+ mutex_exit(&pageout_mutex);
- if (!dopageout)
+ /*
+ * Check if pageout has been disabled for debugging purposes.
+ */
+ if (!dopageout) {
goto loop;
+ }
+ /*
+ * One may reset the clock hands and scanned region for debugging
+ * purposes. Hands will also be reset on first thread startup, if
+ * the number of scanning threads (n_page_scanners) changes, or if
+ * memory is added to, or removed from, the system.
+ */
if (reset_hands[inst]) {
struct page *first;
- pgcnt_t offset = total_pages / n_page_scanners;
reset_hands[inst] = B_FALSE;
+
if (inst >= n_page_scanners) {
/*
- * The desired number of page scanners has been
- * reduced and this instance is no longer wanted.
- * Exit the lwp.
- */
+ * The desired number of page scanners has been
+ * reduced and this instance is no longer wanted.
+ * Exit the lwp.
+ */
VERIFY3U(inst, !=, 0);
- mutex_exit(&pscan_mutex);
+ DTRACE_PROBE1(pageout__exit, uint_t, inst);
+ mutex_enter(&pageout_mutex);
+ pageouts_running--;
+ mutex_exit(&pageout_mutex);
mutex_enter(&curproc->p_lock);
lwp_exit();
+ /* NOTREACHED */
}
+ first = page_first();
+
/*
- * The reset case repositions the hands at the proper place
- * on the memory clock face to prevent creep into another
- * thread's active region or when the number of threads has
- * changed.
- *
- * Set the two clock hands to be separated by a reasonable
- * amount, but no more than 360 degrees apart.
- *
- * If inst == 0, backhand starts at first page, otherwise
- * it is (inst * offset) around the memory "clock face" so that
- * we spread out each scanner instance evenly.
+ * Each scanner thread gets its own sector of the memory
+ * clock face.
*/
- first = page_first();
- backhand = page_nextn(first, offset * inst);
- if (handspreadpages >= total_pages) {
- fronthand = page_nextn(backhand, total_pages - 1);
+ pgcnt_t span, offset;
+
+ span = looppages / n_page_scanners;
+ VERIFY3U(span, >, handspreadpages);
+
+ offset = inst * span;
+ regionstart = page_nextn(first, offset);
+ if (inst == n_page_scanners - 1) {
+ /* The last instance goes up to the last page */
+ regionend = page_nextn(first, looppages - 1);
} else {
- fronthand = page_nextn(backhand, handspreadpages);
+ regionend = page_nextn(regionstart, span - 1);
}
+
+ backhand = regionstart;
+ fronthand = page_nextn(backhand, handspreadpages);
+ tick = 1;
+
+ bhwrapping = fhwrapping = B_FALSE;
+
+ DTRACE_PROBE4(pageout__reset, uint_t, inst,
+ pgcnt_t, regionstart, pgcnt_t, regionend,
+ pgcnt_t, fronthand);
}
/*
- * This CPU kstat is only incremented here and we're obviously on this
- * CPU, so no lock.
+ * This CPU kstat is only incremented here and we're obviously
+ * on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
- count = 0;
- /* Kernel probe */
- TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
- tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
+ /*
+ * Keep track of the number of times we have scanned all the way around
+ * the loop on this wakeup.
+ */
+ laps = 0;
- pcount = 0;
+ /*
+ * Track the number of pages visited during this scan so that we can
+ * periodically measure our duty cycle.
+ */
nscan_cnt = 0;
- if (PAGE_SCAN_STARTUP) {
- nscan_limit = total_pages;
- } else {
- nscan_limit = desscan;
- }
+ pcount = 0;
+
+ DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
+ hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
- DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
- page_t *, backhand, page_t *, fronthand);
+ /*
+ * Record the initial position of the front hand for this cycle so
+ * that we can detect when the hand wraps around.
+ */
+ fronthandstart = fronthand;
- pageout_lbolt = ddi_get_lbolt();
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
- * Only scan while at least one of these is true:
- * 1) one or more zones is over its cap
- * 2) there is not enough free memory
- * 3) during page scan startup when determining sample data
*/
- while (nscan_cnt < nscan_limit &&
- (zones_over ||
- freemem < lotsfree + needfree ||
- PAGE_SCAN_STARTUP)) {
- int rvfront, rvback;
+ while (nscan_cnt < desscan) {
+ checkpage_result_t rvfront, rvback;
+
+ /*
+ * Only scan while at least one of these is true:
+ * 1) one or more zones is over its cap
+ * 2) there is not enough free memory
+ * 3) during page scan startup when determining sample data
+ */
+ if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
+ !zones_over) {
+ /*
+ * We are not sampling and enough memory has become
+ * available that scanning is no longer required.
+ */
+ DTRACE_PROBE1(pageout__memfree, uint_t, inst);
+ break;
+ }
- DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
+ DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
/*
- * Check to see if we have exceeded our %CPU budget
- * for this wakeup, but not on every single page visited,
- * just every once in a while.
+ * Periodically check to see if we have exceeded the CPU duty
+ * cycle for a single wakeup.
*/
if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
- clock_t pageout_cycle_ticks;
+ hrtime_t pageout_cycle_nsec;
- pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
- if (pageout_cycle_ticks >= pageout_ticks) {
- /*
- * This is where we normally break out of the
- * loop when scanning zones or sampling.
- */
- if (!zones_over) {
+ pageout_cycle_nsec = gethrtime() - sample_start;
+ if (pageout_cycle_nsec >= pageout_nsec) {
+ if (!zones_over)
atomic_inc_64(&pageout_timeouts);
- }
DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
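
The reset case above carves the clock face into one contiguous sector per scanner:
span = looppages / n_page_scanners, with the final instance absorbing any
remainder.  A small sketch of that arithmetic with stand-in numbers:

#include <stdio.h>

int
main(void)
{
	/* Stand-in values; looppages and the scanner count vary by system. */
	unsigned long looppages = 1000000;
	unsigned int nscanners = 3;
	unsigned long span = looppages / nscanners;

	for (unsigned int inst = 0; inst < nscanners; inst++) {
		unsigned long start = inst * span;
		unsigned long end = (inst == nscanners - 1) ?
		    looppages - 1 : start + span - 1;
		(void) printf("scanner %u: pages [%lu, %lu]\n",
		    inst, start, end);
	}
	return (0);
}
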
@@ -1062,12 +1311,14 @@ loop:
/*
* If checkpage manages to add a page to the free list,
- * we give ourselves another couple of trips around memory.
+ * we give ourselves another couple of trips around the loop.
*/
- if ((rvfront = checkpage(fronthand, FRONT)) == 1)
- count = 0;
- if ((rvback = checkpage(backhand, BACK)) == 1)
- count = 0;
+ if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
+ laps = 0;
+ }
+ if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
+ laps = 0;
+ }
++pcount;
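
The duty-cycle check a few lines above compares elapsed high-resolution time
against pageout_nsec, and does so only every PAGES_POLL_MASK + 1 pages so that
the common path stays cheap.  A rough user-space approximation (hypothetical
names, CLOCK_MONOTONIC standing in for gethrtime()):

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define	POLL_MASK	15	/* stand-in for PAGES_POLL_MASK */

static uint64_t
now_nsec(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
}

/*
 * Returns true when this is a polling iteration and the scan has already
 * consumed its nanosecond budget for this wakeup.
 */
static bool
over_budget(uint64_t pcount, uint64_t scan_start, uint64_t budget_nsec)
{
	if ((pcount & POLL_MASK) != POLL_MASK)
		return (false);
	return (now_nsec() - scan_start >= budget_nsec);
}

int
main(void)
{
	uint64_t start = now_nsec();
	uint64_t pcount = 0;

	while (!over_budget(++pcount, start, 5000000))	/* 5ms budget */
		continue;
	return (0);
}
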
@@ -1080,25 +1331,35 @@ loop:
/*
* Don't include ineligible pages in the number scanned.
*/
- if (rvfront != -1 || rvback != -1)
+ if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
nscan_cnt++;
+ }
+
+ if (bhwrapping) {
+ backhand = regionstart;
+ bhwrapping = B_FALSE;
+ } else {
+ backhand = page_nextn(backhand, tick);
+ if (backhand == regionend)
+ bhwrapping = B_TRUE;
+ }
- backhand = page_next(backhand);
+ if (fhwrapping) {
+ fronthand = regionstart;
+ fhwrapping = B_FALSE;
+ } else {
+ fronthand = page_nextn(fronthand, tick);
+ if (fronthand == regionend)
+ fhwrapping = B_TRUE;
+ }
/*
- * backhand update and wraparound check are done separately
- * because lint barks when it finds an empty "if" body
+ * The front hand has wrapped around during this wakeup.
*/
-
- if ((fronthand = page_next(fronthand)) == page_first()) {
- DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
-
- /*
- * Every 64 wraps we reposition our hands within our
- * region to prevent creep into another thread.
- */
- if ((++iter % pageout_reset_cnt) == 0)
- reset_hands[inst] = B_TRUE;
+ if (fronthand == fronthandstart) {
+ laps++;
+ DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
+ uint_t, laps);
/*
* This CPU kstat is only incremented here and we're
@@ -1107,96 +1368,134 @@ loop:
CPU_STATS_ADDQ(CPU, vm, rev, 1);
/*
- * If scanning because the system is low on memory,
* then when we wraparound memory we want to try to
* reclaim more pages.
* If scanning only because zones are over their cap,
* then wrapping is common and we simply keep going.
- */
- if (freemem < lotsfree + needfree && ++count > 1) {
+ */
+ if (laps > 1 && freemem < lotsfree + needfree) {
/*
- * The system is low on memory.
* Extremely unlikely, but it happens.
- * We went around memory at least once
- * and didn't reclaim enough.
+ * We went around the loop at least once
+			 * and didn't reclaim enough memory.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
- mutex_enter(&pageout_mutex);
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
- mutex_exit(&pageout_mutex);
} else {
- /*
- * Really a "goto loop", but if someone
- * is tracing or TNF_PROBE_ing, hit
- * those probes first.
- */
- mutex_exit(&pageout_mutex);
break;
}
}
}
}
- atomic_add_long(&nscan, nscan_cnt);
-
sample_end = gethrtime();
+ atomic_add_long(&nscan, nscan_cnt);
- DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
- uint_t, inst);
-
- /* Kernel probe */
- TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
- tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free,
- freemem);
+ DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
+	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);
/*
- * The following two blocks are only relevant when the scanner is
- * first started up. After the scanner runs for a while, neither of
- * the conditions will ever be true again.
- *
* The global variables used below are only modified by this thread and
* only during initial scanning when there is a single page scanner
- * thread running. Thus, we don't use any locking.
+ * thread running.
*/
- if (PAGE_SCAN_STARTUP) {
+ if (pageout_new_spread == 0) {
VERIFY3U(inst, ==, 0);
- pageout_sample_pages += pcount;
- pageout_sample_etime += sample_end - sample_start;
- ++pageout_sample_cnt;
- } else if (pageout_new_spread == 0) {
- uint_t i;
+ if (PAGE_SCAN_STARTUP) {
+ /*
+ * Continue accumulating samples until we have enough
+ * to get a reasonable value for average scan rate.
+ */
+ pageout_sample_pages += pcount;
+ pageout_sample_etime += sample_end - sample_start;
+ ++pageout_sample_cnt;
+ }
+ if (!PAGE_SCAN_STARTUP) {
+ /*
+ * We have enough samples, set the spread.
+ */
+ pageout_rate = (hrrate_t)pageout_sample_pages *
+ (hrrate_t)(NANOSEC) / pageout_sample_etime;
+ pageout_new_spread = pageout_rate / 10;
+ setupclock();
+ }
+ }
+
+ goto loop;
+}
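
The tail of pageout_scanner() above turns the startup samples into a scan rate and
an initial spread: pageout_rate = pageout_sample_pages * NANOSEC /
pageout_sample_etime, and pageout_new_spread = pageout_rate / 10.  Worked through
with stand-in sample numbers (not measured values):

#include <stdio.h>

int
main(void)
{
	unsigned long long sample_pages = 2000000;	/* pages visited */
	unsigned long long sample_etime = 500000000;	/* ns spent scanning */
	unsigned long long nanosec = 1000000000ULL;

	unsigned long long rate = sample_pages * nanosec / sample_etime;
	unsigned long long spread = rate / 10;

	(void) printf("scan rate %llu pages/sec, new spread %llu pages\n",
	    rate, spread);
	return (0);
}
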
+
+/*
+ * The pageout deadman is run once per second by clock().
+ */
+void
+pageout_deadman(void)
+{
+ if (panicstr != NULL) {
/*
- * We have run enough samples, set the spread.
+ * There is no pageout after panic.
*/
- VERIFY3U(inst, ==, 0);
- pageout_rate = (hrrate_t)pageout_sample_pages *
- (hrrate_t)(NANOSEC) / pageout_sample_etime;
- pageout_new_spread = pageout_rate / 10;
- setupclock(1);
+ return;
}
- goto loop;
+ if (pageout_deadman_seconds == 0) {
+ /*
+ * The deadman is not enabled.
+ */
+ return;
+ }
+
+ if (!pageout_pushing) {
+ goto reset;
+ }
+
+ /*
+ * We are pushing a page. Check to see if it is the same call we saw
+ * last time we looked:
+ */
+ if (pageout_pushcount != pageout_pushcount_seen) {
+ /*
+ * It is a different call from the last check, so we are not
+ * stuck.
+ */
+ goto reset;
+ }
+
+ if (++pageout_stucktime >= pageout_deadman_seconds) {
+ panic("pageout_deadman: stuck pushing the same page for %d "
+ "seconds (freemem is %lu)", pageout_deadman_seconds,
+ freemem);
+ }
+
+ return;
+
+reset:
+ /*
+ * Reset our tracking state to reflect that we are not stuck:
+ */
+ pageout_stucktime = 0;
+ pageout_pushcount_seen = pageout_pushcount;
}
/*
* Look at the page at hand. If it is locked (e.g., for physical i/o),
* system (u., page table) or free, then leave it alone. Otherwise,
* if we are running the front hand, turn off the page's reference bit.
- * If running the back hand, check whether the page has been reclaimed.
- * If not, free the page, pushing it to disk first if necessary.
+ * If the proc is over maxrss, we take it. If running the back hand,
+ * check whether the page has been reclaimed. If not, free the page,
+ * pushing it to disk first if necessary.
*
* Return values:
- * -1 if the page is not a candidate at all,
- * 0 if not freed, or
- * 1 if we freed it.
+ * CKP_INELIGIBLE if the page is not a candidate at all,
+ * CKP_NOT_FREED if the page was not freed, or
+ * CKP_FREED if we freed it.
*/
-static int
-checkpage(struct page *pp, int whichhand)
+static checkpage_result_t
+checkpage(struct page *pp, pageout_hand_t whichhand)
{
int ppattr;
int isfs = 0;
@@ -1206,7 +1505,7 @@ checkpage(struct page *pp, int whichhand)
/*
* Skip pages:
- * - associated with the kernel vnode since
+ * - associated with the kernel vnode since
* they are always "exclusively" locked.
* - that are free
* - that are shared more than po_share'd times
@@ -1218,21 +1517,21 @@ checkpage(struct page *pp, int whichhand)
if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
hat_page_checkshare(pp, po_share)) {
- return (-1);
+ return (CKP_INELIGIBLE);
}
if (!page_trylock(pp, SE_EXCL)) {
/*
* Skip the page if we can't acquire the "exclusive" lock.
*/
- return (-1);
+ return (CKP_INELIGIBLE);
} else if (PP_ISFREE(pp)) {
/*
* It became free between the above check and our actually
- * locking the page. Oh, well there will be other pages.
+ * locking the page. Oh well, there will be other pages.
*/
page_unlock(pp);
- return (-1);
+ return (CKP_INELIGIBLE);
}
/*
@@ -1242,7 +1541,7 @@ checkpage(struct page *pp, int whichhand)
*/
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
- return (-1);
+ return (CKP_INELIGIBLE);
}
if (zones_over) {
@@ -1251,11 +1550,11 @@ checkpage(struct page *pp, int whichhand)
if (pp->p_zoneid == ALL_ZONES ||
zone_pdata[pp->p_zoneid].zpers_over == 0) {
/*
- * Cross-zone shared page, or zone not over it's cap.
- * Leave the page alone.
- */
+			 * Cross-zone shared page, or zone not over its cap.
+ * Leave the page alone.
+ */
page_unlock(pp);
- return (-1);
+ return (CKP_INELIGIBLE);
}
zid = pp->p_zoneid;
}
@@ -1263,7 +1562,6 @@ checkpage(struct page *pp, int whichhand)
/*
* Maintain statistics for what we are freeing
*/
-
if (pp->p_vnode != NULL) {
if (pp->p_vnode->v_flag & VVMEXEC)
isexec = 1;
@@ -1277,34 +1575,44 @@ checkpage(struct page *pp, int whichhand)
* The back hand examines the REF bit and always considers
* SHARED pages as referenced.
*/
- if (whichhand == FRONT)
+ if (whichhand == POH_FRONT) {
pagesync_flag = HAT_SYNC_ZERORM;
- else
+ } else {
pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
HAT_SYNC_STOPON_SHARED;
+ }
ppattr = hat_pagesync(pp, pagesync_flag);
recheck:
/*
- * If page is referenced; fronthand makes unreferenced and reclaimable.
- * For the backhand, a process referenced the page since the front hand
- * went by, so it's not a candidate for freeing up.
+	 * If the page is referenced, make it unreferenced but reclaimable.
+ * If this page is not referenced, then it must be reclaimable
+ * and we can add it to the free list.
*/
if (ppattr & P_REF) {
- DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand);
- if (whichhand == FRONT) {
+ DTRACE_PROBE2(pageout__isref, page_t *, pp,
+ pageout_hand_t, whichhand);
+
+ if (whichhand == POH_FRONT) {
+ /*
+ * Checking of rss or madvise flags needed here...
+ *
+ * If not "well-behaved", fall through into the code
+ * for not referenced.
+ */
hat_clrref(pp);
}
+
+ /*
+ * Somebody referenced the page since the front
+ * hand went by, so it's not a candidate for
+ * freeing up.
+ */
page_unlock(pp);
- return (0);
+ return (CKP_NOT_FREED);
}
- /*
- * This page is not referenced, so it must be reclaimable and we can
- * add it to the free list. This can be done by either hand.
- */
-
VM_STAT_ADD(pageoutvmstats.checkpage[0]);
/*
@@ -1315,31 +1623,32 @@ recheck:
if (!page_try_demote_pages(pp)) {
VM_STAT_ADD(pageoutvmstats.checkpage[1]);
page_unlock(pp);
- return (-1);
+ return (CKP_INELIGIBLE);
}
+
ASSERT(pp->p_szc == 0);
VM_STAT_ADD(pageoutvmstats.checkpage[2]);
+
/*
- * since page_try_demote_pages() could have unloaded some
+ * Since page_try_demote_pages() could have unloaded some
* mappings it makes sense to reload ppattr.
*/
ppattr = hat_page_getattr(pp, P_MOD | P_REF);
}
/*
- * If the page is currently dirty, we have to arrange
- * to have it cleaned before it can be freed.
+ * If the page is currently dirty, we have to arrange to have it
+ * cleaned before it can be freed.
*
* XXX - ASSERT(pp->p_vnode != NULL);
*/
- if ((ppattr & P_MOD) && pp->p_vnode) {
+ if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
struct vnode *vp = pp->p_vnode;
u_offset_t offset = pp->p_offset;
/*
- * Note: There is no possibility to test for process being
- * swapped out or about to exit since we can't get back to
- * process(es) from the page.
+ * XXX - Test for process being swapped out or about to exit?
+ * [Can't get back to process(es) using the page.]
*/
/*
@@ -1351,34 +1660,33 @@ recheck:
page_unlock(pp);
/*
- * Queue i/o request for the pageout thread.
+ * Queue I/O request for the pageout thread.
*/
if (!queue_io_request(vp, offset)) {
VN_RELE(vp);
- return (0);
+ return (CKP_NOT_FREED);
}
if (isfs) {
zone_pageout_stat(zid, ZPO_DIRTY);
} else {
zone_pageout_stat(zid, ZPO_ANONDIRTY);
}
- return (1);
+ return (CKP_FREED);
}
/*
- * Now we unload all the translations,
- * and put the page back on to the free list.
- * If the page was used (referenced or modified) after
- * the pagesync but before it was unloaded we catch it
- * and handle the page properly.
+ * Now we unload all the translations and put the page back on to the
+ * free list. If the page was used (referenced or modified) after the
+ * pagesync but before it was unloaded we catch it and handle the page
+ * properly.
*/
- DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand);
+ DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
ppattr = hat_page_getattr(pp, P_MOD | P_REF);
- if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
+ if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
goto recheck;
+ }
- /*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_FREE, 0, kcred);
CPU_STATS_ADD_K(vm, dfree, 1);
@@ -1395,7 +1703,7 @@ recheck:
zone_pageout_stat(zid, ZPO_ANON);
}
- return (1); /* freed a page! */
+ return (CKP_FREED);
}
/*
diff --git a/usr/src/uts/common/os/watchpoint.c b/usr/src/uts/common/os/watchpoint.c
index eee612ef93..24db9637d4 100644
--- a/usr/src/uts/common/os/watchpoint.c
+++ b/usr/src/uts/common/os/watchpoint.c
@@ -821,7 +821,6 @@ watch_xcopyin(const void *uaddr, void *kaddr, size_t count)
count -= part;
}
-error:
/* if we hit a watched address, do the watchpoint logic */
if (watchcode &&
(!sys_watchpoint(vaddr, watchcode, ta) ||