Diffstat (limited to 'usr/src/uts/common/os')
-rw-r--r--   usr/src/uts/common/os/bio.c             10
-rw-r--r--   usr/src/uts/common/os/clock.c            6
-rw-r--r--   usr/src/uts/common/os/clock_highres.c   36
-rw-r--r--   usr/src/uts/common/os/contract.c        50
-rw-r--r--   usr/src/uts/common/os/core.c             8
-rw-r--r--   usr/src/uts/common/os/cred.c             8
-rw-r--r--   usr/src/uts/common/os/cyclic.c           6
-rw-r--r--   usr/src/uts/common/os/exit.c            15
-rw-r--r--   usr/src/uts/common/os/kstat_fr.c        18
-rw-r--r--   usr/src/uts/common/os/logsubr.c          3
-rw-r--r--   usr/src/uts/common/os/msacct.c          36
-rw-r--r--   usr/src/uts/common/os/policy.c          10
-rw-r--r--   usr/src/uts/common/os/priv_defs          4
-rw-r--r--   usr/src/uts/common/os/vmem.c             2
-rw-r--r--   usr/src/uts/common/os/zone.c           789
15 files changed, 923 insertions, 78 deletions
diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 0db01f80d7..c3d04e5508 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 451c9db48c..3f4dd63c82 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -23,6 +23,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -66,6 +67,7 @@ #include <sys/ddi_timer.h> #include <sys/random.h> #include <sys/modctl.h> +#include <sys/zone.h> /* * for NTP support @@ -1158,6 +1160,10 @@ loadavg_update() } while ((cpupart = cpupart->cp_next) != cp_list_head); + /* + * Third pass totals up per-zone statistics. + */ + zone_loadavg_update(); } /* diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index e097f355ec..7870617a26 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ #include <sys/timer.h> #include <sys/systm.h> @@ -112,6 +114,25 @@ clock_highres_timer_settime(itimer_t *it, int flags, cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); + if (cyctime.cyt_when != 0 && cyctime.cyt_interval == 0 && + it->it_itime.it_interval.tv_sec == 0 && + it->it_itime.it_interval.tv_nsec == 0 && + (cyc = *cycp) != CYCLIC_NONE) { + /* + * If our existing timer is a one-shot and our new timer is a + * one-shot, we'll save ourselves a world of grief and just + * reprogram the cyclic. 
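This fast path matters most to programs that re-arm a one-shot CLOCK_HIGHRES timer over and over: after this change, each timer_settime() reprograms the existing cyclic instead of tearing it down and rebuilding it. A minimal user-level sketch of such a caller, assuming the process holds the proc_clock_highres privilege needed to create CLOCK_HIGHRES timers:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/*
 * Re-arm a one-shot CLOCK_HIGHRES timer in a loop.  With the change
 * above, every timer_settime() after the first reprograms the
 * existing cyclic rather than removing and re-creating it.
 */
int
main(void)
{
	timer_t tid;
	struct sigevent ev;
	struct itimerspec its;
	int i;

	(void) signal(SIGALRM, SIG_IGN);	/* discard expirations */

	(void) memset(&ev, 0, sizeof (ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGALRM;

	if (timer_create(CLOCK_HIGHRES, &ev, &tid) != 0) {
		perror("timer_create");
		return (1);
	}

	(void) memset(&its, 0, sizeof (its));	/* zero interval: one-shot */
	its.it_value.tv_nsec = 500000;		/* fire 500 usec from now */

	for (i = 0; i < 1000; i++) {
		if (timer_settime(tid, 0, &its, NULL) != 0) {
			perror("timer_settime");
			return (1);
		}
	}

	(void) timer_delete(tid);
	return (0);
}

On older releases this links with -lrt; on current illumos the timer calls live in libc.
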
+ */ + it->it_itime = *when; + + if (!(flags & TIMER_ABSTIME)) + cyctime.cyt_when += gethrtime(); + + hrt2ts(cyctime.cyt_when, &it->it_itime.it_value); + (void) cyclic_reprogram(cyc, cyctime.cyt_when); + return (0); + } + mutex_enter(&cpu_lock); if ((cyc = *cycp) != CYCLIC_NONE) { cyclic_remove(cyc); @@ -162,17 +183,14 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_interval == 0) { /* - * If this is a one-shot, then we set the interval to assure - * that the cyclic will next fire INT64_MAX nanoseconds after - * boot (which corresponds to over 292 years -- yes, Buck Rogers - * may have his 292-year-uptime-Solaris box malfunction). If - * this timer is never touched, this cyclic will simply - * consume space in the cyclic subsystem. As soon as + * If this is a one-shot, then we set the interval to be + * inifinite. If this timer is never touched, this cyclic will + * simply consume space in the cyclic subsystem. As soon as * timer_settime() or timer_delete() is called, the cyclic is * removed (so it's not possible to run the machine out * of resources by creating one-shots). */ - cyctime.cyt_interval = INT64_MAX - cyctime.cyt_when; + cyctime.cyt_interval = CY_INFINITY; } it->it_itime = *when; @@ -185,8 +203,6 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_when != 0) *cycp = cyc = cyclic_add(&hdlr, &cyctime); - else - *cycp = cyc = CYCLIC_NONE; /* * Now that we have the cyclic created, we need to bind it to our diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index a292f4e14f..ebaa6bfe41 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -497,7 +497,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) contract_t *parent = &p->p_ct_process->conp_contract; int inherit = 0; - ASSERT(p == curproc); + VERIFY(p == curproc); mutex_enter(&ct->ct_lock); @@ -547,7 +547,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) if (inherit) { ct->ct_state = CTS_INHERITED; - ASSERT(ct->ct_regent == parent); + VERIFY(ct->ct_regent == parent); contract_process_take(parent, ct); /* @@ -2063,8 +2063,8 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) { ct_kevent_t *e, *first = NULL; - ASSERT(q->ctq_listno == CTEL_CONTRACT); - ASSERT(newq->ctq_listno == CTEL_PBUNDLE); + VERIFY(q->ctq_listno == CTEL_CONTRACT); + VERIFY(newq->ctq_listno == CTEL_PBUNDLE); mutex_enter(&q->ctq_lock); mutex_enter(&newq->ctq_lock); @@ -2077,8 +2077,16 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) { if (first == NULL) first = e; - list_insert_tail(&newq->ctq_events, e); - cte_hold(e); + /* + * It is possible for adoption to race with an owner's + * cte_publish_all(); we must only enqueue events that + * have not already been enqueued. + */ + if (!list_link_active((list_node_t *) + ((uintptr_t)e + newq->ctq_events.list_offset))) { + list_insert_tail(&newq->ctq_events, e); + cte_hold(e); + } } } @@ -2117,7 +2125,7 @@ cte_trim(ct_equeue_t *q, contract_t *ct) int flags, stopper; int start = 1; - ASSERT(MUTEX_HELD(&q->ctq_lock)); + VERIFY(MUTEX_HELD(&q->ctq_lock)); for (e = list_head(&q->ctq_events); e != NULL; e = next) { next = list_next(&q->ctq_events, e); @@ -2227,13 +2235,24 @@ cte_queue_drain(ct_equeue_t *q, int ack) * cte_publish_all. 
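Both cte_copy() and the reworked cte_publish() below guard against double insertion by checking whether the event's embedded list node is already linked, computing the node address from the event pointer plus the queue's list_offset. A self-contained sketch of that idiom, using simplified stand-ins for the kernel's list(9F) structures (the field layout here is illustrative, not the real list(9F) definition):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Objects embed a link node at a known offset; a node with NULL
 * links is not on any list.
 */
typedef struct link {
	struct link *next;
	struct link *prev;
} link_t;

typedef struct list {
	size_t list_offset;	/* offset of link_t within each object */
	link_t head;		/* circular sentinel */
} list_t;

typedef struct event {
	int id;
	link_t node;		/* embedded linkage */
} event_t;

static int
link_active(link_t *ln)
{
	return (ln->next != NULL);
}

static void
insert_tail(list_t *l, void *obj)
{
	link_t *ln = (link_t *)((uintptr_t)obj + l->list_offset);

	ln->prev = l->head.prev;
	ln->next = &l->head;
	l->head.prev->next = ln;
	l->head.prev = ln;
}

int
main(void)
{
	list_t q = { offsetof(event_t, node), { &q.head, &q.head } };
	event_t e = { 42, { NULL, NULL } };
	link_t *ln;
	int i;

	/* Two racing publishers; only the first insert proceeds. */
	for (i = 0; i < 2; i++) {
		ln = (link_t *)((uintptr_t)&e + q.list_offset);
		if (!link_active(ln)) {
			insert_tail(&q, &e);
			(void) printf("pass %d: enqueued event %d\n", i, e.id);
		} else {
			(void) printf("pass %d: already queued, skipped\n", i);
		}
	}
	return (0);
}
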
*/ static void -cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) +cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist) { ASSERT(MUTEX_HELD(&q->ctq_lock)); q->ctq_atime = *tsp; /* + * If this event may already exist on this queue, check to see if it + * is already there and return if so. + */ + if (mayexist && list_link_active((list_node_t *)((uintptr_t)e + + q->ctq_events.list_offset))) { + mutex_exit(&q->ctq_lock); + cte_rele(e); + return; + } + + /* * Don't publish if the event is informative and there aren't * any listeners, or if the queue has been shut down. */ @@ -2247,6 +2266,8 @@ cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) /* * Enqueue event */ + VERIFY(!list_link_active((list_node_t *) + ((uintptr_t)e + q->ctq_events.list_offset))); list_insert_tail(&q->ctq_events, e); /* @@ -2318,14 +2339,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) ct->ct_evcnt++; } mutex_exit(&ct->ct_lock); - cte_publish(&ct->ct_events, e, &ts); + cte_publish(&ct->ct_events, e, &ts, B_FALSE); /* * CTEL_BUNDLE - Next deliver to the contract type's bundle * queue. */ mutex_enter(&ct->ct_type->ct_type_events.ctq_lock); - cte_publish(&ct->ct_type->ct_type_events, e, &ts); + cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE); /* * CTEL_PBUNDLE - Finally, if the contract has an owner, @@ -2342,7 +2363,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]; mutex_enter(&q->ctq_lock); mutex_exit(&ct->ct_lock); - cte_publish(q, e, &ts); + + /* + * It is possible for this code to race with adoption; we + * publish the event indicating that the event may already + * be enqueued because adoption beat us to it (in which case + * cte_pubish() does nothing). + */ + cte_publish(q, e, &ts, B_TRUE); } else { mutex_exit(&ct->ct_lock); cte_rele(e); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index 9e04f631a9..3b3935a772 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -534,6 +535,10 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) case 'z': s = p->p_zone->zone_name; break; + case 'Z': + /* This is zonepath + "/root/", except for GZ */ + s = p->p_zone->zone_rootpath; + break; case '%': (void) strcpy((s = buf), "%"); break; @@ -548,6 +553,9 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) if ((size -= len) <= 0) return (ENAMETOOLONG); (void) strcpy(fp, s); + /* strip trailing "/root/" from non-GZ zonepath string */ + if (c == 'Z' && len > 6) + len -= 6; fp += len; } diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 1ec63249ab..20e57efaad 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -724,6 +724,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 1bb6baf445..93a318d260 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ + +/* * The Cyclic Subsystem * -------------------- * @@ -1139,7 +1143,7 @@ top: CYC_TRACE(cpu, level, "softint-top", cyclics, pc); while (consndx != pc->cypc_prodndx) { - int pend, npend, opend; + uint32_t pend, npend, opend; int consmasked = consndx & sizemask; cyclic_t *cyclic = &cyclics[buf[consmasked]]; cyc_func_t handler = cyclic->cy_handler; diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index b97a09454b..7c5b8323e3 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -388,10 +389,16 @@ proc_exit(int why, int what) if (p->p_pid == z->zone_proc_initpid) { if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && - zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN && - z->zone_restart_init == B_TRUE && - restart_init(what, why) == 0) - return (0); + zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } else { + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, + CRED()); + } + } + /* * Since we didn't or couldn't restart init, we clear * the zone's init state and proceed with exit diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 83b817e866..a5f5a6f3c2 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ /* @@ -160,6 +161,7 @@ struct { kstat_named_t avenrun_5min; kstat_named_t avenrun_15min; kstat_named_t boot_time; + kstat_named_t nsec_per_tick; } system_misc_kstat = { { "ncpus", KSTAT_DATA_UINT32 }, { "lbolt", KSTAT_DATA_UINT32 }, @@ -171,6 +173,7 @@ struct { { "avenrun_5min", KSTAT_DATA_UINT32 }, { "avenrun_15min", KSTAT_DATA_UINT32 }, { "boot_time", KSTAT_DATA_UINT32 }, + { "nsec_per_tick", KSTAT_DATA_UINT32 }, }; struct { @@ -803,7 +806,6 @@ system_misc_kstat_update(kstat_t *ksp, int rw) { int myncpus = ncpus; int *loadavgp = &avenrun[0]; - int loadavg[LOADAVG_NSTATS]; time_t zone_boot_time; clock_t zone_lbolt; hrtime_t zone_hrtime; @@ -820,17 +822,11 @@ system_misc_kstat_update(kstat_t *ksp, int rw) */ mutex_enter(&cpu_lock); if (pool_pset_enabled()) { - psetid_t mypsid = zone_pset_get(curproc->p_zone); - int error; - myncpus = zone_ncpus_get(curproc->p_zone); ASSERT(myncpus > 0); - error = cpupart_get_loadavg(mypsid, &loadavg[0], - LOADAVG_NSTATS); - ASSERT(error == 0); - loadavgp = &loadavg[0]; } mutex_exit(&cpu_lock); + loadavgp = &curproc->p_zone->zone_avenrun[0]; } if (INGLOBALZONE(curproc)) { @@ -838,9 +834,7 @@ system_misc_kstat_update(kstat_t *ksp, int rw) zone_lbolt = ddi_get_lbolt(); zone_nproc = nproc; } else { - struct timeval tvp; - hrt2tv(curproc->p_zone->zone_zsched->p_mstart, &tvp); - zone_boot_time = tvp.tv_sec; + zone_boot_time = curproc->p_zone->zone_boot_time; zone_hrtime = gethrtime(); zone_lbolt = (clock_t)(NSEC_TO_TICK(zone_hrtime) - @@ -861,6 +855,8 @@ system_misc_kstat_update(kstat_t *ksp, int rw) system_misc_kstat.avenrun_15min.value.ui32 = (uint32_t)loadavgp[2]; system_misc_kstat.boot_time.value.ui32 = (uint32_t) zone_boot_time; + system_misc_kstat.nsec_per_tick.value.ui32 = (uint32_t) + nsec_per_tick; return (0); } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index f5cebbf82e..63a89a2ce8 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -248,8 +248,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2012, Joyent Inc. 
All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index df975eb7ee..c10dce81ca 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -33,6 +33,7 @@ #include <sys/debug.h> #include <sys/msacct.h> #include <sys/time.h> +#include <sys/zone.h> /* * Mega-theory block comment: @@ -390,6 +391,7 @@ void syscall_mstate(int fromms, int toms) { kthread_t *t = curthread; + zone_t *z = ttozone(t); struct mstate *ms; hrtime_t *mstimep; hrtime_t curtime; @@ -413,6 +415,10 @@ syscall_mstate(int fromms, int toms) newtime = curtime - ms->ms_state_start; } *mstimep += newtime; + if (fromms == LMS_USER) + atomic_add_64(&z->zone_utime, newtime); + else if (fromms == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, newtime); t->t_mstate = toms; ms->ms_state_start = curtime; ms->ms_prev = fromms; @@ -602,7 +608,10 @@ new_mstate(kthread_t *t, int new_state) hrtime_t curtime; hrtime_t newtime; hrtime_t oldtime; + hrtime_t ztime; + hrtime_t origstart; klwp_t *lwp; + zone_t *z; ASSERT(new_state != LMS_WAIT_CPU); ASSERT((unsigned)new_state < NMSTATES); @@ -625,6 +634,7 @@ new_mstate(kthread_t *t, int new_state) ms = &lwp->lwp_mstate; state = t->t_mstate; + origstart = ms->ms_state_start; do { switch (state) { case LMS_TFAULT: @@ -637,7 +647,7 @@ new_mstate(kthread_t *t, int new_state) mstimep = &ms->ms_acct[state]; break; } - newtime = curtime - ms->ms_state_start; + ztime = newtime = curtime - ms->ms_state_start; if (newtime < 0) { curtime = gethrtime_unscaled(); oldtime = *mstimep - 1; /* force CAS to fail */ @@ -648,6 +658,20 @@ new_mstate(kthread_t *t, int new_state) t->t_mstate = new_state; ms->ms_state_start = curtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + + /* + * When the system boots the initial startup thread will have a + * ms_state_start of 0 which would add a huge system time to the global + * zone. We want to skip aggregating that initial bit of work. + */ + if (origstart != 0) { + z = ttozone(t); + if (state == LMS_USER) + atomic_add_64(&z->zone_utime, ztime); + else if (state == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, ztime); + } + /* * Remember the previous running microstate. */ @@ -686,6 +710,8 @@ restore_mstate(kthread_t *t) hrtime_t waitrq; hrtime_t newtime; hrtime_t oldtime; + hrtime_t waittime; + zone_t *z; /* * Don't call restore mstate of threads without lwps. (Kernel threads) @@ -756,11 +782,15 @@ restore_mstate(kthread_t *t) oldtime = *mstimep; newtime += oldtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + /* * Update the WAIT_CPU timer and per-cpu waitrq total. */ - ms->ms_acct[LMS_WAIT_CPU] += (curtime - waitrq); - CPU->cpu_waitrq += (curtime - waitrq); + z = ttozone(t); + waittime = curtime - waitrq; + ms->ms_acct[LMS_WAIT_CPU] += waittime; + atomic_add_64(&z->zone_wtime, waittime); + CPU->cpu_waitrq += waittime; ms->ms_state_start = curtime; } diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 573ebbc367..d8f7882723 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. 
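The microstate accounting changes in msacct.c above accumulate elapsed time into the per-thread counters with a compare-and-swap retry loop (the kernel's cas64()) so the hot path needs no lock. The same idiom is available to user code through atomic_cas_64(3C); a minimal sketch:

#include <sys/types.h>
#include <atomic.h>
#include <stdio.h>

/*
 * Add a time delta to a shared 64-bit counter without a lock, the
 * same retry idiom new_mstate() uses with the kernel's cas64().
 */
static void
add_time(volatile uint64_t *totalp, uint64_t delta)
{
	uint64_t old, new;

	do {
		old = *totalp;
		new = old + delta;
	} while (atomic_cas_64(totalp, old, new) != old);
}

int
main(void)
{
	static volatile uint64_t zone_stime = 0;

	add_time(&zone_stime, 1000);
	add_time(&zone_stime, 234);
	(void) printf("accumulated %llu nsec\n",
	    (unsigned long long)zone_stime);
	return (0);
}
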
*/ #include <sys/types.h> @@ -2563,3 +2564,12 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} + diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a5a918b326..53617bd0fe 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -176,6 +176,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index f0027d18e9..d444def5cc 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1599,7 +1599,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 79ccd94ae4..4b9dc9fc93 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved. */ /* @@ -369,8 +370,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -423,8 +428,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1380,6 +1386,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. 
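Like the existing zone.cpu-cap, the zone.cpu-baseline resource control added by this change should be readable from inside a zone with getrctl(2). A sketch, assuming the new rctl enumerates the way other zone rctls do:

#include <rctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	rctlblk_t *rblk;

	if ((rblk = malloc(rctlblk_size())) == NULL) {
		perror("malloc");
		return (1);
	}

	if (getrctl("zone.cpu-baseline", NULL, rblk, RCTL_FIRST) == -1) {
		perror("getrctl");
		return (1);
	}

	(void) printf("zone.cpu-baseline = %llu\n",
	    (unsigned long long)rctlblk_get_value(rblk));

	free(rblk);
	return (0);
}
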
+ */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1674,6 +1788,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1767,6 +1914,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1795,7 +1956,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { 
kstat_t *ksp; @@ -1820,26 +1981,350 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. 
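The zone_vfs kstat created above is published under module "zone_vfs", instance zone_id, with the zone name as the kstat name. A libkstat consumer sketch (instance 0 and name "global" assume we are inspecting the global zone; compile with -lkstat):

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}

	if ((ksp = kstat_lookup(kc, "zone_vfs", 0, "global")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) fprintf(stderr, "cannot read zone_vfs:0:global\n");
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "nread")) != NULL)
		(void) printf("nread  = %llu\n",
		    (unsigned long long)kn->value.ui64);
	if ((kn = kstat_data_lookup(ksp, "reads")) != NULL)
		(void) printf("reads  = %llu\n",
		    (unsigned long long)kn->value.ui64);
	if ((kn = kstat_data_lookup(ksp, "1s_ops")) != NULL)
		(void) printf("1s_ops = %llu\n",
		    (unsigned long long)kn->value.ui64);

	(void) kstat_close(kc);
	return (0);
}
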
Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + 
zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_misc_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_misc_kstat_t *zmp = ksp->ks_data; + hrtime_t tmp; + + if (rw == KSTAT_WRITE) + return (EACCES); + + tmp = zone->zone_utime; + scalehrtime(&tmp); + zmp->zm_utime.value.ui64 = tmp; + tmp = zone->zone_stime; + scalehrtime(&tmp); + zmp->zm_stime.value.ui64 = tmp; + tmp = zone->zone_wtime; + scalehrtime(&tmp); + zmp->zm_wtime.value.ui64 = tmp; + + zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0]; + zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1]; + zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2]; + + zmp->zm_run_ticks.value.ui64 = zone->zone_run_ticks; + zmp->zm_run_wait.value.ui64 = zone->zone_runq_cntr; + zmp->zm_fss_shr_pct.value.ui64 = zone->zone_fss_shr_pct; + zmp->zm_fss_pri_hi.value.ui64 = zone->zone_fss_pri_hi; + zmp->zm_fss_pri_avg.value.ui64 = zone->zone_fss_pri_avg; + + return (0); +} + +static kstat_t * +zone_misc_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_misc_kstat_t *zmp; + + if ((ksp = kstat_create_zone("zones", zone->zone_id, + zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED, + sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_misc_lock; + zone->zone_misc_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min", + KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_run_ticks, "run_ticks", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_run_wait, "run_queue", 
KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fss_shr_pct, "fss_share_percent", + KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_fss_pri_hi, "fss_pri_hi", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fss_pri_avg, "fss_pri_avg", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_misc_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { + zone->zone_misc_stats = kmem_zalloc( + sizeof (zone_misc_kstat_t), KM_SLEEP); + } } static void -zone_kstat_delete_common(kstat_t **pkstat) +zone_kstat_delete_common(kstat_t **pkstat, size_t datasz) { void *data; if (*pkstat != NULL) { data = (*pkstat)->ks_data; kstat_delete(*pkstat); - kmem_free(data, sizeof (zone_kstat_t)); + kmem_free(data, datasz); *pkstat = NULL; } } @@ -1847,9 +2332,23 @@ zone_kstat_delete_common(kstat_t **pkstat) static void zone_kstat_delete(zone_t *zone) { - zone_kstat_delete_common(&zone->zone_lockedmem_kstat); - zone_kstat_delete_common(&zone->zone_swapresv_kstat); - zone_kstat_delete_common(&zone->zone_nprocs_kstat); + zone_kstat_delete_common(&zone->zone_lockedmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_swapresv_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_nprocs_kstat, + sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); + zone_kstat_delete_common(&zone->zone_misc_ksp, + sizeof (zone_misc_kstat_t)); } /* @@ -1883,6 +2382,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1906,7 +2407,13 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone0.zone_zfs_io_pri = 1; + zone0.zone_stime = 0; + zone0.zone_utime = 0; + zone0.zone_wtime = 0; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), 
offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2013,6 +2520,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 1024, 1024, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2054,6 +2576,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2064,6 +2600,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2375,14 +2916,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. 
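Because the physical-memory cap is enforced by a user-land capper, observers see its effect through the memory_cap kstat created earlier. A sketch of a reader that reports RSS against the cap (instance 0 / name "global" again assume the global zone; physcap is UINT64_MAX while the zone is uncapped):

#include <kstat.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *rss, *cap, *nover;

	if ((kc = kstat_open()) == NULL)
		return (1);

	if ((ksp = kstat_lookup(kc, "memory_cap", 0, "global")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1)
		return (1);

	rss = kstat_data_lookup(ksp, "rss");
	cap = kstat_data_lookup(ksp, "physcap");
	nover = kstat_data_lookup(ksp, "nover");

	if (rss == NULL || cap == NULL || nover == NULL)
		return (1);

	(void) printf("rss %llu bytes, over cap %llu times\n",
	    (unsigned long long)rss->value.ui64,
	    (unsigned long long)nover->value.ui64);

	if (cap->value.ui64 != 0 && cap->value.ui64 != UINT64_MAX)
		(void) printf("%.1f%% of physical cap\n",
		    100.0 * (double)rss->value.ui64 /
		    (double)cap->value.ui64);

	(void) kstat_close(kc);
	return (0);
}
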
+ */ +/*ARGSUSED*/ static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; + + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physcial + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2794,6 +3386,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -2977,6 +3575,92 @@ zone_find_by_path(const char *path) } /* + * Public interface for updating per-zone load averages. Called once per + * second. + * + * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c. + */ +void +zone_loadavg_update() +{ + zone_t *zp; + zone_status_t status; + struct loadavg_s *lavg; + hrtime_t zone_total; + int i; + hrtime_t hr_avg; + int nrun; + static int64_t f[3] = { 135, 27, 9 }; + int64_t q, r; + + mutex_enter(&zonehash_lock); + for (zp = list_head(&zone_active); zp != NULL; + zp = list_next(&zone_active, zp)) { + mutex_enter(&zp->zone_lock); + + /* Skip zones that are on the way down or not yet up */ + status = zone_status_get(zp); + if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) { + /* For all practical purposes the zone doesn't exist. */ + mutex_exit(&zp->zone_lock); + continue; + } + + /* + * Update the 10 second moving average data in zone_loadavg. + */ + lavg = &zp->zone_loadavg; + + zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime; + scalehrtime(&zone_total); + + /* The zone_total should always be increasing. */ + lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ? + zone_total - lavg->lg_total : 0; + lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ; + /* lg_total holds the prev. 1 sec. total */ + lavg->lg_total = zone_total; + + /* + * To simplify the calculation, we don't calculate the load avg. + * until the zone has been up for at least 10 seconds and our + * moving average is thus full. + */ + if ((lavg->lg_len + 1) < S_LOADAVG_SZ) { + lavg->lg_len++; + mutex_exit(&zp->zone_lock); + continue; + } + + /* Now calculate the 1min, 5min, 15 min load avg. */ + hr_avg = 0; + for (i = 0; i < S_LOADAVG_SZ; i++) + hr_avg += lavg->lg_loads[i]; + hr_avg = hr_avg / S_LOADAVG_SZ; + nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX); + + /* Compute load avg. 
See comment in calcloadavg() */ + for (i = 0; i < 3; i++) { + q = (zp->zone_hp_avenrun[i] >> 16) << 7; + r = (zp->zone_hp_avenrun[i] & 0xffff) << 7; + zp->zone_hp_avenrun[i] += + ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4; + + /* avenrun[] can only hold 31 bits of load avg. */ + if (zp->zone_hp_avenrun[i] < + ((uint64_t)1<<(31+16-FSHIFT))) + zp->zone_avenrun[i] = (int32_t) + (zp->zone_hp_avenrun[i] >> (16 - FSHIFT)); + else + zp->zone_avenrun[i] = 0x7fffffff; + } + + mutex_exit(&zp->zone_lock); + } + mutex_exit(&zonehash_lock); +} + +/* * Get the number of cpus visible to this zone. The system-wide global * 'ncpus' is returned if pools are disabled, the caller is in the * global zone, or a NULL zone argument is passed in. @@ -3789,7 +4473,10 @@ zsched(void *arg) mutex_enter(&zone_status_lock); zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); mutex_exit(&zone_status_lock); + } else { + zone->zone_boot_time = gethrestime_sec(); } + pool_unlock(); } @@ -4081,7 +4768,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4104,6 +4791,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4172,10 +4860,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. 
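The fixed-point avenrun update in zone_loadavg_update() earlier is an exponentially decaying average. Working through the shifts (an editorial derivation, not something stated in the source), each one-second update is approximately avenrun += (target - avenrun) * f[i] / 8192, so f[i] / 8192 should approximate 1 - exp(-1/tau) for decay constants of 60, 300, and 900 seconds. The program below compares the two (compile with -lm):

#include <math.h>
#include <stdio.h>

int
main(void)
{
	static const int f[3] = { 135, 27, 9 };
	static const double tau[3] = { 60.0, 300.0, 900.0 };
	int i;

	for (i = 0; i < 3; i++) {
		double fixed = (double)f[i] / 8192.0;
		double exact = 1.0 - exp(-1.0 / tau[i]);

		(void) printf("tau=%4.0fs  f/8192=%.6f  1-e^(-1/tau)=%.6f\n",
		    tau[i], fixed, exact);
	}
	return (0);
}
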
@@ -4474,6 +5166,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4484,7 +5177,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5222,14 +5924,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5284,6 +5978,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5315,10 +6017,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5335,7 +6038,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5344,6 +6049,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_INITNAME: err = zone_set_initname(zone, (const char *)buf); break; + case ZONE_ATTR_INITNORESTART: + zone->zone_restart_init = B_FALSE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5353,8 +6062,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -6075,6 +6793,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6085,7 +6804,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6363,7 +7082,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise |
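With the kstat_fr.c change in this commit, unix:0:system_misc reports the zone's own load averages when read from a non-global zone and exposes the new nsec_per_tick statistic. A consumer sketch; dividing the avenrun values by FSCALE (256 on illumos) mirrors what getloadavg(3C) consumers do:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *a1, *npt;

	if ((kc = kstat_open()) == NULL)
		return (1);

	if ((ksp = kstat_lookup(kc, "unix", 0, "system_misc")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1)
		return (1);

	if ((a1 = kstat_data_lookup(ksp, "avenrun_1min")) != NULL)
		(void) printf("1-minute load: %.2f\n",
		    a1->value.ui32 / 256.0);
	if ((npt = kstat_data_lookup(ksp, "nsec_per_tick")) != NULL)
		(void) printf("nsec_per_tick: %u\n", npt->value.ui32);

	(void) kstat_close(kc);
	return (0);
}
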