summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/os/zone.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r--usr/src/uts/common/os/zone.c1108
1 files changed, 1048 insertions, 60 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 085a9f7e73..a161fb85a2 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright 2018, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -252,6 +252,8 @@
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
/*
* This constant specifies the number of seconds that threads waiting for
@@ -312,6 +314,7 @@ static id_space_t *zoneid_space;
* 'global_zone'.
*/
zone_t zone0;
+zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL; /* Set when the global zone is initialized */
/*
@@ -327,8 +330,8 @@ static list_t zone_active;
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
-/* number of zones is limited by virtual interface limit in IP */
-uint_t maxzones = 8192;
+/* This can be dynamically reduced if various subsystems hit internal limits. */
+uint_t maxzones = MAX_ZONES;
/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;
@@ -372,8 +375,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -419,8 +426,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
+
+/*
+ * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
+ * data which can be referenced independently of the zone_t structure. This
+ * data falls into two categories;
+ * 1) pages and RSS data associated with processes inside a zone
+ * 2) in-flight ZFS I/O data
+ *
+ * Each member of zone_persist_t stores the zone's current page usage, its page
+ * limit, a flag indicating if the zone is over its physical memory cap and
+ * various page-related statistics. The zpers_over flag is the interface for
+ * the page scanner to use when reclaiming pages for zones that are over their
+ * cap. The zone_persist_t structure also includes a mutex and a reference to a
+ * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
+ *
+ * All zone physical memory cap data is stored in this array instead of within
+ * the zone structure itself. This is because zone structures come and go, but
+ * paging-related work can be asynchronous to any particular zone. In
+ * particular:
+ * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
+ * associated with any zone.
+ * 2) Freeing segkp pages can occur long after the zone which first
+ * instantiated those pages has gone away.
+ * We want to be able to account for pages/zone without constantly having to
+ * take extra locks and finding the relevant zone structure, particularly during
+ * page scanning.
+ *
+ * The page scanner can run when "zone_num_over_cap" is non-zero. It can
+ * do a direct lookup of a zoneid into the "zone_pdata" array to determine
+ * if that zone is over its cap.
+ *
+ * There is no locking for the page scanner to perform these two checks.
+ * We cannot have the page scanner blocking normal paging activity for
+ * running processes. Because the physical memory cap is a soft cap, it is
+ * fine for the scanner to simply read the current state of the counter and
+ * the zone's zpers_over entry in the array. The scanner should never modify
+ * either of these items. Internally the entries and the counter are managed
+ * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
+ * take care to ensure that we only take the zone_physcap_lock mutex when a
+ * zone is transitioning over/under its physical memory cap.
+ *
+ * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
+ * the "zone_pdata" array and associated counter.
+ *
+ * The zone_persist_t structure tracks the zone's physical cap and physical
+ * usage in terms of pages. These values are currently defined as uint32. Thus,
+ * the maximum number of pages we can track is UINT_MAX-1 (4,294,967,294)
+ * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
+ * In the future we may need to expand these counters to 64-bit, but for now
+ * we're using 32-bit to conserve memory, since this array is statically
+ * allocated within the kernel based on the maximum number of zones supported.
+ *
+ * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
+ * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
+ * had to continuously find the zone structure associated with an I/O that has
+ * just completed. To avoid that overhead, we track the I/O data within the
+ * zone_zfs_io_t instead. We can directly access that data without having to
+ * lookup the full zone_t structure.
+ */
+uint_t zone_num_over_cap;
+zone_persist_t zone_pdata[MAX_ZONES];
+static kmutex_t zone_physcap_lock;
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1379,6 +1450,127 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time CPU(s) can be
+ * bursting for the zone.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+ rctl_qty_t r = 0;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ return (r);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+ zone_persist_t *zp;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+ * set priority to the new value.
+ */
+ zp = &zone_pdata[zone->zone_id];
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
+ mutex_exit(&zp->zpers_zfs_lock);
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1705,6 +1897,57 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ q = ptob(zp->zpers_pg_cnt);
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zoneid_t zid;
+ uint_t pg_val;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ zid = e->rcep_p.zone->zone_id;
+ if (nv == UINT64_MAX) {
+ pg_val = UINT32_MAX;
+ } else {
+ uint64_t pages = btop(nv);
+
+ /*
+ * Return from RCTLOP_SET is always ignored so just clamp an
+ * out-of-range value to our largest "limited" value.
+ */
+ if (pages >= UINT32_MAX) {
+ pg_val = UINT32_MAX - 1;
+ } else {
+ pg_val = (uint_t)pages;
+ }
+ }
+ zone_pdata[zid].zpers_pg_limit = pg_val;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1798,6 +2041,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1826,7 +2084,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1851,16 +2109,200 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp == NULL) {
+ zzp->zz_nread.value.ui64 = 0;
+ zzp->zz_reads.value.ui64 = 0;
+ zzp->zz_rtime.value.ui64 = 0;
+ zzp->zz_rlentime.value.ui64 = 0;
+ zzp->zz_nwritten.value.ui64 = 0;
+ zzp->zz_writes.value.ui64 = 0;
+ zzp->zz_waittime.value.ui64 = 0;
+ } else {
+ kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure
+ * used by kstat_runq_enter() and related functions. Since the
+ * I/O throttle counters are updated directly by the ZFS layer,
+ * there's no need to copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+ zzp->zz_waittime.value.ui64 =
+ zp->zpers_zfsp->zpers_zfs_rd_waittime;
+ }
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
static int
zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
+ zone_persist_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
+ zp = &zone_pdata[zone->zone_id];
+
+ zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zp->zpers_nover;
+#ifndef DEBUG
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
+#else
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
+ zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
+#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
@@ -1893,6 +2335,12 @@ zone_mcap_kstat_create(zone_t *zone)
/* The kstat "name" field is not large enough for a full zonename */
kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
@@ -1936,6 +2384,8 @@ zone_misc_kstat_update(kstat_t *ksp, int rw)
zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
+ zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
+
zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
@@ -1979,6 +2429,8 @@ zone_misc_kstat_create(zone_t *zone)
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
+ KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
KSTAT_DATA_UINT32);
kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
@@ -1994,13 +2446,25 @@ zone_misc_kstat_create(zone_t *zone)
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
zone->zone_mcap_stats = kmem_zalloc(
sizeof (zone_mcap_kstat_t), KM_SLEEP);
@@ -2032,8 +2496,15 @@ zone_kstat_delete(zone_t *zone)
sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_swapresv_kstat,
sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_nprocs_kstat,
sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
zone_kstat_delete_common(&zone->zone_mcap_ksp,
sizeof (zone_mcap_kstat_t));
zone_kstat_delete_common(&zone->zone_misc_ksp,
@@ -2095,12 +2566,15 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
-
zone0.zone_stime = 0;
zone0.zone_utime = 0;
zone0.zone_wtime = 0;
+ zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
+ zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2207,6 +2681,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 16384, 16384, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2248,6 +2737,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+ * this at the head of the rctl_dict_entry for 'zone.zfs-io-priority'.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2258,6 +2761,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2279,6 +2787,9 @@ zone_init(void)
zone0.zone_ntasks = 1;
mutex_exit(&p0.p_lock);
zone0.zone_restart_init = B_TRUE;
+ zone0.zone_reboot_on_init_exit = B_FALSE;
+ zone0.zone_restart_init_0 = B_FALSE;
+ zone0.zone_init_status = -1;
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
@@ -2358,6 +2869,8 @@ zone_init(void)
static void
zone_free(zone_t *zone)
{
+ zone_dl_t *zdl;
+
ASSERT(zone != global_zone);
ASSERT(zone->zone_ntasks == 0);
ASSERT(zone->zone_nlwps == 0);
@@ -2373,6 +2886,9 @@ zone_free(zone_t *zone)
*/
cpucaps_zone_remove(zone);
+ /* Clear physical memory capping data. */
+ bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
+
ASSERT(zone->zone_cpucap == NULL);
/* remove from deathrow list */
@@ -2386,6 +2902,19 @@ zone_free(zone_t *zone)
list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
+
+ /*
+ * While dlmgmtd should have removed all of these, it could have left
+ * something behind or crashed. In which case it's not safe for us to
+ * assume that the list is empty which list_destroy() will ASSERT. We
+ * clean up for our userland comrades which may have crashed, or worse,
+ * been disabled by SMF.
+ */
+ while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+ if (zdl->zdl_net != NULL)
+ nvlist_free(zdl->zdl_net);
+ kmem_free(zdl, sizeof (zone_dl_t));
+ }
list_destroy(&zone->zone_dl_list);
if (zone->zone_rootvp != NULL)
@@ -2430,12 +2959,18 @@ zone_free(zone_t *zone)
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
+ timestruc_t now;
+ uint64_t t;
nvlist_t *nvl = NULL;
ASSERT(MUTEX_HELD(&zone_status_lock));
ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
status >= zone_status_get(zone));
+ /* Current time since Jan 1 1970; consumers expect nanoseconds */
+ gethrestime(&now);
+ t = (now.tv_sec * NANOSEC) + now.tv_nsec;
+
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
@@ -2443,7 +2978,7 @@ zone_status_set(zone_t *zone, zone_status_t status)
nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
zone_status_table[zone->zone_status]) ||
nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
- nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
@@ -2521,9 +3056,14 @@ zone_set_brand(zone_t *zone, const char *brand)
return (EINVAL);
}
- /* set up the brand specific data */
+ /*
+ * Set up the brand specific data.
+ * Note that it's possible that the hook has to drop the
+ * zone_status_lock and reacquire it before returning so we can't
+ * assume the lock has been held the entire time.
+ */
zone->zone_brand = bp;
- ZBROP(zone)->b_init_brand_data(zone);
+ ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
mutex_exit(&zone_status_lock);
return (0);
@@ -2596,18 +3136,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
}
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
-{
- uint64_t mcap;
- int err = 0;
-
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
-
- return (err);
-}
-
-static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
char sched_class[PC_CLNMSZ];
@@ -3014,6 +3542,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -3757,6 +4291,17 @@ zone_start_init(void)
*/
z->zone_proc_initpid = p->p_pid;
+ if (z->zone_setup_app_contract == B_TRUE) {
+ /*
+ * Normally a process cannot modify its own contract, but we're
+ * just starting the zone's init process and its contract is
+ * always initialized from the sys_process_tmpl template, so
+ * this is the simplest way to setup init's contract to kill
+ * the process if any other process in the contract exits.
+ */
+ p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+ }
+
/*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
@@ -3785,9 +4330,54 @@ zone_start_init(void)
lwp_exit();
}
} else {
+ id_t cid = curthread->t_cid;
+
if (zone_status_get(z) == ZONE_IS_BOOTING)
zone_status_set(z, ZONE_IS_RUNNING);
mutex_exit(&zone_status_lock);
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ /*
+ * If the zone is using FX then by default all
+ * processes start at the lowest priority and stay
+ * there. We provide a mechanism for the zone to
+ * indicate that it should run at "high priority". In
+ * this case we setup init to run at the highest FX
+ * priority (which is one level higher than the
+ * non-fixed scheduling classes can use).
+ */
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ (void) parmsset(&pcparms, curthread);
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ /*
+ * zsched always starts the init lwp at priority
+ * minclsyspri - 1. This priority gets set in t_pri and
+ * is invalid for RT, but RT never uses t_pri. However
+ * t_pri is used by procfs, so we always see processes
+ * within an RT zone with an invalid priority value.
+ * We fix that up now.
+ */
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
/* cause the process to return to userland. */
lwp_rtt();
}
@@ -4273,8 +4863,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
error = EINVAL;
name = nvpair_name(nvp);
- if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
- != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+ if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+ strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+ nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
@@ -4393,7 +4984,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4465,6 +5056,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zone->zone_id = zoneid;
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4472,6 +5064,9 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
zone->zone_restart_init = B_TRUE;
+ zone->zone_reboot_on_init_exit = B_FALSE;
+ zone->zone_restart_init_0 = B_FALSE;
+ zone->zone_init_status = -1;
zone->zone_brand = &native_brand;
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -4541,14 +5136,26 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_max_swap_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+
+ zone_pdata[zoneid].zpers_zfsp =
+ kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
+ zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
*/
zone->zone_rctls = NULL;
+ /*
+ * Ensure page count is 0 (in case zoneid has wrapped).
+ * Initialize physical memory cap as unlimited.
+ */
+ zone_pdata[zoneid].zpers_pg_cnt = 0;
+ zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
+
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
return (zone_create_error(error, 0, extended_error));
@@ -4697,8 +5304,8 @@ zone_create(const char *zone_name, const char *zone_root,
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
- * and initialize zsched appropriately. I'm not sure that that
- * makes much of a difference, though.
+ * and initialize zsched appropriately. However, we allow zoneadmd
+ * to pass down both zone and project rctls for the zone's init.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
@@ -4837,6 +5444,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4847,7 +5455,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+ cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+ zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5176,6 +5793,7 @@ zone_destroy(zoneid_t zoneid)
zone_status_t status;
clock_t wait_time;
boolean_t log_refcounts;
+ zone_persist_t *zp;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -5209,6 +5827,12 @@ zone_destroy(zoneid_t zoneid)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ zp = &zone_pdata[zoneid];
+ mutex_enter(&zp->zpers_zfs_lock);
+ kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
+ zp->zpers_zfsp = NULL;
+ mutex_exit(&zp->zpers_zfs_lock);
+
/*
* wait for zsched to exit
*/
@@ -5598,14 +6222,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5669,6 +6285,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ size = sizeof (boolean_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+ bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5700,10 +6333,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * No attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID) {
return (set_errno(EINVAL));
}
@@ -5716,11 +6348,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
mutex_exit(&zonehash_lock);
/*
- * At present most attributes can only be set on non-running,
+ * At present attributes can only be set on non-running,
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5733,6 +6365,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
zone->zone_restart_init = B_FALSE;
err = 0;
break;
+ case ZONE_ATTR_INITRESTART0:
+ zone->zone_restart_init_0 = B_TRUE;
+ err = 0;
+ break;
+ case ZONE_ATTR_INITREBOOT:
+ zone->zone_reboot_on_init_exit = B_TRUE;
+ err = 0;
+ break;
case ZONE_ATTR_BOOTARGS:
err = zone_set_bootargs(zone, (const char *)buf);
break;
@@ -5745,9 +6385,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_SECFLAGS:
err = zone_set_secflags(zone, (psecflags_t *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
- break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
@@ -5775,6 +6412,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_APP_SVC_CT:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_setup_app_contract = (boolean_t)buf;
+ err = 0;
+ }
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_fixed_hipri = (boolean_t)buf;
+ err = 0;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
@@ -6478,6 +7131,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6488,7 +7142,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6589,6 +7243,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp)
bcopy(zone->zone_name, zone_name, zone_namelen);
zoneid = zone->zone_id;
uniqid = zone->zone_uniqid;
+ arg.status = zone->zone_init_status;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
@@ -6766,7 +7421,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
@@ -6829,16 +7484,15 @@ zone_shutdown_global(void)
}
/*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
- zone_t *zone = curproc->p_zone;
const char *name = NULL;
vfs_t *vfsp = NULL;
@@ -6906,7 +7560,8 @@ zone_dataset_visible(const char *dataset, int *write)
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
- ASSERT(vfsp);
+ if (vfsp == NULL)
+ break;
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
@@ -6943,6 +7598,18 @@ zone_dataset_visible(const char *dataset, int *write)
}
/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_t *zone = curproc->p_zone;
+
+ /* Delegate to the zone-aware variant on behalf of the caller's zone. */
+ return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
+/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
@@ -7044,6 +7711,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
zone_t *thiszone;
+ /*
+ * Only the GZ may add a datalink to a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may add a
+ * datalink to a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * When links exist in the GZ, they aren't added to the GZ's
+ * zone_dl_list. We must enforce this because link_activate()
+ * depends on zone_check_datalink() returning only NGZs.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((thiszone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(ENXIO));
@@ -7076,6 +7764,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
zone_t *zone;
int err = 0;
+ /*
+ * Only the GZ may remove a datalink from a zone's list.
+ */
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (set_errno(EPERM));
+
+ /*
+ * Only a process with the datalink config priv may remove a
+ * datalink from a zone's list.
+ */
+ if (secpolicy_dl_config(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ /*
+ * If we can't add a datalink to the GZ's zone_dl_list then we
+ * certainly can't remove them either.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ return (set_errno(EINVAL));
+
if ((zone = zone_find_by_id(zoneid)) == NULL)
return (set_errno(EINVAL));
@@ -7093,25 +7801,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
}
/*
- * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
- * the linkid. Otherwise we just check if the specified zoneidp has been
- * assigned the supplied linkid.
+ *
+ * This function may be used in two ways:
+ *
+ * 1. to get the zoneid of the zone this link is under, or
+ *
+ * 2. to verify that the link is under a specific zone.
+ *
+ * The first use is achieved by passing a zoneid of ALL_ZONES. The
+ * function then iterates the datalink list of every zone on the
+ * system until it finds the linkid. If the linkid is found then the
+ * function returns 0 and zoneidp is updated. Otherwise, ENXIO is
+ * returned and zoneidp is not modified. The use of ALL_ZONES is
+ * limited to callers in the GZ to prevent leaking information to
+ * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed
+ * to the second type in the list above.
+ *
+ * The second use is achieved by passing a specific zoneid. The GZ can
+ * use this to verify a link is under a particular zone. An NGZ can
+ * use this to verify a link is under itself. But an NGZ cannot use
+ * this to determine if a link is under some other zone as that would
+ * result in information leakage. If the link exists under the zone
+ * then 0 is returned. Otherwise, ENXIO is returned.
*/
int
zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
zone_t *zone;
+ zoneid_t zoneid = *zoneidp;
+ zoneid_t caller = getzoneid();
int err = ENXIO;
- if (*zoneidp != ALL_ZONES) {
- if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
- if (zone_dl_exists(zone, linkid))
+ /*
+ * Only the GZ may enquire about all zones; an NGZ may only
+ * enquire about itself.
+ */
+ if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
+ zoneid = caller;
+
+ if (zoneid != caller && caller != GLOBAL_ZONEID)
+ return (err);
+
+ if (zoneid != ALL_ZONES) {
+ if ((zone = zone_find_by_id(zoneid)) != NULL) {
+ if (zone_dl_exists(zone, linkid)) {
+ /*
+ * We need to set this in case an NGZ
+ * passes ALL_ZONES.
+ */
+ *zoneidp = zoneid;
err = 0;
+ }
zone_rele(zone);
}
return (err);
}
+ ASSERT(caller == GLOBAL_ZONEID);
mutex_enter(&zonehash_lock);
for (zone = list_head(&zone_active); zone != NULL;
zone = list_next(&zone_active, zone)) {
@@ -7122,6 +7868,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
}
}
mutex_exit(&zonehash_lock);
+
return (err);
}
@@ -7142,6 +7889,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
zone_dl_t *zdl;
datalink_id_t *idptr = idarray;
+ /*
+ * Only the GZ or the owning zone may look at the datalink list.
+ */
+ if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
+ return (set_errno(EPERM));
+
if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
return (set_errno(EFAULT));
if ((zone = zone_find_by_id(zoneid)) == NULL)
@@ -7167,6 +7920,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
mutex_exit(&zone->zone_lock);
zone_rele(zone);
+ /*
+ * Prevent returning negative nump values -- we should never
+ * have this many links anyways.
+ */
+ if (num > INT_MAX)
+ return (set_errno(EOVERFLOW));
+
/* Increased or decreased, caller should be notified. */
if (num != dlcount) {
if (copyout(&num, nump, sizeof (num)) != 0)
@@ -7380,3 +8140,231 @@ done:
else
return (0);
}
+
+/*
+ * Mark the zone as over its physical memory cap once its resident page
+ * count exceeds its page limit.  An unlimited cap is UINT32_MAX, which
+ * the 32-bit count can never exceed, so such zones are never marked over.
+ * The unlocked pre-check keeps the common under-cap path cheap; the actual
+ * state change is made under zone_physcap_lock so that zpers_over,
+ * zpers_nover and the global zone_num_over_cap stay consistent.
+ */
+static void
+zone_incr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+
+ /* See if over (unlimited is UINT32_MAX), or already marked that way. */
+ if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck setting under mutex */
+ if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+ zp->zpers_over = 1;
+ zp->zpers_nover++;
+ zone_num_over_cap++;
+ DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * We want some hysteresis when the zone is going under its cap so that we're
+ * not continuously toggling page scanning back and forth by a single page
+ * around the cap. Using ~1% of the zone's page limit seems to be a good
+ * quantity. This table shows some various zone memory caps and the number of
+ * pages (assuming a 4k page size). Given this, we choose to shift the page
+ * limit by 7 places to get a hysteresis that is slightly less than 1%.
+ *
+ * cap pages pages 1% shift7 shift7
+ * 128M 32768 0x0008000 327 256 0x00100
+ * 512M 131072 0x0020000 1310 1024 0x00400
+ * 1G 262144 0x0040000 2621 2048 0x00800
+ * 4G 1048576 0x0100000 10485 8192 0x02000
+ * 8G 2097152 0x0200000 20971 16384 0x04000
+ * 16G 4194304 0x0400000 41943 32768 0x08000
+ * 32G 8388608 0x0800000 83886 65536 0x10000
+ * 64G 16777216 0x1000000 167772 131072 0x20000
+ */
+static void
+zone_decr_capped(zoneid_t zid)
+{
+ zone_persist_t *zp = &zone_pdata[zid];
+ uint32_t adjusted_limit;
+
+ /*
+ * See if under, or already marked that way. There is no need to
+ * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
+ * since we'll never set zpers_over in zone_incr_capped().
+ */
+ if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
+ return;
+ }
+
+ /* Back off slightly less than 1% (limit >> 7); see the table above. */
+ adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
+
+ /* Recheck, accounting for our hysteresis. */
+ if (zp->zpers_pg_cnt >= adjusted_limit) {
+ return;
+ }
+
+ mutex_enter(&zone_physcap_lock);
+ /* Recheck under mutex. */
+ if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
+ zp->zpers_over = 0;
+ ASSERT(zone_num_over_cap > 0);
+ zone_num_over_cap--;
+ DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
+ }
+ mutex_exit(&zone_physcap_lock);
+}
+
+/*
+ * For zone_add_page() and zone_rm_page(), access to the page we're touching is
+ * controlled by our caller's locking.
+ * On x86 our callers already did: ASSERT(x86_hm_held(pp))
+ * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
+ */
+/*
+ * Charge a new mapping of pp to the current zone's resident page count.
+ * A page that becomes mapped by a second, different zone is moved to
+ * ALL_ZONES and uncharged from the zone that originally owned it.
+ */
+void
+zone_add_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ ASSERT(!PP_ISFREE(pp));
+
+ zid = curzone->zone_id;
+ if (pp->p_zoneid == zid) {
+ /* Another mapping to this page for this zone, do nothing */
+ return;
+ }
+
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = page_get_pagecnt(pp->p_szc);
+ }
+
+ if (pp->p_share == 0) {
+ /* First mapping to this page. */
+ pp->p_zoneid = zid;
+ zp = &zone_pdata[zid];
+ ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
+ zone_incr_capped(zid);
+ return;
+ }
+
+ if (pp->p_zoneid != ALL_ZONES) {
+ /*
+ * The page is now being shared across a different zone.
+ * Decrement the original zone's usage.
+ */
+ zid = pp->p_zoneid;
+ pp->p_zoneid = ALL_ZONES;
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ /*
+ * NOTE(review): the guard below only checks cnt > 0, not
+ * cnt >= pcnt, so a large-page decrement could wrap the
+ * 32-bit count; confirm callers keep counts consistent.
+ */
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ }
+}
+
+/*
+ * Drop the zone's accounting for pp when the last zone-private mapping of
+ * the page is removed.  Pages owned by ALL_ZONES, or which still have
+ * other mappings (p_share != 0), are left untouched.
+ */
+void
+zone_rm_page(page_t *pp)
+{
+ uint_t pcnt;
+ zone_persist_t *zp;
+ zoneid_t zid;
+
+ /* Skip pages in segkmem, etc. (KV_KVP, ...) */
+ if (PP_ISKAS(pp))
+ return;
+
+ zid = pp->p_zoneid;
+ if (zid == ALL_ZONES || pp->p_share != 0)
+ return;
+
+ /* This is the last mapping to the page for a zone. */
+ if (pp->p_szc == 0) {
+ pcnt = 1;
+ } else {
+ /* large page */
+ pcnt = (int64_t)page_get_pagecnt(pp->p_szc);
+ }
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
+ }
+ zone_decr_capped(zid);
+ pp->p_zoneid = ALL_ZONES;
+}
+
+/*
+ * Account a pageout event against the given zone.  Non-DEBUG kernels keep
+ * only an aggregate count (zpers_pg_out); DEBUG kernels instead break the
+ * count out by the type of pageout operation.
+ */
+void
+zone_pageout_stat(int zid, zone_pageout_op_t op)
+{
+ zone_persist_t *zp;
+
+ if (zid == ALL_ZONES)
+ return;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+#ifndef DEBUG
+ atomic_add_64(&zp->zpers_pg_out, 1);
+#else
+ switch (op) {
+ case ZPO_DIRTY:
+ atomic_add_64(&zp->zpers_pg_fsdirty, 1);
+ break;
+ case ZPO_FS:
+ atomic_add_64(&zp->zpers_pg_fs, 1);
+ break;
+ case ZPO_ANON:
+ atomic_add_64(&zp->zpers_pg_anon, 1);
+ break;
+ case ZPO_ANONDIRTY:
+ atomic_add_64(&zp->zpers_pg_anondirty, 1);
+ break;
+ default:
+ cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
+ break;
+ }
+#endif
+}
+
+/*
+ * Return the zone's physical memory cap and current free memory (in pages).
+ */
+void
+zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
+{
+ zone_persist_t *zp;
+
+ ASSERT(zid >= 0 && zid <= MAX_ZONEID);
+ zp = &zone_pdata[zid];
+
+ /*
+ * If memory or swap limits are set on the zone, use those, otherwise
+ * use the system values. physmem and freemem are also in pages.
+ */
+ if (zp->zpers_pg_limit == UINT32_MAX) {
+ *memcap = physmem;
+ *free = freemem;
+ } else {
+ int64_t freemem; /* NB: shadows the global freemem used above */
+
+ *memcap = (pgcnt_t)zp->zpers_pg_limit;
+ freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
+ if (freemem > 0) {
+ *free = (pgcnt_t)freemem;
+ } else {
+ /* At or over the cap: report zero free pages. */
+ *free = (pgcnt_t)0;
+ }
+ }
+}