summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/os/zone.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r--usr/src/uts/common/os/zone.c789
1 files changed, 754 insertions, 35 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 79ccd94ae4..4b9dc9fc93 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved.
*/
/*
@@ -369,8 +370,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -423,8 +428,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1380,6 +1386,114 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time CPU(s) can be
+ * bursting for the zone.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (p->p_zone->zone_zfs_io_pri);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+ * set priority to the new value.
+ */
+ zone->zone_zfs_io_pri = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1674,6 +1788,39 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ /* No additional lock because not enforced in the kernel */
+ q = z->zone_phys_mem;
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_phys_mem_ctl = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1767,6 +1914,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+ zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1795,7 +1956,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1820,26 +1981,350 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the I/O throttle
+ * counters are updated directly by the ZFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_mcap_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_mcap_kstat_t *zmp = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+ zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+ zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
+ zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
+ zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
+ zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
+ zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
+ zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+ zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+ zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
+
+ return (0);
+}
+
+static kstat_t *
+zone_mcap_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_mcap_kstat_t *zmp;
+
+ if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
+ zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
+ sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_mcap_lock;
+ zone->zone_mcap_stats = zmp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+ KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_mcap_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_misc_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_misc_kstat_t *zmp = ksp->ks_data;
+ hrtime_t tmp;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ tmp = zone->zone_utime;
+ scalehrtime(&tmp);
+ zmp->zm_utime.value.ui64 = tmp;
+ tmp = zone->zone_stime;
+ scalehrtime(&tmp);
+ zmp->zm_stime.value.ui64 = tmp;
+ tmp = zone->zone_wtime;
+ scalehrtime(&tmp);
+ zmp->zm_wtime.value.ui64 = tmp;
+
+ zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
+ zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
+ zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
+
+ zmp->zm_run_ticks.value.ui64 = zone->zone_run_ticks;
+ zmp->zm_run_wait.value.ui64 = zone->zone_runq_cntr;
+ zmp->zm_fss_shr_pct.value.ui64 = zone->zone_fss_shr_pct;
+ zmp->zm_fss_pri_hi.value.ui64 = zone->zone_fss_pri_hi;
+ zmp->zm_fss_pri_avg.value.ui64 = zone->zone_fss_pri_avg;
+
+ return (0);
+}
+
+static kstat_t *
+zone_misc_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_misc_kstat_t *zmp;
+
+ if ((ksp = kstat_create_zone("zones", zone->zone_id,
+ zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
+ sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_misc_lock;
+ zone->zone_misc_stats = zmp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_run_ticks, "run_ticks", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_run_wait, "run_queue", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_shr_pct, "fss_share_percent",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_fss_pri_hi, "fss_pri_hi", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_pri_avg, "fss_pri_avg",
+ KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_misc_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
+ zone->zone_mcap_stats = kmem_zalloc(
+ sizeof (zone_mcap_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
+ zone->zone_misc_stats = kmem_zalloc(
+ sizeof (zone_misc_kstat_t), KM_SLEEP);
+ }
}
static void
-zone_kstat_delete_common(kstat_t **pkstat)
+zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
{
void *data;
if (*pkstat != NULL) {
data = (*pkstat)->ks_data;
kstat_delete(*pkstat);
- kmem_free(data, sizeof (zone_kstat_t));
+ kmem_free(data, datasz);
*pkstat = NULL;
}
}
@@ -1847,9 +2332,23 @@ zone_kstat_delete_common(kstat_t **pkstat)
static void
zone_kstat_delete(zone_t *zone)
{
- zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
- zone_kstat_delete_common(&zone->zone_swapresv_kstat);
- zone_kstat_delete_common(&zone->zone_nprocs_kstat);
+ zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_swapresv_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_nprocs_kstat,
+ sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_mcap_ksp,
+ sizeof (zone_mcap_kstat_t));
+ zone_kstat_delete_common(&zone->zone_misc_ksp,
+ sizeof (zone_misc_kstat_t));
}
/*
@@ -1883,6 +2382,8 @@ zone_zsd_init(void)
zone0.zone_locked_mem_ctl = UINT64_MAX;
ASSERT(zone0.zone_max_swap == 0);
zone0.zone_max_swap_ctl = UINT64_MAX;
+ zone0.zone_phys_mem = 0;
+ zone0.zone_phys_mem_ctl = UINT64_MAX;
zone0.zone_max_lofi = 0;
zone0.zone_max_lofi_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
@@ -1906,7 +2407,13 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
+ zone0.zone_zfs_io_pri = 1;
+ zone0.zone_stime = 0;
+ zone0.zone_utime = 0;
+ zone0.zone_wtime = 0;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2013,6 +2520,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 1024, 1024, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2054,6 +2576,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+ * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2064,6 +2600,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2375,14 +2916,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
return (0);
}
+/*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats. Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to increment the kstats. We always simply increment nover
+ * every time that setattr is called and we always add in the input value
+ * to zone_mcap_pagedout every time that is called.
+ */
+/*ARGSUSED*/
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
{
- uint64_t mcap;
- int err = 0;
+ zone->zone_mcap_nover++;
+
+ return (0);
+}
+
+static int
+zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+{
+ uint64_t pageout;
+ int err;
+
+ if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+ zone->zone_mcap_pagedout += pageout;
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
+ return (err);
+}
+
+/*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * to throttle page faults. This is normally 0 but can be set to a non-0 value
+ * by the user-land memory capping code when the zone is over its physcial
+ * memory cap.
+ */
+static int
+zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+{
+ uint32_t dusec;
+ int err;
+
+ if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+ zone->zone_pg_flt_delay = dusec;
+
+ return (err);
+}
+
+/*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+static int
+zone_set_rss(zone_t *zone, const uint64_t *prss)
+{
+ uint64_t rss;
+ int err;
+
+ if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mem = rss;
return (err);
}
@@ -2794,6 +3386,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -2977,6 +3575,92 @@ zone_find_by_path(const char *path)
}
/*
+ * Public interface for updating per-zone load averages. Called once per
+ * second.
+ *
+ * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
+ */
+void
+zone_loadavg_update()
+{
+ zone_t *zp;
+ zone_status_t status;
+ struct loadavg_s *lavg;
+ hrtime_t zone_total;
+ int i;
+ hrtime_t hr_avg;
+ int nrun;
+ static int64_t f[3] = { 135, 27, 9 };
+ int64_t q, r;
+
+ mutex_enter(&zonehash_lock);
+ for (zp = list_head(&zone_active); zp != NULL;
+ zp = list_next(&zone_active, zp)) {
+ mutex_enter(&zp->zone_lock);
+
+ /* Skip zones that are on the way down or not yet up */
+ status = zone_status_get(zp);
+ if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
+ /* For all practical purposes the zone doesn't exist. */
+ mutex_exit(&zp->zone_lock);
+ continue;
+ }
+
+ /*
+ * Update the 10 second moving average data in zone_loadavg.
+ */
+ lavg = &zp->zone_loadavg;
+
+ zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
+ scalehrtime(&zone_total);
+
+ /* The zone_total should always be increasing. */
+ lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
+ zone_total - lavg->lg_total : 0;
+ lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
+ /* lg_total holds the prev. 1 sec. total */
+ lavg->lg_total = zone_total;
+
+ /*
+ * To simplify the calculation, we don't calculate the load avg.
+ * until the zone has been up for at least 10 seconds and our
+ * moving average is thus full.
+ */
+ if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
+ lavg->lg_len++;
+ mutex_exit(&zp->zone_lock);
+ continue;
+ }
+
+ /* Now calculate the 1min, 5min, 15 min load avg. */
+ hr_avg = 0;
+ for (i = 0; i < S_LOADAVG_SZ; i++)
+ hr_avg += lavg->lg_loads[i];
+ hr_avg = hr_avg / S_LOADAVG_SZ;
+ nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
+
+ /* Compute load avg. See comment in calcloadavg() */
+ for (i = 0; i < 3; i++) {
+ q = (zp->zone_hp_avenrun[i] >> 16) << 7;
+ r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
+ zp->zone_hp_avenrun[i] +=
+ ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
+
+ /* avenrun[] can only hold 31 bits of load avg. */
+ if (zp->zone_hp_avenrun[i] <
+ ((uint64_t)1<<(31+16-FSHIFT)))
+ zp->zone_avenrun[i] = (int32_t)
+ (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
+ else
+ zp->zone_avenrun[i] = 0x7fffffff;
+ }
+
+ mutex_exit(&zp->zone_lock);
+ }
+ mutex_exit(&zonehash_lock);
+}
+
+/*
* Get the number of cpus visible to this zone. The system-wide global
* 'ncpus' is returned if pools are disabled, the caller is in the
* global zone, or a NULL zone argument is passed in.
@@ -3789,7 +4473,10 @@ zsched(void *arg)
mutex_enter(&zone_status_lock);
zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
mutex_exit(&zone_status_lock);
+ } else {
+ zone->zone_boot_time = gethrestime_sec();
}
+
pool_unlock();
}
@@ -4081,7 +4768,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4104,6 +4791,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zoneid = zone->zone_id = id_alloc(zoneid_space);
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4172,10 +4860,14 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_locked_mem_ctl = UINT64_MAX;
zone->zone_max_swap = 0;
zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_phys_mem = 0;
+ zone->zone_phys_mem_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+ zone->zone_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
@@ -4474,6 +5166,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4484,7 +5177,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+ cmn_err(CE_WARN, "attempt to force kill zone %d\n",
+ zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5222,14 +5924,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5284,6 +5978,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5315,10 +6017,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
+ * attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID &&
+ attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
return (set_errno(EINVAL));
}
@@ -5335,7 +6038,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
+ attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
+ zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5344,6 +6049,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_INITNAME:
err = zone_set_initname(zone, (const char *)buf);
break;
+ case ZONE_ATTR_INITNORESTART:
+ zone->zone_restart_init = B_FALSE;
+ err = 0;
+ break;
case ZONE_ATTR_BOOTARGS:
err = zone_set_bootargs(zone, (const char *)buf);
break;
@@ -5353,8 +6062,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_FS_ALLOWED:
err = zone_set_fs_allowed(zone, (const char *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+ case ZONE_ATTR_PMCAP_NOVER:
+ err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PMCAP_PAGEOUT:
+ err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PG_FLT_DELAY:
+ err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+ break;
+ case ZONE_ATTR_RSS:
+ err = zone_set_rss(zone, (const uint64_t *)buf);
break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
@@ -6075,6 +6793,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6085,7 +6804,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6363,7 +7082,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise