Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r-- | usr/src/uts/common/os/zone.c | 1108
1 file changed, 1048 insertions, 60 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 085a9f7e73..a161fb85a2 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2018, Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -252,6 +252,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +314,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +330,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. */ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -372,8 +375,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -419,8 +426,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. 
+ * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1450,127 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. 
+ */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. + */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1897,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. 
+ */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2041,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2084,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2109,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. 
+ * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. + */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2335,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, 
"pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1936,6 +2384,8 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; @@ -1979,6 +2429,8 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); @@ -1994,13 +2446,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2032,8 +2496,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2095,12 +2566,15 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2207,6 +2681,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, 
&zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2248,6 +2737,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2258,6 +2761,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2279,6 +2787,9 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2358,6 +2869,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2373,6 +2886,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. */ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2386,6 +2902,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. 
+ */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2430,12 +2959,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2443,7 +2978,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2521,9 +3056,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reaquire it before returning so we can't + * assume the lock has been held the entire time. + */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2596,18 +3136,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3014,6 +3542,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3757,6 +4291,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. 
@@ -3785,9 +4330,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. */ lwp_rtt(); } @@ -4273,8 +4863,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4393,7 +4984,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4465,6 +5056,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4472,6 +5064,9 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4541,14 +5136,26 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + 
zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; /* * Zsched initializes the rctls. */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4697,8 +5304,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. */ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4837,6 +5444,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4847,7 +5455,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5176,6 +5793,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5209,6 +5827,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5598,14 +6222,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5669,6 +6285,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5700,10 +6333,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. 
*/ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5716,11 +6348,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. */ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5733,6 +6365,14 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) zone->zone_restart_init = B_FALSE; err = 0; break; + case ZONE_ATTR_INITRESTART0: + zone->zone_restart_init_0 = B_TRUE; + err = 0; + break; + case ZONE_ATTR_INITREBOOT: + zone->zone_reboot_on_init_exit = B_TRUE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5745,9 +6385,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5775,6 +6412,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6478,6 +7131,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6488,7 +7142,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6589,6 +7243,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6766,7 +7421,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6829,16 +7484,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6906,7 +7560,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6943,6 +7598,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7044,6 +7711,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7076,6 +7764,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove them either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7093,25 +7801,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES it's query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. 
An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned. */ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enuqire about itself. + */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7122,6 +7868,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7142,6 +7889,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7167,6 +7920,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyways. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified. */ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7380,3 +8140,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. 
+ * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). + */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis. */ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. + */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. 
*/ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. + */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +} |
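As a usage illustration of the lock-free checks described in the zone_pdata comment above, the sketch below shows how a page-scanner-side consumer might gate its work on zone_num_over_cap and then consult zpers_over for an individual zone. This is a minimal sketch, not part of the change itself: the helper names zone_pcap_scan_needed() and zone_over_pcap() are hypothetical, and only zone_num_over_cap, zone_pdata[], zpers_over, zpers_pg_cnt, and zpers_pg_limit are taken from the diff.

/*
 * Hypothetical helpers sketching the scanner-side reads described in the
 * zone_pdata block comment. Both reads are deliberately unlocked: the
 * physical memory cap is a soft cap, the scanner never writes these fields,
 * and a slightly stale value is acceptable.
 */
static boolean_t
zone_pcap_scan_needed(void)
{
	/* Global gate: non-zero means at least one zone is over its cap. */
	return (zone_num_over_cap > 0 ? B_TRUE : B_FALSE);
}

static boolean_t
zone_over_pcap(zoneid_t zid)
{
	zone_persist_t *zp = &zone_pdata[zid];

	/* Direct lookup by zoneid; zpers_over is only set/cleared under
	 * zone_physcap_lock by zone_incr_capped()/zone_decr_capped(). */
	return (zp->zpers_over == 1 ? B_TRUE : B_FALSE);
}

For scale, the shift-by-7 hysteresis in zone_decr_capped() works out as in the table above: with a 4G cap (1048576 4K pages), the adjustment is 8192 pages, about 32 MB, so a zone must fall roughly 0.8% below its limit before it is marked under-cap again.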