Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r--	usr/src/uts/common/os/zone.c	230
1 file changed, 143 insertions, 87 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 843adc1ee0..06a8549c5b 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017, Joyent Inc.
+ * Copyright 2018, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -313,6 +313,7 @@ static id_space_t *zoneid_space;
* 'global_zone'.
*/
zone_t zone0;
+zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL; /* Set when the global zone is initialized */
/*
@@ -429,11 +430,18 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
static const int ZONE_SYSCALL_API_VERSION = 7;
/*
- * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's
- * current page usage, its page limit, a flag indicating if the zone is
- * over its physical memory cap and various statistics. The zpcap_over flag is
- * the interface for the page scanner to use when reclaiming pages for zones
- * that are over their cap.
+ * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
+ * data which can be referenced independently of the zone_t structure. This
+ * data falls into two categories:
+ * 1) pages and RSS data associated with processes inside a zone
+ * 2) in-flight ZFS I/O data
+ *
+ * Each member of zone_persist_t stores the zone's current page usage, its page
+ * limit, a flag indicating if the zone is over its physical memory cap and
+ * various page-related statistics. The zpers_over flag is the interface for
+ * the page scanner to use when reclaiming pages for zones that are over their
+ * cap. The zone_persist_t structure also includes a mutex and a reference to a
+ * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
*
* All zone physical memory cap data is stored in this array instead of within
* the zone structure itself. This is because zone structures come and go, but
@@ -448,33 +456,40 @@ static const int ZONE_SYSCALL_API_VERSION = 7;
* page scanning.
*
* The page scanner can run when "zone_num_over_cap" is non-zero. It can
- * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine
+ * do a direct lookup of a zoneid into the "zone_pdata" array to determine
* if that zone is over its cap.
*
* There is no locking for the page scanner to perform these two checks.
* We cannot have the page scanner blocking normal paging activity for
* running processes. Because the physical memory cap is a soft cap, it is
* fine for the scanner to simply read the current state of the counter and
- * the zone's zpcap_over entry in the array. The scanner should never modify
+ * the zone's zpers_over entry in the array. The scanner should never modify
* either of these items. Internally the entries and the counter are managed
* with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
* take care to ensure that we only take the zone_physcap_lock mutex when a
* zone is transitioning over/under its physical memory cap.
*
* The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
- * the "zone_pcap_data" array and associated counter.
+ * the "zone_pdata" array and associated counter.
*
- * The zone_pcap_t structure tracks the zone's physical cap and phyiscal usage
- * in terms of pages. These values are currently defined as uint32. Thus, the
- * maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) since
- * UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * The zone_persist_t structure tracks the zone's physical cap and physical
+ * usage in terms of pages. These values are currently uint32_t. Thus, the
+ * maximum number of pages we can track is UINT32_MAX - 1 (4,294,967,294),
+ * since UINT32_MAX means unlimited RSS. Assuming a 4k page size, a
* zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
* In the future we may need to expand these counters to 64-bit, but for now
* we're using 32-bit to conserve memory, since this array is statically
* allocated within the kernel based on the maximum number of zones supported.
+ *
+ * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
+ * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
+ * had to continuously find the zone structure associated with an I/O that has
+ * just completed. To avoid that overhead, we track the I/O data within the
+ * zone_zfs_io_t instead. We can directly access that data without having to
+ * look up the full zone_t structure.
*/
uint_t zone_num_over_cap;
-zone_pcap_t zone_pcap_data[MAX_ZONES];
+zone_persist_t zone_pdata[MAX_ZONES];
static kmutex_t zone_physcap_lock;
/*
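For reference, a minimal sketch of the two structures this comment describes. This is not the authoritative definition (which lives in the zone header, not part of this diff); the field names and types are inferred from their use in the hunks below:

	/*
	 * Sketch only; actual definitions live in the zone header. Under
	 * DEBUG, zpers_pg_out is replaced by the four per-type counters
	 * zpers_pg_fsdirty, zpers_pg_fs, zpers_pg_anon and zpers_pg_anondirty.
	 */
	typedef struct zone_zfs_io {
		uint16_t	zpers_zfs_io_pri;	/* ZFS I/O priority rctl */
		kstat_io_t	zpers_zfs_rwstats;	/* read/write I/O stats */
		uint64_t	zpers_zfs_rd_waittime;	/* cumulative read wait */
	} zone_zfs_io_t;

	typedef struct zone_persist {
		kmutex_t	zpers_zfs_lock;	/* protects zpers_zfsp */
		zone_zfs_io_t	*zpers_zfsp;	/* NULL once zone is destroyed */
		uint8_t		zpers_over;	/* currently over the page cap? */
		uint32_t	zpers_pg_cnt;	/* current page usage (RSS) */
		uint32_t	zpers_pg_limit;	/* cap; UINT32_MAX == unlimited */
		uint64_t	zpers_nover;	/* times the cap was exceeded */
		uint64_t	zpers_pg_out;	/* pages paged out (non-DEBUG) */
	} zone_persist_t;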
@@ -1509,8 +1524,16 @@ static rctl_ops_t zone_cpu_burst_time_ops = {
static rctl_qty_t
zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
{
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+ rctl_qty_t r = 0;
+
ASSERT(MUTEX_HELD(&p->p_lock));
- return (p->p_zone->zone_zfs_io_pri);
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ return (r);
}
/*ARGSUSED*/
@@ -1519,6 +1542,7 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
rctl_qty_t nv)
{
zone_t *zone = e->rcep_p.zone;
+ zone_persist_t *zp;
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(e->rcep_t == RCENTITY_ZONE);
@@ -1529,7 +1553,11 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
/*
* set priority to the new value.
*/
- zone->zone_zfs_io_pri = nv;
+ zp = &zone_pdata[zone->zone_id];
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
+ mutex_exit(&zp->zpers_zfs_lock);
return (0);
}
@@ -1871,10 +1899,10 @@ static rctl_qty_t
zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
- zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id];
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
ASSERT(MUTEX_HELD(&p->p_lock));
- q = ptob(zp->zpcap_pg_cnt);
+ q = ptob(zp->zpers_pg_cnt);
return (q);
}
@@ -1906,7 +1934,7 @@ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
pg_val = (uint_t)pages;
}
}
- zone_pcap_data[zid].zpcap_pg_limit = pg_val;
+ zone_pdata[zid].zpers_pg_limit = pg_val;
return (0);
}
@@ -2016,13 +2044,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_kstat_t *zk = ksp->ks_data;
- zone_pcap_t *zp = &zone_pcap_data[zone->zone_id];
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
if (rw == KSTAT_WRITE)
return (EACCES);
- zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt);
- zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit);
+ zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
return (0);
}
@@ -2170,26 +2198,42 @@ zone_zfs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_zfs_kstat_t *zzp = ksp->ks_data;
- kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
if (rw == KSTAT_WRITE)
return (EACCES);
- /*
- * Extract the ZFS statistics from the kstat_io_t structure used by
- * kstat_runq_enter() and related functions. Since the I/O throttle
- * counters are updated directly by the ZFS layer, there's no need to
- * copy those statistics here.
- *
- * Note that kstat_runq_enter() and the related functions use
- * gethrtime_unscaled(), so scale the time here.
- */
- zzp->zz_nread.value.ui64 = kiop->nread;
- zzp->zz_reads.value.ui64 = kiop->reads;
- zzp->zz_rtime.value.ui64 = kiop->rtime;
- zzp->zz_rlentime.value.ui64 = kiop->rlentime;
- zzp->zz_nwritten.value.ui64 = kiop->nwritten;
- zzp->zz_writes.value.ui64 = kiop->writes;
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp == NULL) {
+ zzp->zz_nread.value.ui64 = 0;
+ zzp->zz_reads.value.ui64 = 0;
+ zzp->zz_rtime.value.ui64 = 0;
+ zzp->zz_rlentime.value.ui64 = 0;
+ zzp->zz_nwritten.value.ui64 = 0;
+ zzp->zz_writes.value.ui64 = 0;
+ zzp->zz_waittime.value.ui64 = 0;
+ } else {
+ kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure
+ * used by kstat_runq_enter() and related functions. Since the
+ * I/O throttle counters are updated directly by the ZFS layer,
+ * there's no need to copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+ zzp->zz_waittime.value.ui64 =
+ zp->zpers_zfsp->zpers_zfs_rd_waittime;
+ }
+ mutex_exit(&zp->zpers_zfs_lock);
scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
@@ -2240,23 +2284,23 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
- zp = &zone_pcap_data[zone->zone_id];
+ zp = &zone_pdata[zone->zone_id];
- zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt);
- zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit);
+ zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
zmp->zm_swap.value.ui64 = zone->zone_max_swap;
zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
- zmp->zm_nover.value.ui64 = zp->zpcap_nover;
+ zmp->zm_nover.value.ui64 = zp->zpers_nover;
#ifndef DEBUG
- zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out);
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
#else
- zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty +
- zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty);
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
+ zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
@@ -2523,11 +2567,13 @@ zone_zsd_init(void)
zone0.zone_swapresv_kstat = NULL;
zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
- zone0.zone_zfs_io_pri = 1;
zone0.zone_stime = 0;
zone0.zone_utime = 0;
zone0.zone_wtime = 0;
+ zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
+ zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2839,7 +2885,7 @@ zone_free(zone_t *zone)
cpucaps_zone_remove(zone);
/* Clear physical memory capping data. */
- bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t));
+ bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
ASSERT(zone->zone_cpucap == NULL);
@@ -5090,7 +5136,10 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_lockedmem_kstat = NULL;
zone->zone_swapresv_kstat = NULL;
zone->zone_physmem_kstat = NULL;
- zone->zone_zfs_io_pri = 1;
+
+ zone_pdata[zoneid].zpers_zfsp =
+ kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
+ zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
@@ -5101,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root,
* Ensure page count is 0 (in case zoneid has wrapped).
* Initialize physical memory cap as unlimited.
*/
- zone_pcap_data[zoneid].zpcap_pg_cnt = 0;
- zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX;
+ zone_pdata[zoneid].zpers_pg_cnt = 0;
+ zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
@@ -5741,6 +5790,7 @@ zone_destroy(zoneid_t zoneid)
zone_status_t status;
clock_t wait_time;
boolean_t log_refcounts;
+ zone_persist_t *zp;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -5774,6 +5824,12 @@ zone_destroy(zoneid_t zoneid)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ zp = &zone_pdata[zoneid];
+ mutex_enter(&zp->zpers_zfs_lock);
+ kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
+ zp->zpers_zfsp = NULL;
+ mutex_exit(&zp->zpers_zfs_lock);
+
/*
* wait for zsched to exit
*/
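Because zone_destroy() frees zpers_zfsp and sets it to NULL under zpers_zfs_lock, every reader of the ZFS I/O data must take the same lock and tolerate a NULL pointer, as zone_zfs_io_pri_get() above does. A minimal sketch of the required access pattern (the helper name is hypothetical):

	/* Hypothetical helper illustrating the required access pattern. */
	static uint64_t
	zone_zfs_read_waittime(zoneid_t zid)
	{
		zone_persist_t *zp = &zone_pdata[zid];
		uint64_t wt = 0;

		mutex_enter(&zp->zpers_zfs_lock);
		if (zp->zpers_zfsp != NULL)	/* zone may have been destroyed */
			wt = zp->zpers_zfsp->zpers_zfs_rd_waittime;
		mutex_exit(&zp->zpers_zfs_lock);
		return (wt);
	}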
@@ -8075,18 +8131,18 @@ done:
static void
zone_incr_capped(zoneid_t zid)
{
- zone_pcap_t *zp = &zone_pcap_data[zid];
+ zone_persist_t *zp = &zone_pdata[zid];
/* See if over (unlimited is UINT32_MAX), or already marked that way. */
- if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) {
+ if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
return;
}
mutex_enter(&zone_physcap_lock);
/* Recheck setting under mutex */
- if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) {
- zp->zpcap_over = 1;
- zp->zpcap_nover++;
+ if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+ zp->zpers_over = 1;
+ zp->zpers_nover++;
zone_num_over_cap++;
DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
}
@@ -8114,29 +8170,29 @@ zone_incr_capped(zoneid_t zid)
static void
zone_decr_capped(zoneid_t zid)
{
- zone_pcap_t *zp = &zone_pcap_data[zid];
+ zone_persist_t *zp = &zone_pdata[zid];
uint32_t adjusted_limit;
/*
* See if under, or already marked that way. There is no need to
- * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX)
- * since we'll never set zpcap_over in zone_incr_capped().
+ * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
+ * since we'll never set zpers_over in zone_incr_capped().
*/
- if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) {
+ if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
return;
}
- adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7);
+ adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
/* Recheck, accounting for our hysteresis. */
- if (zp->zpcap_pg_cnt >= adjusted_limit) {
+ if (zp->zpers_pg_cnt >= adjusted_limit) {
return;
}
mutex_enter(&zone_physcap_lock);
/* Recheck under mutex. */
- if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) {
- zp->zpcap_over = 0;
+ if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
+ zp->zpers_over = 0;
ASSERT(zone_num_over_cap > 0);
zone_num_over_cap--;
DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
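The shift by 7 builds in roughly 1/128 (~0.8%) of hysteresis, so a zone hovering right at its cap does not repeatedly take zone_physcap_lock as it crosses the limit. A worked example, assuming a hypothetical 1 GB cap with 4k pages:

	uint32_t limit = 262144;			/* 1 GB cap, in 4k pages */
	uint32_t adjusted = limit - (limit >> 7);	/* 262144 - 2048 = 260096 */
	/*
	 * The zone stays marked over-cap until its RSS drops below 260096
	 * pages (8 MB under the cap), so small fluctuations at the limit
	 * do not cause it to flap between the over and under states.
	 */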
@@ -8154,7 +8210,7 @@ void
zone_add_page(page_t *pp)
{
uint_t pcnt;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
zoneid_t zid;
/* Skip pages in segkmem, etc. (KV_KVP, ...) */
@@ -8179,9 +8235,9 @@ zone_add_page(page_t *pp)
if (pp->p_share == 0) {
/* First mapping to this page. */
pp->p_zoneid = zid;
- zp = &zone_pcap_data[zid];
- ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX);
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt);
+ zp = &zone_pdata[zid];
+ ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
zone_incr_capped(zid);
return;
}
@@ -8194,10 +8250,10 @@ zone_add_page(page_t *pp)
zid = pp->p_zoneid;
pp->p_zoneid = ALL_ZONES;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
- if (zp->zpcap_pg_cnt > 0) {
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
}
zone_decr_capped(zid);
}
@@ -8207,7 +8263,7 @@ void
zone_rm_page(page_t *pp)
{
uint_t pcnt;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
zoneid_t zid;
/* Skip pages in segkmem, etc. (KV_KVP, ...) */
@@ -8227,9 +8283,9 @@ zone_rm_page(page_t *pp)
}
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
- if (zp->zpcap_pg_cnt > 0) {
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ zp = &zone_pdata[zid];
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
}
zone_decr_capped(zid);
pp->p_zoneid = ALL_ZONES;
@@ -8238,29 +8294,29 @@ zone_rm_page(page_t *pp)
void
zone_pageout_stat(int zid, zone_pageout_op_t op)
{
- zone_pcap_t *zp;
+ zone_persist_t *zp;
if (zid == ALL_ZONES)
return;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
#ifndef DEBUG
- atomic_add_64(&zp->zpcap_pg_out, 1);
+ atomic_add_64(&zp->zpers_pg_out, 1);
#else
switch (op) {
case ZPO_DIRTY:
- atomic_add_64(&zp->zpcap_pg_fsdirty, 1);
+ atomic_add_64(&zp->zpers_pg_fsdirty, 1);
break;
case ZPO_FS:
- atomic_add_64(&zp->zpcap_pg_fs, 1);
+ atomic_add_64(&zp->zpers_pg_fs, 1);
break;
case ZPO_ANON:
- atomic_add_64(&zp->zpcap_pg_anon, 1);
+ atomic_add_64(&zp->zpers_pg_anon, 1);
break;
case ZPO_ANONDIRTY:
- atomic_add_64(&zp->zpcap_pg_anondirty, 1);
+ atomic_add_64(&zp->zpers_pg_anondirty, 1);
break;
default:
cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
@@ -8275,23 +8331,23 @@ zone_pageout_stat(int zid, zone_pageout_op_t op)
void
zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
{
- zone_pcap_t *zp;
+ zone_persist_t *zp;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
/*
* If memory or swap limits are set on the zone, use those, otherwise
* use the system values. physmem and freemem are also in pages.
*/
- if (zp->zpcap_pg_limit == UINT32_MAX) {
+ if (zp->zpers_pg_limit == UINT32_MAX) {
*memcap = physmem;
*free = freemem;
} else {
int64_t freemem;
- *memcap = (pgcnt_t)zp->zpcap_pg_limit;
- freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt;
+ *memcap = (pgcnt_t)zp->zpers_pg_limit;
+ freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
if (freemem > 0) {
*free = (pgcnt_t)freemem;
} else {