Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c  | 344
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c    |   4
-rw-r--r--  usr/src/uts/common/os/zone.c          | 230
-rw-r--r--  usr/src/uts/common/sys/zone.h         |  68
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c      |   6
5 files changed, 397 insertions(+), 255 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
index 4861c64f8e..59357cbee5 100644
--- a/usr/src/uts/common/fs/zfs/zfs_zone.c
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2018, Joyent, Inc. All rights reserved.
*/
/*
@@ -166,8 +166,8 @@ zfs_zone_txg_delay()
* over the previous window.
*/
boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
-uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */
-uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
+uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */
+uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */
boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
@@ -238,9 +238,9 @@ uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
typedef struct {
hrtime_t cycle_start;
- int cycle_cnt;
hrtime_t cycle_lat;
hrtime_t sys_avg_lat;
+ uint_t cycle_cnt;
} sys_lat_cycle_t;
typedef struct {
@@ -275,6 +275,7 @@ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
/* time that we last updated per-zone throttle info */
+kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */
hrtime_t zfs_zone_last_checked = 0;
hrtime_t zfs_disk_last_laggard = 0;
@@ -412,22 +413,32 @@ compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
* Add IO op data to the zone.
*/
static void
-add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op)
{
+ zone_zfs_io_t *iop;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+
switch (op) {
case ZFS_ZONE_IOP_READ:
- (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
- zonep->zone_rd_ops.cycle_cnt++;
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops);
+ iop->zpers_rd_ops.cycle_cnt++;
break;
case ZFS_ZONE_IOP_WRITE:
- (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
- zonep->zone_wr_ops.cycle_cnt++;
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops);
+ iop->zpers_wr_ops.cycle_cnt++;
break;
case ZFS_ZONE_IOP_LOGICAL_WRITE:
- (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
- zonep->zone_lwr_ops.cycle_cnt++;
+ (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops);
+ iop->zpers_lwr_ops.cycle_cnt++;
break;
}
+ mutex_exit(&zpd->zpers_zfs_lock);
}
/*
@@ -502,13 +513,13 @@ add_sys_iop(hrtime_t unow, int op, int lat)
switch (op) {
case ZFS_ZONE_IOP_READ:
(void) compute_new_sys_avg(unow, &rd_lat);
- rd_lat.cycle_cnt++;
- rd_lat.cycle_lat += lat;
+ atomic_inc_uint(&rd_lat.cycle_cnt);
+ atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat);
break;
case ZFS_ZONE_IOP_WRITE:
(void) compute_new_sys_avg(unow, &wr_lat);
- wr_lat.cycle_cnt++;
- wr_lat.cycle_lat += lat;
+ atomic_inc_uint(&wr_lat.cycle_cnt);
+ atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat);
break;
}
}
@@ -575,10 +586,11 @@ calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
* The latency parameter is in usecs.
*/
static void
-add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op,
+ hrtime_t lat)
{
/* Add op to zone */
- add_zone_iop(zonep, unow, op);
+ add_zone_iop(zpd, unow, op);
/* Track system latency */
if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
@@ -591,14 +603,16 @@ add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
* return a non-zero value, otherwise return 0.
*/
static int
-get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops,
uint_t *lwops)
{
- *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
- *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
- *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+ ASSERT3P(zpd, !=, NULL);
- DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+ *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops);
+ *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops);
+
+ DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd,
uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
return (*rops | *wops | *lwops);
@@ -637,20 +651,24 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
{
zoneio_stats_t *sp = arg;
uint_t rops, wops, lwops;
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop = zpd->zpers_zfsp;
+
+ ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock));
+ ASSERT3P(iop, !=, NULL);
if (zonep->zone_id == GLOBAL_ZONEID ||
- get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
- zonep->zone_io_util = 0;
+ get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) {
return (0);
}
- zonep->zone_io_util = (rops * sp->zi_avgrlat) +
- (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
- sp->zi_totutil += zonep->zone_io_util;
+ iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) +
+ (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += iop->zpers_io_util;
- if (zonep->zone_io_util > 0) {
+ if (iop->zpers_io_util > 0) {
sp->zi_active++;
- sp->zi_totpri += zonep->zone_zfs_io_pri;
+ sp->zi_totpri += iop->zpers_zfs_io_pri;
}
/*
@@ -665,23 +683,27 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
*/
DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
uint_t, rops, uint_t, wops, uint_t, lwops,
- uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
+ uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri);
return (0);
}
static void
-zfs_zone_delay_inc(zone_t *zonep)
+zfs_zone_delay_inc(zone_zfs_io_t *zpd)
{
- if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
- zonep->zone_io_delay += zfs_zone_delay_step;
+ ASSERT3P(zpd, !=, NULL);
+
+ if (zpd->zpers_io_delay < zfs_zone_delay_ceiling)
+ zpd->zpers_io_delay += zfs_zone_delay_step;
}
static void
-zfs_zone_delay_dec(zone_t *zonep)
+zfs_zone_delay_dec(zone_zfs_io_t *zpd)
{
- if (zonep->zone_io_delay > 0)
- zonep->zone_io_delay -= zfs_zone_delay_step;
+ ASSERT3P(zpd, !=, NULL);
+
+ if (zpd->zpers_io_delay > 0)
+ zpd->zpers_io_delay -= zfs_zone_delay_step;
}
/*
@@ -691,18 +713,24 @@ zfs_zone_delay_dec(zone_t *zonep)
static int
zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
{
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop = zpd->zpers_zfsp;
zoneio_stats_t *sp = arg;
- uint16_t delay = zonep->zone_io_delay;
+ uint8_t delay;
uint_t fairutil = 0;
- zonep->zone_io_util_above_avg = B_FALSE;
+ ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock));
+ ASSERT3P(iop, !=, NULL);
+
+ delay = iop->zpers_io_delay;
+ iop->zpers_io_util_above_avg = 0;
/*
* Given the calculated total utilization for all zones, calculate the
* fair share of I/O for this zone.
*/
if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
- fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) /
sp->zi_totpri;
} else if (sp->zi_active > 0) {
fairutil = sp->zi_totutil / sp->zi_active;
@@ -712,14 +740,14 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* Adjust each IO's delay. If the overall delay becomes too high, avoid
* increasing beyond the ceiling value.
*/
- if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
- zonep->zone_io_util_above_avg = B_TRUE;
+ if (iop->zpers_io_util > fairutil && sp->zi_overutil) {
+ iop->zpers_io_util_above_avg = 1;
if (sp->zi_active > 1)
- zfs_zone_delay_inc(zonep);
- } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+ zfs_zone_delay_inc(iop);
+ } else if (iop->zpers_io_util < fairutil || sp->zi_underutil ||
sp->zi_active <= 1) {
- zfs_zone_delay_dec(zonep);
+ zfs_zone_delay_dec(iop);
}
/*
@@ -732,8 +760,8 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
* arg4: actual I/O utilization
*/
DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
- uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
- uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
+ uintptr_t, delay, uintptr_t, iop->zpers_io_delay,
+ uintptr_t, fairutil, uintptr_t, iop->zpers_io_util);
return (0);
}
@@ -823,10 +851,20 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
uint_t cnt;
zone_q_bump_t *qbp = arg;
zio_priority_t p = qbp->zq_queue;
+ zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+ zone_zfs_io_t *iop;
- cnt = zonep->zone_zfs_queued[p];
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ cnt = iop->zpers_zfs_queued[p];
if (cnt == 0) {
- zonep->zone_zfs_weight = 0;
+ iop->zpers_zfs_weight = 0;
+ mutex_exit(&zpd->zpers_zfs_lock);
return (0);
}
@@ -837,8 +875,8 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
* done any IO over several iterations will see their weight max
* out.
*/
- if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
- zonep->zone_zfs_weight++;
+ if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX)
+ iop->zpers_zfs_weight++;
/*
* This zone's IO priority is the inverse of the number of IOs
@@ -852,7 +890,7 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
* which haven't done IO in a while aren't getting starved.
*/
pri = (qbp->zq_qdepth / cnt) *
- zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+ iop->zpers_zfs_io_pri * iop->zpers_zfs_weight;
/*
* If this zone has a higher priority than what we found so far,
@@ -861,8 +899,9 @@ get_sched_pri_cb(zone_t *zonep, void *arg)
if (pri > qbp->zq_priority) {
qbp->zq_zoneid = zonep->zone_id;
qbp->zq_priority = pri;
- qbp->zq_wt = zonep->zone_zfs_weight;
+ qbp->zq_wt = iop->zpers_zfs_weight;
}
+ mutex_exit(&zpd->zpers_zfs_lock);
return (0);
}
@@ -996,8 +1035,10 @@ zfs_zone_zio_init(zio_t *zp)
void
zfs_zone_io_throttle(zfs_zone_iop_type_t type)
{
- zone_t *zonep = curzone;
- hrtime_t unow, last_checked;
+ zoneid_t zid = curzone->zone_id;
+ zone_persist_t *zpd = &zone_pdata[zid];
+ zone_zfs_io_t *iop;
+ hrtime_t unow;
uint16_t wait;
unow = GET_USEC_TIME;
@@ -1007,34 +1048,60 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type)
* tracking physical IO operations are handled in zfs_zone_zio_done.
*/
if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
- mutex_enter(&zonep->zone_stg_io_lock);
- add_iop(zonep, unow, type, 0);
- mutex_exit(&zonep->zone_stg_io_lock);
+ add_iop(zpd, unow, type, 0);
}
if (!zfs_zone_delay_enable)
return;
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+
/*
* If the zone's I/O priority is set to zero, don't throttle that zone's
* operations at all.
*/
- if (zonep->zone_zfs_io_pri == 0)
+ if (iop->zpers_zfs_io_pri == 0) {
+ mutex_exit(&zpd->zpers_zfs_lock);
return;
+ }
- /*
- * XXX There's a potential race here in that more than one thread may
- * update the zone delays concurrently. The worst outcome is corruption
- * of our data to track each zone's IO, so the algorithm may make
- * incorrect throttling decisions until the data is refreshed.
- */
- last_checked = zfs_zone_last_checked;
- if ((unow - last_checked) > zfs_zone_adjust_time) {
- zfs_zone_last_checked = unow;
- zfs_zone_wait_adjust(unow, last_checked);
+ /* Handle periodically updating the per-zone I/O parameters */
+ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+ hrtime_t last_checked;
+ boolean_t do_update = B_FALSE;
+
+ /* Recheck under mutex */
+ mutex_enter(&zfs_last_check_lock);
+ last_checked = zfs_zone_last_checked;
+ if ((unow - last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_last_checked = unow;
+ do_update = B_TRUE;
+ }
+ mutex_exit(&zfs_last_check_lock);
+
+ if (do_update) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ zfs_zone_wait_adjust(unow, last_checked);
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return;
+ }
+ }
}
- if ((wait = zonep->zone_io_delay) > 0) {
+ wait = iop->zpers_io_delay;
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ if (wait > 0) {
/*
* If this is a write and we're doing above normal TXG
* syncing, then throttle for longer than normal.
@@ -1050,15 +1117,15 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type)
* arg1: type of IO operation
* arg2: time to delay (in us)
*/
- DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
+ DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid,
uintptr_t, type, uintptr_t, wait);
drv_usecwait(wait);
- if (zonep->zone_vfs_stats != NULL) {
- atomic_inc_64(&zonep->zone_vfs_stats->
+ if (curzone->zone_vfs_stats != NULL) {
+ atomic_inc_64(&curzone->zone_vfs_stats->
zv_delay_cnt.value.ui64);
- atomic_add_64(&zonep->zone_vfs_stats->
+ atomic_add_64(&curzone->zone_vfs_stats->
zv_delay_time.value.ui64, wait);
}
}
@@ -1100,8 +1167,23 @@ zfs_zone_report_txg_sync(void *dp)
hrtime_t
zfs_zone_txg_delay()
{
- if (curzone->zone_io_util_above_avg)
+ zone_persist_t *zpd = &zone_pdata[curzone->zone_id];
+ zone_zfs_io_t *iop;
+ uint8_t above;
+
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop == NULL) {
+ mutex_exit(&zpd->zpers_zfs_lock);
+ return (0);
+ }
+
+ above = iop->zpers_io_util_above_avg;
+ mutex_exit(&zpd->zpers_zfs_lock);
+
+ if (above) {
return (zfs_zone_txg_delay_nsec);
+ }
return (MSEC2NSEC(10));
}
@@ -1114,7 +1196,8 @@ zfs_zone_txg_delay()
void
zfs_zone_zio_start(zio_t *zp)
{
- zone_t *zonep;
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
/*
* I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
@@ -1124,14 +1207,14 @@ zfs_zone_zio_start(zio_t *zp)
if (zp->io_type == ZIO_TYPE_IOCTL)
return;
- if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
- return;
-
- mutex_enter(&zonep->zone_zfs_lock);
- if (zp->io_type == ZIO_TYPE_READ)
- kstat_runq_enter(&zonep->zone_zfs_rwstats);
- zonep->zone_zfs_weight = 0;
- mutex_exit(&zonep->zone_zfs_lock);
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&iop->zpers_zfs_rwstats);
+ iop->zpers_zfs_weight = 0;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
mutex_enter(&zfs_disk_lock);
zp->io_dispatched = gethrtime();
@@ -1140,8 +1223,6 @@ zfs_zone_zio_start(zio_t *zp)
zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
zfs_disk_rlastupdate = zp->io_dispatched;
mutex_exit(&zfs_disk_lock);
-
- zone_rele(zonep);
}
/*
@@ -1152,7 +1233,8 @@ zfs_zone_zio_start(zio_t *zp)
void
zfs_zone_zio_done(zio_t *zp)
{
- zone_t *zonep;
+ zone_persist_t *zpd;
+ zone_zfs_io_t *iop;
hrtime_t now, unow, udelta;
if (zp->io_type == ZIO_TYPE_IOCTL)
@@ -1161,34 +1243,33 @@ zfs_zone_zio_done(zio_t *zp)
if (zp->io_dispatched == 0)
return;
- if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
- return;
+ zpd = &zone_pdata[zp->io_zoneid];
now = gethrtime();
unow = NANO_TO_MICRO(now);
udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
- mutex_enter(&zonep->zone_zfs_lock);
-
- /*
- * To calculate the wsvc_t average, keep a cumulative sum of all the
- * wait time before each I/O was dispatched. Since most writes are
- * asynchronous, only track the wait time for read I/Os.
- */
- if (zp->io_type == ZIO_TYPE_READ) {
- zonep->zone_zfs_rwstats.reads++;
- zonep->zone_zfs_rwstats.nread += zp->io_size;
-
- zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
- zp->io_dispatched - zp->io_timestamp;
-
- kstat_runq_exit(&zonep->zone_zfs_rwstats);
- } else {
- zonep->zone_zfs_rwstats.writes++;
- zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of
+ * all the wait time before each I/O was dispatched. Since most
+ * writes are asynchronous, only track the wait time for
+ * read I/Os.
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ iop->zpers_zfs_rwstats.reads++;
+ iop->zpers_zfs_rwstats.nread += zp->io_size;
+ iop->zpers_zfs_rd_waittime +=
+ zp->io_dispatched - zp->io_timestamp;
+ kstat_runq_exit(&iop->zpers_zfs_rwstats);
+ } else {
+ iop->zpers_zfs_rwstats.writes++;
+ iop->zpers_zfs_rwstats.nwritten += zp->io_size;
+ }
}
-
- mutex_exit(&zonep->zone_zfs_lock);
+ mutex_exit(&zpd->zpers_zfs_lock);
mutex_enter(&zfs_disk_lock);
zfs_disk_rcnt--;
@@ -1201,14 +1282,10 @@ zfs_zone_zio_done(zio_t *zp)
mutex_exit(&zfs_disk_lock);
if (zfs_zone_delay_enable) {
- mutex_enter(&zonep->zone_stg_io_lock);
- add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+ add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ?
ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
- mutex_exit(&zonep->zone_stg_io_lock);
}
- zone_rele(zonep);
-
/*
* sdt:::zfs-zone-latency
*
@@ -1224,7 +1301,8 @@ void
zfs_zone_zio_dequeue(zio_t *zp)
{
zio_priority_t p;
- zone_t *zonep;
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
p = zp->io_priority;
if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
@@ -1233,24 +1311,25 @@ zfs_zone_zio_dequeue(zio_t *zp)
/* We depend on p being defined as either 0 or 1 */
ASSERT(p < 2);
- if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
- return;
-
- mutex_enter(&zonep->zone_stg_io_lock);
- ASSERT(zonep->zone_zfs_queued[p] > 0);
- if (zonep->zone_zfs_queued[p] == 0)
- cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
- else
- zonep->zone_zfs_queued[p]--;
- mutex_exit(&zonep->zone_stg_io_lock);
- zone_rele(zonep);
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ ASSERT(iop->zpers_zfs_queued[p] > 0);
+ if (iop->zpers_zfs_queued[p] == 0) {
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ } else {
+ iop->zpers_zfs_queued[p]--;
+ }
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
}
void
zfs_zone_zio_enqueue(zio_t *zp)
{
zio_priority_t p;
- zone_t *zonep;
+ zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+ zone_zfs_io_t *iop;
p = zp->io_priority;
if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
@@ -1259,13 +1338,12 @@ zfs_zone_zio_enqueue(zio_t *zp)
/* We depend on p being defined as either 0 or 1 */
ASSERT(p < 2);
- if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
- return;
-
- mutex_enter(&zonep->zone_stg_io_lock);
- zonep->zone_zfs_queued[p]++;
- mutex_exit(&zonep->zone_stg_io_lock);
- zone_rele(zonep);
+ mutex_enter(&zpd->zpers_zfs_lock);
+ iop = zpd->zpers_zfsp;
+ if (iop != NULL) {
+ iop->zpers_zfs_queued[p]++;
+ }
+ mutex_exit(&zpd->zpers_zfs_lock);
}
/*
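The largest behavioral change in zfs_zone.c above is how the periodic throttle refresh is serialized: the old code tolerated a documented race on zfs_zone_last_checked, while the new code rechecks the timestamp under the new zfs_last_check_lock mutex and drops the per-zone zpers_zfs_lock before calling zfs_zone_wait_adjust(). The following is a minimal sketch of that double-checked idiom in isolation; maybe_refresh() is a hypothetical name and the per-zone locking is omitted.

static kmutex_t check_lock;             /* stands in for zfs_last_check_lock */
static hrtime_t last_checked;           /* stands in for zfs_zone_last_checked */
static hrtime_t adjust_time = 250000;   /* refresh interval, in usecs */

static void
maybe_refresh(hrtime_t unow)
{
        hrtime_t prev;
        boolean_t do_update = B_FALSE;

        /* Cheap unlocked check; most callers return here. */
        if ((unow - last_checked) <= adjust_time)
                return;

        /* Recheck under the mutex so exactly one thread wins the update. */
        mutex_enter(&check_lock);
        prev = last_checked;
        if ((unow - prev) > adjust_time) {
                last_checked = unow;
                do_update = B_TRUE;
        }
        mutex_exit(&check_lock);

        /* The heavy recalculation runs with no locks held. */
        if (do_update)
                zfs_zone_wait_adjust(unow, prev);
}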
diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c
index c6e54a75c1..f5ee76a2cb 100644
--- a/usr/src/uts/common/os/vm_pageout.c
+++ b/usr/src/uts/common/os/vm_pageout.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1249,7 +1249,7 @@ checkpage(struct page *pp, int whichhand)
ASSERT(pp->p_zoneid == ALL_ZONES ||
pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
if (pp->p_zoneid == ALL_ZONES ||
- zone_pcap_data[pp->p_zoneid].zpcap_over == 0) {
+ zone_pdata[pp->p_zoneid].zpers_over == 0) {
/*
* Cross-zone shared page, or zone not over its cap.
* Leave the page alone.
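The vm_pageout.c hunk is a straight rename, but it shows the reader side of the design: the page scanner indexes zone_pdata[] by zone id and reads zpers_over with no locking at all, because the physical memory cap is a soft cap and the scanner must never block behind zone bookkeeping. A sketch of that check, with zone_page_reclaim_candidate() as a hypothetical helper name:

static boolean_t
zone_page_reclaim_candidate(page_t *pp)
{
        /* Nothing to do unless some zone is currently over its cap. */
        if (zone_num_over_cap == 0)
                return (B_FALSE);

        /* Cross-zone shared pages are never reclaimed on a zone's behalf. */
        if (pp->p_zoneid == ALL_ZONES)
                return (B_FALSE);

        /* Unlocked read; worst case is a briefly stale over/under decision. */
        return (zone_pdata[pp->p_zoneid].zpers_over != 0);
}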
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 843adc1ee0..06a8549c5b 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017, Joyent Inc.
+ * Copyright 2018, Joyent Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -313,6 +313,7 @@ static id_space_t *zoneid_space;
* 'global_zone'.
*/
zone_t zone0;
+zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL; /* Set when the global zone is initialized */
/*
@@ -429,11 +430,18 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
static const int ZONE_SYSCALL_API_VERSION = 7;
/*
- * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's
- * current page usage, its page limit, a flag indicating if the zone is
- * over its physical memory cap and various statistics. The zpcap_over flag is
- * the interface for the page scanner to use when reclaiming pages for zones
- * that are over their cap.
+ * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
+ * data which can be referenced independently of the zone_t structure. This
+ * data falls into two categories:
+ * 1) pages and RSS data associated with processes inside a zone
+ * 2) in-flight ZFS I/O data
+ *
+ * Each member of zone_persist_t stores the zone's current page usage, its page
+ * limit, a flag indicating if the zone is over its physical memory cap and
+ * various page-related statistics. The zpers_over flag is the interface for
+ * the page scanner to use when reclaiming pages for zones that are over their
+ * cap. The zone_persist_t structure also includes a mutex and a reference to a
+ * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
*
* All zone physical memory cap data is stored in this array instead of within
* the zone structure itself. This is because zone structures come and go, but
@@ -448,33 +456,40 @@ static const int ZONE_SYSCALL_API_VERSION = 7;
* page scanning.
*
* The page scanner can run when "zone_num_over_cap" is non-zero. It can
- * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine
+ * do a direct lookup of a zoneid into the "zone_pdata" array to determine
* if that zone is over its cap.
*
* There is no locking for the page scanner to perform these two checks.
* We cannot have the page scanner blocking normal paging activity for
* running processes. Because the physical memory cap is a soft cap, it is
* fine for the scanner to simply read the current state of the counter and
- * the zone's zpcap_over entry in the array. The scanner should never modify
+ * the zone's zpers_over entry in the array. The scanner should never modify
* either of these items. Internally the entries and the counter are managed
* with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
* take care to ensure that we only take the zone_physcap_lock mutex when a
* zone is transitioning over/under its physical memory cap.
*
* The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
- * the "zone_pcap_data" array and associated counter.
+ * the "zone_pdata" array and associated counter.
*
- * The zone_pcap_t structure tracks the zone's physical cap and phyiscal usage
- * in terms of pages. These values are currently defined as uint32. Thus, the
- * maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) since
- * UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
+ * The zone_persist_t structure tracks the zone's physical cap and physical
+ * usage in terms of pages. These values are currently defined as uint32. Thus,
+ * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295)
+ * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
* zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
* In the future we may need to expand these counters to 64-bit, but for now
* we're using 32-bit to conserve memory, since this array is statically
* allocated within the kernel based on the maximum number of zones supported.
+ *
+ * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
+ * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
+ * had to continuously find the zone structure associated with an I/O that has
+ * just completed. To avoid that overhead, we track the I/O data within the
+ * zone_zfs_io_t instead. We can directly access that data without having to
+ * lookup the full zone_t structure.
*/
uint_t zone_num_over_cap;
-zone_pcap_t zone_pcap_data[MAX_ZONES];
+zone_persist_t zone_pdata[MAX_ZONES];
static kmutex_t zone_physcap_lock;
/*
@@ -1509,8 +1524,16 @@ static rctl_ops_t zone_cpu_burst_time_ops = {
static rctl_qty_t
zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
{
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
+ rctl_qty_t r = 0;
+
ASSERT(MUTEX_HELD(&p->p_lock));
- return (p->p_zone->zone_zfs_io_pri);
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
+ mutex_exit(&zp->zpers_zfs_lock);
+
+ return (r);
}
/*ARGSUSED*/
@@ -1519,6 +1542,7 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
rctl_qty_t nv)
{
zone_t *zone = e->rcep_p.zone;
+ zone_persist_t *zp;
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(e->rcep_t == RCENTITY_ZONE);
@@ -1529,7 +1553,11 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
/*
* set priority to the new value.
*/
- zone->zone_zfs_io_pri = nv;
+ zp = &zone_pdata[zone->zone_id];
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp != NULL)
+ zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
+ mutex_exit(&zp->zpers_zfs_lock);
return (0);
}
@@ -1871,10 +1899,10 @@ static rctl_qty_t
zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
- zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id];
+ zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
ASSERT(MUTEX_HELD(&p->p_lock));
- q = ptob(zp->zpcap_pg_cnt);
+ q = ptob(zp->zpers_pg_cnt);
return (q);
}
@@ -1906,7 +1934,7 @@ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
pg_val = (uint_t)pages;
}
}
- zone_pcap_data[zid].zpcap_pg_limit = pg_val;
+ zone_pdata[zid].zpers_pg_limit = pg_val;
return (0);
}
@@ -2016,13 +2044,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_kstat_t *zk = ksp->ks_data;
- zone_pcap_t *zp = &zone_pcap_data[zone->zone_id];
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
if (rw == KSTAT_WRITE)
return (EACCES);
- zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt);
- zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit);
+ zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
return (0);
}
@@ -2170,26 +2198,42 @@ zone_zfs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_zfs_kstat_t *zzp = ksp->ks_data;
- kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+ zone_persist_t *zp = &zone_pdata[zone->zone_id];
if (rw == KSTAT_WRITE)
return (EACCES);
- /*
- * Extract the ZFS statistics from the kstat_io_t structure used by
- * kstat_runq_enter() and related functions. Since the I/O throttle
- * counters are updated directly by the ZFS layer, there's no need to
- * copy those statistics here.
- *
- * Note that kstat_runq_enter() and the related functions use
- * gethrtime_unscaled(), so scale the time here.
- */
- zzp->zz_nread.value.ui64 = kiop->nread;
- zzp->zz_reads.value.ui64 = kiop->reads;
- zzp->zz_rtime.value.ui64 = kiop->rtime;
- zzp->zz_rlentime.value.ui64 = kiop->rlentime;
- zzp->zz_nwritten.value.ui64 = kiop->nwritten;
- zzp->zz_writes.value.ui64 = kiop->writes;
+ mutex_enter(&zp->zpers_zfs_lock);
+ if (zp->zpers_zfsp == NULL) {
+ zzp->zz_nread.value.ui64 = 0;
+ zzp->zz_reads.value.ui64 = 0;
+ zzp->zz_rtime.value.ui64 = 0;
+ zzp->zz_rlentime.value.ui64 = 0;
+ zzp->zz_nwritten.value.ui64 = 0;
+ zzp->zz_writes.value.ui64 = 0;
+ zzp->zz_waittime.value.ui64 = 0;
+ } else {
+ kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure
+ * used by kstat_runq_enter() and related functions. Since the
+ * I/O throttle counters are updated directly by the ZFS layer,
+ * there's no need to copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+ zzp->zz_waittime.value.ui64 =
+ zp->zpers_zfsp->zpers_zfs_rd_waittime;
+ }
+ mutex_exit(&zp->zpers_zfs_lock);
scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
@@ -2240,23 +2284,23 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
if (rw == KSTAT_WRITE)
return (EACCES);
- zp = &zone_pcap_data[zone->zone_id];
+ zp = &zone_pdata[zone->zone_id];
- zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt);
- zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit);
+ zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
+ zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
zmp->zm_swap.value.ui64 = zone->zone_max_swap;
zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
- zmp->zm_nover.value.ui64 = zp->zpcap_nover;
+ zmp->zm_nover.value.ui64 = zp->zpers_nover;
#ifndef DEBUG
- zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out);
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
#else
- zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty +
- zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty);
+ zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
+ zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
#endif
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
@@ -2523,11 +2567,13 @@ zone_zsd_init(void)
zone0.zone_swapresv_kstat = NULL;
zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
- zone0.zone_zfs_io_pri = 1;
zone0.zone_stime = 0;
zone0.zone_utime = 0;
zone0.zone_wtime = 0;
+ zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
+ zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2839,7 +2885,7 @@ zone_free(zone_t *zone)
cpucaps_zone_remove(zone);
/* Clear physical memory capping data. */
- bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t));
+ bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
ASSERT(zone->zone_cpucap == NULL);
@@ -5090,7 +5136,10 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_lockedmem_kstat = NULL;
zone->zone_swapresv_kstat = NULL;
zone->zone_physmem_kstat = NULL;
- zone->zone_zfs_io_pri = 1;
+
+ zone_pdata[zoneid].zpers_zfsp =
+ kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
+ zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
@@ -5101,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root,
* Ensure page count is 0 (in case zoneid has wrapped).
* Initialize physical memory cap as unlimited.
*/
- zone_pcap_data[zoneid].zpcap_pg_cnt = 0;
- zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX;
+ zone_pdata[zoneid].zpers_pg_cnt = 0;
+ zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
zone_free(zone);
@@ -5741,6 +5790,7 @@ zone_destroy(zoneid_t zoneid)
zone_status_t status;
clock_t wait_time;
boolean_t log_refcounts;
+ zone_persist_t *zp;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -5774,6 +5824,12 @@ zone_destroy(zoneid_t zoneid)
zone_hold(zone);
mutex_exit(&zonehash_lock);
+ zp = &zone_pdata[zoneid];
+ mutex_enter(&zp->zpers_zfs_lock);
+ kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
+ zp->zpers_zfsp = NULL;
+ mutex_exit(&zp->zpers_zfs_lock);
+
/*
* wait for zsched to exit
*/
@@ -8075,18 +8131,18 @@ done:
static void
zone_incr_capped(zoneid_t zid)
{
- zone_pcap_t *zp = &zone_pcap_data[zid];
+ zone_persist_t *zp = &zone_pdata[zid];
/* See if over (unlimited is UINT32_MAX), or already marked that way. */
- if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) {
+ if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
return;
}
mutex_enter(&zone_physcap_lock);
/* Recheck setting under mutex */
- if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) {
- zp->zpcap_over = 1;
- zp->zpcap_nover++;
+ if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
+ zp->zpers_over = 1;
+ zp->zpers_nover++;
zone_num_over_cap++;
DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
}
@@ -8114,29 +8170,29 @@ zone_incr_capped(zoneid_t zid)
static void
zone_decr_capped(zoneid_t zid)
{
- zone_pcap_t *zp = &zone_pcap_data[zid];
+ zone_persist_t *zp = &zone_pdata[zid];
uint32_t adjusted_limit;
/*
* See if under, or already marked that way. There is no need to
- * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX)
- * since we'll never set zpcap_over in zone_incr_capped().
+ * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
+ * since we'll never set zpers_over in zone_incr_capped().
*/
- if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) {
+ if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
return;
}
- adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7);
+ adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
/* Recheck, accounting for our hysteresis. */
- if (zp->zpcap_pg_cnt >= adjusted_limit) {
+ if (zp->zpers_pg_cnt >= adjusted_limit) {
return;
}
mutex_enter(&zone_physcap_lock);
/* Recheck under mutex. */
- if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) {
- zp->zpcap_over = 0;
+ if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
+ zp->zpers_over = 0;
ASSERT(zone_num_over_cap > 0);
zone_num_over_cap--;
DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
@@ -8154,7 +8210,7 @@ void
zone_add_page(page_t *pp)
{
uint_t pcnt;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
zoneid_t zid;
/* Skip pages in segkmem, etc. (KV_KVP, ...) */
@@ -8179,9 +8235,9 @@ zone_add_page(page_t *pp)
if (pp->p_share == 0) {
/* First mapping to this page. */
pp->p_zoneid = zid;
- zp = &zone_pcap_data[zid];
- ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX);
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt);
+ zp = &zone_pdata[zid];
+ ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
zone_incr_capped(zid);
return;
}
@@ -8194,10 +8250,10 @@ zone_add_page(page_t *pp)
zid = pp->p_zoneid;
pp->p_zoneid = ALL_ZONES;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
- if (zp->zpcap_pg_cnt > 0) {
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
}
zone_decr_capped(zid);
}
@@ -8207,7 +8263,7 @@ void
zone_rm_page(page_t *pp)
{
uint_t pcnt;
- zone_pcap_t *zp;
+ zone_persist_t *zp;
zoneid_t zid;
/* Skip pages in segkmem, etc. (KV_KVP, ...) */
@@ -8227,9 +8283,9 @@ zone_rm_page(page_t *pp)
}
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
- if (zp->zpcap_pg_cnt > 0) {
- atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt);
+ zp = &zone_pdata[zid];
+ if (zp->zpers_pg_cnt > 0) {
+ atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
}
zone_decr_capped(zid);
pp->p_zoneid = ALL_ZONES;
@@ -8238,29 +8294,29 @@ zone_rm_page(page_t *pp)
void
zone_pageout_stat(int zid, zone_pageout_op_t op)
{
- zone_pcap_t *zp;
+ zone_persist_t *zp;
if (zid == ALL_ZONES)
return;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
#ifndef DEBUG
- atomic_add_64(&zp->zpcap_pg_out, 1);
+ atomic_add_64(&zp->zpers_pg_out, 1);
#else
switch (op) {
case ZPO_DIRTY:
- atomic_add_64(&zp->zpcap_pg_fsdirty, 1);
+ atomic_add_64(&zp->zpers_pg_fsdirty, 1);
break;
case ZPO_FS:
- atomic_add_64(&zp->zpcap_pg_fs, 1);
+ atomic_add_64(&zp->zpers_pg_fs, 1);
break;
case ZPO_ANON:
- atomic_add_64(&zp->zpcap_pg_anon, 1);
+ atomic_add_64(&zp->zpers_pg_anon, 1);
break;
case ZPO_ANONDIRTY:
- atomic_add_64(&zp->zpcap_pg_anondirty, 1);
+ atomic_add_64(&zp->zpers_pg_anondirty, 1);
break;
default:
cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
@@ -8275,23 +8331,23 @@ zone_pageout_stat(int zid, zone_pageout_op_t op)
void
zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
{
- zone_pcap_t *zp;
+ zone_persist_t *zp;
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- zp = &zone_pcap_data[zid];
+ zp = &zone_pdata[zid];
/*
* If memory or swap limits are set on the zone, use those, otherwise
* use the system values. physmem and freemem are also in pages.
*/
- if (zp->zpcap_pg_limit == UINT32_MAX) {
+ if (zp->zpers_pg_limit == UINT32_MAX) {
*memcap = physmem;
*free = freemem;
} else {
int64_t freemem;
- *memcap = (pgcnt_t)zp->zpcap_pg_limit;
- freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt;
+ *memcap = (pgcnt_t)zp->zpers_pg_limit;
+ freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
if (freemem > 0) {
*free = (pgcnt_t)freemem;
} else {
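One detail worth calling out in zone_decr_capped() above is the hysteresis: a zone is only marked back under its cap once its page count falls at least pg_limit/128 below the limit, so a zone hovering right at its cap does not flap zpers_over (and zone_num_over_cap) on every page add/remove. A standalone illustration, with pcap_clear_threshold() as a hypothetical name:

static uint32_t
pcap_clear_threshold(uint32_t pg_limit)
{
        /*
         * ~0.78% of slack.  For a 1 GiB cap with 4 KiB pages,
         * pg_limit = 262144 and the threshold is 262144 - 2048 = 260096,
         * i.e. usage must drop ~8 MiB below the cap before zpers_over clears.
         */
        return (pg_limit - (pg_limit >> 7));
}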
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 1cca1e7555..87253134fd 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -22,7 +22,7 @@
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
- * Copyright 2017, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_ZONE_H
@@ -624,20 +624,6 @@ typedef struct zone {
struct cpucap *zone_cpucap; /* CPU caps data */
/*
- * Data and counters used for ZFS fair-share disk IO.
- */
- rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */
- uint_t zone_zfs_queued[2]; /* sync I/O enqueued count */
- uint64_t zone_zfs_weight; /* used to prevent starvation */
- uint64_t zone_io_util; /* IO utilization metric */
- boolean_t zone_io_util_above_avg; /* IO util percent > avg. */
- uint16_t zone_io_delay; /* IO delay on logical r/w */
- kmutex_t zone_stg_io_lock; /* protects IO window data */
- sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */
- sys_zio_cntr_t zone_wr_ops; /* writes and */
- sys_zio_cntr_t zone_lwr_ops; /* logical writes. */
-
- /*
* kstats and counters for VFS ops and bytes.
*/
kmutex_t zone_vfs_lock; /* protects VFS statistics */
@@ -650,7 +636,6 @@ typedef struct zone {
*/
kmutex_t zone_zfs_lock; /* protects ZFS statistics */
kstat_t *zone_zfs_ksp;
- kstat_io_t zone_zfs_rwstats;
zone_zfs_kstat_t *zone_zfs_stats;
/*
@@ -738,25 +723,48 @@ typedef struct zone {
kmutex_t zone_mount_lock;
} zone_t;
-/* zpcap_over is treated as a boolean but is 32 bits for alignment. */
-typedef struct zone_pcap {
- uint32_t zpcap_over; /* currently over cap */
- uint32_t zpcap_pg_cnt; /* current RSS in pages */
- uint32_t zpcap_pg_limit; /* current RRS limit in pages */
- uint32_t zpcap_nover; /* # of times over phys. cap */
+/*
+ * Data and counters used for ZFS fair-share disk IO.
+ */
+typedef struct zone_zfs_io {
+ uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */
+ uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */
+ sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */
+ sys_zio_cntr_t zpers_wr_ops; /* writes, and */
+ sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */
+ kstat_io_t zpers_zfs_rwstats;
+ uint64_t zpers_io_util; /* IO utilization metric */
+ uint64_t zpers_zfs_rd_waittime;
+ uint8_t zpers_io_delay; /* IO delay on logical r/w */
+ uint8_t zpers_zfs_weight; /* used to prevent starvation */
+ uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */
+} zone_zfs_io_t;
+
+/*
+ * "Persistent" zone data which can be accessed idependently of the zone_t.
+ */
+typedef struct zone_persist {
+ kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */
+ zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */
+ uint8_t zpers_over; /* currently over cap */
+ uint32_t zpers_pg_cnt; /* current RSS in pages */
+ uint32_t zpers_pg_limit; /* current RSS limit in pages */
+ uint32_t zpers_nover; /* # of times over phys. cap */
#ifndef DEBUG
- uint64_t zpcap_pg_out; /* # pages flushed */
+ uint64_t zpers_pg_out; /* # pages flushed */
#else
/*
- * To conserve memory, detailed pageout stats are only kept for DEBUG
+ * To conserve memory, some detailed kstats are only kept for DEBUG
* builds.
*/
- uint64_t zpcap_pg_anon; /* # clean anon pages flushed */
- uint64_t zpcap_pg_anondirty; /* # dirty anon pages flushed */
- uint64_t zpcap_pg_fs; /* # clean fs pages flushed */
- uint64_t zpcap_pg_fsdirty; /* # dirty fs pages flushed */
+ uint64_t zpers_zfs_rd_waittime;
+
+ uint64_t zpers_pg_anon; /* # clean anon pages flushed */
+ uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */
+ uint64_t zpers_pg_fs; /* # clean fs pages flushed */
+ uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */
#endif
-} zone_pcap_t;
+} zone_persist_t;
typedef enum zone_pageout_op {
ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY
@@ -994,7 +1002,7 @@ extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *);
/* Interfaces for page scanning */
extern uint_t zone_num_over_cap;
-extern zone_pcap_t zone_pcap_data[MAX_ZONES];
+extern zone_persist_t zone_pdata[MAX_ZONES];
extern rctl_hndl_t rc_zone_locked_mem;
extern rctl_hndl_t rc_zone_max_swap;
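The new zone_persist_t layout above also dictates how ZFS I/O state is reached from now on: callers index zone_pdata[] directly (no zone_hold()/zone_rele()), take zpers_zfs_lock, and must tolerate a NULL zpers_zfsp, since zone_destroy() frees that structure while the array slot lives on. A sketch of that consumer-side pattern, with zfs_io_pri_bump() as a hypothetical example caller:

static void
zfs_io_pri_bump(zoneid_t zid)
{
        zone_persist_t *zpd = &zone_pdata[zid];   /* direct lookup, no hold */
        zone_zfs_io_t *iop;

        mutex_enter(&zpd->zpers_zfs_lock);
        iop = zpd->zpers_zfsp;
        if (iop != NULL && iop->zpers_zfs_io_pri < UINT16_MAX)
                iop->zpers_zfs_io_pri++;          /* zone still alive; adjust */
        mutex_exit(&zpd->zpers_zfs_lock);
}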
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index 10017d27ef..01c2666e91 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright 2017, Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
/*
@@ -1691,7 +1691,7 @@ vmu_get_zone_rss(zoneid_t zid)
}
ASSERT(zid >= 0 && zid <= MAX_ZONEID);
- pgcnt = zone_pcap_data[zid].zpcap_pg_cnt;
+ pgcnt = zone_pdata[zid].zpers_pg_cnt;
zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt);
zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap;
@@ -1728,7 +1728,7 @@ vmu_calculate()
int i;
for (i = 0; i <= MAX_ZONEID; i++) {
- if (zone_pcap_data[i].zpcap_pg_cnt > 0) {
+ if (zone_pdata[i].zpers_pg_cnt > 0) {
vmu_get_zone_rss(i);
}
}