Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 344
-rw-r--r--  usr/src/uts/common/os/vm_pageout.c   |   4
-rw-r--r--  usr/src/uts/common/os/zone.c         | 230
-rw-r--r--  usr/src/uts/common/sys/zone.h        |  68
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c     |   6
5 files changed, 397 insertions, 255 deletions
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c index 4861c64f8e..59357cbee5 100644 --- a/usr/src/uts/common/fs/zfs/zfs_zone.c +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015, Joyent, Inc. All rights reserved. + * Copyright 2018, Joyent, Inc. All rights reserved. */ /* @@ -166,8 +166,8 @@ zfs_zone_txg_delay() * over the previous window. */ boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ -uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ -uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */ +uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ @@ -238,9 +238,9 @@ uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ typedef struct { hrtime_t cycle_start; - int cycle_cnt; hrtime_t cycle_lat; hrtime_t sys_avg_lat; + uint_t cycle_cnt; } sys_lat_cycle_t; typedef struct { @@ -275,6 +275,7 @@ hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ /* time that we last updated per-zone throttle info */ +kmutex_t zfs_last_check_lock; /* protects zfs_zone_last_checked */ hrtime_t zfs_zone_last_checked = 0; hrtime_t zfs_disk_last_laggard = 0; @@ -412,22 +413,32 @@ compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) * Add IO op data to the zone. */ static void -add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op) { + zone_zfs_io_t *iop; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + switch (op) { case ZFS_ZONE_IOP_READ: - (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); - zonep->zone_rd_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops); + iop->zpers_rd_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_WRITE: - (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); - zonep->zone_wr_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops); + iop->zpers_wr_ops.cycle_cnt++; break; case ZFS_ZONE_IOP_LOGICAL_WRITE: - (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); - zonep->zone_lwr_ops.cycle_cnt++; + (void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops); + iop->zpers_lwr_ops.cycle_cnt++; break; } + mutex_exit(&zpd->zpers_zfs_lock); } /* @@ -502,13 +513,13 @@ add_sys_iop(hrtime_t unow, int op, int lat) switch (op) { case ZFS_ZONE_IOP_READ: (void) compute_new_sys_avg(unow, &rd_lat); - rd_lat.cycle_cnt++; - rd_lat.cycle_lat += lat; + atomic_inc_uint(&rd_lat.cycle_cnt); + atomic_add_64((uint64_t *)&rd_lat.cycle_lat, (int64_t)lat); break; case ZFS_ZONE_IOP_WRITE: (void) compute_new_sys_avg(unow, &wr_lat); - wr_lat.cycle_cnt++; - wr_lat.cycle_lat += lat; + atomic_inc_uint(&wr_lat.cycle_cnt); + atomic_add_64((uint64_t *)&wr_lat.cycle_lat, (int64_t)lat); break; } } @@ -575,10 +586,11 @@ calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) * The latency parameter is in usecs. 
*/ static void -add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op, + hrtime_t lat) { /* Add op to zone */ - add_zone_iop(zonep, unow, op); + add_zone_iop(zpd, unow, op); /* Track system latency */ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) @@ -591,14 +603,16 @@ add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) * return a non-zero value, otherwise return 0. */ static int -get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, +get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops, uint_t *lwops) { - *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); - *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); - *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + ASSERT3P(zpd, !=, NULL); - DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id, + *rops = calc_zone_cnt(unow, &zpd->zpers_rd_ops); + *wops = calc_zone_cnt(unow, &zpd->zpers_wr_ops); + *lwops = calc_zone_cnt(unow, &zpd->zpers_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd, uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); return (*rops | *wops | *lwops); @@ -637,20 +651,24 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) { zoneio_stats_t *sp = arg; uint_t rops, wops, lwops; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; + + ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock)); + ASSERT3P(iop, !=, NULL); if (zonep->zone_id == GLOBAL_ZONEID || - get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { - zonep->zone_io_util = 0; + get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) { return (0); } - zonep->zone_io_util = (rops * sp->zi_avgrlat) + - (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); - sp->zi_totutil += zonep->zone_io_util; + iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) + + (lwops * sp->zi_avgwlat); + sp->zi_totutil += iop->zpers_io_util; - if (zonep->zone_io_util > 0) { + if (iop->zpers_io_util > 0) { sp->zi_active++; - sp->zi_totpri += zonep->zone_zfs_io_pri; + sp->zi_totpri += iop->zpers_zfs_io_pri; } /* @@ -665,23 +683,27 @@ zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) */ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, uint_t, rops, uint_t, wops, uint_t, lwops, - uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri); + uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri); return (0); } static void -zfs_zone_delay_inc(zone_t *zonep) +zfs_zone_delay_inc(zone_zfs_io_t *zpd) { - if (zonep->zone_io_delay < zfs_zone_delay_ceiling) - zonep->zone_io_delay += zfs_zone_delay_step; + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) + zpd->zpers_io_delay += zfs_zone_delay_step; } static void -zfs_zone_delay_dec(zone_t *zonep) +zfs_zone_delay_dec(zone_zfs_io_t *zpd) { - if (zonep->zone_io_delay > 0) - zonep->zone_io_delay -= zfs_zone_delay_step; + ASSERT3P(zpd, !=, NULL); + + if (zpd->zpers_io_delay > 0) + zpd->zpers_io_delay -= zfs_zone_delay_step; } /* @@ -691,18 +713,24 @@ zfs_zone_delay_dec(zone_t *zonep) static int zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) { + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop = zpd->zpers_zfsp; zoneio_stats_t *sp = arg; - uint16_t delay = zonep->zone_io_delay; + uint8_t delay; uint_t fairutil = 0; - zonep->zone_io_util_above_avg = B_FALSE; + ASSERT(MUTEX_HELD(&zpd->zpers_zfs_lock)); + ASSERT3P(iop, !=, NULL); + + 
delay = iop->zpers_io_delay; + iop->zpers_io_util_above_avg = 0; /* * Given the calculated total utilitzation for all zones, calculate the * fair share of I/O for this zone. */ if (zfs_zone_priority_enable && sp->zi_totpri > 0) { - fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + fairutil = (sp->zi_totutil * iop->zpers_zfs_io_pri) / sp->zi_totpri; } else if (sp->zi_active > 0) { fairutil = sp->zi_totutil / sp->zi_active; @@ -712,14 +740,14 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) * Adjust each IO's delay. If the overall delay becomes too high, avoid * increasing beyond the ceiling value. */ - if (zonep->zone_io_util > fairutil && sp->zi_overutil) { - zonep->zone_io_util_above_avg = B_TRUE; + if (iop->zpers_io_util > fairutil && sp->zi_overutil) { + iop->zpers_io_util_above_avg = 1; if (sp->zi_active > 1) - zfs_zone_delay_inc(zonep); - } else if (zonep->zone_io_util < fairutil || sp->zi_underutil || + zfs_zone_delay_inc(iop); + } else if (iop->zpers_io_util < fairutil || sp->zi_underutil || sp->zi_active <= 1) { - zfs_zone_delay_dec(zonep); + zfs_zone_delay_dec(iop); } /* @@ -732,8 +760,8 @@ zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) * arg4: actual I/O utilization */ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, - uintptr_t, delay, uintptr_t, zonep->zone_io_delay, - uintptr_t, fairutil, uintptr_t, zonep->zone_io_util); + uintptr_t, delay, uintptr_t, iop->zpers_io_delay, + uintptr_t, fairutil, uintptr_t, iop->zpers_io_util); return (0); } @@ -823,10 +851,20 @@ get_sched_pri_cb(zone_t *zonep, void *arg) uint_t cnt; zone_q_bump_t *qbp = arg; zio_priority_t p = qbp->zq_queue; + zone_persist_t *zpd = &zone_pdata[zonep->zone_id]; + zone_zfs_io_t *iop; - cnt = zonep->zone_zfs_queued[p]; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + cnt = iop->zpers_zfs_queued[p]; if (cnt == 0) { - zonep->zone_zfs_weight = 0; + iop->zpers_zfs_weight = 0; + mutex_exit(&zpd->zpers_zfs_lock); return (0); } @@ -837,8 +875,8 @@ get_sched_pri_cb(zone_t *zonep, void *arg) * done any IO over several iterations will see their weight max * out. */ - if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) - zonep->zone_zfs_weight++; + if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX) + iop->zpers_zfs_weight++; /* * This zone's IO priority is the inverse of the number of IOs @@ -852,7 +890,7 @@ get_sched_pri_cb(zone_t *zonep, void *arg) * which haven't done IO in a while aren't getting starved. */ pri = (qbp->zq_qdepth / cnt) * - zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + iop->zpers_zfs_io_pri * iop->zpers_zfs_weight; /* * If this zone has a higher priority than what we found so far, @@ -861,8 +899,9 @@ get_sched_pri_cb(zone_t *zonep, void *arg) if (pri > qbp->zq_priority) { qbp->zq_zoneid = zonep->zone_id; qbp->zq_priority = pri; - qbp->zq_wt = zonep->zone_zfs_weight; + qbp->zq_wt = iop->zpers_zfs_weight; } + mutex_exit(&zpd->zpers_zfs_lock); return (0); } @@ -996,8 +1035,10 @@ zfs_zone_zio_init(zio_t *zp) void zfs_zone_io_throttle(zfs_zone_iop_type_t type) { - zone_t *zonep = curzone; - hrtime_t unow, last_checked; + zoneid_t zid = curzone->zone_id; + zone_persist_t *zpd = &zone_pdata[zid]; + zone_zfs_io_t *iop; + hrtime_t unow; uint16_t wait; unow = GET_USEC_TIME; @@ -1007,34 +1048,60 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type) * tracking physical IO operations are handled in zfs_zone_zio_done. 
*/ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { - mutex_enter(&zonep->zone_stg_io_lock); - add_iop(zonep, unow, type, 0); - mutex_exit(&zonep->zone_stg_io_lock); + add_iop(zpd, unow, type, 0); } if (!zfs_zone_delay_enable) return; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + /* * If the zone's I/O priority is set to zero, don't throttle that zone's * operations at all. */ - if (zonep->zone_zfs_io_pri == 0) + if (iop->zpers_zfs_io_pri == 0) { + mutex_exit(&zpd->zpers_zfs_lock); return; + } - /* - * XXX There's a potential race here in that more than one thread may - * update the zone delays concurrently. The worst outcome is corruption - * of our data to track each zone's IO, so the algorithm may make - * incorrect throttling decisions until the data is refreshed. - */ - last_checked = zfs_zone_last_checked; - if ((unow - last_checked) > zfs_zone_adjust_time) { - zfs_zone_last_checked = unow; - zfs_zone_wait_adjust(unow, last_checked); + /* Handle periodically updating the per-zone I/O parameters */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + hrtime_t last_checked; + boolean_t do_update = B_FALSE; + + /* Recheck under mutex */ + mutex_enter(&zfs_last_check_lock); + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + do_update = B_TRUE; + } + mutex_exit(&zfs_last_check_lock); + + if (do_update) { + mutex_exit(&zpd->zpers_zfs_lock); + + zfs_zone_wait_adjust(unow, last_checked); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return; + } + } } - if ((wait = zonep->zone_io_delay) > 0) { + wait = iop->zpers_io_delay; + mutex_exit(&zpd->zpers_zfs_lock); + + if (wait > 0) { /* * If this is a write and we're doing above normal TXG * syncing, then throttle for longer than normal. 
@@ -1050,15 +1117,15 @@ zfs_zone_io_throttle(zfs_zone_iop_type_t type) * arg1: type of IO operation * arg2: time to delay (in us) */ - DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid, uintptr_t, type, uintptr_t, wait); drv_usecwait(wait); - if (zonep->zone_vfs_stats != NULL) { - atomic_inc_64(&zonep->zone_vfs_stats-> + if (curzone->zone_vfs_stats != NULL) { + atomic_inc_64(&curzone->zone_vfs_stats-> zv_delay_cnt.value.ui64); - atomic_add_64(&zonep->zone_vfs_stats-> + atomic_add_64(&curzone->zone_vfs_stats-> zv_delay_time.value.ui64, wait); } } @@ -1100,8 +1167,23 @@ zfs_zone_report_txg_sync(void *dp) hrtime_t zfs_zone_txg_delay() { - if (curzone->zone_io_util_above_avg) + zone_persist_t *zpd = &zone_pdata[curzone->zone_id]; + zone_zfs_io_t *iop; + uint8_t above; + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop == NULL) { + mutex_exit(&zpd->zpers_zfs_lock); + return (0); + } + + above = iop->zpers_io_util_above_avg; + mutex_exit(&zpd->zpers_zfs_lock); + + if (above) { return (zfs_zone_txg_delay_nsec); + } return (MSEC2NSEC(10)); } @@ -1114,7 +1196,8 @@ zfs_zone_txg_delay() void zfs_zone_zio_start(zio_t *zp) { - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; /* * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for @@ -1124,14 +1207,14 @@ zfs_zone_zio_start(zio_t *zp) if (zp->io_type == ZIO_TYPE_IOCTL) return; - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_zfs_lock); - if (zp->io_type == ZIO_TYPE_READ) - kstat_runq_enter(&zonep->zone_zfs_rwstats); - zonep->zone_zfs_weight = 0; - mutex_exit(&zonep->zone_zfs_lock); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&iop->zpers_zfs_rwstats); + iop->zpers_zfs_weight = 0; + } + mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zp->io_dispatched = gethrtime(); @@ -1140,8 +1223,6 @@ zfs_zone_zio_start(zio_t *zp) zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); zfs_disk_rlastupdate = zp->io_dispatched; mutex_exit(&zfs_disk_lock); - - zone_rele(zonep); } /* @@ -1152,7 +1233,8 @@ zfs_zone_zio_start(zio_t *zp) void zfs_zone_zio_done(zio_t *zp) { - zone_t *zonep; + zone_persist_t *zpd; + zone_zfs_io_t *iop; hrtime_t now, unow, udelta; if (zp->io_type == ZIO_TYPE_IOCTL) @@ -1161,34 +1243,33 @@ zfs_zone_zio_done(zio_t *zp) if (zp->io_dispatched == 0) return; - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; + zpd = &zone_pdata[zp->io_zoneid]; now = gethrtime(); unow = NANO_TO_MICRO(now); udelta = unow - NANO_TO_MICRO(zp->io_dispatched); - mutex_enter(&zonep->zone_zfs_lock); - - /* - * To calculate the wsvc_t average, keep a cumulative sum of all the - * wait time before each I/O was dispatched. Since most writes are - * asynchronous, only track the wait time for read I/Os. - */ - if (zp->io_type == ZIO_TYPE_READ) { - zonep->zone_zfs_rwstats.reads++; - zonep->zone_zfs_rwstats.nread += zp->io_size; - - zonep->zone_zfs_stats->zz_waittime.value.ui64 += - zp->io_dispatched - zp->io_timestamp; - - kstat_runq_exit(&zonep->zone_zfs_rwstats); - } else { - zonep->zone_zfs_rwstats.writes++; - zonep->zone_zfs_rwstats.nwritten += zp->io_size; + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + /* + * To calculate the wsvc_t average, keep a cumulative sum of + * all the wait time before each I/O was dispatched. 
Since most + * writes are asynchronous, only track the wait time for + * read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + iop->zpers_zfs_rwstats.reads++; + iop->zpers_zfs_rwstats.nread += zp->io_size; + iop->zpers_zfs_rd_waittime += + zp->io_dispatched - zp->io_timestamp; + kstat_runq_exit(&iop->zpers_zfs_rwstats); + } else { + iop->zpers_zfs_rwstats.writes++; + iop->zpers_zfs_rwstats.nwritten += zp->io_size; + } } - - mutex_exit(&zonep->zone_zfs_lock); + mutex_exit(&zpd->zpers_zfs_lock); mutex_enter(&zfs_disk_lock); zfs_disk_rcnt--; @@ -1201,14 +1282,10 @@ zfs_zone_zio_done(zio_t *zp) mutex_exit(&zfs_disk_lock); if (zfs_zone_delay_enable) { - mutex_enter(&zonep->zone_stg_io_lock); - add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); - mutex_exit(&zonep->zone_stg_io_lock); } - zone_rele(zonep); - /* * sdt:::zfs-zone-latency * @@ -1224,7 +1301,8 @@ void zfs_zone_zio_dequeue(zio_t *zp) { zio_priority_t p; - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) @@ -1233,24 +1311,25 @@ zfs_zone_zio_dequeue(zio_t *zp) /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_stg_io_lock); - ASSERT(zonep->zone_zfs_queued[p] > 0); - if (zonep->zone_zfs_queued[p] == 0) - cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); - else - zonep->zone_zfs_queued[p]--; - mutex_exit(&zonep->zone_stg_io_lock); - zone_rele(zonep); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + ASSERT(iop->zpers_zfs_queued[p] > 0); + if (iop->zpers_zfs_queued[p] == 0) { + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + } else { + iop->zpers_zfs_queued[p]--; + } + } + mutex_exit(&zpd->zpers_zfs_lock); } void zfs_zone_zio_enqueue(zio_t *zp) { zio_priority_t p; - zone_t *zonep; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; p = zp->io_priority; if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) @@ -1259,13 +1338,12 @@ zfs_zone_zio_enqueue(zio_t *zp) /* We depend on p being defined as either 0 or 1 */ ASSERT(p < 2); - if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) - return; - - mutex_enter(&zonep->zone_stg_io_lock); - zonep->zone_zfs_queued[p]++; - mutex_exit(&zonep->zone_stg_io_lock); - zone_rele(zonep); + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + iop->zpers_zfs_queued[p]++; + } + mutex_exit(&zpd->zpers_zfs_lock); } /* diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index c6e54a75c1..f5ee76a2cb 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1249,7 +1249,7 @@ checkpage(struct page *pp, int whichhand) ASSERT(pp->p_zoneid == ALL_ZONES || pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); if (pp->p_zoneid == ALL_ZONES || - zone_pcap_data[pp->p_zoneid].zpcap_over == 0) { + zone_pdata[pp->p_zoneid].zpers_over == 0) { /* * Cross-zone shared page, or zone not over it's cap. * Leave the page alone. 
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 843adc1ee0..06a8549c5b 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent Inc. + * Copyright 2018, Joyent Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -313,6 +313,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -429,11 +430,18 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, static const int ZONE_SYSCALL_API_VERSION = 7; /* - * "zone_pcap_data" is an array indexed by zoneid. Each member stores the zone's - * current page usage, its page limit, a flag indicating if the zone is - * over its physical memory cap and various statistics. The zpcap_over flag is - * the interface for the page scanner to use when reclaiming pages for zones - * that are over their cap. + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. * * All zone physical memory cap data is stored in this array instead of within * the zone structure itself. This is because zone structures come and go, but @@ -448,33 +456,40 @@ static const int ZONE_SYSCALL_API_VERSION = 7; * page scanning. * * The page scanner can run when "zone_num_over_cap" is non-zero. It can - * do a direct lookup of a zoneid into the "zone_pcap_data" array to determine + * do a direct lookup of a zoneid into the "zone_pdata" array to determine * if that zone is over its cap. * * There is no locking for the page scanner to perform these two checks. * We cannot have the page scanner blocking normal paging activity for * running processes. Because the physical memory cap is a soft cap, it is * fine for the scanner to simply read the current state of the counter and - * the zone's zpcap_over entry in the array. The scanner should never modify + * the zone's zpers_over entry in the array. The scanner should never modify * either of these items. Internally the entries and the counter are managed * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We * take care to ensure that we only take the zone_physcap_lock mutex when a * zone is transitioning over/under its physical memory cap. * * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage - * the "zone_pcap_data" array and associated counter. + * the "zone_pdata" array and associated counter. * - * The zone_pcap_t structure tracks the zone's physical cap and phyiscal usage - * in terms of pages. These values are currently defined as uint32. Thus, the - * maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) since - * UINT_MAX means the zone's RSS is unlimited. 
Assuming a 4k page size, a + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. * In the future we may need to expand these counters to 64-bit, but for now * we're using 32-bit to conserve memory, since this array is statically * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. */ uint_t zone_num_over_cap; -zone_pcap_t zone_pcap_data[MAX_ZONES]; +zone_persist_t zone_pdata[MAX_ZONES]; static kmutex_t zone_physcap_lock; /* @@ -1509,8 +1524,16 @@ static rctl_ops_t zone_cpu_burst_time_ops = { static rctl_qty_t zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) { + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + ASSERT(MUTEX_HELD(&p->p_lock)); - return (p->p_zone->zone_zfs_io_pri); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); } /*ARGSUSED*/ @@ -1519,6 +1542,7 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) { zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(e->rcep_t == RCENTITY_ZONE); @@ -1529,7 +1553,11 @@ zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, /* * set priority to the new value. 
*/ - zone->zone_zfs_io_pri = nv; + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); return (0); } @@ -1871,10 +1899,10 @@ static rctl_qty_t zone_phys_mem_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; - zone_pcap_t *zp = &zone_pcap_data[p->p_zone->zone_id]; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; ASSERT(MUTEX_HELD(&p->p_lock)); - q = ptob(zp->zpcap_pg_cnt); + q = ptob(zp->zpers_pg_cnt); return (q); } @@ -1906,7 +1934,7 @@ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, pg_val = (uint_t)pages; } } - zone_pcap_data[zid].zpcap_pg_limit = pg_val; + zone_pdata[zid].zpers_pg_limit = pg_val; return (0); } @@ -2016,13 +2044,13 @@ zone_physmem_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_kstat_t *zk = ksp->ks_data; - zone_pcap_t *zp = &zone_pcap_data[zone->zone_id]; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; if (rw == KSTAT_WRITE) return (EACCES); - zk->zk_usage.value.ui64 = ptob(zp->zpcap_pg_cnt); - zk->zk_value.value.ui64 = ptob(zp->zpcap_pg_limit); + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); return (0); } @@ -2170,26 +2198,42 @@ zone_zfs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_zfs_kstat_t *zzp = ksp->ks_data; - kstat_io_t *kiop = &zone->zone_zfs_rwstats; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; if (rw == KSTAT_WRITE) return (EACCES); - /* - * Extract the ZFS statistics from the kstat_io_t structure used by - * kstat_runq_enter() and related functions. Since the I/O throttle - * counters are updated directly by the ZFS layer, there's no need to - * copy those statistics here. - * - * Note that kstat_runq_enter() and the related functions use - * gethrtime_unscaled(), so scale the time here. - */ - zzp->zz_nread.value.ui64 = kiop->nread; - zzp->zz_reads.value.ui64 = kiop->reads; - zzp->zz_rtime.value.ui64 = kiop->rtime; - zzp->zz_rlentime.value.ui64 = kiop->rlentime; - zzp->zz_nwritten.value.ui64 = kiop->nwritten; - zzp->zz_writes.value.ui64 = kiop->writes; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); @@ -2240,23 +2284,23 @@ zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; zone_mcap_kstat_t *zmp = ksp->ks_data; - zone_pcap_t *zp; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); - zp = &zone_pcap_data[zone->zone_id]; + zp = &zone_pdata[zone->zone_id]; - zmp->zm_rss.value.ui64 = ptob(zp->zpcap_pg_cnt); - zmp->zm_phys_cap.value.ui64 = ptob(zp->zpcap_pg_limit); + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); zmp->zm_swap.value.ui64 = zone->zone_max_swap; zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; - zmp->zm_nover.value.ui64 = zp->zpcap_nover; + zmp->zm_nover.value.ui64 = zp->zpers_nover; #ifndef DEBUG - zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_out); + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); #else - zmp->zm_pagedout.value.ui64 = ptob(zp->zpcap_pg_fsdirty + - zp->zpcap_pg_fs + zp->zpcap_pg_anon + zp->zpcap_pg_anondirty); + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); #endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; @@ -2523,11 +2567,13 @@ zone_zsd_init(void) zone0.zone_swapresv_kstat = NULL; zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2839,7 +2885,7 @@ zone_free(zone_t *zone) cpucaps_zone_remove(zone); /* Clear physical memory capping data. */ - bzero(&zone_pcap_data[zone->zone_id], sizeof (zone_pcap_t)); + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); ASSERT(zone->zone_cpucap == NULL); @@ -5090,7 +5136,10 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_lockedmem_kstat = NULL; zone->zone_swapresv_kstat = NULL; zone->zone_physmem_kstat = NULL; - zone->zone_zfs_io_pri = 1; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -5101,8 +5150,8 @@ zone_create(const char *zone_name, const char *zone_root, * Ensure page count is 0 (in case zoneid has wrapped). * Initialize physical memory cap as unlimited. 
*/ - zone_pcap_data[zoneid].zpcap_pg_cnt = 0; - zone_pcap_data[zoneid].zpcap_pg_limit = UINT32_MAX; + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); @@ -5741,6 +5790,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5774,6 +5824,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -8075,18 +8131,18 @@ done: static void zone_incr_capped(zoneid_t zid) { - zone_pcap_t *zp = &zone_pcap_data[zid]; + zone_persist_t *zp = &zone_pdata[zid]; /* See if over (unlimited is UINT32_MAX), or already marked that way. */ - if (zp->zpcap_pg_cnt <= zp->zpcap_pg_limit || zp->zpcap_over == 1) { + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { return; } mutex_enter(&zone_physcap_lock); /* Recheck setting under mutex */ - if (zp->zpcap_pg_cnt > zp->zpcap_pg_limit && zp->zpcap_over == 0) { - zp->zpcap_over = 1; - zp->zpcap_nover++; + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; zone_num_over_cap++; DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); } @@ -8114,29 +8170,29 @@ zone_incr_capped(zoneid_t zid) static void zone_decr_capped(zoneid_t zid) { - zone_pcap_t *zp = &zone_pcap_data[zid]; + zone_persist_t *zp = &zone_pdata[zid]; uint32_t adjusted_limit; /* * See if under, or already marked that way. There is no need to - * check for an unlimited cap (zpcap_pg_limit == UINT32_MAX) - * since we'll never set zpcap_over in zone_incr_capped(). + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). */ - if (zp->zpcap_over == 0 || zp->zpcap_pg_cnt >= zp->zpcap_pg_limit) { + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { return; } - adjusted_limit = zp->zpcap_pg_limit - (zp->zpcap_pg_limit >> 7); + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); /* Recheck, accounting for our hysteresis. */ - if (zp->zpcap_pg_cnt >= adjusted_limit) { + if (zp->zpers_pg_cnt >= adjusted_limit) { return; } mutex_enter(&zone_physcap_lock); /* Recheck under mutex. */ - if (zp->zpcap_pg_cnt < adjusted_limit && zp->zpcap_over == 1) { - zp->zpcap_over = 0; + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; ASSERT(zone_num_over_cap > 0); zone_num_over_cap--; DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); @@ -8154,7 +8210,7 @@ void zone_add_page(page_t *pp) { uint_t pcnt; - zone_pcap_t *zp; + zone_persist_t *zp; zoneid_t zid; /* Skip pages in segkmem, etc. (KV_KVP, ...) */ @@ -8179,9 +8235,9 @@ zone_add_page(page_t *pp) if (pp->p_share == 0) { /* First mapping to this page. 
*/ pp->p_zoneid = zid; - zp = &zone_pcap_data[zid]; - ASSERT(zp->zpcap_pg_cnt + pcnt < UINT32_MAX); - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, pcnt); + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); zone_incr_capped(zid); return; } @@ -8194,10 +8250,10 @@ zone_add_page(page_t *pp) zid = pp->p_zoneid; pp->p_zoneid = ALL_ZONES; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; - if (zp->zpcap_pg_cnt > 0) { - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); } zone_decr_capped(zid); } @@ -8207,7 +8263,7 @@ void zone_rm_page(page_t *pp) { uint_t pcnt; - zone_pcap_t *zp; + zone_persist_t *zp; zoneid_t zid; /* Skip pages in segkmem, etc. (KV_KVP, ...) */ @@ -8227,9 +8283,9 @@ zone_rm_page(page_t *pp) } ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; - if (zp->zpcap_pg_cnt > 0) { - atomic_add_32((uint32_t *)&zp->zpcap_pg_cnt, -pcnt); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); } zone_decr_capped(zid); pp->p_zoneid = ALL_ZONES; @@ -8238,29 +8294,29 @@ zone_rm_page(page_t *pp) void zone_pageout_stat(int zid, zone_pageout_op_t op) { - zone_pcap_t *zp; + zone_persist_t *zp; if (zid == ALL_ZONES) return; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; #ifndef DEBUG - atomic_add_64(&zp->zpcap_pg_out, 1); + atomic_add_64(&zp->zpers_pg_out, 1); #else switch (op) { case ZPO_DIRTY: - atomic_add_64(&zp->zpcap_pg_fsdirty, 1); + atomic_add_64(&zp->zpers_pg_fsdirty, 1); break; case ZPO_FS: - atomic_add_64(&zp->zpcap_pg_fs, 1); + atomic_add_64(&zp->zpers_pg_fs, 1); break; case ZPO_ANON: - atomic_add_64(&zp->zpcap_pg_anon, 1); + atomic_add_64(&zp->zpers_pg_anon, 1); break; case ZPO_ANONDIRTY: - atomic_add_64(&zp->zpcap_pg_anondirty, 1); + atomic_add_64(&zp->zpers_pg_anondirty, 1); break; default: cmn_err(CE_PANIC, "Invalid pageout operator %d", op); @@ -8275,23 +8331,23 @@ zone_pageout_stat(int zid, zone_pageout_op_t op) void zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) { - zone_pcap_t *zp; + zone_persist_t *zp; ASSERT(zid >= 0 && zid <= MAX_ZONEID); - zp = &zone_pcap_data[zid]; + zp = &zone_pdata[zid]; /* * If memory or swap limits are set on the zone, use those, otherwise * use the system values. physmem and freemem are also in pages. */ - if (zp->zpcap_pg_limit == UINT32_MAX) { + if (zp->zpers_pg_limit == UINT32_MAX) { *memcap = physmem; *free = freemem; } else { int64_t freemem; - *memcap = (pgcnt_t)zp->zpcap_pg_limit; - freemem = zp->zpcap_pg_limit - zp->zpcap_pg_cnt; + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; if (freemem > 0) { *free = (pgcnt_t)freemem; } else { diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 1cca1e7555..87253134fd 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -22,7 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2017, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ #ifndef _SYS_ZONE_H @@ -624,20 +624,6 @@ typedef struct zone { struct cpucap *zone_cpucap; /* CPU caps data */ /* - * Data and counters used for ZFS fair-share disk IO. 
- */ - rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ - uint_t zone_zfs_queued[2]; /* sync I/O enqueued count */ - uint64_t zone_zfs_weight; /* used to prevent starvation */ - uint64_t zone_io_util; /* IO utilization metric */ - boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ - uint16_t zone_io_delay; /* IO delay on logical r/w */ - kmutex_t zone_stg_io_lock; /* protects IO window data */ - sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ - sys_zio_cntr_t zone_wr_ops; /* writes and */ - sys_zio_cntr_t zone_lwr_ops; /* logical writes. */ - - /* * kstats and counters for VFS ops and bytes. */ kmutex_t zone_vfs_lock; /* protects VFS statistics */ @@ -650,7 +636,6 @@ typedef struct zone { */ kmutex_t zone_zfs_lock; /* protects ZFS statistics */ kstat_t *zone_zfs_ksp; - kstat_io_t zone_zfs_rwstats; zone_zfs_kstat_t *zone_zfs_stats; /* @@ -738,25 +723,48 @@ typedef struct zone { kmutex_t zone_mount_lock; } zone_t; -/* zpcap_over is treated as a boolean but is 32 bits for alignment. */ -typedef struct zone_pcap { - uint32_t zpcap_over; /* currently over cap */ - uint32_t zpcap_pg_cnt; /* current RSS in pages */ - uint32_t zpcap_pg_limit; /* current RRS limit in pages */ - uint32_t zpcap_nover; /* # of times over phys. cap */ +/* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { + uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */ + uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */ + sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zpers_wr_ops; /* writes, and */ + sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */ + kstat_io_t zpers_zfs_rwstats; + uint64_t zpers_io_util; /* IO utilization metric */ + uint64_t zpers_zfs_rd_waittime; + uint8_t zpers_io_delay; /* IO delay on logical r/w */ + uint8_t zpers_zfs_weight; /* used to prevent starvation */ + uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed idependently of the zone_t. + */ +typedef struct zone_persist { + kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */ + zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */ + uint8_t zpers_over; /* currently over cap */ + uint32_t zpers_pg_cnt; /* current RSS in pages */ + uint32_t zpers_pg_limit; /* current RRS limit in pages */ + uint32_t zpers_nover; /* # of times over phys. cap */ #ifndef DEBUG - uint64_t zpcap_pg_out; /* # pages flushed */ + uint64_t zpers_pg_out; /* # pages flushed */ #else /* - * To conserve memory, detailed pageout stats are only kept for DEBUG + * To conserve memory, some detailed kstats are only kept for DEBUG * builds. 
*/ - uint64_t zpcap_pg_anon; /* # clean anon pages flushed */ - uint64_t zpcap_pg_anondirty; /* # dirty anon pages flushed */ - uint64_t zpcap_pg_fs; /* # clean fs pages flushed */ - uint64_t zpcap_pg_fsdirty; /* # dirty fs pages flushed */ + uint64_t zpers_zfs_rd_waittime; + + uint64_t zpers_pg_anon; /* # clean anon pages flushed */ + uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpers_pg_fs; /* # clean fs pages flushed */ + uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */ #endif -} zone_pcap_t; +} zone_persist_t; typedef enum zone_pageout_op { ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY @@ -994,7 +1002,7 @@ extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); /* Interfaces for page scanning */ extern uint_t zone_num_over_cap; -extern zone_pcap_t zone_pcap_data[MAX_ZONES]; +extern zone_persist_t zone_pdata[MAX_ZONES]; extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 10017d27ef..01c2666e91 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2017, Joyent, Inc. + * Copyright 2018, Joyent, Inc. */ /* @@ -1691,7 +1691,7 @@ vmu_get_zone_rss(zoneid_t zid) } ASSERT(zid >= 0 && zid <= MAX_ZONEID); - pgcnt = zone_pcap_data[zid].zpcap_pg_cnt; + pgcnt = zone_pdata[zid].zpers_pg_cnt; zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt); zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap; @@ -1728,7 +1728,7 @@ vmu_calculate() int i; for (i = 0; i <= MAX_ZONEID; i++) { - if (zone_pcap_data[i].zpcap_pg_cnt > 0) { + if (zone_pdata[i].zpers_pg_cnt > 0) { vmu_get_zone_rss(i); } } |