Diffstat (limited to 'usr/src/uts/common/fs/zfs')
22 files changed, 1615 insertions, 85 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index e039b9cac4..98aad58025 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -125,6 +126,7 @@ #include <sys/refcount.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -2017,6 +2019,16 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + if (availrmem <= pages_pp_maximum) + return (1); + #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -2917,6 +2929,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. + */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -3484,9 +3504,6 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 7a0abd22b5..16e42b951a 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2703,7 +2703,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 39234eba53..743f5c4656 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011, Joyent, Inc. All rights reserved. 
*/ #include <sys/dmu.h> @@ -950,6 +951,7 @@ xuio_stat_wbuf_nocopy() } #ifdef _KERNEL + int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { @@ -1562,7 +1564,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (wp & WP_NOFILL) { ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; + checksum = ZIO_CHECKSUM_NOPARITY; compress = ZIO_COMPRESS_OFF; dedup = B_FALSE; } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index b4579e278c..2301942907 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 5ef7f54af1..aeeefd178e 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/dmu_objset.h> @@ -4133,9 +4134,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, dsl_dataset_t *snap; uint64_t used, comp, uncomp; - err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); - if (err != 0) - break; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { @@ -4164,7 +4169,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, * was not a snapshot of/before new. 
*/ snapobj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); + if (snap != new) + dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 1cd49c8274..b6af7598e2 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -36,6 +36,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); @@ -839,7 +840,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); + txg_delay(dd->dd_pool, tx->tx_txg, + zfs_zone_txg_delay()); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 418a04c7c2..316b37cebd 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -40,6 +40,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -529,11 +530,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) /* * If this transaction group is over 7/8ths capacity, delay - * the caller 1 clock tick. This will slow down the "fill" - * rate until the sync process can catch up with us. + * the caller some number of clock ticks. This will slow down the + * "fill" rate until the sync process can catch up with us. */ if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); + txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay()); return (0); } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h index b748571ea0..ffca0a7dcb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VDEV_DISK_H #define _SYS_VDEV_DISK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/vdev.h> #ifdef _KERNEL #include <sys/buf.h> @@ -40,14 +39,22 @@ extern "C" { #endif +#ifdef _KERNEL typedef struct vdev_disk { ddi_devid_t vd_devid; char *vd_minor; ldi_handle_t vd_lh; } vdev_disk_t; +#endif +extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int); + +/* + * Since vdev_disk.c is not compiled into libzpool, this function should only be + * defined in the zfs kernel module. 
+ */ #ifdef _KERNEL -extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); #endif #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 1df61a587d..c297ae165c 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -102,6 +102,7 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + zoneid_t vq_last_zone_id; kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..496b718bd6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/vdev.h> +#include <sys/semaphore.h> +#include <sys/buf.h> +#ifdef _KERNEL +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int vdev_raidz_physio(vdev_t *, + caddr_t, size_t, uint64_t, uint64_t, boolean_t); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..069ec004f3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern int zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index f6cf3f5349..032b77715f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -24,6 +24,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _ZIO_H @@ -79,6 +80,8 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_SHA256_MAC, + ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_FUNCTIONS }; @@ -421,6 +424,9 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ + hrtime_t io_start; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 55b1f3884b..2269ef271e 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -30,6 +30,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * Pool-wide transaction groups. @@ -411,6 +412,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index d7417736b4..f78580d0f1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -20,9 +20,11 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -325,8 +327,18 @@ vdev_disk_close(vdev_t *vd) } int -vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, - uint64_t offset, int flags) +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; @@ -479,6 +491,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); @@ -490,6 +504,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an @@ -574,7 +590,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, l, 0); - if (vdev_disk_physio(vd_lh, (caddr_t)label, + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 5a0d3ee970..4ea958a9f6 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,12 +21,14 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> +#include <sys/zfs_zone.h> /* * These tunables are for performance analysis. @@ -120,6 +122,8 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + vq->vq_last_zone_id = 0; } void @@ -139,6 +143,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { avl_add(&vq->vq_deadline_tree, zio); + zfs_zone_zio_enqueue(zio); avl_add(zio->io_vdev_tree, zio); } @@ -146,6 +151,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { avl_remove(&vq->vq_deadline_tree, zio); + zfs_zone_zio_dequeue(zio); avl_remove(zio->io_vdev_tree, zio); } @@ -188,7 +194,11 @@ again: avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); +#ifdef _KERNEL + fio = lio = zfs_zone_schedule(vq); +#else fio = lio = avl_first(&vq->vq_deadline_tree); +#endif t = fio->io_vdev_tree; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 4b0f5602c1..6094e01876 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -21,11 +21,15 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_file.h> +#include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> @@ -152,6 +156,8 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. */ @@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + uint64_t b = offset >> unit_shift; + uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; @@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + @@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +/* + * Handle a read or write request to a RAID-Z dump device. + * + * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), reads and writes + * to the dump zvol are written across a full 128Kb block. As a result, an + * individual I/O may not span all columns in the RAID-Z map; moreover, a small + * I/O may only span a single column. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? B_READ : B_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); + + /* + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128k + * size. + */ + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, this is, the offset of the extent which + * contains the requisite offset of the data being read or written. 
+ */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z matrix, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data being read or written spans this + * column, issue the appropriate operation to the child vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed. + */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 929fc06296..baffc223a3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -23,6 +23,7 @@ * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -1757,7 +1758,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -1775,7 +1777,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -1802,13 +1805,25 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os = NULL; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) return (error); - error = zfs_ioc_objset_stats_impl(zc, os); + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); return (error); @@ -2022,8 +2037,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + top: if (zc->zc_cookie == 0) (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, @@ -2072,8 +2100,10 @@ top: objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -2789,6 +2819,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -2832,8 +2863,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -2842,13 +2874,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git 
a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 0c39274caf..9fae31fa6b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -25,6 +25,10 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -4145,6 +4149,8 @@ top: &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4655,27 +4661,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, return (0); } -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. - */ /* ARGSUSED */ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, @@ -4687,10 +4672,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..08f4f38e04 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. 
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+int
+zfs_zone_txg_delay()
+{
+	return (1);
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once per cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE;	/* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5;		/* amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100;		/* longest possible delay */
+
+hrtime_t zfs_zone_last_checked = 0;
+
+boolean_t zfs_zone_priority_enable = B_TRUE;	/* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the threshold
+ * at which the throttle will start delaying zones. The calculation should
+ * correspond closely with the %b column from iostat.
+ */
+uint_t zfs_zone_util_threshold = 80;
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
+uint_t zfs_zone_cycle_time = 2000000;		/* 2 s */
+
+uint_t zfs_zone_adjust_time = 250000;		/* 250 ms */
+
+typedef struct {
+	hrtime_t cycle_start;
+	int cycle_cnt;
+	hrtime_t cycle_lat;
+	hrtime_t sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+	hrtime_t zi_now;
+	uint_t zi_avgrlat;
+	uint_t zi_avgwlat;
+	uint64_t zi_totpri;
+	uint64_t zi_totutil;
+	int zi_active;
+	uint_t zi_diskutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization.
+ */
+kmutex_t zfs_disk_lock;
+uint_t zfs_disk_rcnt;
+hrtime_t zfs_disk_rtime = 0;
+hrtime_t zfs_disk_rlastupdate = 0;
+
+hrtime_t zfs_disk_last_rtime = 0;
+
+/*
+ * Data used to keep track of how often txg flush is running.
+ */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_flush_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the + * number of I/Os that can be pending on a device. If there are more than a + * few ops already queued up, beyond those already issued to the vdev, then + * use scheduling to get the next zio. + */ +int zfs_zone_schedule_thresh = 5; + +/* + * Tunables for delay throttling when TxG flush is occurring. + */ +int zfs_zone_txg_throttle_scale = 2; +int zfs_zone_txg_delay_ticks = 2; + +typedef struct { + int zq_qdepth; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new historical count. + * If we're still within an active cycle there is nothing to do, return false. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. 
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+		zonep->zone_rd_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+		zonep->zone_wr_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_LOGICAL_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+		zonep->zone_lwr_ops.cycle_cnt++;
+		break;
+	}
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static int
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	hrtime_t delta;
+	int gen_cnt;
+
+	/*
+	 * Check if it's time to recompute a new average.
+	 * If we're still collecting data for the current cycle, return false.
+	 */
+	delta = unow - cp->cycle_start;
+	if (delta < zfs_zone_sys_avg_cycle)
+		return (0);
+
+	/* A previous cycle is past, compute a new system average. */
+
+	/*
+	 * Figure out how many generations we have to decay, since multiple
+	 * cycles may have elapsed since our last IO.
+	 * We count on int rounding here.
+	 */
+	gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+	/* If more than 5 cycles since the last IO, reset the average. */
+	if (gen_cnt > 5) {
+		cp->sys_avg_lat = 0;
+	} else {
+		/* Update the average. */
+		int i;
+
+		cp->sys_avg_lat =
+		    (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+		/*
+		 * If more than one generation has elapsed since the last
+		 * update, decay the values further.
+		 */
+		for (i = 1; i < gen_cnt; i++)
+			cp->sys_avg_lat = cp->sys_avg_lat / 2;
+	}
+
+	/* A new cycle begins. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+	cp->cycle_lat = 0;
+
+	return (1);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_new_sys_avg(unow, &rd_lat);
+		rd_lat.cycle_cnt++;
+		rd_lat.cycle_lat += lat;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_new_sys_avg(unow, &wr_lat);
+		wr_lat.cycle_cnt++;
+		wr_lat.cycle_lat += lat;
+		break;
+	}
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t delta;
+	uint_t cnt;
+
+	if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+		/*
+		 * No activity in the current cycle, we already have the
+		 * historical data so we'll use that.
+		 */
+		cnt = cp->zone_avg_cnt;
+	} else {
+		/*
+		 * If we're less than half way through the cycle then use
+		 * the current count plus half the historical count, otherwise
+		 * just use the current count.
+		 */
+		if (delta < (zfs_zone_cycle_time / 2))
+			cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+		else
+			cnt = cp->cycle_cnt;
+	}
+
+	return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t, + uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__calc__wt__avg( + (uintptr_t)cp->sys_avg_lat, + (uintptr_t)cp->cycle_lat, + (uintptr_t)cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id, + (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat), + (uintptr_t)*wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. 
+ */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + extern void __dtrace_probe_zfs__zone__utilization( + uint_t, uint_t, uint_t, uint_t, uint_t, uint_t); + + __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id), + (uint_t)rops, (uint_t)wops, (uint_t)lwops, + (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. + */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (zonep->zone_io_util > fairutil && + sp->zi_diskutil > zfs_zone_util_threshold) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + extern void __dtrace_probe_zfs__zone__throttle( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__throttle( + (uintptr_t)zonep->zone_id, (uintptr_t)delay, + (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil, + (uintptr_t)zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. 
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow) +{ + zoneio_stats_t stats; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - zfs_zone_last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + extern void __dtrace_probe_zfs__zone__stats( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat), + (uintptr_t)(stats.zi_avgwlat), + (uintptr_t)(stats.zi_active), + (uintptr_t)(stats.zi_totutil), + (uintptr_t)(stats.zi_totpri), + (uintptr_t)(stats.zi_diskutil)); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + zone_q_bump_t *qbp = arg; + + extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id), + (uintptr_t)(zonep->zone_zfs_queued)); + + if (zonep->zone_zfs_queued == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < 20) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * schedule it next. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. 
+ * + * For single-threaded synchronous workloads a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple workloads + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel workloads (and ops in the queue) vs. other zones which + * are doing simple single-threaded workloads, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_t *vq, int qdepth) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(&vq->vq_deadline_tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL; + zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__sched__bump( + (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt), + (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt)); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out RX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track IO operations per zone. Called from dmu_tx_count_write for write ops + * and dmu_read_uio for read ops. For each operation, increment that zone's + * counter based on the type of operation. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. 
Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice because this is called + * at a high level by a zone thread, but we also will count the phys. writes + * that are performed at a low level via zfs_zone_zio_start. + * + * Without this, it can look like a non-global zone never writes (case 1). + * Depending on when the TXG is flushed, the counts may be in the same sample + * bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through dmu_read_uio. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counters for logical operations here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + zfs_zone_wait_adjust(unow); + zfs_zone_last_checked = unow; + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TxG + * flushing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_flush_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + extern void __dtrace_probe_zfs__zone__wait( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id), + (uintptr_t)type, (uintptr_t)wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TxG flush rate is running above the expected rate. + * If so, this implies that we are filling TxG's at a high rate due to a heavy + * write workload. 
We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the flush rate is going to be 1. When there + * is a heavy write load, TxGs fill up fast and the sync thread will write + * the TxG more frequently (perhaps once a second). In this case the rate + * will be > 1. The flush rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_flush_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_flush_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +int +zfs_zone_txg_delay() +{ + zone_t *zonep = curzone; + int delay = 1; + + if (zonep->zone_io_util_above_avg) + delay = zfs_zone_txg_delay_ticks; + + extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id), + (uintptr_t)delay); + + return (delay); +} + +/* + * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations for the purposes + * of throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the average IO latency for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os.
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_start; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + extern void __dtrace_probe_zfs__zone__latency( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid), + (uintptr_t)(zp->io_type), (uintptr_t)(udelta)); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued > 0); + if (zonep->zone_zfs_queued == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. This function is where zio's are found + * at the head of the queue (by avl_first), then pulled off (by + * vdev_queue_io_remove) and issued. We do our scheduling here to find the + * next zio to issue. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq) +{ + int cnt; + zoneid_t last_zone; + zio_t *zp; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + cnt = avl_numnodes(&vq->vq_deadline_tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few ops in the queue then just issue the head. + * If there are more than a few ops already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zp = avl_first(&vq->vq_deadline_tree); + else + zp = get_next_zio(vq, cnt); + + vq->vq_last_zone_id = zp->io_zoneid; + + /* + * Probe with 3 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, and the zone that was associated + * with the next IO that is scheduled. 
+ */ + extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt), + (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid)); + + return (zp); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 64e9acbae1..89c88bc181 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -36,6 +36,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -501,6 +502,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); + zio->io_start = gethrtime(); + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -552,11 +555,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -894,6 +900,8 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + zio->io_start = gethrtime(); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { @@ -2279,6 +2287,9 @@ zio_vdev_io_start(zio_t *zio) ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_start = gethrtime(); + if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index c8fe20f2eb..c7dd90c45d 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -77,6 +78,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256_mac"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"}, }; enum zio_checksum diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index df9a16bccb..4dc63888fd 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -76,9 +76,11 @@ #include <sys/zfs_rlock.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include "zfs_namecheck.h" @@ -1059,27 +1061,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, } static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, + uint64_t size, boolean_t doread, boolean_t isdump) { vdev_disk_t *dvd; int c; int numerrors = 0; - for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; + if (vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) { + for (c = 0; c < vd->vdev_children; c++) { + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, origoffset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } } } - if (!vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) return (numerrors < vd->vdev_children ? 0 : EIO); if (doread && !vdev_readable(vd)) @@ -1087,19 +1090,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, else if (!doread && !vdev_writeable(vd)) return (EIO); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); + if (vd->vdev_ops == &vdev_raidz_ops) { + return (vdev_raidz_physio(vd, + addr, size, offset, origoffset, doread)); + } + offset += VDEV_LABEL_START_SIZE; if (ddi_in_panic() || isdump) { ASSERT(!doread); if (doread) return (EIO); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), lbtodb(size))); } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - doread ? B_READ : B_WRITE)); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + + return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, + offset, doread ? 
B_READ : B_WRITE)); } } @@ -1131,7 +1142,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); + error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), + size, doread, isdump); if (!ddi_in_panic()) spa_config_exit(spa, SCL_STATE, FTAG); @@ -1322,6 +1334,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1340,6 +1354,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1369,6 +1387,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1399,6 +1419,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + return (error); } @@ -1852,7 +1876,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) ZIO_COMPRESS_OFF) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), - ZIO_CHECKSUM_OFF) == 0); + ZIO_CHECKSUM_NOPARITY) == 0); if (version >= SPA_VERSION_DEDUP) { VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), |

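The comment at the top of the zfs_zone.c hunk above describes the scheduling decision: once the vdev queue gets deep, find the zone whose computed priority is highest and bump that zone's first zio to the head of the queue. The stand-alone C sketch below is editorial illustration only and is not part of the patch; it models that decision in user space. The model_zone_t table, pick_zone_to_bump(), and the scoring formula are assumptions made for the example, since the in-kernel callback (get_sched_pri_cb) is not shown in this diff and the real criterion may differ.

#include <stdio.h>

typedef struct model_zone {
	int	mz_zoneid;	/* zone ID */
	int	mz_pri;		/* configured I/O priority (higher is better) */
	int	mz_queued;	/* ops this zone currently has in the queue */
} model_zone_t;

/*
 * Pick the zone that is most under-served: highest configured priority
 * relative to the share of the queue it already occupies.  The queue depth
 * is scaled by 10, as in get_next_zio(), to avoid integer-rounding problems.
 * Returns 0 (no bump) if no zone has anything queued.
 */
static int
pick_zone_to_bump(model_zone_t *zones, int nzones, int qdepth)
{
	int i, best_id = 0, best_score = 0;
	int scaled_depth = qdepth * 10;

	for (i = 0; i < nzones; i++) {
		int share, score;

		if (zones[i].mz_queued == 0)
			continue;	/* nothing to bump for this zone */

		/* portion (percent) of the scaled queue this zone occupies */
		share = (zones[i].mz_queued * 10 * 100) / scaled_depth;

		/* higher priority and smaller share give a higher score */
		score = (zones[i].mz_pri * 100) / (share + 1);

		if (score > best_score) {
			best_score = score;
			best_id = zones[i].mz_zoneid;
		}
	}
	return (best_id);
}

int
main(void)
{
	/* a batch zone with a deep backlog vs. an interactive zone */
	model_zone_t zones[] = {
		{ 1, 100, 45 },		/* zone 1: 45 ops queued */
		{ 2, 100, 1 },		/* zone 2: 1 op queued */
	};

	printf("bump zone %d\n", pick_zone_to_bump(zones, 2, 46));
	return (0);
}

Run as written, the sketch picks zone 2: the interactive zone with a single queued op is bumped ahead of the zone with the deep backlog, which is exactly the imbalance the comment block describes.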
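In the same spirit, zfs_zone_io_throttle() above holds a logical write for longer when TxGs are filling faster than the normal zfs_txg_timeout cadence. The following sketch, again illustrative only, isolates that wait computation; compute_wait() is a hypothetical helper, and only zone_io_delay, txg_cnt, txg_flush_rate, and zfs_zone_txg_throttle_scale correspond to names in the patch (the numeric values are made up).

#include <stdio.h>

int zfs_zone_txg_throttle_scale = 2;	/* tunable, as in the patch */

/*
 * Compute how long (in microseconds) to delay the current operation.
 * The kernel code applies the result with drv_usecwait().
 */
static unsigned int
compute_wait(unsigned int zone_io_delay, int is_logical_write,
    int txg_cnt, int txg_flush_rate)
{
	unsigned int wait = zone_io_delay;

	/*
	 * If this is a write and TxGs are being flushed at above the
	 * normal rate, throttle for longer than normal.
	 */
	if (is_logical_write && (txg_cnt > 1 || txg_flush_rate > 1))
		wait *= zfs_zone_txg_throttle_scale;

	return (wait);
}

int
main(void)
{
	printf("%u us\n", compute_wait(50, 1, 3, 1));	/* heavy TxG flushing */
	printf("%u us\n", compute_wait(50, 1, 1, 1));	/* normal load */
	return (0);
}

With a 50 us base delay the write is held for 100 us under TxG pressure and 50 us otherwise; a zone whose computed delay is zero is never held at all.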