Diffstat (limited to 'usr/src/uts/common/fs/zfs')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c              |   23
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c             |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c              |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c           |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c      |   14
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c          |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c         |    7
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_disk.h    |   13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h    |    1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_raidz.h   |   49
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_zone.h     |   62
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h          |    6
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c              |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c        |   22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c       |   10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c       |  131
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c        |   58
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c        |   31
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c         | 1179
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c              |   11
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c     |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c             |   62
22 files changed, 1615 insertions(+), 85 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index e039b9cac4..98aad58025 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -125,6 +126,7 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -2017,6 +2019,16 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
+ /*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ if (availrmem <= pages_pp_maximum)
+ return (1);
+
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -2917,6 +2929,14 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
+ /*
+ * At this point, this read I/O has already missed in the ARC
+ * and will be going through to the disk. The I/O throttle
+ * should delay this I/O if this zone is using more than its I/O
+ * priority allows.
+ */
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -3484,9 +3504,6 @@ arc_init(void)
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 7a0abd22b5..16e42b951a 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2703,7 +2703,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_copies);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 39234eba53..743f5c4656 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -950,6 +951,7 @@ xuio_stat_wbuf_nocopy()
}
#ifdef _KERNEL
+
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
@@ -1562,7 +1564,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (wp & WP_NOFILL) {
ASSERT(!ismd && level == 0);
- checksum = ZIO_CHECKSUM_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
compress = ZIO_COMPRESS_OFF;
dedup = B_FALSE;
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index b4579e278c..2301942907 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -39,11 +39,11 @@
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
+#include <sys/zfs_zone.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
-
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
@@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
if (len == 0)
return;
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
min_bs = SPA_MINBLOCKSHIFT;
max_bs = SPA_MAXBLOCKSHIFT;
min_ibs = DN_MIN_INDBLKSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index 5ef7f54af1..aeeefd178e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -4133,9 +4134,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
dsl_dataset_t *snap;
uint64_t used, comp, uncomp;
- err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
- if (err != 0)
- break;
+ if (snapobj == new->ds_object) {
+ snap = new;
+ } else {
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
if (snap->ds_phys->ds_prev_snap_txg ==
oldsnap->ds_phys->ds_creation_txg) {
@@ -4164,7 +4169,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
* was not a snapshot of/before new.
*/
snapobj = snap->ds_phys->ds_prev_snap_obj;
- dsl_dataset_rele(snap, FTAG);
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
err = EINVAL;
break;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 1cd49c8274..b6af7598e2 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -36,6 +36,7 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
@@ -839,7 +840,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
- txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ zfs_zone_txg_delay());
err = ERESTART;
}
dsl_pool_memory_pressure(dd->dd_pool);
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 418a04c7c2..316b37cebd 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -40,6 +40,7 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
@@ -529,11 +530,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
/*
* If this transaction group is over 7/8ths capacity, delay
- * the caller 1 clock tick. This will slow down the "fill"
- * rate until the sync process can catch up with us.
+ * the caller some number of clock ticks. This will slow down the
+ * "fill" rate until the sync process can catch up with us.
*/
if (reserved && reserved > (write_limit - (write_limit >> 3)))
- txg_delay(dp, tx->tx_txg, 1);
+ txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay());
return (0);
}
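
To make the capacity check above concrete, here is a minimal standalone sketch (illustrative numbers only; in the real code write_limit and reserved come from live pool state, and the delay is zfs_zone_txg_delay() ticks rather than a printf):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t write_limit = 512ULL << 20;	/* assume a 512MB limit */
		uint64_t reserved = 460ULL << 20;	/* assume 460MB reserved */

		/* write_limit - (write_limit >> 3) == 7/8 of write_limit */
		if (reserved > write_limit - (write_limit >> 3))
			printf("over 7/8 full: delay the writer\n");
		else
			printf("under 7/8 full: no delay\n");
		return (0);
	}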
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
index b748571ea0..ffca0a7dcb 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@@ -40,14 +39,22 @@
extern "C" {
#endif
+#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
} vdev_disk_t;
+#endif
+extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
#ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}
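
The split gives two entry points: vdev_disk_physio() for callers holding a vdev_t, and vdev_disk_ldi_physio() for callers that already hold a raw LDI handle (as in vdev_disk_read_rootlabel() below). A minimal sketch of the vdev_t-based call, assuming kernel context and a 512-byte sector size for illustration:

	#include <sys/vdev_disk.h>	/* vdev_disk_physio() declared above */
	#include <sys/buf.h>		/* B_READ */

	/*
	 * Sketch only: a caller holding a vdev_t no longer reaches into
	 * vdev_tsd for the LDI handle; vdev_disk_physio() does that itself.
	 */
	static int
	read_one_sector(vdev_t *vd, caddr_t buf, uint64_t off)
	{
		return (vdev_disk_physio(vd, buf, 512, off, B_READ));
	}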
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 1df61a587d..c297ae165c 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -102,6 +102,7 @@ struct vdev_queue {
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
avl_tree_t vq_pending_tree;
+ zoneid_t vq_last_zone_id;
kmutex_t vq_lock;
};
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 0000000000..496b718bd6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#include <sys/semaphore.h>
+#include <sys/buf.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, uint64_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..069ec004f3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZONE_H
+#define _SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_ZONE_IOP_READ = 0,
+ ZFS_ZONE_IOP_WRITE,
+ ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern int zfs_zone_txg_delay();
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index f6cf3f5349..032b77715f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _ZIO_H
@@ -79,6 +80,8 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_SHA256_MAC,
+ ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_FUNCTIONS
};
@@ -421,6 +424,9 @@ struct zio {
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
+ zoneid_t io_zoneid; /* zone which originated this I/O */
+ hrtime_t io_start; /* time I/O entered zio pipeline */
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
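
Together, io_start and io_dispatched let the throttle split a zio's latency into queue time and device service time. A hedged sketch of that accounting (the helper name is illustrative, not from the patch; the real done-side bookkeeping lives in zfs_zone_zio_done() in zfs_zone.c):

	#include <sys/zio.h>

	static void
	zio_latency_split(const zio_t *zp, hrtime_t *queue_ns,
	    hrtime_t *service_ns)
	{
		hrtime_t now = gethrtime();

		*queue_ns = zp->io_dispatched - zp->io_start;	/* in vdev queue */
		*service_ns = now - zp->io_dispatched;		/* on the device */
		/* both would be charged to the zone in zp->io_zoneid */
	}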
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 55b1f3884b..2269ef271e 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -30,6 +30,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
+#include <sys/zfs_zone.h>
/*
* Pool-wide transaction groups.
@@ -411,6 +412,8 @@ txg_sync_thread(dsl_pool_t *dp)
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
+ zfs_zone_report_txg_sync(dp);
+
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index d7417736b4..f78580d0f1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -20,9 +20,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
@@ -325,8 +327,18 @@ vdev_disk_close(vdev_t *vd)
}
int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
- uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
@@ -479,6 +491,8 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
+ zfs_zone_zio_start(zio);
+
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
@@ -490,6 +504,8 @@ vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
+ zfs_zone_zio_done(zio);
+
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
@@ -574,7 +590,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_physio(vd_lh, (caddr_t)label,
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5a0d3ee970..4ea958a9f6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,12 +21,14 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
+#include <sys/zfs_zone.h>
/*
* These tunables are for performance analysis.
@@ -120,6 +122,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ vq->vq_last_zone_id = 0;
}
void
@@ -139,6 +143,7 @@ static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
avl_add(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_enqueue(zio);
avl_add(zio->io_vdev_tree, zio);
}
@@ -146,6 +151,7 @@ static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
avl_remove(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_dequeue(zio);
avl_remove(zio->io_vdev_tree, zio);
}
@@ -188,7 +194,11 @@ again:
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
+#ifdef _KERNEL
+ fio = lio = zfs_zone_schedule(vq);
+#else
fio = lio = avl_first(&vq->vq_deadline_tree);
+#endif
t = fio->io_vdev_tree;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
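
In the kernel build, zfs_zone_schedule() replaces the plain avl_first() pick. A rough sketch of its shape, condensed from get_next_zio() and the zfs_zone_schedule_* tunables in zfs_zone.c below; this is not the literal implementation, which this diff truncates:

	#include <sys/vdev_impl.h>
	#include <sys/zfs_zone.h>

	zio_t *
	zfs_zone_schedule_sketch(vdev_queue_t *vq)
	{
		int qdepth = avl_numnodes(&vq->vq_deadline_tree);

		/* shallow queue: keep strict deadline order */
		if (!zfs_zone_schedule_enable ||
		    qdepth < zfs_zone_schedule_thresh)
			return (avl_first(&vq->vq_deadline_tree));

		/* deep queue: bump the highest-priority zone's first zio */
		return (get_next_zio(vq, qdepth));
	}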
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 4b0f5602c1..6094e01876 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -21,11 +21,15 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
@@ -152,6 +156,8 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
+
/*
* Force reconstruction to use the general purpose method.
*/
@@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
};
static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
- uint64_t b = zio->io_offset >> unit_shift;
- uint64_t s = zio->io_size >> unit_shift;
+ uint64_t b = offset >> unit_shift;
+ uint64_t s = size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
@@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++)
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
@@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd)
vdev_close(vd->vdev_child[c]);
}
+/*
+ * Handle a read or write request to a RAID-Z dump device.
+ *
+ * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), reads and writes
+ * to the dump zvol are laid out across a full 128KB block. As a result, an
+ * individual I/O may not span all columns in the RAID-Z map; moreover, a small
+ * I/O may only span a single column.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? B_READ : B_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this I/O operation doesn't span the full block size, treat the
+ * on-disk format as if it consists only of complete 128KB blocks.
+ */
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, that is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ */
+ rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z matrix,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data being read or written spans this
+ * column, issue the appropriate operation to the child vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)rc->rc_data) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+ return (err);
+}
+
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
int c, ret = 0;
raidz_col_t *rc;
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
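
The colstart/colend/colskip arithmetic in vdev_raidz_physio() clips the I/O range to each column. A standalone worked example with made-up sizes, where one column covers [0, 32K) of the block and the I/O covers [8K, 72K):

	#include <stdio.h>
	#include <stdint.h>

	#define MAX(a, b) ((a) > (b) ? (a) : (b))
	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t coloffset = 0, rc_size = 32768;	/* column range */
		uint64_t start = 8192, end = 73728;		/* I/O range */

		uint64_t colstart = MAX(coloffset, start);		/* 8192 */
		uint64_t colend = MIN(end, coloffset + rc_size);	/* 32768 */
		uint64_t colsize = colend - colstart;			/* 24576 */
		uint64_t colskip = colstart - coloffset;		/* 8192 */

		printf("issue %llu bytes, skipping %llu into the column\n",
		    (unsigned long long)colsize, (unsigned long long)colskip);
		return (0);
	}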
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 929fc06296..baffc223a3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -23,6 +23,7 @@
* Portions Copyright 2011 Martin Matuska
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -1757,7 +1758,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
}
static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+ boolean_t cachedpropsonly)
{
int error = 0;
nvlist_t *nv;
@@ -1775,7 +1777,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dmu_objset_type(os) == DMU_OST_ZVOL &&
+ !cachedpropsonly) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
@@ -1802,13 +1805,25 @@ static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os = NULL;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
- if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
return (error);
- error = zfs_ioc_objset_stats_impl(zc, os);
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+ error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
dmu_objset_rele(os, FTAG);
return (error);
@@ -2022,8 +2037,21 @@ static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
top:
if (zc->zc_cookie == 0)
(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
@@ -2072,8 +2100,10 @@ top:
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
- if (error == 0)
- error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc,
+ ossnap, cachedpropsonly);
+ }
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
@@ -2789,6 +2819,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
ASSERT(zplprops != NULL);
@@ -2832,8 +2863,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
@@ -2842,13 +2874,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if (norm)
u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
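
A userland consumer would request the cached-properties-only behavior by packing a boolean into the nvlist passed through zc_nvlist_src. A hedged sketch using libnvpair (the consumer itself is not part of this diff):

	#include <libnvpair.h>

	static int
	pack_cachedpropsonly(char **bufp, size_t *lenp)
	{
		nvlist_t *nvl;

		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
			return (-1);
		if (nvlist_add_boolean_value(nvl, "cachedpropsonly",
		    B_TRUE) != 0 ||
		    nvlist_pack(nvl, bufp, lenp, NV_ENCODE_NATIVE, 0) != 0) {
			nvlist_free(nvl);
			return (-1);
		}
		nvlist_free(nvl);
		/* *bufp and *lenp then go into zc_nvlist_src{,_size} */
		return (0);
	}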
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 0c39274caf..9fae31fa6b 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -25,6 +25,10 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -4145,6 +4149,8 @@ top:
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
@@ -4655,27 +4661,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
return (0);
}
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -4687,10 +4672,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..08f4f38e04
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+int
+zfs_zone_txg_delay()
+{
+ return (1);
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5; /* amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */
+
+hrtime_t zfs_zone_last_checked = 0;
+
+boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the threshold
+ * at which the throttle will start delaying zones. The calculation should
+ * correspond closely with the %b column from iostat.
+ */
+uint_t zfs_zone_util_threshold = 80;
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
+uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+
+uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
+
+typedef struct {
+ hrtime_t cycle_start;
+ int cycle_cnt;
+ hrtime_t cycle_lat;
+ hrtime_t sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+ hrtime_t zi_now;
+ uint_t zi_avgrlat;
+ uint_t zi_avgwlat;
+ uint64_t zi_totpri;
+ uint64_t zi_totutil;
+ int zi_active;
+ uint_t zi_diskutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization.
+ */
+kmutex_t zfs_disk_lock;
+uint_t zfs_disk_rcnt;
+hrtime_t zfs_disk_rtime = 0;
+hrtime_t zfs_disk_rlastupdate = 0;
+
+hrtime_t zfs_disk_last_rtime = 0;
+
+/*
+ * Data used to keep track of how often txg flush is running.
+ */
+extern int zfs_txg_timeout;
+static uint_t txg_last_check;
+static uint_t txg_cnt;
+static uint_t txg_flush_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the
+ * number of I/Os that can be pending on a device. If there are more than a
+ * few ops already queued up, beyond those already issued to the vdev, then
+ * use scheduling to get the next zio.
+ */
+int zfs_zone_schedule_thresh = 5;
+
+/*
+ * Tunables for delay throttling when TxG flush is occurring.
+ */
+int zfs_zone_txg_throttle_scale = 2;
+int zfs_zone_txg_delay_ticks = 2;
+
+typedef struct {
+ int zq_qdepth;
+ int zq_priority;
+ int zq_wt;
+ zoneid_t zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define GET_USEC_TIME (gethrtime() / 1000)
+#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return 0 once a new historical count has been computed and a new cycle
+ * begun; if we're still within an active cycle there is nothing to do, and
+ * the time delta into the current cycle is returned instead.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+	 * Check if it's time to recompute a new zone count.
+	 * If we're still collecting data for the current cycle, return the
+	 * elapsed delta.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_cycle_time)
+ return (delta);
+
+ /* A previous cycle is past, compute the new zone count. */
+
+ /*
+ * Figure out how many generations we have to decay the historical
+ * count, since multiple cycles may have elapsed since our last IO.
+ * We depend on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+	/* If more than 5 cycles have passed since the last IO, reset count. */
+ if (gen_cnt > 5) {
+ cp->zone_avg_cnt = 0;
+ } else {
+ /* Update the count. */
+ int i;
+
+ /*
+ * If the zone did more than 1 IO, just use its current count
+ * as the historical value, otherwise decay the historical
+ * count and factor that into the new historical count. We
+ * pick a threshold > 1 so that we don't lose track of IO due
+ * to int rounding.
+ */
+ if (cp->cycle_cnt > 1)
+ cp->zone_avg_cnt = cp->cycle_cnt;
+ else
+ cp->zone_avg_cnt = cp->cycle_cnt +
+ (cp->zone_avg_cnt / 2);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+
+ return (0);
+}
+
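+/*
+ * Worked example of the decay above, with made-up numbers: suppose
+ * zone_avg_cnt is 8, the zone then does a single IO (cycle_cnt == 1), and
+ * three cycles elapse before the next update (gen_cnt == 3). The new
+ * zone_avg_cnt is 1 + (8 / 2) == 5, which is then halved once for each of
+ * the two idle generations: 5 / 2 == 2, then 2 / 2 == 1 (integer division
+ * throughout).
+ */
+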
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+ zonep->zone_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+ zonep->zone_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+ zonep->zone_lwr_ops.cycle_cnt++;
+ break;
+ }
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made.
+ * If it was more than one cycle ago, then we need to decay the average by the
+ * proper number of additional cycles in which no IO was performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static int
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+	 * Check if it's time to recompute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (0);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+	/* If more than 5 cycles have passed since the last IO, reset average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (1);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ rd_lat.cycle_cnt++;
+ rd_lat.cycle_lat += lat;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ wr_lat.cycle_cnt++;
+ wr_lat.cycle_lat += lat;
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than half way through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ if (compute_new_sys_avg(unow, cp)) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ return (cp->sys_avg_lat);
+ } else {
+ /*
+ * We're within a cycle; weight the current activity higher
+ * compared to the historical data and use that.
+ */
+ extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
+ uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__calc__wt__avg(
+ (uintptr_t)cp->sys_avg_lat,
+ (uintptr_t)cp->cycle_lat,
+ (uintptr_t)cp->cycle_cnt);
+
+ return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+ (1 + (cp->cycle_cnt * 8)));
+ }
+}
+
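+/*
+ * Worked example of the weighting above, with made-up numbers: if the
+ * historical sys_avg_lat is 1000 usecs and the current cycle holds one op
+ * of 2000 usecs, the weighted average is (1000 + 2000 * 8) / (1 + 1 * 8)
+ * == 17000 / 9 == 1888 usecs, pulled strongly toward the recent sample.
+ */
+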
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+ /* Add op to zone */
+ add_zone_iop(zonep, unow, op);
+
+ /* Track system latency */
+ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+ add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate the total number of read ops, write ops and logical write ops
+ * issued by the given zone, returning each count through the supplied
+ * pointers. Return a non-zero value if the zone issued any operations,
+ * otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+ uint_t *lwops)
+{
+ *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+ *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+ extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
+ (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);
+
+ return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+ *rlat = calc_avg_lat(unow, &rd_lat);
+ *wlat = calc_avg_lat(unow, &wr_lat);
+
+ /*
+ * In an attempt to improve the accuracy of the throttling algorithm,
+ * assume that IO operations can't have zero latency. Instead, assume
+ * a reasonable lower bound for each operation type. If the actual
+ * observed latencies are non-zero, use those latency values instead.
+ */
+ if (*rlat == 0)
+ *rlat = 1000;
+ if (*wlat == 0)
+ *wlat = 1000;
+
+ extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
+ (uintptr_t)*wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+ zonep->zone_io_util = 0;
+ return (0);
+ }
+
+ zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+ (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += zonep->zone_io_util;
+
+ if (zonep->zone_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += zonep->zone_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ extern void __dtrace_probe_zfs__zone__utilization(
+ uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
+
+ __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
+ (uint_t)rops, (uint_t)wops, (uint_t)lwops,
+ (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+ if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+ zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+ if (zonep->zone_io_delay > 0)
+ zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zones delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint16_t delay = zonep->zone_io_delay;
+ uint_t fairutil = 0;
+
+ zonep->zone_io_util_above_avg = B_FALSE;
+
+ /*
+	 * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+	 * Adjust the zone's delay. If the overall delay becomes too high, avoid
+ * increasing beyond the ceiling value.
+ */
+ if (zonep->zone_io_util > fairutil &&
+ sp->zi_diskutil > zfs_zone_util_threshold) {
+ zonep->zone_io_util_above_avg = B_TRUE;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(zonep);
+ } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ zfs_zone_delay_dec(zonep);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ extern void __dtrace_probe_zfs__zone__throttle(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__throttle(
+ (uintptr_t)zonep->zone_id, (uintptr_t)delay,
+ (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
+ (uintptr_t)zonep->zone_io_util);
+
+ return (0);
+}
+
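+/*
+ * Worked example of the fair-share math above: with two active zones of
+ * priority 100 and 50 and zi_totutil == 900, the shares are
+ * 900 * 100 / 150 == 600 and 900 * 50 / 150 == 300. A zone's delay is only
+ * increased when its own utilization exceeds its share while zi_diskutil
+ * is above zfs_zone_util_threshold.
+ */
+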
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow)
+{
+ zoneio_stats_t stats;
+
+ (void) bzero(&stats, sizeof (stats));
+
+ stats.zi_now = unow;
+ get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+ if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+ stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+ else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+ stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+ if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+ return;
+
+ /*
+ * Calculate disk utilization for the most recent period.
+ */
+ if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
+ stats.zi_diskutil = 0;
+ } else {
+ stats.zi_diskutil =
+ ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ ((unow - zfs_zone_last_checked) * 1000);
+ }
+ zfs_disk_last_rtime = zfs_disk_rtime;
+
+ /*
+ * sdt:::zfs-zone-stats
+ *
+ * Statistics observed over the last period:
+ *
+ * arg0: average system read latency
+ * arg1: average system write latency
+ * arg2: number of active zones
+ * arg3: total I/O 'utilization' for all zones
+ * arg4: total I/O priority of all active zones
+ * arg5: calculated disk utilization
+ */
+ extern void __dtrace_probe_zfs__zone__stats(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
+ (uintptr_t)(stats.zi_avgwlat),
+ (uintptr_t)(stats.zi_active),
+ (uintptr_t)(stats.zi_totutil),
+ (uintptr_t)(stats.zi_totpri),
+ (uintptr_t)(stats.zi_diskutil));
+
+ (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue. Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+ int pri;
+ zone_q_bump_t *qbp = arg;
+
+ extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id),
+ (uintptr_t)(zonep->zone_zfs_queued));
+
+ if (zonep->zone_zfs_queued == 0) {
+ zonep->zone_zfs_weight = 0;
+ return (0);
+ }
+
+ /*
+ * On each pass, increment the zone's weight. We use this as input
+ * to the calculation to prevent starvation. The value is reset
+ * each time we issue an IO for this zone so zones which haven't
+ * done any IO over several iterations will see their weight max
+ * out.
+ */
+ if (zonep->zone_zfs_weight < 20)
+ zonep->zone_zfs_weight++;
+
+ /*
+	 * This zone's IO priority is inversely proportional to the number of
+	 * IOs the zone has enqueued, multiplied by the zone's configured
+	 * priority and its current weight.
+ * The queue depth has already been scaled by 10 to avoid problems
+ * with int rounding.
+ *
+ * This means that zones with fewer IOs in the queue will get
+	 * preference unless other zones' assigned priorities pull them
+ * ahead. The weight is factored in to help ensure that zones
+ * which haven't done IO in a while aren't getting starved.
+ */
+ pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) *
+ zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+
+ /*
+ * If this zone has a higher priority than what we found so far,
+ * schedule it next.
+ */
+ if (pri > qbp->zq_priority) {
+ qbp->zq_zoneid = zonep->zone_id;
+ qbp->zq_priority = pri;
+ qbp->zq_wt = zonep->zone_zfs_weight;
+ }
+ return (0);
+}
+
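+/*
+ * Worked example of the formula above: with a queue depth of 12
+ * (zq_qdepth == 120 after scaling), a zone with 2 ops queued, priority 100
+ * and weight 3 computes (120 / 2) * 100 * 3 == 18000, while a zone with
+ * 10 ops queued, priority 100 and weight 1 computes
+ * (120 / 10) * 100 * 1 == 1200, so the less-queued zone is picked.
+ */
+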
+/*
+ * See if we need to bump a zone's zio to the head of the queue.
+ *
+ * For single-threaded synchronous workloads a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple workloads
+ * in parallel. This can cause an imbalance in performance if there are zones
+ * with many parallel workloads (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded workloads, such as interactive tasks in the
+ * shell. These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result. This can make the
+ * zone feel sluggish for interactive workloads.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_t *vq, int qdepth)
+{
+ zone_q_bump_t qbump;
+ zio_t *zp = NULL, *zphead;
+ int cnt = 0;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ /* To avoid problems with int rounding, scale the queue depth by 10 */
+ qbump.zq_qdepth = qdepth * 10;
+ qbump.zq_priority = 0;
+ qbump.zq_zoneid = 0;
+ (void) zone_walk(get_sched_pri_cb, &qbump);
+
+ zphead = avl_first(&vq->vq_deadline_tree);
+
+	/* If the scheduler picked a zone, find its first zio in the queue. */
+ if (qbump.zq_zoneid != 0) {
+ for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL;
+ zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) {
+ if (zp->io_zoneid == qbump.zq_zoneid)
+ break;
+ cnt++;
+ }
+ }
+
+ if (zp == NULL) {
+ zp = zphead;
+ } else if (zp != zphead) {
+ /*
+ * Only fire the probe if we actually picked a different zio
+ * than the one already at the head of the queue.
+ */
+ extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__sched__bump(
+ (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt),
+ (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt));
+ }
+
+ return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+ zone_t *zonep = curzone;
+
+ zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track IO operations per zone. Called from dmu_tx_count_write for write ops
+ * and dmu_read_uio for read ops. For each operation, increment that zone's
+ * counter based on the type of operation.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ * we'll count here. Sometime later a kernel taskq thread (we'll see the
+ * vdev IO as zone 0) will perform some number of physical writes to commit
+ * the TXG to disk. Those writes are not associated with the zone which
+ * made the write syscalls and the number of operations is not correlated
+ * between the taskq and the zone.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ * an operation which we'll see here plus a low-level vdev write from
+ * that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ * count the writes going into a TXG here. We'll also see some number
+ * (usually much smaller, maybe only 1) of low-level vdev writes from this
+ * zone when the fsync is performed, plus some other low-level vdev writes
+ * from the taskq in zone 0 (are these metadata writes?).
+ *
+ * In addition to the above, there are misc. system-level writes, such as
+ * writing out dirty pages to swap, or sync(2) calls, which will be handled
+ * by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice because this is called
+ * at a high level by a zone thread, but we also will count the phys. writes
+ * that are performed at a low level via zfs_zone_zio_start.
+ *
+ * Without this, it can look like a non-global zone never writes (case 1).
+ * Depending on when the TXG is flushed, the counts may be in the same sample
+ * bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through dmu_read_uio.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+ zone_t *zonep = curzone;
+ hrtime_t unow;
+ uint16_t wait;
+
+ unow = GET_USEC_TIME;
+
+ /*
+ * Only bump the counters for logical operations here. The counters for
+ * tracking physical IO operations are handled in zfs_zone_zio_done.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, type, 0);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ if (!zfs_zone_delay_enable)
+ return;
+
+ /*
+ * XXX There's a potential race here in that more than one thread may
+	 * update the zone delays concurrently. The worst outcome is corruption
+	 * of the data we use to track each zone's IO, so the algorithm may make
+	 * incorrect throttling decisions until the data is refreshed.
+ */
+ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_wait_adjust(unow);
+ zfs_zone_last_checked = unow;
+ }
+
+ if ((wait = zonep->zone_io_delay) > 0) {
+ /*
+ * If this is a write and we're doing above normal TxG
+ * flushing, then throttle for longer than normal.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+ (txg_cnt > 1 || txg_flush_rate > 1))
+ wait *= zfs_zone_txg_throttle_scale;
+
+ /*
+ * sdt:::zfs-zone-wait
+ *
+ * arg0: zone ID
+ * arg1: type of IO operation
+ * arg2: time to delay (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__wait(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
+ (uintptr_t)type, (uintptr_t)wait);
+
+ drv_usecwait(wait);
+
+ if (zonep->zone_vfs_stats != NULL) {
+ atomic_inc_64(&zonep->zone_vfs_stats->
+ zv_delay_cnt.value.ui64);
+ atomic_add_64(&zonep->zone_vfs_stats->
+ zv_delay_time.value.ui64, wait);
+ }
+ }
+}
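
The sdt:::zfs-zone-wait probe documented above makes the imposed throttle
delays observable per zone. A minimal D sketch (arg0 is the zone ID, arg2
the delay in microseconds):

	# dtrace -n 'sdt:::zfs-zone-wait { @wait[arg0] = quantize(arg2); }'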
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track of whether the TxG flush rate is running above the expected
+ * rate.
+ * If so, this implies that we are filling TxG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the flush rate is going to be 1. When there
+ * is a heavy write load, TxG's fill up fast and the sync thread will write
+ * the TxG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The flush rate is a lagging indicator since it can be up
+ * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_flush_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+ uint_t now;
+
+ txg_cnt++;
+ now = (uint_t)(gethrtime() / NANOSEC);
+ if ((now - txg_last_check) >= zfs_txg_timeout) {
+ txg_flush_rate = txg_cnt / 2;
+ txg_cnt = 0;
+ txg_last_check = now;
+ }
+}
+
+int
+zfs_zone_txg_delay()
+{
+ zone_t *zonep = curzone;
+ int delay = 1;
+
+ if (zonep->zone_io_util_above_avg)
+ delay = zfs_zone_txg_delay_ticks;
+
+ extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id),
+ (uintptr_t)delay);
+
+ return (delay);
+}
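
A minimal D sketch for the zfs-zone-txg-delay probe above (arg0 is the zone
ID, arg1 the delay in ticks):

	# dtrace -n 'sdt:::zfs-zone-txg-delay { @delay[arg0] = quantize(arg1); }'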
+
+/*
+ * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
+ * and is issued. Record the dispatch time for the latency calculation in
+ * zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+ zone_t *zonep;
+
+ /*
+	 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not
+	 * for an actual I/O operation, so ignore them for the purposes of
+	 * throttling and scheduling.
+ */
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_zfs_lock);
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&zonep->zone_zfs_rwstats);
+ zonep->zone_zfs_weight = 0;
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zp->io_dispatched = gethrtime();
+
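+	/*
+	 * Accumulate cumulative "disk busy" time in the style of kstat
+	 * run-queue accounting: if at least one I/O was already outstanding,
+	 * add the time elapsed since the last update.
+	 */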
+ if (zfs_disk_rcnt++ != 0)
+ zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = zp->io_dispatched;
+ mutex_exit(&zfs_disk_lock);
+
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_done when an IO completes.
+ * Increment our counter for zone ops and calculate the average IO latency
+ * for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+ zone_t *zonep;
+ hrtime_t now, unow, udelta;
+
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ now = gethrtime();
+ unow = NANO_TO_MICRO(now);
+ udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+ mutex_enter(&zonep->zone_zfs_lock);
+
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of all the
+ * wait time before each I/O was dispatched. Since most writes are
+ * asynchronous, only track the wait time for read I/Os.
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ zonep->zone_zfs_rwstats.reads++;
+ zonep->zone_zfs_rwstats.nread += zp->io_size;
+
+ zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
+ zp->io_dispatched - zp->io_start;
+
+ kstat_runq_exit(&zonep->zone_zfs_rwstats);
+ } else {
+ zonep->zone_zfs_rwstats.writes++;
+ zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+ }
+
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zfs_disk_rcnt--;
+ zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = now;
+ mutex_exit(&zfs_disk_lock);
+
+ if (zfs_zone_delay_enable) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ zone_rele(zonep);
+
+ /*
+ * sdt:::zfs-zone-latency
+ *
+ * arg0: zone ID
+ * arg1: type of I/O operation
+ * arg2: I/O latency (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__latency(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
+ (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
+}
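
The sdt:::zfs-zone-latency probe documented above yields per-zone,
per-operation-type latency. A minimal D sketch (arg1 is the zio type from
the zio_type enum, e.g. 1 for read, 2 for write):

	# dtrace -n 'sdt:::zfs-zone-latency { @lat[arg0, arg1] = quantize(arg2); }'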
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ ASSERT(zonep->zone_zfs_queued > 0);
+ if (zonep->zone_zfs_queued == 0)
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ else
+ zonep->zone_zfs_queued--;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ zonep->zone_zfs_queued++;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. This function is where zios are found
+ * at the head of the queue (by avl_first), then pulled off (by
+ * vdev_queue_io_remove) and issued. We do our scheduling here to find the
+ * next zio to issue.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq)
+{
+ int cnt;
+ zoneid_t last_zone;
+ zio_t *zp;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ cnt = avl_numnodes(&vq->vq_deadline_tree);
+ last_zone = vq->vq_last_zone_id;
+
+ /*
+ * If there are only a few ops in the queue then just issue the head.
+ * If there are more than a few ops already queued up, then use
+ * scheduling to get the next zio.
+ */
+ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
+ zp = avl_first(&vq->vq_deadline_tree);
+ else
+ zp = get_next_zio(vq, cnt);
+
+ vq->vq_last_zone_id = zp->io_zoneid;
+
+ /*
+	 * Probe with 3 args: the number of IOs in the queue, the zone that
+ * was last scheduled off this queue, and the zone that was associated
+ * with the next IO that is scheduled.
+ */
+ extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt),
+ (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid));
+
+ return (zp);
+}
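
A minimal D sketch for the zfs-zone-sched probe above, counting how often
the scheduler picks a different zone than the one scheduled last:

	# dtrace -n 'sdt:::zfs-zone-sched /arg1 != arg2/
	    { @handoff[arg1, arg2] = count(); }'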
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 64e9acbae1..89c88bc181 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -36,6 +36,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/zfs_zone.h>
/*
* ==========================================================================
@@ -501,6 +502,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
+ zio->io_start = gethrtime();
+
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -552,11 +555,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ zio->io_zoneid = pio->io_zoneid;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
+ } else {
+ zfs_zone_zio_init(zio);
}
return (zio);
@@ -894,6 +900,8 @@ zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ zio->io_start = gethrtime();
+
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -2279,6 +2287,9 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_start = gethrtime();
+
if (vd == NULL) {
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index c8fe20f2eb..c7dd90c45d 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,6 +78,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256_mac"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
};
enum zio_checksum
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index df9a16bccb..4dc63888fd 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -76,9 +76,11 @@
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
+#include <sys/sdt.h>
#include "zfs_namecheck.h"
@@ -1059,27 +1061,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
}
static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
}
}
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@@ -1087,19 +1090,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
else if (!doread && !vdev_writeable(vd))
return (EIO);
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread));
+ }
+
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
ASSERT(!doread);
if (doread)
return (EIO);
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- doread ? B_READ : B_WRITE));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
}
}
@@ -1131,7 +1142,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1322,6 +1334,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1340,6 +1354,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
}
}
zfs_range_unlock(rl);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+ error);
+
return (error);
}
@@ -1369,6 +1387,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
@@ -1399,6 +1419,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
zfs_range_unlock(rl);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+ error);
+
return (error);
}
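
The zvol-uio-start/zvol-uio-done probe pairs added above can be used to
measure per-request zvol latency. A minimal D sketch (assuming start and
done fire on the same thread, as they do in these synchronous read/write
paths; arg2 is the read/write flag passed to the probes):

	sdt:::zvol-uio-start
	{
		self->ts = timestamp;
	}

	sdt:::zvol-uio-done
	/self->ts/
	{
		@lat[arg2 ? "write" : "read"] = quantize(timestamp - self->ts);
		self->ts = 0;
	}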
@@ -1852,7 +1876,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
+ ZIO_CHECKSUM_NOPARITY) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),