Diffstat (limited to 'usr/src/uts/common/fs/zfs')
23 files changed, 436 insertions, 72 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5caabf8260..d8e9f26bdb 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -190,6 +190,7 @@ uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; /* * Note that buffers can be in one of 6 states: @@ -292,6 +293,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { @@ -347,7 +351,10 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -1362,6 +1369,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1460,6 +1478,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1644,6 +1672,48 @@ arc_buf_size(arc_buf_t *buf) } /* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + +/* * Evict buffers from list until we've removed the specified number of * bytes. 
Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: @@ -2638,8 +2708,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3186,6 +3258,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 437e0ac85c..e8bf55c321 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2089,7 +2089,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 190b26e5bf..a9308b0c08 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -574,7 +574,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (dn->dn_indblkshift - SPA_BLKPTRSHIFT); while (level++ < maxlevel) { - txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs)) + txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift; blkcnt = 1 + (blkcnt >> epbs); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index d9cd70f1c8..968fbd80d6 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -5983,6 +5983,10 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6111,6 +6115,8 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); + /* * Clear the dirty config list. 
*/ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 30681b6464..a254c8d656 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ #include <sys/zfs_context.h> #include <sys/spa_impl.h> +#include <sys/spa_boot.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> @@ -249,6 +250,26 @@ int zfs_flags = 0; */ int zfs_recover = 0; +extern int zfs_txg_synctime_ms; + +/* + * Expiration time in units of zfs_txg_synctime_ms. This value has two + * meanings. First it is used to determine when the spa_deadman logic + * should fire. By default the spa_deadman will fire if spa_sync has + * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). + * Secondly, the value determines if an I/O is considered "hung". + * Any I/O that has not completed in zfs_deadman_synctime is considered + * "hung" resulting in a system panic. + */ +uint64_t zfs_deadman_synctime = 1000ULL; + +/* + * Override the zfs deadman behavior via /etc/system. By default the + * deadman is enabled except on VMware and sparc deployments. + */ +int zfs_deadman_enabled = -1; + /* * ========================================================================== @@ -418,6 +439,23 @@ spa_lookup(const char *name) } /* + * Fires when spa_sync has not completed within zfs_deadman_synctime. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +} + +/* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ @@ -427,6 +465,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; + cyc_handler_t hdlr; + cyc_time_t when; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -458,6 +498,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + hdlr.cyh_func = spa_deadman; + hdlr.cyh_arg = spa; + hdlr.cyh_level = CY_LOW_LEVEL; + + spa->spa_deadman_synctime = zfs_deadman_synctime * + zfs_txg_synctime_ms * MICROSEC; + + /* + * This determines how often we need to check for hung I/Os after + * the cyclic has already fired. Since checking for hung I/Os is + * an expensive operation we don't want to check too frequently. + * Instead wait for 5 synctimes before checking again.
+ */ + when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; + when.cyt_when = CY_INFINITY; + mutex_enter(&cpu_lock); + spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -540,6 +599,12 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; + refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); @@ -1507,6 +1572,12 @@ spa_prev_software_version(spa_t *spa) } uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); @@ -1600,7 +1671,9 @@ spa_init(int mode) spa_mode_global = mode; -#ifndef _KERNEL +#ifdef _KERNEL + spa_arch_init(); +#else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 28dbc57275..b109dcafbc 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -99,6 +99,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h index 6661e47cfc..8ae05ce364 100644 --- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_SA_IMPL_H @@ -181,7 +182,7 @@ typedef struct sa_hdr_phys { */ #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ { \ BF32_SET_SB(x, 10, 6, 3, 0, size); \ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 1043f4038a..172a9f141e 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -604,6 +604,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_boot.h b/usr/src/uts/common/fs/zfs/sys/spa_boot.h index 1d3622f5a1..8df5072a55 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H @@ -35,6 +39,8 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); +extern void spa_arch_init(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 027832e858..42ce5556d3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -227,6 +227,10 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + cyclic_id_t spa_deadman_cycid; /* cyclic id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + uint64_t spa_sync_starttime; /* starting time of spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 7e34889b61..5a7836612b 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -79,6 +79,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index c772d954bb..e4c02bde1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -105,6 +105,8 @@ struct vdev_queue { avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; zoneid_t vq_last_zone_id; + uint64_t vq_io_complete_ts; + uint64_t vq_io_delta_ts; kmutex_t vq_lock; }; @@ -321,6 +323,14 @@ extern void vdev_set_min_asize(vdev_t *vd); */ extern int zfs_vdev_cache_size; +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index fdd0412fee..0dc8d8859c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -22,8 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved.
*/ #ifndef _SYS_ZFS_CONTEXT_H @@ -67,6 +69,7 @@ extern "C" { #include <sys/sysevent/dev.h> #include <sys/fm/util.h> #include <sys/sunddi.h> +#include <sys/cyclic.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 4d781ad2a4..86e901be0d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -240,12 +240,24 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index ce3a983d9f..9c718f691a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -21,8 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. @@ -406,6 +404,7 @@ struct zio { uint64_t io_offset; uint64_t io_deadline; + uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -554,6 +553,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); +extern uint64_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fa0a579e66..18180ecad3 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -3153,3 +3153,41 @@ vdev_split(vdev_t *vd) } vdev_propagate_state(cvd); } + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_pending_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. 
+ */ + fio = avl_first(&vq->vq_pending_tree); + delta = ddi_get_lbolt64() - fio->io_timestamp; + if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) { + zfs_dbgmsg("SLOW IO: zio timestamp %llu, " + "delta %llu, last io %llu", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + fm_panic("I/O to pool '%s' appears to be " + "hung.", spa_name(spa)); + } + } + mutex_exit(&vq->vq_lock); + } +} diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 1ba343226f..dfadeca9d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -42,11 +42,6 @@ extern ldi_ident_t zfs_li; -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - static void vdev_disk_hold(vdev_t *vd) { @@ -170,7 +165,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we + * us, even if it is one of multiple paths to the same device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * @@ -416,8 +411,8 @@ vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, static void vdev_disk_io_intr(buf_t *bp) { - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. @@ -429,7 +424,7 @@ vdev_disk_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; - kmem_free(vdb, sizeof (vdev_disk_buf_t)); + kmem_free(vb, sizeof (vdev_buf_t)); zio_interrupt(zio); } @@ -460,7 +455,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; + vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; @@ -524,10 +519,10 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; + vb->vb_io = zio; + bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 043fa51294..1fbce5e542 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -25,6 +25,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -140,12 +141,55 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = ENOSPC; + + kmem_free(vb, sizeof (vdev_buf_t)); + zio_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + buf_t *bp = arg; + vnode_t *vp = bp->b_private; + ssize_t resid; + int error; + + error = vn_rdwr((bp->b_flags & B_READ) ? 
UIO_READ : UIO_WRITE, + vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (error == 0) { + bp->b_resid = resid; + biodone(bp); + } else { + bioerror(bp, error); + biodone(bp); + } +} + static int vdev_file_io_start(zio_t *zio) { + spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; + vdev_buf_t *vb; + buf_t *bp; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ @@ -166,15 +210,22 @@ vdev_file_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; + vb->vb_io = zio; + bp = &vb->vb_buf; - zio_interrupt(zio); + bioinit(bp); + bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_private = vf->vf_vnode; + bp->b_iodone = (int (*)())vdev_file_io_intr; + + taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], + vdev_file_io_strategy, bp, 0, &zio->io_tqent); return (ZIO_PIPELINE_STOP); } diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4ea958a9f6..8dec283fee 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -24,6 +24,10 @@ * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -298,6 +302,7 @@ again: zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -369,7 +374,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = ddi_get_lbolt64(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -394,10 +400,16 @@ vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); + mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); + vq->vq_io_complete_ts = ddi_get_lbolt64(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 2292f658b3..c7bfbbaec4 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -50,6 +50,7 @@ #include <sys/spa.h> #include <sys/zap.h> #include <sys/sa.h> +#include <sys/sa_impl.h> #include <sys/varargs.h> #include <sys/policy.h> #include <sys/atomic.h> @@ -64,7 +65,6 @@ #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> -#include <sys/sa.h> #include "zfs_comutil.h" int zfsfstype; @@ -578,7 +578,6 @@ static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { - znode_phys_t *znp = data; int error 
= 0; /* @@ -597,20 +596,18 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, return (EEXIST); if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); - hdrsize = sa_hdrsize(data); - if (hdrsize != 0) { - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_GID_OFFSET)); - } else { + if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled @@ -618,6 +615,25 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, */ *userp = 0; *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); } } return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 0c86cac427..92dc05f4a0 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1947,13 +1947,16 @@ zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) * or not the object is an extended attribute directory. */ static int -zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, - int *is_xattrdir) +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; + uint64_t parent_mode; sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; int count = 0; int error; @@ -1967,9 +1970,32 @@ zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); - *pobjp = parent; + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. 
+ */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (EINVAL); + + *pobjp = parent; + return (0); } @@ -2018,7 +2044,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, if (prevdb) zfs_release_sa_handle(prevhdl, prevdb, FTAG); - if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index e2e98b7896..00964aa83f 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2928,7 +2928,7 @@ zio_done(zio_t *zio) * Hand it off to the otherwise-unused claim taskq. */ ASSERT(zio->io_tqent.tqent_next == NULL); - (void) taskq_dispatch_ent( + taskq_dispatch_ent( spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 9ae7d1f697..a9d4ab4070 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors and panic injection */ - if (handler->zi_record.zi_guid != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ @@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults or panic injection */ - if (handler->zi_record.zi_start == 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* @@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* - * Ignore label specific faults, panic injection - * or fake writes - */ - if (handler->zi_record.zi_start != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { @@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio) handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* @@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; 
if (handler->zi_record.zi_duration > 0) { @@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } +uint64_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *handler; + uint64_t seconds = 0; + + if (zio_injection_enabled == 0) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + seconds = handler->zi_record.zi_timer; + break; + } + + } + rw_exit(&inject_lock); + return (seconds); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, |
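
Note on the arc.c changes above: the new kstats obey a single invariant — arcstat_duplicate_buffers counts every in-core copy beyond the first of an ARC_BUFC_DATA buffer, bumped in arc_buf_clone() and dropped again in arc_buf_destroy() and arc_release(). A toy user-space model of that accounting (hdr_t here is a two-field stand-in for arc_buf_hdr_t, not the real structure):

#include <stdio.h>
#include <stdint.h>

typedef struct {
    int datacnt;        /* stands in for arc_buf_hdr_t b_datacnt */
    uint64_t size;      /* stands in for arc_buf_hdr_t b_size */
} hdr_t;

static uint64_t dup_buffers, dup_buffers_size;

static void
clone_buf(hdr_t *hdr)   /* mirrors the arc_buf_clone() bump */
{
    dup_buffers++;                  /* ARCSTAT_BUMP */
    dup_buffers_size += hdr->size;  /* ARCSTAT_INCR */
    hdr->datacnt++;
}

static void
destroy_buf(hdr_t *hdr) /* mirrors the arc_buf_destroy() bumpdown */
{
    if (hdr->datacnt > 1) {
        dup_buffers--;
        dup_buffers_size -= hdr->size;
    }
    hdr->datacnt--;
}

int
main(void)
{
    hdr_t hdr = { 1, 131072 };  /* one cached 128K data buffer */

    clone_buf(&hdr);    /* two copies -> one duplicate */
    clone_buf(&hdr);    /* three copies -> two duplicates */
    destroy_buf(&hdr);  /* back to one duplicate */
    printf("dups=%llu bytes=%llu datacnt=%d\n",
        (unsigned long long)dup_buffers,
        (unsigned long long)dup_buffers_size, hdr.datacnt);
    return (0);
}

The final state (one duplicate, b_datacnt of 2) is exactly what arc_buf_eviction_needed() reports to dbuf_rele_and_unlock() so the duplicate dbuf can evict itself.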
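
The one-line dmu_tx_count_free() change replaces MIN(blkcnt, nl1blks >> epbs) with MAX(MIN(blkcnt, nl1blks), 1): when fewer than a full fanout of L1 blocks is freed, the old expression truncated to zero and charged no memory for the higher indirect levels. A sketch with hypothetical values (epbs = 7, i.e. 128 block pointers per indirect, 16K indirect blocks) shows the difference:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))
#define MAX(a, b)   ((a) > (b) ? (a) : (b))

int
main(void)
{
    uint64_t blkcnt = 1, nl1blks = 1;   /* freeing one L1's worth */
    int epbs = 7;                       /* 128 blkptrs per indirect */
    int indblkshift = 14;               /* 16K indirect blocks */

    uint64_t old_hold = MIN(blkcnt, (nl1blks >> epbs)) << indblkshift;
    uint64_t new_hold = MAX(MIN(blkcnt, nl1blks), 1) << indblkshift;

    /* old: 0 bytes held; new: 16384 bytes, at least one block per level */
    printf("old=%llu new=%llu\n", (unsigned long long)old_hold,
        (unsigned long long)new_hold);
    return (0);
}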
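
The deadman arming in spa_add() reduces to simple arithmetic. A stand-alone sketch, assuming the defaults visible in this diff (zfs_deadman_synctime = 1000) plus an assumed zfs_txg_synctime_ms of 1000 and local stand-ins for the illumos MICROSEC/NANOSEC constants:

#include <stdio.h>
#include <stdint.h>

#define MICROSEC    1000000ULL      /* ms -> ns scale, as in sys/time.h */
#define NANOSEC     1000000000ULL

static uint64_t zfs_deadman_synctime = 1000ULL; /* default from this diff */
static int zfs_txg_synctime_ms = 1000;          /* assumed 1s txg target */

int
main(void)
{
    /* value stored in spa->spa_deadman_synctime by spa_add() */
    uint64_t expire = zfs_deadman_synctime *
        (uint64_t)zfs_txg_synctime_ms * MICROSEC;
    /* cyt_interval: re-check cadence once the cyclic has fired */
    uint64_t interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;

    printf("deadman fires %llu s after spa_sync starts\n",
        (unsigned long long)(expire / NANOSEC));
    printf("then re-checks every %llu s\n",
        (unsigned long long)(interval / NANOSEC));
    return (0);
}

spa_sync() arms this by reprogramming the cyclic to spa_sync_starttime + spa_deadman_synctime on entry and back to CY_INFINITY after dmu_tx_commit(), so a healthy sync never lets it fire.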
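
vdev_deadman() mixes units deliberately: io_timestamp comes from ddi_get_lbolt64() in clock ticks, while spa_deadman_synctime(spa) is in nanoseconds, so the threshold passes through NSEC_TO_TICK before the comparison. A quick check, assuming hz = 100 and a simplified NSEC_TO_TICK:

#include <stdio.h>
#include <stdint.h>

#define HZ          100ULL              /* assumed clock rate */
#define NANOSEC     1000000000ULL
#define NSEC_TO_TICK(ns)    ((ns) / (NANOSEC / HZ))

int
main(void)
{
    uint64_t deadman_ns = 1000ULL * NANOSEC;    /* spa_deadman_synctime(spa) */
    uint64_t now = 500000, io_timestamp = 350000;   /* lbolt ticks */
    uint64_t delta = now - io_timestamp;

    if (delta > NSEC_TO_TICK(deadman_ns))   /* 150000 > 100000 ticks */
        printf("I/O outstanding %llu s: vdev_deadman() would panic\n",
            (unsigned long long)(delta / HZ));
    return (0);
}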
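
The zfs_space_delta_cb() rework copies the sa_hdr_phys_t and, when sa_magic arrives byte-swapped, swaps the header (and later the uid/gid words) before use. A reduced model of just the detection step — the struct is a stand-in carrying only the two fields the check touches, and the bswap macros are local substitutes for the kernel's:

#include <stdio.h>
#include <stdint.h>

#define SA_MAGIC    0x2F505A            /* on-disk SA magic */
#define BSWAP_16(x) ((uint16_t)(((x) << 8) | ((x) >> 8)))
#define BSWAP_32(x) __builtin_bswap32(x)    /* gcc/clang builtin */

typedef struct {
    uint32_t sa_magic;
    uint16_t sa_layout_info;
} sa_hdr_t;                             /* reduced sa_hdr_phys_t stand-in */

int
main(void)
{
    /* header as read from a pool written with the opposite endianness */
    sa_hdr_t sa = { BSWAP_32(SA_MAGIC), BSWAP_16(0x1234) };
    int swap = 0;

    if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
        sa.sa_magic = SA_MAGIC;
        sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
        swap = 1;   /* the uid/gid words must be BSWAP_64'd later too */
    }
    printf("swap=%d layout_info=0x%x\n", swap, sa.sa_layout_info);
    return (0);
}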
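
The new ZINJECT_DELAY_IO path is a straight handler scan: vdev_queue_io_done() calls zio_handle_io_delay(), which walks the handler list for a ZINJECT_DELAY_IO record matching the vdev guid and returns zi_timer seconds for the caller to sleep off via delay(SEC_TO_TICK(...)). A sketch of that lookup over a plain array (the kernel version holds inject_lock while walking a linked list):

#include <stdio.h>
#include <stdint.h>

#define ZINJECT_DELAY_IO    6   /* position in the zinject_type_t enum */

typedef struct {
    int      zi_cmd;    /* which fault this handler injects */
    uint64_t zi_guid;   /* target leaf vdev */
    uint64_t zi_timer;  /* delay in seconds for ZINJECT_DELAY_IO */
} handler_t;

static uint64_t
io_delay(const handler_t *h, int n, uint64_t vdev_guid)
{
    for (int i = 0; i < n; i++) {
        if (h[i].zi_cmd != ZINJECT_DELAY_IO)
            continue;                   /* skip unrelated handlers */
        if (h[i].zi_guid == vdev_guid)
            return (h[i].zi_timer);     /* first match wins */
    }
    return (0);                         /* no delay injected */
}

int
main(void)
{
    handler_t handlers[] = {
        { 1 /* ZINJECT_DATA_FAULT */, 0xabc, 0 },
        { ZINJECT_DELAY_IO, 0xdef, 30 },
    };

    printf("delay=%llus\n", (unsigned long long)
        io_delay(handlers, 2, 0xdef));
    return (0);
}

Replacing the old "match by what a record is not" checks with an explicit zi_cmd field is what lets each injection routine filter in one comparison.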