diff options
author | ahrens <none@none> | 2006-11-03 11:39:28 -0800 |
---|---|---|
committer | ahrens <none@none> | 2006-11-03 11:39:28 -0800 |
commit | 614409b5be5411058e7e9b6cc93dddaff9fb13f7 (patch) | |
tree | 994d814287dee3e4d808d3f845b3f62e5c99acbc /usr/src | |
parent | ada9354b28215e27f2a3b25b9c352681c9cbdfa1 (diff) | |
download | illumos-gate-614409b5be5411058e7e9b6cc93dddaff9fb13f7.tar.gz |
6472021 vdev knobs can not be turned
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/mdb/common/modules/zfs/zfs.c | 120 | ||||
-rw-r--r-- | usr/src/cmd/zdb/zdb.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev.h | 14 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 10 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev.c | 117 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_cache.c | 55 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_file.c | 16 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/vdev_queue.c | 45 |
8 files changed, 156 insertions, 229 deletions
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index d3132acc61..57ee55f050 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -329,6 +329,74 @@ zio_pipeline(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) /* ARGSUSED */ static int +zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + /* + * This table can be approximately generated by running: + * egrep "^[a-z0-9_]+ [a-z0-9_]+( =.*)?;" *.c | cut -d ' ' -f 2 + */ + static const char *params[] = { + "arc_reduce_dnlc_percent", + "zfs_arc_max", + "zfs_arc_min", + "arc_kmem_reclaim_shift", + "zfs_mdcomp_disable", + "zfs_prefetch_disable", + "zfetch_max_streams", + "zfetch_min_sec_reap", + "zfetch_block_cap", + "zfetch_array_rd_sz", + "zfs_default_bs", + "zfs_default_ibs", + "metaslab_aliquot", + "reference_tracking_enable", + "reference_history", + "zio_taskq_threads", + "spa_max_replication_override", + "spa_mode", + "zfs_flags", + "txg_time", + "zfs_vdev_cache_max", + "zfs_vdev_cache_size", + "zfs_vdev_cache_bshift", + "vdev_mirror_shift", + "zfs_vdev_max_pending", + "zfs_vdev_min_pending", + "zfs_scrub_limit", + "zfs_vdev_time_shift", + "zfs_vdev_ramp_rate", + "zfs_vdev_aggregation_limit", + "fzap_default_block_shift", + "zfs_immediate_write_sz", + "zfs_read_chunk_size", + "zil_disable", + "zfs_nocacheflush", + "zio_gang_bang", + "zio_injection_enabled", + "zvol_immediate_write_sz", + }; + int i; + + for (i = 0; i < sizeof (params) / sizeof (params[0]); i++) { + int sz; + uint64_t val64; + uint32_t *val32p = (uint32_t *)&val64; + + sz = mdb_readvar(&val64, params[i]); + if (sz == 4) { + mdb_printf("%s = 0x%x\n", params[i], *val32p); + } else if (sz == 8) { + mdb_printf("%s = 0x%llx\n", params[i], val64); + } else { + mdb_warn("variable %s not found", params[i]); + } + } + + return (DCMD_OK); +} + +/* ARGSUSED */ +static int blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { blkptr_t bp; @@ -832,8 +900,8 @@ spa_print_config(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) void vdev_help(void) { - mdb_printf("[vdev_t*]::vdev [-qr]\n" - "\t-> -q display vdev_queue parameters\n" + mdb_printf("[vdev_t*]::vdev [-er]\n" + "\t-> -e display vdev stats\n" "\t-> -r recursive (visit all children)\n"); } @@ -845,21 +913,12 @@ vdev_help(void) * ADDR STATE AUX DESC * fffffffbcde23df0 HEALTHY - /dev/dsk/c0t0d0 * - * or with "-q" to print out a vdev_t's vdev_queue parameters: - * - * vdev_t: c26ae4c0 - * c26ae73c min pending 0x2 - * c26ae744 max pending 0x23 - * c26ae74c agg limit 0x20000 - * c26ae754 time shift 0x4 - * c26ae75c ramp rate 0x2 - * * If '-r' is specified, recursively visit all children. * * With '-e', the statistics associated with the vdev are printed as well. */ static int -do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats, +do_print_vdev(uintptr_t addr, int flags, int depth, int stats, int recursive) { vdev_t vdev; @@ -954,32 +1013,6 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats, mdb_printf("%-9s %-12s %*s%s\n", state, aux, depth, "", desc); - if (queue) { - mdb_inc_indent(4); - mdb_printf("\n"); - mdb_printf("%p min pending 0x%llx\n", - (uintptr_t)(addr + offsetof(vdev_t, - vdev_queue.vq_min_pending)), - vdev.vdev_queue.vq_min_pending); - mdb_printf("%p max pending 0x%llx\n", - (uintptr_t)(addr + offsetof(vdev_t, - vdev_queue.vq_max_pending)), - vdev.vdev_queue.vq_max_pending); - mdb_printf("%p agg limit 0x%llx\n", - (uintptr_t)(addr + offsetof(vdev_t, - vdev_queue.vq_agg_limit)), - vdev.vdev_queue.vq_agg_limit); - mdb_printf("%p time shift 0x%llx\n", - (uintptr_t)(addr + offsetof(vdev_t, - vdev_queue.vq_time_shift)), - vdev.vdev_queue.vq_time_shift); - mdb_printf("%p ramp rate 0x%llx\n", - (uintptr_t)(addr + offsetof(vdev_t, - vdev_queue.vq_ramp_rate)), - vdev.vdev_queue.vq_ramp_rate); - mdb_dec_indent(4); - } - if (stats) { vdev_stat_t *vs = &vdev.vdev_stat; int i; @@ -1008,7 +1041,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats, mdb_dec_indent(4); } - if (queue || stats) + if (stats) mdb_printf("\n"); } @@ -1025,7 +1058,7 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats, } for (c = 0; c < children; c++) { - if (do_print_vdev(child[c], flags, depth + 2, queue, stats, + if (do_print_vdev(child[c], flags, depth + 2, stats, recursive)) return (DCMD_ERR); } @@ -1036,12 +1069,10 @@ do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats, static int vdev_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - int print_queue = FALSE; int recursive = FALSE; int stats = FALSE; if (mdb_getopts(argc, argv, - 'q', MDB_OPT_SETBITS, TRUE, &print_queue, 'r', MDB_OPT_SETBITS, TRUE, &recursive, 'e', MDB_OPT_SETBITS, TRUE, &stats, NULL) != argc) @@ -1052,7 +1083,7 @@ vdev_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_ERR); } - return (do_print_vdev(addr, flags, 0, print_queue, stats, recursive)); + return (do_print_vdev(addr, flags, 0, stats, recursive)); } typedef struct metaslab_walk_data { @@ -1546,8 +1577,9 @@ static const mdb_dcmd_t dcmds[] = { { "spa_verify", ":", "verify spa_t consistency", spa_verify }, { "spa_space", ":[-b]", "print spa_t on-disk space usage", spa_space }, { "spa_vdevs", ":", "given a spa_t, print vdev summary", spa_vdevs }, - { "vdev", ":[-qre]", "vdev_t summary", vdev_print }, + { "vdev", ":[-re]", "vdev_t summary", vdev_print }, { "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline }, + { "zfs_params", "", "print zfs tunable parameters", zfs_params }, { NULL } }; diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 1fa0a6b408..a834f95e12 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -2056,6 +2056,8 @@ out: int main(int argc, char **argv) { + extern int zfs_vdev_cache_size; + int i, c; struct rlimit rl = { 1024, 1024 }; spa_t *spa; @@ -2065,7 +2067,6 @@ main(int argc, char **argv) int verbose = 0; int error; int flag, set; - vdev_knob_t *vk; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); @@ -2147,10 +2148,7 @@ main(int argc, char **argv) * Disable vdev caching. If we don't do this, live pool traversal * won't make progress because it will never see disk updates. */ - for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) { - if (strcmp(vk->vk_name, "cache_size") == 0) - vk->vk_default = 0; - } + zfs_vdev_cache_size = 0; for (c = 0; c < 256; c++) { if (dump_all && c != 'L' && c != 'l' && c != 'R') diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index a13c620421..ae8d157d1a 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -41,18 +41,6 @@ extern "C" { extern boolean_t zfs_nocacheflush; /* - * Vdev knobs. - */ -typedef struct vdev_knob { - char *vk_name; /* knob name */ - char *vk_desc; /* knob description */ - uint64_t vk_min; /* minimum legal value */ - uint64_t vk_max; /* maximum legal value */ - uint64_t vk_default; /* default value */ - size_t vk_offset; /* offset into vdev_t */ -} vdev_knob_t; - -/* * Fault injection modes. */ #define VDEV_FAULT_NONE 0 @@ -113,8 +101,6 @@ extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); -extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk); - extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t *vd, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index c41cf5402a..d136a8f527 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -91,22 +91,12 @@ struct vdev_cache_entry { }; struct vdev_cache { - uint64_t vc_size; - uint64_t vc_bshift; - uint64_t vc_blocksize; - uint64_t vc_max; avl_tree_t vc_offset_tree; avl_tree_t vc_lastused_tree; kmutex_t vc_lock; }; struct vdev_queue { - uint64_t vq_min_pending; - uint64_t vq_max_pending; - uint64_t vq_scrub_limit; - uint64_t vq_agg_limit; - uint64_t vq_time_shift; - uint64_t vq_ramp_rate; uint64_t vq_scrub_count; avl_tree_t vq_deadline_tree; avl_tree_t vq_read_tree; diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 00eff00202..007833e95e 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -45,15 +45,6 @@ * Virtual device management. */ -/* - * These tunables are for performance analysis, and override the - * (not-easily-turnable) vdev "knobs". - */ -int zfs_vdev_cache_max; -int zfs_vdev_max_pending; -int zfs_vdev_min_pending; -int zfs_vdev_time_shift; - static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, @@ -774,7 +765,6 @@ int vdev_open(vdev_t *vd) { int error; - vdev_knob_t *vk; int c; uint64_t osize = 0; uint64_t asize, psize; @@ -791,23 +781,6 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; - for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) { - uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset); - - *valp = vk->vk_default; - *valp = MAX(*valp, vk->vk_min); - *valp = MIN(*valp, vk->vk_max); - } - - if (zfs_vdev_cache_max) - vd->vdev_cache.vc_max = zfs_vdev_cache_max; - if (zfs_vdev_max_pending) - vd->vdev_queue.vq_max_pending = zfs_vdev_max_pending; - if (zfs_vdev_min_pending) - vd->vdev_queue.vq_min_pending = zfs_vdev_min_pending; - if (zfs_vdev_time_shift) - vd->vdev_queue.vq_time_shift = zfs_vdev_time_shift; - if (vd->vdev_ops->vdev_op_leaf) { vdev_cache_init(vd); vdev_queue_init(vd); @@ -1748,96 +1721,6 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) } /* - * Various knobs to tune a vdev. - */ -static vdev_knob_t vdev_knob[] = { - { - "cache_size", - "size of the read-ahead cache", - 0, - 1ULL << 30, - 10ULL << 20, - offsetof(struct vdev, vdev_cache.vc_size) - }, - { - "cache_bshift", - "log2 of cache blocksize", - SPA_MINBLOCKSHIFT, - SPA_MAXBLOCKSHIFT, - 16, - offsetof(struct vdev, vdev_cache.vc_bshift) - }, - { - "cache_max", - "largest block size to cache", - 0, - SPA_MAXBLOCKSIZE, - 1ULL << 14, - offsetof(struct vdev, vdev_cache.vc_max) - }, - { - "min_pending", - "minimum pending I/Os to the disk", - 1, - 10000, - 4, - offsetof(struct vdev, vdev_queue.vq_min_pending) - }, - { - "max_pending", - "maximum pending I/Os to the disk", - 1, - 10000, - 35, - offsetof(struct vdev, vdev_queue.vq_max_pending) - }, - { - "scrub_limit", - "maximum scrub/resilver I/O queue", - 0, - 10000, - 70, - offsetof(struct vdev, vdev_queue.vq_scrub_limit) - }, - { - "agg_limit", - "maximum size of aggregated I/Os", - 0, - SPA_MAXBLOCKSIZE, - SPA_MAXBLOCKSIZE, - offsetof(struct vdev, vdev_queue.vq_agg_limit) - }, - { - "time_shift", - "deadline = pri + (lbolt >> time_shift)", - 0, - 63, - 6, - offsetof(struct vdev, vdev_queue.vq_time_shift) - }, - { - "ramp_rate", - "exponential I/O issue ramp-up rate", - 1, - 10000, - 2, - offsetof(struct vdev, vdev_queue.vq_ramp_rate) - }, -}; - -vdev_knob_t * -vdev_knob_next(vdev_knob_t *vk) -{ - if (vk == NULL) - return (vdev_knob); - - if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t)) - return (NULL); - - return (vk); -} - -/* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index 67a8924b52..2d8795c660 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -60,9 +60,24 @@ * (4) Write. Update cache contents after write completion. * * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds vc_size. + * if the total cache size exceeds zfs_vdev_cache_size. */ +/* + * These tunables are for performance analysis. + */ +/* + * All i/os smaller than zfs_vdev_cache_max will be turned into + * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software + * track buffer. At most zfs_vdev_cache_size bytes will be kept in each + * vdev's vdev_cache. + */ +int zfs_vdev_cache_max = 1<<14; +int zfs_vdev_cache_size = 10ULL << 20; +int zfs_vdev_cache_bshift = 16; + +#define VCBS (1 << zfs_vdev_cache_bshift) + static int vdev_cache_offset_compare(const void *a1, const void *a2) { @@ -109,7 +124,7 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, vc->vc_blocksize); + zio_buf_free(ve->ve_data, VCBS); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -122,20 +137,20 @@ static vdev_cache_entry_t * vdev_cache_allocate(zio_t *zio) { vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize); + uint64_t offset = P2ALIGN(zio->io_offset, VCBS); vdev_cache_entry_t *ve; ASSERT(MUTEX_HELD(&vc->vc_lock)); - if (vc->vc_size == 0) + if (zfs_vdev_cache_size == 0) return (NULL); /* * If adding a new entry would exceed the cache size, * evict the oldest entry (LRU). */ - if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) > - vc->vc_size) { + if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > + zfs_vdev_cache_size) { ve = avl_first(&vc->vc_lastused_tree); if (ve->ve_fill_io != NULL) { dprintf("can't evict in %p, still filling\n", vc); @@ -148,7 +163,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = lbolt; - ve->ve_data = zio_buf_alloc(vc->vc_blocksize); + ve->ve_data = zio_buf_alloc(VCBS); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -159,7 +174,7 @@ vdev_cache_allocate(zio_t *zio) static void vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) { - uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize); + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(ve->ve_fill_io == NULL); @@ -185,7 +200,7 @@ vdev_cache_fill(zio_t *zio) vdev_cache_entry_t *ve = zio->io_private; zio_t *dio; - ASSERT(zio->io_size == vc->vc_blocksize); + ASSERT(zio->io_size == VCBS); /* * Add data to the cache. @@ -227,8 +242,8 @@ vdev_cache_read(zio_t *zio) { vdev_cache_t *vc = &zio->io_vd->vdev_cache; vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize); - uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize); + uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); zio_t *fio; ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -236,17 +251,16 @@ vdev_cache_read(zio_t *zio) if (zio->io_flags & ZIO_FLAG_DONT_CACHE) return (EINVAL); - if (zio->io_size > vc->vc_max) + if (zio->io_size > zfs_vdev_cache_max) return (EOVERFLOW); /* * If the I/O straddles two or more cache blocks, don't cache it. */ - if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, - vc->vc_blocksize)) + if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS)) return (EXDEV); - ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize); + ASSERT(cache_phase + zio->io_size <= VCBS); mutex_enter(&vc->vc_lock); @@ -283,8 +297,7 @@ vdev_cache_read(zio_t *zio) } fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, - ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ, - ZIO_PRIORITY_CACHE_FILL, + ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, vdev_cache_fill, ve); @@ -309,8 +322,8 @@ vdev_cache_write(zio_t *zio) vdev_cache_entry_t *ve, ve_search; uint64_t io_start = zio->io_offset; uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize); - uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize); + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); avl_index_t where; ASSERT(zio->io_type == ZIO_TYPE_WRITE); @@ -325,7 +338,7 @@ vdev_cache_write(zio_t *zio) while (ve != NULL && ve->ve_offset < max_offset) { uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; @@ -352,8 +365,6 @@ vdev_cache_init(vdev_t *vd) avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, sizeof (vdev_cache_entry_t), offsetof(struct vdev_cache_entry, ve_lastused_node)); - - vc->vc_blocksize = 1ULL << vc->vc_bshift; } void diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index a82abf80b7..b8e79f8c0c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -54,14 +54,6 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); -#ifdef _KERNEL - /* - * When using a file vdev in kernel context, the underlying filesystem - * will already be caching the data. Don't cache it again here. - */ - vd->vdev_cache.vc_size = 0; -#endif - /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the @@ -156,8 +148,14 @@ vdev_file_io_start(zio_t *zio) return; } + /* + * In the kernel, don't bother double-caching, but in userland, + * we want to test the vdev_cache code. + */ +#ifndef _KERNEL if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) return; +#endif if ((zio = vdev_queue_io(zio)) == NULL) return; @@ -186,8 +184,10 @@ vdev_file_io_done(zio_t *zio) { vdev_queue_io_done(zio); +#ifndef _KERNEL if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); +#endif if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 631948bb1b..6b0b2a6f6c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -32,6 +32,33 @@ #include <sys/avl.h> /* + * These tunables are for performance analysis. + */ +/* + * zfs_vdev_max_pending is the maximum number of i/os concurrently + * pending to each device. zfs_vdev_min_pending is the initial number + * of i/os pending to each device (before it starts ramping up to + * max_pending). + */ +int zfs_vdev_max_pending = 35; +int zfs_vdev_min_pending = 4; + +/* maximum scrub/resilver I/O queue */ +int zfs_scrub_limit = 70; + +/* deadline = pri + (lbolt >> time_shift) */ +int zfs_vdev_time_shift = 6; + +/* exponential I/O issue ramp-up rate */ +int zfs_vdev_ramp_rate = 2; + +/* + * i/os will be aggregated into a single large i/o up to + * zfs_vdev_aggregation_limit bytes long. + */ +int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; + +/* * Virtual device vector for disk I/O scheduling. */ int @@ -119,7 +146,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) avl_add(zio->io_vdev_tree, zio); if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && - ++vq->vq_scrub_count >= vq->vq_scrub_limit) + ++vq->vq_scrub_count >= zfs_scrub_limit) spa_scrub_throttle(zio->io_spa, 1); } @@ -127,7 +154,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && - vq->vq_scrub_count-- >= vq->vq_scrub_limit) + vq->vq_scrub_count-- >= zfs_scrub_limit) spa_scrub_throttle(zio->io_spa, -1); avl_remove(&vq->vq_deadline_tree, zio); @@ -182,14 +209,14 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, size = fio->io_size; while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - size + dio->io_size <= vq->vq_agg_limit) { + size + dio->io_size <= zfs_vdev_aggregation_limit) { dio->io_delegate_next = fio; fio = dio; size += dio->io_size; } while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - size + dio->io_size <= vq->vq_agg_limit) { + size + dio->io_size <= zfs_vdev_aggregation_limit) { lio->io_delegate_next = dio; lio = dio; size += dio->io_size; @@ -200,7 +227,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, uint64_t offset = 0; int nagg = 0; - ASSERT(size <= vq->vq_agg_limit); + ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_child_io(fio, NULL, fio->io_vd, fio->io_offset, buf, size, fio->io_type, @@ -266,12 +293,12 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) + + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); mutex_exit(&vq->vq_lock); @@ -294,8 +321,8 @@ vdev_queue_io_done(zio_t *zio) avl_remove(&vq->vq_pending_tree, zio); - for (i = 0; i < vq->vq_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func); + for (i = 0; i < zfs_vdev_ramp_rate; i++) { + nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); if (nio == NULL) break; mutex_exit(&vq->vq_lock); |