Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c             | 210
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c            |  61
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c             |  77
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c      |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_zfetch.c      |  89
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c        |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h         |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dbuf.h        |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h         |   2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h  |   9
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c            |   4
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c     |   5
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c       |   2
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c             |   3
-rw-r--r--  usr/src/uts/common/syscall/rw.c             |  15
15 files changed, 329 insertions(+), 161 deletions(-)
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index baca8e3a12..c1e1ebd7a2 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -139,6 +139,12 @@ typedef enum arc_reclaim_strategy {
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/*
+ * minimum lifespan of a prefetched block in seconds
+ * (this is converted to ticks during the arc initialization)
+ */
+static int arc_min_prefetch_lifespan = 1;
+
static kmutex_t arc_reclaim_lock;
static int arc_dead;
@@ -264,6 +270,7 @@ static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -535,7 +542,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
mutex_enter(&ab->b_state->mtx);
- ASSERT(refcount_count(&ab->b_refcnt) > 0);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(&ab->b_state->list, ab);
if (GHOST_STATE(ab->b_state)) {
@@ -547,6 +553,9 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
ASSERT3U(ab->b_state->lsize, >=, delta);
atomic_add_64(&ab->b_state->lsize, -delta);
mutex_exit(&ab->b_state->mtx);
+ /* remove the prefetch flag if we get a reference */
+ if (ab->b_flags & ARC_PREFETCH)
+ ab->b_flags &= ~ARC_PREFETCH;
}
}
@@ -605,9 +614,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(&old_state->list, ab);
- /* ghost elements have a ghost size */
- if (GHOST_STATE(old_state)) {
- ASSERT(ab->b_datacnt == 0);
+ /*
+ * If prefetching out of the ghost cache,
+ * we will have a non-null datacnt.
+ */
+ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+ /* ghost elements have a ghost size */
ASSERT(ab->b_buf == NULL);
from_delta = ab->b_size;
}
@@ -645,14 +657,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
buf_hash_remove(ab);
}
- /*
- * If this buffer isn't being transferred to the MRU-top
- * state, it's safe to clear its prefetch flag
- */
- if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
- ab->b_flags &= ~ARC_PREFETCH;
- }
-
/* adjust state sizes */
if (to_delta)
atomic_add_64(&new_state->size, to_delta);
@@ -918,8 +922,14 @@ arc_evict(arc_state_t *state, int64_t bytes)
for (ab = list_tail(&state->list); ab; ab = ab_prev) {
ab_prev = list_prev(&state->list, ab);
+ /* prefetch buffers have a minimum lifespan */
+ if (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+ lbolt - ab->b_arc_access < arc_min_prefetch_lifespan) {
+ skipped++;
+ continue;
+ }
hash_lock = HDR_LOCK(ab);
- if (mutex_tryenter(hash_lock)) {
+ if (!HDR_IO_IN_PROGRESS(ab) && mutex_tryenter(hash_lock)) {
ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
ASSERT(ab->b_datacnt > 0);
while (ab->b_buf) {
@@ -991,6 +1001,7 @@ top:
ab_prev = list_prev(&state->list, ab);
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
arc_change_state(arc.anon, ab, hash_lock);
mutex_exit(hash_lock);
@@ -1102,12 +1113,13 @@ arc_flush(void)
ASSERT(arc_eviction_list == NULL);
}
+int arc_kmem_reclaim_shift = 5; /* log2(fraction of arc to reclaim) */
+
void
arc_kmem_reclaim(void)
{
uint64_t to_free;
- /* Remove 12.5% */
/*
* We need arc_reclaim_lock because we don't want multiple
* threads trying to reclaim concurrently.
@@ -1127,16 +1139,16 @@ arc_kmem_reclaim(void)
mutex_enter(&arc_reclaim_lock);
#ifdef _KERNEL
- to_free = MAX(arc.c >> 3, ptob(needfree));
+ to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
#else
- to_free = arc.c >> 3;
+ to_free = arc.c >> arc_kmem_reclaim_shift;
#endif
if (arc.c > to_free)
atomic_add_64(&arc.c, -to_free);
else
arc.c = arc.c_min;
- atomic_add_64(&arc.p, -(arc.p >> 3));
+ atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
if (arc.c > arc.size)
arc.c = arc.size;
if (arc.c < arc.c_min)
@@ -1468,14 +1480,25 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
} else if (buf->b_state == arc.mru) {
/*
- * If this buffer is in the MRU-top state and has the prefetch
- * flag, the first read was actually part of a prefetch. In
- * this situation, we simply want to clear the flag and return.
- * A subsequent access should bump this into the MFU state.
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
*/
if ((buf->b_flags & ARC_PREFETCH) != 0) {
- buf->b_flags &= ~ARC_PREFETCH;
- atomic_add_64(&arc.mru->hits, 1);
+ if (refcount_count(&buf->b_refcnt) == 0) {
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc.mru->mtx);
+ list_remove(&arc.mru->list, buf);
+ list_insert_head(&arc.mru->list, buf);
+ mutex_exit(&arc.mru->mtx);
+ } else {
+ buf->b_flags &= ~ARC_PREFETCH;
+ atomic_add_64(&arc.mru->hits, 1);
+ }
+ buf->b_arc_access = lbolt;
mutex_exit(hash_lock);
return;
}
@@ -1506,7 +1529,8 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
if (buf->b_flags & ARC_PREFETCH) {
new_state = arc.mru;
- buf->b_flags &= ~ARC_PREFETCH;
+ if (refcount_count(&buf->b_refcnt) > 0)
+ buf->b_flags &= ~ARC_PREFETCH;
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
} else {
new_state = arc.mfu;
@@ -1526,26 +1550,45 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
*
- * NOTE: the add_reference() that occurred when we did
- * the arc_read() should have kicked this off the list,
- * so even if it was a prefetch, it will be put back at
- * the head of the list when we remove_reference().
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
*/
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ ASSERT(refcount_count(&buf->b_refcnt) == 0);
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc.mfu->mtx);
+ list_remove(&arc.mfu->list, buf);
+ list_insert_head(&arc.mfu->list, buf);
+ mutex_exit(&arc.mfu->mtx);
+ }
atomic_add_64(&arc.mfu->hits, 1);
+ buf->b_arc_access = lbolt;
} else if (buf->b_state == arc.mfu_ghost) {
+ arc_state_t *new_state = arc.mfu;
/*
* This buffer has been accessed more than once but has
* been evicted from the cache. Move it back to the
* MFU state.
*/
+ if (buf->b_flags & ARC_PREFETCH) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ new_state = arc.mru;
+ }
+
arc_adapt(blksz, arc.mfu_ghost);
if (arc_evict_needed())
- evict_state = arc.mfu;
+ evict_state = new_state;
buf->b_arc_access = lbolt;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(arc.mfu, buf, hash_lock);
+ arc_change_state(new_state, buf, hash_lock);
atomic_add_64(&arc.mfu_ghost->hits, 1);
} else {
@@ -1628,19 +1671,6 @@ arc_read_done(zio_t *zio)
}
acb->acb_buf = abuf;
abuf = NULL;
- } else {
- /*
- * The caller did not provide a callback function.
- * In this case, we should just remove the reference.
- */
- if (HDR_FREED_IN_READ(hdr)) {
- ASSERT3P(hdr->b_state, ==, arc.anon);
- (void) refcount_remove(&hdr->b_refcnt,
- acb->acb_private);
- } else {
- (void) remove_reference(hdr, hash_lock,
- acb->acb_private);
- }
}
}
hdr->b_acb = NULL;
@@ -1658,15 +1688,15 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
- /* translate checksum errors into IO errors */
+ /* convert checksum errors into IO errors */
if (zio->io_error == ECKSUM)
zio->io_error = EIO;
}
/*
- * Broadcast before we drop the hash_lock. This is less efficient,
- * but avoids the possibility that the hdr (and hence the cv) might
- * be freed before we get to the cv_broadcast().
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
*/
cv_broadcast(&hdr->b_cv);
@@ -1731,7 +1761,7 @@ arc_read_done(zio_t *zio)
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags, zbookmark_t *zb)
+ uint32_t *arc_flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
@@ -1742,8 +1772,18 @@ top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
if (hdr && hdr->b_datacnt > 0) {
+ *arc_flags |= ARC_CACHED;
+
if (HDR_IO_IN_PROGRESS(hdr)) {
- if ((arc_flags & ARC_NOWAIT) && done) {
+
+ if (*arc_flags & ARC_WAIT) {
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_NOWAIT);
+
+ if (done) {
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
@@ -1761,10 +1801,6 @@ top:
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
return (0);
- } else if (arc_flags & ARC_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
}
mutex_exit(hash_lock);
return (0);
@@ -1796,6 +1832,9 @@ top:
hdr->b_flags &= ~ARC_BUF_AVAILABLE;
}
add_reference(hdr, hash_lock, private);
+ } else if (*arc_flags & ARC_PREFETCH &&
+ refcount_count(&hdr->b_refcnt) == 0) {
+ hdr->b_flags |= ARC_PREFETCH;
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access_and_exit(hdr, hash_lock);
@@ -1825,15 +1864,26 @@ top:
(void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
-
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ if (BP_GET_LEVEL(bp) > 0)
+ hdr->b_flags |= ARC_INDIRECT;
} else {
/* this block is in the ghost cache */
ASSERT(GHOST_STATE(hdr->b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- add_reference(hdr, hash_lock, private);
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
ASSERT(hdr->b_buf == NULL);
+
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ else
+ add_reference(hdr, hash_lock, private);
buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
buf->b_hdr = hdr;
buf->b_efunc = NULL;
@@ -1844,6 +1894,7 @@ top:
atomic_add_64(&arc.size, hdr->b_size);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
+
}
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1853,13 +1904,6 @@ top:
ASSERT(hdr->b_acb == NULL);
hdr->b_acb = acb;
-
- /*
- * If this DVA is part of a prefetch, mark the buf
- * header with the prefetch flag
- */
- if (arc_flags & ARC_PREFETCH)
- hdr->b_flags |= ARC_PREFETCH;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
/*
@@ -1883,10 +1927,10 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, flags, zb);
- if (arc_flags & ARC_WAIT)
+ if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
- ASSERT(arc_flags & ARC_NOWAIT);
+ ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
return (0);
@@ -2260,22 +2304,33 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
if (ab->b_state != arc.anon)
arc_change_state(arc.anon, ab, hash_lock);
- if (refcount_is_zero(&ab->b_refcnt)) {
+ if (HDR_IO_IN_PROGRESS(ab)) {
+ /*
+ * This should only happen when we prefetch.
+ */
+ ASSERT(ab->b_flags & ARC_PREFETCH);
+ ASSERT3U(ab->b_datacnt, ==, 1);
+ ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ } else if (refcount_is_zero(&ab->b_refcnt)) {
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
atomic_add_64(&arc.deleted, 1);
} else {
/*
- * We could have an outstanding read on this
- * block, so multiple active references are
- * possible. But we should only have a single
- * data buffer associated at this point.
+ * We still have an active reference on this
+ * buffer. This can happen, e.g., from
+ * dbuf_unoverride().
*/
- ASSERT3U(ab->b_datacnt, ==, 1);
- if (HDR_IO_IN_PROGRESS(ab))
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
+ ASSERT(!HDR_IN_HASH_TABLE(ab));
ab->b_arc_access = 0;
bzero(&ab->b_dva, sizeof (dva_t));
ab->b_birth = 0;
@@ -2351,6 +2406,9 @@ arc_init(void)
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+ /* Convert seconds to clock ticks */
+ arc_min_prefetch_lifespan *= hz;
+
/* Start out with 1/8 of all memory */
arc.c = physmem * PAGESIZE / 8;
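The arc.c changes above give prefetched blocks a minimum lifespan in the cache: arc_min_prefetch_lifespan is scaled from seconds to clock ticks in arc_init(), and arc_evict() now skips any ARC_PREFETCH or ARC_INDIRECT header whose last access (b_arc_access, recorded in lbolt ticks) is younger than that. Below is a minimal user-space sketch of that eviction guard; HZ, lbolt and struct buf_hdr are simplified stand-ins for the kernel's clock and header types, not the real definitions.

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's hz and lbolt (ticks since boot). */
#define HZ 100
static int64_t lbolt;                      /* advanced by hand below */

static int arc_min_prefetch_lifespan = 1;  /* seconds; scaled to ticks at init */

#define ARC_PREFETCH  (1 << 3)
#define ARC_INDIRECT  (1 << 14)

struct buf_hdr {
    uint64_t flags;                        /* ARC_PREFETCH, ARC_INDIRECT, ... */
    int64_t  arc_access;                   /* lbolt at last access */
};

/*
 * Mirrors the new check in arc_evict(): prefetched blocks younger than the
 * minimum lifespan are skipped (counted as "skipped") rather than evicted.
 */
static int
evictable(const struct buf_hdr *hdr)
{
    if ((hdr->flags & (ARC_PREFETCH | ARC_INDIRECT)) &&
        lbolt - hdr->arc_access < arc_min_prefetch_lifespan)
        return (0);
    return (1);
}

int
main(void)
{
    struct buf_hdr hdr = { ARC_PREFETCH, 0 };

    arc_min_prefetch_lifespan *= HZ;       /* seconds -> ticks, as in arc_init() */

    lbolt = HZ / 2;                        /* half a second after the prefetch */
    printf("t=0.5s evictable: %d\n", evictable(&hdr));  /* 0: protected */

    lbolt = 2 * HZ;                        /* two seconds after the prefetch */
    printf("t=2.0s evictable: %d\n", evictable(&hdr));  /* 1: fair game */
    return (0);
}
```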
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index edb40bd96d..cb919a4111 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -464,10 +464,11 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
}
static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
blkptr_t *bp;
zbookmark_t zb;
+ uint32_t aflags = ARC_NOWAIT;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
@@ -505,6 +506,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db.db_size, db));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
+ *flags |= DB_RF_CACHED;
mutex_exit(&db->db_mtx);
return;
}
@@ -524,8 +526,10 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_level > 0 ? byteswap_uint64_array :
dmu_ot[db->db_dnode->dn_type].ot_byteswap,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- ARC_NOWAIT, &zb);
+ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+ if (aflags & ARC_CACHED)
+ *flags |= DB_RF_CACHED;
}
int
@@ -533,21 +537,26 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
int havepzio = (zio != NULL);
+ int prefetch;
/*
* We don't have to hold the mutex to check db_state because it
* can't be freed while we have a hold on the buffer.
*/
ASSERT(!refcount_is_zero(&db->db_holds));
- if (db->db_state == DB_CACHED)
- return (0);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&db->db_dnode->dn_struct_rwlock);
} else if (db->db_state == DB_UNCACHED) {
@@ -555,15 +564,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
zio = zio_root(db->db_dnode->dn_objset->os_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
}
- dbuf_read_impl(db, zio, flags);
+ dbuf_read_impl(db, zio, &flags);
+
/* dbuf_read_impl has dropped db_mtx for us */
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 &&
- db->db_dnode != NULL) {
+ if (prefetch)
dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size);
- }
+ db->db.db_size, flags & DB_RF_CACHED);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&db->db_dnode->dn_struct_rwlock);
@@ -571,8 +578,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (!havepzio)
err = zio_wait(zio);
} else {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
while (db->db_state == DB_READ ||
db->db_state == DB_FILL) {
@@ -1444,7 +1457,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
- dmu_buf_impl_t *db, *parent = NULL;
+ dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
ASSERT(blkid != DB_BONUS_BLKID);
@@ -1455,17 +1468,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
/* dbuf_find() returns with db_mtx held */
if (db = dbuf_find(dn, 0, blkid)) {
- /*
- * This dbuf is already in the cache. We assume that
- * it is already CACHED, or else about to be either
- * read or filled.
- */
+ if (refcount_count(&db->db_holds) > 0) {
+ /*
+ * This dbuf is active. We assume that it is
+ * already CACHED, or else about to be either
+ * read or filled.
+ */
+ mutex_exit(&db->db_mtx);
+ return;
+ }
mutex_exit(&db->db_mtx);
- return;
}
- if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
+ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
dn->dn_objset->os_dsl_dataset->ds_object : 0;
@@ -1477,10 +1494,10 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
dmu_ot[dn->dn_type].ot_byteswap,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- (ARC_NOWAIT | ARC_PREFETCH), &zb);
+ &aflags, &zb);
}
- if (parent)
- dbuf_rele(parent, NULL);
+ if (db)
+ dbuf_rele(db, NULL);
}
}
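The dbuf.c changes thread a "was it already cached?" answer from the ARC back into the prefetcher: dbuf_read_impl() now passes arc_read() a pointer to its flags, checks ARC_CACHED on return, and sets DB_RF_CACHED so that dbuf_read() can hand dmu_zfetch() an accurate cached hint even on cache hits. A small sketch of that flag plumbing follows; fake_arc_read() is a hypothetical stand-in for the real arc_read(), and only the flag values are copied from the diff.

```c
#include <stdint.h>
#include <stdio.h>

/* Flag values as defined in the diff (sys/arc.h and sys/dbuf.h). */
#define ARC_NOWAIT    (1 << 2)
#define ARC_CACHED    (1 << 4)
#define DB_RF_CACHED  (1 << 5)

/* Stand-in for arc_read(): reports a cache hit by OR-ing ARC_CACHED
 * into the caller's flags word, following the new in/out convention. */
static void
fake_arc_read(uint32_t *arc_flags, int hit)
{
    if (hit)
        *arc_flags |= ARC_CACHED;
}

/* Models dbuf_read_impl(): translate the ARC's answer into DB_RF_CACHED
 * so the caller can tell the prefetcher whether this was a cache hit. */
static void
read_impl(uint32_t *db_flags, int hit)
{
    uint32_t aflags = ARC_NOWAIT;

    fake_arc_read(&aflags, hit);
    if (aflags & ARC_CACHED)
        *db_flags |= DB_RF_CACHED;
}

int
main(void)
{
    uint32_t flags = 0;

    read_impl(&flags, 1);
    /* dbuf_read() would pass this bit to dmu_zfetch() as "prefetched". */
    printf("cached hint: %d\n", (flags & DB_RF_CACHED) != 0);
    return (0);
}
```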
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 1561d30141..32c61a0645 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -147,11 +147,16 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
return (0);
}
-int
-dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
- dnode_t *dn;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t flags;
@@ -160,21 +165,10 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
ASSERT(length <= DMU_MAX_ACCESS);
- if (length == 0) {
- if (numbufsp)
- *numbufsp = 0;
- *dbpp = NULL;
- return (0);
- }
-
flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
if (length > zfetch_array_rd_sz)
flags |= DB_RF_NOPREFETCH;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
@@ -193,12 +187,11 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
- dnode_rele(dn, FTAG);
zio_nowait(zio);
return (EIO);
}
/* initiate async i/o */
- if (read && db->db_state == DB_UNCACHED) {
+ if (read) {
rw_exit(&dn->dn_struct_rwlock);
(void) dbuf_read(db, zio, flags);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -206,7 +199,6 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
dbp[i] = &db->db;
}
rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
/* wait for async i/o */
err = zio_wait(zio);
@@ -238,6 +230,38 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
return (0);
}
+int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ int err;
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ return (err);
+}
+
void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
@@ -383,6 +407,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_t **dbp;
int numbufs, i;
+ if (size == 0)
+ return;
+
VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp));
@@ -424,6 +451,9 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
int numbufs, i;
int err = 0;
+ if (size == 0)
+ return (0);
+
err = dmu_buf_hold_array(os, object, offset, size,
FALSE, FTAG, &numbufs, &dbp);
if (err)
@@ -620,6 +650,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
int blksz = BP_GET_LSIZE(bp);
if (data == NULL) {
+ uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
zbookmark_t zb;
@@ -630,7 +661,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
(void) arc_read(NULL, spa, bp,
dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- ARC_WAIT, &zb);
+ &aflags, &zb);
if (abuf) {
err = dump_data(ba, type, object, blkid * blksz,
@@ -1511,6 +1542,16 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
* this zio to the parent zio passed in.
*/
cv_wait(&db->db_changed, &db->db_mtx);
+ if (!db->db_data_pending &&
+ db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
+ /*
+ * IO was compressed away
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
ASSERT(db->db_data_pending ||
(db->db_blkptr && db->db_blkptr->blk_birth == txg));
}
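As the new comment in dmu.c says, resolving <os, object> to a dnode on every call is wasteful and contends on the dnode block, so the array-hold logic moves into dmu_buf_hold_array_by_dnode() and gains a _by_bonus entry point that reuses the dnode already attached to a held bonus buffer. A rough, self-contained sketch of that wrapper pattern, using hypothetical names (hold, rele and work_by_handle are illustrative, not DMU functions):

```c
#include <stdio.h>

struct handle { int object; };

static int
hold(int object, struct handle *h)      /* stands in for dnode_hold() */
{
    h->object = object;
    return (0);
}

static void
rele(struct handle *h)                  /* stands in for dnode_rele() */
{
    (void) h;
}

static int
work_by_handle(struct handle *h, int offset)   /* the real work */
{
    printf("object %d, offset %d\n", h->object, offset);
    return (0);
}

static int
work(int object, int offset)            /* <os, object> style entry point */
{
    struct handle h;
    int err = hold(object, &h);

    if (err)
        return (err);
    err = work_by_handle(&h, offset);
    rele(&h);
    return (err);
}

int
main(void)
{
    struct handle bonus = { 42 };       /* a handle we already hold */

    (void) work(42, 0);                 /* lookup + work + release */
    (void) work_by_handle(&bonus, 0);   /* no lookup, as in _by_bonus() */
    return (0);
}
```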
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 3f7128c5f4..97e60fc192 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -142,6 +142,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
osi->os_rootbp = *bp;
osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ uint32_t aflags = ARC_WAIT;
zbookmark_t zb;
zb.zb_objset = ds ? ds->ds_object : 0;
zb.zb_object = 0;
@@ -152,7 +153,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
err = arc_read(NULL, spa, &osi->os_rootbp,
dmu_ot[DMU_OT_OBJSET].ot_byteswap,
arc_bcopy_func, osi->os_phys,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb);
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
kmem_free(osi, sizeof (objset_impl_t));
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
index 8109cc0055..07adcfa039 100644
--- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -43,7 +42,7 @@ uint32_t zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
-uint32_t zfetch_block_cap = 32;
+uint32_t zfetch_block_cap = 256;
/* number of bytes in a array_read at which we stop prefetching (1Mb) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
@@ -52,20 +51,24 @@ static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static void dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
/*
* Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear to a pair of existing prefetch
+ * blocks to be read are part of a co-linear pair of existing prefetch
* streams. If a set is found, coalesce the streams, removing one, and
* configure the prefetch so it looks for a strided access pattern.
*
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N apart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern. So
+ * combine the two sequential streams into a single strided stream.
+ *
* If no co-linear streams are found, return NULL.
*/
static int
@@ -249,9 +252,9 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
}
/* compute fetch size */
- if (blkid + nblks > dn->dn_maxblkid) {
- fetchsz = dn->dn_maxblkid - blkid;
- ASSERT(blkid + fetchsz <= dn->dn_maxblkid);
+ if (blkid + nblks + 1 > dn->dn_maxblkid) {
+ fetchsz = (dn->dn_maxblkid - blkid) + 1;
+ ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
} else {
fetchsz = nblks;
}
@@ -266,10 +269,11 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
* located and returns true, otherwise it returns false
*/
static int
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh)
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
zstream_t *zs;
int64_t diff;
+ int reset = !prefetched;
int rc = 0;
if (zh == NULL)
@@ -287,20 +291,33 @@ top:
for (zs = list_head(&zf->zf_stream); zs;
zs = list_next(&zf->zf_stream, zs)) {
-
+ /*
+ * XXX - should this be an assert?
+ */
if (zs->zst_len == 0) {
/* bogus stream */
continue;
}
- if (zh->zst_offset - zs->zst_offset < zs->zst_len) {
+ /*
+ * We hit this case when we are in a strided prefetch stream:
+ * we will read "len" blocks before "striding".
+ */
+ if (zh->zst_offset >= zs->zst_offset &&
+ zh->zst_offset < zs->zst_offset + zs->zst_len) {
/* already fetched */
- rw_exit(&zf->zf_rwlock);
- return (1);
+ rc = 1;
+ goto out;
}
+ /*
+ * This is the forward sequential read case: we increment
+ * len by one each time we hit here, so we will enter this
+ * case on every read.
+ */
if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
- /* forward sequential access */
+
+ reset = !prefetched && zs->zst_len > 1;
mutex_enter(&zs->zst_lock);
@@ -308,7 +325,6 @@ top:
mutex_exit(&zs->zst_lock);
goto top;
}
-
zs->zst_len += zh->zst_len;
diff = zs->zst_len - zfetch_block_cap;
if (diff > 0) {
@@ -320,9 +336,14 @@ top:
break;
+ /*
+ * Same as above, but reading backwards through the file.
+ */
} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
/* backwards sequential access */
+ reset = !prefetched && zs->zst_len > 1;
+
mutex_enter(&zs->zst_lock);
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
@@ -388,11 +409,33 @@ top:
}
if (zs) {
- rc = 1;
- dmu_zfetch_dofetch(zf, zs);
- mutex_exit(&zs->zst_lock);
- }
+ if (reset) {
+ zstream_t *remove = zs;
+ rc = 0;
+ mutex_exit(&zs->zst_lock);
+ rw_exit(&zf->zf_rwlock);
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ /*
+ * Relocate the stream, in case someone removes
+ * it while we were acquiring the WRITER lock.
+ */
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (zs == remove) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ break;
+ }
+ }
+ } else {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+ }
+out:
rw_exit(&zf->zf_rwlock);
return (rc);
}
@@ -527,7 +570,7 @@ dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
* routines to create, delete, find, or operate upon prefetch streams.
*/
void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
zstream_t zst;
zstream_t *newstream;
@@ -550,7 +593,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
P2ALIGN(offset, blksz)) >> blkshft;
- fetched = dmu_zfetch_find(zf, &zst);
+ fetched = dmu_zfetch_find(zf, &zst, prefetched);
if (!fetched) {
fetched = dmu_zfetch_colinear(zf, &zst);
}
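One detail worth spelling out in dmu_zfetch_fetchsz(): dn_maxblkid names the last valid block inclusively, so the number of blocks remaining from blkid is (maxblkid - blkid) + 1, which is what the corrected clamp returns. The sketch below contrasts the old and new arithmetic on a 10-block object; it models only the clamp shown in the hunk, not the surrounding level checks.

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t
fetchsz_old(uint64_t maxblkid, uint64_t blkid, uint64_t nblks)
{
    if (blkid + nblks > maxblkid)
        return (maxblkid - blkid);          /* silently drops the last block */
    return (nblks);
}

static uint64_t
fetchsz_new(uint64_t maxblkid, uint64_t blkid, uint64_t nblks)
{
    if (blkid + nblks + 1 > maxblkid)
        return ((maxblkid - blkid) + 1);    /* includes block maxblkid */
    return (nblks);
}

int
main(void)
{
    /* A 10-block object (blocks 0..9); prefetch 5 blocks starting at 7. */
    printf("old clamp: %llu blocks\n",
        (unsigned long long)fetchsz_old(9, 7, 5));   /* 2: blocks 7..8 */
    printf("new clamp: %llu blocks\n",
        (unsigned long long)fetchsz_new(9, 7, 5));   /* 3: blocks 7..9 */
    return (0);
}
```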
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 07494dacd4..aef4932ec0 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -34,6 +34,8 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+uint64_t metaslab_aliquot = 512ULL << 10;
+
/*
* ==========================================================================
* Metaslab classes
@@ -146,7 +148,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_aliquot = 2ULL << 20; /* XXX -- tweak me */
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
mg->mg_vd = vd;
metaslab_class_add(mc, mg);
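The metaslab change replaces the fixed 2 MB group aliquot with metaslab_aliquot (512 KB) scaled by the top-level vdev's child count, so wider vdevs receive proportionally more data before the allocator rotates to the next group. A tiny worked example of that arithmetic (the child counts are arbitrary):

```c
#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
    uint64_t metaslab_aliquot = 512ULL << 10;   /* 512 KB, as in the diff */
    int children;

    for (children = 1; children <= 8; children *= 2)
        printf("%d child vdev(s) -> aliquot %llu KB\n", children,
            (unsigned long long)((metaslab_aliquot * MAX(1, children)) >> 10));
    return (0);
}
```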
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 811ac94436..37a91018d3 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -60,6 +60,7 @@ struct arc_buf {
#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+#define ARC_CACHED (1 << 4) /* I/O was already in cache */
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
@@ -74,7 +75,7 @@ int arc_referenced(arc_buf_t *buf);
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags, zbookmark_t *zb);
+ uint32_t *arc_flags, zbookmark_t *zb);
int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 5724f7a324..6fff9edaae 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -47,11 +47,12 @@ extern "C" {
* define flags for dbuf_read
*/
-#define DB_RF_MUST_SUCCEED 0
+#define DB_RF_MUST_SUCCEED (1 << 0)
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
/*
* The state transition diagram for dbufs looks like:
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index d89fb9be25..3d3a79dc4e 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -322,6 +322,8 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
*/
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
index 35466d6874..c94bced933 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -66,7 +65,7 @@ typedef struct zfetch {
void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_rele(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index b2c6567af8..b6dcfca86c 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1760,7 +1760,7 @@ static vdev_knob_t vdev_knob[] = {
"minimum pending I/Os to the disk",
1,
10000,
- 2,
+ 4,
offsetof(struct vdev, vdev_queue.vq_min_pending)
},
{
@@ -1792,7 +1792,7 @@ static vdev_knob_t vdev_knob[] = {
"deadline = pri + (lbolt >> time_shift)",
0,
63,
- 8,
+ 6,
offsetof(struct vdev, vdev_queue.vq_time_shift)
},
{
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 14a6ce7e6e..73d1a83d94 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -51,6 +51,8 @@ typedef struct mirror_map {
mirror_child_t mm_child[1];
} mirror_map_t;
+int vdev_mirror_shift = 21;
+
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
@@ -96,7 +98,8 @@ vdev_mirror_map_alloc(zio_t *zio)
mm->mm_children = c;
mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
- mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c);
+ mm->mm_preferred = mm->mm_replacing ? 0 :
+ (zio->io_offset >> vdev_mirror_shift) % c;
mm->mm_root = B_FALSE;
for (c = 0; c < mm->mm_children; c++) {
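vdev_mirror now chooses the preferred child deterministically from the I/O offset instead of at random: each 2 MB (1 << vdev_mirror_shift) slice of the device maps to one mirror side, so a large sequential read is spread across the sides in long contiguous runs. A minimal model of that selection:

```c
#include <stdint.h>
#include <stdio.h>

static int vdev_mirror_shift = 21;      /* 2 MB regions, as in the diff */

/* Mirrors the new mm_preferred computation for a non-replacing mirror. */
static int
preferred_child(uint64_t io_offset, int children)
{
    return ((int)((io_offset >> vdev_mirror_shift) % children));
}

int
main(void)
{
    uint64_t off;

    /* Walk an 8 MB range in 1 MB steps across a 2-way mirror. */
    for (off = 0; off < (8ULL << 20); off += (1ULL << 20))
        printf("offset %2llu MB -> child %d\n",
            (unsigned long long)(off >> 20), preferred_child(off, 2));
    return (0);
}
```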
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 457f596c7f..e4acfc4998 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -446,7 +446,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n = MIN(zfs_read_chunk_size,
zp->z_phys->zp_size - uio->uio_loffset);
n = MIN(n, cnt);
- error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+ error = dmu_buf_hold_array_by_bonus(zp->z_dbuf,
uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp);
if (error)
goto out;
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 2d7cdef165..92dcf192ff 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -152,6 +152,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
{
blkptr_t blk = *bp;
zbookmark_t zb;
+ uint32_t aflags = ARC_WAIT;
int error;
zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
@@ -163,7 +164,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb);
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
if (error == 0) {
char *data = (*abufpp)->b_data;
diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c
index a7b5ce6b9d..3eb98b50ac 100644
--- a/usr/src/uts/common/syscall/rw.c
+++ b/usr/src/uts/common/syscall/rw.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,9 +51,9 @@
#include <sys/rctl.h>
#include <sys/nbmlock.h>
-#define COPYOUT_MIN_SIZE (1<<17) /* 128K */
+#define COPYOUT_MAX_CACHE (1<<17) /* 128K */
-static size_t copyout_min_size = COPYOUT_MIN_SIZE;
+size_t copyout_max_cached = COPYOUT_MAX_CACHE; /* global so it's patchable */
/*
* read, write, pread, pwrite, readv, and writev syscalls.
@@ -167,7 +166,7 @@ read(int fdes, void *cbuf, size_t count)
/*
* Only use bypass caches when the count is large enough
*/
- if (bcount < copyout_min_size)
+ if (bcount <= copyout_max_cached)
auio.uio_extflg = UIO_COPY_CACHED;
else
auio.uio_extflg = UIO_COPY_DEFAULT;
@@ -723,7 +722,7 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
auio.uio_segflg = UIO_USERSPACE;
auio.uio_llimit = MAXOFFSET_T;
auio.uio_fmode = fflag;
- if (bcount < copyout_min_size)
+ if (bcount <= copyout_max_cached)
auio.uio_extflg = UIO_COPY_CACHED;
else
auio.uio_extflg = UIO_COPY_DEFAULT;
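The rw.c change renames the threshold and flips its sense: reads of up to copyout_max_cached bytes (128 KB) keep using ordinary cached copyout, and only larger requests fall back to UIO_COPY_DEFAULT, which may bypass the CPU caches; making the variable a non-static global also leaves it patchable in a live kernel. A short sketch of the decision:

```c
#include <stddef.h>
#include <stdio.h>

size_t copyout_max_cached = 1 << 17;    /* 128K, as in the diff */

/* Models the uio_extflg choice in read()/readv(). */
static const char *
copy_policy(size_t bcount)
{
    return (bcount <= copyout_max_cached ?
        "UIO_COPY_CACHED" : "UIO_COPY_DEFAULT");
}

int
main(void)
{
    printf("64K read  -> %s\n", copy_policy(64 * 1024));
    printf("128K read -> %s\n", copy_policy(128 * 1024));
    printf("256K read -> %s\n", copy_policy(256 * 1024));
    return (0);
}
```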