Diffstat (limited to 'usr/src')
-rw-r--r--   usr/src/uts/common/fs/zfs/arc.c            | 210
-rw-r--r--   usr/src/uts/common/fs/zfs/dbuf.c           |  61
-rw-r--r--   usr/src/uts/common/fs/zfs/dmu.c            |  77
-rw-r--r--   usr/src/uts/common/fs/zfs/dmu_objset.c     |   3
-rw-r--r--   usr/src/uts/common/fs/zfs/dmu_zfetch.c     |  89
-rw-r--r--   usr/src/uts/common/fs/zfs/metaslab.c       |   4
-rw-r--r--   usr/src/uts/common/fs/zfs/sys/arc.h        |   3
-rw-r--r--   usr/src/uts/common/fs/zfs/sys/dbuf.h       |   3
-rw-r--r--   usr/src/uts/common/fs/zfs/sys/dmu.h        |   2
-rw-r--r--   usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h |   9
-rw-r--r--   usr/src/uts/common/fs/zfs/vdev.c           |   4
-rw-r--r--   usr/src/uts/common/fs/zfs/vdev_mirror.c    |   5
-rw-r--r--   usr/src/uts/common/fs/zfs/zfs_vnops.c      |   2
-rw-r--r--   usr/src/uts/common/fs/zfs/zil.c            |   3
-rw-r--r--   usr/src/uts/common/syscall/rw.c            |  15
15 files changed, 329 insertions, 161 deletions
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index baca8e3a12..c1e1ebd7a2 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -139,6 +139,12 @@ typedef enum arc_reclaim_strategy {
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
+/*
+ * minimum lifespan of a prefetched block in seconds
+ * (this is converted to ticks during the arc initialization)
+ */
+static int		arc_min_prefetch_lifespan = 1;
+
 static kmutex_t arc_reclaim_lock;
 static int arc_dead;
@@ -264,6 +270,7 @@ static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
 #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
+#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -535,7 +542,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 
 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
 		mutex_enter(&ab->b_state->mtx);
-		ASSERT(refcount_count(&ab->b_refcnt) > 0);
 		ASSERT(list_link_active(&ab->b_arc_node));
 		list_remove(&ab->b_state->list, ab);
 		if (GHOST_STATE(ab->b_state)) {
@@ -547,6 +553,9 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 		ASSERT3U(ab->b_state->lsize, >=, delta);
 		atomic_add_64(&ab->b_state->lsize, -delta);
 		mutex_exit(&ab->b_state->mtx);
+		/* remove the prefetch flag if we get a reference */
+		if (ab->b_flags & ARC_PREFETCH)
+			ab->b_flags &= ~ARC_PREFETCH;
 	}
 }
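Reviewer note: the add_reference() hunk above strips ARC_PREFETCH the moment a demand hold arrives, so a speculatively-read block is reclassified the first time anyone actually uses it. A minimal sketch of that idea, using simplified stand-in types rather than the kernel's arc_buf_hdr_t:

#include <stdint.h>

#define ARC_PREFETCH	(1 << 3)	/* same bit as in sys/arc.h */

typedef struct buf_hdr {
	uint32_t	flags;
	int		refcnt;
} buf_hdr_t;

static void
add_ref(buf_hdr_t *hdr)
{
	hdr->refcnt++;
	/* a demand hold converts a speculative block into a demand block */
	hdr->flags &= ~ARC_PREFETCH;
}

From here on, the block competes for cache space like any other referenced buffer instead of being treated as expendable prefetch data.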
@@ -605,9 +614,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 		ASSERT(list_link_active(&ab->b_arc_node));
 		list_remove(&old_state->list, ab);
 
-		/* ghost elements have a ghost size */
-		if (GHOST_STATE(old_state)) {
-			ASSERT(ab->b_datacnt == 0);
+		/*
+		 * If prefetching out of the ghost cache,
+		 * we will have a non-null datacnt.
+		 */
+		if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+			/* ghost elements have a ghost size */
 			ASSERT(ab->b_buf == NULL);
 			from_delta = ab->b_size;
 		}
@@ -645,14 +657,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 		buf_hash_remove(ab);
 	}
 
-	/*
-	 * If this buffer isn't being transferred to the MRU-top
-	 * state, it's safe to clear its prefetch flag
-	 */
-	if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
-		ab->b_flags &= ~ARC_PREFETCH;
-	}
-
 	/* adjust state sizes */
 	if (to_delta)
 		atomic_add_64(&new_state->size, to_delta);
@@ -918,8 +922,14 @@ arc_evict(arc_state_t *state, int64_t bytes)
 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
 		ab_prev = list_prev(&state->list, ab);
+		/* prefetch buffers have a minimum lifespan */
+		if (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan) {
+			skipped++;
+			continue;
+		}
 		hash_lock = HDR_LOCK(ab);
-		if (mutex_tryenter(hash_lock)) {
+		if (!HDR_IO_IN_PROGRESS(ab) && mutex_tryenter(hash_lock)) {
 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
 			ASSERT(ab->b_datacnt > 0);
 			while (ab->b_buf) {
@@ -991,6 +1001,7 @@ top:
 		ab_prev = list_prev(&state->list, ab);
 		hash_lock = HDR_LOCK(ab);
 		if (mutex_tryenter(hash_lock)) {
+			ASSERT(!HDR_IO_IN_PROGRESS(ab));
 			ASSERT(ab->b_buf == NULL);
 			arc_change_state(arc.anon, ab, hash_lock);
 			mutex_exit(hash_lock);
@@ -1102,12 +1113,13 @@ arc_flush(void)
 	ASSERT(arc_eviction_list == NULL);
 }
 
+int arc_kmem_reclaim_shift = 5;		/* log2(fraction of arc to reclaim) */
+
 void
 arc_kmem_reclaim(void)
 {
 	uint64_t to_free;
 
-	/* Remove 12.5% */
 	/*
 	 * We need arc_reclaim_lock because we don't want multiple
 	 * threads trying to reclaim concurrently.
@@ -1127,16 +1139,16 @@ arc_kmem_reclaim(void)
 	mutex_enter(&arc_reclaim_lock);
 
 #ifdef _KERNEL
-	to_free = MAX(arc.c >> 3, ptob(needfree));
+	to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree));
#else
-	to_free = arc.c >> 3;
+	to_free = arc.c >> arc_kmem_reclaim_shift;
#endif
 	if (arc.c > to_free)
 		atomic_add_64(&arc.c, -to_free);
 	else
 		arc.c = arc.c_min;
 
-	atomic_add_64(&arc.p, -(arc.p >> 3));
+	atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift));
 	if (arc.c > arc.size)
 		arc.c = arc.size;
 	if (arc.c < arc.c_min)
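Reviewer note: the arc_evict() hunk gives speculative blocks a grace period. A prefetched (or indirect) buffer younger than arc_min_prefetch_lifespan ticks is skipped by the eviction scan, so prefetched data survives long enough for the demand read to arrive. A hedged sketch of the test, with stand-in names (ticks_now for the kernel's lbolt clock):

#include <stdint.h>

#define F_PREFETCH	(1 << 3)	/* stand-in for ARC_PREFETCH */

static int min_prefetch_lifespan;	/* in ticks; set to seconds * hz */

static int
evictable(uint32_t flags, int64_t last_access, int64_t ticks_now)
{
	/* young speculative buffers have not had a chance to be used yet */
	if ((flags & F_PREFETCH) &&
	    ticks_now - last_access < min_prefetch_lifespan)
		return (0);
	return (1);
}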
@@ -1468,14 +1480,25 @@
 	} else if (buf->b_state == arc.mru) {
 		/*
-		 * If this buffer is in the MRU-top state and has the prefetch
-		 * flag, the first read was actually part of a prefetch.  In
-		 * this situation, we simply want to clear the flag and return.
-		 * A subsequent access should bump this into the MFU state.
+		 * If this buffer is here because of a prefetch, then either:
+		 * - clear the flag if this is a "referencing" read
+		 *   (any subsequent access will bump this into the MFU state).
+		 * or
+		 * - move the buffer to the head of the list if this is
+		 *   another prefetch (to make it less likely to be evicted).
 		 */
 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			buf->b_flags &= ~ARC_PREFETCH;
-			atomic_add_64(&arc.mru->hits, 1);
+			if (refcount_count(&buf->b_refcnt) == 0) {
+				ASSERT(list_link_active(&buf->b_arc_node));
+				mutex_enter(&arc.mru->mtx);
+				list_remove(&arc.mru->list, buf);
+				list_insert_head(&arc.mru->list, buf);
+				mutex_exit(&arc.mru->mtx);
+			} else {
+				buf->b_flags &= ~ARC_PREFETCH;
+				atomic_add_64(&arc.mru->hits, 1);
+			}
+			buf->b_arc_access = lbolt;
 			mutex_exit(hash_lock);
 			return;
 		}
@@ -1506,7 +1529,8 @@
 
 		if (buf->b_flags & ARC_PREFETCH) {
 			new_state = arc.mru;
-			buf->b_flags &= ~ARC_PREFETCH;
+			if (refcount_count(&buf->b_refcnt) > 0)
+				buf->b_flags &= ~ARC_PREFETCH;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
 		} else {
 			new_state = arc.mfu;
@@ -1526,26 +1550,45 @@
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
 		 *
-		 * NOTE: the add_reference() that occurred when we did
-		 * the arc_read() should have kicked this off the list,
-		 * so even if it was a prefetch, it will be put back at
-		 * the head of the list when we remove_reference().
+		 * NOTE: an add_reference() that occurred when we did
+		 * the arc_read() will have kicked this off the list.
+		 * If it was a prefetch, we will explicitly move it to
+		 * the head of the list now.
 		 */
+		if ((buf->b_flags & ARC_PREFETCH) != 0) {
+			ASSERT(refcount_count(&buf->b_refcnt) == 0);
+			ASSERT(list_link_active(&buf->b_arc_node));
+			mutex_enter(&arc.mfu->mtx);
+			list_remove(&arc.mfu->list, buf);
+			list_insert_head(&arc.mfu->list, buf);
+			mutex_exit(&arc.mfu->mtx);
+		}
 		atomic_add_64(&arc.mfu->hits, 1);
+		buf->b_arc_access = lbolt;
 	} else if (buf->b_state == arc.mfu_ghost) {
+		arc_state_t *new_state = arc.mfu;
+
 		/*
 		 * This buffer has been accessed more than once but has
 		 * been evicted from the cache.  Move it back to the
 		 * MFU state.
 		 */
+		if (buf->b_flags & ARC_PREFETCH) {
+			/*
+			 * This is a prefetch access...
+			 * move this block back to the MRU state.
+			 */
+			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+			new_state = arc.mru;
+		}
+
 		arc_adapt(blksz, arc.mfu_ghost);
 		if (arc_evict_needed())
-			evict_state = arc.mfu;
+			evict_state = new_state;
 
 		buf->b_arc_access = lbolt;
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		arc_change_state(arc.mfu, buf, hash_lock);
+		arc_change_state(new_state, buf, hash_lock);
 
 		atomic_add_64(&arc.mfu_ghost->hits, 1);
 	} else {
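Reviewer note: when a prefetch hits a block that is already resident with no holds, the hunks above do not promote it toward MFU; they only refresh its position at the head of its current list. A hedged fragment of that "refresh without promotion" move, on a stand-in circular doubly-linked list (the kernel uses list_t plus the per-state mutex):

typedef struct node {
	struct node *prev, *next;
} node_t;

typedef struct lru {
	node_t head;	/* head.next is the most-recently-used end */
} lru_t;

static void
refresh_position(lru_t *lru, node_t *n)
{
	/* unlink from current position */
	n->prev->next = n->next;
	n->next->prev = n->prev;
	/* reinsert at the head: now least likely to be evicted */
	n->next = lru->head.next;
	n->prev = &lru->head;
	n->next->prev = n;
	lru->head.next = n;
}

The design point is that repeated prefetches should not be able to inflate a block's apparent popularity; only demand reads earn promotion.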
@@ -1628,19 +1671,6 @@ arc_read_done(zio_t *zio)
 			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
-		} else {
-			/*
-			 * The caller did not provide a callback function.
-			 * In this case, we should just remove the reference.
-			 */
-			if (HDR_FREED_IN_READ(hdr)) {
-				ASSERT3P(hdr->b_state, ==, arc.anon);
-				(void) refcount_remove(&hdr->b_refcnt,
-				    acb->acb_private);
-			} else {
-				(void) remove_reference(hdr, hash_lock,
-				    acb->acb_private);
-			}
 		}
 	}
 	hdr->b_acb = NULL;
@@ -1658,15 +1688,15 @@ arc_read_done(zio_t *zio)
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = refcount_is_zero(&hdr->b_refcnt);
-		/* translate checksum errors into IO errors */
+		/* convert checksum errors into IO errors */
 		if (zio->io_error == ECKSUM)
 			zio->io_error = EIO;
 	}
 
 	/*
-	 * Broadcast before we drop the hash_lock.  This is less efficient,
-	 * but avoids the possibility that the hdr (and hence the cv) might
-	 * be freed before we get to the cv_broadcast().
+	 * Broadcast before we drop the hash_lock to avoid the possibility
+	 * that the hdr (and hence the cv) might be freed before we get to
+	 * the cv_broadcast().
 	 */
 	cv_broadcast(&hdr->b_cv);
 
@@ -1731,7 +1761,7 @@
 int
 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags, zbookmark_t *zb)
+    uint32_t *arc_flags, zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
@@ -1742,8 +1772,18 @@ top:
 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 	if (hdr && hdr->b_datacnt > 0) {
 
+		*arc_flags |= ARC_CACHED;
+
 		if (HDR_IO_IN_PROGRESS(hdr)) {
-			if ((arc_flags & ARC_NOWAIT) && done) {
+
+			if (*arc_flags & ARC_WAIT) {
+				cv_wait(&hdr->b_cv, hash_lock);
+				mutex_exit(hash_lock);
+				goto top;
+			}
+			ASSERT(*arc_flags & ARC_NOWAIT);
+
+			if (done) {
 				arc_callback_t *acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
@@ -1761,10 +1801,6 @@ top:
 				add_reference(hdr, hash_lock, private);
 				mutex_exit(hash_lock);
 				return (0);
-			} else if (arc_flags & ARC_WAIT) {
-				cv_wait(&hdr->b_cv, hash_lock);
-				mutex_exit(hash_lock);
-				goto top;
 			}
 			mutex_exit(hash_lock);
 			return (0);
@@ -1796,6 +1832,9 @@ top:
 				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
 			}
 			add_reference(hdr, hash_lock, private);
+		} else if (*arc_flags & ARC_PREFETCH &&
+		    refcount_count(&hdr->b_refcnt) == 0) {
+			hdr->b_flags |= ARC_PREFETCH;
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access_and_exit(hdr, hash_lock);
@@ -1825,15 +1864,26 @@ top:
 				(void) arc_buf_remove_ref(buf, private);
 				goto top; /* restart the IO request */
 			}
-
+			/* if this is a prefetch, we don't have a reference */
+			if (*arc_flags & ARC_PREFETCH) {
+				(void) remove_reference(hdr, hash_lock,
+				    private);
+				hdr->b_flags |= ARC_PREFETCH;
+			}
+			if (BP_GET_LEVEL(bp) > 0)
+				hdr->b_flags |= ARC_INDIRECT;
 		} else {
 			/* this block is in the ghost cache */
 			ASSERT(GHOST_STATE(hdr->b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-			add_reference(hdr, hash_lock, private);
-			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-
+			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
 			ASSERT(hdr->b_buf == NULL);
+
+			/* if this is a prefetch, we don't have a reference */
+			if (*arc_flags & ARC_PREFETCH)
+				hdr->b_flags |= ARC_PREFETCH;
+			else
+				add_reference(hdr, hash_lock, private);
 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
 			buf->b_hdr = hdr;
 			buf->b_efunc = NULL;
@@ -1844,6 +1894,7 @@ top:
 			atomic_add_64(&arc.size, hdr->b_size);
 			ASSERT(hdr->b_datacnt == 0);
 			hdr->b_datacnt = 1;
+
 		}
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1853,13 +1904,6 @@ top:
 
 		ASSERT(hdr->b_acb == NULL);
 		hdr->b_acb = acb;
-
-		/*
-		 * If this DVA is part of a prefetch, mark the buf
-		 * header with the prefetch flag
-		 */
-		if (arc_flags & ARC_PREFETCH)
-			hdr->b_flags |= ARC_PREFETCH;
 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
 
 		/*
@@ -1883,10 +1927,10 @@ top:
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, flags, zb);
 
-		if (arc_flags & ARC_WAIT)
+		if (*arc_flags & ARC_WAIT)
 			return (zio_wait(rzio));
 
-		ASSERT(arc_flags & ARC_NOWAIT);
+		ASSERT(*arc_flags & ARC_NOWAIT);
 		zio_nowait(rzio);
 	}
 	return (0);
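Reviewer note: arc_read() now takes arc_flags by pointer, so the flags word carries information both ways. Callers pass ARC_WAIT/ARC_NOWAIT/ARC_PREFETCH in; the ARC sets ARC_CACHED on the way out when the block was already resident. A minimal sketch of the in/out flag convention, with stand-in names rather than the kernel interfaces:

#include <stdint.h>

#define F_WAIT		(1 << 1)	/* in: block until the read completes */
#define F_PREFETCH	(1 << 3)	/* in: speculative read */
#define F_CACHED	(1 << 4)	/* out: data was already resident */

static int
cache_read(uint32_t *flags, int resident)
{
	if (resident)
		*flags |= F_CACHED;	/* report a hit back to the caller */
	/* ...issue or skip the physical read based on *flags... */
	return (0);
}

A caller can thus tell whether a logically "new" read was actually a cache hit; dbuf_read_impl() uses exactly this to set DB_RF_CACHED below.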
@@ -2260,22 +2304,33 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
 		if (ab->b_state != arc.anon)
 			arc_change_state(arc.anon, ab, hash_lock);
-		if (refcount_is_zero(&ab->b_refcnt)) {
+		if (HDR_IO_IN_PROGRESS(ab)) {
+			/*
+			 * This should only happen when we prefetch.
+			 */
+			ASSERT(ab->b_flags & ARC_PREFETCH);
+			ASSERT3U(ab->b_datacnt, ==, 1);
+			ab->b_flags |= ARC_FREED_IN_READ;
+			if (HDR_IN_HASH_TABLE(ab))
+				buf_hash_remove(ab);
+			ab->b_arc_access = 0;
+			bzero(&ab->b_dva, sizeof (dva_t));
+			ab->b_birth = 0;
+			ab->b_cksum0 = 0;
+			ab->b_buf->b_efunc = NULL;
+			ab->b_buf->b_private = NULL;
+			mutex_exit(hash_lock);
+		} else if (refcount_is_zero(&ab->b_refcnt)) {
 			mutex_exit(hash_lock);
 			arc_hdr_destroy(ab);
 			atomic_add_64(&arc.deleted, 1);
 		} else {
 			/*
-			 * We could have an outstanding read on this
-			 * block, so multiple active references are
-			 * possible.  But we should only have a single
-			 * data buffer associated at this point.
+			 * We still have an active reference on this
+			 * buffer.  This can happen, e.g., from
+			 * dbuf_unoverride().
 			 */
-			ASSERT3U(ab->b_datacnt, ==, 1);
-			if (HDR_IO_IN_PROGRESS(ab))
-				ab->b_flags |= ARC_FREED_IN_READ;
-			if (HDR_IN_HASH_TABLE(ab))
-				buf_hash_remove(ab);
+			ASSERT(!HDR_IN_HASH_TABLE(ab));
 			ab->b_arc_access = 0;
 			bzero(&ab->b_dva, sizeof (dva_t));
 			ab->b_birth = 0;
@@ -2351,6 +2406,9 @@ arc_init(void)
 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
 
+	/* Convert seconds to clock ticks */
+	arc_min_prefetch_lifespan *= hz;
+
 	/* Start out with 1/8 of all memory */
 	arc.c = physmem * PAGESIZE / 8;
 
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index edb40bd96d..cb919a4111 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -464,10 +464,11 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 }
 
 static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
 	blkptr_t *bp;
 	zbookmark_t zb;
+	uint32_t aflags = ARC_NOWAIT;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
@@ -505,6 +506,7 @@
 		    db->db.db_size, db));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
+		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
@@ -524,8 +526,10 @@
 	    db->db_level > 0 ? byteswap_uint64_array :
 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
-	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
-	    ARC_NOWAIT, &zb);
+	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    &aflags, &zb);
+	if (aflags & ARC_CACHED)
+		*flags |= DB_RF_CACHED;
 }
 
 int
@@ -533,21 +537,26 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	int havepzio = (zio != NULL);
+	int prefetch;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
-	if (db->db_state == DB_CACHED)
-		return (0);
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
 
+	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&db->db_dnode->dn_struct_rwlock);
 	} else if (db->db_state == DB_UNCACHED) {
@@ -555,15 +564,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 			zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL,
 			    NULL, ZIO_FLAG_CANFAIL);
 		}
-		dbuf_read_impl(db, zio, flags);
+		dbuf_read_impl(db, zio, &flags);
+
 		/* dbuf_read_impl has dropped db_mtx for us */
 
-		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
-		    (flags & DB_RF_NOPREFETCH) == 0 &&
-		    db->db_dnode != NULL) {
+		if (prefetch)
 			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-			    db->db.db_size);
-		}
+			    db->db.db_size, flags & DB_RF_CACHED);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&db->db_dnode->dn_struct_rwlock);
@@ -571,8 +578,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
+		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
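Reviewer note: dbuf_read() now feeds the prefetcher on every qualifying access, including cache hits, passing a hint that says whether the data was already cached; the zfetch code uses that hint to decide whether an existing stream should keep growing or be reset. A hedged sketch of the gating rule, with stand-in types and a stub in place of the real dmu_zfetch():

#include <stdint.h>

typedef struct zfetch_stub { uint64_t last_off; } zfetch_stub_t;

static void
zfetch_stub(zfetch_stub_t *zf, uint64_t off, uint64_t len, int cached)
{
	/* the real dmu_zfetch() matches (off, len) against its streams */
	(void) len; (void) cached;
	zf->last_off = off;
}

static void
feed_prefetcher(zfetch_stub_t *zf, uint64_t off, uint64_t len,
    int level, int is_bonus, int nopf, int cached)
{
	/* only level-0, non-bonus, prefetch-enabled reads train the prefetcher */
	if (level == 0 && !is_bonus && !nopf)
		zfetch_stub(zf, off, len, cached);
}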
@@ -1444,7 +1457,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 void
 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 {
-	dmu_buf_impl_t *db, *parent = NULL;
+	dmu_buf_impl_t *db = NULL;
 	blkptr_t *bp = NULL;
 
 	ASSERT(blkid != DB_BONUS_BLKID);
@@ -1455,17 +1468,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 
 	/* dbuf_find() returns with db_mtx held */
 	if (db = dbuf_find(dn, 0, blkid)) {
-		/*
-		 * This dbuf is already in the cache.  We assume that
-		 * it is already CACHED, or else about to be either
-		 * read or filled.
-		 */
+		if (refcount_count(&db->db_holds) > 0) {
+			/*
+			 * This dbuf is active.  We assume that it is
+			 * already CACHED, or else about to be either
+			 * read or filled.
+			 */
+			mutex_exit(&db->db_mtx);
+			return;
+		}
 		mutex_exit(&db->db_mtx);
-		return;
 	}
 
-	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
+			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
 
 			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
@@ -1477,10 +1494,10 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 			    dmu_ot[dn->dn_type].ot_byteswap, NULL, NULL,
 			    ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    (ARC_NOWAIT | ARC_PREFETCH), &zb);
+			    &aflags, &zb);
 		}
-		if (parent)
-			dbuf_rele(parent, NULL);
+		if (db)
+			dbuf_rele(db, NULL);
 	}
 }
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 1561d30141..32c61a0645 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -147,11 +147,16 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 	return (0);
 }
 
-int
-dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
-	dnode_t *dn;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t flags;
@@ -160,21 +165,10 @@
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
-	if (length == 0) {
-		if (numbufsp)
-			*numbufsp = 0;
-		*dbpp = NULL;
-		return (0);
-	}
-
 	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
 	if (length > zfetch_array_rd_sz)
 		flags |= DB_RF_NOPREFETCH;
 
-	err = dnode_hold(os->os, object, FTAG, &dn);
-	if (err)
-		return (err);
-
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
@@ -193,12 +187,11 @@
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
-			dnode_rele(dn, FTAG);
 			zio_nowait(zio);
 			return (EIO);
 		}
 		/* initiate async i/o */
-		if (read && db->db_state == DB_UNCACHED) {
+		if (read) {
 			rw_exit(&dn->dn_struct_rwlock);
 			(void) dbuf_read(db, zio, flags);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -206,7 +199,6 @@
 		dbp[i] = &db->db;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
-	dnode_rele(dn, FTAG);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
@@ -238,6 +230,38 @@
 	return (0);
 }
 
+int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+	    numbufsp, dbpp);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	int err;
+
+	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+	    numbufsp, dbpp);
+
+	return (err);
+}
+
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 {
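Reviewer note: the refactor above splits the old dmu_buf_hold_array() into a core routine that works on an already-held dnode plus thin wrappers. dmu_buf_hold_array() keeps the <objset, object> interface by doing the dnode_hold()/dnode_rele() itself; dmu_buf_hold_array_by_bonus() reuses the dnode already attached to a held bonus buffer and skips the lookup entirely. A hedged sketch of that shape, with stand-in types and stub hold/release functions:

typedef struct dnode { int holds; } dnode_t;

static int
dnode_hold_stub(void *os, unsigned long long obj, dnode_t **dnp)
{
	/* stand-in for the real object-to-dnode lookup */
	static dnode_t dn;
	(void) os; (void) obj;
	dn.holds++;
	*dnp = &dn;
	return (0);
}

static void
dnode_rele_stub(dnode_t *dn)
{
	dn->holds--;
}

static int
core_by_dnode(dnode_t *dn)
{
	/* ...the real locking and buffer work happens here... */
	(void) dn;
	return (0);
}

int
by_lookup(void *os, unsigned long long obj)	/* old-style <os, object> API */
{
	dnode_t *dn;
	int err;

	if ((err = dnode_hold_stub(os, obj, &dn)) != 0)
		return (err);
	err = core_by_dnode(dn);
	dnode_rele_stub(dn);		/* drop the hold we took */
	return (err);
}

int
by_existing_hold(dnode_t *dn)	/* caller already holds dn, e.g. via a bonus buffer */
{
	return (core_by_dnode(dn));
}

zfs_read() switches to the by-bonus variant later in this patch, avoiding one dnode lookup per read and the lock contention that lookup can cause.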
@@ -383,6 +407,9 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_buf_t **dbp;
 	int numbufs, i;
 
+	if (size == 0)
+		return;
+
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
@@ -424,6 +451,9 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	int numbufs, i;
 	int err = 0;
 
+	if (size == 0)
+		return (0);
+
 	err = dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp);
 	if (err)
@@ -620,6 +650,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
 		int blksz = BP_GET_LSIZE(bp);
 		if (data == NULL) {
+			uint32_t aflags = ARC_WAIT;
 			arc_buf_t *abuf;
 			zbookmark_t zb;
 
@@ -630,7 +661,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
 			(void) arc_read(NULL, spa, bp,
 			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
 			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
-			    ARC_WAIT, &zb);
+			    &aflags, &zb);
 
 			if (abuf) {
 				err = dump_data(ba, type, object, blkid * blksz,
@@ -1511,6 +1542,16 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
 		 * this zio to the parent zio passed in.
 		 */
 		cv_wait(&db->db_changed, &db->db_mtx);
+		if (!db->db_data_pending &&
+		    db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
+			/*
+			 * IO was compressed away
+			 */
+			*bp = *db->db_blkptr; /* structure assignment */
+			mutex_exit(&db->db_mtx);
+			txg_resume(dp);
+			return (0);
+		}
 		ASSERT(db->db_data_pending ||
 		    (db->db_blkptr && db->db_blkptr->blk_birth == txg));
 	}
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 3f7128c5f4..97e60fc192 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -142,6 +142,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 	osi->os_rootbp = *bp;
 	osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
 	if (!BP_IS_HOLE(&osi->os_rootbp)) {
+		uint32_t aflags = ARC_WAIT;
 		zbookmark_t zb;
 		zb.zb_objset = ds ? ds->ds_object : 0;
 		zb.zb_object = 0;
@@ -152,7 +153,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 		err = arc_read(NULL, spa, &osi->os_rootbp,
 		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
 		    arc_bcopy_func, osi->os_phys,
-		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb);
+		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err) {
 			zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
 			kmem_free(osi, sizeof (objset_impl_t));
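Reviewer note: the dmu_sync() hunk handles a write that was "compressed away": after waiting for the sync, there is no pending data and the block pointer is a hole (e.g., a block of zeros under compression never allocates on disk), so the hole can be returned directly. A hedged sketch of the check, with stand-in types and a simplified (but conventional) hole test on blk_birth:

typedef struct blkptr {
	unsigned long long blk_birth;
	/* ...DVAs, checksum, etc. elided... */
} blkptr_t;

#define BP_IS_HOLE(bp)	((bp)->blk_birth == 0)	/* simplified definition */

static int
sync_result(blkptr_t *out, const blkptr_t *committed, int data_pending)
{
	if (!data_pending && committed != NULL && BP_IS_HOLE(committed)) {
		*out = *committed;	/* structure assignment, as in the patch */
		return (0);		/* nothing on disk to point the log at */
	}
	return (-1);	/* caller continues with the normal path */
}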
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
index 8109cc0055..07adcfa039 100644
--- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -43,7 +42,7 @@ uint32_t zfetch_max_streams = 8;
 /* min time before stream reclaim */
 uint32_t	zfetch_min_sec_reap = 2;
 /* max number of blocks to fetch at a time */
-uint32_t	zfetch_block_cap = 32;
+uint32_t	zfetch_block_cap = 256;
 /* number of bytes in an array_read at which we stop prefetching (1Mb) */
 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
 
@@ -52,20 +51,24 @@ static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
 static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
 static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
 static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int		dmu_zfetch_find(zfetch_t *, zstream_t *);
+static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
 static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
 static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
 static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
 static void		dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
 static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
 /*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N apart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern.  So
+ * combine the two sequential streams into a single strided stream.
+ *
 * If no co-linear streams are found, return NULL.
 */
 static int
@@ -249,9 +252,9 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 	}
 
 	/* compute fetch size */
-	if (blkid + nblks > dn->dn_maxblkid) {
-		fetchsz = dn->dn_maxblkid - blkid;
-		ASSERT(blkid + fetchsz <= dn->dn_maxblkid);
+	if (blkid + nblks + 1 > dn->dn_maxblkid) {
+		fetchsz = (dn->dn_maxblkid - blkid) + 1;
+		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
 	} else {
 		fetchsz = nblks;
 	}
@@ -266,10 +269,11 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 * located and returns true, otherwise it returns false
 */
 static int
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh)
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
 {
 	zstream_t *zs;
 	int64_t diff;
+	int reset = !prefetched;
 	int rc = 0;
 
 	if (zh == NULL)
@@ -287,20 +291,33 @@ top:
 	for (zs = list_head(&zf->zf_stream); zs;
 	    zs = list_next(&zf->zf_stream, zs)) {
-
+		/*
+		 * XXX - should this be an assert?
+		 */
 		if (zs->zst_len == 0) {
 			/* bogus stream */
 			continue;
 		}
 
-		if (zh->zst_offset - zs->zst_offset < zs->zst_len) {
+		/*
+		 * We hit this case when we are in a strided prefetch stream:
+		 * we will read "len" blocks before "striding".
+		 */
+		if (zh->zst_offset >= zs->zst_offset &&
+		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
 			/* already fetched */
-			rw_exit(&zf->zf_rwlock);
-			return (1);
+			rc = 1;
+			goto out;
 		}
 
+		/*
+		 * This is the forward sequential read case: we increment
+		 * len by one each time we hit here, so we will enter this
+		 * case on every read.
+		 */
 		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-			/* forward sequential access */
+
+			reset = !prefetched && zs->zst_len > 1;
 
 			mutex_enter(&zs->zst_lock);
 
@@ -308,7 +325,6 @@ top:
 				mutex_exit(&zs->zst_lock);
 				goto top;
 			}
-
 			zs->zst_len += zh->zst_len;
 			diff = zs->zst_len - zfetch_block_cap;
 			if (diff > 0) {
@@ -320,9 +336,14 @@ top:
 
 			break;
 
+		/*
+		 * Same as above, but reading backwards through the file.
+		 */
 		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
 			/* backwards sequential access */
 
+			reset = !prefetched && zs->zst_len > 1;
+
 			mutex_enter(&zs->zst_lock);
 
 			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
@@ -388,11 +409,33 @@ top:
 	}
 
 	if (zs) {
-		rc = 1;
-		dmu_zfetch_dofetch(zf, zs);
-		mutex_exit(&zs->zst_lock);
-	}
+		if (reset) {
+			zstream_t *remove = zs;
+
+			rc = 0;
+			mutex_exit(&zs->zst_lock);
+			rw_exit(&zf->zf_rwlock);
+			rw_enter(&zf->zf_rwlock, RW_WRITER);
+			/*
+			 * Relocate the stream, in case someone removes
+			 * it while we were acquiring the WRITER lock.
+			 */
+			for (zs = list_head(&zf->zf_stream); zs;
+			    zs = list_next(&zf->zf_stream, zs)) {
+				if (zs == remove) {
+					dmu_zfetch_stream_remove(zf, zs);
+					mutex_destroy(&zs->zst_lock);
+					kmem_free(zs, sizeof (zstream_t));
+					break;
+				}
+			}
+		} else {
+			rc = 1;
+			dmu_zfetch_dofetch(zf, zs);
+			mutex_exit(&zs->zst_lock);
+		}
+	}
+out:
 	rw_exit(&zf->zf_rwlock);
 	return (rc);
 }
@@ -527,7 +570,7 @@ dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
 * routines to create, delete, find, or operate upon prefetch streams.
 */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 {
 	zstream_t zst;
 	zstream_t *newstream;
@@ -550,7 +593,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
 	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
 	    P2ALIGN(offset, blksz)) >> blkshft;
 
-	fetched = dmu_zfetch_find(zf, &zst);
+	fetched = dmu_zfetch_find(zf, &zst, prefetched);
 	if (!fetched) {
 		fetched = dmu_zfetch_colinear(zf, &zst);
 	}
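Reviewer note: the "In other words" comment describes the strided-pattern heuristic: two equal-length sequential streams a fixed distance N apart, plus a new read N past the second, are collapsed into one strided stream. A hedged sketch of that co-linear test, on simplified stream records in block units:

#include <stdint.h>

typedef struct stream {
	uint64_t off;		/* first block of the run */
	uint64_t len;		/* blocks read so far */
	uint64_t stride;	/* 0 for plain sequential */
} stream_t;

static int
maybe_stride(const stream_t *s1, const stream_t *s2, uint64_t read_off,
    stream_t *out)
{
	uint64_t n = s2->off - s1->off;	/* candidate stride */

	if (s1->len == s2->len && read_off == s2->off + n) {
		out->off = s1->off;
		out->len = s1->len;
		out->stride = n;	/* now prefetch s2->off + n, + 2n, ... */
		return (1);
	}
	return (0);
}

Note also the interplay with the new "prefetched" hint from dbuf_read(): a non-prefetched hit on an established stream sets "reset", which tears the stream down under the writer lock rather than growing it further.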
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 07494dacd4..aef4932ec0 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -34,6 +34,8 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
+uint64_t metaslab_aliquot = 512ULL << 10;
+
 /*
 * ==========================================================================
 * Metaslab classes
@@ -146,7 +148,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
-	mg->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
+	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
 	mg->mg_vd = vd;
 	metaslab_class_add(mc, mg);
 
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 811ac94436..37a91018d3 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -60,6 +60,7 @@ struct arc_buf {
 #define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */
 #define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */
 #define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
+#define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
 
 arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
@@ -74,7 +75,7 @@ int arc_referenced(arc_buf_t *buf);
 int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags, zbookmark_t *zb);
+    uint32_t *arc_flags, zbookmark_t *zb);
 int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 5724f7a324..6fff9edaae 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -47,11 +47,12 @@ extern "C" {
 * define flags for dbuf_read
 */
 
-#define	DB_RF_MUST_SUCCEED	0
+#define	DB_RF_MUST_SUCCEED	(1 << 0)
 #define	DB_RF_CANFAIL		(1 << 1)
 #define	DB_RF_HAVESTRUCT	(1 << 2)
 #define	DB_RF_NOPREFETCH	(1 << 3)
 #define	DB_RF_NEVERWAIT		(1 << 4)
+#define	DB_RF_CACHED		(1 << 5)
 
 /*
 * The state transition diagram for dbufs looks like:
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index d89fb9be25..3d3a79dc4e 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -322,6 +322,8 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
 */
 int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 /*
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
index 35466d6874..c94bced933 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
@@ -66,7 +65,7 @@ typedef struct zfetch {
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_rele(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
 
 #ifdef	__cplusplus
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index b2c6567af8..b6dcfca86c 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -1760,7 +1760,7 @@ static vdev_knob_t vdev_knob[] = {
 		"minimum pending I/Os to the disk",
 		1,
 		10000,
-		2,
+		4,
 		offsetof(struct vdev, vdev_queue.vq_min_pending)
 	},
 	{
@@ -1792,7 +1792,7 @@ static vdev_knob_t vdev_knob[] = {
 		"deadline = pri + (lbolt >> time_shift)",
 		0,
 		63,
-		8,
+		6,
 		offsetof(struct vdev, vdev_queue.vq_time_shift)
 	},
 	{
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 14a6ce7e6e..73d1a83d94 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -51,6 +51,8 @@ typedef struct mirror_map {
 	mirror_child_t mm_child[1];
 } mirror_map_t;
 
+int vdev_mirror_shift = 21;
+
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
@@ -96,7 +98,8 @@ vdev_mirror_map_alloc(zio_t *zio)
 		mm->mm_children = c;
 		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c);
+		mm->mm_preferred = mm->mm_replacing ? 0 :
+		    (zio->io_offset >> vdev_mirror_shift) % c;
 		mm->mm_root = B_FALSE;
 
 		for (c = 0; c < mm->mm_children; c++) {
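Reviewer note: replacing the random preferred-child choice with (offset >> vdev_mirror_shift) % children makes reads of nearby blocks prefer the same mirror child, so a sequential or prefetch stream stays on one disk per 2 MB region instead of ping-ponging between mirror sides. A minimal sketch of the selection, assuming only that offsets are byte addresses:

#include <stdint.h>

static int mirror_shift = 21;	/* window size = 1 << 21 bytes = 2 MB */

static int
preferred_child(uint64_t io_offset, int children)
{
	/* all I/Os within the same 2 MB window pick the same child */
	return ((int)((io_offset >> mirror_shift) % (uint64_t)children));
}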
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 457f596c7f..e4acfc4998 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -446,7 +446,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		n = MIN(zfs_read_chunk_size,
 		    zp->z_phys->zp_size - uio->uio_loffset);
 		n = MIN(n, cnt);
-		error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+		error = dmu_buf_hold_array_by_bonus(zp->z_dbuf,
 		    uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp);
 		if (error)
 			goto out;
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 2d7cdef165..92dcf192ff 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -152,6 +152,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
 {
 	blkptr_t blk = *bp;
 	zbookmark_t zb;
+	uint32_t aflags = ARC_WAIT;
 	int error;
 
 	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
@@ -163,7 +164,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
 
 	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
 	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
-	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb);
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
 
 	if (error == 0) {
 		char *data = (*abufpp)->b_data;
diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c
index a7b5ce6b9d..3eb98b50ac 100644
--- a/usr/src/uts/common/syscall/rw.c
+++ b/usr/src/uts/common/syscall/rw.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
@@ -52,9 +51,9 @@
 #include <sys/rctl.h>
 #include <sys/nbmlock.h>
 
-#define	COPYOUT_MIN_SIZE	(1<<17)		/* 128K */
+#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */
 
-static size_t copyout_min_size = COPYOUT_MIN_SIZE;
+size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
 
 /*
 * read, write, pread, pwrite, readv, and writev syscalls.
@@ -167,7 +166,7 @@ read(int fdes, void *cbuf, size_t count)
 	/*
 	 * Only use bypass caches when the count is large enough
 	 */
-	if (bcount < copyout_min_size)
+	if (bcount <= copyout_max_cached)
 		auio.uio_extflg = UIO_COPY_CACHED;
 	else
 		auio.uio_extflg = UIO_COPY_DEFAULT;
@@ -723,7 +722,7 @@ readv(int fdes, struct iovec *iovp, int iovcnt)
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_llimit = MAXOFFSET_T;
 	auio.uio_fmode = fflag;
-	if (bcount < copyout_min_size)
+	if (bcount <= copyout_max_cached)
 		auio.uio_extflg = UIO_COPY_CACHED;
 	else
 		auio.uio_extflg = UIO_COPY_DEFAULT;
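Reviewer note: the rename from copyout_min_size to copyout_max_cached flips the sense of the comparison: reads up to and including the threshold use the normal cache-filling copyout, and only larger reads take the cache-bypassing path. A hedged sketch of the policy, with user-space stand-ins for the kernel's uio_extflg values:

#include <stddef.h>

#define COPY_CACHED	0	/* normal, cache-filling copy */
#define COPY_BYPASS	1	/* non-temporal copy for big transfers */

size_t copyout_max_cached = 1 << 17;	/* 128K; patchable in the real code */

static int
copy_policy(size_t bcount)
{
	/* <= threshold: keep the data warm in the CPU cache for reuse */
	return (bcount <= copyout_max_cached ? COPY_CACHED : COPY_BYPASS);
}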