diff options
| author | Matthew Ahrens <mahrens@delphix.com> | 2017-07-10 14:17:21 -0700 |
|---|---|---|
| committer | Prakash Surya <prakash.surya@delphix.com> | 2018-04-22 16:57:23 -0700 |
| commit | adb52d9262f45a04318fc6e188fe2b7f59d989a5 (patch) | |
| tree | 40dba0528af2e305f828398ab72ae554611ae6c4 /usr | |
| parent | 21f7c81cc1156e9202ce3412d3ecaa697c3b2222 (diff) | |
| download | illumos-joyent-adb52d9262f45a04318fc6e188fe2b7f59d989a5.tar.gz | |
9337 zfs get all is slow due to uncached metadata
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Diffstat (limited to 'usr')
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dbuf.c | 182 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dmu.c | 108 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_objset.c | 8 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dbuf.h | 14 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dmu.h | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dmu_objset.h | 12 | ||||
| -rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_vfsops.c | 52 |
7 files changed, 283 insertions, 100 deletions
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 2863c90cd3..8600df4c0a 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -49,6 +49,7 @@ #include <sys/abd.h> #include <sys/vdev.h> #include <sys/cityhash.h> +#include <sys/spa_impl.h> uint_t zfs_dbuf_evict_key; @@ -74,24 +75,58 @@ static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* - * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs are added to the dbuf cache once the last hold is released. If a - * dbuf is later accessed and still exists in the dbuf cache, then it will - * be removed from the cache and later re-added to the head of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. 
The "dbuf cache" maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). */ -static multilist_t *dbuf_cache; -static refcount_t dbuf_cache_size; -uint64_t dbuf_cache_max_bytes = 0; +typedef struct dbuf_cache { + multilist_t *cache; + refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; -/* Set the default size of the dbuf cache to log2 fraction of arc size. */ +/* Size limits for the caches */ +uint64_t dbuf_cache_max_bytes = 0; +uint64_t dbuf_metadata_cache_max_bytes = 0; +/* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; /* - * The dbuf cache uses a three-stage eviction policy: + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. + */ +uint64_t dbuf_metadata_cache_overflow; + +/* + * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. 
* - When we reach the maximum size (aka mid water mark), we @@ -394,6 +429,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } /* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. + */ +static boolean_t +dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) +{ + DB_DNODE_ENTER(db); + dmu_object_type_t type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + /* Check if this dbuf is one of the types we care about */ + if (DMU_OT_IS_METADATA_CACHED(type)) { + /* If we hit this, then we set something up wrong in dmu_ot */ + ASSERT(DMU_OT_IS_METADATA(type)); + + /* + * Sanity check for small-memory systems: don't allocate too + * much memory for this purpose. + */ + if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > + dbuf_metadata_cache_max_bytes) { + dbuf_metadata_cache_overflow++; + DTRACE_PROBE1(dbuf__metadata__cache__overflow, + dmu_buf_impl_t *, db); + return (B_FALSE); + } + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * This function *must* return indices evenly distributed between all * sublists of the multilist. 
This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly @@ -428,7 +498,7 @@ dbuf_cache_above_hiwater(void) uint64_t dbuf_cache_hiwater_bytes = (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); } @@ -438,7 +508,7 @@ dbuf_cache_above_lowater(void) uint64_t dbuf_cache_lowater_bytes = (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); } @@ -448,8 +518,9 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_cache); - multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); + int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + multilist_sublist_t *mls = multilist_sublist_lock( + dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -472,8 +543,10 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - (void) refcount_remove_many(&dbuf_cache_size, + (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); + db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); } else { multilist_sublist_unlock(mls); @@ -560,7 +633,8 @@ dbuf_evict_notify(void) * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. 
*/ - if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_max_bytes) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); @@ -600,15 +674,21 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* - * Setup the parameters for the dbuf cache. We set the size of the - * dbuf cache to 1/32nd (default) of the size of the ARC. If the value - * has been set in /etc/system and it's not greater than the size of - * the ARC, then we honor that value. + * Setup the parameters for the dbuf caches. We set the sizes of the + * dbuf cache and the metadata cache to 1/32nd and 1/64th (default) + * of the size of the ARC, respectively. If the values are set in + * /etc/system and they're not greater than the size of the ARC, then + * we honor that value. */ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_max_bytes()) { dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; } + if (dbuf_metadata_cache_max_bytes == 0 || + dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { + dbuf_metadata_cache_max_bytes = + arc_max_bytes() >> dbuf_metadata_cache_shift; + } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc @@ -616,10 +696,13 @@ retry: */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - refcount_create(&dbuf_cache_size); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_caches[dcs].size); + } tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; @@ -653,8 +736,10 @@ dbuf_fini(void) mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); - 
refcount_destroy(&dbuf_cache_size); - multilist_destroy(dbuf_cache); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } } /* @@ -2037,9 +2122,15 @@ dbuf_destroy(dmu_buf_impl_t *db) dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -2093,6 +2184,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); @@ -2231,6 +2323,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2263,6 +2356,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -2597,9 +2691,15 @@ top: if (multilist_link_active(&db->db_cache_link)) { ASSERT(refcount_is_zero(&db->db_holds)); - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + 
db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); @@ -2816,12 +2916,22 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { - multilist_insert(dbuf_cache, db); - (void) refcount_add_many(&dbuf_cache_size, + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); mutex_exit(&db->db_mtx); - dbuf_evict_notify(); + if (db->db_caching_status == DB_DBUF_CACHE) { + dbuf_evict_notify(); + } } if (do_arc_evict) diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 41600208c8..f035b05af2 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -73,60 +73,60 @@ uint32_t zfs_per_txg_dirty_frees_percent = 30; int zfs_object_remap_one_indirect_delay_ticks = 0; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, 
- { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, "SPA history" }, - { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, - { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, - { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } + { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, 
TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, + { 
DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 574812d3b4..f345ed9383 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -496,6 +496,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } + /* + * These properties will be filled in by the logic in zfs_get_zplprop() + * when they are queried for the first time. 
+ */ + os->os_version = OBJSET_PROP_UNINITIALIZED; + os->os_normalization = OBJSET_PROP_UNINITIALIZED; + os->os_utf8only = OBJSET_PROP_UNINITIALIZED; + os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 69617b3dca..f467878b72 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -83,6 +83,13 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; +typedef enum dbuf_cached_state { + DB_NO_CACHE = -1, + DB_DBUF_CACHE, + DB_DBUF_METADATA_CACHE, + DB_CACHE_MAX +} dbuf_cached_state_t; + struct dnode; struct dmu_tx; @@ -229,11 +236,12 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; - /* - * Link in dbuf_cache. - */ + /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; + /* Tells us which dbuf cache this dbuf is in, if any */ + dbuf_cached_state_t db_caching_status; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 8ea733a830..6a33cb7d81 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap { /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data - * (dmu_object_byteswap_t). + * (dmu_object_byteswap_t). All of the types created by this method + * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ @@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap { ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) +#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ + B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) + /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill @@ -799,6 +803,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; + boolean_t ot_dbuf_metadata_cache; char *ot_name; } dmu_object_type_info_t; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index 59e87aab80..25ff864217 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -39,6 +39,7 @@ #include <sys/zio.h> #include <sys/zil.h> #include <sys/sa.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -69,6 +70,7 @@ typedef struct objset_phys { dnode_phys_t os_groupused_dnode; } objset_phys_t; +#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; @@ -100,6 +102,16 @@ struct objset { zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; int os_recordsize; + /* + * The next four values are used as a cache of whatever's on disk, and + * are initialized the first time these properties are queried. Before + * being initialized with their real values, their values are + * OBJSET_PROP_UNINITIALIZED. 
+ */ + uint64_t os_version; + uint64_t os_normalization; + uint64_t os_utf8only; + uint64_t os_casesensitivity; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 3de658666a..f7beea4cc9 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -2232,6 +2232,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) dmu_tx_commit(tx); zfsvfs->z_version = newvers; + os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); @@ -2244,17 +2245,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { - const char *pname; - int error = ENOENT; + uint64_t *cached_copy = NULL; /* - * Look up the file system's value for the property. For the - * version property, we look up a slightly different string. + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. */ - if (prop == ZFS_PROP_VERSION) + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; - else + } else { pname = zfs_prop_to_name(prop); + } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); @@ -2279,6 +2310,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) } error = 0; } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + return (error); } |
