summaryrefslogtreecommitdiff
path: root/usr
diff options
context:
space:
mode:
authorMatthew Ahrens <mahrens@delphix.com>2017-07-10 14:17:21 -0700
committerPrakash Surya <prakash.surya@delphix.com>2018-04-22 16:57:23 -0700
commitadb52d9262f45a04318fc6e188fe2b7f59d989a5 (patch)
tree40dba0528af2e305f828398ab72ae554611ae6c4 /usr
parent21f7c81cc1156e9202ce3412d3ecaa697c3b2222 (diff)
downloadillumos-joyent-adb52d9262f45a04318fc6e188fe2b7f59d989a5.tar.gz
9337 zfs get all is slow due to uncached metadata
Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Thomas Caputi <tcaputi@datto.com> Approved by: Richard Lowe <richlowe@richlowe.net>
Diffstat (limited to 'usr')
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c182
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c108
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_objset.c8
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dbuf.h14
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h7
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_objset.h12
-rw-r--r--usr/src/uts/common/fs/zfs/zfs_vfsops.c52
7 files changed, 283 insertions, 100 deletions
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 2863c90cd3..8600df4c0a 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -49,6 +49,7 @@
#include <sys/abd.h>
#include <sys/vdev.h>
#include <sys/cityhash.h>
+#include <sys/spa_impl.h>
uint_t zfs_dbuf_evict_key;
@@ -74,24 +75,58 @@ static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;
/*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ * that represent the metadata that describes filesystems/snapshots/
+ * bookmarks/properties/etc. We only evict from this cache when we export a
+ * pool, to short-circuit as much I/O as possible for all administrative
+ * commands that need the metadata. There is no eviction policy for this
+ * cache, because we try to only include types in it which would occupy a
+ * very small amount of space per object but create a large impact on the
+ * performance of these commands. Instead, after it reaches a maximum size
+ * (which should only happen on very small memory systems with a very large
+ * number of filesystem objects), we stop taking new dbufs into the
+ * metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
*/
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-uint64_t dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+ multilist_t *cache;
+ refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+uint64_t dbuf_cache_max_bytes = 0;
+uint64_t dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
/*
- * The dbuf cache uses a three-stage eviction policy:
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+uint64_t dbuf_metadata_cache_overflow;
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
* - A low water marker designates when the dbuf eviction thread
* should stop evicting from the dbuf cache.
* - When we reach the maximum size (aka mid water mark), we
@@ -394,6 +429,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
}
/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_max_bytes) {
+ dbuf_metadata_cache_overflow++;
+ DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+ dmu_buf_impl_t *, db);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* This function *must* return indices evenly distributed between all
* sublists of the multilist. This is needed due to how the dbuf eviction
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
@@ -428,7 +498,7 @@ dbuf_cache_above_hiwater(void)
uint64_t dbuf_cache_hiwater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
- return (refcount_count(&dbuf_cache_size) >
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
}
@@ -438,7 +508,7 @@ dbuf_cache_above_lowater(void)
uint64_t dbuf_cache_lowater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
- return (refcount_count(&dbuf_cache_size) >
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
}
@@ -448,8 +518,9 @@ dbuf_cache_above_lowater(void)
static void
dbuf_evict_one(void)
{
- int idx = multilist_get_random_index(dbuf_cache);
- multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+ int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(
+ dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
@@ -472,8 +543,10 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
- (void) refcount_remove_many(&dbuf_cache_size,
+ (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
db->db.db_size, db);
+ ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+ db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
} else {
multilist_sublist_unlock(mls);
@@ -560,7 +633,8 @@ dbuf_evict_notify(void)
* because it's OK to occasionally make the wrong decision here,
* and grabbing the lock results in massive lock contention.
*/
- if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+ if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_max_bytes) {
if (dbuf_cache_above_hiwater())
dbuf_evict_one();
cv_signal(&dbuf_evict_cv);
@@ -600,15 +674,21 @@ retry:
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
/*
- * Setup the parameters for the dbuf cache. We set the size of the
- * dbuf cache to 1/32nd (default) of the size of the ARC. If the value
- * has been set in /etc/system and it's not greater than the size of
- * the ARC, then we honor that value.
+ * Setup the parameters for the dbuf caches. We set the sizes of the
+ * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
+ * of the size of the ARC, respectively. If the values are set in
+ * /etc/system and they're not greater than the size of the ARC, then
+ * we honor that value.
*/
if (dbuf_cache_max_bytes == 0 ||
dbuf_cache_max_bytes >= arc_max_bytes()) {
dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
}
+ if (dbuf_metadata_cache_max_bytes == 0 ||
+ dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+ dbuf_metadata_cache_max_bytes =
+ arc_max_bytes() >> dbuf_metadata_cache_shift;
+ }
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
@@ -616,10 +696,13 @@ retry:
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
- dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_cache_link),
- dbuf_cache_multilist_index_func);
- refcount_create(&dbuf_cache_size);
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ dbuf_caches[dcs].cache =
+ multilist_create(sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ dbuf_cache_multilist_index_func);
+ refcount_create(&dbuf_caches[dcs].size);
+ }
tsd_create(&zfs_dbuf_evict_key, NULL);
dbuf_evict_thread_exit = B_FALSE;
@@ -653,8 +736,10 @@ dbuf_fini(void)
mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv);
- refcount_destroy(&dbuf_cache_size);
- multilist_destroy(dbuf_cache);
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ refcount_destroy(&dbuf_caches[dcs].size);
+ multilist_destroy(dbuf_caches[dcs].cache);
+ }
}
/*
@@ -2037,9 +2122,15 @@ dbuf_destroy(dmu_buf_impl_t *db)
dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) {
- multilist_remove(dbuf_cache, db);
- (void) refcount_remove_many(&dbuf_cache_size,
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
+
+ db->db_caching_status = DB_NO_CACHE;
}
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -2093,6 +2184,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link));
kmem_cache_free(dbuf_kmem_cache, db);
@@ -2231,6 +2323,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
@@ -2263,6 +2356,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
avl_add(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
@@ -2597,9 +2691,15 @@ top:
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(refcount_is_zero(&db->db_holds));
- multilist_remove(dbuf_cache, db);
- (void) refcount_remove_many(&dbuf_cache_size,
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
+
+ db->db_caching_status = DB_NO_CACHE;
}
(void) refcount_add(&db->db_holds, tag);
DBUF_VERIFY(db);
@@ -2816,12 +2916,22 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
- multilist_insert(dbuf_cache, db);
- (void) refcount_add_many(&dbuf_cache_size,
+ ASSERT3U(db->db_caching_status, ==,
+ DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(dbuf_caches[dcs].cache, db);
+ (void) refcount_add_many(&dbuf_caches[dcs].size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
- dbuf_evict_notify();
+ if (db->db_caching_status == DB_DBUF_CACHE) {
+ dbuf_evict_notify();
+ }
}
if (do_arc_evict)
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 41600208c8..f035b05af2 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -73,60 +73,60 @@ uint32_t zfs_per_txg_dirty_frees_percent = 30;
int zfs_object_remap_one_indirect_delay_ticks = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { DMU_BSWAP_UINT8, TRUE, "unallocated" },
- { DMU_BSWAP_ZAP, TRUE, "object directory" },
- { DMU_BSWAP_UINT64, TRUE, "object array" },
- { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
- { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
- { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
- { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
- { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
- { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
- { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
- { DMU_BSWAP_ZAP, TRUE, "DSL props" },
- { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
- { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
- { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
- { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
- { DMU_BSWAP_UINT8, FALSE, "zvol object" },
- { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
- { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
- { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
- { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
- { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
- { DMU_BSWAP_UINT8, TRUE, "SPA history" },
- { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
- { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
- { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
- { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
- { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
- { DMU_BSWAP_UINT8, TRUE, "FUID table" },
- { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
- { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
- { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
- { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
- { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
- { DMU_BSWAP_UINT8, TRUE, "System attributes" },
- { DMU_BSWAP_ZAP, TRUE, "SA master node" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
- { DMU_BSWAP_ZAP, TRUE, "scan translations" },
- { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
- { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
- { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" },
+ { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" },
+ { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" },
+ { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
+ { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" },
+ { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" },
+ { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 574812d3b4..f345ed9383 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -496,6 +496,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
+ /*
+ * These properties will be filled in by the logic in zfs_get_zplprop()
+ * when they are queried for the first time.
+ */
+ os->os_version = OBJSET_PROP_UNINITIALIZED;
+ os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+ os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+ os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
if (ds == NULL || !ds->ds_is_snapshot)
os->os_zil_header = os->os_phys->os_zil_header;
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 69617b3dca..f467878b72 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -83,6 +83,13 @@ typedef enum dbuf_states {
DB_EVICTING
} dbuf_states_t;
+typedef enum dbuf_cached_state {
+ DB_NO_CACHE = -1,
+ DB_DBUF_CACHE,
+ DB_DBUF_METADATA_CACHE,
+ DB_CACHE_MAX
+} dbuf_cached_state_t;
+
struct dnode;
struct dmu_tx;
@@ -229,11 +236,12 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;
- /*
- * Link in dbuf_cache.
- */
+ /* Link in dbuf_cache or dbuf_metadata_cache */
multilist_node_t db_cache_link;
+ /* Tells us which dbuf cache this dbuf is in, if any */
+ dbuf_cached_state_t db_caching_status;
+
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 8ea733a830..6a33cb7d81 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap {
/*
* Defines a uint8_t object type. Object types specify if the data
* in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t).
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
*/
#define DMU_OT(byteswap, metadata) \
(DMU_OT_NEWTYPE | \
@@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
+#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
/*
* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
@@ -799,6 +803,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t size);
typedef struct dmu_object_type_info {
dmu_object_byteswap_t ot_byteswap;
boolean_t ot_metadata;
+ boolean_t ot_dbuf_metadata_cache;
char *ot_name;
} dmu_object_type_info_t;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 59e87aab80..25ff864217 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -39,6 +39,7 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
#ifdef __cplusplus
extern "C" {
@@ -69,6 +70,7 @@ typedef struct objset_phys {
dnode_phys_t os_groupused_dnode;
} objset_phys_t;
+#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1)
struct objset {
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
@@ -100,6 +102,16 @@ struct objset {
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
+ /*
+ * The next four values are used as a cache of whatever's on disk, and
+ * are initialized the first time these properties are queried. Before
+ * being initialized with their real values, their values are
+ * OBJSET_PROP_UNINITIALIZED.
+ */
+ uint64_t os_version;
+ uint64_t os_normalization;
+ uint64_t os_utf8only;
+ uint64_t os_casesensitivity;
/*
* Pointer is constant; the blkptr it points to is protected by
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 3de658666a..f7beea4cc9 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -2232,6 +2232,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
+ os->os_version = newvers;
zfs_set_fuid_feature(zfsvfs);
@@ -2244,17 +2245,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
- const char *pname;
- int error = ENOENT;
+ uint64_t *cached_copy = NULL;
/*
- * Look up the file system's value for the property. For the
- * version property, we look up a slightly different string.
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
*/
- if (prop == ZFS_PROP_VERSION)
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
- else
+ } else {
pname = zfs_prop_to_name(prop);
+ }
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
@@ -2279,6 +2310,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
}
error = 0;
}
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
return (error);
}