author     ek110237 <none@none>  2007-10-24 16:54:46 -0700
committer  ek110237 <none@none>  2007-10-24 16:54:46 -0700
commit     f18faf3f3e5def85fdfff681617d227703ace2ad
tree       f3e763ede9b38b1c489a18a8bf6a649314201e39 /usr/src
parent     8696d418011068e5cedf3a229f7a6613e7798e92
download   illumos-joyent-f18faf3f3e5def85fdfff681617d227703ace2ad.tar.gz
6425096 want online 'zfs recv' (read only and read/write)
6597182 .zfs/snapshot code could use a little more comments
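For orientation, these are the new interfaces this change introduces, collected from the header diffs below:

    /* re-entrant reader/writer lock (sys/rrwlock.h) */
    void rrw_init(rrwlock_t *rrl);
    void rrw_destroy(rrwlock_t *rrl);
    void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
    void rrw_exit(rrwlock_t *rrl, void *tag);
    boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);

    /* objset user pointer (sys/dmu.h) */
    void dmu_objset_set_user(objset_t *os, void *user_ptr);
    void *dmu_objset_get_user(objset_t *os);

    /* suspend/resume of a mounted file system (sys/zfs_vfsops.h) */
    int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
    int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);

    /* clone/origin-head swap (sys/dsl_dataset.h) and online replay end (sys/dmu.h) */
    int dsl_dataset_clone_swap(const char *name, boolean_t force);
    int dmu_replay_end_snapshot(char *name, struct drr_begin *drrb);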
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/common/zfs/zfs_namecheck.c           |    6
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c   |   62
-rw-r--r--  usr/src/uts/common/Makefile.files            |    1
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c       |   21
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c         |  193
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c      |  205
-rw-r--r--  usr/src/uts/common/fs/zfs/rrwlock.c          |  249
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h          |    6
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h   |    4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h  |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/rrwlock.h      |   80
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h   |    8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h    |   18
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ctldir.c       |   42
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c        |   81
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c       |  400
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c        |  123
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c        |   49
18 files changed, 1294 insertions, 257 deletions
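At a high level, an online incremental receive now proceeds roughly as follows (a condensed sketch of the sequence in zfs_ioc_recvbackup() below, with error handling elided). The stream is first received into a temporary clone of the most recent snapshot; the clone lives beside the target and is named with the reserved '%' character (for a receive into tank/fs@snap2 the clone is tank/fs/%snap2), which is why '%' becomes a legal name character inside the kernel but is rejected for user-initiated creates, renames, and receives:

    /* receive the stream into a temporary "%" clone; its name comes back in cosname */
    error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, &zc->zc_cookie,
        force, B_TRUE, fp->f_vnode, fp->f_offset, cosname);

    error = zfs_suspend_fs(zfsvfs, osname, &mode);   /* block out vops, close z_os */
    error = dsl_dataset_clone_swap(cosname, force);  /* swap clone with the head fs */
    error = dmu_replay_end_snapshot(zc->zc_value,    /* create the new snapshot */
        &zc->zc_begin_record);
    error = zfs_resume_fs(zfsvfs, osname, mode);     /* reopen z_os, unblock vops */
    (void) dmu_objset_destroy(cosname);              /* destroy the temporary clone */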
diff --git a/usr/src/common/zfs/zfs_namecheck.c b/usr/src/common/zfs/zfs_namecheck.c index 0bfe9be296..cee25a62b1 100644 --- a/usr/src/common/zfs/zfs_namecheck.c +++ b/usr/src/common/zfs/zfs_namecheck.c @@ -54,14 +54,14 @@ valid_char(char c) return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || - c == '-' || c == '_' || c == '.' || c == ':'); + c == '-' || c == '_' || c == '.' || c == ':' || c == '%'); } /* * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.:%] */ int snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -126,7 +126,7 @@ permset_namecheck(const char *path, namecheck_err_t *why, char *what) * Where each component is made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.:%] */ int dataset_namecheck(const char *path, namecheck_err_t *why, char *what) diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index 8a00f94c7e..04a37032ea 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -130,7 +130,8 @@ path_to_str(const char *path, int types) * 'buf' detailing exactly why the name was not valid. */ static int -zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type) +zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying) { namecheck_err_t why; char what; @@ -203,13 +204,20 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type) return (0); } + if (modifying && strchr(path, '%') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid character %c in name"), '%'); + return (0); + } + return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { - return (zfs_validate_name(NULL, name, type)); + return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* @@ -420,7 +428,7 @@ zfs_open(libzfs_handle_t *hdl, const char *path, int types) /* * Validate the name before we even try to open it. 
*/ - if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET)) { + if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid dataset name")); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); @@ -2428,7 +2436,7 @@ zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; - if (!zfs_validate_name(hdl, path, types)) + if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* @@ -2486,7 +2494,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ - if (!zfs_validate_name(hdl, path, type)) + if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ @@ -2777,7 +2785,7 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) "cannot create '%s'"), target); /* validate the target name */ - if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM)) + if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ @@ -3042,7 +3050,7 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive) "cannot snapshot '%s'"), path); /* validate the target name */ - if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT)) + if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* make sure the parent exists and is of the appropriate type */ @@ -3246,7 +3254,6 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix, dmu_replay_record_t drr; struct drr_begin *drrb = &zc.zc_begin_record; char errbuf[1024]; - prop_changelist_t *clp; char chopprefix[ZFS_MAXNAMELEN]; begin_time = time(NULL); @@ -3331,7 +3338,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix, (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, drr.drr_u.drr_begin.drr_toname+choplen, sizeof (zc.zc_value)); - if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT)) + if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strcpy(zc.zc_name, zc.zc_value); @@ -3347,26 +3354,10 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (h == NULL) return (-1); - if (!dryrun) { - /* - * We need to unmount all the dependents of the dataset - * and the dataset itself. If it's a volume - * then remove device link. 
- */ - if (h->zfs_type == ZFS_TYPE_FILESYSTEM) { - clp = changelist_gather(h, ZFS_PROP_NAME, 0); - if (clp == NULL) - return (-1); - if (changelist_prefix(clp) != 0) { - changelist_free(clp); - return (-1); - } - } else { - if (zvol_remove_link(hdl, h->zfs_name) != 0) { - zfs_close(h); - return (-1); - } - + if (!dryrun && h->zfs_type == ZFS_TYPE_VOLUME) { + if (zvol_remove_link(hdl, h->zfs_name) != 0) { + zfs_close(h); + return (-1); } } zfs_close(h); @@ -3474,13 +3465,8 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix, if (err == 0 && ioctl_err == 0) err = zvol_create_link(hdl, zc.zc_value); - } else { - if (drrb->drr_fromguid) { - err = changelist_postfix(clp); - changelist_free(clp); - } else { - err = zfs_mount(h, NULL, 0); - } + } else if (!drrb->drr_fromguid) { + err = zfs_mount(h, NULL, 0); } zfs_close(h); } @@ -3750,7 +3736,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) errbuf)); } } - if (!zfs_validate_name(hdl, target, zhp->zfs_type)) + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { if (recursive) { @@ -3759,7 +3745,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } - if (!zfs_validate_name(hdl, target, zhp->zfs_type)) + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); uint64_t unused; diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index ba15278285..c86259c8b1 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1010,6 +1010,7 @@ ZFS_OBJS += \ zfs_log.o \ zfs_replay.o \ zfs_rlock.o \ + rrwlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 8753f062d3..2758d84791 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -44,7 +44,6 @@ #include <sys/dmu_impl.h> #include <sys/zfs_ioctl.h> - spa_t * dmu_objset_spa(objset_t *os) { @@ -244,6 +243,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); @@ -266,10 +266,10 @@ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp) { - dsl_dataset_t *ds; - int err; objset_t *os; + dsl_dataset_t *ds; objset_impl_t *osi; + int err; os = kmem_alloc(sizeof (objset_t), KM_SLEEP); err = dsl_dataset_open(name, mode, os, &ds); @@ -387,6 +387,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); + mutex_destroy(&osi->os_user_ptr_lock); kmem_free(osi, sizeof (objset_impl_t)); } @@ -1049,3 +1050,17 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) err = func(name, arg); return (err); } + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + os->os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); + return (os->os->os_user_ptr); +} diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c 
b/usr/src/uts/common/fs/zfs/dmu_send.c index d1b5cc1ecc..812abd0265 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -295,12 +295,9 @@ struct restorearg { zio_cksum_t zc; }; -/* ARGSUSED */ static int -replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +replay_incremental_check(dsl_dataset_t *ds, struct drr_begin *drrb) { - dsl_dataset_t *ds = arg1; - struct drr_begin *drrb = arg2; const char *snapname; int err; uint64_t val; @@ -312,10 +309,6 @@ replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) /* most recent snapshot must match fromguid */ if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) return (ENODEV); - /* must not have any changes since most recent snapshot */ - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (ETXTBSY); /* new snapshot name must not exist */ snapname = strrchr(drrb->drr_toname, '@'); @@ -326,16 +319,31 @@ replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); if (err == 0) - return (EEXIST); + return (EEXIST); if (err != ENOENT) - return (err); + return (err); return (0); } /* ARGSUSED */ +static int +replay_offline_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct drr_begin *drrb = arg2; + + /* must not have any changes since most recent snapshot */ + if (dsl_dataset_modified_since_lastsnap(ds)) + return (ETXTBSY); + + return (replay_incremental_check(ds, drrb)); +} + +/* ARGSUSED */ static void -replay_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +replay_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -402,6 +410,57 @@ replay_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); } +struct onlineincarg { + dsl_dir_t *dd; + dsl_dataset_t *ohds; + boolean_t force; + const char *cosname; +}; + +/* ARGSUSED */ +static int +replay_online_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + struct onlineincarg *oia = arg1; + + if (dsl_dataset_modified_since_lastsnap(oia->ohds) && !oia->force) + return (ETXTBSY); + + return (replay_incremental_check(oia->ohds, arg2)); +} + +/* ARGSUSED */ +static void +replay_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + struct onlineincarg *oia = arg1; + dsl_dataset_t *ohds = oia->ohds; + dsl_dir_t *dd = oia->dd; + dsl_dataset_t *ods, *ds; + uint64_t dsobj; + + VERIFY(0 == dsl_dataset_open_obj(ohds->ds_dir->dd_pool, + ohds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_STANDARD, FTAG, &ods)); + + dsobj = dsl_dataset_create_sync(dd, strrchr(oia->cosname, '/') + 1, + ods, tx); + + /* open the temporary clone */ + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, + DS_MODE_EXCLUSIVE, FTAG, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", + ds->ds_phys->ds_dir_obj); + + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG); +} + static int replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -729,13 +788,16 @@ restore_free(struct restorearg *ra, objset_t *os, int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t 
force, vnode_t *vp, uint64_t voffset) + boolean_t force, boolean_t online, vnode_t *vp, uint64_t voffset, + char *cosname) { struct restorearg ra; dmu_replay_record_t *drr; char *cp; objset_t *os = NULL; zio_cksum_t pzc; + char *clonebuf = NULL; + size_t len; bzero(&ra, sizeof (ra)); ra.vp = vp;
@@ -790,8 +852,9 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, /* * Process the begin in syncing context. */ - if (drrb->drr_fromguid) { - /* incremental backup */ + if (drrb->drr_fromguid && !online) { + /* offline incremental receive */ + dsl_dataset_t *ds = NULL; cp = strchr(tosnap, '@');
@@ -816,11 +879,52 @@ (void) dsl_dataset_rollback(ds); } ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, - replay_incremental_check, replay_incremental_sync, - ds, drrb, 1); + replay_offline_incremental_check, + replay_offline_incremental_sync, ds, drrb, 1); dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + } else if (drrb->drr_fromguid && online) { + /* online incremental receive */ + + const char *tail; + struct onlineincarg oia = { 0 }; + + /* + * Get the dsl_dir for the parent of the + * temporary clone. + */ + cp = strchr(tosnap, '@'); + *cp = '\0'; + + /* tmp clone is: tosnap + '/' + '%' + "snapX" */ + len = strlen(tosnap) + 2 + strlen(cp + 1) + 1; + clonebuf = kmem_alloc(len, KM_SLEEP); + (void) snprintf(clonebuf, len, "%s%c%c%s%c", + tosnap, '/', '%', cp + 1, '\0'); + ra.err = dsl_dir_open(tosnap, FTAG, &oia.dd, &tail); + *cp = '@'; + if (ra.err) + goto out; + + /* open the dataset we are logically receiving into */ + *cp = '\0'; + ra.err = dsl_dataset_open(tosnap, DS_MODE_STANDARD, + FTAG, &oia.ohds); + *cp = '@'; + if (ra.err) { + dsl_dir_close(oia.dd, FTAG); + goto out; + } + + oia.force = force; + oia.cosname = clonebuf; + ra.err = dsl_sync_task_do(oia.dd->dd_pool, + replay_online_incremental_check, + replay_online_incremental_sync, &oia, drrb, 5); + dsl_dataset_close(oia.ohds, DS_MODE_STANDARD, FTAG); + dsl_dir_close(oia.dd, FTAG); } else { /* full backup */ + dsl_dir_t *dd = NULL; const char *tail;
@@ -854,8 +958,8 @@ cp = strchr(tosnap, '@'); *cp = '\0'; - ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, - DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); + ra.err = dmu_objset_open(clonebuf == NULL ? tosnap : clonebuf, + DMU_OST_ANY, DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); *cp = '@'; ASSERT3U(ra.err, ==, 0);
@@ -918,9 +1022,11 @@ goto out; } - ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> - ds_dir->dd_pool, replay_end_check, replay_end_sync, - os, drrb, 3); + if (clonebuf == NULL) { + ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> + ds_dir->dd_pool, replay_end_check, + replay_end_sync, os, drrb, 3); + } goto out; } default:
@@ -931,8 +1037,11 @@ } out: - if (os) + if (os) { + if (drrb->drr_fromguid && online && !ra.err) + dmu_objset_name(os, cosname); dmu_objset_close(os); + } /* * Make sure we don't rollback/destroy unless we actually
@@ -949,15 +1058,29 @@ cp = strchr(tosnap, '@'); *cp = '\0'; - err = dsl_dataset_open(tosnap, + err = dsl_dataset_open(clonebuf == NULL ?
tosnap : clonebuf, DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, FTAG, &ds); if (err == 0) { txg_wait_synced(ds->ds_dir->dd_pool, 0); if (drrb->drr_fromguid) { - /* incremental: rollback to most recent snap */ - (void) dsl_dataset_rollback(ds); - dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + if (clonebuf != NULL) { + /* + * online incremental: destroy + * the temporarily created clone. + */ + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, + FTAG); + (void) dmu_objset_destroy(clonebuf); + } else { + /* + * offline incremental: rollback to + * most recent snapshot. + */ + (void) dsl_dataset_rollback(ds); + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, + FTAG); + } } else { /* full: destroy whole fs */ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); @@ -967,8 +1090,26 @@ out: *cp = '@'; } + if (clonebuf != NULL) + kmem_free(clonebuf, len); kmem_free(ra.buf, ra.bufsize); if (sizep) *sizep = ra.voff; return (ra.err); } + +int +dmu_replay_end_snapshot(char *name, struct drr_begin *drrb) +{ + objset_t *os; + int err; + + err = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_STANDARD, &os); + if (err) + return (err); + + err = dsl_sync_task_do(dmu_objset_ds(os)->ds_dir->dd_pool, + replay_end_check, replay_end_sync, os, drrb, 3); + dmu_objset_close(os); + return (err); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 08bc980ffb..1cba47175a 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -1535,6 +1535,21 @@ dsl_dataset_space(dsl_dataset_t *ds, *availobjsp = DN_MAX_OBJECT - *usedobjsp; } +boolean_t +dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || + dsl_pool_sync_context(dp)); + if (ds->ds_prev == NULL) + return (B_FALSE); + if (ds->ds_phys->ds_bp.blk_birth > + ds->ds_prev->ds_phys->ds_creation_txg) + return (B_TRUE); + return (B_FALSE); +} + /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1601,7 +1616,7 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dsl_dataset_close(hds, DS_MODE_NONE, FTAG); } -struct renamearg { +struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; @@ -1611,7 +1626,7 @@ struct renamearg { static int dsl_snapshot_rename_one(char *name, void *arg) { - struct renamearg *ra = arg; + struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *cp; int err; @@ -1659,7 +1674,7 @@ static int dsl_recursive_rename(char *oldname, const char *newname) { int err; - struct renamearg *ra; + struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); @@ -1674,7 +1689,7 @@ dsl_recursive_rename(char *oldname, const char *newname) kmem_free(fsname, len + 1); return (err); } - ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP); + ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; @@ -1704,7 +1719,7 @@ dsl_recursive_rename(char *oldname, const char *newname) (void) strcpy(oldname, ra->failed); dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamearg)); + kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } @@ -2051,6 +2066,186 @@ dsl_dataset_promote(const char *name) return (err); } +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + +/* ARGSUSED */ +static int 
+dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *cds = arg1; /* clone to become new head */ + boolean_t *forcep = arg2; + dsl_dir_t *cdd = cds->ds_dir; + dsl_pool_t *dp = cds->ds_dir->dd_pool; + dsl_dataset_t *ods; /* the snapshot cds is cloned off of */ + dsl_dataset_t *ohds = NULL; + dsl_dir_t *odd; + int err; + + /* check that it is a clone */ + if (cdd->dd_phys->dd_clone_parent_obj == 0) + return (EINVAL); + + /* check that cds is not a snapshot */ + if (dsl_dataset_is_snapshot(cds)) + return (EINVAL); + + /* open the origin */ + if (err = dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods)) + return (err); + odd = ods->ds_dir; + + /* make sure the clone is a descendant of the origin */ + if (cdd->dd_parent != odd) { + err = EINVAL; + goto out; + } + + /* check that there are no snapshots after the origin */ + if (cds->ds_phys->ds_prev_snap_obj != ods->ds_object || + ods->ds_phys->ds_next_snap_obj != + odd->dd_phys->dd_head_dataset_obj) { + err = EINVAL; + goto out; + } + + /* + * Verify origin head dataset hasn't been modified or + * 'force' has been passed down. + */ + if (!(*forcep) && + (err = dsl_dataset_open_obj(cdd->dd_pool, + odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE, + FTAG, &ohds)) == 0) { + if (dsl_dataset_modified_since_lastsnap(ohds)) + err = ETXTBSY; + dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG); + } +out: + dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG); + return (err); +} + +/* ARGSUSED */ +static void +dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *cds = arg1; /* clone to become new head */ + dsl_dir_t *cdd = cds->ds_dir; + dsl_pool_t *dp = cds->ds_dir->dd_pool; + dsl_dataset_t *ods, *ohds; + dsl_dir_t *odd; + uint64_t itor = 0; + blkptr_t bp; + uint64_t unique = 0; + int err; + + ASSERT(cdd->dd_phys->dd_clone_parent_obj != 0); + ASSERT(dsl_dataset_is_snapshot(cds) == 0); + + /* open the origin */ + VERIFY(0 == dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods)); + odd = ods->ds_dir; + ASSERT(cds->ds_phys->ds_prev_snap_obj == ods->ds_object); + ASSERT(ods->ds_phys->ds_next_snap_obj == + odd->dd_phys->dd_head_dataset_obj); + + /* open the origin head */ + VERIFY(0 == dsl_dataset_open_obj(cdd->dd_pool, + odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE, + FTAG, &ohds)); + ASSERT(odd == ohds->ds_dir); + + dmu_buf_will_dirty(cds->ds_dbuf, tx); + dmu_buf_will_dirty(ohds->ds_dbuf, tx); + dmu_buf_will_dirty(ods->ds_dbuf, tx); + + /* compute unique space */ + while ((err = bplist_iterate(&cds->ds_deadlist, &itor, &bp)) == 0) { + if (bp.blk_birth > ods->ds_phys->ds_prev_snap_txg) + unique += bp_get_dasize(cdd->dd_pool->dp_spa, &bp); + } + VERIFY(err == ENOENT); + + /* reset origin's unique bytes */ + ods->ds_phys->ds_unique_bytes = unique; + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = ohds->ds_phys->ds_bp; + ohds->ds_phys->ds_bp = cds->ds_phys->ds_bp; + cds->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + VERIFY(0 == bplist_space(&cds->ds_deadlist, &cdl_used, + &cdl_comp, &cdl_uncomp)); + VERIFY(0 == bplist_space(&ohds->ds_deadlist, &odl_used, + &odl_comp, &odl_uncomp)); + dused = cds->ds_phys->ds_used_bytes + cdl_used - + (ohds->ds_phys->ds_used_bytes + odl_used); + dcomp =
cds->ds_phys->ds_compressed_bytes + cdl_comp - + (ohds->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp - + (ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); + + dsl_dir_diduse_space(odd, dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(cdd, -dused, -dcomp, -duncomp, tx); + } + + /* swap ds_*_bytes */ + SWITCH64(ohds->ds_phys->ds_used_bytes, cds->ds_phys->ds_used_bytes); + SWITCH64(ohds->ds_phys->ds_compressed_bytes, + cds->ds_phys->ds_compressed_bytes); + SWITCH64(ohds->ds_phys->ds_uncompressed_bytes, + cds->ds_phys->ds_uncompressed_bytes); + + /* swap deadlists */ + bplist_close(&cds->ds_deadlist); + bplist_close(&ohds->ds_deadlist); + SWITCH64(ohds->ds_phys->ds_deadlist_obj, cds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&cds->ds_deadlist, dp->dp_meta_objset, + cds->ds_phys->ds_deadlist_obj)); + VERIFY(0 == bplist_open(&ohds->ds_deadlist, dp->dp_meta_objset, + ohds->ds_phys->ds_deadlist_obj)); + + dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG); + dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG); +} + +/* + * Swap the clone "cosname" with its origin head file system. + */ +int +dsl_dataset_clone_swap(const char *cosname, boolean_t force) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_open(cosname, + DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_clone_swap_check, + dsl_dataset_clone_swap_sync, ds, &force, 9); + dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); + return (err); +} + /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. diff --git a/usr/src/uts/common/fs/zfs/rrwlock.c b/usr/src/uts/common/fs/zfs/rrwlock.c new file mode 100644 index 0000000000..710685dbc7 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/rrwlock.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/refcount.h> +#include <sys/rrwlock.h> + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads who have already obtained a read lock to + * re-enter another read lock (re-entrant read) - even if there are + * waiting writers. + * + * Callers who have not obtained a read lock give waiting writers priority. 
+ * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller who has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list can represent a different rrwlock_t. This allows a thread + * to enter multiple and unique rrwlock_ts for read locks at the same time. + * + * Since using tsd exposes some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader doesn't know if it is a re-entrant lock. But since it may be one, + * we allow the read to proceed (otherwise it could deadlock). Since once + * waiting writers are active, readers no longer bump the anonymous count, + * the anonymous readers will eventually flush themselves out. At this point, + * readers will be able to tell if they are a re-entrant lock (have a + * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then + * we must let them proceed. If they are not, then the reader blocks for the + * waiting writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE. + */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + refcount_create(&rrl->rr_anon_rcount); + refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + refcount_destroy(&rrl->rr_anon_rcount); + refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl); + (void) refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +static void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (refcount_count(&rrl->rr_anon_rcount) > 0 || + refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + if (rrn_find_and_remove(rrl)) { + if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + + } else { + if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) + cv_broadcast(&rrl->rr_cv); + } + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && + refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!refcount_is_zero(&rrl->rr_anon_rcount) || + !refcount_is_zero(&rrl->rr_linked_rcount)); + } + mutex_exit(&rrl->rr_lock); + + return (held); +}
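A minimal usage sketch for the lock (a hypothetical caller; in this change the real consumers are the ZFS_ENTER/ZFS_EXIT macros and zfsvfs_teardown() further below). The key property is that a thread already holding the lock as reader may take it again even while a writer is queued, while first-time readers yield to the waiting writer:

    rrwlock_t lock;

    rrw_init(&lock);

    /* reader side; the tag pairs an enter with its exit for refcount tracking */
    rrw_enter(&lock, RW_READER, FTAG);
    rrw_enter(&lock, RW_READER, FTAG);  /* re-entrant read: proceeds even if a
                                           writer is already waiting */
    rrw_exit(&lock, FTAG);
    rrw_exit(&lock, FTAG);

    /* writer side, e.g. a teardown: waits for all readers to drain */
    rrw_enter(&lock, RW_WRITER, FTAG);
    ASSERT(rrw_held(&lock, RW_WRITER));
    rrw_exit(&lock, FTAG);

    rrw_destroy(&lock);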
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 6c9867df8a..6e6495e2ec 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -533,6 +533,8 @@ extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp); extern int
dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); +extern void dmu_objset_set_user(objset_t *os, void *user_ptr); +extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. @@ -573,7 +575,9 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp); int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, - boolean_t force, struct vnode *vp, uint64_t voffset); + boolean_t force, boolean_t online, struct vnode *vp, uint64_t voffset, + char *cosname); +int dmu_replay_end_snapshot(char *name, struct drr_begin *drrb); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index 775a777eef..725c771f6a 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -86,6 +86,10 @@ typedef struct objset_impl { list_t os_free_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; } objset_impl_t; #define DMU_META_DNODE_OBJECT 0 diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 2a8d354be4..d02eba1ce7 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -138,6 +138,7 @@ dsl_syncfunc_t dsl_dataset_snapshot_sync; int dsl_dataset_rollback(dsl_dataset_t *ds); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); int dsl_dataset_promote(const char *name); +int dsl_dataset_clone_swap(const char *name, boolean_t force); void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, void *p, dsl_dataset_evict_func_t func); @@ -148,6 +149,8 @@ void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); +boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); + void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/rrwlock.h b/usr/src/uts/common/fs/zfs/sys/rrwlock.h new file mode 100644 index 0000000000..19a43c97fc --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/rrwlock.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus extern "C" { +#endif + +#include <sys/inttypes.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rcount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + refcount_t rr_anon_rcount; + refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; +} rrwlock_t; + +/* + * 'tag' is used in reference count tracking. The + * 'tag' must be the same in a rrw_enter() as in its + * corresponding rrw_exit(). + */ +void rrw_init(rrwlock_t *rrl); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) + +#ifdef __cplusplus } +#endif + +#endif /* _SYS_RR_RW_LOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h index 38b9fbb9fc..ea55a86b9e 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -33,6 +33,7 @@ #include <sys/list.h> #include <sys/vfs.h> #include <sys/zil.h> +#include <sys/rrwlock.h> #ifdef __cplusplus extern "C" {
@@ -53,8 +54,8 @@ struct zfsvfs { uint_t z_acl_inherit; /* acl inheritance behavior */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ - krwlock_t z_unmount_lock; - krwlock_t z_unmount_inactive_lock; + rrwlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ vnode_t *z_ctldir; /* .zfs directory pointer */
@@ -115,6 +116,9 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); + #ifdef __cplusplus } #endif
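The suspend/resume pair declared above is the heart of the online swap. A condensed sketch of the calling pattern (adapted from zfs_ioc_recvbackup() earlier in this diff): zfs_suspend_fs() returns with z_teardown_lock and z_teardown_inactive_lock write-held, so every vop issued in between parks in ZFS_ENTER() instead of seeing a half-swapped objset, and zfs_resume_fs() is what releases them again:

    char osname[MAXNAMELEN];
    int mode;

    error = zfs_suspend_fs(zfsvfs, osname, &mode);  /* tear down and close z_os */
    if (error == 0) {
            /* mutate the underlying dataset, e.g. swap in the received clone */
            error = dsl_dataset_clone_swap(cosname, force);
            /* reopen z_os and drop the teardown locks; force-unmounts on failure */
            error = zfs_resume_fs(zfsvfs, osname, mode);
    }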
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index b562c9e915..8b4ee46218 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -34,6 +34,7 @@ #include <sys/list.h> #include <sys/dmu.h> #include <sys/zfs_vfsops.h> +#include <sys/rrwlock.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h>
@@ -153,6 +154,7 @@ typedef struct znode { uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ + uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ list_node_t z_link_node; /* all znodes in fs link */
@@ -189,18 +191,27 @@ typedef struct znode { /* * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. * ZFS_EXIT() must be called before exiting the vop. + * ZFS_ENTER_VERIFY_ZP() does ZFS_ENTER plus verifies the znode is valid. */ #define ZFS_ENTER(zfsvfs) \ { \ - if (rw_tryenter(&(zfsvfs)->z_unmount_lock, RW_READER) == 0) \ - return (EIO); \ + rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ if ((zfsvfs)->z_unmounted) { \ ZFS_EXIT(zfsvfs); \ return (EIO); \ } \ } -#define ZFS_EXIT(zfsvfs) rw_exit(&(zfsvfs)->z_unmount_lock) +#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +#define ZFS_ENTER_VERIFY_ZP(zfsvfs, zp) \ + { \ + ZFS_ENTER((zfsvfs)); \ + if (!(zp)->z_dbuf_held) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } /* * Macros for dealing with dmu_buf_hold
@@ -250,6 +261,7 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *);
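A schematic vop showing how these macros are meant to be used after this change (zfs_example_getattr is hypothetical, not part of the diff). The znode check matters for online receive: a received stream can delete files out from under live vnodes, in which case zfs_rezget() fails during resume, z_dbuf_held stays clear, and the vop returns EIO instead of touching stale state:

    static int
    zfs_example_getattr(vnode_t *vp)        /* schematic, arguments trimmed */
    {
            znode_t *zp = VTOZ(vp);
            zfsvfs_t *zfsvfs = zp->z_zfsvfs;

            /* re-entrant read hold on z_teardown_lock, plus znode validity check */
            ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);

            /* ... do the real work against zp ... */

            ZFS_EXIT(zfsvfs);               /* drops the read hold */
            return (0);
    }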
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c index 5fe86c98ac..3b2cc409e0 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c +++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -53,6 +53,16 @@ * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted-on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). */ #include <fs/fs_subr.h>
@@ -578,6 +588,9 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr) return (err); } +/* + * This creates a snapshot under '.zfs/snapshot'. + */ /* ARGSUSED */ static int zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
@@ -711,6 +724,9 @@ domount: if (err == 0) { /* * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns + * the ZFS vnode mounted on top of the GFS node. This ZFS + * vnode is the root of the newly created vfsp. */ VFS_RELE(vfsp); err = traverse(vpp);
@@ -718,11 +734,11 @@ domount: if (err == 0) { /* - * Fix up the root vnode. + * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. * * This is where we lie about our v_vfsp in order to - * make .zfs/snapshot/<snapdir> accessible over NFS - * without requiring manual mounts of <snapdir>. + * make .zfs/snapshot/<snapname> accessible over NFS + * without requiring manual mounts of <snapname>. */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
@@ -771,6 +787,13 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp, return (0); } +/* + * pvp is the '.zfs' directory (zfsctl_node_t). + * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). + * + * This function is the callback to create a GFS vnode for '.zfs/snapshot' + * when a lookup is performed on .zfs for "snapshot". + */ vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) {
@@ -838,6 +861,13 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = { { NULL } }; +/* + * pvp is the GFS vnode '.zfs/snapshot'. + * + * This creates a GFS node under '.zfs/snapshot' representing each + * snapshot. This newly created GFS node is what we mount snapshot + * vfs_t's on top of. + */ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) {
@@ -937,6 +967,12 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) if (sep != NULL) { VN_HOLD(vp); + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid> + * and returns the ZFS vnode mounted on top of the GFS node. + * This ZFS vnode is the root of the vfs for objset 'objsetid'. + */ error = traverse(&vp); if (error == 0) { if (vp == sep->se_root)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index a6bad3f8d8..4a5e68b878 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -63,6 +63,8 @@ #include <sys/zvol.h> #include <sharefs/share.h> #include <sys/zfs_znode.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu_objset.h> #include "zfs_namecheck.h" #include "zfs_prop.h"
@@ -1671,7 +1673,8 @@ zfs_ioc_create(zfs_cmd_t *zc) default: cbfunc = NULL; } - if (strchr(zc->zc_name, '@')) + if (strchr(zc->zc_name, '@') || + strchr(zc->zc_name, '%')) return (EINVAL); if (zc->zc_nvlist_src != NULL &&
@@ -1847,7 +1850,8 @@ zfs_ioc_rename(zfs_cmd_t *zc) boolean_t recursive = zc->zc_cookie & 1; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '%')) return (EINVAL); /*
@@ -1869,21 +1873,84 @@ static int zfs_ioc_recvbackup(zfs_cmd_t *zc) { file_t *fp; - int error, fd; offset_t new_off; + objset_t *os; + zfsvfs_t *zfsvfs = NULL; + char *cp; + char cosname[MAXNAMELEN]; + boolean_t force = (boolean_t)zc->zc_guid; + int error, fd; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || - strchr(zc->zc_value, '@') == NULL) + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) return (EINVAL); fd = zc->zc_cookie; fp = getf(fd); if (fp == NULL) return (EBADF); + + /* + * Get the zfsvfs for the receiving objset. There + * won't be one if we're operating on a zvol, if the + * objset doesn't exist yet, or is not mounted. + */ + cp = strchr(zc->zc_value, '@'); + *cp = '\0'; + error = dmu_objset_open(zc->zc_value, DMU_OST_ANY, + DS_MODE_STANDARD | DS_MODE_READONLY, &os); + *cp = '@'; + if (!error) { + if (dmu_objset_type(os) == DMU_OST_ZFS) { + mutex_enter(&os->os->os_user_ptr_lock); + zfsvfs = dmu_objset_get_user(os); + if (zfsvfs != NULL) + VFS_HOLD(zfsvfs->z_vfs); + mutex_exit(&os->os->os_user_ptr_lock); + } + dmu_objset_close(os); + } + error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record, - &zc->zc_cookie, (boolean_t)zc->zc_guid, fp->f_vnode, - fp->f_offset); + &zc->zc_cookie, force, zfsvfs != NULL, fp->f_vnode, + fp->f_offset, cosname); + + /* + * For incremental snapshots where we created a + * temporary clone, we now swap zfsvfs::z_os with + * the newly created and received "cosname".
+ */ + if (!error && zfsvfs != NULL) { + char osname[MAXNAMELEN]; + int mode; + + error = zfs_suspend_fs(zfsvfs, osname, &mode); + if (!error) { + int swap_err; + int snap_err = 0; + + swap_err = dsl_dataset_clone_swap(cosname, force); + if (!swap_err) { + char *cp = strrchr(zc->zc_value, '@'); + + *cp = '\0'; + snap_err = dmu_replay_end_snapshot(zc->zc_value, + &zc->zc_begin_record); + *cp = '@'; + } + error = zfs_resume_fs(zfsvfs, osname, mode); + if (!error) + error = swap_err; + if (!error) + error = snap_err; + } + /* destroy the clone we created */ + (void) dmu_objset_destroy(cosname); + } + if (zfsvfs != NULL) + VFS_RELE(zfsvfs->z_vfs); new_off = fp->f_offset + zc->zc_cookie; if (VOP_SEEK(fp->f_vnode, fp->f_offset, &new_off) == 0) fp->f_offset = new_off; @@ -2327,6 +2394,7 @@ static struct modlinkage modlinkage = { uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; int _init(void) @@ -2345,6 +2413,7 @@ _init(void) } tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, NULL); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 38c1650857..0736cb3224 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -59,6 +59,7 @@ #include <sys/bootconf.h> #include <sys/sunddi.h> #include <sys/dnlc.h> +#include <sys/dmu_objset.h> int zfsfstype; vfsops_t *zfs_vfsops = NULL; @@ -498,6 +499,76 @@ unregister: } static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + uint_t readonly; + int error; + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest doesn't + * use readonly mounts, where zfs_unlinked_drain() isn't + * called.) This is because ziltest causes spa_sync() + * to think it's committed, but actually it is not, so + * the intent log contains many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated in + * a yet later txg. This would write a "create object + * N" record to the intent log. Normally, this would be + * fine because the spa_sync() would have written out + * the fact that object N is free, before we could write + * the "create object N" intent log record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. 
+ */ + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector); + + zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + } + + if (!zil_disable) + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + return (0); +} + +static int zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) { dev_t mount_dev;
@@ -525,8 +596,8 @@ zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rw_init(&zfsvfs->z_unmount_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_unmount_inactive_lock, NULL, RW_DEFAULT, NULL); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0;
@@ -583,54 +654,7 @@ zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) xattr_changed_cb(zfsvfs, xattr); zfsvfs->z_issnap = B_TRUE; } else { - uint_t readonly; - - error = zfs_register_callbacks(vfsp); - if (error) - goto out; - - /* - * During replay we remove the read only flag to - * allow replays to succeed. - */ - readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - if (readonly != 0) - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - else - zfs_unlinked_drain(zfsvfs); - - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest doesn't - * use readonly mounts, where zfs_unlinked_drain() isn't - * called.) This is because ziltest causes spa_sync() - * to think it's committed, but actually it is not, so - * the intent log contains many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated in - * a yet later txg. This would write a "create object - * N" record to the intent log. Normally, this would be - * fine because the spa_sync() would have written out - * the fact that object N is free, before we could write - * the "create object N" intent log record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector); - - zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ - - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + error = zfsvfs_setup(zfsvfs, B_TRUE); } if (!zfsvfs->z_issnap)
@@ -641,8 +665,8 @@ out: dmu_objset_close(zfsvfs->z_os); mutex_destroy(&zfsvfs->z_znodes_lock); list_destroy(&zfsvfs->z_all_znodes); - rw_destroy(&zfsvfs->z_unmount_lock); - rw_destroy(&zfsvfs->z_unmount_inactive_lock); + rrw_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } else { atomic_add_32(&zfs_active_fs_count, 1);
@@ -1019,13 +1043,130 @@ zfs_root(vfs_t *vfsp, vnode_t **vpp) return (error); } +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held.
+ */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + objset_t *os = zfsvfs->z_os; + znode_t *zp, *nextzp; + znode_t markerzp; + + rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + return (EIO); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. + * Note, the dmu can still callback via znode_pageout_func() + * which can zfs_znode_free() the znode. So we lock + * z_all_znodes; search the list for a held dbuf; drop the lock + * (we know zp can't disappear if we hold a dbuf lock) then + * regrab the lock and restart. + * + * Since we have to restart the search after finding each held dbuf, + * we do two things to speed up searching: we insert a dummy znode + * ('markerzp') to detect the original tail of the list, and move + * non-held znodes to the end of the list. Once we hit 'markerzp', + * we know we've looked at each znode and can break out. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, &markerzp); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != &markerzp; + zp = nextzp) { + nextzp = list_next(&zfsvfs->z_all_znodes, zp); + if (zp->z_dbuf_held) { + /* dbufs should only be held when force unmounting */ + zp->z_dbuf_held = 0; + mutex_exit(&zfsvfs->z_znodes_lock); + dmu_buf_rele(zp->z_dbuf, NULL); + /* Start again */ + mutex_enter(&zfsvfs->z_znodes_lock); + nextzp = list_head(&zfsvfs->z_all_znodes); + } else { + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + } + } + list_remove(&zfsvfs->z_all_znodes, &markerzp); + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + (void) dmu_objset_evict_dbufs(os); + + return (0); +} +
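zfsvfs_teardown() now serves two masters, which is why its locking contract differs by caller. The two call sites in this change, side by side (condensed from zfs_umount() and zfs_suspend_fs() below):

    /* zfs_umount(): tear down for good; both teardown locks are released */
    VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);

    /* zfs_suspend_fs(): objset switcheroo for online recv; returns with
       z_teardown_lock and z_teardown_inactive_lock still write-held, to be
       dropped later by zfs_resume_fs() */
    error = zfsvfs_teardown(zfsvfs, B_FALSE);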
+	 */
+	zfs_unregister_callbacks(zfsvfs);
+
+	/*
+	 * Evict cached data
+	 */
+	(void) dmu_objset_evict_dbufs(os);
+
+	return (0);
+}
+
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	objset_t *os = zfsvfs->z_os;
-	znode_t	*zp, *nextzp;
+	objset_t *os;
 	int ret;
 
 	ret = secpolicy_fs_unmount(cr, vfsp);
@@ -1069,79 +1210,35 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 			return (EBUSY);
 		} else {
 			if (vfsp->vfs_count > 2 ||
-			    zfsvfs->z_ctldir->v_count > 1) {
+			    zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
-			}
 		}
 	}
 
 	vfsp->vfs_flag |= VFS_UNMOUNTED;
 
-	rw_enter(&zfsvfs->z_unmount_lock, RW_WRITER);
-	rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_WRITER);
-
-	/*
-	 * At this point there are no vops active, and any new vops will
-	 * fail with EIO since we have z_unmount_lock for writer (only
-	 * relavent for forced unmount).
-	 *
-	 * Release all holds on dbufs.
-	 * Note, the dmu can still callback via znode_pageout_func()
-	 * which can zfs_znode_free() the znode.  So we lock
-	 * z_all_znodes; search the list for a held dbuf; drop the lock
-	 * (we know zp can't disappear if we hold a dbuf lock) then
-	 * regrab the lock and restart.
-	 */
-	mutex_enter(&zfsvfs->z_znodes_lock);
-	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
-		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
-		if (zp->z_dbuf_held) {
-			/* dbufs should only be held when force unmounting */
-			zp->z_dbuf_held = 0;
-			mutex_exit(&zfsvfs->z_znodes_lock);
-			dmu_buf_rele(zp->z_dbuf, NULL);
-			/* Start again */
-			mutex_enter(&zfsvfs->z_znodes_lock);
-			nextzp = list_head(&zfsvfs->z_all_znodes);
-		}
-	}
-	mutex_exit(&zfsvfs->z_znodes_lock);
-
-	/*
-	 * Set the unmounted flag and let new vops unblock.
-	 * zfs_inactive will have the unmounted behavior, and all other
-	 * vops will fail with EIO.
-	 */
-	zfsvfs->z_unmounted = B_TRUE;
-	rw_exit(&zfsvfs->z_unmount_lock);
-	rw_exit(&zfsvfs->z_unmount_inactive_lock);
+	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	os = zfsvfs->z_os;
 
 	/*
-	 * Unregister properties.
+	 * z_os will be NULL if there was an error in
+	 * attempting to reopen zfsvfs.
 	 */
-	if (!dmu_objset_is_snapshot(os))
-		zfs_unregister_callbacks(zfsvfs);
+	if (os != NULL) {
+		/*
+		 * Unset the objset user_ptr.
+		 */
+		mutex_enter(&os->os->os_user_ptr_lock);
+		dmu_objset_set_user(os, NULL);
+		mutex_exit(&os->os->os_user_ptr_lock);
 
-	/*
-	 * Close the zil. NB: Can't close the zil while zfs_inactive
-	 * threads are blocked as zil_close can call zfs_inactive.
-	 */
-	if (zfsvfs->z_log) {
-		zil_close(zfsvfs->z_log);
-		zfsvfs->z_log = NULL;
+		/*
+		 * Finally close the objset
+		 */
+		dmu_objset_close(os);
 	}
 
 	/*
-	 * Evict cached data
-	 */
-	(void) dmu_objset_evict_dbufs(os);
-
-	/*
-	 * Finally close the objset
-	 */
-	dmu_objset_close(os);
-
-	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
@@ -1234,6 +1331,77 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 	return (0);
 }
 
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
+{
+	int error;
+
+	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+		return (error);
+
+	*mode = zfsvfs->z_os->os_mode;
+	dmu_objset_name(zfsvfs->z_os, name);
+	dmu_objset_close(zfsvfs->z_os);
+
+	return (0);
+}
+
+/*
+ * Reopen zfsvfs_t::z_os and release VOPs.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
{
+	int err;
+
+	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+	if (err) {
+		zfsvfs->z_os = NULL;
+	} else {
+		znode_t *zp;
+
+		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+		/*
+		 * Attempt to re-establish all the active znodes with
+		 * their dbufs.  If a zfs_rezget() fails, then we'll let
+		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+		 * when they try to use their znode.
+		 */
+		mutex_enter(&zfsvfs->z_znodes_lock);
+		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+			ASSERT(!zp->z_dbuf_held);
+			(void) zfs_rezget(zp);
+		}
+		mutex_exit(&zfsvfs->z_znodes_lock);
+	}
+
+	/* release the VOPs */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	if (err) {
+		/*
+		 * Since we couldn't reopen zfsvfs::z_os, force
+		 * unmount this file system.
+		 */
+		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
+	}
+	return (err);
+}
+
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
@@ -1245,8 +1413,8 @@ zfs_freevfs(vfs_t *vfsp)
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
-	rw_destroy(&zfsvfs->z_unmount_lock);
-	rw_destroy(&zfsvfs->z_unmount_inactive_lock);
+	rrw_destroy(&zfsvfs->z_teardown_lock);
+	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 
 	atomic_add_32(&zfs_active_fs_count, -1);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 852555b7f3..2e6405be7a 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -83,8 +83,9 @@
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  * (1) A check must be made in each zfs thread for a mounted file system.
- *	This is done avoiding races using ZFS_ENTER(zfsvfs).
- *	A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *	This is done avoiding races using ZFS_ENTER(zfsvfs) or
+ *	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp).  A ZFS_EXIT(zfsvfs) is needed before
+ *	all returns.
 *
 * (2) VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT().  This is for 3 reasons:
@@ -239,6 +240,7 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
 	offset_t off;
 	int error;
 	zfsvfs_t *zfsvfs;
+	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
@@ -257,8 +259,9 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (EFAULT);
 
-		zfsvfs = VTOZ(vp)->z_zfsvfs;
-		ZFS_ENTER(zfsvfs);
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
@@ -398,12 +401,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	objset_t	*os = zfsvfs->z_os;
+	objset_t	*os;
 	ssize_t		n, nbytes;
 	int		error;
 	rl_t		*rl;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	os = zfsvfs->z_os;
 
 	/*
 	 * Validate file offset
@@ -568,7 +572,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
@@ -585,7 +589,8 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	zilog = zfsvfs->z_log;
 
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
@@ -906,7 +911,7 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_zaccess_rwx(zp, mode, cr);
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -941,7 +946,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zdp);
 
 	*vpp = NULL;
 
@@ -1044,14 +1049,16 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
-	objset_t	*os = zfsvfs->z_os;
+	zilog_t		*zilog;
+	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	zoid;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
 
 top:
 	*vpp = NULL;
@@ -1221,7 +1228,7 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
 	znode_t		*xzp = NULL;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
@@ -1229,7 +1236,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
 	boolean_t	unlinked;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 top:
 	/*
@@ -1386,7 +1394,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	zoid = 0;
 	dmu_tx_t	*tx;
@@ -1394,7 +1402,8 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 
 	ASSERT(vap->va_type == VDIR);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
@@ -1483,12 +1492,13 @@ zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 top:
 	zp = NULL;
@@ -1613,7 +1623,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp)
 	int		error;
 	uint8_t		prefetch;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	/*
 	 * If we are not given an eof variable,
@@ -1812,7 +1822,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
 
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (0);
@@ -1837,11 +1847,12 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	znode_phys_t *pzp = zp->z_phys;
+	znode_phys_t *pzp;
 	int	error;
 	uint64_t links;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	pzp = zp->z_phys;
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
@@ -1917,10 +1928,10 @@ static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
 {
-	struct znode	*zp = VTOZ(vp);
-	znode_phys_t	*pzp = zp->z_phys;
+	znode_t		*zp = VTOZ(vp);
+	znode_phys_t	*pzp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	uint_t		mask = vap->va_mask;
@@ -1943,7 +1954,9 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
 		return (EINVAL);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	pzp = zp->z_phys;
+	zilog = zfsvfs->z_log;
 
 top:
 	attrzp = NULL;
@@ -2298,14 +2311,15 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = VTOZ(sdvp);
 	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr, error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, sdzp);
+	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure we have the real vp for the target directory.
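Editor's note: every vop converted above follows the same pattern: enter through the teardown lock, verify the znode is still backed by its dbuf, and only then cache pointers such as z_log, z_phys, or z_os in locals, because an online recv can tear the objset down and rebuild it underneath a live vnode. The macros themselves live in usr/src/uts/common/fs/zfs/sys/zfs_znode.h (changed by this commit but not shown on this page); a plausible expansion, consistent with the z_unmounted check described in zfsvfs_teardown() and the explicit z_dbuf_held checks added to zfs_rename() and zfs_link() in the next hunks, is:

/*
 * Illustrative sketch only -- not the literal header contents.
 * Grounded in behavior visible in this diff: vops fail with EIO once
 * z_teardown_lock is write-held or a znode has lost its dbuf.
 */
#define	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp)					\
	{								\
		/* reader side of the re-entrant teardown lock */	\
		rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG);	\
		if ((zfsvfs)->z_unmounted) {				\
			ZFS_EXIT(zfsvfs);				\
			return (EIO);					\
		}							\
		/* znode not re-established after an online recv */	\
		if (!(zp)->z_dbuf_held) {				\
			ZFS_EXIT(zfsvfs);				\
			return (EIO);					\
		}							\
	}

Deferring initializers like "zilog = zfsvfs->z_log" until after this check matters: zfsvfs_teardown() NULLs z_log, and zfsvfs_setup() opens a fresh one on resume, so a value captured before the check could be stale.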
@@ -2319,6 +2333,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
 	}
 
 	tdzp = VTOZ(tdvp);
+	if (!tdzp->z_dbuf_held) {
+		ZFS_EXIT(zfsvfs);
+		return (EIO);
+	}
 top:
 	szp = NULL;
 	tzp = NULL;
@@ -2529,14 +2547,15 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	uint64_t	zoid;
 	int		len = strlen(link);
 	int		error;
 
 	ASSERT(vap->va_type == VLNK);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 top:
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 		ZFS_EXIT(zfsvfs);
@@ -2650,7 +2669,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
 	size_t		bufsz;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	bufsz = (size_t)zp->z_phys->zp_size;
 	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
@@ -2695,7 +2714,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
@@ -2703,7 +2722,8 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
 
 	ASSERT(tdvp->v_type == VDIR);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 	if (VOP_REALVP(svp, &realvp) == 0)
 		svp = realvp;
@@ -2714,6 +2734,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
 	}
 
 	szp = VTOZ(svp);
+	if (!szp->z_dbuf_held) {
+		ZFS_EXIT(zfsvfs);
+		return (EIO);
+	}
 top:
 	/*
 	 * We do not support links between attributes and non-attributes
@@ -2947,7 +2971,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
 	uint64_t	filesz;
 	int		error = 0;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	ASSERT(zp->z_dbuf_held && zp->z_phys);
 
@@ -3005,10 +3029,8 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_READER);
-	if (zfsvfs->z_unmounted) {
-		ASSERT(zp->z_dbuf_held == 0);
-
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_dbuf_held == 0) {
 		if (vn_has_cached_data(vp)) {
 			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
 			    B_INVAL, cr);
@@ -3022,7 +3044,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
 		} else {
 			mutex_exit(&zp->z_lock);
 		}
-		rw_exit(&zfsvfs->z_unmount_inactive_lock);
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
@@ -3053,7 +3075,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
 	}
 
 	zfs_zinactive(zp);
-	rw_exit(&zfsvfs->z_unmount_inactive_lock);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 /*
@@ -3087,7 +3109,7 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	/*
 	 * We are following the UFS semantics with respect to mapcnt
@@ -3239,7 +3261,7 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
 	int		need_unlock = 0, err = 0;
 	offset_t	orig_off;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	if (protp)
 		*protp = PROT_ALL;
@@ -3371,7 +3393,7 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
 	segvn_crargs_t	vn_a;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	if (vp->v_flag & VNOMAP) {
 		ZFS_EXIT(zfsvfs);
@@ -3507,7 +3529,7 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
 	uint64_t	off, len;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 top:
 	if (cmd != F_FREESP) {
@@ -3542,12 +3564,13 @@ zfs_fid(vnode_t *vp, fid_t *fidp)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
+	uint32_t	gen;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		size, i;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	gen = (uint32_t)zp->z_gen;
 
 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
 	if (fidp->fid_len < size) {
@@ -3607,7 +3630,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
-		ZFS_ENTER(zfsvfs);
+		ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
@@ -3647,7 +3670,7 @@ zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_getacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 
@@ -3662,7 +3685,7 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_setacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 	return (error);
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 1ac95c5537..7415a15e74 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -41,7 +41,6 @@
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
-#include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/mode.h>
@@ -417,6 +416,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
+	zp->z_gen = zp->z_phys->zp_gen;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -706,6 +706,53 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 	return (0);
 }
 
+int
+zfs_rezget(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	uint64_t obj_num = zp->z_id;
+	int err;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
+		dmu_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EINVAL);
+	}
+
+	ASSERT(db->db_object == obj_num);
+	ASSERT(db->db_offset == -1);
+	ASSERT(db->db_data != NULL);
+
+	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
+		dmu_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EIO);
+	}
+
+	zp->z_dbuf = db;
+	zp->z_phys = db->db_data;
+	zfs_znode_dmu_init(zp);
+	zp->z_unlinked = (zp->z_phys->zp_links == 0);
+
+	/* release the hold from zfs_znode_dmu_init() */
+	VFS_RELE(zfsvfs->z_vfs);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+	return (0);
+}
+
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
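Editor's note: taken together, zfs_suspend_fs() and zfs_resume_fs() are what lets a receive proceed while the target file system stays mounted. The real consumer is in usr/src/uts/common/fs/zfs/zfs_ioctl.c (changed by this commit but not shown on this page); a rough kernel-context sketch of the calling sequence, with do_receive_stream() standing in as a hypothetical placeholder for the actual DMU receive work, looks like:

/*
 * Hypothetical consumer of the suspend/resume pair, for illustration
 * only.  Signatures of zfs_suspend_fs()/zfs_resume_fs() are as added
 * by this commit; everything else is a stand-in.
 */
static int
recv_into_mounted_fs(zfsvfs_t *zfsvfs)
{
	char osname[MAXNAMELEN];
	int mode;
	int error, resume_err;

	/*
	 * Block out vops, close the ZIL, drop dbuf holds, and close
	 * z_os; on success both teardown locks are returned write-held.
	 */
	error = zfs_suspend_fs(zfsvfs, osname, &mode);
	if (error != 0)
		return (error);

	error = do_receive_stream(osname);	/* hypothetical */

	/*
	 * Always attempt to resume: reopen z_os, replay the ZIL via
	 * zfsvfs_setup(), zfs_rezget() every cached znode, and drop the
	 * locks.  If the reopen fails, zfs_resume_fs() force-unmounts.
	 */
	resume_err = zfs_resume_fs(zfsvfs, osname, mode);

	return (error != 0 ? error : resume_err);
}

Note how the pieces fit: any znode whose object generation changed across the receive fails zfs_rezget()'s zp_gen comparison, keeps z_dbuf_held clear, and is fenced off with EIO by ZFS_ENTER_VERIFY_ZP the next time it is used, rather than silently pointing at a reallocated object.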