author:    ek110237 <none@none>  2007-10-24 16:54:46 -0700
committer: ek110237 <none@none>  2007-10-24 16:54:46 -0700
commit:    f18faf3f3e5def85fdfff681617d227703ace2ad (patch)
tree:      f3e763ede9b38b1c489a18a8bf6a649314201e39 /usr/src
parent:    8696d418011068e5cedf3a229f7a6613e7798e92 (diff)
6425096 want online 'zfs recv' (read only and read/write)
6597182 .zfs/snapshot code could use a little more comments
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/common/zfs/zfs_namecheck.c             6
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c    62
-rw-r--r--  usr/src/uts/common/Makefile.files              1
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c        21
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c         193
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c      205
-rw-r--r--  usr/src/uts/common/fs/zfs/rrwlock.c          249
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h            6
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h     4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h    3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/rrwlock.h       80
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h     8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h     18
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ctldir.c        42
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c         81
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c       400
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c        123
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c         49
18 files changed, 1294 insertions, 257 deletions
diff --git a/usr/src/common/zfs/zfs_namecheck.c b/usr/src/common/zfs/zfs_namecheck.c
index 0bfe9be296..cee25a62b1 100644
--- a/usr/src/common/zfs/zfs_namecheck.c
+++ b/usr/src/common/zfs/zfs_namecheck.c
@@ -54,14 +54,14 @@ valid_char(char c)
return ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
+ c == '-' || c == '_' || c == '.' || c == ':' || c == '%');
}
/*
* Snapshot names must be made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.:%]
*/
int
snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -126,7 +126,7 @@ permset_namecheck(const char *path, namecheck_err_t *why, char *what)
* Where each component is made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:]
+ * [-_.:%]
*/
int
dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
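The '%' character is reserved for the hidden temporary clones that online 'zfs recv' creates (e.g. "pool/fs/%snap1", built in dmu_send.c below), so the shared namecheck code must accept it while user-initiated operations still reject it. A minimal sketch of the resulting policy, using the routines above (name_is_user_settable() is a hypothetical helper, not part of this patch):

	/*
	 * Hypothetical helper: kernel-generated clone names such as
	 * "pool/fs/%snap1" pass dataset_namecheck(), but names a user
	 * may create or rename to must additionally be '%'-free.
	 */
	static int
	name_is_user_settable(const char *path)
	{
		return (dataset_namecheck(path, NULL, NULL) == 0 &&
		    strchr(path, '%') == NULL);
	}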
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index 8a00f94c7e..04a37032ea 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -130,7 +130,8 @@ path_to_str(const char *path, int types)
* 'buf' detailing exactly why the name was not valid.
*/
static int
-zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type)
+zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
+ boolean_t modifying)
{
namecheck_err_t why;
char what;
@@ -203,13 +204,20 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type)
return (0);
}
+ if (modifying && strchr(path, '%') != NULL) {
+ if (hdl != NULL)
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "invalid character %c in name"), '%');
+ return (0);
+ }
+
return (-1);
}
int
zfs_name_valid(const char *name, zfs_type_t type)
{
- return (zfs_validate_name(NULL, name, type));
+ return (zfs_validate_name(NULL, name, type, B_FALSE));
}
/*
@@ -420,7 +428,7 @@ zfs_open(libzfs_handle_t *hdl, const char *path, int types)
/*
* Validate the name before we even try to open it.
*/
- if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET)) {
+ if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid dataset name"));
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
@@ -2428,7 +2436,7 @@ zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
{
zfs_handle_t *zhp;
- if (!zfs_validate_name(hdl, path, types))
+ if (!zfs_validate_name(hdl, path, types, B_FALSE))
return (B_FALSE);
/*
@@ -2486,7 +2494,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
"cannot create '%s'"), path);
/* validate the path, taking care to note the extended error message */
- if (!zfs_validate_name(hdl, path, type))
+ if (!zfs_validate_name(hdl, path, type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* validate parents exist */
@@ -2777,7 +2785,7 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
"cannot create '%s'"), target);
/* validate the target name */
- if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM))
+ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* validate parents exist */
@@ -3042,7 +3050,7 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive)
"cannot snapshot '%s'"), path);
/* validate the target name */
- if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT))
+ if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
/* make sure the parent exists and is of the appropriate type */
@@ -3246,7 +3254,6 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
dmu_replay_record_t drr;
struct drr_begin *drrb = &zc.zc_begin_record;
char errbuf[1024];
- prop_changelist_t *clp;
char chopprefix[ZFS_MAXNAMELEN];
begin_time = time(NULL);
@@ -3331,7 +3338,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
(void) strcpy(zc.zc_value, tosnap);
(void) strncat(zc.zc_value, drr.drr_u.drr_begin.drr_toname+choplen,
sizeof (zc.zc_value));
- if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT))
+ if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
(void) strcpy(zc.zc_name, zc.zc_value);
@@ -3347,26 +3354,10 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (h == NULL)
return (-1);
- if (!dryrun) {
- /*
- * We need to unmount all the dependents of the dataset
- * and the dataset itself. If it's a volume
- * then remove device link.
- */
- if (h->zfs_type == ZFS_TYPE_FILESYSTEM) {
- clp = changelist_gather(h, ZFS_PROP_NAME, 0);
- if (clp == NULL)
- return (-1);
- if (changelist_prefix(clp) != 0) {
- changelist_free(clp);
- return (-1);
- }
- } else {
- if (zvol_remove_link(hdl, h->zfs_name) != 0) {
- zfs_close(h);
- return (-1);
- }
-
+ if (!dryrun && h->zfs_type == ZFS_TYPE_VOLUME) {
+ if (zvol_remove_link(hdl, h->zfs_name) != 0) {
+ zfs_close(h);
+ return (-1);
}
}
zfs_close(h);
@@ -3474,13 +3465,8 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, int isprefix,
if (err == 0 && ioctl_err == 0)
err = zvol_create_link(hdl,
zc.zc_value);
- } else {
- if (drrb->drr_fromguid) {
- err = changelist_postfix(clp);
- changelist_free(clp);
- } else {
- err = zfs_mount(h, NULL, 0);
- }
+ } else if (!drrb->drr_fromguid) {
+ err = zfs_mount(h, NULL, 0);
}
zfs_close(h);
}
@@ -3750,7 +3736,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
errbuf));
}
}
- if (!zfs_validate_name(hdl, target, zhp->zfs_type))
+ if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
} else {
if (recursive) {
@@ -3759,7 +3745,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
}
- if (!zfs_validate_name(hdl, target, zhp->zfs_type))
+ if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
uint64_t unused;
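The new 'modifying' flag carries that same policy through libzfs: lookups may name a kernel-created '%' clone, while anything that creates or renames a dataset rejects '%'. In sketch form, using the calls above:

	/* opening or testing an existing dataset: '%' tolerated */
	if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE))
		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));

	/* creating, cloning, snapshotting, renaming: '%' rejected */
	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));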
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index ba15278285..c86259c8b1 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1010,6 +1010,7 @@ ZFS_OBJS += \
zfs_log.o \
zfs_replay.o \
zfs_rlock.o \
+ rrwlock.o \
zfs_vfsops.o \
zfs_vnops.o \
zvol.o
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 8753f062d3..2758d84791 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -44,7 +44,6 @@
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
-
spa_t *
dmu_objset_spa(objset_t *os)
{
@@ -244,6 +243,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
osi->os_meta_dnode = dnode_special_open(osi,
&osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
@@ -266,10 +266,10 @@ int
dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
objset_t **osp)
{
- dsl_dataset_t *ds;
- int err;
objset_t *os;
+ dsl_dataset_t *ds;
objset_impl_t *osi;
+ int err;
os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
err = dsl_dataset_open(name, mode, os, &ds);
@@ -387,6 +387,7 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
mutex_destroy(&osi->os_lock);
mutex_destroy(&osi->os_obj_lock);
+ mutex_destroy(&osi->os_user_ptr_lock);
kmem_free(osi, sizeof (objset_impl_t));
}
@@ -1049,3 +1050,17 @@ dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
err = func(name, arg);
return (err);
}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ os->os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+ return (os->os->os_user_ptr);
+}
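Both accessors assert that os_user_ptr_lock is held, so every consumer brackets them with the mutex. A usage sketch matching the callers added later in this patch (zfs_vfsops.c publishes the pointer; zfs_ioctl.c looks it up and takes a VFS hold before dropping the lock):

	/* publish the zfsvfs_t for this objset */
	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);

	/* look it up, holding the vfs before the lock is dropped */
	mutex_enter(&os->os->os_user_ptr_lock);
	zfsvfs = dmu_objset_get_user(os);
	if (zfsvfs != NULL)
		VFS_HOLD(zfsvfs->z_vfs);
	mutex_exit(&os->os->os_user_ptr_lock);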
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index d1b5cc1ecc..812abd0265 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -295,12 +295,9 @@ struct restorearg {
zio_cksum_t zc;
};
-/* ARGSUSED */
static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+replay_incremental_check(dsl_dataset_t *ds, struct drr_begin *drrb)
{
- dsl_dataset_t *ds = arg1;
- struct drr_begin *drrb = arg2;
const char *snapname;
int err;
uint64_t val;
@@ -312,10 +309,6 @@ replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* most recent snapshot must match fromguid */
if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
return (ENODEV);
- /* must not have any changes since most recent snapshot */
- if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (ETXTBSY);
/* new snapshot name must not exist */
snapname = strrchr(drrb->drr_toname, '@');
@@ -326,16 +319,31 @@ replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
if (err == 0)
- return (EEXIST);
+ return (EEXIST);
if (err != ENOENT)
- return (err);
+ return (err);
return (0);
}
/* ARGSUSED */
+static int
+replay_offline_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct drr_begin *drrb = arg2;
+
+ /* must not have any changes since most recent snapshot */
+ if (dsl_dataset_modified_since_lastsnap(ds))
+ return (ETXTBSY);
+
+ return (replay_incremental_check(ds, drrb));
+}
+
+/* ARGSUSED */
static void
-replay_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+replay_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr,
+ dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -402,6 +410,57 @@ replay_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
}
+struct onlineincarg {
+ dsl_dir_t *dd;
+ dsl_dataset_t *ohds;
+ boolean_t force;
+ const char *cosname;
+};
+
+/* ARGSUSED */
+static int
+replay_online_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ struct onlineincarg *oia = arg1;
+
+ if (dsl_dataset_modified_since_lastsnap(oia->ohds) && !oia->force)
+ return (ETXTBSY);
+
+ return (replay_incremental_check(oia->ohds, arg2));
+}
+
+/* ARGSUSED */
+static void
+replay_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ struct onlineincarg *oia = arg1;
+ dsl_dataset_t *ohds = oia->ohds;
+ dsl_dir_t *dd = oia->dd;
+ dsl_dataset_t *ods, *ds;
+ uint64_t dsobj;
+
+ VERIFY(0 == dsl_dataset_open_obj(ohds->ds_dir->dd_pool,
+ ohds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_STANDARD, FTAG, &ods));
+
+ dsobj = dsl_dataset_create_sync(dd, strrchr(oia->cosname, '/') + 1,
+ ods, tx);
+
+ /* open the temporary clone */
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+ ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
+ ds->ds_phys->ds_dir_obj);
+
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+}
+
static int
replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -729,13 +788,16 @@ restore_free(struct restorearg *ra, objset_t *os,
int
dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, vnode_t *vp, uint64_t voffset)
+ boolean_t force, boolean_t online, vnode_t *vp, uint64_t voffset,
+ char *cosname)
{
struct restorearg ra;
dmu_replay_record_t *drr;
char *cp;
objset_t *os = NULL;
zio_cksum_t pzc;
+ char *clonebuf = NULL;
+ size_t len;
bzero(&ra, sizeof (ra));
ra.vp = vp;
@@ -790,8 +852,9 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
/*
* Process the begin in syncing context.
*/
- if (drrb->drr_fromguid) {
- /* incremental backup */
+ if (drrb->drr_fromguid && !online) {
+ /* offline incremental receive */
+
dsl_dataset_t *ds = NULL;
cp = strchr(tosnap, '@');
@@ -816,11 +879,52 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
(void) dsl_dataset_rollback(ds);
}
ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- replay_incremental_check, replay_incremental_sync,
- ds, drrb, 1);
+ replay_offline_incremental_check,
+ replay_offline_incremental_sync, ds, drrb, 1);
dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ } else if (drrb->drr_fromguid && online) {
+ /* online incremental receive */
+
+ const char *tail;
+ struct onlineincarg oia = { 0 };
+
+ /*
+ * Get the dsl_dir for the parent of the
+ * temporary clone.
+ */
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+
+ /* tmp clone is: tosnap + '/' + '%' + "snapX" */
+ len = strlen(tosnap) + 2 + strlen(cp + 1) + 1;
+ clonebuf = kmem_alloc(len, KM_SLEEP);
+ (void) snprintf(clonebuf, len, "%s%c%c%s%c",
+ tosnap, '/', '%', cp + 1, '\0');
+ ra.err = dsl_dir_open(tosnap, FTAG, &oia.dd, &tail);
+ *cp = '@';
+ if (ra.err)
+ goto out;
+
+ /* open the dataset we are logically receiving into */
+ *cp = '\0';
+ ra.err = dsl_dataset_open(tosnap, DS_MODE_STANDARD,
+ FTAG, &oia.ohds);
+ *cp = '@';
+ if (ra.err) {
+ dsl_dir_close(oia.dd, FTAG);
+ goto out;
+ }
+
+ oia.force = force;
+ oia.cosname = clonebuf;
+ ra.err = dsl_sync_task_do(oia.dd->dd_pool,
+ replay_online_incremental_check,
+ replay_online_incremental_sync, &oia, drrb, 5);
+ dsl_dataset_close(oia.ohds, DS_MODE_STANDARD, FTAG);
+ dsl_dir_close(oia.dd, FTAG);
} else {
/* full backup */
+
dsl_dir_t *dd = NULL;
const char *tail;
@@ -854,8 +958,8 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
cp = strchr(tosnap, '@');
*cp = '\0';
- ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
- DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
+ ra.err = dmu_objset_open(clonebuf == NULL ? tosnap : clonebuf,
+ DMU_OST_ANY, DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
*cp = '@';
ASSERT3U(ra.err, ==, 0);
@@ -918,9 +1022,11 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
goto out;
}
- ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
- ds_dir->dd_pool, replay_end_check, replay_end_sync,
- os, drrb, 3);
+ if (clonebuf == NULL) {
+ ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
+ ds_dir->dd_pool, replay_end_check,
+ replay_end_sync, os, drrb, 3);
+ }
goto out;
}
default:
@@ -931,8 +1037,11 @@ dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
}
out:
- if (os)
+ if (os) {
+ if (drrb->drr_fromguid && online && !ra.err)
+ dmu_objset_name(os, cosname);
dmu_objset_close(os);
+ }
/*
* Make sure we don't rollback/destroy unless we actually
@@ -949,15 +1058,29 @@ out:
cp = strchr(tosnap, '@');
*cp = '\0';
- err = dsl_dataset_open(tosnap,
+ err = dsl_dataset_open(clonebuf == NULL ? tosnap : clonebuf,
DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
FTAG, &ds);
if (err == 0) {
txg_wait_synced(ds->ds_dir->dd_pool, 0);
if (drrb->drr_fromguid) {
- /* incremental: rollback to most recent snap */
- (void) dsl_dataset_rollback(ds);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (clonebuf != NULL) {
+ /*
+ * online incremental: destroy
+ * the temporarily created clone.
+ */
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
+ FTAG);
+ (void) dmu_objset_destroy(clonebuf);
+ } else {
+ /*
+ * offline incremental: rollback to
+ * most recent snapshot.
+ */
+ (void) dsl_dataset_rollback(ds);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
+ FTAG);
+ }
} else {
/* full: destroy whole fs */
dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
@@ -967,8 +1090,26 @@ out:
*cp = '@';
}
+ if (clonebuf != NULL)
+ kmem_free(clonebuf, len);
kmem_free(ra.buf, ra.bufsize);
if (sizep)
*sizep = ra.voff;
return (ra.err);
}
+
+int
+dmu_replay_end_snapshot(char *name, struct drr_begin *drrb)
+{
+ objset_t *os;
+ int err;
+
+ err = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_STANDARD, &os);
+ if (err)
+ return (err);
+
+ err = dsl_sync_task_do(dmu_objset_ds(os)->ds_dir->dd_pool,
+ replay_end_check, replay_end_sync, os, drrb, 3);
+ dmu_objset_close(os);
+ return (err);
+}
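For concreteness, a worked example of the temporary clone name constructed above:

	/*
	 * Receiving into tosnap = "tank/fs@snap1" gives
	 *
	 *   len      = strlen("tank/fs") + 2 + strlen("snap1") + 1 = 15
	 *   clonebuf = "tank/fs" + "/" + "%" + "snap1"
	 *            = "tank/fs/%snap1"
	 *
	 * i.e. a hidden child dataset of the target filesystem, legal
	 * only because '%' was added to the namecheck character set.
	 */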
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index 08bc980ffb..1cba47175a 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -1535,6 +1535,21 @@ dsl_dataset_space(dsl_dataset_t *ds,
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
+boolean_t
+dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+ if (ds->ds_prev == NULL)
+ return (B_FALSE);
+ if (ds->ds_phys->ds_bp.blk_birth >
+ ds->ds_prev->ds_phys->ds_creation_txg)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
/* ARGSUSED */
static int
dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1601,7 +1616,7 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
}
-struct renamearg {
+struct renamesnaparg {
dsl_sync_task_group_t *dstg;
char failed[MAXPATHLEN];
char *oldsnap;
@@ -1611,7 +1626,7 @@ struct renamearg {
static int
dsl_snapshot_rename_one(char *name, void *arg)
{
- struct renamearg *ra = arg;
+ struct renamesnaparg *ra = arg;
dsl_dataset_t *ds = NULL;
char *cp;
int err;
@@ -1659,7 +1674,7 @@ static int
dsl_recursive_rename(char *oldname, const char *newname)
{
int err;
- struct renamearg *ra;
+ struct renamesnaparg *ra;
dsl_sync_task_t *dst;
spa_t *spa;
char *cp, *fsname = spa_strdup(oldname);
@@ -1674,7 +1689,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
kmem_free(fsname, len + 1);
return (err);
}
- ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
+ ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
ra->oldsnap = strchr(oldname, '@') + 1;
@@ -1704,7 +1719,7 @@ dsl_recursive_rename(char *oldname, const char *newname)
(void) strcpy(oldname, ra->failed);
dsl_sync_task_group_destroy(ra->dstg);
- kmem_free(ra, sizeof (struct renamearg));
+ kmem_free(ra, sizeof (struct renamesnaparg));
spa_close(spa, FTAG);
return (err);
}
@@ -2051,6 +2066,186 @@ dsl_dataset_promote(const char *name)
return (err);
}
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+/* ARGSUSED */
+static int
+dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *cds = arg1; /* clone to become new head */
+ boolean_t *forcep = arg2;
+ dsl_dir_t *cdd = cds->ds_dir;
+ dsl_pool_t *dp = cds->ds_dir->dd_pool;
+ dsl_dataset_t *ods; /* the snapshot cds is cloned off of */
+ dsl_dataset_t *ohds = NULL;
+ dsl_dir_t *odd;
+ int err;
+
+ /* check that it is a clone */
+ if (cdd->dd_phys->dd_clone_parent_obj == 0)
+ return (EINVAL);
+
+ /* check that cds is not a snapshot */
+ if (dsl_dataset_is_snapshot(cds))
+ return (EINVAL);
+
+ /* open the origin */
+ if (err = dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods))
+ return (err);
+ odd = ods->ds_dir;
+
+ /* make sure the clone is descendant of origin */
+ if (cdd->dd_parent != odd) {
+ err = EINVAL;
+ goto out;
+ }
+
+ /* check that there are no snapshots after the origin */
+ if (cds->ds_phys->ds_prev_snap_obj != ods->ds_object ||
+ ods->ds_phys->ds_next_snap_obj !=
+ odd->dd_phys->dd_head_dataset_obj) {
+ err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Verify origin head dataset hasn't been modified or
+ * 'force' has been passed down.
+ */
+ if (!(*forcep) &&
+ (err = dsl_dataset_open_obj(cdd->dd_pool,
+ odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &ohds)) == 0) {
+ if (dsl_dataset_modified_since_lastsnap(ohds))
+ err = ETXTBSY;
+ dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
+ }
+out:
+ dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+ dsl_dataset_t *cds = arg1; /* clone to become new head */
+ dsl_dir_t *cdd = cds->ds_dir;
+ dsl_pool_t *dp = cds->ds_dir->dd_pool;
+ dsl_dataset_t *ods, *ohds;
+ dsl_dir_t *odd;
+ uint64_t itor = 0;
+ blkptr_t bp;
+ uint64_t unique = 0;
+ int err;
+
+ ASSERT(cdd->dd_phys->dd_clone_parent_obj != 0);
+ ASSERT(dsl_dataset_is_snapshot(cds) == 0);
+
+ /* open the origin */
+ VERIFY(0 == dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods));
+ odd = ods->ds_dir;
+ ASSERT(cds->ds_phys->ds_prev_snap_obj == ods->ds_object);
+ ASSERT(ods->ds_phys->ds_next_snap_obj ==
+ odd->dd_phys->dd_head_dataset_obj);
+
+ /* open the origin head */
+ VERIFY(0 == dsl_dataset_open_obj(cdd->dd_pool,
+ odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &ohds));
+ ASSERT(odd == ohds->ds_dir);
+
+ dmu_buf_will_dirty(cds->ds_dbuf, tx);
+ dmu_buf_will_dirty(ohds->ds_dbuf, tx);
+ dmu_buf_will_dirty(ods->ds_dbuf, tx);
+
+ /* compute unique space */
+ while ((err = bplist_iterate(&cds->ds_deadlist, &itor, &bp)) == 0) {
+ if (bp.blk_birth > ods->ds_phys->ds_prev_snap_txg)
+ unique += bp_get_dasize(cdd->dd_pool->dp_spa, &bp);
+ }
+ VERIFY(err == ENOENT);
+
+ /* reset origin's unique bytes */
+ ods->ds_phys->ds_unique_bytes = unique;
+
+ /* swap blkptrs */
+ {
+ blkptr_t tmp;
+ tmp = ohds->ds_phys->ds_bp;
+ ohds->ds_phys->ds_bp = cds->ds_phys->ds_bp;
+ cds->ds_phys->ds_bp = tmp;
+ }
+
+ /* set dd_*_bytes */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ VERIFY(0 == bplist_space(&cds->ds_deadlist, &cdl_used,
+ &cdl_comp, &cdl_uncomp));
+ VERIFY(0 == bplist_space(&ohds->ds_deadlist, &odl_used,
+ &odl_comp, &odl_uncomp));
+ dused = cds->ds_phys->ds_used_bytes + cdl_used -
+ (ohds->ds_phys->ds_used_bytes + odl_used);
+ dcomp = cds->ds_phys->ds_compressed_bytes + cdl_comp -
+ (ohds->ds_phys->ds_compressed_bytes + odl_comp);
+ duncomp = cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp -
+ (ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+
+ dsl_dir_diduse_space(odd, dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(cdd, -dused, -dcomp, -duncomp, tx);
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(ohds->ds_phys->ds_used_bytes, cds->ds_phys->ds_used_bytes);
+ SWITCH64(ohds->ds_phys->ds_compressed_bytes,
+ cds->ds_phys->ds_compressed_bytes);
+ SWITCH64(ohds->ds_phys->ds_uncompressed_bytes,
+ cds->ds_phys->ds_uncompressed_bytes);
+
+ /* swap deadlists */
+ bplist_close(&cds->ds_deadlist);
+ bplist_close(&ohds->ds_deadlist);
+ SWITCH64(ohds->ds_phys->ds_deadlist_obj, cds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&cds->ds_deadlist, dp->dp_meta_objset,
+ cds->ds_phys->ds_deadlist_obj));
+ VERIFY(0 == bplist_open(&ohds->ds_deadlist, dp->dp_meta_objset,
+ ohds->ds_phys->ds_deadlist_obj));
+
+ dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
+ dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+}
+
+/*
+ * Swap the clone "cosname" with its origin head file system.
+ */
+int
+dsl_dataset_clone_swap(const char *cosname, boolean_t force)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_open(cosname,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_clone_swap_check,
+ dsl_dataset_clone_swap_sync, ds, &force, 9);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ return (err);
+}
+
/*
* Given a pool name and a dataset object number in that pool,
* return the name of that dataset.
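dsl_dataset_clone_swap() is what makes the receive "online": once the stream has landed in the temporary clone, a single synced task exchanges the clone's block pointer, space accounting, and deadlist with the origin head, so the mounted filesystem picks up the received state atomically. From the caller's side (mirroring zfs_ioc_recvbackup() below):

	/* the '%' clone now holds the fully received stream */
	error = dsl_dataset_clone_swap(cosname, force);
	if (error == 0) {
		/*
		 * The head filesystem now has the received contents;
		 * the clone holds the old head state and is destroyed.
		 */
		(void) dmu_objset_destroy(cosname);
	}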
diff --git a/usr/src/uts/common/fs/zfs/rrwlock.c b/usr/src/uts/common/fs/zfs/rrwlock.c
new file mode 100644
index 0000000000..710685dbc7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/rrwlock.c
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t. This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock). Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out. At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let it proceed. If they are not, then the reader blocks for the
+ * waiting writers. Hence, we do not starve writers.
+ */
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+ struct rrw_node *rn_next;
+ rrwlock_t *rn_rrl;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ if (refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (NULL);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl)
+ return (rn);
+ }
+ return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+ rn->rn_rrl = rrl;
+ rn->rn_next = tsd_get(rrw_tsd_key);
+ VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+ rrw_node_t *prev = NULL;
+
+ if (refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (B_FALSE);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl) {
+ if (prev)
+ prev->rn_next = rn->rn_next;
+ else
+ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+ kmem_free(rn, sizeof (*rn));
+ return (B_TRUE);
+ }
+ prev = rn;
+ }
+ return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl)
+{
+ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+ rrl->rr_writer = NULL;
+ refcount_create(&rrl->rr_anon_rcount);
+ refcount_create(&rrl->rr_linked_rcount);
+ rrl->rr_writer_wanted = B_FALSE;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+ mutex_destroy(&rrl->rr_lock);
+ cv_destroy(&rrl->rr_cv);
+ ASSERT(rrl->rr_writer == NULL);
+ refcount_destroy(&rrl->rr_anon_rcount);
+ refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+ ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+ while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+ refcount_is_zero(&rrl->rr_anon_rcount) &&
+ rrn_find(rrl) == NULL))
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+ if (rrl->rr_writer_wanted) {
+ /* may or may not be a re-entrant enter */
+ rrn_add(rrl);
+ (void) refcount_add(&rrl->rr_linked_rcount, tag);
+ } else {
+ (void) refcount_add(&rrl->rr_anon_rcount, tag);
+ }
+ ASSERT(rrl->rr_writer == NULL);
+ mutex_exit(&rrl->rr_lock);
+}
+
+static void
+rrw_enter_write(rrwlock_t *rrl)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+
+ while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
+ refcount_count(&rrl->rr_linked_rcount) > 0 ||
+ rrl->rr_writer != NULL) {
+ rrl->rr_writer_wanted = B_TRUE;
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+ }
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_writer = curthread;
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrw_enter_read(rrl, tag);
+ else
+ rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !refcount_is_zero(&rrl->rr_linked_rcount) ||
+ rrl->rr_writer != NULL);
+
+ if (rrl->rr_writer == NULL) {
+ if (rrn_find_and_remove(rrl)) {
+ if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
+ cv_broadcast(&rrl->rr_cv);
+
+ } else {
+ if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
+ cv_broadcast(&rrl->rr_cv);
+ }
+ } else {
+ ASSERT(rrl->rr_writer == curthread);
+ ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
+ refcount_is_zero(&rrl->rr_linked_rcount));
+ rrl->rr_writer = NULL;
+ cv_broadcast(&rrl->rr_cv);
+ }
+ mutex_exit(&rrl->rr_lock);
+}
+
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+ boolean_t held;
+
+ mutex_enter(&rrl->rr_lock);
+ if (rw == RW_WRITER) {
+ held = (rrl->rr_writer == curthread);
+ } else {
+ held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !refcount_is_zero(&rrl->rr_linked_rcount));
+ }
+ mutex_exit(&rrl->rr_lock);
+
+ return (held);
+}
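A usage sketch for the new lock (kernel context assumed): reads may nest, even while a writer is waiting, and each rrw_enter() must be paired with an rrw_exit() using the same tag:

	rrwlock_t lock;

	rrw_init(&lock);

	rrw_enter(&lock, RW_READER, FTAG);	/* first read */
	rrw_enter(&lock, RW_READER, FTAG);	/* re-entrant read: legal */
	ASSERT(RRW_READ_HELD(&lock));
	rrw_exit(&lock, FTAG);
	rrw_exit(&lock, FTAG);

	rrw_enter(&lock, RW_WRITER, FTAG);	/* exclusive, no re-entry */
	ASSERT(RRW_WRITE_HELD(&lock));
	rrw_exit(&lock, FTAG);

	rrw_destroy(&lock);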
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 6c9867df8a..6e6495e2ec 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -533,6 +533,8 @@ extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp);
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);
/*
* Return the txg number for the given assigned transaction.
@@ -573,7 +575,9 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct vnode *vp, uint64_t voffset);
+ boolean_t force, boolean_t online, struct vnode *vp, uint64_t voffset,
+ char *cosname);
+int dmu_replay_end_snapshot(char *name, struct drr_begin *drrb);
/* CRC64 table */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 775a777eef..725c771f6a 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -86,6 +86,10 @@ typedef struct objset_impl {
list_t os_free_dnodes[TXG_SIZE];
list_t os_dnodes;
list_t os_downgraded_dbufs;
+
+ /* stuff we store for the user */
+ kmutex_t os_user_ptr_lock;
+ void *os_user_ptr;
} objset_impl_t;
#define DMU_META_DNODE_OBJECT 0
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 2a8d354be4..d02eba1ce7 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -138,6 +138,7 @@ dsl_syncfunc_t dsl_dataset_snapshot_sync;
int dsl_dataset_rollback(dsl_dataset_t *ds);
int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
int dsl_dataset_promote(const char *name);
+int dsl_dataset_clone_swap(const char *name, boolean_t force);
void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
void *p, dsl_dataset_evict_func_t func);
@@ -148,6 +149,8 @@ void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/rrwlock.h b/usr/src/uts/common/fs/zfs/sys/rrwlock.h
new file mode 100644
index 0000000000..19a43c97fc
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/rrwlock.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_RR_RW_LOCK_H
+#define _SYS_RR_RW_LOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/inttypes.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+/*
+ * A reader-writer lock implementation that allows re-entrant reads, but
+ * still gives writers priority on "new" reads.
+ *
+ * See rrwlock.c for more details about the implementation.
+ *
+ * Fields of the rrwlock_t structure:
+ * - rr_lock: protects modification and reading of rrwlock_t fields
+ * - rr_cv: cv for waking up readers or waiting writers
+ * - rr_writer: thread id of the current writer
+ * - rr_anon_rcount: number of active anonymous readers
+ * - rr_linked_rcount: total number of non-anonymous active readers
+ * - rr_writer_wanted: a writer wants the lock
+ */
+typedef struct rrwlock {
+ kmutex_t rr_lock;
+ kcondvar_t rr_cv;
+ kthread_t *rr_writer;
+ refcount_t rr_anon_rcount;
+ refcount_t rr_linked_rcount;
+ boolean_t rr_writer_wanted;
+} rrwlock_t;
+
+/*
+ * 'tag' is used in reference counting tracking. The
+ * 'tag' must be the same in a rrw_enter() as in its
+ * corresponding rrw_exit().
+ */
+void rrw_init(rrwlock_t *rrl);
+void rrw_destroy(rrwlock_t *rrl);
+void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_exit(rrwlock_t *rrl, void *tag);
+boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
+
+#define RRW_READ_HELD(x) rrw_held(x, RW_READER)
+#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_RR_RW_LOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
index 38b9fbb9fc..ea55a86b9e 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -33,6 +33,7 @@
#include <sys/list.h>
#include <sys/vfs.h>
#include <sys/zil.h>
+#include <sys/rrwlock.h>
#ifdef __cplusplus
extern "C" {
@@ -53,8 +54,8 @@ struct zfsvfs {
uint_t z_acl_inherit; /* acl inheritance behavior */
boolean_t z_atime; /* enable atimes mount option */
boolean_t z_unmounted; /* unmounted */
- krwlock_t z_unmount_lock;
- krwlock_t z_unmount_inactive_lock;
+ rrwlock_t z_teardown_lock;
+ krwlock_t z_teardown_inactive_lock;
list_t z_all_znodes; /* all vnodes in the fs */
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
vnode_t *z_ctldir; /* .zfs directory pointer */
@@ -115,6 +116,9 @@ typedef struct zfid_long {
extern uint_t zfs_fsyncer_key;
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index b562c9e915..8b4ee46218 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -34,6 +34,7 @@
#include <sys/list.h>
#include <sys/dmu.h>
#include <sys/zfs_vfsops.h>
+#include <sys/rrwlock.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -153,6 +154,7 @@ typedef struct znode {
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_last_itx; /* last ZIL itx on this znode */
+ uint64_t z_gen; /* generation (same as zp_gen) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
list_node_t z_link_node; /* all znodes in fs link */
@@ -189,18 +191,27 @@ typedef struct znode {
/*
* ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
* ZFS_EXIT() must be called before exiting the vop.
+ * ZFS_ENTER_VERIFY_ZP() does ZFS_ENTER plus verifies the znode is valid.
*/
#define ZFS_ENTER(zfsvfs) \
{ \
- if (rw_tryenter(&(zfsvfs)->z_unmount_lock, RW_READER) == 0) \
- return (EIO); \
+ rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
if ((zfsvfs)->z_unmounted) { \
ZFS_EXIT(zfsvfs); \
return (EIO); \
} \
}
-#define ZFS_EXIT(zfsvfs) rw_exit(&(zfsvfs)->z_unmount_lock)
+#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+
+#define ZFS_ENTER_VERIFY_ZP(zfsvfs, zp) \
+ { \
+ ZFS_ENTER((zfsvfs)); \
+ if (!(zp)->z_dbuf_held) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
/*
* Macros for dealing with dmu_buf_hold
@@ -250,6 +261,7 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
extern void zfs_znode_fini(void);
extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern int zfs_rezget(znode_t *);
extern void zfs_zinactive(znode_t *);
extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
extern void zfs_znode_free(znode_t *);
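Because an online receive can invalidate znodes while the filesystem stays mounted, vops that dereference a znode's dbuf adopt the stronger entry macro. A sketch of a vop prologue under the new convention (zfs_example_vop() is illustrative, not from the patch):

	static int
	zfs_example_vop(vnode_t *vp)
	{
		znode_t		*zp = VTOZ(vp);
		zfsvfs_t	*zfsvfs = zp->z_zfsvfs;

		/* returns EIO if the fs is torn down or zp is invalid */
		ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);

		/* ... zp->z_dbuf may be used safely here ... */

		ZFS_EXIT(zfsvfs);
		return (0);
	}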
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
index 5fe86c98ac..3b2cc409e0 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -53,6 +53,16 @@
* reliable way to auto-unmount the filesystem when it's "no longer in use".
* When the user unmounts a filesystem, we call zfsctl_unmount(), which
* unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted-on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()).
*/
#include <fs/fs_subr.h>
@@ -578,6 +588,9 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
return (err);
}
+/*
+ * This creates a snapshot under '.zfs/snapshot'.
+ */
/* ARGSUSED */
static int
zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
@@ -711,6 +724,9 @@ domount:
if (err == 0) {
/*
* Return the mounted root rather than the covered mount point.
+ * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
+ * the ZFS vnode mounted on top of the GFS node. This ZFS
+ * vnode is the root of the newly created vfsp.
*/
VFS_RELE(vfsp);
err = traverse(vpp);
@@ -718,11 +734,11 @@ domount:
if (err == 0) {
/*
- * Fix up the root vnode.
+ * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
*
* This is where we lie about our v_vfsp in order to
- * make .zfs/snapshot/<snapdir> accessible over NFS
- * without requiring manual mounts of <snapdir>.
+ * make .zfs/snapshot/<snapname> accessible over NFS
+ * without requiring manual mounts of <snapname>.
*/
ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
@@ -771,6 +787,13 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
return (0);
}
+/*
+ * pvp is the '.zfs' directory (zfsctl_node_t).
+ * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
+ *
+ * This function is the callback to create a GFS vnode for '.zfs/snapshot'
+ * when a lookup is performed on .zfs for "snapshot".
+ */
vnode_t *
zfsctl_mknode_snapdir(vnode_t *pvp)
{
@@ -838,6 +861,13 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = {
{ NULL }
};
+/*
+ * pvp is the GFS vnode '.zfs/snapshot'.
+ *
+ * This creates a GFS node under '.zfs/snapshot' representing each
+ * snapshot. This newly created GFS node is what we mount snapshot
+ * vfs_t's on top of.
+ */
static vnode_t *
zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
{
@@ -937,6 +967,12 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
if (sep != NULL) {
VN_HOLD(vp);
+ /*
+ * Return the mounted root rather than the covered mount point.
+ * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
+ * and returns the ZFS vnode mounted on top of the GFS node.
+ * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+ */
error = traverse(&vp);
if (error == 0) {
if (vp == sep->se_root)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index a6bad3f8d8..4a5e68b878 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -63,6 +63,8 @@
#include <sys/zvol.h>
#include <sharefs/share.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu_objset.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -1671,7 +1673,8 @@ zfs_ioc_create(zfs_cmd_t *zc)
default:
cbfunc = NULL;
}
- if (strchr(zc->zc_name, '@'))
+ if (strchr(zc->zc_name, '@') ||
+ strchr(zc->zc_name, '%'))
return (EINVAL);
if (zc->zc_nvlist_src != NULL &&
@@ -1847,7 +1850,8 @@ zfs_ioc_rename(zfs_cmd_t *zc)
boolean_t recursive = zc->zc_cookie & 1;
zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '%'))
return (EINVAL);
/*
@@ -1869,21 +1873,84 @@ static int
zfs_ioc_recvbackup(zfs_cmd_t *zc)
{
file_t *fp;
- int error, fd;
offset_t new_off;
+ objset_t *os;
+ zfsvfs_t *zfsvfs = NULL;
+ char *cp;
+ char cosname[MAXNAMELEN];
+ boolean_t force = (boolean_t)zc->zc_guid;
+ int error, fd;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
- strchr(zc->zc_value, '@') == NULL)
+ strchr(zc->zc_value, '@') == NULL ||
+ strchr(zc->zc_value, '%'))
return (EINVAL);
fd = zc->zc_cookie;
fp = getf(fd);
if (fp == NULL)
return (EBADF);
+
+ /*
+ * Get the zfsvfs for the receiving objset. There
+ * won't be one if we're operating on a zvol, if the
+ * objset doesn't exist yet, or is not mounted.
+ */
+ cp = strchr(zc->zc_value, '@');
+ *cp = '\0';
+ error = dmu_objset_open(zc->zc_value, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ *cp = '@';
+ if (!error) {
+ if (dmu_objset_type(os) == DMU_OST_ZFS) {
+ mutex_enter(&os->os->os_user_ptr_lock);
+ zfsvfs = dmu_objset_get_user(os);
+ if (zfsvfs != NULL)
+ VFS_HOLD(zfsvfs->z_vfs);
+ mutex_exit(&os->os->os_user_ptr_lock);
+ }
+ dmu_objset_close(os);
+ }
+
error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
- &zc->zc_cookie, (boolean_t)zc->zc_guid, fp->f_vnode,
- fp->f_offset);
+ &zc->zc_cookie, force, zfsvfs != NULL, fp->f_vnode,
+ fp->f_offset, cosname);
+
+ /*
+ * For incremental snapshots where we created a
+ * temporary clone, we now swap zfsvfs::z_os with
+ * the newly created and received "cosname".
+ */
+ if (!error && zfsvfs != NULL) {
+ char osname[MAXNAMELEN];
+ int mode;
+
+ error = zfs_suspend_fs(zfsvfs, osname, &mode);
+ if (!error) {
+ int swap_err;
+ int snap_err = 0;
+
+ swap_err = dsl_dataset_clone_swap(cosname, force);
+ if (!swap_err) {
+ char *cp = strrchr(zc->zc_value, '@');
+
+ *cp = '\0';
+ snap_err = dmu_replay_end_snapshot(zc->zc_value,
+ &zc->zc_begin_record);
+ *cp = '@';
+ }
+ error = zfs_resume_fs(zfsvfs, osname, mode);
+ if (!error)
+ error = swap_err;
+ if (!error)
+ error = snap_err;
+ }
+ /* destroy the clone we created */
+ (void) dmu_objset_destroy(cosname);
+ }
+ if (zfsvfs != NULL)
+ VFS_RELE(zfsvfs->z_vfs);
new_off = fp->f_offset + zc->zc_cookie;
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &new_off) == 0)
fp->f_offset = new_off;
@@ -2327,6 +2394,7 @@ static struct modlinkage modlinkage = {
uint_t zfs_fsyncer_key;
+extern uint_t rrw_tsd_key;
int
_init(void)
@@ -2345,6 +2413,7 @@ _init(void)
}
tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, NULL);
error = ldi_ident_from_mod(&modlinkage, &zfs_li);
ASSERT(error == 0);
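Taken together, the ioctl path above performs an online receive in this order:

	/*
	 * Online 'zfs recv' into a mounted filesystem, in outline:
	 *
	 *  1. dmu_recvbackup(..., online, ..., cosname)
	 *     - receive the stream into a temporary clone "fs/%snap"
	 *  2. zfs_suspend_fs(zfsvfs, osname, &mode)
	 *     - block out vops, close the ZIL, close z_os
	 *  3. dsl_dataset_clone_swap(cosname, force)
	 *     - swap the clone's contents with the head dataset
	 *  4. dmu_replay_end_snapshot(name, drrb)
	 *     - snapshot the (now swapped) head
	 *  5. zfs_resume_fs(zfsvfs, osname, mode)
	 *     - reopen z_os and revalidate znodes
	 *  6. dmu_objset_destroy(cosname)
	 *     - destroy the temporary clone (the old head state)
	 */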
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 38c1650857..0736cb3224 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -59,6 +59,7 @@
#include <sys/bootconf.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
+#include <sys/dmu_objset.h>
int zfsfstype;
vfsops_t *zfs_vfsops = NULL;
@@ -498,6 +499,76 @@ unregister:
}
static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ uint_t readonly;
+ int error;
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+
+ /*
+ * If we are not mounting (ie: online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+ if (readonly != 0)
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ else
+ zfs_unlinked_drain(zfsvfs);
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest doesn't
+ * use readonly mounts, where zfs_unlinked_drain() isn't
+ * called.) This is because ziltest causes spa_sync()
+ * to think it's committed, but actually it is not, so
+ * the intent log contains many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated in
+ * a yet later txg. This would write a "create object
+ * N" record to the intent log. Normally, this would be
+ * fine because the spa_sync() would have written out
+ * the fact that object N is free, before we could write
+ * the "create object N" intent log record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+ zfs_replay_vector);
+
+ zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
+ }
+
+ if (!zil_disable)
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ return (0);
+}
+
+static int
zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
{
dev_t mount_dev;
@@ -525,8 +596,8 @@ zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
- rw_init(&zfsvfs->z_unmount_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zfsvfs->z_unmount_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rrw_init(&zfsvfs->z_teardown_lock);
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
/* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
@@ -583,54 +654,7 @@ zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
xattr_changed_cb(zfsvfs, xattr);
zfsvfs->z_issnap = B_TRUE;
} else {
- uint_t readonly;
-
- error = zfs_register_callbacks(vfsp);
- if (error)
- goto out;
-
- /*
- * During replay we remove the read only flag to
- * allow replays to succeed.
- */
- readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
- if (readonly != 0)
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- else
- zfs_unlinked_drain(zfsvfs);
-
- /*
- * Parse and replay the intent log.
- *
- * Because of ziltest, this must be done after
- * zfs_unlinked_drain(). (Further note: ziltest doesn't
- * use readonly mounts, where zfs_unlinked_drain() isn't
- * called.) This is because ziltest causes spa_sync()
- * to think it's committed, but actually it is not, so
- * the intent log contains many txg's worth of changes.
- *
- * In particular, if object N is in the unlinked set in
- * the last txg to actually sync, then it could be
- * actually freed in a later txg and then reallocated in
- * a yet later txg. This would write a "create object
- * N" record to the intent log. Normally, this would be
- * fine because the spa_sync() would have written out
- * the fact that object N is free, before we could write
- * the "create object N" intent log record.
- *
- * But when we are in ziltest mode, we advance the "open
- * txg" without actually spa_sync()-ing the changes to
- * disk. So we would see that object N is still
- * allocated and in the unlinked set, and there is an
- * intent log record saying to allocate it.
- */
- zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
- zfs_replay_vector);
-
- zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
-
- if (!zil_disable)
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ error = zfsvfs_setup(zfsvfs, B_TRUE);
}
if (!zfsvfs->z_issnap)
@@ -641,8 +665,8 @@ out:
dmu_objset_close(zfsvfs->z_os);
mutex_destroy(&zfsvfs->z_znodes_lock);
list_destroy(&zfsvfs->z_all_znodes);
- rw_destroy(&zfsvfs->z_unmount_lock);
- rw_destroy(&zfsvfs->z_unmount_inactive_lock);
+ rrw_destroy(&zfsvfs->z_teardown_lock);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
} else {
atomic_add_32(&zfs_active_fs_count, 1);
@@ -1019,13 +1043,130 @@ zfs_root(vfs_t *vfsp, vnode_t **vpp)
return (error);
}
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp, *nextzp;
+ znode_t markerzp;
+
+ rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent's filesystem's vfsp. Note,
+ * 'z_parent' is self referential for non-snapshots.
+ */
+ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+ * If we are not unmounting (ie: online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ return (EIO);
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs.
+ * Note, the dmu can still callback via znode_pageout_func()
+ * which can zfs_znode_free() the znode. So we lock
+ * z_all_znodes; search the list for a held dbuf; drop the lock
+ * (we know zp can't disappear if we hold a dbuf lock) then
+ * regrab the lock and restart.
+ *
+ * Since we have to restart the search after finding each held dbuf,
+ * we do two things to speed up searching: we insert a dummy znode
+ * ('markerzp') to detect the original tail of the list, and move
+ * non-held znodes to the end of the list. Once we hit 'markerzp',
+ * we know we've looked at each znode and can break out.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, &markerzp);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != &markerzp;
+ zp = nextzp) {
+ nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+ if (zp->z_dbuf_held) {
+ /* dbufs should only be held when force unmounting */
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+ /* Start again */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ nextzp = list_head(&zfsvfs->z_all_znodes);
+ } else {
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ }
+ }
+ list_remove(&zfsvfs->z_all_znodes, &markerzp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return, as the properties have already been
+ * unregistered and the cached data evicted.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data
+ */
+ (void) dmu_objset_evict_dbufs(os);
+
+ return (0);
+}
+
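The marker-znode scan above is a general pattern for walking a locked list when the work on a node may require dropping the list lock. A minimal sketch of the pattern, using hypothetical node_t, needs_work(), and do_work() names (the list routines are the same <sys/list.h> interfaces the code above uses): restarting from the head after reacquiring the lock is safe because every finished node has been re-inserted at the tail, behind the marker, so each node is visited at most once before the walk reaches the marker and stops.

	node_t marker;
	node_t *np, *next;

	mutex_enter(&lock);
	list_insert_tail(&lst, &marker);
	for (np = list_head(&lst); np != &marker; np = next) {
		next = list_next(&lst, np);
		if (needs_work(np)) {
			mutex_exit(&lock);	/* do_work() may block */
			do_work(np);
			mutex_enter(&lock);
			/* the list may have changed; restart the scan */
			next = list_head(&lst);
		} else {
			/* move behind the marker so it is not revisited */
			list_remove(&lst, np);
			list_insert_tail(&lst, np);
		}
	}
	list_remove(&lst, &marker);
	mutex_exit(&lock);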
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp, *nextzp;
+ objset_t *os;
int ret;
ret = secpolicy_fs_unmount(cr, vfsp);
@@ -1069,79 +1210,35 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
return (EBUSY);
} else {
if (vfsp->vfs_count > 2 ||
- zfsvfs->z_ctldir->v_count > 1) {
+ zfsvfs->z_ctldir->v_count > 1)
return (EBUSY);
- }
}
}
vfsp->vfs_flag |= VFS_UNMOUNTED;
- rw_enter(&zfsvfs->z_unmount_lock, RW_WRITER);
- rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_WRITER);
-
- /*
- * At this point there are no vops active, and any new vops will
- * fail with EIO since we have z_unmount_lock for writer (only
- * relavent for forced unmount).
- *
- * Release all holds on dbufs.
- * Note, the dmu can still callback via znode_pageout_func()
- * which can zfs_znode_free() the znode. So we lock
- * z_all_znodes; search the list for a held dbuf; drop the lock
- * (we know zp can't disappear if we hold a dbuf lock) then
- * regrab the lock and restart.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
- nextzp = list_next(&zfsvfs->z_all_znodes, zp);
- if (zp->z_dbuf_held) {
- /* dbufs should only be held when force unmounting */
- zp->z_dbuf_held = 0;
- mutex_exit(&zfsvfs->z_znodes_lock);
- dmu_buf_rele(zp->z_dbuf, NULL);
- /* Start again */
- mutex_enter(&zfsvfs->z_znodes_lock);
- nextzp = list_head(&zfsvfs->z_all_znodes);
- }
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * Set the unmounted flag and let new vops unblock.
- * zfs_inactive will have the unmounted behavior, and all other
- * vops will fail with EIO.
- */
- zfsvfs->z_unmounted = B_TRUE;
- rw_exit(&zfsvfs->z_unmount_lock);
- rw_exit(&zfsvfs->z_unmount_inactive_lock);
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
/*
- * Unregister properties.
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
*/
- if (!dmu_objset_is_snapshot(os))
- zfs_unregister_callbacks(zfsvfs);
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os->os_user_ptr_lock);
- /*
- * Close the zil. NB: Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
+ /*
+ * Finally close the objset
+ */
+ dmu_objset_close(os);
}
/*
- * Evict cached data
- */
- (void) dmu_objset_evict_dbufs(os);
-
- /*
- * Finally close the objset
- */
- dmu_objset_close(os);
-
- /*
* We can now safely destroy the '.zfs' directory node.
*/
if (zfsvfs->z_ctldir != NULL)
@@ -1234,6 +1331,77 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
return (0);
}
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write-held.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ *mode = zfsvfs->z_os->os_mode;
+ dmu_objset_name(zfsvfs->z_os, name);
+ dmu_objset_close(zfsvfs->z_os);
+
+ return (0);
+}
+
+/*
+ * Reopen zfsvfs_t::z_os and release VOPs.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+{
+ int err;
+
+ ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+ if (err) {
+ zfsvfs->z_os = NULL;
+ } else {
+ znode_t *zp;
+
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs.  If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_ZP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ ASSERT(!zp->z_dbuf_held);
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ }
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't reopen zfsvfs::z_os, force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
+ }
+ return (err);
+}
+
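zfs_suspend_fs() and zfs_resume_fs() are meant to be used as a bracketing pair by the online 'zfs recv' path (the zfs_ioctl.c side of this change, not shown in this hunk). A minimal caller sketch, with a hypothetical recv_apply_stream() standing in for the actual receive work:

	static int
	zfs_recv_existing(zfsvfs_t *zfsvfs)
	{
		char osname[MAXNAMELEN];
		int mode;
		int error;

		/*
		 * Blocks out VOPs and closes z_os; on success both
		 * teardown locks are returned write-held.
		 */
		if ((error = zfs_suspend_fs(zfsvfs, osname, &mode)) != 0)
			return (error);

		error = recv_apply_stream(osname);	/* hypothetical */

		/*
		 * Reopens z_os and drops both locks; if the reopen fails,
		 * zfs_resume_fs() force-unmounts the file system itself.
		 */
		(void) zfs_resume_fs(zfsvfs, osname, mode);
		return (error);
	}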
static void
zfs_freevfs(vfs_t *vfsp)
{
@@ -1245,8 +1413,8 @@ zfs_freevfs(vfs_t *vfsp)
mutex_destroy(&zfsvfs->z_znodes_lock);
list_destroy(&zfsvfs->z_all_znodes);
- rw_destroy(&zfsvfs->z_unmount_lock);
- rw_destroy(&zfsvfs->z_unmount_inactive_lock);
+ rrw_destroy(&zfsvfs->z_teardown_lock);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
atomic_add_32(&zfs_active_fs_count, -1);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 852555b7f3..2e6405be7a 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -83,8 +83,9 @@
* to freed memory. The example below illustrates the following Big Rules:
*
* (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *      This is done, avoiding races, using ZFS_ENTER(zfsvfs) or
+ *      ZFS_ENTER_VERIFY_ZP(zfsvfs, zp).  A ZFS_EXIT(zfsvfs) is needed
+ *      before all returns.
*
* (2) VN_RELE() should always be the last thing except for zil_commit()
* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
@@ -239,6 +240,7 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
offset_t off;
int error;
zfsvfs_t *zfsvfs;
+ znode_t *zp;
switch (com) {
case _FIOFFS:
@@ -257,8 +259,9 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
return (EFAULT);
- zfsvfs = VTOZ(vp)->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
/* offset parameter is in/out */
error = zfs_holey(vp, com, &off);
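ZFS_ENTER_VERIFY_ZP() is defined in the zfs_znode.h part of this change (see the diffstat), which is not reproduced here. A plausible shape, inferred from the rrw usage in zfs_vfsops.c and from the explicit z_dbuf_held checks added to zfs_rename() and zfs_link() below; this is an assumption, not the commit's actual header text:

	/* Assumed definitions; the real ones live in zfs_znode.h. */
	#define	ZFS_EXIT(zfsvfs)	rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)

	#define	ZFS_ENTER(zfsvfs) \
	{ \
		rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
		if ((zfsvfs)->z_unmounted) { \
			ZFS_EXIT(zfsvfs); \
			return (EIO); \
		} \
	}

	#define	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp) \
	{ \
		ZFS_ENTER(zfsvfs); \
		/* the znode lost its dbuf across a failed resume */ \
		if (!(zp)->z_dbuf_held) { \
			ZFS_EXIT(zfsvfs); \
			return (EIO); \
		} \
	}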
@@ -398,12 +401,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
+ objset_t *os;
ssize_t n, nbytes;
int error;
rl_t *rl;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+ os = zfsvfs->z_os;
/*
* Validate file offset
@@ -568,7 +572,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
uint64_t end_size;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
rl_t *rl;
@@ -585,7 +589,8 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
limit = MAXOFFSET_T;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+ zilog = zfsvfs->z_log;
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
@@ -906,7 +911,7 @@ zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
error = zfs_zaccess_rwx(zp, mode, cr);
ZFS_EXIT(zfsvfs);
return (error);
@@ -941,7 +946,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zdp);
*vpp = NULL;
@@ -1044,14 +1049,16 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ objset_t *os;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
uint64_t zoid;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
top:
*vpp = NULL;
@@ -1221,7 +1228,7 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
znode_t *xzp = NULL;
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
@@ -1229,7 +1236,8 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
boolean_t unlinked;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ zilog = zfsvfs->z_log;
top:
/*
@@ -1386,7 +1394,7 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
zfs_dirlock_t *dl;
uint64_t zoid = 0;
dmu_tx_t *tx;
@@ -1394,7 +1402,8 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
ASSERT(vap->va_type == VDIR);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ zilog = zfsvfs->z_log;
if (dzp->z_phys->zp_flags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
@@ -1483,12 +1492,13 @@ zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
znode_t *zp;
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ zilog = zfsvfs->z_log;
top:
zp = NULL;
@@ -1613,7 +1623,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp)
int error;
uint8_t prefetch;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
/*
* If we are not given an eof variable,
@@ -1812,7 +1822,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
@@ -1837,11 +1847,12 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp = zp->z_phys;
+ znode_phys_t *pzp;
int error;
uint64_t links;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+ pzp = zp->z_phys;
/*
* Return all attributes. It's cheaper to provide the answer
@@ -1917,10 +1928,10 @@ static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
- struct znode *zp = VTOZ(vp);
- znode_phys_t *pzp = zp->z_phys;
+ znode_t *zp = VTOZ(vp);
+ znode_phys_t *pzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
dmu_tx_t *tx;
vattr_t oldva;
uint_t mask = vap->va_mask;
@@ -1943,7 +1954,9 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
return (EINVAL);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+ pzp = zp->z_phys;
+ zilog = zfsvfs->z_log;
top:
attrzp = NULL;
@@ -2298,14 +2311,15 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
znode_t *tdzp, *szp, *tzp;
znode_t *sdzp = VTOZ(sdvp);
zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
vnode_t *realvp;
zfs_dirlock_t *sdl, *tdl;
dmu_tx_t *tx;
zfs_zlock_t *zl;
int cmp, serr, terr, error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, sdzp);
+ zilog = zfsvfs->z_log;
/*
* Make sure we have the real vp for the target directory.
@@ -2319,6 +2333,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
}
tdzp = VTOZ(tdvp);
+ if (!tdzp->z_dbuf_held) {
+ ZFS_EXIT(zfsvfs);
+ return (EIO);
+ }
top:
szp = NULL;
tzp = NULL;
@@ -2529,14 +2547,15 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
zfs_dirlock_t *dl;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
uint64_t zoid;
int len = strlen(link);
int error;
ASSERT(vap->va_type == VLNK);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ zilog = zfsvfs->z_log;
top:
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
ZFS_EXIT(zfsvfs);
@@ -2650,7 +2669,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
size_t bufsz;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
bufsz = (size_t)zp->z_phys->zp_size;
if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
@@ -2695,7 +2714,7 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
znode_t *dzp = VTOZ(tdvp);
znode_t *tzp, *szp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ zilog_t *zilog;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
vnode_t *realvp;
@@ -2703,7 +2722,8 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
ASSERT(tdvp->v_type == VDIR);
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+ zilog = zfsvfs->z_log;
if (VOP_REALVP(svp, &realvp) == 0)
svp = realvp;
@@ -2714,6 +2734,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
}
szp = VTOZ(svp);
+ if (!szp->z_dbuf_held) {
+ ZFS_EXIT(zfsvfs);
+ return (EIO);
+ }
top:
/*
* We do not support links between attributes and non-attributes
@@ -2947,7 +2971,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
uint64_t filesz;
int error = 0;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
ASSERT(zp->z_dbuf_held && zp->z_phys);
@@ -3005,10 +3029,8 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
- rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_READER);
- if (zfsvfs->z_unmounted) {
- ASSERT(zp->z_dbuf_held == 0);
-
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ if (zp->z_dbuf_held == 0) {
if (vn_has_cached_data(vp)) {
(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
B_INVAL, cr);
@@ -3022,7 +3044,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
} else {
mutex_exit(&zp->z_lock);
}
- rw_exit(&zfsvfs->z_unmount_inactive_lock);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
VFS_RELE(zfsvfs->z_vfs);
return;
}
@@ -3053,7 +3075,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
}
zfs_zinactive(zp);
- rw_exit(&zfsvfs->z_unmount_inactive_lock);
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
/*
@@ -3087,7 +3109,7 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
/*
* We are following the UFS semantics with respect to mapcnt
@@ -3239,7 +3261,7 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
int need_unlock = 0, err = 0;
offset_t orig_off;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
if (protp)
*protp = PROT_ALL;
@@ -3371,7 +3393,7 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
segvn_crargs_t vn_a;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
if (vp->v_flag & VNOMAP) {
ZFS_EXIT(zfsvfs);
@@ -3507,7 +3529,7 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
uint64_t off, len;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
top:
if (cmd != F_FREESP) {
@@ -3542,12 +3564,13 @@ zfs_fid(vnode_t *vp, fid_t *fidp)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
+ uint32_t gen;
uint64_t object = zp->z_id;
zfid_short_t *zfid;
int size, i;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+ gen = (uint32_t)zp->z_gen;
size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
if (fidp->fid_len < size) {
@@ -3607,7 +3630,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
case _PC_XATTR_EXISTS:
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
*valp = 0;
error = zfs_dirent_lock(&dl, zp, "", &xzp,
ZXATTR | ZEXISTS | ZSHARED);
@@ -3647,7 +3670,7 @@ zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
error = zfs_getacl(zp, vsecp, cr);
ZFS_EXIT(zfsvfs);
@@ -3662,7 +3685,7 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int error;
- ZFS_ENTER(zfsvfs);
+ ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
error = zfs_setacl(zp, vsecp, cr);
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 1ac95c5537..7415a15e74 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -41,7 +41,6 @@
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
-#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/mode.h>
@@ -417,6 +416,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
+ zp->z_gen = zp->z_phys->zp_gen;
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -706,6 +706,53 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
return (0);
}
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ uint64_t obj_num = zp->z_id;
+ int err;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+
+ ASSERT(db->db_object == obj_num);
+ ASSERT(db->db_offset == -1);
+ ASSERT(db->db_data != NULL);
+
+ if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
+ dmu_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
+ }
+
+ zp->z_dbuf = db;
+ zp->z_phys = db->db_data;
+ zfs_znode_dmu_init(zp);
+ zp->z_unlinked = (zp->z_phys->zp_links == 0);
+
+ /* release the hold from zfs_znode_dmu_init() */
+ VFS_RELE(zfsvfs->z_vfs);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ return (0);
+}
+
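The zp_gen test in zfs_rezget() above is what makes object-number reuse detectable: a rollback performed by an online receive can free an object, and a later create can recycle the same number for an unrelated file, so only the generation number recorded at znode allocation (z_gen, set in zfs_znode_alloc() above) proves the cached znode still names the same file. Condensed to the check itself, with a hypothetical helper name:

	/*
	 * Illustrative-only condensation of the zfs_rezget() staleness
	 * check; znode_phys_t and z_gen are real, the helper is not.
	 */
	static boolean_t
	zfs_znode_is_stale(znode_t *zp, dmu_buf_t *db)
	{
		znode_phys_t *pzp = db->db_data;

		/*
		 * dmu_bonus_hold() succeeding means the object number
		 * still exists, but only a matching generation number
		 * means it is still the same file.
		 */
		return (pzp->zp_gen != zp->z_gen);
	}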
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{