author    Jerry Jelinek <jerry.jelinek@joyent.com>  2012-12-21 20:10:33 +0000
committer Jerry Jelinek <jerry.jelinek@joyent.com>  2012-12-21 20:10:33 +0000
commit    99c071e7ee789a71d9aa9f3bf92cfde20153a526 (patch)
tree      57beaca4a6b3a72d4870d8d0fb9a58097d881e30
parent    50b4afdcbc4755b19fb03c07eca48ea8d3f83108 (diff)
download  illumos-joyent-99c071e7ee789a71d9aa9f3bf92cfde20153a526.tar.gz
OS-1566 filesystem limits for ZFS datasets
-rw-r--r--  usr/src/common/zfs/zfeature_common.c          4
-rw-r--r--  usr/src/common/zfs/zfeature_common.h          2
-rw-r--r--  usr/src/common/zfs/zfs_prop.c                 6
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c    24
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_util.c       10
-rw-r--r--  usr/src/man/man1m/zfs.1m                      39
-rw-r--r--  usr/src/man/man5/zpool-features.5             24
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c        72
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_send.c          44
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c       145
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_deleg.c         12
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c           529
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_deleg.h     3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dir.h       16
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c         16
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h               2
17 files changed, 920 insertions(+), 31 deletions(-)
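As a quick illustration of what this change provides, the new properties are intended to be used roughly as follows once the pool feature is enabled (pool and dataset names here are hypothetical; this is only a sketch based on the man page changes below):

    # cap the number of nested filesystems/volumes under a delegated dataset
    zfs set filesystem_limit=50 tank/zones/z1
    # cap the number of snapshots that can be created at or below this point
    zfs set snapshot_limit=100 tank/zones/z1
    # 'none' is the default and removes enforcement
    zfs set filesystem_limit=none tank/zones/z1

Both properties require the new filesystem_limits pool feature (see the zpool-features.5 change below).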
diff --git a/usr/src/common/zfs/zfeature_common.c b/usr/src/common/zfs/zfeature_common.c
index f79d1f4613..ef1760255c 100644
--- a/usr/src/common/zfs/zfeature_common.c
+++ b/usr/src/common/zfs/zfeature_common.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
#ifdef _KERNEL
@@ -156,4 +157,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.", B_TRUE, B_FALSE, NULL);
+ zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+ "com.joyent:filesystem_limits", "filesystem_limits",
+ "Filesystem and snapshot limits.", B_TRUE, B_FALSE, NULL);
}
diff --git a/usr/src/common/zfs/zfeature_common.h b/usr/src/common/zfs/zfeature_common.h
index 40bc4374cb..b0b86aa26b 100644
--- a/usr/src/common/zfs/zfeature_common.h
+++ b/usr/src/common/zfs/zfeature_common.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
#ifndef _ZFEATURE_COMMON_H
@@ -52,6 +53,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
+ SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURES
} spa_feature_t;
diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c
index 65155fec4a..88e13711e0 100644
--- a/usr/src/common/zfs/zfs_prop.c
+++ b/usr/src/common/zfs/zfs_prop.c
@@ -362,6 +362,12 @@ zfs_prop_init(void)
zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<size> | none", "REFRESERV");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count> | none", "FSLIMIT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count> | none", "SSLIMIT");
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index 2d444dcb4b..00ca716028 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -1797,6 +1797,8 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
*val = getprop_uint64(zhp, prop, source);
if (*source == NULL) {
@@ -2217,6 +2219,28 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
}
break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+
+ if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
+ return (-1);
+
+ /*
+ * If limit is UINT64_MAX, we translate this into 'none' (unless
+ * literal is set), and indicate that it's the default value.
+ * Otherwise, we print the number nicely and indicate that it's
+ * set locally.
+ */
+ if (literal) {
+ (void) snprintf(propbuf, proplen, "%llu",
+ (u_longlong_t)val);
+ } else if (val == UINT64_MAX) {
+ (void) strlcpy(propbuf, "none", proplen);
+ } else {
+ zfs_nicenum(val, propbuf, proplen);
+ }
+ break;
+
case ZFS_PROP_REFRATIO:
case ZFS_PROP_COMPRESSRATIO:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
index 8706a6fdbb..ce30fc8556 100644
--- a/usr/src/lib/libzfs/common/libzfs_util.c
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -1265,6 +1265,16 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
"use 'none' to disable quota/refquota"));
goto error;
}
+
+ /*
+ * Special handling for "*_limit=none". In this case it's not
+ * 0 but UINT64_MAX.
+ */
+ if ((type & ZFS_TYPE_DATASET) && isnone &&
+ (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
+ *ivalp = UINT64_MAX;
+ }
break;
case PROP_TYPE_INDEX:
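The hunk above maps the user-visible value 'none' onto UINT64_MAX internally, and the libzfs_dataset.c hunk earlier does the reverse when printing. A hypothetical way to observe both representations, assuming a dataset with no limit set:

    # default/unset limit displays as 'none'
    zfs get filesystem_limit tank/zones/z1
    # with parseable (literal) output the same value is the raw UINT64_MAX
    zfs get -p filesystem_limit tank/zones/z1    # 18446744073709551615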
diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m
index 3cb4d5a7f7..eb76f30327 100644
--- a/usr/src/man/man1m/zfs.1m
+++ b/usr/src/man/man1m/zfs.1m
@@ -401,8 +401,9 @@ mounted.
A dataset can also be delegated to a non-global zone by using the \fBzonecfg\fR
\fBadd dataset\fR subcommand. You cannot delegate a dataset to one zone and the
children of the same dataset to another zone. The zone administrator can change
-properties of the dataset or any of its children. However, the \fBquota\fR
-property is controlled by the global administrator.
+properties of the dataset or any of its children. However, the \fBquota\fR,
+\fBfilesystem_limit\fR and \fBsnapshot_limit\fR properties of the delegated
+dataset can only be set from the global zone.
.sp
.LP
A \fBZFS\fR volume can be added as a device to a non-global zone by using the
@@ -968,6 +969,22 @@ default value is \fBon\fR.
.sp
.ne 2
.na
+\fB\fBfilesystem_limit\fR=\fIcount\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the number of filesystems and volumes that can exist at this point in
+the dataset tree. The count of nested filesystems and volumes includes the
+filesystem on which the limit is set, thus the minimum value is 1. The limit is
+not enforced if the user is allowed to change the limit. Setting a
+filesystem_limit on a descendent of a filesystem that already has a
+filesystem_limit does not override the ancestor's filesystem_limit, but rather
+imposes an additional limit. This feature must be enabled to be used
+(see \fBzpool-features\fR(5)).
+.RE
+.sp
+.ne 2
+.na
\fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR
.ad
.sp .6
@@ -1031,6 +1048,22 @@ implicit quota.
.sp
.ne 2
.na
+\fB\fBsnapshot_limit\fR=\fIcount\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the number of snapshots that can be created on a dataset and its
+descendents. Setting a snapshot_limit on a descendent of a dataset that already
+has a snapshot_limit does not override the ancestor's snapshot_limit, but
+rather imposes an additional limit. The limit is not enforced if the user is
+allowed to change the limit; as a result, recursive snapshots taken from the
+global zone are counted against each delegated dataset but are not blocked by
+its limit. This feature must be enabled to be used (see \fBzpool-features\fR(5)).
+.RE
+
+.sp
+.ne 2
+.na
\fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR
.ad
.sp .6
@@ -3127,6 +3160,7 @@ compression property
copies property
devices property
exec property
+filesystem_limit property
mountpoint property
nbmand property
normalization property
@@ -3143,6 +3177,7 @@ shareiscsi property
sharenfs property
sharesmb property
snapdir property
+snapshot_limit property
utf8only property
version property
volblocksize property
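As a concrete reading of the semantics documented above (dataset names hypothetical): limits set at different depths stack rather than override one another, and for a delegated dataset they can only be changed from the global zone:

    # global zone: limit the delegated dataset and one of its children
    zfs set filesystem_limit=10 tank/zones/z1
    zfs set filesystem_limit=5 tank/zones/z1/data
    # a create under tank/zones/z1/data must now satisfy both the limit of 5
    # on z1/data and the limit of 10 on z1; inside the non-global zone any
    # attempt to change either limit is denied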
diff --git a/usr/src/man/man5/zpool-features.5 b/usr/src/man/man5/zpool-features.5
index 0ab179ef7c..8fabe7ebf4 100644
--- a/usr/src/man/man5/zpool-features.5
+++ b/usr/src/man/man5/zpool-features.5
@@ -1,5 +1,6 @@
'\" te
.\" Copyright (c) 2012 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Joyent, Inc. All rights reserved.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@@ -12,7 +13,7 @@
.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZPOOL-FEATURES 5 "Mar 16, 2012"
+.TH ZPOOL-FEATURES 5 "Oct 24, 2012"
.SH NAME
zpool\-features \- ZFS pool feature descriptions
.SH DESCRIPTION
@@ -197,5 +198,26 @@ This feature is \fBactive\fR while there are any filesystems, volumes,
or snapshots which were created after enabling this feature.
.RE
+.sp
+.ne 2
+.na
+\fB\fBfilesystem_limits\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.joyent:filesystem_limits
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature enables filesystem and snapshot limits. These limits can be used
+to control how many filesystems and/or snapshots can be created at the point in
+the tree on which the limits are set.
+
+This feature is \fBactive\fR once either of the limit properties has been
+set on a dataset; it currently remains active even if all limits are later removed.
+.RE
+
.SH "SEE ALSO"
\fBzpool\fR(1M)
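A short sketch of enabling and inspecting the new feature flag on an existing pool, following the usual feature-flag conventions (pool name hypothetical; pools created after this change typically enable supported features by default):

    # enable the feature on an existing pool
    zpool set feature@filesystem_limits=enabled tank
    # state is 'disabled', 'enabled', or 'active' once a limit has been set
    zpool get feature@filesystem_limits tank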
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index b840881486..4345b26757 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -690,7 +691,7 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (EINVAL);
}
- return (0);
+ return (dsl_dir_fscount_check(dd, 1, NULL, oa->cr));
}
static void
@@ -705,6 +706,8 @@ dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
+ dsl_dir_fscount_adjust(dd, tx, 1, B_TRUE);
+
obj = dsl_dataset_create_sync(dd, oa->lastname,
oa->clone_origin, oa->flags, oa->cr, tx);
@@ -807,6 +810,7 @@ typedef struct snapallarg {
dsl_sync_task_group_t *saa_dstg;
boolean_t saa_needsuspend;
nvlist_t *saa_props;
+ cred_t *saa_cr;
/* the following are used only if 'temporary' is set: */
boolean_t saa_temporary;
@@ -818,6 +822,7 @@ typedef struct snapallarg {
typedef struct snaponearg {
const char *soa_longname; /* long snap name */
const char *soa_snapname; /* short snap name */
+ uint64_t soa_tot_cnt;
snapallarg_t *soa_saa;
} snaponearg_t;
@@ -832,7 +837,7 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* The props have already been checked by zfs_check_userprops(). */
error = dsl_dataset_snapshot_check(os->os_dsl_dataset,
- soa->soa_snapname, tx);
+ soa->soa_snapname, soa->soa_tot_cnt, tx, saa->saa_cr);
if (error)
return (error);
@@ -897,7 +902,7 @@ snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
}
static int
-snapshot_one_impl(const char *snapname, void *arg)
+snapshot_one_impl(const char *snapname, void *arg, uint64_t cnt)
{
char fsname[MAXPATHLEN];
snapallarg_t *saa = arg;
@@ -933,6 +938,7 @@ snapshot_one_impl(const char *snapname, void *arg)
soa->soa_saa = saa;
soa->soa_longname = snapname;
soa->soa_snapname = strchr(snapname, '@') + 1;
+ soa->soa_tot_cnt = cnt;
dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync,
os, soa, 3);
@@ -952,6 +958,10 @@ dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
int rv = 0;
int err;
nvpair_t *pair;
+ nvlist_t *cnt_track = NULL;
+ char *pdelim;
+ uint64_t val;
+ char nm[MAXPATHLEN];
pair = nvlist_next_nvpair(snaps, NULL);
if (pair == NULL)
@@ -963,10 +973,60 @@ dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
saa.saa_props = props;
saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ saa.saa_cr = CRED();
+
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when taking a recursive snapshot.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy, the count will increase by 1 at each node up the tree,
+ * but it's more complicated for recursive snapshots. Since we are
+ * validating each snapshot independently we need to be sure that we
+ * are validating the complete count for the entire set of snapshots.
+ * We do this by rolling up the counts for each component of the name
+ * into an nvlist then we'll use that count in the validation of each
+ * individual snapshot.
+ *
+ * We validated the snapshot names in zfs_ioc_snapshot so we know they
+ * have a '@'.
+ */
+ cnt_track = fnvlist_alloc();
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
- err = snapshot_one_impl(nvpair_name(pair), &saa);
+ (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+ pdelim = strchr(nm, '@');
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm, &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm, val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ /*
+ * We've calculated the counts, now validate.
+ */
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+ pdelim = strchr(nm, '@');
+ *pdelim = '\0';
+
+ val = fnvlist_lookup_uint64(cnt_track, nm);
+ err = snapshot_one_impl(nvpair_name(pair), &saa, val);
if (err != 0) {
if (errors != NULL) {
fnvlist_add_int32(errors,
@@ -976,6 +1036,8 @@ dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
}
}
+ nvlist_free(cnt_track);
+
/*
* If any call to snapshot_one_impl() failed, don't execute the
* sync task. The error handling code below will clean up the
@@ -1050,7 +1112,7 @@ dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd)
return (err);
}
- err = snapshot_one_impl(snapname, &saa);
+ err = snapshot_one_impl(snapname, &saa, 1);
if (err == 0)
err = dsl_sync_task_group_wait(saa.saa_dstg);
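The pre-computation added above matters mainly for recursive snapshots, where a single request creates one snapshot at every level of the subtree. A hypothetical example of the case it handles:

    # creates tank/zones@backup, tank/zones/z1@backup, tank/zones/z1/data@backup, ...
    # so the snapshot count validated at tank/zones must reflect the total number
    # of new snapshots at and below it, not just one
    zfs snapshot -r tank/zones@backup

Per the comments in the code, such recursive snapshots taken by an administrator in the global zone are still counted against delegated datasets but are not blocked by their snapshot_limit.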
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index e5644b5a0c..b994357827 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -643,6 +643,19 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (ENODEV);
}
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll recheck
+ * again at the end, but might as well abort before receiving if we're
+ * already over the limit.
+ */
+ err = dsl_dir_fscount_check(dd, 1, NULL, rbsa->cr);
+ if (err != 0)
+ return (err);
+
+ err = dsl_snapcount_check(dd, 1, NULL, rbsa->cr);
+ if (err != 0)
+ return (err);
+
return (0);
}
@@ -668,6 +681,10 @@ recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
spa_history_log_internal_ds(rbsa->ds, "receive new", tx, "");
}
+/*
+ * Note that we do not check the file system limit with dsl_dir_fscount_check
+ * because the temporary %clones don't count against that limit.
+ */
/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -725,6 +742,11 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
/* if full, most recent snapshot must be $ORIGIN */
if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
return (ENODEV);
+
+ /* Check snapshot limit before receiving */
+ err = dsl_snapcount_check(ds->ds_dir, 1, NULL, rbsa->cr);
+ if (err != 0)
+ return (err);
}
/* temporary clone name must not exist */
@@ -1547,6 +1569,8 @@ struct recvendsyncarg {
char *tosnap;
uint64_t creation_time;
uint64_t toguid;
+ boolean_t is_new;
+ cred_t *cr;
};
static int
@@ -1555,7 +1579,16 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dataset_t *ds = arg1;
struct recvendsyncarg *resa = arg2;
- return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+ if (resa->is_new) {
+ /* re-check the filesystem limit now that recv is complete */
+ int err;
+
+ err = dsl_dir_fscount_check(ds->ds_dir, 1, NULL, resa->cr);
+ if (err != 0)
+ return (err);
+ }
+
+ return (dsl_dataset_snapshot_check(ds, resa->tosnap, 1, tx, resa->cr));
}
static void
@@ -1564,6 +1597,11 @@ recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_dataset_t *ds = arg1;
struct recvendsyncarg *resa = arg2;
+ if (resa->is_new) {
+ /* update the filesystem counts */
+ dsl_dir_fscount_adjust(ds->ds_dir->dd_parent, tx, 1, B_TRUE);
+ }
+
dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
/* set snapshot's creation time and guid */
@@ -1624,6 +1662,8 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc)
resa.creation_time = drc->drc_drrb->drr_creation_time;
resa.toguid = drc->drc_drrb->drr_toguid;
resa.tosnap = drc->drc_tosnap;
+ resa.is_new = B_FALSE;
+ resa.cr = CRED();
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
recv_end_check, recv_end_sync, ds, &resa, 3);
@@ -1659,6 +1699,8 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc)
resa.creation_time = drc->drc_drrb->drr_creation_time;
resa.toguid = drc->drc_drrb->drr_toguid;
resa.tosnap = drc->drc_tosnap;
+ resa.is_new = B_TRUE;
+ resa.cr = CRED();
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
recv_end_check, recv_end_sync, ds, &resa, 3);
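The dmu_send.c changes mean that a receive is also subject to the limits; a hypothetical example of the path being checked (names illustrative):

    # a receive that creates a new filesystem is validated against
    # filesystem_limit and snapshot_limit before the stream is accepted,
    # and the filesystem count is re-checked when the receive completes
    zfs send tank/src@snap | zfs receive tank/zones/z1/restored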
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index 6625444e5a..4de580b784 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -45,6 +45,7 @@
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
+#include "zfs_prop.h"
static char *dsl_reaper = "the grim reaper";
@@ -331,7 +332,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
}
static int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -348,6 +350,10 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
err = zap_remove_norm(mos, snapobj, name, mt, tx);
if (err == ENOTSUP && mt == MT_FIRST)
err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);
+
return (err);
}
@@ -1947,7 +1953,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
ASSERT3U(val, ==, obj);
}
#endif
- err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
+ err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx,
+ B_TRUE);
ASSERT(err == 0);
dsl_dataset_rele(ds_head, FTAG);
}
@@ -2012,9 +2019,124 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
return (0);
}
+/*
+ * Check if adding additional snapshot(s) would exceed any snapshot limits.
+ * Note that all snapshot limits up to the root dataset (i.e. the pool itself)
+ * or the given ancestor must be satisfied. Note that it is valid for the
+ * count to exceed the limit. This can happen if a snapshot is taken by an
+ * administrative user in the global zone (e.g. a recursive snapshot by root).
+ */
+int
+dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor,
+ cred_t *cr)
+{
+ uint64_t limit;
+ int err = 0;
+
+ VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
+ /* If we're allowed to change the limit, don't enforce the limit. */
+ if (dsl_secpolicy_write_prop(dd, ZFS_PROP_SNAPSHOT_LIMIT, cr) == 0)
+ return (0);
+
+ /*
+ * If renaming a dataset with no snapshots, count adjustment is 0.
+ */
+ if (cnt == 0)
+ return (0);
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we won't touch this node's counts. We also know that the counts
+ * on the nodes above this one are uninitialized and that there cannot
+ * be a limit set on any of those nodes.
+ */
+ if (dd->dd_phys->dd_filesystem_count == 0)
+ return (0);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
+ 8, 1, &limit, NULL, B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a snapshot limit which we've hit? */
+ if ((dd->dd_phys->dd_snapshot_count + cnt) > limit)
+ return (EDQUOT);
+
+ if (dd->dd_parent != NULL)
+ err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor, cr);
+
+ return (err);
+}
+
+/*
+ * Adjust the snapshot count for the specified dsl_dir_t and all parents.
+ * When a new snapshot is created, increment the count on all parents, and when
+ * a snapshot is destroyed, decrement the count.
+ */
+void
+dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
+ boolean_t first)
+{
+ if (first) {
+ VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ VERIFY(dmu_tx_is_syncing(tx));
+ }
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (dd->dd_phys->dd_filesystem_count == 0)
+ return;
+
+ /* if renaming a dataset with no snapshots, count adjustment is 0 */
+ if (delta == 0)
+ return;
+
+ /*
+ * On initial entry we need to check if this feature is active, but
+ * we don't want to re-check this on each recursive call. Note: the
+ * feature cannot be active if it's not enabled. If the feature is not
+ * active, don't touch the on-disk count fields.
+ */
+ if (first) {
+ zfeature_info_t *quota_feat =
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
+ if (!spa_feature_is_active(dd->dd_pool->dp_spa, quota_feat))
+ return;
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+
+ dd->dd_phys->dd_snapshot_count += delta;
+ VERIFY(dd->dd_phys->dd_snapshot_count >= 0);
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
+
+ mutex_exit(&dd->dd_lock);
+}
+
int
dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx)
+ uint64_t cnt, dmu_tx_t *tx, cred_t *cr)
{
int err;
uint64_t value;
@@ -2042,6 +2164,10 @@ dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
return (ENAMETOOLONG);
+ err = dsl_snapcount_check(ds->ds_dir, cnt, NULL, cr);
+ if (err)
+ return (err);
+
err = dsl_dataset_snapshot_reserve_space(ds, tx);
if (err)
return (err);
@@ -2063,6 +2189,8 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
+
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
@@ -2436,7 +2564,7 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
VERIFY(0 == dsl_dataset_get_snapname(ds));
- err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
+ err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx, B_FALSE);
ASSERT0(err);
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, newsnapname);
@@ -2631,6 +2759,7 @@ struct promotearg {
dsl_dataset_t *origin_origin;
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
char *err_ds;
+ cred_t *cr;
};
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -2718,9 +2847,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
}
- /* Check that there is enough space here */
+ /* Check that there is enough space and limit headroom here */
err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
- pa->used);
+ origin_ds->ds_dir, pa->used, pa->cr);
if (err)
return (err);
@@ -2849,10 +2978,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
/* move snap name entry */
VERIFY(0 == dsl_dataset_get_snapname(ds));
VERIFY(0 == dsl_dataset_snap_remove(origin_head,
- ds->ds_snapname, tx));
+ ds->ds_snapname, tx, B_TRUE));
VERIFY(0 == zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+ dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE);
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -3090,6 +3220,7 @@ dsl_dataset_promote(const char *name, char *conflsnap)
out:
rw_exit(&dp->dp_config_rwlock);
+ pa.cr = CRED();
/*
* Add in 128x the snapnames zapobj size, since we will be moving
diff --git a/usr/src/uts/common/fs/zfs/dsl_deleg.c b/usr/src/uts/common/fs/zfs/dsl_deleg.c
index ba620bd6fb..5bd59d9732 100644
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -521,7 +522,8 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
* Check if user has requested permission.
*/
int
-dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr,
+ boolean_t do_lock)
{
dsl_dir_t *dd;
dsl_pool_t *dp;
@@ -555,7 +557,8 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
offsetof(perm_set_t, p_node));
- rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if (do_lock)
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
checkflag = ZFS_DELEG_DESCENDENT) {
uint64_t zapobj;
@@ -616,7 +619,8 @@ again:
}
error = EPERM;
success:
- rw_exit(&dp->dp_config_rwlock);
+ if (do_lock)
+ rw_exit(&dp->dp_config_rwlock);
cookie = NULL;
while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -635,7 +639,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
if (error)
return (error);
- error = dsl_deleg_access_impl(ds, perm, cr);
+ error = dsl_deleg_access_impl(ds, perm, cr, B_TRUE);
dsl_dataset_rele(ds, FTAG);
return (error);
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index e7e11dc296..a00be0ebe3 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -38,12 +39,89 @@
#include <sys/arc.h>
#include <sys/sunddi.h>
#include <sys/zfs_zone.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The count of filesystems and snapshots is stored in the dsl_dir_phys_t which
+ * impacts the on-disk format. As such, this capability is controlled by a
+ * feature flag and must be enabled to be used. Once enabled, the feature is
+ * not active until the first limit is set. At that point, future operations to
+ * create/destroy filesystems or snapshots will validate and update the counts.
+ *
+ * Because the on-disk counts will be uninitialized (0) before the feature is
+ * active, the counts are updated when a limit is first set on an uninitialized
+ * node (The filesystem/snapshot counts on a node includes all of the nested
+ * filesystems/snapshots, plus the node itself. Thus, a new leaf node has a
+ * filesystem count of 1 and a snapshot count of 0. A filesystem count of 0 on
+ * a node indicates uninitialized counts on that node.) When setting a limit on
+ * an uninitialized node, the code starts at the filesystem with the new limit
+ * and descends into all sub-filesystems and updates the counts to be accurate.
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized (0), unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem into a different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem limit is validated by dsl_dir_fscount_check() and updated by
+ * dsl_dir_fscount_adjust(). The snapshot limit is validated by
+ * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
+ * A new limit value is validated in dsl_dir_validate_fs_ss_limit() and the
+ * filesystem counts are adjusted, if necessary, by dsl_dir_set_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ */
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
uint64_t value, dmu_tx_t *tx);
+extern dsl_syncfunc_t dsl_prop_set_sync;
+
/* ARGSUSED */
static void
dsl_dir_evict(dmu_buf_t *db, void *arg)
@@ -407,6 +485,391 @@ dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
+/*
+ * Check if the counts are already valid for this filesystem and its
+ * descendants. The counts on this filesystem, and those below, may be
+ * uninitialized due to either the use of a pre-existing pool which did not
+ * support the filesystem/snapshot limit feature, or one in which the feature
+ * had not yet been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a limit set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * have been updated to be correct, so we can skip this filesystem.
+ */
+static int
+dsl_dir_set_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t *fscnt,
+ uint64_t *sscnt)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ uint64_t curr_ss_cnt;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ int err;
+ int ret = 0;
+ boolean_t limit_set = B_FALSE;
+ uint64_t fslimit, sslimit;
+ dsl_dataset_t *ds;
+
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
+ 8, 1, &fslimit, NULL, B_FALSE);
+ if (err == 0 && fslimit != UINT64_MAX)
+ limit_set = B_TRUE;
+
+ if (!limit_set) {
+ err = dsl_prop_get_dd(dd,
+ zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT), 8, 1, &sslimit,
+ NULL, B_FALSE);
+ if (err == 0 && sslimit != UINT64_MAX)
+ limit_set = B_TRUE;
+ }
+
+ /*
+ * If the dd has a limit, we know its count is already good and we
+ * don't need to recurse down any further.
+ */
+ if (limit_set) {
+ *fscnt = dd->dd_phys->dd_filesystem_count;
+ *sscnt = dd->dd_phys->dd_snapshot_count;
+ return (ret);
+ }
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ mutex_enter(&dd->dd_lock);
+
+ /* Iterate datasets */
+ for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t chld_fs_cnt = 0;
+ uint64_t chld_ss_cnt = 0;
+
+ if (dsl_dir_open_obj(dd->dd_pool,
+ ZFS_DIRENT_OBJ(za->za_first_integer), NULL, FTAG,
+ &chld_dd)) {
+ ret = 1;
+ break;
+ }
+
+ if (dsl_dir_set_fs_ss_count(chld_dd, tx, &chld_fs_cnt,
+ &chld_ss_cnt)) {
+ ret = 1;
+ break;
+ }
+
+ dsl_dir_close(chld_dd, FTAG);
+
+ my_fs_cnt += chld_fs_cnt;
+ my_ss_cnt += chld_ss_cnt;
+ }
+ zap_cursor_fini(zc);
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* Count snapshots */
+ if (dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+ FTAG, &ds) == 0) {
+ if (zap_count(os, ds->ds_phys->ds_snapnames_zapobj,
+ &curr_ss_cnt) == 0)
+ my_ss_cnt += curr_ss_cnt;
+ else
+ ret = 1;
+ dsl_dataset_rele(ds, FTAG);
+ } else {
+ ret = 1;
+ }
+
+ /* Add 1 for self */
+ my_fs_cnt++;
+
+ /* save updated counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_filesystem_count = my_fs_cnt;
+ dd->dd_phys->dd_snapshot_count = my_ss_cnt;
+
+ mutex_exit(&dd->dd_lock);
+
+ /* Return child dataset count plus self */
+ *fscnt = my_fs_cnt;
+ *sscnt = my_ss_cnt;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+fs_ss_limit_feat_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+fs_ss_limit_feat_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ zfeature_info_t *limit_feat =
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
+ spa_feature_incr(spa, limit_feat, tx);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * If setting a limit, ensure the on-disk counts are valid.
+ *
+ * We do not validate the new limit, since users who can change the limit are
+ * also allowed to exceed the limit.
+ *
+ * Return -1 to force the zfs_set_prop_nvlist code down the default path to set
+ * the value in the nvlist.
+ */
+int
+dsl_dir_validate_fs_ss_limit(const char *ddname, uint64_t limit,
+ zfs_prop_t ptype)
+{
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ int err;
+ dmu_tx_t *tx;
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ uint64_t curr_limit;
+ spa_t *spa;
+ zfeature_info_t *limit_feat =
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
+ if ((err = dsl_dataset_hold(ddname, FTAG, &ds)) != 0)
+ return (err);
+
+ spa = dsl_dataset_get_spa(ds);
+ if (!spa_feature_is_enabled(spa,
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT])) {
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOTSUP);
+ }
+
+ dd = ds->ds_dir;
+
+ if ((err = dsl_prop_get_dd(dd, zfs_prop_to_name(ptype), 8, 1,
+ &curr_limit, NULL, B_FALSE)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ if (limit == UINT64_MAX) {
+ /*
+ * If we had a limit, since we're now removing that limit, this
+ * is where we could decrement the feature-active counter so
+ * that the feature becomes inactive (only enabled) if we
+ * remove the last limit. However, we do not currently support
+ * deactivating the feature.
+ */
+ dsl_dataset_rele(ds, FTAG);
+ return (-1);
+ }
+
+ if (!spa_feature_is_active(spa, limit_feat)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We can't update the MOS in open context, so create a sync
+ * task.
+ */
+ err = dsl_sync_task_do(dd->dd_pool, fs_ss_limit_feat_check,
+ fs_ss_limit_feat_sync, spa, (void *)1, 0);
+ if (err != 0)
+ return (err);
+ }
+
+ tx = dmu_tx_create_dd(dd);
+ if (dmu_tx_assign(tx, TXG_WAIT)) {
+ dmu_tx_abort(tx);
+ dsl_dataset_rele(ds, FTAG);
+ return (ENOSPC);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem, we need
+ * to ensure the counts are correct. Descend down the tree from this
+ * point and update all of the counts to be accurate.
+ */
+ err = -1;
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ if (dsl_dir_set_fs_ss_count(dd, tx, &my_fs_cnt, &my_ss_cnt))
+ err = ENOSPC;
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ dmu_tx_commit(tx);
+ dsl_dataset_rele(ds, FTAG);
+
+ return (err);
+}
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified.
+ */
+int
+dsl_secpolicy_write_prop(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+ int err = 0;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+
+#ifdef _KERNEL
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+ return (EPERM);
+
+ if (secpolicy_zfs(cr) == 0)
+ return (0);
+#endif
+
+ if ((obj = dd->dd_phys->dd_head_dataset_obj) == NULL)
+ return (ENOENT);
+
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
+ if ((err = dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)) != 0)
+ return (err);
+
+ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ err = EPERM;
+ } else {
+ err = dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr,
+ B_FALSE);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits. Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_dir_fscount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor,
+ cred_t *cr)
+{
+ uint64_t limit;
+ int err = 0;
+
+ VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+
+ /* If we're allowed to change the limit, don't enforce the limit. */
+ if (dsl_secpolicy_write_prop(dd, ZFS_PROP_FILESYSTEM_LIMIT, cr) == 0)
+ return (0);
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we won't touch this node's counts.
+ */
+ if (dd->dd_phys->dd_filesystem_count == 0)
+ return (0);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
+ 8, 1, &limit, NULL, B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a fs limit which we've hit? */
+ if ((dd->dd_phys->dd_filesystem_count + cnt) > limit)
+ return (EDQUOT);
+
+ if (dd->dd_parent != NULL)
+ err = dsl_dir_fscount_check(dd->dd_parent, cnt, ancestor, cr);
+
+ return (err);
+}
+
+/*
+ * Adjust the filesystem count for the specified dsl_dir_t and all parent
+ * filesystems. When a new filesystem is created, increment the count on all
+ * parents, and when a filesystem is destroyed, decrement the count.
+ */
+void
+dsl_dir_fscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
+ boolean_t first)
+{
+ if (first) {
+ VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ VERIFY(dmu_tx_is_syncing(tx));
+ }
+
+ /*
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We don't count this temporary
+ * clone, whose name begins with a '%'.
+ */
+ if (dd->dd_myname[0] == '%')
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (dd->dd_phys->dd_filesystem_count == 0)
+ return;
+
+ /*
+ * On initial entry we need to check if this feature is active, but
+ * we don't want to re-check this on each recursive call. Note: the
+ * feature cannot be active if it's not enabled. If the feature is not
+ * active, don't touch the on-disk count fields.
+ */
+ if (first) {
+ zfeature_info_t *quota_feat =
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
+ if (!spa_feature_is_active(dd->dd_pool->dp_spa, quota_feat))
+ return;
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+
+ dd->dd_phys->dd_filesystem_count += delta;
+ VERIFY(dd->dd_phys->dd_filesystem_count >= 1); /* ourself is 1 */
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_dir_fscount_adjust(dd->dd_parent, tx, delta, B_FALSE);
+
+ mutex_exit(&dd->dd_lock);
+}
+
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
dmu_tx_t *tx)
@@ -415,6 +878,9 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
uint64_t ddobj;
dsl_dir_phys_t *ddphys;
dmu_buf_t *dbuf;
+ zfeature_info_t *limit_feat =
+ &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
@@ -431,6 +897,9 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
ddphys = dbuf->db_data;
ddphys->dd_creation_time = gethrestime_sec();
+ /* Only initialize the count if the limit feature is active */
+ if (spa_feature_is_active(dp->dp_spa, limit_feat))
+ ddphys->dd_filesystem_count = 1;
if (pds)
ddphys->dd_parent_obj = pds->dd_object;
ddphys->dd_props_zapobj = zap_create(mos,
@@ -488,6 +957,16 @@ dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
/*
+ * Decrement the filesystem count for all parent filesystems.
+ *
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We never count this temporary
+ * clone, whose name begins with a '%'.
+ */
+ if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+ dsl_dir_fscount_adjust(dd->dd_parent, tx, -1, B_TRUE);
+
+ /*
* Remove our reservation. The impl() routine avoids setting the
* actual property, which would require the (already destroyed) ds.
*/
@@ -1036,8 +1515,6 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (err);
}
-extern dsl_syncfunc_t dsl_prop_set_sync;
-
static void
dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
@@ -1238,6 +1715,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
struct renamearg {
dsl_dir_t *newparent;
const char *mynewname;
+ cred_t *cr;
};
static int
@@ -1278,8 +1756,22 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
return (EINVAL);
if (err = dsl_dir_transfer_possible(dd->dd_parent,
- ra->newparent, myspace))
+ ra->newparent, dd, myspace, ra->cr))
return (err);
+
+ if (dd->dd_phys->dd_filesystem_count == 0 &&
+ dmu_tx_is_syncing(tx)) {
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ /*
+ * Ensure this portion of the tree's counts have been
+ * initialized in case the new parent has limits set.
+ */
+ err = dsl_dir_set_fs_ss_count(dd, tx, &fs_cnt, &ss_cnt);
+ if (err)
+ return (EIO);
+ }
}
return (0);
@@ -1303,6 +1795,20 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
"-> %s/%s", namebuf, ra->mynewname);
if (ra->newparent != dd->dd_parent) {
+ int cnt;
+
+ mutex_enter(&dd->dd_lock);
+
+ cnt = dd->dd_phys->dd_filesystem_count;
+ dsl_dir_fscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
+ dsl_dir_fscount_adjust(ra->newparent, tx, cnt, B_TRUE);
+
+ cnt = dd->dd_phys->dd_snapshot_count;
+ dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
+ dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
+
+ mutex_exit(&dd->dd_lock);
+
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-dd->dd_phys->dd_used_bytes,
-dd->dd_phys->dd_compressed_bytes,
@@ -1366,6 +1872,8 @@ dsl_dir_rename(dsl_dir_t *dd, const char *newname)
goto out;
}
+ ra.cr = CRED();
+
err = dsl_sync_task_do(dd->dd_pool,
dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
@@ -1375,11 +1883,13 @@ out:
}
int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
+ uint64_t space, cred_t *cr)
{
dsl_dir_t *ancestor;
int64_t adelta;
uint64_t avail;
+ int err;
ancestor = closest_common_ancestor(sdd, tdd);
adelta = would_change(sdd, -space, ancestor);
@@ -1387,6 +1897,17 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
if (avail < space)
return (ENOSPC);
+ if (sdd != moving_dd) {
+ err = dsl_dir_fscount_check(tdd,
+ moving_dd->dd_phys->dd_filesystem_count, ancestor, cr);
+ if (err != 0)
+ return (err);
+ }
+ err = dsl_snapcount_check(tdd, moving_dd->dd_phys->dd_snapshot_count,
+ ancestor, cr);
+ if (err != 0)
+ return (err);
+
return (0);
}
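Putting the dsl_dir.c changes together from an administrator's point of view (hypothetical names; a sketch of the intended behavior, not captured output):

    # setting the first limit on an existing subtree walks it once to initialize
    # dd_filesystem_count / dd_snapshot_count
    zfs set filesystem_limit=3 tank/zones/z1
    zfs create tank/zones/z1/a
    zfs create tank/zones/z1/b    # third filesystem counting z1 itself; at the limit
    zfs create tank/zones/z1/c    # fails with EDQUOT for a delegated user, but is
                                  # allowed for a user who may change the limit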
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 272c3ecde2..b784a385fb 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -218,7 +218,8 @@ int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
dsl_checkfunc_t dsl_dataset_destroy_check;
dsl_syncfunc_t dsl_dataset_destroy_sync;
dsl_syncfunc_t dsl_dataset_user_hold_sync;
-int dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *, dmu_tx_t *tx);
+int dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *, uint64_t,
+ dmu_tx_t *tx, cred_t *);
void dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *, dmu_tx_t *tx);
int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
int dsl_dataset_promote(const char *name, char *conflsnap);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h b/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h
index 5842639aaf..10f9bddaca 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -65,7 +65,8 @@ extern "C" {
int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
-int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr,
+ boolean_t);
void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 2191635dd8..014c1be250 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -70,7 +71,11 @@ typedef struct dsl_dir_phys {
uint64_t dd_flags;
uint64_t dd_used_breakdown[DD_USED_NUM];
uint64_t dd_clones; /* dsl_dir objects */
- uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
+
+ uint64_t dd_filesystem_count;
+ uint64_t dd_snapshot_count;
+
+ uint64_t dd_pad[11]; /* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;
struct dsl_dir {
@@ -131,8 +136,15 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
uint64_t quota);
int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
uint64_t reservation);
+int dsl_dir_validate_fs_ss_limit(const char *, uint64_t, zfs_prop_t);
+int dsl_secpolicy_write_prop(dsl_dir_t *, zfs_prop_t, cred_t *);
+int dsl_dir_fscount_check(dsl_dir_t *, uint64_t, dsl_dir_t *, cred_t *);
+void dsl_dir_fscount_adjust(dsl_dir_t *, dmu_tx_t *, int64_t, boolean_t);
+int dsl_snapcount_check(dsl_dir_t *, uint64_t, dsl_dir_t *, cred_t *);
+void dsl_snapcount_adjust(dsl_dir_t *, dmu_tx_t *, int64_t, boolean_t);
int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ dsl_dir_t *moving_dd, uint64_t space, cred_t *);
int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 48843bfb28..693e0e7264 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -477,7 +477,7 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
if (error == 0) {
error = secpolicy_zfs(cr);
if (error)
- error = dsl_deleg_access_impl(ds, perm, cr);
+ error = dsl_deleg_access_impl(ds, perm, cr, B_TRUE);
}
dsl_dataset_rele(ds, FTAG);
@@ -494,7 +494,7 @@ zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
if (error == 0) {
error = secpolicy_zfs(cr);
if (error)
- error = dsl_deleg_access_impl(ds, perm, cr);
+ error = dsl_deleg_access_impl(ds, perm, cr, B_TRUE);
}
return (error);
}
@@ -616,12 +616,14 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
break;
case ZFS_PROP_QUOTA:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
if (!INGLOBALZONE(curproc)) {
uint64_t zoned;
char setpoint[MAXNAMELEN];
/*
* Unprivileged users are allowed to modify the
- * quota on things *under* (ie. contained by)
+ * limit on things *under* (ie. contained by)
* the thing they own.
*/
if (dsl_prop_get_integer(dsname, "zoned", &zoned,
@@ -2383,6 +2385,14 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
case ZFS_PROP_REFQUOTA:
err = dsl_dataset_set_quota(dsname, source, intval);
break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ err = dsl_dir_validate_fs_ss_limit(dsname, intval,
+ ZFS_PROP_FILESYSTEM_LIMIT);
+ break;
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ err = dsl_dir_validate_fs_ss_limit(dsname, intval,
+ ZFS_PROP_SNAPSHOT_LIMIT);
+ break;
case ZFS_PROP_RESERVATION:
err = dsl_dir_set_reservation(dsname, source, intval);
break;
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 8639a2b8db..e1247350ed 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -138,6 +138,8 @@ typedef enum {
ZFS_PROP_REFRATIO,
ZFS_PROP_WRITTEN,
ZFS_PROP_CLONES,
+ ZFS_PROP_FILESYSTEM_LIMIT,
+ ZFS_PROP_SNAPSHOT_LIMIT,
ZFS_NUM_PROPS
} zfs_prop_t;