summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2015-07-24 13:34:15 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2015-07-24 13:34:15 +0000
commit431ca10ae7ca970d65f15fe0a1115ee749a97433 (patch)
treec91d28297006e34628e27a39dd6084952d077c0f
parent7be989b3b3d0affc5705ea8b81d4b84ec65d8246 (diff)
downloadillumos-joyent-431ca10ae7ca970d65f15fe0a1115ee749a97433.tar.gz
OS-4495 support cgroups notify_on_release and release_agent
-rw-r--r--manifest3
-rw-r--r--usr/src/lib/brand/lx/Makefile2
-rw-r--r--usr/src/lib/brand/lx/cgrpmgr/Makefile56
-rw-r--r--usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c157
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/mount.c51
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps.h110
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_node.c299
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c361
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c582
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_brand.c18
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_misc.c29
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_brand.h18
12 files changed, 1405 insertions, 281 deletions
diff --git a/manifest b/manifest
index 0cdccb830d..68f3c742e0 100644
--- a/manifest
+++ b/manifest
@@ -5045,11 +5045,12 @@ s usr/lib/brand/lx/64=amd64
d usr/lib/brand/lx/amd64 0755 root bin
f usr/lib/brand/lx/amd64/lx_librtld_db.so.1 0755 root root
f usr/lib/brand/lx/amd64/lx_vdso.so.1 0755 root root
+f usr/lib/brand/lx/cgrpmgr 0755 root root
+f usr/lib/brand/lx/etc_default_nfs 0444 root root
d usr/lib/brand/lx/ld 0755 root root
f usr/lib/brand/lx/ld/ld.config 0755 root root
d usr/lib/brand/lx/ld/64 0755 root root
f usr/lib/brand/lx/ld/64/ld.config 0755 root root
-f usr/lib/brand/lx/etc_default_nfs 0444 root root
f usr/lib/brand/lx/ltp_skiplist 0444 root root
f usr/lib/brand/lx/ltp_tests 0444 root root
f usr/lib/brand/lx/lx_boot 0755 root root
diff --git a/usr/src/lib/brand/lx/Makefile b/usr/src/lib/brand/lx/Makefile
index 2c5a373e25..67f2926305 100644
--- a/usr/src/lib/brand/lx/Makefile
+++ b/usr/src/lib/brand/lx/Makefile
@@ -33,7 +33,7 @@ include Makefile.lx
.PARALLEL:
SUBDIRS= cmd librtld_db lx_support lx_init lx_brand netfiles \
- zone lx_vdso testing .WAIT
+ zone lx_vdso cgrpmgr testing .WAIT
MSGSUBDIRS= lx_brand lx_support zone
all := TARGET= all
diff --git a/usr/src/lib/brand/lx/cgrpmgr/Makefile b/usr/src/lib/brand/lx/cgrpmgr/Makefile
new file mode 100644
index 0000000000..26aa079d63
--- /dev/null
+++ b/usr/src/lib/brand/lx/cgrpmgr/Makefile
@@ -0,0 +1,56 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+PROG = cgrpmgr
+
+PROG_OBJS = cgrpmgr.o
+
+OBJS = $(PROG_OBJS)
+SRCS = $(PROG_OBJS:%.o=%.c)
+
+all: $(PROG)
+
+include ../Makefile.lx
+include $(SRC)/cmd/Makefile.cmd
+include $(SRC)/cmd/Makefile.ctf
+
+# override the install directory
+ROOTBIN = $(ROOTBRANDDIR)
+CLOBBERFILES = $(OBJS) $(ROOTPROG)
+
+UTSBASE = $(SRC)/uts
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS += -D_REENTRANT -I$(UTSBASE)/common/brand/lx/cgroups
+LDLIBS +=
+
+.KEEP_STATE:
+
+install: all $(ROOTPROG)
+
+clean:
+ $(RM) $(PROG) $(OBJS)
+
+lint: lint_PROG lint_SRCS
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+%.o: %.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+include $(SRC)/cmd/Makefile.targ
diff --git a/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c
new file mode 100644
index 0000000000..cbbe56e747
--- /dev/null
+++ b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c
@@ -0,0 +1,157 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * The cgrpmgr is a user-level daemon process associated with a specific cgroup
+ * fs mount. It's only job is to run the release_agent when a cgroup becomes
+ * empty and notify_on_release is enabled.
+ */
+
+#include <stdarg.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <cgrps.h>
+
+static void
+run_agent(char *agent, char *arg)
+{
+ char *argv[3];
+ char *cmdp;
+
+ /*
+ * The parent does nothing.
+ */
+ if (fork() != 0)
+ return;
+
+ /*
+ * Child - run the agent.
+ */
+ (void) setsid();
+
+ cmdp = strrchr(agent, '/');
+ if (cmdp == NULL) {
+ cmdp = agent;
+ } else {
+ cmdp++;
+ }
+
+ argv[0] = cmdp;
+ argv[1] = arg;
+ argv[2] = NULL;
+
+ execv(agent, argv);
+ /* Nothing can be done if the exec fails */
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ int res;
+ sigset_t set, oset;
+ struct statvfs sb;
+ char rel_agent[MAXPATHLEN];
+ char cgrp_path[MAXPATHLEN];
+ cgrpmgr_info_t cgmi;
+
+ /*
+ * Start by daemonizing ourself.
+ */
+
+ /* Close all open fd's */
+ closefrom(0);
+
+ clearenv();
+
+ /*
+ * Block all signals except SIGCHLD since we don't want this code to
+ * respond to any signal (except, of course, the ones we can't block).
+ * By setting the SIGCHLD disposition to ignore our children will
+ * automatically be reaped.
+ */
+ (void) sigfillset(&set);
+ (void) sigdelset(&set, SIGCHLD);
+ (void) sigdelset(&set, SIGABRT);
+ (void) sigprocmask(SIG_BLOCK, &set, &oset);
+ (void) signal(SIGCHLD, SIG_IGN);
+
+ switch (fork1()) {
+ case -1: /* uh-oh */
+ exit(1);
+
+ case 0: /* child */
+ break;
+
+ default: /* parent */
+ exit(0);
+ }
+
+ (void) setsid();
+ (void) umask(0077);
+ (void) chdir("/");
+
+ if ((fd = open(argv[1], O_RDONLY)) < 0)
+ exit(1);
+
+ /*
+ * Sanity check the mount point we got.
+ */
+ if (fstatvfs(fd, &sb) < 0 || strcmp(sb.f_basetype, "lx_cgroup") != 0)
+ exit(1);
+
+ cgmi.cgmi_pid = getpid();
+ cgmi.cgmi_rel_agent_path = rel_agent;
+ cgmi.cgmi_cgroup_path = cgrp_path;
+
+ /*
+ * Now wait for and run the release agent each time we return from the
+ * ioctl. An error return indicates the fs has been unmounted and we
+ * should exit.
+ */
+ for (;;) {
+ /*
+ * Block in the kernel until a cgroup becomes empty.
+ */
+ res = ioctl(fd, CGRPFS_GETEVNT, &cgmi);
+
+ /*
+ * EIO indicates we should quit but any other error implies
+ * we did something wrong (which means a bug), so simply
+ * terminate on any error.
+ */
+ if (res != 0) {
+ if (errno == EIO)
+ exit(0);
+ abort();
+ }
+
+ run_agent(rel_agent, cgrp_path);
+ }
+
+ return (0);
+}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/mount.c b/usr/src/lib/brand/lx/lx_brand/common/mount.c
index aca92ed587..406c960dc1 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/mount.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/mount.c
@@ -40,6 +40,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
+#include <stdlib.h>
#include <sys/lx_autofs.h>
#include <sys/lx_debug.h>
@@ -600,6 +601,32 @@ i_make_nfs_args(lx_nfs_mount_data_t *lx_nmd, struct nfs_args *nfs_args,
return (0);
}
+static int
+run_cgrp_mgr(char *mntpnt)
+{
+ const char *cmd = "/native/usr/lib/brand/lx/cgrpmgr";
+ char *argv[] = { "cgrpmgr", NULL, NULL };
+
+ argv[1] = mntpnt;
+
+ switch (fork1()) {
+ case 0:
+ /* child */
+ execv(cmd, argv);
+ exit(1);
+ break;
+
+ case -1:
+ return (-1);
+
+ default:
+ /* the cgroup manager process runs until we unmount */
+ break;
+ }
+
+ return (0);
+}
+
long
lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
uintptr_t p5)
@@ -616,6 +643,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
char target[MAXPATHLEN];
char fstype[MAXPATHLEN], options[MAX_MNTOPT_STR];
int sflags, rv;
+ long res;
+ boolean_t is_cgrp = B_FALSE;
/* Variables needed for nfs mounts. */
lx_nfs_mount_data_t lx_nmd;
@@ -752,6 +781,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
}
lx_debug("\tlinux mount options: \"%s\"", options);
+ is_cgrp = B_TRUE;
+
/*
* Currently don't verify Linux mount options since we can
 * have a subsystem string provided.
@@ -885,8 +916,24 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
lx_debug("\tsolaris mount fstype: %s", fstype);
lx_debug("\tsolaris mount options: \"%s\"", options);
- return (mount(source, target, sflags, fstype, sdataptr, sdatalen,
- options, sizeof (options)) ? -errno : 0);
+ res = mount(source, target, sflags, fstype, sdataptr, sdatalen,
+ options, sizeof (options));
+
+ if (res == 0) {
+ if (is_cgrp && run_cgrp_mgr(target) != 0) {
+ /*
+ * Forking the cgrp manager failed, unmount and return
+ * an ENOMEM error as the best approximation that we're
+ * out of resources.
+ */
+ (void) umount(target);
+ return (-ENOMEM);
+ } else {
+ return (0);
+ }
+ } else {
+ return (-errno);
+ }
}
/*
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
index f0fab9f904..cfbeb2796c 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps.h
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
@@ -46,11 +46,36 @@ extern "C" {
#include <sys/atomic.h>
#include <vm/anon.h>
+/*
+ * cgrpmgr ioctl interface.
+ */
+#define CGRPFS_IOC ('C' << 16 | 'G' << 8)
+#define CGRPFS_GETEVNT (CGRPFS_IOC | 1)
+
+typedef struct cgrpmgr_info {
+ pid_t cgmi_pid;
+ char *cgmi_rel_agent_path;
+ char *cgmi_cgroup_path;
+} cgrpmgr_info_t;
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+typedef struct cgrpmgr_info32 {
+ pid_t cgmi_pid;
+ caddr32_t cgmi_rel_agent_path;
+ caddr32_t cgmi_cgroup_path;
+} cgrpmgr_info32_t;
+
+typedef struct cgrp_evnt {
+ list_node_t cg_evnt_lst;
+ char *cg_evnt_path;
+} cgrp_evnt_t;
+
#define CG_PSNSIZE 256 /* max size of pseudo file name entries */
#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */
-#define CG_START_ID 0 /* initial node ID for allocation */
-
/*
* The order of these entries must be in sync with the cg_ssde_dir array.
*/
@@ -61,8 +86,10 @@ typedef enum cgrp_ssid {
typedef enum cgrp_nodetype {
CG_CGROUP_DIR = 1, /* cgroup directory entry */
- CG_PROCS,
- CG_TASKS,
+ CG_NOTIFY, /* notify_on_release file */
+ CG_PROCS, /* cgroup.procs file */
+ CG_REL_AGENT, /* release_agent file */
+ CG_TASKS, /* tasks file */
} cgrp_nodetype_t;
typedef struct cgrp_subsys_dirent {
@@ -70,10 +97,19 @@ typedef struct cgrp_subsys_dirent {
char *cgrp_ssd_name;
} cgrp_subsys_dirent_t;
+#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2)
+
+/*
+ * A modern systemd-based Linux system typically has 50-60 cgroups so
+ * we size the hash for 2x that number.
+ */
+#define CGRP_HASH_SZ 128
+
/*
* cgroups per-mount data structure.
*
- * All fields are protected by cg_contents.
+ * All but the event related fields are protected by cg_contents.
+ * The evnt_list and counter are protected by cg_events.
*/
typedef struct cgrp_mnt {
struct vfs *cg_vfsp; /* filesystem's vfs struct */
@@ -82,45 +118,45 @@ typedef struct cgrp_mnt {
cgrp_ssid_t cg_ssid; /* subsystem type */
dev_t cg_dev; /* unique dev # of mounted `device' */
uint_t cg_gen; /* node ID source for files */
- kmutex_t cg_contents; /* lock for cgrp_mnt structure */
- kmutex_t cg_renamelck; /* rename lock for this mount */
+ uint_t cg_grp_gen; /* ID source for cgroups */
+ kmutex_t cg_contents; /* global lock for most fs activity */
+ char cg_agent[MAXPATHLEN + 1]; /* release_agent path */
+ pid_t cg_mgrpid; /* pid of user-level manager */
+ kmutex_t cg_events; /* lock for event list */
+ kcondvar_t cg_evnt_cv; /* condvar for event list wakeup */
+ int cg_evnt_cnt; /* counter for num events in list */
+ list_t cg_evnt_list; /* list of agent events */
+ /* ptr to zone data for containing zone */
+ lx_zone_data_t *cg_lxzdata;
+ struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */
} cgrp_mnt_t;
/*
* cgrp_node is the file system dependent node for cgroups.
*
- * cgn_rwlock protects access of the directory list at cgn_dir
- * as well as syncronizing read and writes to the cgrp_node
- *
- * cgn_contents protects growing, shrinking, reading and writing
- * the file along with cgn_rwlock (see below).
+ * The node is used to represent both directories (a cgroup) and pseudo files
+ * within the directory.
*
- * cgn_tlock protects updates to cgn_mode and cgn_nlink
- *
- * cg_contents in the cgrp_mount data structure protects
- * cgn_forw and cgn_back which are used to maintain a linked
- * list of all cgroup files associated with that file system
- *
- * The ordering of the locking is:
- * cg_rwlock -> cgn_contents
- *
- * cgn_tlock doesn't require any cgrp_node locks
+ * Members are tagged in the comment to note which type of node they apply to:
+ * A - all
+ * D - dir (i.e. a cgroup)
+ * F - pseudo file
*/
typedef struct cgrp_node {
- struct cgrp_node *cgn_back; /* lnked lst of cgrp_nodes */
- struct cgrp_node *cgn_forw; /* lnked lst of cgrp_nodes */
- struct cgrp_dirent *cgn_dir; /* dirent list */
- struct cgrp_node *cgn_parent; /* dir containing this node */
- uint_t cgn_dirents; /* number of dirents */
- cgrp_nodetype_t cgn_type; /* type for this node */
- struct vnode *cgn_vnode; /* vnode for this cgrp_node */
- int cgn_id; /* ID number for the cgroup */
- struct vattr cgn_attr; /* attributes */
- krwlock_t cgn_contents; /* serialize mods */
- krwlock_t cgn_rwlock; /* rw - serialize */
- /* mods and dir updates */
- kmutex_t cgn_tlock; /* time, flag, and nlink lock */
+ struct cgrp_node *cgn_back; /* A lnked lst of cgrp_nodes */
+ struct cgrp_node *cgn_forw; /* A lnked lst of cgrp_nodes */
+ struct cgrp_dirent *cgn_dir; /* D dirent list */
+ struct cgrp_node *cgn_parent; /* A dir containing this node */
+ struct cgrp_node *cgn_next; /* D link in per-mount cgroup */
+ /* hash table */
+ uint_t cgn_dirents; /* D number of dirents */
+ cgrp_nodetype_t cgn_type; /* A type for this node */
+ uint_t cgn_notify; /* D notify_on_release value */
+ uint_t cgn_task_cnt; /* D number of threads in grp */
+ struct vnode *cgn_vnode; /* A vnode for this cgrp_node */
+ uint_t cgn_id; /* D ID number for the cgroup */
+ struct vattr cgn_attr; /* A attributes */
} cgrp_node_t;
/*
@@ -184,6 +220,10 @@ void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *);
int cgrp_taccess(void *, int, cred_t *);
ino_t cgrp_inode(cgrp_nodetype_t, unsigned int);
int cgrp_num_pseudo_ents(cgrp_ssid_t);
+cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t);
+void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *);
+
+#endif /* _KERNEL */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
index 0d153f73c1..8950be1966 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
@@ -36,6 +36,7 @@ static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *, enum de_op);
static cgrp_subsys_dirent_t cgrp_generic_dir[] = {
{ CG_PROCS, "cgroup.procs" },
+ { CG_NOTIFY, "notify_on_release" },
{ CG_TASKS, "tasks" }
};
@@ -165,6 +166,132 @@ cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold,
}
/*
+ * The following functions maintain the per-mount cgroup hash table.
+ */
+static void
+cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ cn->cgn_next = cgm->cg_grp_hash[hsh];
+ cgm->cg_grp_hash[hsh] = cn;
+}
+
+static void
+cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+ cgrp_node_t *np = NULL, *curp, *prevp = NULL;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ if (prevp == NULL) {
+ cgm->cg_grp_hash[hsh] = curp->cgn_next;
+ } else {
+ prevp->cgn_next = curp->cgn_next;
+ }
+ np = curp;
+ np->cgn_next = NULL;
+ break;
+ }
+
+ prevp = curp;
+ }
+
+ ASSERT(np != NULL);
+ ASSERT(np->cgn_task_cnt == 0);
+}
+
+/*
+ * Count up the number of threads already running in the zone and initialize the
+ * first cgroup's task counter.
+ *
+ * We have to look at all of the processes to find applicable ones.
+ */
+static void
+cgrp_cg_hash_init(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ int i;
+ int cnt = 0;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ /* Scan all of the process entries */
+ mutex_enter(&pidlock);
+ for (i = 1; i < v.v_proc; i++) {
+ proc_t *p;
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, system processes,
+ * a PID of 0, the pid for our zsched process, anything the
+ * security policy doesn't allow us to look at, processes that
+ * are not lx-branded, and processes that are not in the zone.
+ */
+ if ((p = pid_entry(i)) == NULL ||
+ p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == schedpid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_zone->zone_id != zoneid) {
+ continue;
+ }
+
+ mutex_enter(&p->p_lock);
+ if (p->p_brand != &lx_brand) {
+ mutex_exit(&p->p_lock);
+ continue;
+ }
+ cnt += p->p_lwpcnt;
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * There should be at least the init process with 1 thread in the zone
+ */
+ ASSERT(cnt > 0);
+ cn->cgn_task_cnt = cnt;
+
+ DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt);
+
+ mutex_exit(&pidlock);
+}
+
+cgrp_node_t *
+cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid)
+{
+ int hsh = cgid % CGRP_HASH_SZ;
+ cgrp_node_t *curp;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ return (curp);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
* Calculate an inode number
*
* This takes various bits of info and munges them to give the inode number for
@@ -217,9 +344,6 @@ cgrp_taccess(void *vcp, int mode, cred_t *cred)
/*
* Search directory 'parent' for entry 'name'.
*
- * The calling thread can't hold the write version
- * of the rwlock for the directory being searched
- *
* 0 is returned on success and *foundcp points
* to the found cgrp_node with its vnode held.
*/
@@ -227,8 +351,10 @@ int
cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp,
cred_t *cred)
{
+ cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode);
int error;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
*foundcp = NULL;
if (parent->cgn_type != CG_CGROUP_DIR)
return (ENOTDIR);
@@ -280,10 +406,7 @@ cgrp_direnter(
int error = 0;
char *s;
- /*
- * cgn_rwlock is held to serialize direnter and dirdeletes
- */
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
/*
@@ -302,23 +425,15 @@ cgrp_direnter(
* Remember that we can only rename within the same directory.
*/
if (op == DE_RENAME) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- mutex_enter(&cn->cgn_tlock);
if (cn->cgn_nlink == 0) {
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
return (ENOENT);
}
if (cn->cgn_nlink == MAXLINK) {
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
return (EMLINK);
}
cn->cgn_nlink++;
gethrestime(&cn->cgn_ctime);
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
}
/*
@@ -342,7 +457,9 @@ cgrp_direnter(
if (cdp) {
ASSERT(found != NULL);
error = EEXIST;
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(found);
+ mutex_enter(&cgm->cg_contents);
} else {
/*
@@ -358,6 +475,13 @@ cgrp_direnter(
error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred);
if (error)
goto out;
+
+ if (op == DE_MKDIR) {
+ /*
+ * inherit notify_on_release value from parent
+ */
+ cn->cgn_notify = dir->cgn_notify;
+ }
}
error = cgrp_diraddentry(dir, cn, name, op);
@@ -366,7 +490,6 @@ cgrp_direnter(
/*
* Unmake the inode we just made.
*/
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
if ((cn->cgn_type) == CG_CGROUP_DIR) {
ASSERT(cdp == NULL);
/*
@@ -374,25 +497,26 @@ cgrp_direnter(
*/
cgrp_dirtrunc(cn);
}
- mutex_enter(&cn->cgn_tlock);
cn->cgn_nlink = 0;
- mutex_exit(&cn->cgn_tlock);
gethrestime(&cn->cgn_ctime);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
cn = NULL;
}
} else if (cnp) {
*cnp = cn;
} else if (op == DE_CREATE || op == DE_MKDIR) {
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
}
}
out:
if (error && op == DE_RENAME) {
/* Undo bumped link count. */
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
gethrestime(&cn->cgn_ctime);
}
return (error);
@@ -410,17 +534,17 @@ int
cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
cred_t *cred)
{
+ cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode);
cgrp_dirent_t *cndp;
int error;
size_t namelen;
cgrp_node_t *cnnp;
timestruc_t now;
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
if (nm[0] == '\0')
- panic("cgrp_dirdelete: NULL name for 0x%p", (void *)cn);
+ panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn);
/*
* return error when removing . and ..
@@ -465,32 +589,21 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
nextp = cdp->cgd_next;
cgnode_hold(pseudo_node);
- rw_enter(&pseudo_node->cgn_rwlock, RW_WRITER);
error = cgrp_dirdelete(cn, pseudo_node,
cdp->cgd_name, DR_REMOVE, cred);
- rw_exit(&pseudo_node->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(pseudo_node);
+ mutex_enter(&cgm->cg_contents);
cdp = nextp;
}
- }
- cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
- if (cndp == NULL) {
- /*
- * If it is gone, some other thread got here first!
- * Return error ENOENT.
- */
- return (ENOENT);
+ cgrp_cg_hash_remove(cgm, cn);
}
- /*
- * If the cgrp_node in the cgrp_dirent changed, we were probably
- * the victim of a concurrent rename operation. The original
- * is gone, so return that status.
- */
- if (cn != cnnp)
- return (ENOENT);
+ cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
+ VERIFY(cndp != NULL);
+ VERIFY(cn == cnnp);
cgrp_hash_out(cndp);
@@ -527,7 +640,7 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
cn->cgn_ctime = now;
ASSERT(cn->cgn_nlink > 0);
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) {
cgrp_dirtrunc(cn);
ASSERT(cn->cgn_nlink == 0);
@@ -544,10 +657,9 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
struct vnode *vp;
timestruc_t now;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(vap != NULL);
- rw_init(&cn->cgn_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&cn->cgn_tlock, NULL, MUTEX_DEFAULT, NULL);
cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
cn->cgn_mask = 0;
cn->cgn_attr.va_type = vap->va_type;
@@ -581,15 +693,7 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
vp->v_rdev = vap->va_rdev;
vp->v_data = (caddr_t)cn;
- mutex_enter(&cgm->cg_contents);
-
- /*
- * Set the cgroup ID for this cgrp_node by using a counter on each
- * mount. We also use this value as the directory nodeid (which is used
- * to derive the inode) so each cgroup in the tree will have a unique
- * id (and inode).
- */
- cn->cgn_nodeid = cn->cgn_id = cgm->cg_gen++;
+ cn->cgn_nodeid = cgm->cg_gen++;
/*
* Add new cgrp_node to end of linked list of cgrp_nodes for this
@@ -600,10 +704,38 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
cn->cgn_back = cgm->cg_rootnode->cgn_back;
cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn;
}
- mutex_exit(&cgm->cg_contents);
vn_exists(vp);
}
+void
+cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name,
+ cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr)
+{
+ cgrp_node_t *ncn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, nattr,
+ &ncn, cr, NULL);
+
+ /*
+ * Fix the inode and assign the pseudo file type to be correct.
+ */
+ ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid);
+ ncn->cgn_type = type;
+
+ /*
+ * Since we're creating these entries here and not via the
+ * normal VOP_CREATE code path, we need to do the rele to drop
+ * our hold. This will leave the vnode v_count at 0 when we
+ * come out of cgrp_inactive but we won't reclaim the vnode
+ * there since the cgn_nlink value will still be 1.
+ */
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(ncn);
+ mutex_enter(&cgm->cg_contents);
+}
+
/*
* cgrp_dirinit is used internally to initialize a directory (dir)
* with '.' and '..' entries without checking permissions and locking
@@ -615,19 +747,34 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
{
cgrp_dirent_t *dot, *dotdot;
timestruc_t now;
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dir->cgn_vnode);
+ cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
cgrp_ssde_t *ssdp;
cgrp_subsys_dirent_t *pseudo_files;
struct vattr nattr;
int i;
- ASSERT(RW_WRITE_HELD(&parent->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM);
ssdp = &cg_ssde_dir[cgm->cg_ssid];
/*
+ * If this is the top-level cgroup created by the mount then we need to
+ * count up the number of procs and tasks already running in the zone.
+ */
+
+ /*
+ * Set the cgroup ID for this cgrp_node by using a counter on each
+ * mount.
+ */
+ dir->cgn_id = cgm->cg_grp_gen++;
+ cgrp_cg_hash_insert(cgm, dir);
+ /* Initialize the first cgroup if this is the top-level group */
+ if (parent == dir)
+ cgrp_cg_hash_init(cgm, dir);
+
+ /*
* Initialize the entries
*/
dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP);
@@ -659,7 +806,7 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
dir->cgn_mtime = now;
dir->cgn_ctime = now;
- INCR_COUNT(&parent->cgn_nlink, &parent->cgn_tlock);
+ parent->cgn_nlink++;
parent->cgn_ctime = now;
dir->cgn_dir = dot;
@@ -672,28 +819,20 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
nattr.va_type = VREG;
nattr.va_rdev = 0;
+ /*
+ * If this is the top-level dir in the file system then it always
+ * has a release_agent pseudo file. Only the top-level dir has this
+ * file.
+ */
+ if (parent == dir) {
+ cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr,
+ cr);
+ }
+
pseudo_files = ssdp->cg_ssde_files;
for (i = 0; i < ssdp->cg_ssde_nfiles; i++) {
- cgrp_node_t *ncn;
-
- cgrp_direnter(cgm, dir, pseudo_files[i].cgrp_ssd_name,
- DE_CREATE, (cgrp_node_t *)NULL, &nattr, &ncn, cr, NULL);
-
- /*
- * Fix the inode and assign the pseudo file type to be correct.
- */
- ncn->cgn_nodeid = cgrp_inode(pseudo_files[i].cgrp_ssd_type,
- dir->cgn_nodeid);
- ncn->cgn_type = pseudo_files[i].cgrp_ssd_type;
-
- /*
- * Since we're creating these entries here and not via the
- * normal VOP_CREATE code path, we need to do the rele to drop
- * our hold. This will leave the vnode v_count at 0 when we
- * come out of cgrp_inactive but we won't reclaim the vnode
- * there since the cgn_nlink value will still be 1.
- */
- cgnode_rele(ncn);
+ cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name,
+ pseudo_files[i].cgrp_ssd_type, &nattr, cr);
}
}
@@ -705,8 +844,9 @@ cgrp_dirtrunc(cgrp_node_t *dir)
{
cgrp_dirent_t *cgdp;
timestruc_t now;
+ cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) {
@@ -726,7 +866,7 @@ cgrp_dirtrunc(cgrp_node_t *dir)
*/
cn = cgdp->cgd_cgrp_node;
ASSERT(cn->cgn_nlink > 0);
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
cgrp_hash_out(cgdp);
kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen);
@@ -849,6 +989,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
{
cgrp_node_t *cn;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(va != NULL);
if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
@@ -870,9 +1011,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
if (op == DE_MKDIR) {
cn->cgn_type = CG_CGROUP_DIR;
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
cgrp_dirinit(dir, cn, cred);
- rw_exit(&cn->cgn_rwlock);
}
*newnode = cn;
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
index 8066f184ce..b2ffa02418 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
@@ -31,10 +31,12 @@
* For example, it is common to see cgroup trees (each is its own mount with a
* different subsystem controller) for blkio, cpuset, memory, systemd (has no
* controller), etc. Within each tree there is a top-level directory with at
- * least a cgroup.procs and tasks file listing the processes within that group,
- * although there could be subdirectories, which define new cgroups, that then
- * contain a subset of the processes. Each subdirectory also has, at a minimum,
- * a cgroup.procs and tasks file.
+ * least a cgroup.procs, notify_on_release, release_agent, and tasks file.
+ * The cgroup.procs file lists the processes within that group and the tasks
+ * file lists the threads in the group. There could be subdirectories, which
+ * define new cgroups, that then contain a subset of the processes. Each
+ * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and
+ * tasks file.
*
* Since we're using lx to run user-level code within zones, the majority (all?)
* of the cgroup resource management functionality simply doesn't apply to us.
@@ -45,14 +47,54 @@
* hierarchy and does not report that any resource management controllers are
* available for separate mounts.
*
+ * In addition to the hierarchy, the other important component of cgroups that
+ * is used by systemd is the 'release_agent'. This provides a mechanism to
+ * run a command when a cgroup becomes empty (the last task in the group
+ * leaves, either by exit or move, and there are no more sub-cgroups). The
+ * 'release_agent' file only exists in the top-level cgroup of the mounted
+ * file system and holds the path to a command to run. The 'notify_on_release'
+ * file exists in each cgroup dir. If that file contains a '1' then the agent
+ * is run when that group becomes empty. The agent is passed a path string of
+ * the cgroup, relative to the file system mount point (e.g. a mount on
+ * /sys/fs/cgroups/systemd with a sub-cgroup of foo/bar gets the arg foo/bar).
+ *
+ * Cgroup membership is implemented via hooks into the lx brand code. When
+ * the cgroup file system loads it installs callbacks for:
+ * lx_cgrp_forklwp
+ * lx_cgrp_procexit
+ * lx_cgrp_initlwp
+ * lx_cgrp_freelwp
+ * and when it unloads it clears those hooks. The lx brand code calls those
+ * hooks when a process/lwp starts and when it exits. Internally we use a
+ * simple reference counter (cgn_task_cnt) on the cgroup node to track how many
+ * threads are in the group, so we can tell when a group becomes empty.
+ * To make this quick, a hash table (cg_grp_hash) is maintained on the
+ * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is
+ * sized so that there should typically only be 0 or 1 cgroups per bucket.
+ * We also keep a reference to the file system in the zone-specific brand data
+ * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t
+ * when it runs the hook.
+ *
+ * Once a cgroup becomes empty, running the release agent is actually done
+ * by a user-level cgrpmgr process. That process makes a CGRPFS_GETEVNT
+ * ioctl which blocks until there is an event (i.e. the agent needs to run).
+ * Internally we maintain a list (cg_evnt_list) of release events on
+ * cgrp_mnt_t. The ioctl pulls an event off of the list, or blocks until an
+ * event is available, and then returns the event. The cgrpmgr process is
+ * started by the lx mount emulation when it mounts the file system. The
+ * cgrpmgr will exit when the ioctl returns EIO, indicating that the file
+ * system is being unmounted.
+ *
* This file system is similar to tmpfs in that directories only exist in
* memory. Each subdirectory represents a different cgroup. Within the cgroup
* there are pseudo files (see cg_ssde_dir) with well-defined names which
* control the configuration and behavior of the cgroup (see cgrp_nodetype_t).
- * The primary files within every cgroup are named 'cgroup.procs' and 'tasks'.
- * These are used to control and list which processes/threads belong to the
- * cgroup. In the general case there can be additional files in the cgroup
- * which define additional behavior, although none exists at this time.
+ * The primary files within every cgroup are named 'cgroup.procs',
+ * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the
+ * top-level cgroup). The cgroup.procs and tasks files are used to control and
+ * list which processes/threads belong to the cgroup. In the general case there
 * could be additional files in the cgroup, which define additional behavior
+ * (i.e. subsystem specific pseudo files), although none exist at this time.
*
* Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is
* used to correlate with the threads to determine cgroup membership. When
@@ -69,11 +111,27 @@
* - no file rename, but a directory (i.e. a cgroup) can be renamed within the
* containing directory, but not into a different directory
* - can mkdir and rmdir to create/destroy cgroups
- * - cannot rmdir while it contains a subdir (i.e. a sub-cgroup)
+ * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup)
* - open, read/write, close on the subsytem-specific pseudo files is
* allowed, as this is the interface to configure and report on the cgroup.
* The pseudo file's mode controls write access and cannot be changed.
*
+ * The locking in this file system is simple since the file system is not
+ * subjected to heavy I/O activity and all data is in-memory. There is a single
+ * global mutex for each mount (cg_contents). This mutex is held for the life
+ * of most vnode operations. The most active path is probably the LWP start and
+ * exit hooks which increment/decrement the reference counter on the cgroup
+ * node. The lock is important for this case since we don't want concurrent
+ * activity (such as moving the process into another cgroup) while we're trying
+ * to lookup the cgroup from the mount's hash table. We must be careful to
+ * avoid a deadlock while reading or writing since that code can take pidlock
+ * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of
+ * those is held. To prevent deadlock we always take cg_contents after pidlock
+ * and p_lock.
+ *
+ * In addition to the cg_contents lock there is also a second mutex (cg_events)
+ * used with the event queue condvar (cg_evnt_cv).
+ *
* EXTENDING THE FILE SYSTEM
*
* When adding support for a new subsystem, be sure to also update the
@@ -100,7 +158,8 @@
* list of cgroup IDs associated with every thread, instead of just one ID
* (br_cgroupid). The thread data would need to become a struct which held
* both an ID and an indication as to which mounted cgroup file system instance
- * the ID was associated with.
+ * the ID was associated with. We would also need a list of cgroup mounts per
+ * zone, instead of the current single zone reference.
*/
#include <sys/types.h>
@@ -123,6 +182,8 @@
#include <sys/systm.h>
#include <sys/mntent.h>
#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
#include <sys/lx_brand.h>
#include "cgrps.h"
@@ -131,6 +192,11 @@
static int cgrp_fstype;
static dev_t cgrp_dev;
+#define MAX_AGENT_EVENTS 32 /* max num queued events */
+
+#define UMNT_DELAY_TIME	drv_usectohz(50000)	/* 1/20th of a second */
+#define UMNT_RETRY_MAX	100			/* 100 times - 5 secs */
+
/*
* cgrp_mountcount is used to prevent module unloads while there is still
* state from a former mount hanging around. The filesystem module must not be
@@ -171,6 +237,12 @@ static int cgrp_root(struct vfs *, struct vnode **);
static int cgrp_statvfs(struct vfs *, struct statvfs64 *);
static void cgrp_freevfs(vfs_t *vfsp);
+/* Forward declarations for hooks */
+static void cgrp_proc_fork_helper(vfs_t *, uint_t, pid_t);
+static void cgrp_proc_exit_helper(vfs_t *, uint_t, pid_t);
+static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t);
+static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t);
+
/*
* Loadable module wrapper
*/
@@ -209,6 +281,12 @@ _fini()
if (cgrp_mountcount)
return (EBUSY);
+ /* Disable hooks used by the lx brand module. */
+ lx_cgrp_forklwp = NULL;
+ lx_cgrp_proc_exit = NULL;
+ lx_cgrp_initlwp = NULL;
+ lx_cgrp_freelwp = NULL;
+
if ((error = mod_remove(&modlinkage)) != 0)
return (error);
@@ -282,6 +360,12 @@ cgrp_init(int fstype, char *name)
*/
cgrp_dev = makedevice(dev, 0);
+ /* Install the hooks used by the lx brand module. */
+ lx_cgrp_forklwp = cgrp_proc_fork_helper;
+ lx_cgrp_proc_exit = cgrp_proc_exit_helper;
+ lx_cgrp_initlwp = cgrp_lwp_fork_helper;
+ lx_cgrp_freelwp = cgrp_lwp_exit_helper;
+
return (0);
}
@@ -294,6 +378,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
int error;
struct vattr rattr;
cgrp_ssid_t ssid = CG_SSID_GENERIC;
+ lx_zone_data_t *lxzdata;
if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
return (error);
@@ -309,6 +394,13 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
return (EINVAL);
/*
+ * We currently only support one mount per zone.
+ */
+ lxzdata = ztolxzd(curproc->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL)
+ return (EINVAL);
+
+ /*
* Ensure we don't allow overlaying mounts
*/
mutex_enter(&mvp->v_lock);
@@ -354,10 +446,15 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
/* Set but don't bother entering the mutex (not on mount list yet) */
mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&cgm->cg_events, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&cgm->cg_evnt_cv, NULL, CV_DRIVER, NULL);
- cgm->cg_vfsp = vfsp;
+ cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp;
+ cgm->cg_lxzdata = lxzdata;
cgm->cg_ssid = ssid;
- cgm->cg_gen = CG_START_ID;
+
+ list_create(&cgm->cg_evnt_list, sizeof (cgrp_evnt_t),
+ offsetof(cgrp_evnt_t, cg_evnt_lst));
vfsp->vfs_data = (caddr_t)cgm;
vfsp->vfs_fstype = cgrp_fstype;
@@ -368,15 +465,19 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
(void) strcpy(cgm->cg_mntpath, dpn.pn_path);
+ cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ,
+ KM_SLEEP);
+
/* allocate and initialize root cgrp_node structure */
bzero(&rattr, sizeof (struct vattr));
rattr.va_mode = (mode_t)(S_IFDIR | 0755);
rattr.va_type = VDIR;
rattr.va_rdev = 0;
cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP);
+
+ mutex_enter(&cgm->cg_contents);
cgrp_node_init(cgm, cp, &rattr, cr);
- rw_enter(&cp->cgn_rwlock, RW_WRITER);
CGNTOV(cp)->v_flag |= VROOT;
/*
@@ -393,7 +494,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
cp->cgn_nodeid = cgrp_inode(ssid, cgm->cg_gen);
cgrp_dirinit(cp, cp, cr);
- rw_exit(&cp->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
pn_free(&dpn);
error = 0;
@@ -414,15 +515,20 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
struct vnode *vp;
int error;
uint_t cnt;
+ int retry_cnt = 0;
if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
return (error);
+retry:
mutex_enter(&cgm->cg_contents);
/*
- * In the normal unmount case, if there are no
- * open files, only the root node should have a reference count.
+ * In the normal unmount case, if there were no open files, only the
+ * root node would have a reference count. However, the user-level
+ * agent manager should have the root vnode open and be waiting in
+ * ioctl. We need to wake the manager and it may take some retries
+ * before it closes its file descriptor.
*
* With cg_contents held, nothing can be added or removed.
* There may be some dirty pages. To prevent fsflush from
@@ -432,6 +538,29 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
*/
cgnp = cgm->cg_rootnode;
+ ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL);
+
+ mutex_enter(&cgm->cg_events);
+ cv_signal(&cgm->cg_evnt_cv);
+
+ /*
+ * Delete any queued events (normally there shouldn't be any).
+ */
+ for (;;) {
+ cgrp_evnt_t *evntp;
+
+ evntp = list_remove_head(&cgm->cg_evnt_list);
+ if (evntp == NULL)
+ break;
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+ cgm->cg_evnt_cnt--;
+ }
+
+ /* Set the counter to -1 so an incoming ioctl knows we're unmounting */
+ cgm->cg_evnt_cnt = -1;
+ mutex_exit(&cgm->cg_events);
+
vp = CGNTOV(cgnp);
mutex_enter(&vp->v_lock);
@@ -441,10 +570,16 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
return (EINVAL);
}
+
cnt = vp->v_count;
if (cnt > 1) {
mutex_exit(&vp->v_lock);
mutex_exit(&cgm->cg_contents);
+ /* Likely because the user-level manager hasn't exited yet */
+ if (retry_cnt++ < UMNT_RETRY_MAX) {
+ delay(UMNT_DELAY_TIME);
+ goto retry;
+ }
return (EBUSY);
}
@@ -476,6 +611,11 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
}
}
+ cgm->cg_lxzdata->lxzd_cgroup = NULL;
+ kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ);
+ list_destroy(&cgm->cg_evnt_list);
+ cv_destroy(&cgm->cg_evnt_cv);
+
/*
* We can drop the mutex now because
* no one can find this mount anymore
@@ -519,10 +659,10 @@ cgrp_freevfs(vfs_t *vfsp)
* Remove all directory entries
*/
for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
+ mutex_enter(&cgm->cg_contents);
if (cn->cgn_type == CG_CGROUP_DIR)
cgrp_dirtrunc(cn);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
}
ASSERT(cgm->cg_rootnode);
@@ -571,7 +711,7 @@ cgrp_freevfs(vfs_t *vfsp)
kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1);
mutex_destroy(&cgm->cg_contents);
- mutex_destroy(&cgm->cg_renamelck);
+ mutex_destroy(&cgm->cg_events);
kmem_free(cgm, sizeof (cgrp_mnt_t));
/* Allow _fini() to succeed now */
@@ -676,3 +816,186 @@ cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
sbp->f_namemax = MAXNAMELEN - 1;
return (0);
}
+
+static int
+cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen)
+{
+ cgrp_node_t *parent;
+ cgrp_dirent_t *dp;
+
+ buf[0] = '\0';
+
+ parent = cn->cgn_parent;
+ if (parent == NULL || parent == cn) {
+ (void) strlcpy(buf, ".", blen);
+ return (0);
+ }
+
+ /*
+ * Search the parent dir list to find this cn's name.
+ */
+ for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) {
+ if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) {
+ (void) strlcpy(buf, dp->cgd_name, blen);
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Enqueue an event for the user-level release_agent manager. The event data is the
+ * pathname (relative to the mount point of the file system) of the newly empty
+ * cgroup.
+ */
+void
+cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ cgrp_node_t *parent;
+ char nm[MAXNAMELEN];
+ char *argstr, *oldstr, *tmp;
+ cgrp_evnt_t *evntp;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ /* Nothing to do if the agent is not set */
+ if (cgm->cg_agent[0] == '\0')
+ return;
+
+ parent = cn->cgn_parent;
+ /* Cannot remove the top-level cgroup (only via unmount) */
+ if (parent == cn)
+ return;
+
+ argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ *argstr = '\0';
+
+ /*
+ * Iterate up the directory tree to construct the agent argument string.
+ */
+ do {
+ cgrp_get_dirname(cn, nm, sizeof (nm));
+ DTRACE_PROBE1(cgrp__dir__name, char *, nm);
+ if (*argstr == '\0') {
+ (void) strlcpy(argstr, nm, MAXPATHLEN);
+ } else {
+ tmp = oldstr;
+ oldstr = argstr;
+ argstr = tmp;
+ (void) snprintf(argstr, MAXPATHLEN, "%s/%s", nm,
+ oldstr);
+ }
+
+ if (cn->cgn_parent == NULL)
+ break;
+ cn = cn->cgn_parent;
+ parent = cn->cgn_parent;
+
+ /*
+ * The arg path is relative to the mountpoint so we stop when
+ * we get to the top level.
+ */
+ if (parent == NULL || parent == cn)
+ break;
+ } while (parent != cn);
+
+ kmem_free(oldstr, MAXPATHLEN);
+
+ DTRACE_PROBE1(cgrp__agent__event, char *, argstr);
+
+ /*
+ * Add the event to the list for the user-level agent. We add it to
+ * the end of the list (which should normally be an empty list since
+ * the user-level agent is designed to service events as quickly as
+ * it can).
+ */
+ evntp = kmem_zalloc(sizeof (cgrp_evnt_t), KM_SLEEP);
+ evntp->cg_evnt_path = argstr;
+
+ mutex_enter(&cgm->cg_events);
+ if (cgm->cg_evnt_cnt >= MAX_AGENT_EVENTS) {
+ /*
+ * We don't queue up an arbitrary number of events. Because
+ * the user-level manager should be servicing events quickly,
+ * if the list gets long then something is wrong.
+ */
+ cmn_err(CE_WARN, "cgrp: event queue full for zone %s",
+ ttoproc(curthread)->p_zone->zone_name);
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+
+ } else {
+ list_insert_tail(&cgm->cg_evnt_list, evntp);
+ cgm->cg_evnt_cnt++;
+ cv_signal(&cgm->cg_evnt_cv);
+ }
+ mutex_exit(&cgm->cg_events);
+}
+
+/*ARGSUSED*/
+static void
+cgrp_proc_fork_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid)
+{
+}
+
+/*ARGSUSED*/
+static void
+cgrp_proc_exit_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid)
+{
+ if (curproc->p_zone->zone_proc_initpid == pid ||
+ curproc->p_zone->zone_proc_initpid == -1) {
+ /*
+ * The zone's init just exited. If this is because of a zone
+ * reboot initiated from outside the zone, then we've never
+ * tried to unmount this fs, so we need to wakeup the
+ * user-level manager so that it can exit. It's also possible
+ * init died abnormally, but that leads to a zone reboot so the
+ * action is the same here.
+ */
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+
+ mutex_enter(&cgm->cg_events);
+ cv_signal(&cgm->cg_evnt_cv);
+ mutex_exit(&cgm->cg_events);
+ }
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ cn->cgn_task_cnt++;
+ mutex_exit(&cgm->cg_contents);
+
+ DTRACE_PROBE1(cgrp__lwp__fork, void *, cn);
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ VERIFY(cn->cgn_task_cnt > 0);
+ cn->cgn_task_cnt--;
+ DTRACE_PROBE1(cgrp__lwp__exit, void *, cn);
+
+ if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) &&
+ cn->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, cn);
+ }
+
+ mutex_exit(&cgm->cg_contents);
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
index f7eceb4e94..24640631f5 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
@@ -153,19 +153,69 @@ cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ)
}
/*
+ * Move a thread from one cgroup to another. If the old cgroup is empty
+ * we queue up an agent event. We return true in that case since we've
+ * dropped the locks and the caller needs to reacquire them.
+ */
+static boolean_t
+cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
+ uint_t cg_id, proc_t *p)
+{
+ cgrp_node_t *ocn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
+ VERIFY(ocn != NULL);
+
+ ASSERT(ocn->cgn_task_cnt > 0);
+ atomic_dec_32(&ocn->cgn_task_cnt);
+ atomic_inc_32(&ncn->cgn_task_cnt);
+ plwpd->br_cgroupid = cg_id;
+
+ if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
+ ocn->cgn_notify == 1) {
+ /*
+ * We want to drop p_lock before queuing the event since
+ * that might sleep. Dropping p_lock might cause the caller to
+ * have to restart the move process from the beginning.
+ */
+ mutex_exit(&p->p_lock);
+ cgrp_rel_agent_event(cgm, ocn);
+ mutex_exit(&cgm->cg_contents);
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* Assign either all of the threads, or a single thread, for the specified pid
* to the new cgroup. Controlled by the typ argument.
*/
static int
-cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
+cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
{
proc_t *p;
kthread_t *t;
int error;
+ cgrp_node_t *ncn;
if (pid == 1)
pid = curproc->p_zone->zone_proc_initpid;
+ /*
+ * Move one or all threads to this cgroup.
+ */
+ if (typ == CG_WR_TASKS) {
+ error = ESRCH;
+ } else {
+ error = 0;
+ }
+
+restart:
mutex_enter(&pidlock);
p = cgrp_p_for_wr(pid, typ);
@@ -194,39 +244,48 @@ cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
* Ignore writes for PID which is not an lx-branded process or with
* no threads.
*/
+
mutex_enter(&p->p_lock);
- if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) {
+ mutex_exit(&pidlock);
+ if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
+ p->p_flag & SEXITING) {
mutex_exit(&p->p_lock);
- mutex_exit(&pidlock);
return (0);
}
- /*
- * Move one or all threads to this cgroup.
- */
- if (typ == CG_WR_TASKS) {
- error = ESRCH;
- } else {
- error = 0;
- }
+ mutex_enter(&cgm->cg_contents);
+
+ ncn = cgrp_cg_hash_lookup(cgm, cg_id);
+ VERIFY(ncn != NULL);
do {
lx_lwp_data_t *plwpd = ttolxlwp(t);
- if (plwpd != NULL) {
+ if (plwpd != NULL && plwpd->br_cgroupid != cg_id) {
if (typ == CG_WR_PROCS) {
- plwpd->br_cgroupid = cg_id;
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ /*
+ * We dropped all of the locks so we
+ * need to start over.
+ */
+ goto restart;
+ }
+
} else if (plwpd->br_pid == pid) {
/* type is CG_WR_TASKS and we found the task */
- plwpd->br_cgroupid = cg_id;
error = 0;
- break;
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ goto done;
+ } else {
+ break;
+ }
}
}
t = t->t_forw;
} while (t != p->p_tlist);
+ mutex_exit(&cgm->cg_contents);
mutex_exit(&p->p_lock);
- mutex_exit(&pidlock);
+done:
return (error);
}
@@ -273,7 +332,56 @@ cgrp_get_pid_str(struct uio *uio, pid_t *pid)
}
static int
-cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ)
+cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio)
+{
+ int error;
+ uint_t value;
+
+ /*
+ * This is cheesy but since we only take a 0 or 1 value we can
+ * let the pid_str function do the uio string conversion.
+ */
+ error = cgrp_get_pid_str(uio, (pid_t *)&value);
+ if (error != 0)
+ return (error);
+
+ if (value != 0 && value != 1)
+ return (EINVAL);
+
+ /*
+ * The flag is on the containing dir. We don't bother taking the
+ * cg_contents lock since this is a simple assignment.
+ */
+ cn->cgn_parent->cgn_notify = value;
+ return (0);
+}
+
+static int
+cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int error;
+ int len;
+ char *wrp;
+
+ len = uio->uio_offset + uio->uio_resid;
+ if (len > MAXPATHLEN)
+ return (EFBIG);
+
+ mutex_enter(&cgm->cg_contents);
+
+ wrp = &cgm->cg_agent[uio->uio_offset];
+ error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio);
+ cgm->cg_agent[len] = '\0';
+ if (len > 1 && cgm->cg_agent[len - 1] == '\n')
+ cgm->cg_agent[len - 1] = '\0';
+
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+}
+
+static int
+cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio,
+ cgrp_wr_type_t typ)
{
/* the cgroup ID is on the containing dir */
uint_t cg_id = cn->cgn_parent->cgn_id;
@@ -285,7 +393,7 @@ cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ)
if (error != 0)
return (error);
- error = cgrp_proc_set_id(cg_id, pidnum, typ);
+ error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ);
if (error != 0)
return (error);
}
@@ -304,9 +412,6 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr,
vp = CGNTOV(cn);
ASSERT(vp->v_type == VREG);
- ASSERT(RW_WRITE_HELD(&cn->cgn_contents));
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
-
if (uio->uio_loffset < 0)
return (EINVAL);
@@ -323,11 +428,17 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr,
limit = MAXOFF_T;
switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_wr_notify(cn, uio);
+ break;
case CG_PROCS:
- error = cgrp_wr_proc_or_task(cn, uio, CG_WR_PROCS);
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS);
+ break;
+ case CG_REL_AGENT:
+ error = cgrp_wr_rel_agent(cgm, uio);
break;
case CG_TASKS:
- error = cgrp_wr_proc_or_task(cn, uio, CG_WR_TASKS);
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS);
break;
default:
VERIFY(0);
@@ -351,6 +462,12 @@ cgrp_p_lock(proc_t *p)
/* first try the fast path */
mutex_enter(&p->p_lock);
+ if (p->p_flag & SEXITING) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ return (NULL);
+ }
+
if (!(p->p_proc_flag & P_PR_LOCK)) {
p->p_proc_flag |= P_PR_LOCK;
mutex_exit(&p->p_lock);
@@ -404,13 +521,76 @@ cgrp_p_unlock(proc_t *p)
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(!MUTEX_HELD(&pidlock));
- cv_signal(&pr_pid_cv[p->p_slot]);
p->p_proc_flag &= ~P_PR_LOCK;
+ cv_signal(&pr_pid_cv[p->p_slot]);
mutex_exit(&p->p_lock);
THREAD_KPRI_RELEASE();
}
/*
+ * Read value from the notify_on_release pseudo file on the parent node
+ * (which is the actual cgroup node). We don't bother taking the cg_contents
+ * lock since it's a single instruction so an empty group action/read will
+ * only see one value or the other.
+ */
+/* ARGSUSED */
+static int
+cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char buf[16];
+ char *rdp;
+ /* the flag is on the containing dir */
+ uint_t value = cn->cgn_parent->cgn_notify;
+
+ len = snprintf(buf, sizeof (buf), "%u\n", value);
+ if (uio->uio_offset > len)
+ return (0);
+
+ len -= uio->uio_offset;
+ rdp = &buf[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ return (error);
+}
+
+/*
+ * Read value from the release_agent pseudo file.
+ */
+static int
+cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char *rdp;
+
+ mutex_enter(&cgm->cg_contents);
+
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len = strlen(cgm->cg_agent);
+ if (uio->uio_offset > len) {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len -= uio->uio_offset;
+ rdp = &cgm->cg_agent[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+
+ mutex_exit(&cgm->cg_contents);
+
+ return (error);
+}
+
+/*
* Read pids from the cgroup.procs pseudo file. We have to look at all of the
* processes to find applicable ones, then report pids for any process which
* has all of its threads in the same cgroup.
@@ -470,6 +650,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
* Check if all threads are in this cgroup.
*/
in_cg = B_TRUE;
+ mutex_enter(&cgm->cg_contents);
do {
lx_lwp_data_t *plwpd = ttolxlwp(t);
if (plwpd == NULL || plwpd->br_cgroupid != cg_id) {
@@ -479,6 +660,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
t = t->t_forw;
} while (t != p->p_tlist);
+ mutex_exit(&cgm->cg_contents);
mutex_exit(&p->p_lock);
if (!in_cg) {
@@ -647,7 +829,9 @@ cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
if (p == NULL)
continue;
+ mutex_enter(&cgm->cg_contents);
error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio);
+ mutex_exit(&cgm->cg_contents);
mutex_enter(&p->p_lock);
cgrp_p_unlock(p);
@@ -664,8 +848,6 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
{
int error = 0;
- ASSERT(RW_LOCK_HELD(&cn->cgn_contents));
-
if (uio->uio_loffset >= MAXOFF_T)
return (0);
if (uio->uio_loffset < 0)
@@ -674,9 +856,15 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
return (0);
switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_rd_notify(cgm, cn, uio);
+ break;
case CG_PROCS:
error = cgrp_rd_procs(cgm, cn, uio);
break;
+ case CG_REL_AGENT:
+ error = cgrp_rd_rel_agent(cgm, uio);
+ break;
case CG_TASKS:
error = cgrp_rd_tasks(cgm, cn, uio);
break;
@@ -692,8 +880,8 @@ static int
cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
struct caller_context *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
int error;
/*
@@ -703,17 +891,8 @@ cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
return (EISDIR);
if (vp->v_type != VREG)
return (EINVAL);
- /*
- * cgrp_rwlock should have already been called from layers above
- */
- ASSERT(RW_READ_HELD(&cn->cgn_rwlock));
-
- rw_enter(&cn->cgn_contents, RW_READER);
-
error = cgrp_rd(cgm, cn, uiop, ct);
- rw_exit(&cn->cgn_contents);
-
return (error);
}
@@ -721,8 +900,8 @@ static int
cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
struct caller_context *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
int error;
/*
@@ -731,11 +910,6 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
if (vp->v_type != VREG)
return (EINVAL);
- /* cgrp_rwlock should have already been called from layers above */
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
-
- rw_enter(&cn->cgn_contents, RW_WRITER);
-
if (ioflag & FAPPEND) {
/* In append mode start at end of file. */
uiop->uio_loffset = cn->cgn_size;
@@ -743,21 +917,146 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
error = cgrp_wr(cgm, cn, uiop, cred, ct);
- rw_exit(&cn->cgn_contents);
-
return (error);
}
+static int
+cgrp_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, cred_t *cr,
+ int *rvalp, caller_context_t *ct)
+{
+ cgrp_mnt_t *cgm = VTOCGM(vp);
+ model_t model;
+ cgrpmgr_info_t cgmi;
+ cgrp_evnt_t *evntp;
+ int res = 0;
+
+ /* We only support the cgrpmgr ioctls on the root vnode */
+ if (!(vp->v_flag & VROOT))
+ return (ENOTTY);
+
+ /* The caller must be root */
+ if (secpolicy_vnode_any_access(cr, vp, crgetuid(cr)) != 0 ||
+ crgetuid(cr) != 0)
+ return (ENOTTY);
+
+ if (cmd != CGRPFS_GETEVNT)
+ return (ENOTTY);
+
+ model = get_udatamodel();
+ if (model == DATAMODEL_NATIVE) {
+ if (copyin((void *)data, &cgmi, sizeof (cgmi)))
+ return (EFAULT);
+
+ } else {
+ cgrpmgr_info32_t cgmi32;
+
+ if (copyin((void *)data, &cgmi32, sizeof (cgmi32)))
+ return (EFAULT);
+
+ cgmi.cgmi_pid = cgmi32.cgmi_pid;
+ cgmi.cgmi_rel_agent_path =
+ (char *)(intptr_t)cgmi32.cgmi_rel_agent_path;
+ cgmi.cgmi_cgroup_path =
+ (char *)(intptr_t)cgmi32.cgmi_cgroup_path;
+ }
+
+ if (cgm->cg_mgrpid == 0) {
+ /*
+ * This is the initial call from the user-level manager,
+ * keep track of its pid.
+ */
+ cgm->cg_mgrpid = cgmi.cgmi_pid;
+ } else if (cgm->cg_mgrpid != cgmi.cgmi_pid) {
+ /*
+ * We only allow the manager which first contacted us to
+ * make this ioctl.
+ */
+ return (EINVAL);
+ }
+
+ /*
+ * If there is a pending event, service it immediately, otherwise
+ * block until an event occurs.
+ */
+retry:
+ mutex_enter(&cgm->cg_events);
+
+ if (cgm->cg_evnt_cnt < 0) {
+ /*
+ * Trying to unmount, tell the manager to quit.
+ */
+ mutex_exit(&cgm->cg_events);
+ return (EIO);
+ }
+
+ if (cgm->cg_evnt_cnt == 0) {
+ cv_wait_sig(&cgm->cg_evnt_cv, &cgm->cg_events);
+
+ if (cgm->cg_evnt_cnt <= 0) {
+ /*
+ * We were woken up but there are no events, it must
+ * be due to an unmount and it's time for the user
+ * manager to go away.
+ */
+ mutex_exit(&cgm->cg_events);
+ return (EIO);
+ }
+ }
+
+ evntp = list_remove_head(&cgm->cg_evnt_list);
+ VERIFY(evntp != NULL);
+ ASSERT(cgm->cg_evnt_cnt > 0);
+ cgm->cg_evnt_cnt--;
+
+ mutex_exit(&cgm->cg_events);
+
+ /*
+ * An event for the user-level manager should only occur if a
+ * release_agent has been set, but on the unlikely chance that the
+ * agent path was cleared after the event was enqueued, we check under
+ * the lock and go back to waiting if the path is empty.
+ */
+ mutex_enter(&cgm->cg_contents);
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+ goto retry;
+ }
+
+ if (copyout(cgm->cg_agent, (void *)cgmi.cgmi_rel_agent_path,
+ strlen(cgm->cg_agent) + 1)) {
+ mutex_exit(&cgm->cg_contents);
+ res = EFAULT;
+ goto done;
+ }
+
+ mutex_exit(&cgm->cg_contents);
+
+ if (copyout(evntp->cg_evnt_path, (void *)cgmi.cgmi_cgroup_path,
+ strlen(evntp->cg_evnt_path) + 1)) {
+ res = EFAULT;
+ }
+
+done:
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+
+ return (res);
+}
+
/* ARGSUSED2 */
static int
cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
struct vattr va;
int attrs = 1;
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
if (attrs == 0) {
cn->cgn_uid = va.va_uid;
cn->cgn_gid = va.va_gid;
@@ -778,7 +1077,7 @@ cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
vap->va_seq = cn->cgn_seq;
vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (0);
}
@@ -787,7 +1086,8 @@ static int
cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
int error = 0;
struct vattr *get;
long mask;
@@ -799,7 +1099,8 @@ cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
(vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE))
return (EINVAL);
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
get = &cn->cgn_attr;
/*
@@ -832,7 +1133,7 @@ cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
gethrestime(&cn->cgn_ctime);
out:
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -841,12 +1142,14 @@ static int
cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
int error;
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_taccess(cn, mode, cred);
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -856,7 +1159,8 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *cn = VTOCGN(dvp);
+ cgrp_mnt_t *cgm;
cgrp_node_t *ncn = NULL;
int error;
@@ -874,7 +1178,10 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
}
ASSERT(cn);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(cn, nm, &ncn, cred);
+ mutex_exit(&cgm->cg_contents);
if (error == 0) {
ASSERT(ncn);
@@ -890,17 +1197,21 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred,
int flag, caller_context_t *ct, vsecattr_t *vsecp)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
int error;
if (*nm == '\0')
return (EPERM);
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &cn, cred);
if (error == 0) { /* name found */
ASSERT(cn);
+ mutex_exit(&cgm->cg_contents);
/*
* Creating an existing file, allow it except for the following
* errors.
@@ -919,6 +1230,7 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
*vpp = CGNTOV(cn);
return (0);
}
+ mutex_exit(&cgm->cg_contents);
/*
* cgroups doesn't allow creation of additional, non-subsystem specific
@@ -932,9 +1244,10 @@ static int
cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
caller_context_t *ct, int flags)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
int error;
cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
/*
* Removal of subsystem-specific files is not allowed but we need
@@ -942,7 +1255,10 @@ cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
* file.
*/
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &cn, cred);
+ mutex_exit(&cgm->cg_contents);
if (error)
return (error);
@@ -979,11 +1295,11 @@ cgrp_rename(
cgrp_node_t *fromparent;
cgrp_node_t *toparent;
cgrp_node_t *fromcn = NULL; /* source cgrp_node */
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(odvp);
+ cgrp_mnt_t *cgm = VTOCGM(odvp);
int error, err;
- fromparent = (cgrp_node_t *)VTOCGN(odvp);
- toparent = (cgrp_node_t *)VTOCGN(ndvp);
+ fromparent = VTOCGN(odvp);
+ toparent = VTOCGN(ndvp);
if (fromparent != toparent)
return (EIO);
@@ -991,14 +1307,14 @@ cgrp_rename(
/* discourage additional use of toparent */
toparent = NULL;
- mutex_enter(&cgm->cg_renamelck);
+ mutex_enter(&cgm->cg_contents);
/*
* Look up cgrp_node of file we're supposed to rename.
*/
error = cgrp_dirlookup(fromparent, onm, &fromcn, cred);
if (error) {
- mutex_exit(&cgm->cg_renamelck);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -1030,11 +1346,9 @@ cgrp_rename(
/*
* Link source to new target
*/
- rw_enter(&fromparent->cgn_rwlock, RW_WRITER);
error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME,
fromcn, (struct vattr *)NULL,
(cgrp_node_t **)NULL, cred, ct);
- rw_exit(&fromparent->cgn_rwlock);
if (error)
goto done;
@@ -1042,9 +1356,6 @@ cgrp_rename(
/*
* Unlink from source.
*/
- rw_enter(&fromparent->cgn_rwlock, RW_WRITER);
- rw_enter(&fromcn->cgn_rwlock, RW_WRITER);
-
error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred);
/*
@@ -1054,17 +1365,14 @@ cgrp_rename(
if (error == ENOENT)
error = 0;
- rw_exit(&fromcn->cgn_rwlock);
- rw_exit(&fromparent->cgn_rwlock);
-
if (err == 0) {
vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct);
vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct);
}
done:
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(fromcn);
- mutex_exit(&cgm->cg_renamelck);
return (error);
}
@@ -1074,9 +1382,9 @@ static int
cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_node_t *self = NULL;
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dvp);
+ cgrp_mnt_t *cgm = VTOCGM(dvp);
int error;
/*
@@ -1086,25 +1394,28 @@ cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
if (parent->cgn_nlink == 0)
return (ENOENT);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &self, cred);
if (error == 0) {
ASSERT(self != NULL);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(self);
return (EEXIST);
}
- if (error != ENOENT)
+ if (error != ENOENT) {
+ mutex_exit(&cgm->cg_contents);
return (error);
+ }
- rw_enter(&parent->cgn_rwlock, RW_WRITER);
error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL,
va, &self, cred, ct);
if (error) {
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
if (self != NULL)
cgnode_rele(self);
return (error);
}
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
*vpp = CGNTOV(self);
return (0);
}
@@ -1114,7 +1425,7 @@ static int
cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
caller_context_t *ct, int flags)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_mnt_t *cgm;
cgrp_node_t *self = NULL;
struct vnode *vp;
@@ -1127,63 +1438,61 @@ cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
return (EINVAL);
if (strcmp(nm, "..") == 0)
return (EEXIST); /* Should be ENOTEMPTY */
+
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
error = cgrp_dirlookup(parent, nm, &self, cred);
- if (error)
+ if (error) {
+ mutex_exit(&cgm->cg_contents);
return (error);
-
- rw_enter(&parent->cgn_rwlock, RW_WRITER);
- rw_enter(&self->cgn_rwlock, RW_WRITER);
+ }
vp = CGNTOV(self);
if (vp == dvp || vp == cdir) {
error = EINVAL;
- goto done1;
+ goto done;
}
if (self->cgn_type != CG_CGROUP_DIR) {
error = ENOTDIR;
- goto done1;
+ goto done;
}
cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp);
- mutex_enter(&self->cgn_tlock);
- /* Check for the existence of any sub-cgroup directories */
- if (self->cgn_nlink > 2) {
- mutex_exit(&self->cgn_tlock);
+ /*
+ * Check for the existence of any sub-cgroup directories or tasks in
+ * the cgroup.
+ */
+ if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) {
error = EEXIST;
- goto done1;
+ /*
+	 * Update atime because checking cgn_dirents is logically
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->cgn_atime);
+ goto done;
}
- mutex_exit(&self->cgn_tlock);
if (vn_vfswlock(vp)) {
error = EBUSY;
- goto done1;
+ goto done;
}
if (vn_mountedvfs(vp) != NULL) {
error = EBUSY;
- goto done;
+ } else {
+ error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
}
- /*
- * Confirm directory only includes entries for ".", ".." and the
- * fixed pseudo file entries.
- */
- if (self->cgn_dirents > (cgrp_num_pseudo_ents(cgm->cg_ssid) + 2)) {
- error = EEXIST; /* should be ENOTEMPTY */
- /*
- * Update atime because checking cn_dirents is logically
- * equivalent to reading the directory
- */
- gethrestime(&self->cgn_atime);
- goto done;
+ vn_vfsunlock(vp);
+
+ if (parent->cgn_task_cnt == 0 &&
+ parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, parent);
}
- error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
done:
- vn_vfsunlock(vp);
-done1:
- rw_exit(&self->cgn_rwlock);
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
vnevent_rmdir(CGNTOV(self), dvp, nm, ct);
cgnode_rele(self);
@@ -1195,7 +1504,8 @@ static int
cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
caller_context_t *ct, int flags)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
cgrp_dirent_t *cdp;
int error = 0;
size_t namelen;
@@ -1212,10 +1522,6 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
*eofp = 1;
return (0);
}
- /*
- * assuming system call has already called cgrp_rwlock
- */
- ASSERT(RW_READ_HELD(&cn->cgn_rwlock));
if (uiop->uio_iovcnt != 1)
return (EINVAL);
@@ -1223,8 +1529,12 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
if (vp->v_type != VDIR)
return (ENOTDIR);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
if (cn->cgn_dir == NULL) {
VERIFY(cn->cgn_nlink == 0);
+ mutex_exit(&cgm->cg_contents);
return (0);
}
@@ -1284,6 +1594,9 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
uiop->uio_offset = offset;
}
gethrestime(&cn->cgn_atime);
+
+ mutex_exit(&cgm->cg_contents);
+
kmem_free(outbuf, bufsize);
return (error);
}
@@ -1301,11 +1614,10 @@ cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm,
static void
cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vp->v_vfsp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp);
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- mutex_enter(&cn->cgn_tlock);
+ mutex_enter(&cgm->cg_contents);
mutex_enter(&vp->v_lock);
ASSERT(vp->v_count >= 1);
@@ -1316,27 +1628,22 @@ cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
if (vp->v_count > 1 || cn->cgn_nlink != 0) {
vp->v_count--;
mutex_exit(&vp->v_lock);
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
return;
}
- mutex_exit(&vp->v_lock);
- mutex_exit(&cn->cgn_tlock);
- /* Here's our chance to send invalid event while we're between locks */
- vn_invalid(CGNTOV(cn));
-
- mutex_enter(&cgm->cg_contents);
if (cn->cgn_forw == NULL)
cgm->cg_rootnode->cgn_back = cn->cgn_back;
else
cn->cgn_forw->cgn_back = cn->cgn_back;
cn->cgn_back->cgn_forw = cn->cgn_forw;
+
+ mutex_exit(&vp->v_lock);
mutex_exit(&cgm->cg_contents);
- rw_exit(&cn->cgn_rwlock);
- rw_destroy(&cn->cgn_rwlock);
- mutex_destroy(&cn->cgn_tlock);
+ /* Here's our chance to send invalid event */
+ vn_invalid(CGNTOV(cn));
+
vn_free(CGNTOV(cn));
kmem_free(cn, sizeof (cgrp_node_t));
}
@@ -1349,27 +1656,17 @@ cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}
-/* ARGSUSED2 */
+/* ARGSUSED */
static int
cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
- cgrp_node_t *cn = VTOCGN(vp);
-
- if (write_lock) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- } else {
- rw_enter(&cn->cgn_rwlock, RW_READER);
- }
return (write_lock);
}
-/* ARGSUSED1 */
+/* ARGSUSED */
static void
cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
- cgrp_node_t *cn = VTOCGN(vp);
-
- rw_exit(&cn->cgn_rwlock);
}
static int
@@ -1412,6 +1709,7 @@ const fs_operation_def_t cgrp_vnodeops_template[] = {
VOPNAME_CLOSE, { .vop_close = cgrp_close },
VOPNAME_READ, { .vop_read = cgrp_read },
VOPNAME_WRITE, { .vop_write = cgrp_write },
+ VOPNAME_IOCTL, { .vop_ioctl = cgrp_ioctl },
VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr },
VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr },
VOPNAME_ACCESS, { .vop_access = cgrp_access },
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
index feccf31800..44acdff3b1 100644
--- a/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -205,6 +205,14 @@ lx_systrace_f *lx_systrace_return_ptr;
static int lx_systrace_enabled;
/*
+ * cgroup file system maintenance functions, set when the cgroups module loads.
+ */
+void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t);
+void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t);
+void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+/*
* While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
* want an MMU dependency here (and should there be a microprocessor without
* a hole, we don't want to start allocating from the top of the VA range).
@@ -312,6 +320,16 @@ lx_proc_exit(proc_t *p)
{
lx_proc_data_t *lxpd;
proc_t *cp;
+ lx_zone_data_t *lxzdata;
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread));
+ ASSERT(lx_cgrp_proc_exit != NULL);
+ (*lx_cgrp_proc_exit)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, p->p_pid);
+ }
mutex_enter(&p->p_lock);
VERIFY(lxpd = ptolxproc(p));
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
index 67565379fe..3577749b66 100644
--- a/usr/src/uts/common/brand/lx/os/lx_misc.c
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -261,6 +261,7 @@ lx_freelwp(klwp_t *lwp)
{
struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
VERIFY(MUTEX_NOT_HELD(&p->p_lock));
@@ -279,6 +280,14 @@ lx_freelwp(klwp_t *lwp)
return;
}
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_freelwp != NULL);
+ (*lx_cgrp_freelwp)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid);
+ }
+
/*
* It is possible for the lx_freelwp hook to be called without a prior
* call to lx_exitlwp being made. This happens as part of lwp
@@ -370,6 +379,7 @@ lx_initlwp(klwp_t *lwp, void *lwpbd)
lx_lwp_data_t *plwpd = ttolxlwp(curthread);
kthread_t *tp = lwptot(lwp);
proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
VERIFY(MUTEX_HELD(&p->p_lock));
VERIFY(lwp->lwp_brand == NULL);
@@ -452,6 +462,15 @@ lx_initlwp(klwp_t *lwp, void *lwpbd)
lx_ptrace_inherit_tracer(plwpd, lwpd);
lwpd->br_cgroupid = plwpd->br_cgroupid;
}
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_initlwp != NULL);
+ (*lx_cgrp_initlwp)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid);
+ }
+
}
/*
@@ -465,6 +484,7 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
{
struct lx_lwp_data *src = srclwp->lwp_brand;
struct lx_lwp_data *dst = dstlwp->lwp_brand;
+ lx_zone_data_t *lxzdata;
dst->br_ppid = src->br_pid;
dst->br_ptid = lwptot(srclwp)->t_tid;
@@ -496,6 +516,15 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
*/
dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
dst->br_scall_args = NULL;
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(srclwp->lwp_procp->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_forklwp != NULL);
+ (*lx_cgrp_forklwp)(lxzdata->lxzd_cgroup,
+ dst->br_cgroupid, lwptoproc(dstlwp)->p_pid);
+ }
+
}
/*
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
index e6288fac57..895ea44db5 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_brand.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -35,6 +35,7 @@
#include <sys/cpuvar.h>
#include <sys/zone.h>
#include <sys/ksocket.h>
+#include <sys/vfs.h>
#endif
#ifdef __cplusplus
@@ -383,6 +384,14 @@ typedef enum lx_proc_flags {
#ifdef _KERNEL
+/*
+ * Entry points for cgroup integration.
+ */
+extern void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t);
+extern void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t);
+extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
#define LX_RLFAKE_LOCKS 0
#define LX_RLFAKE_NICE 1
#define LX_RLFAKE_RTPRIO 2
@@ -632,11 +641,18 @@ struct lx_lwp_data {
*/
#define LX_BR_ARGS_SIZE_MAX (1024)
-/* brand specific data */
+/*
+ * brand specific data
+ *
+ * We currently support only a single cgroup mount in an lx zone, so we have
+ * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever
+ * enhanced to support different mounts with different subsystem controllers.
+ */
typedef struct lx_zone_data {
char lxzd_kernel_version[LX_VERS_MAX];
ksocket_t lxzd_ioctl_sock;
char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */
+ vfs_t *lxzd_cgroup; /* cgroup for this zone */
} lx_zone_data_t;
#define BR_CPU_BOUND 0x0001