summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2015-07-24 13:34:15 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2015-07-24 13:34:15 +0000
commit431ca10ae7ca970d65f15fe0a1115ee749a97433 (patch)
treec91d28297006e34628e27a39dd6084952d077c0f
parent7be989b3b3d0affc5705ea8b81d4b84ec65d8246 (diff)
downloadillumos-joyent-431ca10ae7ca970d65f15fe0a1115ee749a97433.tar.gz
OS-4495 support cgroups notify_on_release and release_agent
-rw-r--r--manifest3
-rw-r--r--usr/src/lib/brand/lx/Makefile2
-rw-r--r--usr/src/lib/brand/lx/cgrpmgr/Makefile56
-rw-r--r--usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c157
-rw-r--r--usr/src/lib/brand/lx/lx_brand/common/mount.c51
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps.h110
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_node.c299
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c361
-rw-r--r--usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c582
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_brand.c18
-rw-r--r--usr/src/uts/common/brand/lx/os/lx_misc.c29
-rw-r--r--usr/src/uts/common/brand/lx/sys/lx_brand.h18
12 files changed, 1405 insertions, 281 deletions
diff --git a/manifest b/manifest
index 0cdccb830d..68f3c742e0 100644
--- a/manifest
+++ b/manifest
@@ -5045,11 +5045,12 @@ s usr/lib/brand/lx/64=amd64
d usr/lib/brand/lx/amd64 0755 root bin
f usr/lib/brand/lx/amd64/lx_librtld_db.so.1 0755 root root
f usr/lib/brand/lx/amd64/lx_vdso.so.1 0755 root root
+f usr/lib/brand/lx/cgrpmgr 0755 root root
+f usr/lib/brand/lx/etc_default_nfs 0444 root root
d usr/lib/brand/lx/ld 0755 root root
f usr/lib/brand/lx/ld/ld.config 0755 root root
d usr/lib/brand/lx/ld/64 0755 root root
f usr/lib/brand/lx/ld/64/ld.config 0755 root root
-f usr/lib/brand/lx/etc_default_nfs 0444 root root
f usr/lib/brand/lx/ltp_skiplist 0444 root root
f usr/lib/brand/lx/ltp_tests 0444 root root
f usr/lib/brand/lx/lx_boot 0755 root root
diff --git a/usr/src/lib/brand/lx/Makefile b/usr/src/lib/brand/lx/Makefile
index 2c5a373e25..67f2926305 100644
--- a/usr/src/lib/brand/lx/Makefile
+++ b/usr/src/lib/brand/lx/Makefile
@@ -33,7 +33,7 @@ include Makefile.lx
.PARALLEL:
SUBDIRS= cmd librtld_db lx_support lx_init lx_brand netfiles \
- zone lx_vdso testing .WAIT
+ zone lx_vdso cgrpmgr testing .WAIT
MSGSUBDIRS= lx_brand lx_support zone
all := TARGET= all
diff --git a/usr/src/lib/brand/lx/cgrpmgr/Makefile b/usr/src/lib/brand/lx/cgrpmgr/Makefile
new file mode 100644
index 0000000000..26aa079d63
--- /dev/null
+++ b/usr/src/lib/brand/lx/cgrpmgr/Makefile
@@ -0,0 +1,56 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+PROG = cgrpmgr
+
+PROG_OBJS = cgrpmgr.o
+
+OBJS = $(PROG_OBJS)
+SRCS = $(PROG_OBJS:%.o=%.c)
+
+all: $(PROG)
+
+include ../Makefile.lx
+include $(SRC)/cmd/Makefile.cmd
+include $(SRC)/cmd/Makefile.ctf
+
+# override the install directory
+ROOTBIN = $(ROOTBRANDDIR)
+CLOBBERFILES = $(OBJS) $(ROOTPROG)
+
+UTSBASE = $(SRC)/uts
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS += -D_REENTRANT -I$(UTSBASE)/common/brand/lx/cgroups
+LDLIBS +=
+
+.KEEP_STATE:
+
+install: all $(ROOTPROG)
+
+clean:
+ $(RM) $(PROG) $(OBJS)
+
+lint: lint_PROG lint_SRCS
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+%.o: %.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+include $(SRC)/cmd/Makefile.targ
diff --git a/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c
new file mode 100644
index 0000000000..cbbe56e747
--- /dev/null
+++ b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c
@@ -0,0 +1,157 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * The cgrpmgr is a user-level daemon process associated with a specific cgroup
+ * fs mount. It's only job is to run the release_agent when a cgroup becomes
+ * empty and notify_on_release is enabled.
+ */
+
+#include <stdarg.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <cgrps.h>
+
+static void
+run_agent(char *agent, char *arg)
+{
+ char *argv[3];
+ char *cmdp;
+
+ /*
+ * The parent does nothing.
+ */
+ if (fork() != 0)
+ return;
+
+ /*
+ * Child - run the agent.
+ */
+ (void) setsid();
+
+ cmdp = strrchr(agent, '/');
+ if (cmdp == NULL) {
+ cmdp = agent;
+ } else {
+ cmdp++;
+ }
+
+ argv[0] = cmdp;
+ argv[1] = arg;
+ argv[2] = NULL;
+
+ execv(agent, argv);
+ /* Nothing can be done if the exec fails */
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ int res;
+ sigset_t set, oset;
+ struct statvfs sb;
+ char rel_agent[MAXPATHLEN];
+ char cgrp_path[MAXPATHLEN];
+ cgrpmgr_info_t cgmi;
+
+ /*
+ * Start by daemonizing ourself.
+ */
+
+ /* Close all open fd's */
+ closefrom(0);
+
+ clearenv();
+
+ /*
+ * Block all signals except SIGCHLD since we don't want this code to
+ * respond to any signal (except, of course, the ones we can't block).
+ * By setting the SIGCHLD disposition to ignore our children will
+ * automatically be reaped.
+ */
+ (void) sigfillset(&set);
+ (void) sigdelset(&set, SIGCHLD);
+ (void) sigdelset(&set, SIGABRT);
+ (void) sigprocmask(SIG_BLOCK, &set, &oset);
+ (void) signal(SIGCHLD, SIG_IGN);
+
+ switch (fork1()) {
+ case -1: /* uh-oh */
+ exit(1);
+
+ case 0: /* child */
+ break;
+
+ default: /* parent */
+ exit(0);
+ }
+
+ (void) setsid();
+ (void) umask(0077);
+ (void) chdir("/");
+
+ if ((fd = open(argv[1], O_RDONLY)) < 0)
+ exit(1);
+
+ /*
+ * Sanity check the mount point we got.
+ */
+ if (fstatvfs(fd, &sb) < 0 || strcmp(sb.f_basetype, "lx_cgroup") != 0)
+ exit(1);
+
+ cgmi.cgmi_pid = getpid();
+ cgmi.cgmi_rel_agent_path = rel_agent;
+ cgmi.cgmi_cgroup_path = cgrp_path;
+
+ /*
+ * Now wait for and run the release agent each time we return from the
+ * ioctl. An error return indicates the fs has been unmounted and we
+ * should exit.
+ */
+ for (;;) {
+ /*
+ * Block in the kernel until a cgroup becomes empty.
+ */
+ res = ioctl(fd, CGRPFS_GETEVNT, &cgmi);
+
+ /*
+ * EIO indicates we should quit but any other error implies
+ * we did something wrong (which means a bug), so simply
+ * terminate on any error.
+ */
+ if (res != 0) {
+ if (errno == EIO)
+ exit(0);
+ abort();
+ }
+
+ run_agent(rel_agent, cgrp_path);
+ }
+
+ return (0);
+}
diff --git a/usr/src/lib/brand/lx/lx_brand/common/mount.c b/usr/src/lib/brand/lx/lx_brand/common/mount.c
index aca92ed587..406c960dc1 100644
--- a/usr/src/lib/brand/lx/lx_brand/common/mount.c
+++ b/usr/src/lib/brand/lx/lx_brand/common/mount.c
@@ -40,6 +40,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
+#include <stdlib.h>
#include <sys/lx_autofs.h>
#include <sys/lx_debug.h>
@@ -600,6 +601,32 @@ i_make_nfs_args(lx_nfs_mount_data_t *lx_nmd, struct nfs_args *nfs_args,
return (0);
}
+static int
+run_cgrp_mgr(char *mntpnt)
+{
+ const char *cmd = "/native/usr/lib/brand/lx/cgrpmgr";
+ char *argv[] = { "cgrpmgr", NULL, NULL };
+
+ argv[1] = mntpnt;
+
+ switch (fork1()) {
+ case 0:
+ /* child */
+ execv(cmd, argv);
+ exit(1);
+ break;
+
+ case -1:
+ return (-1);
+
+ default:
+ /* the cgroup manager process runs until we unmount */
+ break;
+ }
+
+ return (0);
+}
+
long
lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
uintptr_t p5)
@@ -616,6 +643,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
char target[MAXPATHLEN];
char fstype[MAXPATHLEN], options[MAX_MNTOPT_STR];
int sflags, rv;
+ long res;
+ boolean_t is_cgrp = B_FALSE;
/* Variables needed for nfs mounts. */
lx_nfs_mount_data_t lx_nmd;
@@ -752,6 +781,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
}
lx_debug("\tlinux mount options: \"%s\"", options);
+ is_cgrp = B_TRUE;
+
/*
* Currently don't verify Linux mount options since we can
 * have a subsystem string provided.
@@ -885,8 +916,24 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4,
lx_debug("\tsolaris mount fstype: %s", fstype);
lx_debug("\tsolaris mount options: \"%s\"", options);
- return (mount(source, target, sflags, fstype, sdataptr, sdatalen,
- options, sizeof (options)) ? -errno : 0);
+ res = mount(source, target, sflags, fstype, sdataptr, sdatalen,
+ options, sizeof (options));
+
+ if (res == 0) {
+ if (is_cgrp && run_cgrp_mgr(target) != 0) {
+ /*
+ * Forking the cgrp manager failed, unmount and return
+ * an ENOMEM error as the best approximation that we're
+ * out of resources.
+ */
+ (void) umount(target);
+ return (-ENOMEM);
+ } else {
+ return (0);
+ }
+ } else {
+ return (-errno);
+ }
}
/*
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
index f0fab9f904..cfbeb2796c 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps.h
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h
@@ -46,11 +46,36 @@ extern "C" {
#include <sys/atomic.h>
#include <vm/anon.h>
+/*
+ * cgrpmgr ioctl interface.
+ */
+#define CGRPFS_IOC ('C' << 16 | 'G' << 8)
+#define CGRPFS_GETEVNT (CGRPFS_IOC | 1)
+
+typedef struct cgrpmgr_info {
+ pid_t cgmi_pid;
+ char *cgmi_rel_agent_path;
+ char *cgmi_cgroup_path;
+} cgrpmgr_info_t;
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+typedef struct cgrpmgr_info32 {
+ pid_t cgmi_pid;
+ caddr32_t cgmi_rel_agent_path;
+ caddr32_t cgmi_cgroup_path;
+} cgrpmgr_info32_t;
+
+typedef struct cgrp_evnt {
+ list_node_t cg_evnt_lst;
+ char *cg_evnt_path;
+} cgrp_evnt_t;
+
#define CG_PSNSIZE 256 /* max size of pseudo file name entries */
#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */
-#define CG_START_ID 0 /* initial node ID for allocation */
-
/*
* The order of these entries must be in sync with the cg_ssde_dir array.
*/
@@ -61,8 +86,10 @@ typedef enum cgrp_ssid {
typedef enum cgrp_nodetype {
CG_CGROUP_DIR = 1, /* cgroup directory entry */
- CG_PROCS,
- CG_TASKS,
+ CG_NOTIFY, /* notify_on_release file */
+ CG_PROCS, /* cgroup.procs file */
+ CG_REL_AGENT, /* release_agent file */
+ CG_TASKS, /* tasks file */
} cgrp_nodetype_t;
typedef struct cgrp_subsys_dirent {
@@ -70,10 +97,19 @@ typedef struct cgrp_subsys_dirent {
char *cgrp_ssd_name;
} cgrp_subsys_dirent_t;
+#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2)
+
+/*
+ * A modern systemd-based Linux system typically has 50-60 cgroups so
+ * we size the hash for 2x that number.
+ */
+#define CGRP_HASH_SZ 128
+
/*
* cgroups per-mount data structure.
*
- * All fields are protected by cg_contents.
+ * All but the event related fields are protected by cg_contents.
+ * The evnt_list and counter are protected by cg_events.
*/
typedef struct cgrp_mnt {
struct vfs *cg_vfsp; /* filesystem's vfs struct */
@@ -82,45 +118,45 @@ typedef struct cgrp_mnt {
cgrp_ssid_t cg_ssid; /* subsystem type */
dev_t cg_dev; /* unique dev # of mounted `device' */
uint_t cg_gen; /* node ID source for files */
- kmutex_t cg_contents; /* lock for cgrp_mnt structure */
- kmutex_t cg_renamelck; /* rename lock for this mount */
+ uint_t cg_grp_gen; /* ID source for cgroups */
+ kmutex_t cg_contents; /* global lock for most fs activity */
+ char cg_agent[MAXPATHLEN + 1]; /* release_agent path */
+ pid_t cg_mgrpid; /* pid of user-level manager */
+ kmutex_t cg_events; /* lock for event list */
+ kcondvar_t cg_evnt_cv; /* condvar for event list wakeup */
+ int cg_evnt_cnt; /* counter for num events in list */
+ list_t cg_evnt_list; /* list of agent events */
+ /* ptr to zone data for containing zone */
+ lx_zone_data_t *cg_lxzdata;
+ struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */
} cgrp_mnt_t;
/*
* cgrp_node is the file system dependent node for cgroups.
*
- * cgn_rwlock protects access of the directory list at cgn_dir
- * as well as syncronizing read and writes to the cgrp_node
- *
- * cgn_contents protects growing, shrinking, reading and writing
- * the file along with cgn_rwlock (see below).
+ * The node is used to represent both directories (a cgroup) and pseudo files
+ * within the directory.
*
- * cgn_tlock protects updates to cgn_mode and cgn_nlink
- *
- * cg_contents in the cgrp_mount data structure protects
- * cgn_forw and cgn_back which are used to maintain a linked
- * list of all cgroup files associated with that file system
- *
- * The ordering of the locking is:
- * cg_rwlock -> cgn_contents
- *
- * cgn_tlock doesn't require any cgrp_node locks
+ * Members are tagged in the comment to note which type of node they apply to:
+ * A - all
+ * D - dir (i.e. a cgroup)
+ * F - pseudo file
*/
typedef struct cgrp_node {
- struct cgrp_node *cgn_back; /* lnked lst of cgrp_nodes */
- struct cgrp_node *cgn_forw; /* lnked lst of cgrp_nodes */
- struct cgrp_dirent *cgn_dir; /* dirent list */
- struct cgrp_node *cgn_parent; /* dir containing this node */
- uint_t cgn_dirents; /* number of dirents */
- cgrp_nodetype_t cgn_type; /* type for this node */
- struct vnode *cgn_vnode; /* vnode for this cgrp_node */
- int cgn_id; /* ID number for the cgroup */
- struct vattr cgn_attr; /* attributes */
- krwlock_t cgn_contents; /* serialize mods */
- krwlock_t cgn_rwlock; /* rw - serialize */
- /* mods and dir updates */
- kmutex_t cgn_tlock; /* time, flag, and nlink lock */
+ struct cgrp_node *cgn_back; /* A lnked lst of cgrp_nodes */
+ struct cgrp_node *cgn_forw; /* A lnked lst of cgrp_nodes */
+ struct cgrp_dirent *cgn_dir; /* D dirent list */
+ struct cgrp_node *cgn_parent; /* A dir containing this node */
+ struct cgrp_node *cgn_next; /* D link in per-mount cgroup */
+ /* hash table */
+ uint_t cgn_dirents; /* D number of dirents */
+ cgrp_nodetype_t cgn_type; /* A type for this node */
+ uint_t cgn_notify; /* D notify_on_release value */
+ uint_t cgn_task_cnt; /* D number of threads in grp */
+ struct vnode *cgn_vnode; /* A vnode for this cgrp_node */
+ uint_t cgn_id; /* D ID number for the cgroup */
+ struct vattr cgn_attr; /* A attributes */
} cgrp_node_t;
/*
@@ -184,6 +220,10 @@ void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *);
int cgrp_taccess(void *, int, cred_t *);
ino_t cgrp_inode(cgrp_nodetype_t, unsigned int);
int cgrp_num_pseudo_ents(cgrp_ssid_t);
+cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t);
+void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *);
+
+#endif /* _KERNEL */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
index 0d153f73c1..8950be1966 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c
@@ -36,6 +36,7 @@ static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *, enum de_op);
static cgrp_subsys_dirent_t cgrp_generic_dir[] = {
{ CG_PROCS, "cgroup.procs" },
+ { CG_NOTIFY, "notify_on_release" },
{ CG_TASKS, "tasks" }
};
@@ -165,6 +166,132 @@ cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold,
}
/*
+ * The following functions maintain the per-mount cgroup hash table.
+ */
+static void
+cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ cn->cgn_next = cgm->cg_grp_hash[hsh];
+ cgm->cg_grp_hash[hsh] = cn;
+}
+
+static void
+cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ uint_t cgid;
+ int hsh;
+ cgrp_node_t *np = NULL, *curp, *prevp = NULL;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgid = cn->cgn_id;
+ hsh = cgid % CGRP_HASH_SZ;
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ if (prevp == NULL) {
+ cgm->cg_grp_hash[hsh] = curp->cgn_next;
+ } else {
+ prevp->cgn_next = curp->cgn_next;
+ }
+ np = curp;
+ np->cgn_next = NULL;
+ break;
+ }
+
+ prevp = curp;
+ }
+
+ ASSERT(np != NULL);
+ ASSERT(np->cgn_task_cnt == 0);
+}
+
+/*
+ * Count up the number of threads already running in the zone and initialize the
+ * first cgroup's task counter.
+ *
+ * We have to look at all of the processes to find applicable ones.
+ */
+static void
+cgrp_cg_hash_init(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ int i;
+ int cnt = 0;
+ zoneid_t zoneid = curproc->p_zone->zone_id;
+ pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ /* Scan all of the process entries */
+ mutex_enter(&pidlock);
+ for (i = 1; i < v.v_proc; i++) {
+ proc_t *p;
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, system processes,
+ * a PID of 0, the pid for our zsched process, anything the
+ * security policy doesn't allow us to look at, processes that
+ * are not lx-branded, and processes that are not in the zone.
+ */
+ if ((p = pid_entry(i)) == NULL ||
+ p->p_stat == SIDL ||
+ (p->p_flag & SSYS) != 0 ||
+ p->p_pid == 0 ||
+ p->p_pid == schedpid ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+ p->p_zone->zone_id != zoneid) {
+ continue;
+ }
+
+ mutex_enter(&p->p_lock);
+ if (p->p_brand != &lx_brand) {
+ mutex_exit(&p->p_lock);
+ continue;
+ }
+ cnt += p->p_lwpcnt;
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * There should be at least the init process with 1 thread in the zone
+ */
+ ASSERT(cnt > 0);
+ cn->cgn_task_cnt = cnt;
+
+ DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt);
+
+ mutex_exit(&pidlock);
+}
+
+cgrp_node_t *
+cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid)
+{
+ int hsh = cgid % CGRP_HASH_SZ;
+ cgrp_node_t *curp;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ for (curp = cgm->cg_grp_hash[hsh]; curp != NULL;
+ curp = curp->cgn_next) {
+ if (curp->cgn_id == cgid) {
+ return (curp);
+ }
+ }
+
+ return (NULL);
+}
+
+/*
* Calculate an inode number
*
* This takes various bits of info and munges them to give the inode number for
@@ -217,9 +344,6 @@ cgrp_taccess(void *vcp, int mode, cred_t *cred)
/*
* Search directory 'parent' for entry 'name'.
*
- * The calling thread can't hold the write version
- * of the rwlock for the directory being searched
- *
* 0 is returned on success and *foundcp points
* to the found cgrp_node with its vnode held.
*/
@@ -227,8 +351,10 @@ int
cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp,
cred_t *cred)
{
+ cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode);
int error;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
*foundcp = NULL;
if (parent->cgn_type != CG_CGROUP_DIR)
return (ENOTDIR);
@@ -280,10 +406,7 @@ cgrp_direnter(
int error = 0;
char *s;
- /*
- * cgn_rwlock is held to serialize direnter and dirdeletes
- */
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
/*
@@ -302,23 +425,15 @@ cgrp_direnter(
* Remember that we can only rename within the same directory.
*/
if (op == DE_RENAME) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- mutex_enter(&cn->cgn_tlock);
if (cn->cgn_nlink == 0) {
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
return (ENOENT);
}
if (cn->cgn_nlink == MAXLINK) {
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
return (EMLINK);
}
cn->cgn_nlink++;
gethrestime(&cn->cgn_ctime);
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
}
/*
@@ -342,7 +457,9 @@ cgrp_direnter(
if (cdp) {
ASSERT(found != NULL);
error = EEXIST;
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(found);
+ mutex_enter(&cgm->cg_contents);
} else {
/*
@@ -358,6 +475,13 @@ cgrp_direnter(
error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred);
if (error)
goto out;
+
+ if (op == DE_MKDIR) {
+ /*
+ * inherit notify_on_release value from parent
+ */
+ cn->cgn_notify = dir->cgn_notify;
+ }
}
error = cgrp_diraddentry(dir, cn, name, op);
@@ -366,7 +490,6 @@ cgrp_direnter(
/*
* Unmake the inode we just made.
*/
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
if ((cn->cgn_type) == CG_CGROUP_DIR) {
ASSERT(cdp == NULL);
/*
@@ -374,25 +497,26 @@ cgrp_direnter(
*/
cgrp_dirtrunc(cn);
}
- mutex_enter(&cn->cgn_tlock);
cn->cgn_nlink = 0;
- mutex_exit(&cn->cgn_tlock);
gethrestime(&cn->cgn_ctime);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
cn = NULL;
}
} else if (cnp) {
*cnp = cn;
} else if (op == DE_CREATE || op == DE_MKDIR) {
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(cn);
+ mutex_enter(&cgm->cg_contents);
}
}
out:
if (error && op == DE_RENAME) {
/* Undo bumped link count. */
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
gethrestime(&cn->cgn_ctime);
}
return (error);
@@ -410,17 +534,17 @@ int
cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
cred_t *cred)
{
+ cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode);
cgrp_dirent_t *cndp;
int error;
size_t namelen;
cgrp_node_t *cnnp;
timestruc_t now;
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
if (nm[0] == '\0')
- panic("cgrp_dirdelete: NULL name for 0x%p", (void *)cn);
+ panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn);
/*
* return error when removing . and ..
@@ -465,32 +589,21 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
nextp = cdp->cgd_next;
cgnode_hold(pseudo_node);
- rw_enter(&pseudo_node->cgn_rwlock, RW_WRITER);
error = cgrp_dirdelete(cn, pseudo_node,
cdp->cgd_name, DR_REMOVE, cred);
- rw_exit(&pseudo_node->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(pseudo_node);
+ mutex_enter(&cgm->cg_contents);
cdp = nextp;
}
- }
- cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
- if (cndp == NULL) {
- /*
- * If it is gone, some other thread got here first!
- * Return error ENOENT.
- */
- return (ENOENT);
+ cgrp_cg_hash_remove(cgm, cn);
}
- /*
- * If the cgrp_node in the cgrp_dirent changed, we were probably
- * the victim of a concurrent rename operation. The original
- * is gone, so return that status.
- */
- if (cn != cnnp)
- return (ENOENT);
+ cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp);
+ VERIFY(cndp != NULL);
+ VERIFY(cn == cnnp);
cgrp_hash_out(cndp);
@@ -527,7 +640,7 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op,
cn->cgn_ctime = now;
ASSERT(cn->cgn_nlink > 0);
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) {
cgrp_dirtrunc(cn);
ASSERT(cn->cgn_nlink == 0);
@@ -544,10 +657,9 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
struct vnode *vp;
timestruc_t now;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(vap != NULL);
- rw_init(&cn->cgn_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&cn->cgn_tlock, NULL, MUTEX_DEFAULT, NULL);
cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode);
cn->cgn_mask = 0;
cn->cgn_attr.va_type = vap->va_type;
@@ -581,15 +693,7 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
vp->v_rdev = vap->va_rdev;
vp->v_data = (caddr_t)cn;
- mutex_enter(&cgm->cg_contents);
-
- /*
- * Set the cgroup ID for this cgrp_node by using a counter on each
- * mount. We also use this value as the directory nodeid (which is used
- * to derive the inode) so each cgroup in the tree will have a unique
- * id (and inode).
- */
- cn->cgn_nodeid = cn->cgn_id = cgm->cg_gen++;
+ cn->cgn_nodeid = cgm->cg_gen++;
/*
* Add new cgrp_node to end of linked list of cgrp_nodes for this
@@ -600,10 +704,38 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred)
cn->cgn_back = cgm->cg_rootnode->cgn_back;
cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn;
}
- mutex_exit(&cgm->cg_contents);
vn_exists(vp);
}
+void
+cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name,
+ cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr)
+{
+ cgrp_node_t *ncn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, nattr,
+ &ncn, cr, NULL);
+
+ /*
+ * Fix the inode and assign the pseudo file type to be correct.
+ */
+ ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid);
+ ncn->cgn_type = type;
+
+ /*
+ * Since we're creating these entries here and not via the
+ * normal VOP_CREATE code path, we need to do the rele to drop
+ * our hold. This will leave the vnode v_count at 0 when we
+ * come out of cgrp_inactive but we won't reclaim the vnode
+ * there since the cgn_nlink value will still be 1.
+ */
+ mutex_exit(&cgm->cg_contents);
+ cgnode_rele(ncn);
+ mutex_enter(&cgm->cg_contents);
+}
+
/*
* cgrp_dirinit is used internally to initialize a directory (dir)
* with '.' and '..' entries without checking permissions and locking
@@ -615,19 +747,34 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
{
cgrp_dirent_t *dot, *dotdot;
timestruc_t now;
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dir->cgn_vnode);
+ cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
cgrp_ssde_t *ssdp;
cgrp_subsys_dirent_t *pseudo_files;
struct vattr nattr;
int i;
- ASSERT(RW_WRITE_HELD(&parent->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM);
ssdp = &cg_ssde_dir[cgm->cg_ssid];
/*
+ * If this is the top-level cgroup created by the mount then we need to
+ * count up the number of procs and tasks already running in the zone.
+ */
+
+ /*
+ * Set the cgroup ID for this cgrp_node by using a counter on each
+ * mount.
+ */
+ dir->cgn_id = cgm->cg_grp_gen++;
+ cgrp_cg_hash_insert(cgm, dir);
+ /* Initialize the first cgroup if this is the top-level group */
+ if (parent == dir)
+ cgrp_cg_hash_init(cgm, dir);
+
+ /*
* Initialize the entries
*/
dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP);
@@ -659,7 +806,7 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
dir->cgn_mtime = now;
dir->cgn_ctime = now;
- INCR_COUNT(&parent->cgn_nlink, &parent->cgn_tlock);
+ parent->cgn_nlink++;
parent->cgn_ctime = now;
dir->cgn_dir = dot;
@@ -672,28 +819,20 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr)
nattr.va_type = VREG;
nattr.va_rdev = 0;
+ /*
+ * If this is the top-level dir in the file system then it always
+ * has a release_agent pseudo file. Only the top-level dir has this
+ * file.
+ */
+ if (parent == dir) {
+ cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr,
+ cr);
+ }
+
pseudo_files = ssdp->cg_ssde_files;
for (i = 0; i < ssdp->cg_ssde_nfiles; i++) {
- cgrp_node_t *ncn;
-
- cgrp_direnter(cgm, dir, pseudo_files[i].cgrp_ssd_name,
- DE_CREATE, (cgrp_node_t *)NULL, &nattr, &ncn, cr, NULL);
-
- /*
- * Fix the inode and assign the pseudo file type to be correct.
- */
- ncn->cgn_nodeid = cgrp_inode(pseudo_files[i].cgrp_ssd_type,
- dir->cgn_nodeid);
- ncn->cgn_type = pseudo_files[i].cgrp_ssd_type;
-
- /*
- * Since we're creating these entries here and not via the
- * normal VOP_CREATE code path, we need to do the rele to drop
- * our hold. This will leave the vnode v_count at 0 when we
- * come out of cgrp_inactive but we won't reclaim the vnode
- * there since the cgn_nlink value will still be 1.
- */
- cgnode_rele(ncn);
+ cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name,
+ pseudo_files[i].cgrp_ssd_type, &nattr, cr);
}
}
@@ -705,8 +844,9 @@ cgrp_dirtrunc(cgrp_node_t *dir)
{
cgrp_dirent_t *cgdp;
timestruc_t now;
+ cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode);
- ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock));
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(dir->cgn_type == CG_CGROUP_DIR);
for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) {
@@ -726,7 +866,7 @@ cgrp_dirtrunc(cgrp_node_t *dir)
*/
cn = cgdp->cgd_cgrp_node;
ASSERT(cn->cgn_nlink > 0);
- DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock);
+ cn->cgn_nlink--;
cgrp_hash_out(cgdp);
kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen);
@@ -849,6 +989,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
{
cgrp_node_t *cn;
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
ASSERT(va != NULL);
if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
@@ -870,9 +1011,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va,
if (op == DE_MKDIR) {
cn->cgn_type = CG_CGROUP_DIR;
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
cgrp_dirinit(dir, cn, cred);
- rw_exit(&cn->cgn_rwlock);
}
*newnode = cn;
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
index 8066f184ce..b2ffa02418 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c
@@ -31,10 +31,12 @@
* For example, it is common to see cgroup trees (each is its own mount with a
* different subsystem controller) for blkio, cpuset, memory, systemd (has no
* controller), etc. Within each tree there is a top-level directory with at
- * least a cgroup.procs and tasks file listing the processes within that group,
- * although there could be subdirectories, which define new cgroups, that then
- * contain a subset of the processes. Each subdirectory also has, at a minimum,
- * a cgroup.procs and tasks file.
+ * least a cgroup.procs, notify_on_release, release_agent, and tasks file.
+ * The cgroup.procs file lists the processes within that group and the tasks
+ * file lists the threads in the group. There could be subdirectories, which
+ * define new cgroups, that then contain a subset of the processes. Each
+ * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and
+ * tasks file.
*
* Since we're using lx to run user-level code within zones, the majority (all?)
* of the cgroup resource management functionality simply doesn't apply to us.
@@ -45,14 +47,54 @@
* hierarchy and does not report that any resource management controllers are
* available for separate mounts.
*
+ * In addition to the hierarchy, the other important component of cgroups that
+ * is used by systemd is the 'release_agent'. This provides a mechanism to
+ * run a command when a cgroup becomes empty (the last task in the group
+ * leaves, either by exit or move, and there are no more sub-cgroups). The
+ * 'release_agent' file only exists in the top-level cgroup of the mounted
+ * file system and holds the path to a command to run. The 'notify_on_release'
+ * file exists in each cgroup dir. If that file contains a '1' then the agent
+ * is run when that group becomes empty. The agent is passed a path string of
+ * the cgroup, relative to the file system mount point (e.g. a mount on
+ * /sys/fs/cgroups/systemd with a sub-cgroup of foo/bar gets the arg foo/bar).
+ *
+ * Cgroup membership is implemented via hooks into the lx brand code. When
+ * the cgroup file system loads it installs callbacks for:
+ * lx_cgrp_forklwp
+ * lx_cgrp_procexit
+ * lx_cgrp_initlwp
+ * lx_cgrp_freelwp
+ * and when it unloads it clears those hooks. The lx brand code calls those
+ * hooks when a process/lwp starts and when it exits. Internally we use a
+ * simple reference counter (cgn_task_cnt) on the cgroup node to track how many
+ * threads are in the group, so we can tell when a group becomes empty.
+ * To make this quick, a hash table (cg_grp_hash) is maintained on the
+ * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is
+ * sized so that there should typically only be 0 or 1 cgroups per bucket.
+ * We also keep a reference to the file system in the zone-specific brand data
+ * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t
+ * when it runs the hook.
+ *
+ * Once a cgroup becomes empty, running the release agent is actually done
+ * by a user-level cgrpmgr process. That process makes a CGRPFS_GETEVNT
+ * ioctl which blocks until there is an event (i.e. the agent needs to run).
+ * Internally we maintain a list (cg_evnt_list) of release events on
+ * cgrp_mnt_t. The ioctl pulls an event off of the list, or blocks until an
+ * event is available, and then returns the event. The cgrpmgr process is
+ * started by the lx mount emulation when it mounts the file system. The
+ * cgrpmgr will exit when the ioctl returns EIO, indicating that the file
+ * system is being unmounted.
+ *
* This file system is similar to tmpfs in that directories only exist in
* memory. Each subdirectory represents a different cgroup. Within the cgroup
* there are pseudo files (see cg_ssde_dir) with well-defined names which
* control the configuration and behavior of the cgroup (see cgrp_nodetype_t).
- * The primary files within every cgroup are named 'cgroup.procs' and 'tasks'.
- * These are used to control and list which processes/threads belong to the
- * cgroup. In the general case there can be additional files in the cgroup
- * which define additional behavior, although none exists at this time.
+ * The primary files within every cgroup are named 'cgroup.procs',
+ * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the
+ * top-level cgroup). The cgroup.procs and tasks files are used to control and
+ * list which processes/threads belong to the cgroup. In the general case there
 * could be additional files in the cgroup, which define additional behavior
+ * (i.e. subsystem specific pseudo files), although none exist at this time.
*
* Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is
* used to correlate with the threads to determine cgroup membership. When
@@ -69,11 +111,27 @@
* - no file rename, but a directory (i.e. a cgroup) can be renamed within the
* containing directory, but not into a different directory
* - can mkdir and rmdir to create/destroy cgroups
- * - cannot rmdir while it contains a subdir (i.e. a sub-cgroup)
+ * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup)
* - open, read/write, close on the subsytem-specific pseudo files is
* allowed, as this is the interface to configure and report on the cgroup.
* The pseudo file's mode controls write access and cannot be changed.
*
+ * The locking in this file system is simple since the file system is not
+ * subjected to heavy I/O activity and all data is in-memory. There is a single
+ * global mutex for each mount (cg_contents). This mutex is held for the life
+ * of most vnode operations. The most active path is probably the LWP start and
+ * exit hooks which increment/decrement the reference counter on the cgroup
+ * node. The lock is important for this case since we don't want concurrent
+ * activity (such as moving the process into another cgroup) while we're trying
+ * to lookup the cgroup from the mount's hash table. We must be careful to
+ * avoid a deadlock while reading or writing since that code can take pidlock
+ * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of
+ * those is held. To prevent deadlock we always take cg_contents after pidlock
+ * and p_lock.
+ *
+ * In addition to the cg_contents lock there is also a second mutex (cg_events)
+ * used with the event queue condvar (cg_evnt_cv).
+ *
* EXTENDING THE FILE SYSTEM
*
* When adding support for a new subsystem, be sure to also update the
@@ -100,7 +158,8 @@
* list of cgroup IDs associated with every thread, instead of just one ID
* (br_cgroupid). The thread data would need to become a struct which held
* both an ID and an indication as to which mounted cgroup file system instance
- * the ID was associated with.
+ * the ID was associated with. We would also need a list of cgroup mounts per
+ * zone, instead of the current single zone reference.
*/
#include <sys/types.h>
@@ -123,6 +182,8 @@
#include <sys/systm.h>
#include <sys/mntent.h>
#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
#include <sys/lx_brand.h>
#include "cgrps.h"
@@ -131,6 +192,11 @@
static int cgrp_fstype;
static dev_t cgrp_dev;
+#define MAX_AGENT_EVENTS 32 /* max num queued events */
+
+#define UMNT_DELAY_TIME	drv_usectohz(50000)	/* 1/20th of a second */
+#define UMNT_RETRY_MAX	100			/* 100 times - 5 secs */
+
/*
* cgrp_mountcount is used to prevent module unloads while there is still
* state from a former mount hanging around. The filesystem module must not be
@@ -171,6 +237,12 @@ static int cgrp_root(struct vfs *, struct vnode **);
static int cgrp_statvfs(struct vfs *, struct statvfs64 *);
static void cgrp_freevfs(vfs_t *vfsp);
+/* Forward declarations for hooks */
+static void cgrp_proc_fork_helper(vfs_t *, uint_t, pid_t);
+static void cgrp_proc_exit_helper(vfs_t *, uint_t, pid_t);
+static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t);
+static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t);
+
/*
* Loadable module wrapper
*/
@@ -209,6 +281,12 @@ _fini()
if (cgrp_mountcount)
return (EBUSY);
+ /* Disable hooks used by the lx brand module. */
+ lx_cgrp_forklwp = NULL;
+ lx_cgrp_proc_exit = NULL;
+ lx_cgrp_initlwp = NULL;
+ lx_cgrp_freelwp = NULL;
+
if ((error = mod_remove(&modlinkage)) != 0)
return (error);
@@ -282,6 +360,12 @@ cgrp_init(int fstype, char *name)
*/
cgrp_dev = makedevice(dev, 0);
+ /* Install the hooks used by the lx brand module. */
+ lx_cgrp_forklwp = cgrp_proc_fork_helper;
+ lx_cgrp_proc_exit = cgrp_proc_exit_helper;
+ lx_cgrp_initlwp = cgrp_lwp_fork_helper;
+ lx_cgrp_freelwp = cgrp_lwp_exit_helper;
+
return (0);
}
@@ -294,6 +378,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
int error;
struct vattr rattr;
cgrp_ssid_t ssid = CG_SSID_GENERIC;
+ lx_zone_data_t *lxzdata;
if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
return (error);
@@ -309,6 +394,13 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
return (EINVAL);
/*
+ * We currently only support one mount per zone.
+ */
+ lxzdata = ztolxzd(curproc->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL)
+ return (EINVAL);
+
+ /*
* Ensure we don't allow overlaying mounts
*/
mutex_enter(&mvp->v_lock);
@@ -354,10 +446,15 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
/* Set but don't bother entering the mutex (not on mount list yet) */
mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&cgm->cg_events, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&cgm->cg_evnt_cv, NULL, CV_DRIVER, NULL);
- cgm->cg_vfsp = vfsp;
+ cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp;
+ cgm->cg_lxzdata = lxzdata;
cgm->cg_ssid = ssid;
- cgm->cg_gen = CG_START_ID;
+
+ list_create(&cgm->cg_evnt_list, sizeof (cgrp_evnt_t),
+ offsetof(cgrp_evnt_t, cg_evnt_lst));
vfsp->vfs_data = (caddr_t)cgm;
vfsp->vfs_fstype = cgrp_fstype;
@@ -368,15 +465,19 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
(void) strcpy(cgm->cg_mntpath, dpn.pn_path);
+ cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ,
+ KM_SLEEP);
+
/* allocate and initialize root cgrp_node structure */
bzero(&rattr, sizeof (struct vattr));
rattr.va_mode = (mode_t)(S_IFDIR | 0755);
rattr.va_type = VDIR;
rattr.va_rdev = 0;
cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP);
+
+ mutex_enter(&cgm->cg_contents);
cgrp_node_init(cgm, cp, &rattr, cr);
- rw_enter(&cp->cgn_rwlock, RW_WRITER);
CGNTOV(cp)->v_flag |= VROOT;
/*
@@ -393,7 +494,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
cp->cgn_nodeid = cgrp_inode(ssid, cgm->cg_gen);
cgrp_dirinit(cp, cp, cr);
- rw_exit(&cp->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
pn_free(&dpn);
error = 0;
@@ -414,15 +515,20 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
struct vnode *vp;
int error;
uint_t cnt;
+ int retry_cnt = 0;
if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
return (error);
+retry:
mutex_enter(&cgm->cg_contents);
/*
- * In the normal unmount case, if there are no
- * open files, only the root node should have a reference count.
+ * In the normal unmount case, if there were no open files, only the
+ * root node would have a reference count. However, the user-level
+ * agent manager should have the root vnode open and be waiting in
+ * ioctl. We need to wake the manager and it may take some retries
+ * before it closes its file descriptor.
*
* With cg_contents held, nothing can be added or removed.
* There may be some dirty pages. To prevent fsflush from
@@ -432,6 +538,29 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
*/
cgnp = cgm->cg_rootnode;
+ ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL);
+
+ mutex_enter(&cgm->cg_events);
+ cv_signal(&cgm->cg_evnt_cv);
+
+ /*
+ * Delete any queued events (normally there shouldn't be any).
+ */
+ for (;;) {
+ cgrp_evnt_t *evntp;
+
+ evntp = list_remove_head(&cgm->cg_evnt_list);
+ if (evntp == NULL)
+ break;
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+ cgm->cg_evnt_cnt--;
+ }
+
+ /* Set the counter to -1 so an incoming ioctl knows we're unmounting */
+ cgm->cg_evnt_cnt = -1;
+ mutex_exit(&cgm->cg_events);
+
vp = CGNTOV(cgnp);
mutex_enter(&vp->v_lock);
@@ -441,10 +570,16 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
return (EINVAL);
}
+
cnt = vp->v_count;
if (cnt > 1) {
mutex_exit(&vp->v_lock);
mutex_exit(&cgm->cg_contents);
+ /* Likely because the user-level manager hasn't exited yet */
+ if (retry_cnt++ < UMNT_RETRY_MAX) {
+ delay(UMNT_DELAY_TIME);
+ goto retry;
+ }
return (EBUSY);
}
@@ -476,6 +611,11 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
}
}
+ cgm->cg_lxzdata->lxzd_cgroup = NULL;
+ kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ);
+ list_destroy(&cgm->cg_evnt_list);
+ cv_destroy(&cgm->cg_evnt_cv);
+
/*
* We can drop the mutex now because
* no one can find this mount anymore
@@ -519,10 +659,10 @@ cgrp_freevfs(vfs_t *vfsp)
* Remove all directory entries
*/
for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
+ mutex_enter(&cgm->cg_contents);
if (cn->cgn_type == CG_CGROUP_DIR)
cgrp_dirtrunc(cn);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
}
ASSERT(cgm->cg_rootnode);
@@ -571,7 +711,7 @@ cgrp_freevfs(vfs_t *vfsp)
kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1);
mutex_destroy(&cgm->cg_contents);
- mutex_destroy(&cgm->cg_renamelck);
+ mutex_destroy(&cgm->cg_events);
kmem_free(cgm, sizeof (cgrp_mnt_t));
/* Allow _fini() to succeed now */
@@ -676,3 +816,186 @@ cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
sbp->f_namemax = MAXNAMELEN - 1;
return (0);
}
+
+static int
+cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen)
+{
+ cgrp_node_t *parent;
+ cgrp_dirent_t *dp;
+
+ buf[0] = '\0';
+
+ parent = cn->cgn_parent;
+ if (parent == NULL || parent == cn) {
+ (void) strlcpy(buf, ".", blen);
+ return (0);
+ }
+
+ /*
+ * Search the parent dir list to find this cn's name.
+ */
+ for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) {
+ if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) {
+ (void) strlcpy(buf, dp->cgd_name, blen);
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Enqueue an event for the user-level release_agent manager. The event data is the
+ * pathname (relative to the mount point of the file system) of the newly empty
+ * cgroup.
+ */
+void
+cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn)
+{
+ cgrp_node_t *parent;
+ char nm[MAXNAMELEN];
+ char *argstr, *oldstr, *tmp;
+ cgrp_evnt_t *evntp;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+
+ /* Nothing to do if the agent is not set */
+ if (cgm->cg_agent[0] == '\0')
+ return;
+
+ parent = cn->cgn_parent;
+ /* Cannot remove the top-level cgroup (only via unmount) */
+ if (parent == cn)
+ return;
+
+ argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ *argstr = '\0';
+
+ /*
+ * Iterate up the directory tree to construct the agent argument string.
+ */
+ do {
+ cgrp_get_dirname(cn, nm, sizeof (nm));
+ DTRACE_PROBE1(cgrp__dir__name, char *, nm);
+ if (*argstr == '\0') {
+ (void) strlcpy(argstr, nm, MAXPATHLEN);
+ } else {
+ tmp = oldstr;
+ oldstr = argstr;
+ argstr = tmp;
+ (void) snprintf(argstr, MAXPATHLEN, "%s/%s", nm,
+ oldstr);
+ }
+
+ if (cn->cgn_parent == NULL)
+ break;
+ cn = cn->cgn_parent;
+ parent = cn->cgn_parent;
+
+ /*
+ * The arg path is relative to the mountpoint so we stop when
+ * we get to the top level.
+ */
+ if (parent == NULL || parent == cn)
+ break;
+ } while (parent != cn);
+
+ kmem_free(oldstr, MAXPATHLEN);
+
+ DTRACE_PROBE1(cgrp__agent__event, char *, argstr);
+
+ /*
+ * Add the event to the list for the user-level agent. We add it to
+ * the end of the list (which should normally be an empty list since
+ * the user-level agent is designed to service events as quickly as
+ * it can).
+ */
+ evntp = kmem_zalloc(sizeof (cgrp_evnt_t), KM_SLEEP);
+ evntp->cg_evnt_path = argstr;
+
+ mutex_enter(&cgm->cg_events);
+ if (cgm->cg_evnt_cnt >= MAX_AGENT_EVENTS) {
+ /*
+ * We don't queue up an arbitrary number of events. Because
+ * the user-level manager should be servicing events quickly,
+ * if the list gets long then something is wrong.
+ */
+ cmn_err(CE_WARN, "cgrp: event queue full for zone %s",
+ ttoproc(curthread)->p_zone->zone_name);
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+
+ } else {
+ list_insert_tail(&cgm->cg_evnt_list, evntp);
+ cgm->cg_evnt_cnt++;
+ cv_signal(&cgm->cg_evnt_cv);
+ }
+ mutex_exit(&cgm->cg_events);
+}
+
+/*ARGSUSED*/
+static void
+cgrp_proc_fork_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid)
+{
+}
+
+/*ARGSUSED*/
+static void
+cgrp_proc_exit_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid)
+{
+ if (curproc->p_zone->zone_proc_initpid == pid ||
+ curproc->p_zone->zone_proc_initpid == -1) {
+ /*
+ * The zone's init just exited. If this is because of a zone
+ * reboot initiated from outside the zone, then we've never
+ * tried to unmount this fs, so we need to wakeup the
+ * user-level manager so that it can exit. It's also possible
+ * init died abnormally, but that leads to a zone reboot so the
+ * action is the same here.
+ */
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+
+ mutex_enter(&cgm->cg_events);
+ cv_signal(&cgm->cg_evnt_cv);
+ mutex_exit(&cgm->cg_events);
+ }
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ cn->cgn_task_cnt++;
+ mutex_exit(&cgm->cg_contents);
+
+ DTRACE_PROBE1(cgrp__lwp__fork, void *, cn);
+}
+
+/*ARGSUSED*/
+static void
+cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid)
+{
+ cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp);
+ cgrp_node_t *cn;
+
+ mutex_enter(&cgm->cg_contents);
+ cn = cgrp_cg_hash_lookup(cgm, cg_id);
+ ASSERT(cn != NULL);
+ VERIFY(cn->cgn_task_cnt > 0);
+ cn->cgn_task_cnt--;
+ DTRACE_PROBE1(cgrp__lwp__exit, void *, cn);
+
+ if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) &&
+ cn->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, cn);
+ }
+
+ mutex_exit(&cgm->cg_contents);
+}
diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
index f7eceb4e94..24640631f5 100644
--- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
+++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c
@@ -153,19 +153,69 @@ cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ)
}
/*
+ * Move a thread from one cgroup to another. If the old cgroup is empty
+ * we queue up an agent event. We return true in that case since we've
+ * dropped the locks and the caller needs to reacquire them.
+ */
+static boolean_t
+cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
+ uint_t cg_id, proc_t *p)
+{
+ cgrp_node_t *ocn;
+
+ ASSERT(MUTEX_HELD(&cgm->cg_contents));
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
+ VERIFY(ocn != NULL);
+
+ ASSERT(ocn->cgn_task_cnt > 0);
+ atomic_dec_32(&ocn->cgn_task_cnt);
+ atomic_inc_32(&ncn->cgn_task_cnt);
+ plwpd->br_cgroupid = cg_id;
+
+ if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
+ ocn->cgn_notify == 1) {
+ /*
+ * We want to drop p_lock before queuing the event since
+ * that might sleep. Dropping p_lock might cause the caller to
+ * have to restart the move process from the beginning.
+ */
+ mutex_exit(&p->p_lock);
+ cgrp_rel_agent_event(cgm, ocn);
+ mutex_exit(&cgm->cg_contents);
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
* Assign either all of the threads, or a single thread, for the specified pid
* to the new cgroup. Controlled by the typ argument.
*/
static int
-cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
+cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
{
proc_t *p;
kthread_t *t;
int error;
+ cgrp_node_t *ncn;
if (pid == 1)
pid = curproc->p_zone->zone_proc_initpid;
+ /*
+ * Move one or all threads to this cgroup.
+ */
+ if (typ == CG_WR_TASKS) {
+ error = ESRCH;
+ } else {
+ error = 0;
+ }
+
+restart:
mutex_enter(&pidlock);
p = cgrp_p_for_wr(pid, typ);
@@ -194,39 +244,48 @@ cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
* Ignore writes for PID which is not an lx-branded process or with
* no threads.
*/
+
mutex_enter(&p->p_lock);
- if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) {
+ mutex_exit(&pidlock);
+ if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
+ p->p_flag & SEXITING) {
mutex_exit(&p->p_lock);
- mutex_exit(&pidlock);
return (0);
}
- /*
- * Move one or all threads to this cgroup.
- */
- if (typ == CG_WR_TASKS) {
- error = ESRCH;
- } else {
- error = 0;
- }
+ mutex_enter(&cgm->cg_contents);
+
+ ncn = cgrp_cg_hash_lookup(cgm, cg_id);
+ VERIFY(ncn != NULL);
do {
lx_lwp_data_t *plwpd = ttolxlwp(t);
- if (plwpd != NULL) {
+ if (plwpd != NULL && plwpd->br_cgroupid != cg_id) {
if (typ == CG_WR_PROCS) {
- plwpd->br_cgroupid = cg_id;
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ /*
+ * We dropped all of the locks so we
+ * need to start over.
+ */
+ goto restart;
+ }
+
} else if (plwpd->br_pid == pid) {
/* type is CG_WR_TASKS and we found the task */
- plwpd->br_cgroupid = cg_id;
error = 0;
- break;
+ if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+ goto done;
+ } else {
+ break;
+ }
}
}
t = t->t_forw;
} while (t != p->p_tlist);
+ mutex_exit(&cgm->cg_contents);
mutex_exit(&p->p_lock);
- mutex_exit(&pidlock);
+done:
return (error);
}
@@ -273,7 +332,56 @@ cgrp_get_pid_str(struct uio *uio, pid_t *pid)
}
static int
-cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ)
+cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio)
+{
+ int error;
+ uint_t value;
+
+ /*
+ * This is cheesy but since we only take a 0 or 1 value we can
+ * let the pid_str function do the uio string conversion.
+ */
+ error = cgrp_get_pid_str(uio, (pid_t *)&value);
+ if (error != 0)
+ return (error);
+
+ if (value != 0 && value != 1)
+ return (EINVAL);
+
+ /*
+ * The flag is on the containing dir. We don't bother taking the
+ * cg_contents lock since this is a simple assignment.
+ */
+ cn->cgn_parent->cgn_notify = value;
+ return (0);
+}
+
+static int
+cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int error;
+ int len;
+ char *wrp;
+
+ len = uio->uio_offset + uio->uio_resid;
+ if (len > MAXPATHLEN)
+ return (EFBIG);
+
+ mutex_enter(&cgm->cg_contents);
+
+ wrp = &cgm->cg_agent[uio->uio_offset];
+ error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio);
+ cgm->cg_agent[len] = '\0';
+ if (len > 1 && cgm->cg_agent[len - 1] == '\n')
+ cgm->cg_agent[len - 1] = '\0';
+
+ mutex_exit(&cgm->cg_contents);
+ return (error);
+}
+
+static int
+cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio,
+ cgrp_wr_type_t typ)
{
/* the cgroup ID is on the containing dir */
uint_t cg_id = cn->cgn_parent->cgn_id;
@@ -285,7 +393,7 @@ cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ)
if (error != 0)
return (error);
- error = cgrp_proc_set_id(cg_id, pidnum, typ);
+ error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ);
if (error != 0)
return (error);
}
@@ -304,9 +412,6 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr,
vp = CGNTOV(cn);
ASSERT(vp->v_type == VREG);
- ASSERT(RW_WRITE_HELD(&cn->cgn_contents));
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
-
if (uio->uio_loffset < 0)
return (EINVAL);
@@ -323,11 +428,17 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr,
limit = MAXOFF_T;
switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_wr_notify(cn, uio);
+ break;
case CG_PROCS:
- error = cgrp_wr_proc_or_task(cn, uio, CG_WR_PROCS);
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS);
+ break;
+ case CG_REL_AGENT:
+ error = cgrp_wr_rel_agent(cgm, uio);
break;
case CG_TASKS:
- error = cgrp_wr_proc_or_task(cn, uio, CG_WR_TASKS);
+ error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS);
break;
default:
VERIFY(0);
@@ -351,6 +462,12 @@ cgrp_p_lock(proc_t *p)
/* first try the fast path */
mutex_enter(&p->p_lock);
+ if (p->p_flag & SEXITING) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ return (NULL);
+ }
+
if (!(p->p_proc_flag & P_PR_LOCK)) {
p->p_proc_flag |= P_PR_LOCK;
mutex_exit(&p->p_lock);
@@ -404,13 +521,76 @@ cgrp_p_unlock(proc_t *p)
ASSERT(MUTEX_HELD(&p->p_lock));
ASSERT(!MUTEX_HELD(&pidlock));
- cv_signal(&pr_pid_cv[p->p_slot]);
p->p_proc_flag &= ~P_PR_LOCK;
+ cv_signal(&pr_pid_cv[p->p_slot]);
mutex_exit(&p->p_lock);
THREAD_KPRI_RELEASE();
}
/*
+ * Read value from the notify_on_release pseudo file on the parent node
+ * (which is the actual cgroup node). We don't bother taking the cg_contents
+ * lock since it's a single instruction so an empty group action/read will
+ * only see one value or the other.
+ */
+/* ARGSUSED */
+static int
+cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char buf[16];
+ char *rdp;
+ /* the flag is on the containing dir */
+ uint_t value = cn->cgn_parent->cgn_notify;
+
+ len = snprintf(buf, sizeof (buf), "%u\n", value);
+ if (uio->uio_offset > len)
+ return (0);
+
+ len -= uio->uio_offset;
+ rdp = &buf[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+ return (error);
+}
+
+/*
+ * Read value from the release_agent pseudo file.
+ */
+static int
+cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio)
+{
+ int len;
+ int error = 0;
+ char *rdp;
+
+ mutex_enter(&cgm->cg_contents);
+
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len = strlen(cgm->cg_agent);
+ if (uio->uio_offset > len) {
+ mutex_exit(&cgm->cg_contents);
+ return (0);
+ }
+
+ len -= uio->uio_offset;
+ rdp = &cgm->cg_agent[uio->uio_offset];
+ len = (uio->uio_resid < len) ? uio->uio_resid : len;
+
+ error = uiomove(rdp, len, UIO_READ, uio);
+
+ mutex_exit(&cgm->cg_contents);
+
+ return (error);
+}
+
+/*
* Read pids from the cgroup.procs pseudo file. We have to look at all of the
* processes to find applicable ones, then report pids for any process which
* has all of its threads in the same cgroup.
@@ -470,6 +650,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
* Check if all threads are in this cgroup.
*/
in_cg = B_TRUE;
+ mutex_enter(&cgm->cg_contents);
do {
lx_lwp_data_t *plwpd = ttolxlwp(t);
if (plwpd == NULL || plwpd->br_cgroupid != cg_id) {
@@ -479,6 +660,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
t = t->t_forw;
} while (t != p->p_tlist);
+ mutex_exit(&cgm->cg_contents);
mutex_exit(&p->p_lock);
if (!in_cg) {
@@ -647,7 +829,9 @@ cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio)
if (p == NULL)
continue;
+ mutex_enter(&cgm->cg_contents);
error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio);
+ mutex_exit(&cgm->cg_contents);
mutex_enter(&p->p_lock);
cgrp_p_unlock(p);
@@ -664,8 +848,6 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
{
int error = 0;
- ASSERT(RW_LOCK_HELD(&cn->cgn_contents));
-
if (uio->uio_loffset >= MAXOFF_T)
return (0);
if (uio->uio_loffset < 0)
@@ -674,9 +856,15 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct)
return (0);
switch (cn->cgn_type) {
+ case CG_NOTIFY:
+ error = cgrp_rd_notify(cgm, cn, uio);
+ break;
case CG_PROCS:
error = cgrp_rd_procs(cgm, cn, uio);
break;
+ case CG_REL_AGENT:
+ error = cgrp_rd_rel_agent(cgm, uio);
+ break;
case CG_TASKS:
error = cgrp_rd_tasks(cgm, cn, uio);
break;
@@ -692,8 +880,8 @@ static int
cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
struct caller_context *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
int error;
/*
@@ -703,17 +891,8 @@ cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
return (EISDIR);
if (vp->v_type != VREG)
return (EINVAL);
- /*
- * cgrp_rwlock should have already been called from layers above
- */
- ASSERT(RW_READ_HELD(&cn->cgn_rwlock));
-
- rw_enter(&cn->cgn_contents, RW_READER);
-
error = cgrp_rd(cgm, cn, uiop, ct);
- rw_exit(&cn->cgn_contents);
-
return (error);
}
@@ -721,8 +900,8 @@ static int
cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
struct caller_context *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VTOCGM(vp);
int error;
/*
@@ -731,11 +910,6 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
if (vp->v_type != VREG)
return (EINVAL);
- /* cgrp_rwlock should have already been called from layers above */
- ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock));
-
- rw_enter(&cn->cgn_contents, RW_WRITER);
-
if (ioflag & FAPPEND) {
/* In append mode start at end of file. */
uiop->uio_loffset = cn->cgn_size;
@@ -743,21 +917,146 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
error = cgrp_wr(cgm, cn, uiop, cred, ct);
- rw_exit(&cn->cgn_contents);
-
return (error);
}
+static int
+cgrp_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, cred_t *cr,
+ int *rvalp, caller_context_t *ct)
+{
+ cgrp_mnt_t *cgm = VTOCGM(vp);
+ model_t model;
+ cgrpmgr_info_t cgmi;
+ cgrp_evnt_t *evntp;
+ int res = 0;
+
+ /* We only support the cgrpmgr ioctls on the root vnode */
+ if (!(vp->v_flag & VROOT))
+ return (ENOTTY);
+
+ /* The caller must be root */
+ if (secpolicy_vnode_any_access(cr, vp, crgetuid(cr)) != 0 ||
+ crgetuid(cr) != 0)
+ return (ENOTTY);
+
+ if (cmd != CGRPFS_GETEVNT)
+ return (ENOTTY);
+
+ model = get_udatamodel();
+ if (model == DATAMODEL_NATIVE) {
+ if (copyin((void *)data, &cgmi, sizeof (cgmi)))
+ return (EFAULT);
+
+ } else {
+ cgrpmgr_info32_t cgmi32;
+
+ if (copyin((void *)data, &cgmi32, sizeof (cgmi32)))
+ return (EFAULT);
+
+ cgmi.cgmi_pid = cgmi32.cgmi_pid;
+ cgmi.cgmi_rel_agent_path =
+ (char *)(intptr_t)cgmi32.cgmi_rel_agent_path;
+ cgmi.cgmi_cgroup_path =
+ (char *)(intptr_t)cgmi32.cgmi_cgroup_path;
+ }
+
+ if (cgm->cg_mgrpid == 0) {
+ /*
+ * This is the initial call from the user-level manager,
+ * keep track of its pid.
+ */
+ cgm->cg_mgrpid = cgmi.cgmi_pid;
+ } else if (cgm->cg_mgrpid != cgmi.cgmi_pid) {
+ /*
+ * We only allow the manager which first contacted us to
+ * make this ioctl.
+ */
+ return (EINVAL);
+ }
+
+ /*
+ * If there is a pending event, service it immediately, otherwise
+ * block until an event occurs.
+ */
+retry:
+ mutex_enter(&cgm->cg_events);
+
+ if (cgm->cg_evnt_cnt < 0) {
+ /*
+ * Trying to unmount, tell the manager to quit.
+ */
+ mutex_exit(&cgm->cg_events);
+ return (EIO);
+ }
+
+ if (cgm->cg_evnt_cnt == 0) {
+ cv_wait_sig(&cgm->cg_evnt_cv, &cgm->cg_events);
+
+ if (cgm->cg_evnt_cnt <= 0) {
+ /*
+ * We were woken up but there are no events, it must
+ * be due to an unmount and it's time for the user
+ * manager to go away.
+ */
+ mutex_exit(&cgm->cg_events);
+ return (EIO);
+ }
+ }
+
+ evntp = list_remove_head(&cgm->cg_evnt_list);
+ VERIFY(evntp != NULL);
+ ASSERT(cgm->cg_evnt_cnt > 0);
+ cgm->cg_evnt_cnt--;
+
+ mutex_exit(&cgm->cg_events);
+
+ /*
+ * An event for the user-level manager should only occur if a
+ * release_agent has been set, but on the unlikely chance that the
+ * agent path was cleared after the event was enqueued, we check under
+ * the lock and go back to waiting if the path is empty.
+ */
+ mutex_enter(&cgm->cg_contents);
+ if (cgm->cg_agent[0] == '\0') {
+ mutex_exit(&cgm->cg_contents);
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+ goto retry;
+ }
+
+ if (copyout(cgm->cg_agent, (void *)cgmi.cgmi_rel_agent_path,
+ strlen(cgm->cg_agent) + 1)) {
+ mutex_exit(&cgm->cg_contents);
+ res = EFAULT;
+ goto done;
+ }
+
+ mutex_exit(&cgm->cg_contents);
+
+ if (copyout(evntp->cg_evnt_path, (void *)cgmi.cgmi_cgroup_path,
+ strlen(evntp->cg_evnt_path) + 1)) {
+ res = EFAULT;
+ }
+
+done:
+ kmem_free(evntp->cg_evnt_path, MAXPATHLEN);
+ kmem_free(evntp, sizeof (cgrp_evnt_t));
+
+ return (res);
+}
+
/* ARGSUSED2 */
static int
cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
struct vattr va;
int attrs = 1;
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
if (attrs == 0) {
cn->cgn_uid = va.va_uid;
cn->cgn_gid = va.va_gid;
@@ -778,7 +1077,7 @@ cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
vap->va_seq = cn->cgn_seq;
vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (0);
}
@@ -787,7 +1086,8 @@ static int
cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
int error = 0;
struct vattr *get;
long mask;
@@ -799,7 +1099,8 @@ cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
(vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE))
return (EINVAL);
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
get = &cn->cgn_attr;
/*
@@ -832,7 +1133,7 @@ cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred,
gethrestime(&cn->cgn_ctime);
out:
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -841,12 +1142,14 @@ static int
cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred,
caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
int error;
- mutex_enter(&cn->cgn_tlock);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_taccess(cn, mode, cred);
- mutex_exit(&cn->cgn_tlock);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -856,7 +1159,8 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *cn = VTOCGN(dvp);
+ cgrp_mnt_t *cgm;
cgrp_node_t *ncn = NULL;
int error;
@@ -874,7 +1178,10 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
}
ASSERT(cn);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(cn, nm, &ncn, cred);
+ mutex_exit(&cgm->cg_contents);
if (error == 0) {
ASSERT(ncn);
@@ -890,17 +1197,21 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred,
int flag, caller_context_t *ct, vsecattr_t *vsecp)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
int error;
if (*nm == '\0')
return (EPERM);
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &cn, cred);
if (error == 0) { /* name found */
ASSERT(cn);
+ mutex_exit(&cgm->cg_contents);
/*
* Creating an existing file, allow it except for the following
* errors.
@@ -919,6 +1230,7 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap,
*vpp = CGNTOV(cn);
return (0);
}
+ mutex_exit(&cgm->cg_contents);
/*
* cgroups doesn't allow creation of additional, non-subsystem specific
@@ -932,9 +1244,10 @@ static int
cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
caller_context_t *ct, int flags)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
int error;
cgrp_node_t *cn = NULL;
+ cgrp_mnt_t *cgm;
/*
* Removal of subsystem-specific files is not allowed but we need
@@ -942,7 +1255,10 @@ cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred,
* file.
*/
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &cn, cred);
+ mutex_exit(&cgm->cg_contents);
if (error)
return (error);
@@ -979,11 +1295,11 @@ cgrp_rename(
cgrp_node_t *fromparent;
cgrp_node_t *toparent;
cgrp_node_t *fromcn = NULL; /* source cgrp_node */
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(odvp);
+ cgrp_mnt_t *cgm = VTOCGM(odvp);
int error, err;
- fromparent = (cgrp_node_t *)VTOCGN(odvp);
- toparent = (cgrp_node_t *)VTOCGN(ndvp);
+ fromparent = VTOCGN(odvp);
+ toparent = VTOCGN(ndvp);
if (fromparent != toparent)
return (EIO);
@@ -991,14 +1307,14 @@ cgrp_rename(
/* discourage additional use of toparent */
toparent = NULL;
- mutex_enter(&cgm->cg_renamelck);
+ mutex_enter(&cgm->cg_contents);
/*
* Look up cgrp_node of file we're supposed to rename.
*/
error = cgrp_dirlookup(fromparent, onm, &fromcn, cred);
if (error) {
- mutex_exit(&cgm->cg_renamelck);
+ mutex_exit(&cgm->cg_contents);
return (error);
}
@@ -1030,11 +1346,9 @@ cgrp_rename(
/*
* Link source to new target
*/
- rw_enter(&fromparent->cgn_rwlock, RW_WRITER);
error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME,
fromcn, (struct vattr *)NULL,
(cgrp_node_t **)NULL, cred, ct);
- rw_exit(&fromparent->cgn_rwlock);
if (error)
goto done;
@@ -1042,9 +1356,6 @@ cgrp_rename(
/*
* Unlink from source.
*/
- rw_enter(&fromparent->cgn_rwlock, RW_WRITER);
- rw_enter(&fromcn->cgn_rwlock, RW_WRITER);
-
error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred);
/*
@@ -1054,17 +1365,14 @@ cgrp_rename(
if (error == ENOENT)
error = 0;
- rw_exit(&fromcn->cgn_rwlock);
- rw_exit(&fromparent->cgn_rwlock);
-
if (err == 0) {
vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct);
vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct);
}
done:
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(fromcn);
- mutex_exit(&cgm->cg_renamelck);
return (error);
}
@@ -1074,9 +1382,9 @@ static int
cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_node_t *self = NULL;
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dvp);
+ cgrp_mnt_t *cgm = VTOCGM(dvp);
int error;
/*
@@ -1086,25 +1394,28 @@ cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
if (parent->cgn_nlink == 0)
return (ENOENT);
+ mutex_enter(&cgm->cg_contents);
error = cgrp_dirlookup(parent, nm, &self, cred);
if (error == 0) {
ASSERT(self != NULL);
+ mutex_exit(&cgm->cg_contents);
cgnode_rele(self);
return (EEXIST);
}
- if (error != ENOENT)
+ if (error != ENOENT) {
+ mutex_exit(&cgm->cg_contents);
return (error);
+ }
- rw_enter(&parent->cgn_rwlock, RW_WRITER);
error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL,
va, &self, cred, ct);
if (error) {
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
if (self != NULL)
cgnode_rele(self);
return (error);
}
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
*vpp = CGNTOV(self);
return (0);
}
@@ -1114,7 +1425,7 @@ static int
cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
caller_context_t *ct, int flags)
{
- cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp);
+ cgrp_node_t *parent = VTOCGN(dvp);
cgrp_mnt_t *cgm;
cgrp_node_t *self = NULL;
struct vnode *vp;
@@ -1127,63 +1438,61 @@ cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
return (EINVAL);
if (strcmp(nm, "..") == 0)
return (EEXIST); /* Should be ENOTEMPTY */
+
+ cgm = VTOCGM(parent->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
error = cgrp_dirlookup(parent, nm, &self, cred);
- if (error)
+ if (error) {
+ mutex_exit(&cgm->cg_contents);
return (error);
-
- rw_enter(&parent->cgn_rwlock, RW_WRITER);
- rw_enter(&self->cgn_rwlock, RW_WRITER);
+ }
vp = CGNTOV(self);
if (vp == dvp || vp == cdir) {
error = EINVAL;
- goto done1;
+ goto done;
}
if (self->cgn_type != CG_CGROUP_DIR) {
error = ENOTDIR;
- goto done1;
+ goto done;
}
cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp);
- mutex_enter(&self->cgn_tlock);
- /* Check for the existence of any sub-cgroup directories */
- if (self->cgn_nlink > 2) {
- mutex_exit(&self->cgn_tlock);
+ /*
+ * Check for the existence of any sub-cgroup directories or tasks in
+ * the cgroup.
+ */
+ if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) {
error = EEXIST;
- goto done1;
+ /*
+	 * Update atime because checking cgn_dirents is logically
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->cgn_atime);
+ goto done;
}
- mutex_exit(&self->cgn_tlock);
if (vn_vfswlock(vp)) {
error = EBUSY;
- goto done1;
+ goto done;
}
if (vn_mountedvfs(vp) != NULL) {
error = EBUSY;
- goto done;
+ } else {
+ error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
}
- /*
- * Confirm directory only includes entries for ".", ".." and the
- * fixed pseudo file entries.
- */
- if (self->cgn_dirents > (cgrp_num_pseudo_ents(cgm->cg_ssid) + 2)) {
- error = EEXIST; /* should be ENOTEMPTY */
- /*
- * Update atime because checking cn_dirents is logically
- * equivalent to reading the directory
- */
- gethrestime(&self->cgn_atime);
- goto done;
+ vn_vfsunlock(vp);
+
+ if (parent->cgn_task_cnt == 0 &&
+ parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) {
+ cgrp_rel_agent_event(cgm, parent);
}
- error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred);
done:
- vn_vfsunlock(vp);
-done1:
- rw_exit(&self->cgn_rwlock);
- rw_exit(&parent->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
vnevent_rmdir(CGNTOV(self), dvp, nm, ct);
cgnode_rele(self);
@@ -1195,7 +1504,8 @@ static int
cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
caller_context_t *ct, int flags)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm;
cgrp_dirent_t *cdp;
int error = 0;
size_t namelen;
@@ -1212,10 +1522,6 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
*eofp = 1;
return (0);
}
- /*
- * assuming system call has already called cgrp_rwlock
- */
- ASSERT(RW_READ_HELD(&cn->cgn_rwlock));
if (uiop->uio_iovcnt != 1)
return (EINVAL);
@@ -1223,8 +1529,12 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
if (vp->v_type != VDIR)
return (ENOTDIR);
+ cgm = VTOCGM(cn->cgn_vnode);
+ mutex_enter(&cgm->cg_contents);
+
if (cn->cgn_dir == NULL) {
VERIFY(cn->cgn_nlink == 0);
+ mutex_exit(&cgm->cg_contents);
return (0);
}
@@ -1284,6 +1594,9 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp,
uiop->uio_offset = offset;
}
gethrestime(&cn->cgn_atime);
+
+ mutex_exit(&cgm->cg_contents);
+
kmem_free(outbuf, bufsize);
return (error);
}
@@ -1301,11 +1614,10 @@ cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm,
static void
cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
{
- cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp);
- cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vp->v_vfsp);
+ cgrp_node_t *cn = VTOCGN(vp);
+ cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp);
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- mutex_enter(&cn->cgn_tlock);
+ mutex_enter(&cgm->cg_contents);
mutex_enter(&vp->v_lock);
ASSERT(vp->v_count >= 1);
@@ -1316,27 +1628,22 @@ cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
if (vp->v_count > 1 || cn->cgn_nlink != 0) {
vp->v_count--;
mutex_exit(&vp->v_lock);
- mutex_exit(&cn->cgn_tlock);
- rw_exit(&cn->cgn_rwlock);
+ mutex_exit(&cgm->cg_contents);
return;
}
- mutex_exit(&vp->v_lock);
- mutex_exit(&cn->cgn_tlock);
- /* Here's our chance to send invalid event while we're between locks */
- vn_invalid(CGNTOV(cn));
-
- mutex_enter(&cgm->cg_contents);
if (cn->cgn_forw == NULL)
cgm->cg_rootnode->cgn_back = cn->cgn_back;
else
cn->cgn_forw->cgn_back = cn->cgn_back;
cn->cgn_back->cgn_forw = cn->cgn_forw;
+
+ mutex_exit(&vp->v_lock);
mutex_exit(&cgm->cg_contents);
- rw_exit(&cn->cgn_rwlock);
- rw_destroy(&cn->cgn_rwlock);
- mutex_destroy(&cn->cgn_tlock);
+ /* Here's our chance to send invalid event */
+ vn_invalid(CGNTOV(cn));
+
vn_free(CGNTOV(cn));
kmem_free(cn, sizeof (cgrp_node_t));
}
@@ -1349,27 +1656,17 @@ cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}
-/* ARGSUSED2 */
+/* ARGSUSED */
static int
cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
- cgrp_node_t *cn = VTOCGN(vp);
-
- if (write_lock) {
- rw_enter(&cn->cgn_rwlock, RW_WRITER);
- } else {
- rw_enter(&cn->cgn_rwlock, RW_READER);
- }
return (write_lock);
}
-/* ARGSUSED1 */
+/* ARGSUSED */
static void
cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
- cgrp_node_t *cn = VTOCGN(vp);
-
- rw_exit(&cn->cgn_rwlock);
}
static int
@@ -1412,6 +1709,7 @@ const fs_operation_def_t cgrp_vnodeops_template[] = {
VOPNAME_CLOSE, { .vop_close = cgrp_close },
VOPNAME_READ, { .vop_read = cgrp_read },
VOPNAME_WRITE, { .vop_write = cgrp_write },
+ VOPNAME_IOCTL, { .vop_ioctl = cgrp_ioctl },
VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr },
VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr },
VOPNAME_ACCESS, { .vop_access = cgrp_access },
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
index feccf31800..44acdff3b1 100644
--- a/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -205,6 +205,14 @@ lx_systrace_f *lx_systrace_return_ptr;
static int lx_systrace_enabled;
/*
+ * cgroup file system maintenance functions, set when the cgroups module loads.
+ */
+void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t);
+void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t);
+void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
+/*
* While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
* want an MMU dependency here (and should there be a microprocessor without
* a hole, we don't want to start allocating from the top of the VA range).
@@ -312,6 +320,16 @@ lx_proc_exit(proc_t *p)
{
lx_proc_data_t *lxpd;
proc_t *cp;
+ lx_zone_data_t *lxzdata;
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread));
+ ASSERT(lx_cgrp_proc_exit != NULL);
+ (*lx_cgrp_proc_exit)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, p->p_pid);
+ }
mutex_enter(&p->p_lock);
VERIFY(lxpd = ptolxproc(p));
diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c
index 67565379fe..3577749b66 100644
--- a/usr/src/uts/common/brand/lx/os/lx_misc.c
+++ b/usr/src/uts/common/brand/lx/os/lx_misc.c
@@ -261,6 +261,7 @@ lx_freelwp(klwp_t *lwp)
{
struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
VERIFY(MUTEX_NOT_HELD(&p->p_lock));
@@ -279,6 +280,14 @@ lx_freelwp(klwp_t *lwp)
return;
}
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_freelwp != NULL);
+ (*lx_cgrp_freelwp)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid);
+ }
+
/*
* It is possible for the lx_freelwp hook to be called without a prior
* call to lx_exitlwp being made. This happens as part of lwp
@@ -370,6 +379,7 @@ lx_initlwp(klwp_t *lwp, void *lwpbd)
lx_lwp_data_t *plwpd = ttolxlwp(curthread);
kthread_t *tp = lwptot(lwp);
proc_t *p = lwptoproc(lwp);
+ lx_zone_data_t *lxzdata;
VERIFY(MUTEX_HELD(&p->p_lock));
VERIFY(lwp->lwp_brand == NULL);
@@ -452,6 +462,15 @@ lx_initlwp(klwp_t *lwp, void *lwpbd)
lx_ptrace_inherit_tracer(plwpd, lwpd);
lwpd->br_cgroupid = plwpd->br_cgroupid;
}
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(p->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_initlwp != NULL);
+ (*lx_cgrp_initlwp)(lxzdata->lxzd_cgroup,
+ lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid);
+ }
+
}
/*
@@ -465,6 +484,7 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
{
struct lx_lwp_data *src = srclwp->lwp_brand;
struct lx_lwp_data *dst = dstlwp->lwp_brand;
+ lx_zone_data_t *lxzdata;
dst->br_ppid = src->br_pid;
dst->br_ptid = lwptot(srclwp)->t_tid;
@@ -496,6 +516,15 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
*/
dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
dst->br_scall_args = NULL;
+
+ /* cgroup integration */
+ lxzdata = ztolxzd(srclwp->lwp_procp->p_zone);
+ if (lxzdata->lxzd_cgroup != NULL) {
+ ASSERT(lx_cgrp_forklwp != NULL);
+ (*lx_cgrp_forklwp)(lxzdata->lxzd_cgroup,
+ dst->br_cgroupid, lwptoproc(dstlwp)->p_pid);
+ }
+
}
/*
diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h
index e6288fac57..895ea44db5 100644
--- a/usr/src/uts/common/brand/lx/sys/lx_brand.h
+++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h
@@ -35,6 +35,7 @@
#include <sys/cpuvar.h>
#include <sys/zone.h>
#include <sys/ksocket.h>
+#include <sys/vfs.h>
#endif
#ifdef __cplusplus
@@ -383,6 +384,14 @@ typedef enum lx_proc_flags {
#ifdef _KERNEL
+/*
+ * Entry points for cgroup integration.
+ */
+extern void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t);
+extern void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t);
+extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t);
+extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t);
+
#define LX_RLFAKE_LOCKS 0
#define LX_RLFAKE_NICE 1
#define LX_RLFAKE_RTPRIO 2
@@ -632,11 +641,18 @@ struct lx_lwp_data {
*/
#define LX_BR_ARGS_SIZE_MAX (1024)
-/* brand specific data */
+/*
+ * brand specific data
+ *
+ * We currently support only a single cgroup mount in an lx zone, so we have
+ * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever
+ * enhanced to support different mounts with different subsystem controllers.
+ */
typedef struct lx_zone_data {
char lxzd_kernel_version[LX_VERS_MAX];
ksocket_t lxzd_ioctl_sock;
char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */
+ vfs_t *lxzd_cgroup; /* cgroup for this zone */
} lx_zone_data_t;
#define BR_CPU_BOUND 0x0001