diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2015-07-24 13:34:15 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2015-07-24 13:34:15 +0000 |
commit | 431ca10ae7ca970d65f15fe0a1115ee749a97433 (patch) | |
tree | c91d28297006e34628e27a39dd6084952d077c0f | |
parent | 7be989b3b3d0affc5705ea8b81d4b84ec65d8246 (diff) | |
download | illumos-joyent-431ca10ae7ca970d65f15fe0a1115ee749a97433.tar.gz |
OS-4495 support cgroups notify_on_release and release_agent
-rw-r--r-- | manifest | 3 | ||||
-rw-r--r-- | usr/src/lib/brand/lx/Makefile | 2 | ||||
-rw-r--r-- | usr/src/lib/brand/lx/cgrpmgr/Makefile | 56 | ||||
-rw-r--r-- | usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c | 157 | ||||
-rw-r--r-- | usr/src/lib/brand/lx/lx_brand/common/mount.c | 51 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/cgroups/cgrps.h | 110 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/cgroups/cgrps_node.c | 299 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c | 361 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c | 582 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/os/lx_brand.c | 18 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/os/lx_misc.c | 29 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/sys/lx_brand.h | 18 |
12 files changed, 1405 insertions, 281 deletions
@@ -5045,11 +5045,12 @@ s usr/lib/brand/lx/64=amd64 d usr/lib/brand/lx/amd64 0755 root bin f usr/lib/brand/lx/amd64/lx_librtld_db.so.1 0755 root root f usr/lib/brand/lx/amd64/lx_vdso.so.1 0755 root root +f usr/lib/brand/lx/cgrpmgr 0755 root root +f usr/lib/brand/lx/etc_default_nfs 0444 root root d usr/lib/brand/lx/ld 0755 root root f usr/lib/brand/lx/ld/ld.config 0755 root root d usr/lib/brand/lx/ld/64 0755 root root f usr/lib/brand/lx/ld/64/ld.config 0755 root root -f usr/lib/brand/lx/etc_default_nfs 0444 root root f usr/lib/brand/lx/ltp_skiplist 0444 root root f usr/lib/brand/lx/ltp_tests 0444 root root f usr/lib/brand/lx/lx_boot 0755 root root diff --git a/usr/src/lib/brand/lx/Makefile b/usr/src/lib/brand/lx/Makefile index 2c5a373e25..67f2926305 100644 --- a/usr/src/lib/brand/lx/Makefile +++ b/usr/src/lib/brand/lx/Makefile @@ -33,7 +33,7 @@ include Makefile.lx .PARALLEL: SUBDIRS= cmd librtld_db lx_support lx_init lx_brand netfiles \ - zone lx_vdso testing .WAIT + zone lx_vdso cgrpmgr testing .WAIT MSGSUBDIRS= lx_brand lx_support zone all := TARGET= all diff --git a/usr/src/lib/brand/lx/cgrpmgr/Makefile b/usr/src/lib/brand/lx/cgrpmgr/Makefile new file mode 100644 index 0000000000..26aa079d63 --- /dev/null +++ b/usr/src/lib/brand/lx/cgrpmgr/Makefile @@ -0,0 +1,56 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
+# + +PROG = cgrpmgr + +PROG_OBJS = cgrpmgr.o + +OBJS = $(PROG_OBJS) +SRCS = $(PROG_OBJS:%.o=%.c) + +all: $(PROG) + +include ../Makefile.lx +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.ctf + +# override the install directory +ROOTBIN = $(ROOTBRANDDIR) +CLOBBERFILES = $(OBJS) $(ROOTPROG) + +UTSBASE = $(SRC)/uts + +CFLAGS += $(CCVERBOSE) +CPPFLAGS += -D_REENTRANT -I$(UTSBASE)/common/brand/lx/cgroups +LDLIBS += + +.KEEP_STATE: + +install: all $(ROOTPROG) + +clean: + $(RM) $(PROG) $(OBJS) + +lint: lint_PROG lint_SRCS + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(POST_PROCESS) + +%.o: %.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +include $(SRC)/cmd/Makefile.targ diff --git a/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c new file mode 100644 index 0000000000..cbbe56e747 --- /dev/null +++ b/usr/src/lib/brand/lx/cgrpmgr/cgrpmgr.c @@ -0,0 +1,157 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * The cgrpmgr is a user-level daemon process associated with a specific cgroup + * fs mount. Its only job is to run the release_agent when a cgroup becomes + * empty and notify_on_release is enabled. 
+ */ + +#include <stdarg.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/statvfs.h> +#include <sys/wait.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> + +#include <cgrps.h> + +static void +run_agent(char *agent, char *arg) +{ + char *argv[3]; + char *cmdp; + + /* + * The parent does nothing. + */ + if (fork() != 0) + return; + + /* + * Child - run the agent. + */ + (void) setsid(); + + cmdp = strrchr(agent, '/'); + if (cmdp == NULL) { + cmdp = agent; + } else { + cmdp++; + } + + argv[0] = cmdp; + argv[1] = arg; + argv[2] = NULL; + + execv(agent, argv); + /* Nothing can be done if the exec fails */ + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int fd; + int res; + sigset_t set, oset; + struct statvfs sb; + char rel_agent[MAXPATHLEN]; + char cgrp_path[MAXPATHLEN]; + cgrpmgr_info_t cgmi; + + /* + * Start by daemonizing ourself. + */ + + /* Close all open fd's */ + closefrom(0); + + clearenv(); + + /* + * Block all signals except SIGCHLD since we don't want this code to + * respond to any signal (except, of course, the ones we can't block). + * By setting the SIGCHLD disposition to ignore our children will + * automatically be reaped. + */ + (void) sigfillset(&set); + (void) sigdelset(&set, SIGCHLD); + (void) sigdelset(&set, SIGABRT); + (void) sigprocmask(SIG_BLOCK, &set, &oset); + (void) signal(SIGCHLD, SIG_IGN); + + switch (fork1()) { + case -1: /* uh-oh */ + exit(1); + + case 0: /* child */ + break; + + default: /* parent */ + exit(0); + } + + (void) setsid(); + (void) umask(0077); + (void) chdir("/"); + + if ((fd = open(argv[1], O_RDONLY)) < 0) + exit(1); + + /* + * Sanity check the mount point we got. 
+ */ + if (fstatvfs(fd, &sb) < 0 || strcmp(sb.f_basetype, "lx_cgroup") != 0) + exit(1); + + cgmi.cgmi_pid = getpid(); + cgmi.cgmi_rel_agent_path = rel_agent; + cgmi.cgmi_cgroup_path = cgrp_path; + + /* + * Now wait for and run the release agent each time we return from the + * ioctl. An error return indicates the fs has been unmounted and we + * should exit. + */ + for (;;) { + /* + * Block in the kernel until a cgroup becomes empty. + */ + res = ioctl(fd, CGRPFS_GETEVNT, &cgmi); + + /* + * EIO indicates we should quit but any other error implies + * we did something wrong (which means a bug), so simply + * terminate on any error. + */ + if (res != 0) { + if (errno == EIO) + exit(0); + abort(); + } + + run_agent(rel_agent, cgrp_path); + } + + return (0); +} diff --git a/usr/src/lib/brand/lx/lx_brand/common/mount.c b/usr/src/lib/brand/lx/lx_brand/common/mount.c index aca92ed587..406c960dc1 100644 --- a/usr/src/lib/brand/lx/lx_brand/common/mount.c +++ b/usr/src/lib/brand/lx/lx_brand/common/mount.c @@ -40,6 +40,7 @@ #include <sys/stat.h> #include <sys/types.h> #include <unistd.h> +#include <stdlib.h> #include <sys/lx_autofs.h> #include <sys/lx_debug.h> @@ -600,6 +601,32 @@ i_make_nfs_args(lx_nfs_mount_data_t *lx_nmd, struct nfs_args *nfs_args, return (0); } +static int +run_cgrp_mgr(char *mntpnt) +{ + const char *cmd = "/native/usr/lib/brand/lx/cgrpmgr"; + char *argv[] = { "cgrpmgr", NULL, NULL }; + + argv[1] = mntpnt; + + switch (fork1()) { + case 0: + /* child */ + execv(cmd, argv); + exit(1); + break; + + case -1: + return (-1); + + default: + /* the cgroup manager process runs until we unmount */ + break; + } + + return (0); +} + long lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, uintptr_t p5) @@ -616,6 +643,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, char target[MAXPATHLEN]; char fstype[MAXPATHLEN], options[MAX_MNTOPT_STR]; int sflags, rv; + long res; + boolean_t is_cgrp = B_FALSE; /* Variables needed for nfs mounts. 
*/ lx_nfs_mount_data_t lx_nmd; @@ -752,6 +781,8 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, } lx_debug("\tlinux mount options: \"%s\"", options); + is_cgrp = B_TRUE; + /* * Currently don't verify Linux mount options since we can * have asubsystem string provided. @@ -885,8 +916,24 @@ lx_mount(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4, lx_debug("\tsolaris mount fstype: %s", fstype); lx_debug("\tsolaris mount options: \"%s\"", options); - return (mount(source, target, sflags, fstype, sdataptr, sdatalen, - options, sizeof (options)) ? -errno : 0); + res = mount(source, target, sflags, fstype, sdataptr, sdatalen, + options, sizeof (options)); + + if (res == 0) { + if (is_cgrp && run_cgrp_mgr(target) != 0) { + /* + * Forking the cgrp manager failed, unmount and return + * an ENOMEM error as the best approximation that we're + * out of resources. + */ + (void) umount(target); + return (-ENOMEM); + } else { + return (0); + } + } else { + return (-errno); + } } /* diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h index f0fab9f904..cfbeb2796c 100644 --- a/usr/src/uts/common/brand/lx/cgroups/cgrps.h +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h @@ -46,11 +46,36 @@ extern "C" { #include <sys/atomic.h> #include <vm/anon.h> +/* + * cgrpmgr ioctl interface. 
+ */ +#define CGRPFS_IOC ('C' << 16 | 'G' << 8) +#define CGRPFS_GETEVNT (CGRPFS_IOC | 1) + +typedef struct cgrpmgr_info { + pid_t cgmi_pid; + char *cgmi_rel_agent_path; + char *cgmi_cgroup_path; +} cgrpmgr_info_t; + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +typedef struct cgrpmgr_info32 { + pid_t cgmi_pid; + caddr32_t cgmi_rel_agent_path; + caddr32_t cgmi_cgroup_path; +} cgrpmgr_info32_t; + +typedef struct cgrp_evnt { + list_node_t cg_evnt_lst; + char *cg_evnt_path; +} cgrp_evnt_t; + #define CG_PSNSIZE 256 /* max size of pseudo file name entries */ #define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */ -#define CG_START_ID 0 /* initial node ID for allocation */ - /* * The order of these entries must be in sync with the cg_ssde_dir array. */ @@ -61,8 +86,10 @@ typedef enum cgrp_ssid { typedef enum cgrp_nodetype { CG_CGROUP_DIR = 1, /* cgroup directory entry */ - CG_PROCS, - CG_TASKS, + CG_NOTIFY, /* notify_on_release file */ + CG_PROCS, /* cgroup.procs file */ + CG_REL_AGENT, /* release_agent file */ + CG_TASKS, /* tasks file */ } cgrp_nodetype_t; typedef struct cgrp_subsys_dirent { @@ -70,10 +97,19 @@ typedef struct cgrp_subsys_dirent { char *cgrp_ssd_name; } cgrp_subsys_dirent_t; +#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2) + +/* + * A modern systemd-based Linux system typically has 50-60 cgroups so + * we size the hash for 2x that number. + */ +#define CGRP_HASH_SZ 128 + /* * cgroups per-mount data structure. * - * All fields are protected by cg_contents. + * All but the event related fields are protected by cg_contents. + * The evnt_list and counter is protected by cg_events. 
*/ typedef struct cgrp_mnt { struct vfs *cg_vfsp; /* filesystem's vfs struct */ @@ -82,45 +118,45 @@ typedef struct cgrp_mnt { cgrp_ssid_t cg_ssid; /* subsystem type */ dev_t cg_dev; /* unique dev # of mounted `device' */ uint_t cg_gen; /* node ID source for files */ - kmutex_t cg_contents; /* lock for cgrp_mnt structure */ - kmutex_t cg_renamelck; /* rename lock for this mount */ + uint_t cg_grp_gen; /* ID source for cgroups */ + kmutex_t cg_contents; /* global lock for most fs activity */ + char cg_agent[MAXPATHLEN + 1]; /* release_agent path */ + pid_t cg_mgrpid; /* pid of user-level manager */ + kmutex_t cg_events; /* lock for event list */ + kcondvar_t cg_evnt_cv; /* condvar for event list wakeup */ + int cg_evnt_cnt; /* counter for num events in list */ + list_t cg_evnt_list; /* list of agent events */ + /* ptr to zone data for containing zone */ + lx_zone_data_t *cg_lxzdata; + struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */ } cgrp_mnt_t; /* * cgrp_node is the file system dependent node for cgroups. * - * cgn_rwlock protects access of the directory list at cgn_dir - * as well as syncronizing read and writes to the cgrp_node - * - * cgn_contents protects growing, shrinking, reading and writing - * the file along with cgn_rwlock (see below). + * The node is used to represent both directories (a cgroup) and pseudo files + * within the directory. * - * cgn_tlock protects updates to cgn_mode and cgn_nlink - * - * cg_contents in the cgrp_mount data structure protects - * cgn_forw and cgn_back which are used to maintain a linked - * list of all cgroup files associated with that file system - * - * The ordering of the locking is: - * cg_rwlock -> cgn_contents - * - * cgn_tlock doesn't require any cgrp_node locks + * Members are tagged in the comment to note which type of node they apply to: + * A - all + * D - dir (i.e. 
a cgroup) + * F - pseudo file */ typedef struct cgrp_node { - struct cgrp_node *cgn_back; /* lnked lst of cgrp_nodes */ - struct cgrp_node *cgn_forw; /* lnked lst of cgrp_nodes */ - struct cgrp_dirent *cgn_dir; /* dirent list */ - struct cgrp_node *cgn_parent; /* dir containing this node */ - uint_t cgn_dirents; /* number of dirents */ - cgrp_nodetype_t cgn_type; /* type for this node */ - struct vnode *cgn_vnode; /* vnode for this cgrp_node */ - int cgn_id; /* ID number for the cgroup */ - struct vattr cgn_attr; /* attributes */ - krwlock_t cgn_contents; /* serialize mods */ - krwlock_t cgn_rwlock; /* rw - serialize */ - /* mods and dir updates */ - kmutex_t cgn_tlock; /* time, flag, and nlink lock */ + struct cgrp_node *cgn_back; /* A lnked lst of cgrp_nodes */ + struct cgrp_node *cgn_forw; /* A lnked lst of cgrp_nodes */ + struct cgrp_dirent *cgn_dir; /* D dirent list */ + struct cgrp_node *cgn_parent; /* A dir containing this node */ + struct cgrp_node *cgn_next; /* D link in per-mount cgroup */ + /* hash table */ + uint_t cgn_dirents; /* D number of dirents */ + cgrp_nodetype_t cgn_type; /* A type for this node */ + uint_t cgn_notify; /* D notify_on_release value */ + uint_t cgn_task_cnt; /* D number of threads in grp */ + struct vnode *cgn_vnode; /* A vnode for this cgrp_node */ + uint_t cgn_id; /* D ID number for the cgroup */ + struct vattr cgn_attr; /* A attributes */ } cgrp_node_t; /* @@ -184,6 +220,10 @@ void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *); int cgrp_taccess(void *, int, cred_t *); ino_t cgrp_inode(cgrp_nodetype_t, unsigned int); int cgrp_num_pseudo_ents(cgrp_ssid_t); +cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t); +void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *); + +#endif /* KERNEL */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c index 0d153f73c1..8950be1966 100644 --- 
a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c @@ -36,6 +36,7 @@ static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *, enum de_op); static cgrp_subsys_dirent_t cgrp_generic_dir[] = { { CG_PROCS, "cgroup.procs" }, + { CG_NOTIFY, "notify_on_release" }, { CG_TASKS, "tasks" } }; @@ -165,6 +166,132 @@ cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold, } /* + * The following functions maintain the per-mount cgroup hash table. + */ +static void +cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + cn->cgn_next = cgm->cg_grp_hash[hsh]; + cgm->cg_grp_hash[hsh] = cn; +} + +static void +cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + cgrp_node_t *np = NULL, *curp, *prevp = NULL; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + if (prevp == NULL) { + cgm->cg_grp_hash[hsh] = curp->cgn_next; + } else { + prevp->cgn_next = curp->cgn_next; + } + np = curp; + np->cgn_next = NULL; + break; + } + + prevp = curp; + } + + ASSERT(np != NULL); + ASSERT(np->cgn_task_cnt == 0); +} + +/* + * Count up the number of threads already running in the zone and initialize the + * first cgroup's task counter. + * + * We have to look at all of the processes to find applicable ones. 
+ */ +static void +cgrp_cg_hash_init(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + int i; + int cnt = 0; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Scan all of the process entries */ + mutex_enter(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, processes that are + * not lx-branded, and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_zone->zone_id != zoneid) { + continue; + } + + mutex_enter(&p->p_lock); + if (p->p_brand != &lx_brand) { + mutex_exit(&p->p_lock); + continue; + } + cnt += p->p_lwpcnt; + mutex_exit(&p->p_lock); + } + + /* + * There should be at least the init process with 1 thread in the zone + */ + ASSERT(cnt > 0); + cn->cgn_task_cnt = cnt; + + DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt); + + mutex_exit(&pidlock); +} + +cgrp_node_t * +cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid) +{ + int hsh = cgid % CGRP_HASH_SZ; + cgrp_node_t *curp; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + return (curp); + } + } + + return (NULL); +} + +/* * Calculate an inode number * * This takes various bits of info and munges them to give the inode number for @@ -217,9 +344,6 @@ cgrp_taccess(void *vcp, int mode, cred_t *cred) /* * Search directory 'parent' for entry 'name'. 
* - * The calling thread can't hold the write version - * of the rwlock for the directory being searched - * * 0 is returned on success and *foundcp points * to the found cgrp_node with its vnode held. */ @@ -227,8 +351,10 @@ int cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp, cred_t *cred) { + cgrp_mnt_t *cgm = VTOCGM(parent->cgn_vnode); int error; + ASSERT(MUTEX_HELD(&cgm->cg_contents)); *foundcp = NULL; if (parent->cgn_type != CG_CGROUP_DIR) return (ENOTDIR); @@ -280,10 +406,7 @@ cgrp_direnter( int error = 0; char *s; - /* - * cgn_rwlock is held to serialize direnter and dirdeletes - */ - ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock)); + ASSERT(MUTEX_HELD(&cgm->cg_contents)); ASSERT(dir->cgn_type == CG_CGROUP_DIR); /* @@ -302,23 +425,15 @@ cgrp_direnter( * Remember that we can only rename within the same directory. */ if (op == DE_RENAME) { - rw_enter(&cn->cgn_rwlock, RW_WRITER); - mutex_enter(&cn->cgn_tlock); if (cn->cgn_nlink == 0) { - mutex_exit(&cn->cgn_tlock); - rw_exit(&cn->cgn_rwlock); return (ENOENT); } if (cn->cgn_nlink == MAXLINK) { - mutex_exit(&cn->cgn_tlock); - rw_exit(&cn->cgn_rwlock); return (EMLINK); } cn->cgn_nlink++; gethrestime(&cn->cgn_ctime); - mutex_exit(&cn->cgn_tlock); - rw_exit(&cn->cgn_rwlock); } /* @@ -342,7 +457,9 @@ cgrp_direnter( if (cdp) { ASSERT(found != NULL); error = EEXIST; + mutex_exit(&cgm->cg_contents); cgnode_rele(found); + mutex_enter(&cgm->cg_contents); } else { /* @@ -358,6 +475,13 @@ cgrp_direnter( error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred); if (error) goto out; + + if (op == DE_MKDIR) { + /* + * inherit notify_on_release value from parent + */ + cn->cgn_notify = dir->cgn_notify; + } } error = cgrp_diraddentry(dir, cn, name, op); @@ -366,7 +490,6 @@ cgrp_direnter( /* * Unmake the inode we just made. 
*/ - rw_enter(&cn->cgn_rwlock, RW_WRITER); if ((cn->cgn_type) == CG_CGROUP_DIR) { ASSERT(cdp == NULL); /* @@ -374,25 +497,26 @@ cgrp_direnter( */ cgrp_dirtrunc(cn); } - mutex_enter(&cn->cgn_tlock); cn->cgn_nlink = 0; - mutex_exit(&cn->cgn_tlock); gethrestime(&cn->cgn_ctime); - rw_exit(&cn->cgn_rwlock); + mutex_exit(&cgm->cg_contents); cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); cn = NULL; } } else if (cnp) { *cnp = cn; } else if (op == DE_CREATE || op == DE_MKDIR) { + mutex_exit(&cgm->cg_contents); cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); } } out: if (error && op == DE_RENAME) { /* Undo bumped link count. */ - DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock); + cn->cgn_nlink--; gethrestime(&cn->cgn_ctime); } return (error); @@ -410,17 +534,17 @@ int cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, cred_t *cred) { + cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode); cgrp_dirent_t *cndp; int error; size_t namelen; cgrp_node_t *cnnp; timestruc_t now; - ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock)); - ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock)); + ASSERT(MUTEX_HELD(&cgm->cg_contents)); if (nm[0] == '\0') - panic("cgrp_dirdelete: NULL name for 0x%p", (void *)cn); + panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn); /* * return error when removing . and .. @@ -465,32 +589,21 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, nextp = cdp->cgd_next; cgnode_hold(pseudo_node); - rw_enter(&pseudo_node->cgn_rwlock, RW_WRITER); error = cgrp_dirdelete(cn, pseudo_node, cdp->cgd_name, DR_REMOVE, cred); - rw_exit(&pseudo_node->cgn_rwlock); + mutex_exit(&cgm->cg_contents); cgnode_rele(pseudo_node); + mutex_enter(&cgm->cg_contents); cdp = nextp; } - } - cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp); - if (cndp == NULL) { - /* - * If it is gone, some other thread got here first! - * Return error ENOENT. 
- */ - return (ENOENT); + cgrp_cg_hash_remove(cgm, cn); } - /* - * If the cgrp_node in the cgrp_dirent changed, we were probably - * the victim of a concurrent rename operation. The original - * is gone, so return that status. - */ - if (cn != cnnp) - return (ENOENT); + cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp); + VERIFY(cndp != NULL); + VERIFY(cn == cnnp); cgrp_hash_out(cndp); @@ -527,7 +640,7 @@ cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, cn->cgn_ctime = now; ASSERT(cn->cgn_nlink > 0); - DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock); + cn->cgn_nlink--; if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) { cgrp_dirtrunc(cn); ASSERT(cn->cgn_nlink == 0); @@ -544,10 +657,9 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) struct vnode *vp; timestruc_t now; + ASSERT(MUTEX_HELD(&cgm->cg_contents)); ASSERT(vap != NULL); - rw_init(&cn->cgn_rwlock, NULL, RW_DEFAULT, NULL); - mutex_init(&cn->cgn_tlock, NULL, MUTEX_DEFAULT, NULL); cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode); cn->cgn_mask = 0; cn->cgn_attr.va_type = vap->va_type; @@ -581,15 +693,7 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) vp->v_rdev = vap->va_rdev; vp->v_data = (caddr_t)cn; - mutex_enter(&cgm->cg_contents); - - /* - * Set the cgroup ID for this cgrp_node by using a counter on each - * mount. We also use this value as the directory nodeid (which is used - * to derive the inode) so each cgroup in the tree will have a unique - * id (and inode). 
- */ - cn->cgn_nodeid = cn->cgn_id = cgm->cg_gen++; + cn->cgn_nodeid = cgm->cg_gen++; /* * Add new cgrp_node to end of linked list of cgrp_nodes for this @@ -600,10 +704,38 @@ cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) cn->cgn_back = cgm->cg_rootnode->cgn_back; cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn; } - mutex_exit(&cgm->cg_contents); vn_exists(vp); } +void +cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name, + cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr) +{ + cgrp_node_t *ncn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, nattr, + &ncn, cr, NULL); + + /* + * Fix the inode and assign the pseudo file type to be correct. + */ + ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid); + ncn->cgn_type = type; + + /* + * Since we're creating these entries here and not via the + * normal VOP_CREATE code path, we need to do the rele to drop + * our hold. This will leave the vnode v_count at 0 when we + * come out of cgrp_inactive but we won't reclaim the vnode + * there since the cgn_nlink value will still be 1. + */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(ncn); + mutex_enter(&cgm->cg_contents); +} + /* * cgrp_dirinit is used internally to initialize a directory (dir) * with '.' and '..' 
entries without checking permissions and locking @@ -615,19 +747,34 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) { cgrp_dirent_t *dot, *dotdot; timestruc_t now; - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dir->cgn_vnode); + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); cgrp_ssde_t *ssdp; cgrp_subsys_dirent_t *pseudo_files; struct vattr nattr; int i; - ASSERT(RW_WRITE_HELD(&parent->cgn_rwlock)); + ASSERT(MUTEX_HELD(&cgm->cg_contents)); ASSERT(dir->cgn_type == CG_CGROUP_DIR); ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM); ssdp = &cg_ssde_dir[cgm->cg_ssid]; /* + * If this is the top-level cgroup created by the mount then we need to + * count up the number of procs and tasks already running in the zone. + */ + + /* + * Set the cgroup ID for this cgrp_node by using a counter on each + * mount. + */ + dir->cgn_id = cgm->cg_grp_gen++; + cgrp_cg_hash_insert(cgm, dir); + /* Initialise the first cgroup if this is top-level group */ + if (parent == dir) + cgrp_cg_hash_init(cgm, dir); + + /* * Initialize the entries */ dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP); @@ -659,7 +806,7 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) dir->cgn_mtime = now; dir->cgn_ctime = now; - INCR_COUNT(&parent->cgn_nlink, &parent->cgn_tlock); + parent->cgn_nlink++; parent->cgn_ctime = now; dir->cgn_dir = dot; @@ -672,28 +819,20 @@ cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) nattr.va_type = VREG; nattr.va_rdev = 0; + /* + * If this is the top-level dir in the file system then it always + * has a release_agent pseudo file. Only the top-level dir has this + * file. 
+ */ + if (parent == dir) { + cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr, + cr); + } + pseudo_files = ssdp->cg_ssde_files; for (i = 0; i < ssdp->cg_ssde_nfiles; i++) { - cgrp_node_t *ncn; - - cgrp_direnter(cgm, dir, pseudo_files[i].cgrp_ssd_name, - DE_CREATE, (cgrp_node_t *)NULL, &nattr, &ncn, cr, NULL); - - /* - * Fix the inode and assign the pseudo file type to be correct. - */ - ncn->cgn_nodeid = cgrp_inode(pseudo_files[i].cgrp_ssd_type, - dir->cgn_nodeid); - ncn->cgn_type = pseudo_files[i].cgrp_ssd_type; - - /* - * Since we're creating these entries here and not via the - * normal VOP_CREATE code path, we need to do the rele to drop - * our hold. This will leave the vnode v_count at 0 when we - * come out of cgrp_inactive but we won't reclaim the vnode - * there since the cgn_nlink value will still be 1. - */ - cgnode_rele(ncn); + cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name, + pseudo_files[i].cgrp_ssd_type, &nattr, cr); } } @@ -705,8 +844,9 @@ cgrp_dirtrunc(cgrp_node_t *dir) { cgrp_dirent_t *cgdp; timestruc_t now; + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); - ASSERT(RW_WRITE_HELD(&dir->cgn_rwlock)); + ASSERT(MUTEX_HELD(&cgm->cg_contents)); ASSERT(dir->cgn_type == CG_CGROUP_DIR); for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) { @@ -726,7 +866,7 @@ cgrp_dirtrunc(cgrp_node_t *dir) */ cn = cgdp->cgd_cgrp_node; ASSERT(cn->cgn_nlink > 0); - DECR_COUNT(&cn->cgn_nlink, &cn->cgn_tlock); + cn->cgn_nlink--; cgrp_hash_out(cgdp); kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen); @@ -849,6 +989,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va, { cgrp_node_t *cn; + ASSERT(MUTEX_HELD(&cgm->cg_contents)); ASSERT(va != NULL); if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || @@ -870,9 +1011,7 @@ cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va, if (op == DE_MKDIR) { cn->cgn_type = CG_CGROUP_DIR; - rw_enter(&cn->cgn_rwlock, RW_WRITER); cgrp_dirinit(dir, cn, cred); - 
rw_exit(&cn->cgn_rwlock); } *newnode = cn; diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c index 8066f184ce..b2ffa02418 100644 --- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c @@ -31,10 +31,12 @@ * For example, it is common to see cgroup trees (each is its own mount with a * different subsystem controller) for blkio, cpuset, memory, systemd (has no * controller), etc. Within each tree there is a top-level directory with at - * least a cgroup.procs and tasks file listing the processes within that group, - * although there could be subdirectories, which define new cgroups, that then - * contain a subset of the processes. Each subdirectory also has, at a minimum, - * a cgroup.procs and tasks file. + * least a cgroup.procs, notify_on_release, release_agent, and tasks file. + * The cgroup.procs file lists the processes within that group and the tasks + * file lists the threads in the group. There could be subdirectories, which + * define new cgroups, that then contain a subset of the processes. Each + * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and + * tasks file. * * Since we're using lx to run user-level code within zones, the majority (all?) * of the cgroup resource management functionality simply doesn't apply to us. @@ -45,14 +47,54 @@ * hierarchy and does not report that any resource management controllers are * available for separate mounts. * + * In addition to the hierarchy, the other important component of cgroups that + * is used by systemd is the 'release_agent'. This provides a mechanism to + * run a command when a cgroup becomes empty (the last task in the group + * leaves, either by exit or move, and there are no more sub-cgroups). The + * 'release_agent' file only exists in the top-level cgroup of the mounted + * file system and holds the path to a command to run. 
The 'notify_on_release' + * file exists in each cgroup dir. If that file contains a '1' then the agent + * is run when that group becomes empty. The agent is passed a path string of + * the cgroup, relative to the file system mount point (e.g. a mount on + * /sys/fs/cgroups/systemd with a sub-cgroup of foo/bar gets the arg foo/bar). + * + * Cgroup membership is implemented via hooks into the lx brand code. When + * the cgroup file system loads it installs callbacks for: + * lx_cgrp_forklwp + * lx_cgrp_procexit + * lx_cgrp_initlwp + * lx_cgrp_freelwp + * and when it unloads it clears those hooks. The lx brand code calls those + * hooks when a process/lwp starts and when it exits. Internally we use a + * simple reference counter (cgn_task_cnt) on the cgroup node to track how many + * threads are in the group, so we can tell when a group becomes empty. + * To make this quick, a hash table (cg_grp_hash) is maintained on the + * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is + * sized so that there should typically only be 0 or 1 cgroups per bucket. + * We also keep a reference to the file system in the zone-specific brand data + * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t + * when it runs the hook. + * + * Once a cgroup becomes empty, running the release agent is actually done + * by a user-level cgrpmgr process. That process makes a CGRPFS_GETEVNT + * ioctl which blocks until there is an event (i.e. the agent needs to run). + * Internally we maintain a list (cg_evnt_list) of release events on + * cgrp_mnt_t. The ioctl pulls an event off of the list, or blocks until an + * event is available, and then returns the event. The cgrpmgr process is + * started by the lx mount emulation when it mounts the file system. The + * cgrpmgr will exit when the ioctl returns EIO, indicating that the file + * system is being unmounted. + * * This file system is similar to tmpfs in that directories only exist in * memory. 
Each subdirectory represents a different cgroup. Within the cgroup * there are pseudo files (see cg_ssde_dir) with well-defined names which * control the configuration and behavior of the cgroup (see cgrp_nodetype_t). - * The primary files within every cgroup are named 'cgroup.procs' and 'tasks'. - * These are used to control and list which processes/threads belong to the - * cgroup. In the general case there can be additional files in the cgroup - * which define additional behavior, although none exists at this time. + * The primary files within every cgroup are named 'cgroup.procs', + * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the + * top-level cgroup). The cgroup.procs and tasks files are used to control and + * list which processes/threads belong to the cgroup. In the general case there + * could be additional files in the cgroup, which define additional behavior + * (i.e. subsystem specific pseudo files), although none exist at this time. * * Each cgroup node has a unique ID (cgn_nodeid) within the mount. This ID is * used to correlate with the threads to determine cgroup membership. When @@ -69,11 +111,27 @@ * - no file rename, but a directory (i.e. a cgroup) can be renamed within the * containing directory, but not into a different directory * - can mkdir and rmdir to create/destroy cgroups - * - cannot rmdir while it contains a subdir (i.e. a sub-cgroup) + * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup) * - open, read/write, close on the subsystem-specific pseudo files is * allowed, as this is the interface to configure and report on the cgroup. * The pseudo file's mode controls write access and cannot be changed. * + * The locking in this file system is simple since the file system is not + * subjected to heavy I/O activity and all data is in-memory. There is a single + * global mutex for each mount (cg_contents). This mutex is held for the life + * of most vnode operations.
The most active path is probably the LWP start and + * exit hooks which increment/decrement the reference counter on the cgroup + * node. The lock is important for this case since we don't want concurrent + * activity (such as moving the process into another cgroup) while we're trying + * to lookup the cgroup from the mount's hash table. We must be careful to + * avoid a deadlock while reading or writing since that code can take pidlock + * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of + * those is held. To prevent deadlock we always take cg_contents after pidlock + * and p_lock. + * + * In addition to the cg_contents lock there is also a second mutex (cg_events) + * used with the event queue condvar (cg_evnt_cv). + * * EXTENDING THE FILE SYSTEM * * When adding support for a new subsystem, be sure to also update the @@ -100,7 +158,8 @@ * list of cgroup IDs associated with every thread, instead of just one ID * (br_cgroupid). The thread data would need to become a struct which held * both an ID and an indication as to which mounted cgroup file system instance - * the ID was associated with. + * the ID was associated with. We would also need a list of cgroup mounts per + * zone, instead of the current single zone reference. */ #include <sys/types.h> @@ -123,6 +182,8 @@ #include <sys/systm.h> #include <sys/mntent.h> #include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> #include <sys/lx_brand.h> #include "cgrps.h" @@ -131,6 +192,11 @@ static int cgrp_fstype; static dev_t cgrp_dev; +#define MAX_AGENT_EVENTS 32 /* max num queued events */ + +#define UMNT_DELAY_TIME drv_usectohz(50000) /* 20th of a second */ +#define UMNT_RETRY_MAX 100 /* 100 times - 5 secs */ + /* * cgrp_mountcount is used to prevent module unloads while there is still * state from a former mount hanging around.
The filesystem module must not be @@ -171,6 +237,12 @@ static int cgrp_root(struct vfs *, struct vnode **); static int cgrp_statvfs(struct vfs *, struct statvfs64 *); static void cgrp_freevfs(vfs_t *vfsp); +/* Forward declarations for hooks */ +static void cgrp_proc_fork_helper(vfs_t *, uint_t, pid_t); +static void cgrp_proc_exit_helper(vfs_t *, uint_t, pid_t); +static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t); +static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t); + /* * Loadable module wrapper */ @@ -209,6 +281,12 @@ _fini() if (cgrp_mountcount) return (EBUSY); + /* Disable hooks used by the lx brand module. */ + lx_cgrp_forklwp = NULL; + lx_cgrp_proc_exit = NULL; + lx_cgrp_initlwp = NULL; + lx_cgrp_freelwp = NULL; + if ((error = mod_remove(&modlinkage)) != 0) return (error); @@ -282,6 +360,12 @@ cgrp_init(int fstype, char *name) */ cgrp_dev = makedevice(dev, 0); + /* Install the hooks used by the lx brand module. */ + lx_cgrp_forklwp = cgrp_proc_fork_helper; + lx_cgrp_proc_exit = cgrp_proc_exit_helper; + lx_cgrp_initlwp = cgrp_lwp_fork_helper; + lx_cgrp_freelwp = cgrp_lwp_exit_helper; + return (0); } @@ -294,6 +378,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) int error; struct vattr rattr; cgrp_ssid_t ssid = CG_SSID_GENERIC; + lx_zone_data_t *lxzdata; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); @@ -309,6 +394,13 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) return (EINVAL); /* + * We currently only support one mount per zone. 
+ */ + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata->lxzd_cgroup != NULL) + return (EINVAL); + + /* * Ensure we don't allow overlaying mounts */ mutex_enter(&mvp->v_lock); @@ -354,10 +446,15 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) /* Set but don't bother entering the mutex (not on mount list yet) */ mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&cgm->cg_events, NULL, MUTEX_DEFAULT, NULL); + cv_init(&cgm->cg_evnt_cv, NULL, CV_DRIVER, NULL); - cgm->cg_vfsp = vfsp; + cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp; + cgm->cg_lxzdata = lxzdata; cgm->cg_ssid = ssid; - cgm->cg_gen = CG_START_ID; + + list_create(&cgm->cg_evnt_list, sizeof (cgrp_evnt_t), + offsetof(cgrp_evnt_t, cg_evnt_lst)); vfsp->vfs_data = (caddr_t)cgm; vfsp->vfs_fstype = cgrp_fstype; @@ -368,15 +465,19 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(cgm->cg_mntpath, dpn.pn_path); + cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ, + KM_SLEEP); + /* allocate and initialize root cgrp_node structure */ bzero(&rattr, sizeof (struct vattr)); rattr.va_mode = (mode_t)(S_IFDIR | 0755); rattr.va_type = VDIR; rattr.va_rdev = 0; cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP); + + mutex_enter(&cgm->cg_contents); cgrp_node_init(cgm, cp, &rattr, cr); - rw_enter(&cp->cgn_rwlock, RW_WRITER); CGNTOV(cp)->v_flag |= VROOT; /* @@ -393,7 +494,7 @@ cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) cp->cgn_nodeid = cgrp_inode(ssid, cgm->cg_gen); cgrp_dirinit(cp, cp, cr); - rw_exit(&cp->cgn_rwlock); + mutex_exit(&cgm->cg_contents); pn_free(&dpn); error = 0; @@ -414,15 +515,20 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct vnode *vp; int error; uint_t cnt; + int retry_cnt = 0; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); +retry: mutex_enter(&cgm->cg_contents); /* - * In the normal 
unmount case, if there are no - * open files, only the root node should have a reference count. + * In the normal unmount case, if there were no open files, only the + * root node would have a reference count. However, the user-level + * agent manager should have the root vnode open and be waiting in + * ioctl. We need to wake the manager and it may take some retries + * before it closes its file descriptor. * * With cg_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from @@ -432,6 +538,29 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) */ cgnp = cgm->cg_rootnode; + ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL); + + mutex_enter(&cgm->cg_events); + cv_signal(&cgm->cg_evnt_cv); + + /* + * Delete any queued events (normally there shouldn't be any). + */ + for (;;) { + cgrp_evnt_t *evntp; + + evntp = list_remove_head(&cgm->cg_evnt_list); + if (evntp == NULL) + break; + kmem_free(evntp->cg_evnt_path, MAXPATHLEN); + kmem_free(evntp, sizeof (cgrp_evnt_t)); + cgm->cg_evnt_cnt--; + } + + /* Set the counter to -1 so an incoming ioctl knows we're unmounting */ + cgm->cg_evnt_cnt = -1; + mutex_exit(&cgm->cg_events); + vp = CGNTOV(cgnp); mutex_enter(&vp->v_lock); @@ -441,10 +570,16 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) return (EINVAL); } + cnt = vp->v_count; if (cnt > 1) { mutex_exit(&vp->v_lock); mutex_exit(&cgm->cg_contents); + /* Likely because the user-level manager hasn't exited yet */ + if (retry_cnt++ < UMNT_RETRY_MAX) { + delay(UMNT_DELAY_TIME); + goto retry; + } return (EBUSY); } @@ -476,6 +611,11 @@ cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } } + cgm->cg_lxzdata->lxzd_cgroup = NULL; + kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ); + list_destroy(&cgm->cg_evnt_list); + cv_destroy(&cgm->cg_evnt_cv); + /* * We can drop the mutex now because * no one can find this mount anymore @@ -519,10 +659,10 @@ cgrp_freevfs(vfs_t *vfsp) * Remove all directory 
entries */ for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) { - rw_enter(&cn->cgn_rwlock, RW_WRITER); + mutex_enter(&cgm->cg_contents); if (cn->cgn_type == CG_CGROUP_DIR) cgrp_dirtrunc(cn); - rw_exit(&cn->cgn_rwlock); + mutex_exit(&cgm->cg_contents); } ASSERT(cgm->cg_rootnode); @@ -571,7 +711,7 @@ cgrp_freevfs(vfs_t *vfsp) kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1); mutex_destroy(&cgm->cg_contents); - mutex_destroy(&cgm->cg_renamelck); + mutex_destroy(&cgm->cg_events); kmem_free(cgm, sizeof (cgrp_mnt_t)); /* Allow _fini() to succeed now */ @@ -676,3 +816,186 @@ cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) sbp->f_namemax = MAXNAMELEN - 1; return (0); } + +static int +cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen) +{ + cgrp_node_t *parent; + cgrp_dirent_t *dp; + + buf[0] = '\0'; + + parent = cn->cgn_parent; + if (parent == NULL || parent == cn) { + (void) strlcpy(buf, ".", blen); + return (0); + } + + /* + * Search the parent dir list to find this cn's name. + */ + for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) { + if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) { + (void) strlcpy(buf, dp->cgd_name, blen); + return (0); + } + } + + return (-1); +} + +/* + * Enqueue an event for user-level release_agent manager. The event data is the + * pathname (relative to the mount point of the file system) of the newly empty + * cgroup. + */ +void +cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + cgrp_node_t *parent; + char nm[MAXNAMELEN]; + char *argstr, *oldstr, *tmp; + cgrp_evnt_t *evntp; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Nothing to do if the agent is not set */ + if (cgm->cg_agent[0] == '\0') + return; + + parent = cn->cgn_parent; + /* Cannot remove the top-level cgroup (only via unmount) */ + if (parent == cn) + return; + + argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + *argstr = '\0'; + + /* + * Iterate up the directory tree to construct the agent argument string.
+ */ + do { + cgrp_get_dirname(cn, nm, sizeof (nm)); + DTRACE_PROBE1(cgrp__dir__name, char *, nm); + if (*argstr == '\0') { + (void) strlcpy(argstr, nm, MAXPATHLEN); + } else { + tmp = oldstr; + oldstr = argstr; + argstr = tmp; + (void) snprintf(argstr, MAXPATHLEN, "%s/%s", nm, + oldstr); + } + + if (cn->cgn_parent == NULL) + break; + cn = cn->cgn_parent; + parent = cn->cgn_parent; + + /* + * The arg path is relative to the mountpoint so we stop when + * we get to the top level. + */ + if (parent == NULL || parent == cn) + break; + } while (parent != cn); + + kmem_free(oldstr, MAXPATHLEN); + + DTRACE_PROBE1(cgrp__agent__event, char *, argstr); + + /* + * Add the event to the list for the user-level agent. We add it to + * the end of the list (which should normally be an empty list since + * the user-level agent is designed to service events as quickly as + * it can). + */ + evntp = kmem_zalloc(sizeof (cgrp_evnt_t), KM_SLEEP); + evntp->cg_evnt_path = argstr; + + mutex_enter(&cgm->cg_events); + if (cgm->cg_evnt_cnt >= MAX_AGENT_EVENTS) { + /* + * We don't queue up an arbitrary number of events. Because + * the user-level manager should be servicing events quickly, + * if the list gets long then something is wrong. + */ + cmn_err(CE_WARN, "cgrp: event queue full for zone %s", + ttoproc(curthread)->p_zone->zone_name); + kmem_free(evntp->cg_evnt_path, MAXPATHLEN); + kmem_free(evntp, sizeof (cgrp_evnt_t)); + + } else { + list_insert_tail(&cgm->cg_evnt_list, evntp); + cgm->cg_evnt_cnt++; + cv_signal(&cgm->cg_evnt_cv); + } + mutex_exit(&cgm->cg_events); +} + +/*ARGSUSED*/ +static void +cgrp_proc_fork_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid) +{ +} + +/*ARGSUSED*/ +static void +cgrp_proc_exit_helper(vfs_t *vfsp, uint_t cg_id, pid_t pid) +{ + if (curproc->p_zone->zone_proc_initpid == pid || + curproc->p_zone->zone_proc_initpid == -1) { + /* + * The zone's init just exited. 
If this is because of a zone + * reboot initiated from outside the zone, then we've never + * tried to unmount this fs, so we need to wake up the + * user-level manager so that it can exit. It's also possible + * init died abnormally, but that leads to a zone reboot so the + * action is the same here. + */ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + + mutex_enter(&cgm->cg_events); + cv_signal(&cgm->cg_evnt_cv); + mutex_exit(&cgm->cg_events); + } +} + +/*ARGSUSED*/ +static void +cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + cn->cgn_task_cnt++; + mutex_exit(&cgm->cg_contents); + + DTRACE_PROBE1(cgrp__lwp__fork, void *, cn); +} + +/*ARGSUSED*/ +static void +cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + VERIFY(cn->cgn_task_cnt > 0); + cn->cgn_task_cnt--; + DTRACE_PROBE1(cgrp__lwp__exit, void *, cn); + + if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) && + cn->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, cn); + } + + mutex_exit(&cgm->cg_contents); +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c index f7eceb4e94..24640631f5 100644 --- a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c @@ -153,19 +153,69 @@ cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ) } /* + * Move a thread from one cgroup to another. If the old cgroup is empty + * we queue up an agent event. We return true in that case since we've + * dropped the locks and the caller needs to reacquire them.
+ */ +static boolean_t +cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn, + uint_t cg_id, proc_t *p) +{ + cgrp_node_t *ocn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(MUTEX_HELD(&p->p_lock)); + + ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid); + VERIFY(ocn != NULL); + + ASSERT(ocn->cgn_task_cnt > 0); + atomic_dec_32(&ocn->cgn_task_cnt); + atomic_inc_32(&ncn->cgn_task_cnt); + plwpd->br_cgroupid = cg_id; + + if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) && + ocn->cgn_notify == 1) { + /* + * We want to drop p_lock before queuing the event since + * that might sleep. Dropping p_lock might cause the caller to + * have to restart the move process from the beginning. + */ + mutex_exit(&p->p_lock); + cgrp_rel_agent_event(cgm, ocn); + mutex_exit(&cgm->cg_contents); + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Assign either all of the threads, or a single thread, for the specified pid * to the new cgroup. Controlled by the typ argument. */ static int -cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ) +cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ) { proc_t *p; kthread_t *t; int error; + cgrp_node_t *ncn; if (pid == 1) pid = curproc->p_zone->zone_proc_initpid; + /* + * Move one or all threads to this cgroup. + */ + if (typ == CG_WR_TASKS) { + error = ESRCH; + } else { + error = 0; + } + +restart: mutex_enter(&pidlock); p = cgrp_p_for_wr(pid, typ); @@ -194,39 +244,48 @@ cgrp_proc_set_id(uint_t cg_id, pid_t pid, cgrp_wr_type_t typ) * Ignore writes for PID which is not an lx-branded process or with * no threads. */ + mutex_enter(&p->p_lock); - if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) { + mutex_exit(&pidlock); + if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL || + p->p_flag & SEXITING) { mutex_exit(&p->p_lock); - mutex_exit(&pidlock); return (0); } - /* - * Move one or all threads to this cgroup. 
- */ - if (typ == CG_WR_TASKS) { - error = ESRCH; - } else { - error = 0; - } + mutex_enter(&cgm->cg_contents); + + ncn = cgrp_cg_hash_lookup(cgm, cg_id); + VERIFY(ncn != NULL); do { lx_lwp_data_t *plwpd = ttolxlwp(t); - if (plwpd != NULL) { + if (plwpd != NULL && plwpd->br_cgroupid != cg_id) { if (typ == CG_WR_PROCS) { - plwpd->br_cgroupid = cg_id; + if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) { + /* + * We dropped all of the locks so we + * need to start over. + */ + goto restart; + } + } else if (plwpd->br_pid == pid) { /* type is CG_WR_TASKS and we found the task */ - plwpd->br_cgroupid = cg_id; error = 0; - break; + if (cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) { + goto done; + } else { + break; + } } } t = t->t_forw; } while (t != p->p_tlist); + mutex_exit(&cgm->cg_contents); mutex_exit(&p->p_lock); - mutex_exit(&pidlock); +done: return (error); } @@ -273,7 +332,56 @@ cgrp_get_pid_str(struct uio *uio, pid_t *pid) } static int -cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ) +cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio) +{ + int error; + uint_t value; + + /* + * This is cheesy but since we only take a 0 or 1 value we can + * let the pid_str function do the uio string conversion. + */ + error = cgrp_get_pid_str(uio, (pid_t *)&value); + if (error != 0) + return (error); + + if (value != 0 && value != 1) + return (EINVAL); + + /* + * The flag is on the containing dir. We don't bother taking the + * cg_contents lock since this is a simple assignment. 
+ */ + cn->cgn_parent->cgn_notify = value; + return (0); +} + +static int +cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int error; + int len; + char *wrp; + + len = uio->uio_offset + uio->uio_resid; + if (len > MAXPATHLEN) + return (EFBIG); + + mutex_enter(&cgm->cg_contents); + + wrp = &cgm->cg_agent[uio->uio_offset]; + error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio); + cgm->cg_agent[len] = '\0'; + if (len > 1 && cgm->cg_agent[len - 1] == '\n') + cgm->cg_agent[len - 1] = '\0'; + + mutex_exit(&cgm->cg_contents); + return (error); +} + +static int +cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, + cgrp_wr_type_t typ) { /* the cgroup ID is on the containing dir */ uint_t cg_id = cn->cgn_parent->cgn_id; @@ -285,7 +393,7 @@ cgrp_wr_proc_or_task(cgrp_node_t *cn, struct uio *uio, cgrp_wr_type_t typ) if (error != 0) return (error); - error = cgrp_proc_set_id(cg_id, pidnum, typ); + error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ); if (error != 0) return (error); } @@ -304,9 +412,6 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr, vp = CGNTOV(cn); ASSERT(vp->v_type == VREG); - ASSERT(RW_WRITE_HELD(&cn->cgn_contents)); - ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock)); - if (uio->uio_loffset < 0) return (EINVAL); @@ -323,11 +428,17 @@ cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, struct cred *cr, limit = MAXOFF_T; switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_wr_notify(cn, uio); + break; case CG_PROCS: - error = cgrp_wr_proc_or_task(cn, uio, CG_WR_PROCS); + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS); + break; + case CG_REL_AGENT: + error = cgrp_wr_rel_agent(cgm, uio); break; case CG_TASKS: - error = cgrp_wr_proc_or_task(cn, uio, CG_WR_TASKS); + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS); break; default: VERIFY(0); @@ -351,6 +462,12 @@ cgrp_p_lock(proc_t *p) /* first try the fast path */ mutex_enter(&p->p_lock); + if (p->p_flag & SEXITING) { + mutex_exit(&p->p_lock); 
+ mutex_exit(&pidlock); + return (NULL); + } + if (!(p->p_proc_flag & P_PR_LOCK)) { p->p_proc_flag |= P_PR_LOCK; mutex_exit(&p->p_lock); @@ -404,13 +521,76 @@ cgrp_p_unlock(proc_t *p) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(!MUTEX_HELD(&pidlock)); - cv_signal(&pr_pid_cv[p->p_slot]); p->p_proc_flag &= ~P_PR_LOCK; + cv_signal(&pr_pid_cv[p->p_slot]); mutex_exit(&p->p_lock); THREAD_KPRI_RELEASE(); } /* + * Read value from the notify_on_release pseudo file on the parent node + * (which is the actual cgroup node). We don't bother taking the cg_contents + * lock since it's a single instruction so an empty group action/read will + * only see one value or the other. + */ +/* ARGSUSED */ +static int +cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int len; + int error = 0; + char buf[16]; + char *rdp; + /* the flag is on the containing dir */ + uint_t value = cn->cgn_parent->cgn_notify; + + len = snprintf(buf, sizeof (buf), "%u\n", value); + if (uio->uio_offset > len) + return (0); + + len -= uio->uio_offset; + rdp = &buf[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + return (error); +} + +/* + * Read value from the release_agent pseudo file. + */ +static int +cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int len; + int error = 0; + char *rdp; + + mutex_enter(&cgm->cg_contents); + + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len = strlen(cgm->cg_agent); + if (uio->uio_offset > len) { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len -= uio->uio_offset; + rdp = &cgm->cg_agent[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + + mutex_exit(&cgm->cg_contents); + + return (error); +} + +/* * Read pids from the cgroup.procs pseudo file. 
We have to look at all of the * processes to find applicable ones, then report pids for any process which * has all of its threads in the same cgroup. @@ -470,6 +650,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) * Check if all threads are in this cgroup. */ in_cg = B_TRUE; + mutex_enter(&cgm->cg_contents); do { lx_lwp_data_t *plwpd = ttolxlwp(t); if (plwpd == NULL || plwpd->br_cgroupid != cg_id) { @@ -479,6 +660,7 @@ cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) t = t->t_forw; } while (t != p->p_tlist); + mutex_exit(&cgm->cg_contents); mutex_exit(&p->p_lock); if (!in_cg) { @@ -647,7 +829,9 @@ cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) if (p == NULL) continue; + mutex_enter(&cgm->cg_contents); error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio); + mutex_exit(&cgm->cg_contents); mutex_enter(&p->p_lock); cgrp_p_unlock(p); @@ -664,8 +848,6 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct) { int error = 0; - ASSERT(RW_LOCK_HELD(&cn->cgn_contents)); - if (uio->uio_loffset >= MAXOFF_T) return (0); if (uio->uio_loffset < 0) @@ -674,9 +856,15 @@ cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, caller_context_t *ct) return (0); switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_rd_notify(cgm, cn, uio); + break; case CG_PROCS: error = cgrp_rd_procs(cgm, cn, uio); break; + case CG_REL_AGENT: + error = cgrp_rd_rel_agent(cgm, uio); + break; case CG_TASKS: error = cgrp_rd_tasks(cgm, cn, uio); break; @@ -692,8 +880,8 @@ static int cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct caller_context *ct) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); int error; /* @@ -703,17 +891,8 @@ cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, return (EISDIR); if (vp->v_type != VREG) return (EINVAL); - /* - * 
cgrp_rwlock should have already been called from layers above - */ - ASSERT(RW_READ_HELD(&cn->cgn_rwlock)); - - rw_enter(&cn->cgn_contents, RW_READER); - error = cgrp_rd(cgm, cn, uiop, ct); - rw_exit(&cn->cgn_contents); - return (error); } @@ -721,8 +900,8 @@ static int cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct caller_context *ct) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); int error; /* @@ -731,11 +910,6 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, if (vp->v_type != VREG) return (EINVAL); - /* cgrp_rwlock should have already been called from layers above */ - ASSERT(RW_WRITE_HELD(&cn->cgn_rwlock)); - - rw_enter(&cn->cgn_contents, RW_WRITER); - if (ioflag & FAPPEND) { /* In append mode start at end of file. */ uiop->uio_loffset = cn->cgn_size; @@ -743,21 +917,146 @@ cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, error = cgrp_wr(cgm, cn, uiop, cred, ct); - rw_exit(&cn->cgn_contents); - return (error); } +static int +cgrp_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, cred_t *cr, + int *rvalp, caller_context_t *ct) +{ + cgrp_mnt_t *cgm = VTOCGM(vp); + model_t model; + cgrpmgr_info_t cgmi; + cgrp_evnt_t *evntp; + int res = 0; + + /* We only support the cgrpmgr ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* The caller must be root */ + if (secpolicy_vnode_any_access(cr, vp, crgetuid(cr)) != 0 || + crgetuid(cr) != 0) + return (ENOTTY); + + if (cmd != CGRPFS_GETEVNT) + return (ENOTTY); + + model = get_udatamodel(); + if (model == DATAMODEL_NATIVE) { + if (copyin((void *)data, &cgmi, sizeof (cgmi))) + return (EFAULT); + + } else { + cgrpmgr_info32_t cgmi32; + + if (copyin((void *)data, &cgmi32, sizeof (cgmi32))) + return (EFAULT); + + cgmi.cgmi_pid = cgmi32.cgmi_pid; + cgmi.cgmi_rel_agent_path = + (char 
*)(intptr_t)cgmi32.cgmi_rel_agent_path; + cgmi.cgmi_cgroup_path = + (char *)(intptr_t)cgmi32.cgmi_cgroup_path; + } + + if (cgm->cg_mgrpid == 0) { + /* + * This is the initial call from the user-level manager, + * keep track of its pid. + */ + cgm->cg_mgrpid = cgmi.cgmi_pid; + } else if (cgm->cg_mgrpid != cgmi.cgmi_pid) { + /* + * We only allow the manager which first contacted us to + * make this ioctl. + */ + return (EINVAL); + } + + /* + * If there is a pending event, service it immediately, otherwise + * block until an event occurs. + */ +retry: + mutex_enter(&cgm->cg_events); + + if (cgm->cg_evnt_cnt < 0) { + /* + * Trying to unmount, tell the manager to quit. + */ + mutex_exit(&cgm->cg_events); + return (EIO); + } + + if (cgm->cg_evnt_cnt == 0) { + cv_wait_sig(&cgm->cg_evnt_cv, &cgm->cg_events); + + if (cgm->cg_evnt_cnt <= 0) { + /* + * We were woken up but there are no events, it must + * be due to an unmount and it's time for the user + * manager to go away. + */ + mutex_exit(&cgm->cg_events); + return (EIO); + } + } + + evntp = list_remove_head(&cgm->cg_evnt_list); + VERIFY(evntp != NULL); + ASSERT(cgm->cg_evnt_cnt > 0); + cgm->cg_evnt_cnt--; + + mutex_exit(&cgm->cg_events); + + /* + * An event for the user-level manager should only occur if a + * release_agent has been set, but on the unlikely chance that the + * agent path was cleared after the event was enqueued, we check under + * the lock and go back to waiting if the path is empty. 
+ */ + mutex_enter(&cgm->cg_contents); + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + kmem_free(evntp->cg_evnt_path, MAXPATHLEN); + kmem_free(evntp, sizeof (cgrp_evnt_t)); + goto retry; + } + + if (copyout(cgm->cg_agent, (void *)cgmi.cgmi_rel_agent_path, + strlen(cgm->cg_agent) + 1)) { + mutex_exit(&cgm->cg_contents); + res = EFAULT; + goto done; + } + + mutex_exit(&cgm->cg_contents); + + if (copyout(evntp->cg_evnt_path, (void *)cgmi.cgmi_cgroup_path, + strlen(evntp->cg_evnt_path) + 1)) { + res = EFAULT; + } + +done: + kmem_free(evntp->cg_evnt_path, MAXPATHLEN); + kmem_free(evntp, sizeof (cgrp_evnt_t)); + + return (res); +} + /* ARGSUSED2 */ static int cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, caller_context_t *ct) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; struct vattr va; int attrs = 1; - mutex_enter(&cn->cgn_tlock); + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); if (attrs == 0) { cn->cgn_uid = va.va_uid; cn->cgn_gid = va.va_gid; @@ -778,7 +1077,7 @@ cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, vap->va_seq = cn->cgn_seq; vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); - mutex_exit(&cn->cgn_tlock); + mutex_exit(&cgm->cg_contents); return (0); } @@ -787,7 +1086,8 @@ static int cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, caller_context_t *ct) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; int error = 0; struct vattr *get; long mask; @@ -799,7 +1099,8 @@ cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, (vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE)) return (EINVAL); - mutex_enter(&cn->cgn_tlock); + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); get = &cn->cgn_attr; /* @@ -832,7 +1133,7 @@ cgrp_setattr(struct vnode *vp, struct vattr 
*vap, int flags, struct cred *cred, gethrestime(&cn->cgn_ctime); out: - mutex_exit(&cn->cgn_tlock); + mutex_exit(&cgm->cg_contents); return (error); } @@ -841,12 +1142,14 @@ static int cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred, caller_context_t *ct) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; int error; - mutex_enter(&cn->cgn_tlock); + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); error = cgrp_taccess(cn, mode, cred); - mutex_exit(&cn->cgn_tlock); + mutex_exit(&cgm->cg_contents); return (error); } @@ -856,7 +1159,8 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(dvp); + cgrp_node_t *cn = VTOCGN(dvp); + cgrp_mnt_t *cgm; cgrp_node_t *ncn = NULL; int error; @@ -874,7 +1178,10 @@ cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, } ASSERT(cn); + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); error = cgrp_dirlookup(cn, nm, &ncn, cred); + mutex_exit(&cgm->cg_contents); if (error == 0) { ASSERT(ncn); @@ -890,17 +1197,21 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap, enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred, int flag, caller_context_t *ct, vsecattr_t *vsecp) { - cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp); + cgrp_node_t *parent = VTOCGN(dvp); cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; int error; if (*nm == '\0') return (EPERM); + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); error = cgrp_dirlookup(parent, nm, &cn, cred); if (error == 0) { /* name found */ ASSERT(cn); + mutex_exit(&cgm->cg_contents); /* * Creating an existing file, allow it except for the following * errors. 
@@ -919,6 +1230,7 @@ cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap, *vpp = CGNTOV(cn); return (0); } + mutex_exit(&cgm->cg_contents); /* * cgroups doesn't allow creation of additional, non-subsystem specific @@ -932,9 +1244,10 @@ static int cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred, caller_context_t *ct, int flags) { - cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp); + cgrp_node_t *parent = VTOCGN(dvp); int error; cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; /* * Removal of subsystem-specific files is not allowed but we need @@ -942,7 +1255,10 @@ cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred, * file. */ + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); error = cgrp_dirlookup(parent, nm, &cn, cred); + mutex_exit(&cgm->cg_contents); if (error) return (error); @@ -979,11 +1295,11 @@ cgrp_rename( cgrp_node_t *fromparent; cgrp_node_t *toparent; cgrp_node_t *fromcn = NULL; /* source cgrp_node */ - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(odvp); + cgrp_mnt_t *cgm = VTOCGM(odvp); int error, err; - fromparent = (cgrp_node_t *)VTOCGN(odvp); - toparent = (cgrp_node_t *)VTOCGN(ndvp); + fromparent = VTOCGN(odvp); + toparent = VTOCGN(ndvp); if (fromparent != toparent) return (EIO); @@ -991,14 +1307,14 @@ cgrp_rename( /* discourage additional use of toparent */ toparent = NULL; - mutex_enter(&cgm->cg_renamelck); + mutex_enter(&cgm->cg_contents); /* * Look up cgrp_node of file we're supposed to rename. */ error = cgrp_dirlookup(fromparent, onm, &fromcn, cred); if (error) { - mutex_exit(&cgm->cg_renamelck); + mutex_exit(&cgm->cg_contents); return (error); } @@ -1030,11 +1346,9 @@ cgrp_rename( /* * Link source to new target */ - rw_enter(&fromparent->cgn_rwlock, RW_WRITER); error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME, fromcn, (struct vattr *)NULL, (cgrp_node_t **)NULL, cred, ct); - rw_exit(&fromparent->cgn_rwlock); if (error) goto done; @@ -1042,9 +1356,6 @@ cgrp_rename( /* * Unlink from source. 
*/ - rw_enter(&fromparent->cgn_rwlock, RW_WRITER); - rw_enter(&fromcn->cgn_rwlock, RW_WRITER); - error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred); /* @@ -1054,17 +1365,14 @@ cgrp_rename( if (error == ENOENT) error = 0; - rw_exit(&fromcn->cgn_rwlock); - rw_exit(&fromparent->cgn_rwlock); - if (err == 0) { vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct); vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct); } done: + mutex_exit(&cgm->cg_contents); cgnode_rele(fromcn); - mutex_exit(&cgm->cg_renamelck); return (error); } @@ -1074,9 +1382,9 @@ static int cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp) { - cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp); + cgrp_node_t *parent = VTOCGN(dvp); cgrp_node_t *self = NULL; - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VTOCGM(dvp); + cgrp_mnt_t *cgm = VTOCGM(dvp); int error; /* @@ -1086,25 +1394,28 @@ cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, if (parent->cgn_nlink == 0) return (ENOENT); + mutex_enter(&cgm->cg_contents); error = cgrp_dirlookup(parent, nm, &self, cred); if (error == 0) { ASSERT(self != NULL); + mutex_exit(&cgm->cg_contents); cgnode_rele(self); return (EEXIST); } - if (error != ENOENT) + if (error != ENOENT) { + mutex_exit(&cgm->cg_contents); return (error); + } - rw_enter(&parent->cgn_rwlock, RW_WRITER); error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL, va, &self, cred, ct); if (error) { - rw_exit(&parent->cgn_rwlock); + mutex_exit(&cgm->cg_contents); if (self != NULL) cgnode_rele(self); return (error); } - rw_exit(&parent->cgn_rwlock); + mutex_exit(&cgm->cg_contents); *vpp = CGNTOV(self); return (0); } @@ -1114,7 +1425,7 @@ static int cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, caller_context_t *ct, int flags) { - cgrp_node_t *parent = (cgrp_node_t *)VTOCGN(dvp); + cgrp_node_t *parent = VTOCGN(dvp); cgrp_mnt_t 
*cgm; cgrp_node_t *self = NULL; struct vnode *vp; @@ -1127,63 +1438,61 @@ cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, return (EINVAL); if (strcmp(nm, "..") == 0) return (EEXIST); /* Should be ENOTEMPTY */ + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &self, cred); - if (error) + if (error) { + mutex_exit(&cgm->cg_contents); return (error); - - rw_enter(&parent->cgn_rwlock, RW_WRITER); - rw_enter(&self->cgn_rwlock, RW_WRITER); + } vp = CGNTOV(self); if (vp == dvp || vp == cdir) { error = EINVAL; - goto done1; + goto done; } if (self->cgn_type != CG_CGROUP_DIR) { error = ENOTDIR; - goto done1; + goto done; } cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp); - mutex_enter(&self->cgn_tlock); - /* Check for the existence of any sub-cgroup directories */ - if (self->cgn_nlink > 2) { - mutex_exit(&self->cgn_tlock); + /* + * Check for the existence of any sub-cgroup directories or tasks in + * the cgroup. + */ + if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) { error = EEXIST; - goto done1; + /* + * Update atime because checking cn_dirents is logically + * equivalent to reading the directory + */ + gethrestime(&self->cgn_atime); + goto done; } - mutex_exit(&self->cgn_tlock); if (vn_vfswlock(vp)) { error = EBUSY; - goto done1; + goto done; } if (vn_mountedvfs(vp) != NULL) { error = EBUSY; - goto done; + } else { + error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred); } - /* - * Confirm directory only includes entries for ".", ".." and the - * fixed pseudo file entries. 
- */ - if (self->cgn_dirents > (cgrp_num_pseudo_ents(cgm->cg_ssid) + 2)) { - error = EEXIST; /* should be ENOTEMPTY */ - /* - * Update atime because checking cn_dirents is logically - * equivalent to reading the directory - */ - gethrestime(&self->cgn_atime); - goto done; + vn_vfsunlock(vp); + + if (parent->cgn_task_cnt == 0 && + parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, parent); } - error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred); done: - vn_vfsunlock(vp); -done1: - rw_exit(&self->cgn_rwlock); - rw_exit(&parent->cgn_rwlock); + mutex_exit(&cgm->cg_contents); vnevent_rmdir(CGNTOV(self), dvp, nm, ct); cgnode_rele(self); @@ -1195,7 +1504,8 @@ static int cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, caller_context_t *ct, int flags) { - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; cgrp_dirent_t *cdp; int error = 0; size_t namelen; @@ -1212,10 +1522,6 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, *eofp = 1; return (0); } - /* - * assuming system call has already called cgrp_rwlock - */ - ASSERT(RW_READ_HELD(&cn->cgn_rwlock)); if (uiop->uio_iovcnt != 1) return (EINVAL); @@ -1223,8 +1529,12 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, if (vp->v_type != VDIR) return (ENOTDIR); + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + if (cn->cgn_dir == NULL) { VERIFY(cn->cgn_nlink == 0); + mutex_exit(&cgm->cg_contents); return (0); } @@ -1284,6 +1594,9 @@ cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, uiop->uio_offset = offset; } gethrestime(&cn->cgn_atime); + + mutex_exit(&cgm->cg_contents); + kmem_free(outbuf, bufsize); return (error); } @@ -1301,11 +1614,10 @@ cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm, static void cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) 
{ - cgrp_node_t *cn = (cgrp_node_t *)VTOCGN(vp); - cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vp->v_vfsp); + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp); - rw_enter(&cn->cgn_rwlock, RW_WRITER); - mutex_enter(&cn->cgn_tlock); + mutex_enter(&cgm->cg_contents); mutex_enter(&vp->v_lock); ASSERT(vp->v_count >= 1); @@ -1316,27 +1628,22 @@ cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) if (vp->v_count > 1 || cn->cgn_nlink != 0) { vp->v_count--; mutex_exit(&vp->v_lock); - mutex_exit(&cn->cgn_tlock); - rw_exit(&cn->cgn_rwlock); + mutex_exit(&cgm->cg_contents); return; } - mutex_exit(&vp->v_lock); - mutex_exit(&cn->cgn_tlock); - /* Here's our chance to send invalid event while we're between locks */ - vn_invalid(CGNTOV(cn)); - - mutex_enter(&cgm->cg_contents); if (cn->cgn_forw == NULL) cgm->cg_rootnode->cgn_back = cn->cgn_back; else cn->cgn_forw->cgn_back = cn->cgn_back; cn->cgn_back->cgn_forw = cn->cgn_forw; + + mutex_exit(&vp->v_lock); mutex_exit(&cgm->cg_contents); - rw_exit(&cn->cgn_rwlock); - rw_destroy(&cn->cgn_rwlock); - mutex_destroy(&cn->cgn_tlock); + /* Here's our chance to send invalid event */ + vn_invalid(CGNTOV(cn)); + vn_free(CGNTOV(cn)); kmem_free(cn, sizeof (cgrp_node_t)); } @@ -1349,27 +1656,17 @@ cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); } -/* ARGSUSED2 */ +/* ARGSUSED */ static int cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) { - cgrp_node_t *cn = VTOCGN(vp); - - if (write_lock) { - rw_enter(&cn->cgn_rwlock, RW_WRITER); - } else { - rw_enter(&cn->cgn_rwlock, RW_READER); - } return (write_lock); } -/* ARGSUSED1 */ +/* ARGSUSED */ static void cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) { - cgrp_node_t *cn = VTOCGN(vp); - - rw_exit(&cn->cgn_rwlock); } static int @@ -1412,6 +1709,7 @@ const fs_operation_def_t cgrp_vnodeops_template[] = { VOPNAME_CLOSE, { .vop_close = cgrp_close }, VOPNAME_READ, { .vop_read = cgrp_read }, VOPNAME_WRITE, { .vop_write = cgrp_write }, + VOPNAME_IOCTL, { .vop_ioctl = cgrp_ioctl }, VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr }, VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr }, VOPNAME_ACCESS, { .vop_access = cgrp_access }, diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c index feccf31800..44acdff3b1 100644 --- a/usr/src/uts/common/brand/lx/os/lx_brand.c +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -205,6 +205,14 @@ lx_systrace_f *lx_systrace_return_ptr; static int lx_systrace_enabled; /* + * cgroup file system maintenance functions which are set when cgroups loads. + */ +void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t); +void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t); +void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +/* * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly * want an MMU dependency here (and should there be a microprocessor without * a hole, we don't want to start allocating from the top of the VA range). 
@@ -312,6 +320,16 @@ lx_proc_exit(proc_t *p) { lx_proc_data_t *lxpd; proc_t *cp; + lx_zone_data_t *lxzdata; + + /* cgroup integration */ + lxzdata = ztolxzd(p->p_zone); + if (lxzdata->lxzd_cgroup != NULL) { + lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread)); + ASSERT(lx_cgrp_proc_exit != NULL); + (*lx_cgrp_proc_exit)(lxzdata->lxzd_cgroup, + lwpd->br_cgroupid, p->p_pid); + } mutex_enter(&p->p_lock); VERIFY(lxpd = ptolxproc(p)); diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c index 67565379fe..3577749b66 100644 --- a/usr/src/uts/common/brand/lx/os/lx_misc.c +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -261,6 +261,7 @@ lx_freelwp(klwp_t *lwp) { struct lx_lwp_data *lwpd = lwptolxlwp(lwp); proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; VERIFY(MUTEX_NOT_HELD(&p->p_lock)); @@ -279,6 +280,14 @@ lx_freelwp(klwp_t *lwp) return; } + /* cgroup integration */ + lxzdata = ztolxzd(p->p_zone); + if (lxzdata->lxzd_cgroup != NULL) { + ASSERT(lx_cgrp_freelwp != NULL); + (*lx_cgrp_freelwp)(lxzdata->lxzd_cgroup, + lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid); + } + /* * It is possible for the lx_freelwp hook to be called without a prior * call to lx_exitlwp being made. 
This happens as part of lwp @@ -370,6 +379,7 @@ lx_initlwp(klwp_t *lwp, void *lwpbd) lx_lwp_data_t *plwpd = ttolxlwp(curthread); kthread_t *tp = lwptot(lwp); proc_t *p = lwptoproc(lwp); + lx_zone_data_t *lxzdata; VERIFY(MUTEX_HELD(&p->p_lock)); VERIFY(lwp->lwp_brand == NULL); @@ -452,6 +462,15 @@ lx_initlwp(klwp_t *lwp, void *lwpbd) lx_ptrace_inherit_tracer(plwpd, lwpd); lwpd->br_cgroupid = plwpd->br_cgroupid; } + + /* cgroup integration */ + lxzdata = ztolxzd(p->p_zone); + if (lxzdata->lxzd_cgroup != NULL) { + ASSERT(lx_cgrp_initlwp != NULL); + (*lx_cgrp_initlwp)(lxzdata->lxzd_cgroup, + lwpd->br_cgroupid, lwptot(lwp)->t_tid, lwpd->br_pid); + } + } /* @@ -465,6 +484,7 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) { struct lx_lwp_data *src = srclwp->lwp_brand; struct lx_lwp_data *dst = dstlwp->lwp_brand; + lx_zone_data_t *lxzdata; dst->br_ppid = src->br_pid; dst->br_ptid = lwptot(srclwp)->t_tid; @@ -496,6 +516,15 @@ lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) */ dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND; dst->br_scall_args = NULL; + + /* cgroup integration */ + lxzdata = ztolxzd(srclwp->lwp_procp->p_zone); + if (lxzdata->lxzd_cgroup != NULL) { + ASSERT(lx_cgrp_forklwp != NULL); + (*lx_cgrp_forklwp)(lxzdata->lxzd_cgroup, + dst->br_cgroupid, lwptoproc(dstlwp)->p_pid); + } + } /* diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h index e6288fac57..895ea44db5 100644 --- a/usr/src/uts/common/brand/lx/sys/lx_brand.h +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -35,6 +35,7 @@ #include <sys/cpuvar.h> #include <sys/zone.h> #include <sys/ksocket.h> +#include <sys/vfs.h> #endif #ifdef __cplusplus @@ -383,6 +384,14 @@ typedef enum lx_proc_flags { #ifdef _KERNEL +/* + * Entry points for cgroup integration. 
+ */ +extern void (*lx_cgrp_forklwp)(vfs_t *, uint_t, pid_t); +extern void (*lx_cgrp_proc_exit)(vfs_t *, uint_t, pid_t); +extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + #define LX_RLFAKE_LOCKS 0 #define LX_RLFAKE_NICE 1 #define LX_RLFAKE_RTPRIO 2 @@ -632,11 +641,18 @@ struct lx_lwp_data { */ #define LX_BR_ARGS_SIZE_MAX (1024) -/* brand specific data */ +/* + * brand specific data + * + * We currently only support a single cgroup mount in an lx zone so we only have + * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever + * enhanced to support different mounts with different subsystem controllers. + */ typedef struct lx_zone_data { char lxzd_kernel_version[LX_VERS_MAX]; ksocket_t lxzd_ioctl_sock; char lxzd_bootid[LX_BOOTID_LEN]; /* procfs boot_id */ + vfs_t *lxzd_cgroup; /* cgroup for this zone */ } lx_zone_data_t; #define BR_CPU_BOUND 0x0001 |