diff options
Diffstat (limited to 'usr/src/cmd/zoneadmd')
| -rw-r--r-- | usr/src/cmd/zoneadmd/Makefile | 49 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/Makefile.com | 70 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/amd64/Makefile | 31 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/i386/Makefile | 30 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/mcap.c | 1193 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/vplat.c | 437 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/zcons.c | 131 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.c | 589 | ||||
| -rw-r--r-- | usr/src/cmd/zoneadmd/zoneadmd.h | 20 |
9 files changed, 2073 insertions, 477 deletions
diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile index 8324f7fefa..e81e4631aa 100644 --- a/usr/src/cmd/zoneadmd/Makefile +++ b/usr/src/cmd/zoneadmd/Makefile @@ -18,57 +18,54 @@ # # CDDL HEADER END - -# - # # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2011, Joyent, Inc. All rights reserved. # PROG= zoneadmd include ../Makefile.cmd +include ../Makefile.ctf -ROOTCMDDIR= $(ROOTLIB)/zones - -OBJS= zoneadmd.o zcons.o vplat.o -SRCS = $(OBJS:.o=.c) -POFILE=zoneadmd_all.po -POFILES= $(OBJS:%.o=%.po) +$(64ONLY)SUBDIRS= $(MACH) +$(BUILD64)SUBDIRS += $(MACH64) -CFLAGS += $(CCVERBOSE) -CERRWARN += -_gcc=-Wno-switch -CERRWARN += -_gcc=-Wno-parentheses -CERRWARN += -_gcc=-Wno-uninitialized +all := TARGET = all +install := TARGET = install +clean := TARGET = clean +clobber := TARGET = clobber +lint := TARGET = lint -LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ - -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \ - -linetutil -lscf XGETFLAGS += -a -x zoneadmd.xcl +ROOTUSRLIBZONES = $(ROOT)/usr/lib/zones + .KEEP_STATE: .PARALLEL: -all: $(PROG) +all: $(SUBDIRS) $(PROG): $(OBJS) $(LINK.c) -o $@ $(OBJS) $(LDLIBS) $(POST_PROCESS) -install: all $(ROOTCMD) - -$(POFILE): $(POFILES) - $(RM) $@ - $(CAT) $(POFILES) > $@ +install: $(SUBDIRS) + -$(RM) $(ROOTUSRLIBZONES)/$(PROG) + -$(LN) $(ISAEXEC) $(ROOTUSRLIBZONES)/$(PROG) -clean: - $(RM) $(OBJS) +$(POFILE): -lint: lint_SRCS +clean clobebr lint: $(SUBDIRS) check: - $(CSTYLE) -p -P $(SRCS:%=%) + $(CSTYLE) -p -P *.c + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: include ../Makefile.targ diff --git a/usr/src/cmd/zoneadmd/Makefile.com b/usr/src/cmd/zoneadmd/Makefile.com new file mode 100644 index 0000000000..162d1f0219 --- /dev/null +++ b/usr/src/cmd/zoneadmd/Makefile.com @@ -0,0 +1,70 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +PROG= zoneadmd + +include ../../Makefile.cmd +include ../../Makefile.ctf + +ROOTCMDDIR= $(ROOTLIB)/zones + +OBJS= zoneadmd.o zcons.o vplat.o mcap.o + +CFLAGS += $(CCVERBOSE) +LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \ + -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \ + -linetutil -lproc -lscf + +.KEEP_STATE: + +%.o: ../%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) + +ROOTUSRLIBZONES = $(ROOT)/usr/lib/zones +ROOTUSRLIBZONES32 = $(ROOTUSRLIBZONES)/$(MACH32) +ROOTUSRLIBZONES64 = $(ROOTUSRLIBZONES)/$(MACH64) +ROOTUSRLIBZONESPROG32 = $(ROOTUSRLIBZONES32)/$(PROG) +ROOTUSRLIBZONESPROG64 = $(ROOTUSRLIBZONES64)/$(PROG) +$(ROOTUSRLIBZONES32)/%: $(ROOTUSRLIBZONES32) % + $(INS.file) +$(ROOTUSRLIBZONES64)/%: $(ROOTUSRLIBZONES64) % + $(INS.file) +$(ROOTUSRLIBZONES32): + $(INS.dir) + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(POST_PROCESS) + +clean: + $(RM) $(OBJS) + +lint: + $(LINT.c) ../*.c $(LDLIBS) + +include ../../Makefile.targ diff --git a/usr/src/cmd/zoneadmd/amd64/Makefile b/usr/src/cmd/zoneadmd/amd64/Makefile new file mode 100644 index 0000000000..75ac51db32 --- /dev/null +++ b/usr/src/cmd/zoneadmd/amd64/Makefile @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +.KEEP_STATE: + +include ../Makefile.com +include ../../Makefile.cmd.64 + +install: all $(ROOTUSRLIBZONES64) $(ROOTUSRLIBZONESPROG64) diff --git a/usr/src/cmd/zoneadmd/i386/Makefile b/usr/src/cmd/zoneadmd/i386/Makefile new file mode 100644 index 0000000000..a8764e0638 --- /dev/null +++ b/usr/src/cmd/zoneadmd/i386/Makefile @@ -0,0 +1,30 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +.KEEP_STATE: + +include ../Makefile.com + +install: all $(ROOTUSRLIBZONES32) $(ROOTUSRLIBZONESPROG32) diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c new file mode 100644 index 0000000000..44917b0024 --- /dev/null +++ b/usr/src/cmd/zoneadmd/mcap.c @@ -0,0 +1,1193 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2014, Joyent, Inc. All rights reserved. + */ + +/* + * This file implements the code which runs a thread inside zoneadmd to cap + * the associated zone's physical memory. A thread to do this is started + * when the zone boots and is halted when the zone shuts down. + * + * Because of the way that the VM system is currently implemented, there is no + * way to go from the bottom up (page to process to zone). Thus, there is no + * obvious way to hook an rctl into the kernel's paging code to enforce a hard + * memory cap. Instead, we implement a soft physical memory cap which looks + * at the zone's overall rss and once it is over the cap, works from the top + * down (zone to process to page), looking at zone processes, to determine + * what to try to pageout to get the zone under its memory cap. + * + * The code uses the fast, cheap, but potentially very inaccurate sum of the + * rss values from psinfo_t to first approximate the zone's rss and will + * fallback to the vm_getusage syscall to determine the zone's rss if needed. + * It then checks the rss against the zone's zone.max-physical-memory rctl. + * Once the zone goes over its cap, then this thread will work through the + * zone's /proc process list, Pgrab-bing each process and stepping through the + * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...) + * to pageout pages, until the zone is again under its cap. + * + * Although zone memory capping is implemented as a soft cap by this user-level + * thread, the interfaces around memory caps that are exposed to the user are + * the standard ones; an rctl and kstats. This thread uses the rctl value + * to obtain the cap and works with the zone kernel code to update the kstats. + * If the implementation ever moves into the kernel, these exposed interfaces + * do not need to change. + * + * The thread adaptively sleeps, periodically checking the state of the + * zone. As the zone's rss gets closer to the cap, the thread will wake up + * more often to check the zone's status. Once the zone is over the cap, + * the thread will work to pageout until the zone is under the cap, as shown + * by updated vm_usage data. + * + * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags + * are set by hrm_setbits() and on x86 that code path is only executed by + * segvn_pagelock -> hat_setstat -> hrm_setbits + * segvn_softunlock -^ + * On SPARC there is an additional code path which may make this data + * useful (sfmmu_ttesync), but since it is not generic, we ignore the page + * maps. If we ever fix this issue, then we could generalize this mcap code to + * do more with the data on active pages. + * + * For debugging, touch the file {zonepath}/mcap_debug.log. This will + * cause the thread to start logging its actions into that file (it may take + * a minute or two if the thread is currently sleeping). Removing that + * file will cause logging to stop. + */ + +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libproc.h> +#include <limits.h> +#include <procfs.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <time.h> +#include <unistd.h> +#include <sys/priocntl.h> +#include <dirent.h> +#include <zone.h> +#include <libzonecfg.h> +#include <thread.h> +#include <values.h> +#include <sys/vm_usage.h> +#include <sys/resource.h> +#include <sys/debug.h> +#include <synch.h> +#include <wait.h> +#include <libcontract.h> +#include <libcontract_priv.h> +#include <sys/contract/process.h> +#include "zoneadmd.h" + + /* round up to next y = 2^n */ +#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) + +#define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */ + +/* + * zonecfg attribute tunables for memory capping. + * phys-mcap-cmd + * type: string + * specifies a command that can be run when over the cap + * phys-mcap-no-vmusage + * type: boolean + * true disables vm_getusage and just uses zone's proc. rss sum + * phys-mcap-no-pageout + * type: boolean + * true disables pageout when over + * phys-mcap-no-pf-throttle + * type: boolean + * true disables page fault throttling when over + */ +#define TUNE_CMD "phys-mcap-cmd" +#define TUNE_NVMU "phys-mcap-no-vmusage" +#define TUNE_NPAGE "phys-mcap-no-pageout" +#define TUNE_NPFTHROT "phys-mcap-no-pf-throttle" + +/* + * These are only used in get_mem_info but global. We always need scale_rss and + * prev_fast_rss to be persistent but we also have the other two global so we + * can easily see these with mdb. + */ +uint64_t scale_rss = 0; +uint64_t prev_fast_rss = 0; +uint64_t fast_rss = 0; +uint64_t accurate_rss = 0; + +static char zonename[ZONENAME_MAX]; +static char zonepath[MAXPATHLEN]; +static char zoneproc[MAXPATHLEN]; +static char debug_log[MAXPATHLEN]; +static zoneid_t zid; +static mutex_t shutdown_mx; +static cond_t shutdown_cv; +static int shutting_down = 0; +static thread_t mcap_tid; +static FILE *debug_log_fp = NULL; +static uint64_t zone_rss_cap; /* RSS cap(KB) */ +static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */ +static boolean_t skip_vmusage = B_FALSE; +static boolean_t skip_pageout = B_FALSE; +static boolean_t skip_pf_throttle = B_FALSE; + +static zlog_t *logp; + +static int64_t check_suspend(); +static void get_mcap_tunables(); + +/* + * Structure to hold current state about a process address space that we're + * working on. + */ +typedef struct { + int pr_curr; /* the # of the mapping we're working on */ + int pr_nmap; /* number of mappings in address space */ + prmap_t *pr_mapp; /* process's map array */ +} proc_map_t; + +typedef struct zsd_vmusage64 { + id_t vmu_zoneid; + uint_t vmu_type; + id_t vmu_id; + /* + * An amd64 kernel will align the following uint64_t members, but a + * 32bit i386 process will not without help. + */ + int vmu_align_next_members_on_8_bytes; + uint64_t vmu_rss_all; + uint64_t vmu_rss_private; + uint64_t vmu_rss_shared; + uint64_t vmu_swap_all; + uint64_t vmu_swap_private; + uint64_t vmu_swap_shared; +} zsd_vmusage64_t; + +/* + * Output a debug log message. + */ +/*PRINTFLIKE1*/ +static void +debug(char *fmt, ...) +{ + va_list ap; + + if (debug_log_fp == NULL) + return; + + va_start(ap, fmt); + (void) vfprintf(debug_log_fp, fmt, ap); + va_end(ap); + (void) fflush(debug_log_fp); +} + +/* + * Like sleep(3C) but can be interupted by cond_signal which is posted when + * we're shutting down the mcap thread. + */ +static void +sleep_shutdown(int secs) +{ + timestruc_t to; + + to.tv_sec = secs; + to.tv_nsec = 0; + + (void) mutex_lock(&shutdown_mx); + if (!shutting_down) + (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to); + (void) mutex_unlock(&shutdown_mx); +} + +static boolean_t +proc_issystem(pid_t pid) +{ + char pc_clname[PC_CLNMSZ]; + + if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname, + PC_KY_NULL) != -1) + return (strcmp(pc_clname, "SYS") == 0); + + return (B_TRUE); +} + +/* + * Fork a child that enters the zone and runs the "phys-mcap-cmd" command. + */ +static void +run_over_cmd() +{ + int ctfd; + int err; + pid_t childpid; + siginfo_t info; + ctid_t ct; + + /* + * Before we enter the zone, we need to create a new process contract + * for the child, as required by zone_enter(). + */ + if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1) + return; + if (ct_tmpl_set_critical(ctfd, 0) != 0 || + ct_tmpl_set_informative(ctfd, 0) != 0 || + ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 || + ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 || + ct_tmpl_activate(ctfd) != 0) { + (void) close(ctfd); + return; + } + + childpid = fork(); + switch (childpid) { + case -1: + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + break; + case 0: /* Child */ + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + if (zone_enter(zid) == -1) + _exit(errno); + err = system(over_cmd); + _exit(err); + break; + default: /* Parent */ + if (contract_latest(&ct) == -1) + ct = -1; + (void) ct_tmpl_clear(ctfd); + (void) close(ctfd); + err = waitid(P_PID, childpid, &info, WEXITED); + (void) contract_abandon_id(ct); + if (err == -1 || info.si_status != 0) + debug("over_cmd failed"); + break; + } +} + +/* + * Get the next mapping. + */ +static prmap_t * +nextmapping(proc_map_t *pmp) +{ + if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap) + return (NULL); + + return (&pmp->pr_mapp[pmp->pr_curr++]); +} + +/* + * Initialize the proc_map_t to access the first mapping of an address space. + */ +static prmap_t * +init_map(proc_map_t *pmp, pid_t pid) +{ + int fd; + int res; + struct stat st; + char pathbuf[MAXPATHLEN]; + + bzero(pmp, sizeof (proc_map_t)); + pmp->pr_nmap = -1; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + return (NULL); + +redo: + errno = 0; + if (fstat(fd, &st) != 0) + goto done; + + if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) { + debug("cannot malloc() %ld bytes for xmap", st.st_size); + goto done; + } + (void) bzero(pmp->pr_mapp, st.st_size); + + errno = 0; + if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) { + free(pmp->pr_mapp); + pmp->pr_mapp = NULL; + if (res > 0 || errno == E2BIG) { + goto redo; + } else { + debug("pid %ld cannot read xmap\n", pid); + goto done; + } + } + + pmp->pr_nmap = st.st_size / sizeof (prmap_t); + +done: + (void) close(fd); + return (nextmapping(pmp)); +} + +/* + * Attempt to invalidate the entire mapping from within the given process's + * address space. May return nonzero with errno as: + * ESRCH - process not found + * ENOMEM - segment not found + * EINVAL - mapping exceeds a single segment + */ +static int +pageout_mapping(pid_t pid, prmap_t *pmp) +{ + int res; + + if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM) + return (0); + + errno = 0; + res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr, + pmp->pr_size); + + return (res); +} + +/* + * Work through a process paging out mappings until the whole address space was + * examined or the excess is < 0. Return our estimate of the updated excess. + */ +static int64_t +pageout_process(pid_t pid, int64_t excess) +{ + int psfd; + prmap_t *pmap; + proc_map_t cur; + int res; + int64_t sum_d_rss, d_rss; + int64_t old_rss; + int map_cnt; + psinfo_t psinfo; + char pathbuf[MAXPATHLEN]; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc, + pid); + if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0) + return (excess); + + cur.pr_mapp = NULL; + + if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo)) + goto done; + + old_rss = (int64_t)psinfo.pr_rssize; + map_cnt = 0; + + /* If unscannable, skip it. */ + if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) { + debug("pid %ld: system process, skipping %s\n", + pid, psinfo.pr_psargs); + goto done; + } + + /* If tiny RSS (16KB), skip it. */ + if (old_rss <= 16) { + debug("pid %ld: skipping, RSS %lldKB %s\n", + pid, old_rss, psinfo.pr_psargs); + goto done; + } + + /* Get segment residency information. */ + pmap = init_map(&cur, pid); + + /* Skip process if it has no mappings. */ + if (pmap == NULL) { + debug("pid %ld: map unreadable; ignoring\n", pid); + goto done; + } + + debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n", + pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs); + + /* + * Within the process's address space, attempt to page out mappings. + */ + sum_d_rss = 0; + while (excess > 0 && pmap != NULL && !shutting_down) { + /* invalidate the entire mapping */ + if ((res = pageout_mapping(pid, pmap)) < 0) + debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n", + pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno); + + map_cnt++; + + /* + * Re-check the process rss and get the delta. + */ + if (pread(psfd, &psinfo, sizeof (psinfo), 0) + != sizeof (psinfo)) { + excess -= old_rss; + goto done; + } + + d_rss = (int64_t)psinfo.pr_rssize - old_rss; + old_rss = (int64_t)psinfo.pr_rssize; + sum_d_rss += d_rss; + + /* + * d_rss hopefully should be negative (or 0 if nothing + * invalidated) but can be positive if more got paged in. + */ + excess += d_rss; + + if (excess <= 0) { + debug("pid %ld: (part.) nmap %d delta_rss %lldKB " + "excess %lldKB\n", pid, map_cnt, + (unsigned long long)sum_d_rss, (long long)excess); + map_cnt = 0; + + /* + * If we're actually under, this will suspend checking + * in the middle of this process's address space. + */ + excess = check_suspend(); + if (shutting_down) + goto done; + + /* + * since we might have suspended, re-read process's rss + */ + if (pread(psfd, &psinfo, sizeof (psinfo), 0) + != sizeof (psinfo)) { + excess -= old_rss; + goto done; + } + + old_rss = (int64_t)psinfo.pr_rssize; + + debug("pid %ld: resume pageout; excess %lld\n", pid, + (long long)excess); + sum_d_rss = 0; + } + + pmap = nextmapping(&cur); + } + + debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n", + pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess); + +done: + if (cur.pr_mapp != NULL) + free(cur.pr_mapp); + + (void) close(psfd); + + if (shutting_down) + return (0); + + return (excess); +} + +/* + * Get the zone's RSS data. + */ +static uint64_t +get_mem_info() +{ + uint64_t n = 1; + zsd_vmusage64_t buf; + uint64_t tmp_rss; + DIR *pdir = NULL; + struct dirent *dent; + + /* + * Start by doing the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to double + * counting some memory and overestimating how much is being used, but + * as long as that's not over the cap, then we don't need do the + * expensive calculation. + * + * If we have to do the expensive calculation, we remember the scaling + * factor so that we can try to use that on subsequent iterations for + * the fast rss. + */ + if (shutting_down) + return (0); + + if ((pdir = opendir(zoneproc)) == NULL) + return (0); + + accurate_rss = 0; + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", + zoneproc, pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) + rss = (int64_t)psinfo.pr_rssize; + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + + if (shutting_down) + return (0); + + debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss, + scale_rss, prev_fast_rss); + + /* see if we can get by with a scaled fast rss */ + tmp_rss = fast_rss; + if (scale_rss > 1 && prev_fast_rss > 0) { + /* + * Only scale the fast value if it hasn't ballooned too much + * to trust. + */ + if (fast_rss / prev_fast_rss < 2) { + fast_rss /= scale_rss; + debug("scaled fast rss: %lluKB\n", fast_rss); + } + } + + if (fast_rss <= zone_rss_cap || skip_vmusage) { + uint64_t zone_rss_bytes; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + return (fast_rss); + } + + buf.vmu_id = zid; + + /* get accurate usage (cached data may be up to 5 seconds old) */ + if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5, + (uintptr_t)&buf, (uintptr_t)&n) != 0) { + debug("vmusage failed\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (n > 1) { + /* This should never happen */ + debug("vmusage returned more than one result\n"); + (void) sleep_shutdown(1); + return (0); + } + + if (buf.vmu_id != zid) { + /* This should never happen */ + debug("vmusage returned the incorrect zone\n"); + (void) sleep_shutdown(1); + return (0); + } + + accurate_rss = buf.vmu_rss_all / 1024; + + /* calculate scaling factor to use for fast_rss from now on */ + if (accurate_rss > 0) { + scale_rss = fast_rss / accurate_rss; + debug("new scaling factor: %llu\n", scale_rss); + /* remember the fast rss when we had to get the accurate rss */ + prev_fast_rss = tmp_rss; + } + + debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss, + scale_rss, prev_fast_rss); + return (accurate_rss); +} + +/* + * Needed to read the zones physical-memory-cap rctl. + */ +static struct ps_prochandle * +grab_zone_proc() +{ + DIR *dirp; + struct dirent *dentp; + struct ps_prochandle *ph = NULL; + int tmp; + + if ((dirp = opendir(zoneproc)) == NULL) + return (NULL); + + while (!shutting_down && (dentp = readdir(dirp))) { + int pid; + + if (strcmp(".", dentp->d_name) == 0 || + strcmp("..", dentp->d_name) == 0) + continue; + + pid = atoi(dentp->d_name); + /* attempt to grab process */ + if ((ph = Pgrab(pid, 0, &tmp)) != NULL) { + if (Psetflags(ph, PR_RLC) == 0) { + if (Pcreate_agent(ph) == 0) { + (void) closedir(dirp); + return (ph); + } + } + Prelease(ph, 0); + } + } + + (void) closedir(dirp); + return (NULL); +} + +static uint64_t +get_zone_cap() +{ + rctlblk_t *rblk; + uint64_t mcap; + struct ps_prochandle *ph; + + if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL) + return (UINT64_MAX); + + if ((ph = grab_zone_proc()) == NULL) { + free(rblk); + return (UINT64_MAX); + } + + if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk, + RCTL_FIRST)) { + Pdestroy_agent(ph); + Prelease(ph, 0); + free(rblk); + return (UINT64_MAX); + } + + Pdestroy_agent(ph); + Prelease(ph, 0); + + mcap = rctlblk_get_value(rblk); + free(rblk); + return (mcap); +} + +/* + * check_suspend is invoked at the beginning of every pass through the process + * list or after we've paged out enough so that we think the excess is under + * the cap. The purpose is to periodically check the zone's rss and return + * the excess when the zone is over the cap. The rest of the time this + * function will sleep, periodically waking up to check the current rss. + * + * Depending on the percentage of penetration of the zone's rss into the + * cap we sleep for longer or shorter amounts. This reduces the impact of this + * work on the system, which is important considering that each zone will be + * monitoring its rss. + */ +static int64_t +check_suspend() +{ + static hrtime_t last_cap_read = 0; + static uint64_t addon; + static uint64_t lo_thresh; /* Thresholds for how long to sleep */ + static uint64_t hi_thresh; /* when under the cap (80% & 90%). */ + static uint64_t prev_zone_rss = 0; + static uint32_t pfdelay = 0; /* usec page fault delay when over */ + + /* Wait a second to give the async pageout a chance to catch up. */ + (void) sleep_shutdown(1); + + while (!shutting_down) { + int64_t new_excess; + int sleep_time; + hrtime_t now; + struct stat st; + uint64_t zone_rss; /* total RSS(KB) */ + + /* + * Check if the debug log files exists and enable or disable + * debug. + */ + if (debug_log_fp == NULL) { + if (stat(debug_log, &st) == 0) + debug_log_fp = fopen(debug_log, "w"); + } else { + if (stat(debug_log, &st) == -1) { + (void) fclose(debug_log_fp); + debug_log_fp = NULL; + } + } + + /* + * If the CAP_REFRESH interval has passed, re-get the current + * cap in case it has been dynamically updated. + */ + now = gethrtime(); + if (now - last_cap_read > CAP_REFRESH) { + uint64_t mcap; + + last_cap_read = now; + + mcap = get_zone_cap(); + if (mcap != 0 && mcap != UINT64_MAX) + zone_rss_cap = ROUNDUP(mcap, 1024) / 1024; + else + zone_rss_cap = UINT64_MAX; + + lo_thresh = (uint64_t)(zone_rss_cap * .8); + hi_thresh = (uint64_t)(zone_rss_cap * .9); + addon = (uint64_t)(zone_rss_cap * 0.05); + + /* + * We allow the memory cap tunables to be changed on + * the fly. + */ + get_mcap_tunables(); + + debug("%s: %s\n", TUNE_CMD, over_cmd); + debug("%s: %d\n", TUNE_NVMU, skip_vmusage); + debug("%s: %d\n", TUNE_NPAGE, skip_pageout); + debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle); + debug("current cap %lluKB lo %lluKB hi %lluKB\n", + zone_rss_cap, lo_thresh, hi_thresh); + } + + /* No cap, nothing to do. */ + if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) { + debug("no cap, sleep 120 seconds\n"); + (void) sleep_shutdown(120); + continue; + } + + zone_rss = get_mem_info(); + + /* calculate excess */ + new_excess = zone_rss - zone_rss_cap; + + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, new_excess); + + /* + * If necessary, updates stats. + */ + + /* + * If it looks like we did some paging out since last over the + * cap then update the kstat so we can approximate how much was + * paged out. + */ + if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) { + uint64_t diff; + + /* assume diff is num bytes we paged out */ + diff = (prev_zone_rss - zone_rss) * 1024; + + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, + &diff, 0); + } + prev_zone_rss = zone_rss; + + if (new_excess > 0) { + uint64_t n = 1; + + /* Increment "nover" kstat. */ + (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0); + + if (!skip_pf_throttle) { + /* + * Tell the kernel to start throttling page + * faults by some number of usecs to help us + * catch up. If we are persistently over the + * cap the delay ramps up to a max of 2000usecs. + * Note that for delays less than 1 tick + * (i.e. all of these) we busy-wait in as_fault. + * delay faults/sec + * 125 8000 + * 250 4000 + * 500 2000 + * 1000 1000 + * 2000 500 + */ + if (pfdelay == 0) + pfdelay = 125; + else if (pfdelay < 2000) + pfdelay *= 2; + + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, + &pfdelay, 0); + } + + /* + * Once we go over the cap, then we want to + * page out a little extra instead of stopping + * right at the cap. To do this we add 5% to + * the excess so that pageout_proces will work + * a little longer before stopping. + */ + return ((int64_t)(new_excess + addon)); + } + + /* + * At this point we are under the cap. + * + * Tell the kernel to stop throttling page faults. + * + * Scale the amount of time we sleep before rechecking the + * zone's memory usage. Also, scale the accpetable age of + * cached results from vm_getusage. We do this based on the + * penetration into the capped limit. + */ + if (pfdelay > 0) { + pfdelay = 0; + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, + &pfdelay, 0); + } + + if (zone_rss <= lo_thresh) { + sleep_time = 120; + } else if (zone_rss <= hi_thresh) { + sleep_time = 60; + } else { + sleep_time = 30; + } + + debug("sleep %d seconds\n", sleep_time); + (void) sleep_shutdown(sleep_time); + } + + /* Shutting down, tell the kernel so it doesn't throttle */ + if (pfdelay > 0) { + pfdelay = 0; + (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0); + } + + return (0); +} + +static void +get_mcap_tunables() +{ + zone_dochandle_t handle; + struct zone_attrtab attr; + + over_cmd[0] = '\0'; + if ((handle = zonecfg_init_handle()) == NULL) + return; + + if (zonecfg_get_handle(zonename, handle) != Z_OK) + goto done; + + /* Reset to defaults in case rebooting and settings have changed */ + over_cmd[0] = '\0'; + skip_vmusage = B_FALSE; + skip_pageout = B_FALSE; + skip_pf_throttle = B_FALSE; + + if (zonecfg_setattrent(handle) != Z_OK) + goto done; + while (zonecfg_getattrent(handle, &attr) == Z_OK) { + if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) { + (void) strlcpy(over_cmd, attr.zone_attr_value, + sizeof (over_cmd)); + } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_vmusage = B_TRUE; + } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_pageout = B_TRUE; + } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) { + if (strcmp("true", attr.zone_attr_value) == 0) + skip_pf_throttle = B_TRUE; + } + } + (void) zonecfg_endattrent(handle); + +done: + zonecfg_fini_handle(handle); +} + +/* ARGSUSED */ +static int +chk_proc_fs(void *data, const char *spec, const char *dir, + const char *fstype, const char *opt) +{ + if (fstype != NULL && strcmp(fstype, "proc") == 0) + *((boolean_t *)data) = B_TRUE; + + return (0); +} + +static boolean_t +has_proc() +{ + brand_handle_t bh; + boolean_t fnd = B_FALSE; + + if ((bh = brand_open(brand_name)) != NULL) { + (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd); + } + + brand_close(bh); + return (fnd); +} + +/* + * We run this loop for brands with no /proc to simply update the RSS, using + * the cheap GZ /proc data, every 5 minutes. + */ +static void +no_procfs() +{ + DIR *pdir = NULL; + struct dirent *dent; + uint64_t zone_rss_bytes; + + (void) sleep_shutdown(30); + while (!shutting_down) { + /* + * Just do the fast, cheap RSS calculation using the rss value + * in psinfo_t. Because that's per-process, it can lead to + * double counting some memory and overestimating how much is + * being used. Since there is no /proc in the zone, we use the + * GZ /proc and check for the correct zone. + */ + if ((pdir = opendir("/proc")) == NULL) + return; + + fast_rss = 0; + while (!shutting_down && (dent = readdir(pdir)) != NULL) { + pid_t pid; + int psfd; + int64_t rss; + char pathbuf[MAXPATHLEN]; + psinfo_t psinfo; + + if (strcmp(".", dent->d_name) == 0 || + strcmp("..", dent->d_name) == 0) + continue; + + pid = atoi(dent->d_name); + if (pid == 0 || pid == 1) + continue; + + (void) snprintf(pathbuf, sizeof (pathbuf), + "/proc/%d/psinfo", pid); + + rss = 0; + if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) { + if (pread(psfd, &psinfo, sizeof (psinfo), 0) == + sizeof (psinfo)) { + if (psinfo.pr_zoneid == zid) + rss = (int64_t)psinfo.pr_rssize; + } + + (void) close(psfd); + } + + fast_rss += rss; + } + + (void) closedir(pdir); + + if (shutting_down) + return; + + zone_rss_bytes = fast_rss * 1024; + /* Use the zone's approx. RSS in the kernel */ + (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0); + + (void) sleep_shutdown(300); + } +} + +/* + * Thread that checks zone's memory usage and when over the cap, goes through + * the zone's process list trying to pageout processes to get under the cap. + */ +static void +mcap_zone() +{ + DIR *pdir = NULL; + int64_t excess; + + debug("thread startup\n"); + + get_mcap_tunables(); + + /* + * If the zone has no /proc filesystem, we can't use the fast algorithm + * to check RSS or pageout any processes. All we can do is periodically + * update it's RSS kstat using the expensive sycall. + */ + if (!has_proc()) { + no_procfs(); + debug("thread shutdown\n"); + return; + } + + /* + * When first starting it is likely lots of other zones are starting + * too because the system is booting. Since we just started the zone + * we're not worried about being over the cap right away, so we let + * things settle a bit and tolerate some older data here to minimize + * the load on the system. + */ + (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */ + + /* Wait until zone's /proc is mounted */ + while (!shutting_down) { + struct stat st; + + if (stat(zoneproc, &st) == 0 && + strcmp(st.st_fstype, "proc") == 0) + break; + sleep_shutdown(5); + } + + /* Open zone's /proc and walk entries. */ + while (!shutting_down) { + if ((pdir = opendir(zoneproc)) != NULL) + break; + sleep_shutdown(5); + } + + while (!shutting_down) { + struct dirent *dirent; + + /* Wait until we've gone over the cap. */ + excess = check_suspend(); + + debug("starting to scan, excess %lldk\n", (long long)excess); + + if (over_cmd[0] != '\0') { + uint64_t zone_rss; /* total RSS(KB) */ + + debug("run phys_mcap_cmd: %s\n", over_cmd); + run_over_cmd(); + + zone_rss = get_mem_info(); + excess = zone_rss - zone_rss_cap; + debug("rss %lluKB, cap %lluKB, excess %lldKB\n", + zone_rss, zone_rss_cap, excess); + if (excess <= 0) + continue; + } + + while (!shutting_down && (dirent = readdir(pdir)) != NULL) { + pid_t pid; + + if (strcmp(".", dirent->d_name) == 0 || + strcmp("..", dirent->d_name) == 0) + continue; + + pid = atoi(dirent->d_name); + if (pid == 0 || pid == 1) + continue; + + if (skip_pageout) + (void) sleep_shutdown(2); + else + excess = pageout_process(pid, excess); + + if (excess <= 0) { + debug("apparently under; excess %lld\n", + (long long)excess); + /* Double check the current excess */ + excess = check_suspend(); + } + } + + debug("process pass done; excess %lld\n", (long long)excess); + rewinddir(pdir); + + if (skip_pageout) + (void) sleep_shutdown(120); + } + + if (pdir != NULL) + (void) closedir(pdir); + debug("thread shutdown\n"); +} + +void +create_mcap_thread(zlog_t *zlogp, zoneid_t id) +{ + int res; + char brandname[MAXNAMELEN]; + + shutting_down = 0; + zid = id; + logp = zlogp; + (void) getzonenamebyid(zid, zonename, sizeof (zonename)); + + if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0) + zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename); + + brandname[0] = '\0'; + if (zone_get_brand(zonename, brandname, sizeof (brandname)) != 0) + zerror(zlogp, B_FALSE, "zone %s missing brand", zonename); + + /* all but the lx brand currently use /proc */ + if (strcmp(brandname, "lx") == 0) { + (void) snprintf(zoneproc, sizeof (zoneproc), + "%s/root/native/proc", zonepath); + } else { + (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc", + zonepath); + } + + (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log", + zonepath); + + res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL, + &mcap_tid); + if (res != 0) { + zerror(zlogp, B_FALSE, "error %d creating memory cap thread", + res); + mcap_tid = 0; + } +} + +void +destroy_mcap_thread() +{ + if (mcap_tid != 0) { + shutting_down = 1; + (void) cond_signal(&shutdown_cv); + (void) thr_join(mcap_tid, NULL, NULL); + mcap_tid = 0; + } +} diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c index b9954b81b3..e63f87d2a0 100644 --- a/usr/src/cmd/zoneadmd/vplat.c +++ b/usr/src/cmd/zoneadmd/vplat.c @@ -137,6 +137,9 @@ #define ALT_MOUNT(mount_cmd) ((mount_cmd) != Z_MNT_BOOT) +/* Number of times to retry unmounting if it fails */ +#define UMOUNT_RETRIES 30 + /* a reasonable estimate for the number of lwps per process */ #define LWPS_PER_PROCESS 10 @@ -165,6 +168,7 @@ extern int getnetmaskbyaddr(struct in_addr, struct in_addr *); /* from zoneadmd */ extern char query_hook[]; +extern char post_statechg_hook[]; /* * For each "net" resource configured in zonecfg, we track a zone_addr_list_t @@ -201,7 +205,7 @@ autofs_cleanup(zoneid_t zoneid) /* * Ask autofs to unmount all trigger nodes in the given zone. */ - return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid)); + return (_autofssys(AUTOFS_UNMOUNTALL, (void *)((uintptr_t)zoneid))); } static void @@ -592,6 +596,24 @@ root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved) } /* + * Perform brand-specific cleanup if we are unable to unmount a FS. + */ +static void +brand_umount_cleanup(zlog_t *zlogp, char *path) +{ + char cmdbuf[2 * MAXPATHLEN]; + + if (post_statechg_hook[0] == '\0') + return; + + if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook, + ZONE_STATE_DOWN, Z_UNMOUNT, path) > sizeof (cmdbuf)) + return; + + (void) do_subproc(zlogp, cmdbuf, NULL, B_FALSE); +} + +/* * The general strategy for unmounting filesystems is as follows: * * - Remote filesystems may be dead, and attempting to contact them as @@ -624,6 +646,7 @@ static int unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd) { int error = 0; + int fail = 0; FILE *mnttab; struct mnttab *mnts; uint_t nmnt; @@ -711,18 +734,39 @@ unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd) if (umount2(path, MS_FORCE) == 0) { unmounted = B_TRUE; stuck = B_FALSE; + fail = 0; } else { /* - * The first failure indicates a - * mount we won't be able to get - * rid of automatically, so we - * bail. + * We may hit a failure here if there + * is an app in the GZ with an open + * pipe into the zone (commonly into + * the zone's /var/run). This type + * of app will notice the closed + * connection and cleanup, but it may + * take a while and we have no easy + * way to notice that. To deal with + * this case, we will wait and retry + * a few times before we give up. */ - error++; - zerror(zlogp, B_FALSE, - "unable to unmount '%s'", path); - free_mnttable(mnts, nmnt); - goto out; + fail++; + if (fail < (UMOUNT_RETRIES - 1)) { + zerror(zlogp, B_FALSE, + "unable to unmount '%s', " + "retrying in 2 seconds", + path); + (void) sleep(2); + } else if (fail > UMOUNT_RETRIES) { + error++; + zerror(zlogp, B_FALSE, + "unmount of '%s' failed", + path); + free_mnttable(mnts, nmnt); + goto out; + } else { + /* Try the hook 2 times */ + brand_umount_cleanup(zlogp, + path); + } } } /* @@ -1060,23 +1104,10 @@ mount_one_dev_symlink_cb(void *arg, const char *source, const char *target) int vplat_get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep) { - zone_dochandle_t handle; - - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - if (zonecfg_get_iptype(handle, iptypep) != Z_OK) { + if (zonecfg_get_iptype(snap_hndl, iptypep) != Z_OK) { zerror(zlogp, B_FALSE, "invalid ip-type configuration"); - zonecfg_fini_handle(handle); return (-1); } - zonecfg_fini_handle(handle); return (0); } @@ -1089,14 +1120,13 @@ static int mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd) { char brand[MAXNAMELEN]; - zone_dochandle_t handle = NULL; brand_handle_t bh = NULL; struct zone_devtab ztab; di_prof_t prof = NULL; int err; int retval = -1; zone_iptype_t iptype; - const char *curr_iptype; + const char *curr_iptype = NULL; if (di_prof_init(devpath, &prof)) { zerror(zlogp, B_TRUE, "failed to initialize profile"); @@ -1131,6 +1161,8 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd) curr_iptype = "exclusive"; break; } + if (curr_iptype == NULL) + abort(); if (brand_platform_iter_devices(bh, zone_name, mount_one_dev_device_cb, prof, curr_iptype) != 0) { @@ -1145,28 +1177,19 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd) } /* Add user-specified devices and directories */ - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_FALSE, "can't initialize zone handle"); - goto cleanup; - } - if (err = zonecfg_get_handle(zone_name, handle)) { - zerror(zlogp, B_FALSE, "can't get handle for zone " - "%s: %s", zone_name, zonecfg_strerror(err)); - goto cleanup; - } - if (err = zonecfg_setdevent(handle)) { + if ((err = zonecfg_setdevent(snap_hndl)) != 0) { zerror(zlogp, B_FALSE, "%s: %s", zone_name, zonecfg_strerror(err)); goto cleanup; } - while (zonecfg_getdevent(handle, &ztab) == Z_OK) { + while (zonecfg_getdevent(snap_hndl, &ztab) == Z_OK) { if (di_prof_add_dev(prof, ztab.zone_dev_match)) { zerror(zlogp, B_TRUE, "failed to add " "user-specified device"); goto cleanup; } } - (void) zonecfg_enddevent(handle); + (void) zonecfg_enddevent(snap_hndl); /* Send profile to kernel */ if (di_prof_commit(prof)) { @@ -1179,8 +1202,6 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd) cleanup: if (bh != NULL) brand_close(bh); - if (handle != NULL) - zonecfg_fini_handle(handle); if (prof) di_prof_fini(prof); return (retval); @@ -1675,7 +1696,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) char luroot[MAXPATHLEN]; int i, num_fs = 0; struct zone_fstab *fs_ptr = NULL; - zone_dochandle_t handle = NULL; zone_state_t zstate; brand_handle_t bh; plat_gmount_cb_data_t cb; @@ -1699,12 +1719,7 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) goto bad; } - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - goto bad; - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK || - zonecfg_setfsent(handle) != Z_OK) { + if (zonecfg_setfsent(snap_hndl) != Z_OK) { zerror(zlogp, B_FALSE, "invalid configuration"); goto bad; } @@ -1722,7 +1737,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) /* Get a handle to the brand info for this zone */ if ((bh = brand_open(brand)) == NULL) { zerror(zlogp, B_FALSE, "unable to determine zone brand"); - zonecfg_fini_handle(handle); return (-1); } @@ -1737,7 +1751,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) plat_gmount_cb, &cb) != 0) { zerror(zlogp, B_FALSE, "unable to mount filesystems"); brand_close(bh); - zonecfg_fini_handle(handle); return (-1); } brand_close(bh); @@ -1748,13 +1761,10 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) * higher level directories (e.g., /usr) get mounted before * any beneath them (e.g., /usr/local). */ - if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs, + if (mount_filesystems_fsent(snap_hndl, zlogp, &fs_ptr, &num_fs, mount_cmd) != 0) goto bad; - zonecfg_fini_handle(handle); - handle = NULL; - /* * Normally when we mount a zone all the zone filesystems * get mounted relative to rootpath, which is usually @@ -1833,8 +1843,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd) return (0); bad: - if (handle != NULL) - zonecfg_fini_handle(handle); free_fs_data(fs_ptr, num_fs); return (-1); } @@ -2190,13 +2198,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id, if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) { /* * Here, we know that the interface can't be brought up. - * A similar warning message was already printed out to - * the console by zoneadm(1M) so instead we log the - * message to syslog and continue. */ - zerror(&logsys, B_TRUE, "WARNING: skipping network interface " - "'%s' which may not be present/plumbed in the " - "global zone.", lifr.lifr_name); (void) close(s); return (Z_OK); } @@ -2409,7 +2411,6 @@ bad: static int configure_shared_network_interfaces(zlog_t *zlogp) { - zone_dochandle_t handle; struct zone_nwiftab nwiftab, loopback_iftab; zoneid_t zoneid; @@ -2418,29 +2419,19 @@ configure_shared_network_interfaces(zlog_t *zlogp) return (-1); } - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - if (zonecfg_setnwifent(handle) == Z_OK) { + if (zonecfg_setnwifent(snap_hndl) == Z_OK) { for (;;) { - if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK) + if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK) break; + nwifent_free_attrs(&nwiftab); if (configure_one_interface(zlogp, zoneid, &nwiftab) != Z_OK) { - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); + (void) zonecfg_endnwifent(snap_hndl); return (-1); } } - (void) zonecfg_endnwifent(handle); + (void) zonecfg_endnwifent(snap_hndl); } - zonecfg_fini_handle(handle); if (is_system_labeled()) { /* * Labeled zones share the loopback interface @@ -2894,7 +2885,6 @@ free_ip_interface(zone_addr_list_t *zalist) static int configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) { - zone_dochandle_t handle; struct zone_nwiftab nwiftab; char rootpath[MAXPATHLEN]; char path[MAXPATHLEN]; @@ -2903,30 +2893,18 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) boolean_t added = B_FALSE; zone_addr_list_t *zalist = NULL, *new; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - - if (zonecfg_setnwifent(handle) != Z_OK) { - zonecfg_fini_handle(handle); + if (zonecfg_setnwifent(snap_hndl) != Z_OK) return (0); - } for (;;) { - if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK) + if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK) break; + nwifent_free_attrs(&nwiftab); if (prof == NULL) { if (zone_get_devroot(zone_name, rootpath, sizeof (rootpath)) != Z_OK) { - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); + (void) zonecfg_endnwifent(snap_hndl); zerror(zlogp, B_TRUE, "unable to determine dev root"); return (-1); @@ -2934,8 +2912,7 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) (void) snprintf(path, sizeof (path), "%s%s", rootpath, "/dev"); if (di_prof_init(path, &prof) != 0) { - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); + (void) zonecfg_endnwifent(snap_hndl); zerror(zlogp, B_TRUE, "failed to initialize profile"); return (-1); @@ -2959,17 +2936,17 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) nwiftab.zone_nwif_physical) == 0) { added = B_TRUE; } else { - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); - zerror(zlogp, B_TRUE, "failed to add network device"); - return (-1); + /* + * Failed to add network device, but the brand hook + * might be doing this for us, so keep silent. + */ + continue; } /* set up the new IP interface, and add them all later */ new = malloc(sizeof (*new)); if (new == NULL) { zerror(zlogp, B_TRUE, "no memory for %s", nwiftab.zone_nwif_physical); - zonecfg_fini_handle(handle); free_ip_interface(zalist); } bzero(new, sizeof (*new)); @@ -2979,16 +2956,14 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) } if (zalist != NULL) { if ((errno = add_net(zlogp, zoneid, zalist)) != 0) { - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); + (void) zonecfg_endnwifent(snap_hndl); zerror(zlogp, B_TRUE, "failed to add address"); free_ip_interface(zalist); return (-1); } free_ip_interface(zalist); } - (void) zonecfg_endnwifent(handle); - zonecfg_fini_handle(handle); + (void) zonecfg_endnwifent(snap_hndl); if (prof != NULL && added) { if (di_prof_commit(prof) != 0) { @@ -3124,48 +3099,23 @@ remove_datalink_protect(zlog_t *zlogp, zoneid_t zoneid) /* datalink does not belong to the GZ */ continue; } - if (dlstatus != DLADM_STATUS_OK) { + if (dlstatus != DLADM_STATUS_OK) zerror(zlogp, B_FALSE, + "clear 'protection' link property: %s", dladm_status2str(dlstatus, dlerr)); - free(dllinks); - return (-1); - } + dlstatus = dladm_set_linkprop(dld_handle, *dllink, "allowed-ips", NULL, 0, DLADM_OPT_ACTIVE); - if (dlstatus != DLADM_STATUS_OK) { + if (dlstatus != DLADM_STATUS_OK) zerror(zlogp, B_FALSE, + "clear 'allowed-ips' link property: %s", dladm_status2str(dlstatus, dlerr)); - free(dllinks); - return (-1); - } } free(dllinks); return (0); } static int -unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid) -{ - int dlnum = 0; - - /* - * The kernel shutdown callback for the dls module should have removed - * all datalinks from this zone. If any remain, then there's a - * problem. - */ - if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) { - zerror(zlogp, B_TRUE, "unable to list network interfaces"); - return (-1); - } - if (dlnum != 0) { - zerror(zlogp, B_FALSE, - "datalinks remain in zone after shutdown"); - return (-1); - } - return (0); -} - -static int tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid, const struct sockaddr_storage *local, const struct sockaddr_storage *remote) { @@ -3247,26 +3197,14 @@ static int get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd) { int error = -1; - zone_dochandle_t handle; char *privname = NULL; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - if (ALT_MOUNT(mount_cmd)) { zone_iptype_t iptype; - const char *curr_iptype; + const char *curr_iptype = NULL; - if (zonecfg_get_iptype(handle, &iptype) != Z_OK) { + if (zonecfg_get_iptype(snap_hndl, &iptype) != Z_OK) { zerror(zlogp, B_TRUE, "unable to determine ip-type"); - zonecfg_fini_handle(handle); return (-1); } @@ -3279,17 +3217,15 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd) break; } - if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) { - zonecfg_fini_handle(handle); + if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) return (0); - } + zerror(zlogp, B_FALSE, "failed to determine the zone's default privilege set"); - zonecfg_fini_handle(handle); return (-1); } - switch (zonecfg_get_privset(handle, privs, &privname)) { + switch (zonecfg_get_privset(snap_hndl, privs, &privname)) { case Z_OK: error = 0; break; @@ -3312,7 +3248,6 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd) } free(privname); - zonecfg_fini_handle(handle); return (error); } @@ -3325,7 +3260,6 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep) nvlist_t **nvlv = NULL; int rctlcount = 0; int error = -1; - zone_dochandle_t handle; struct zone_rctltab rctltab; rctlblk_t *rctlblk = NULL; uint64_t maxlwps; @@ -3334,16 +3268,6 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep) *bufp = NULL; *bufsizep = 0; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - rctltab.zone_rctl_valptr = NULL; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc"); @@ -3356,18 +3280,18 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep) * max-processes property. If only the max-processes property is set, * we add a max-lwps property with a limit derived from max-processes. */ - if (zonecfg_get_aliased_rctl(handle, ALIAS_MAXPROCS, &maxprocs) + if (zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXPROCS, &maxprocs) == Z_OK && - zonecfg_get_aliased_rctl(handle, ALIAS_MAXLWPS, &maxlwps) + zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXLWPS, &maxlwps) == Z_NO_ENTRY) { - if (zonecfg_set_aliased_rctl(handle, ALIAS_MAXLWPS, + if (zonecfg_set_aliased_rctl(snap_hndl, ALIAS_MAXLWPS, maxprocs * LWPS_PER_PROCESS) != Z_OK) { zerror(zlogp, B_FALSE, "unable to set max-lwps alias"); goto out; } } - if (zonecfg_setrctlent(handle) != Z_OK) { + if (zonecfg_setrctlent(snap_hndl) != Z_OK) { zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent"); goto out; } @@ -3376,7 +3300,7 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep) zerror(zlogp, B_TRUE, "memory allocation failed"); goto out; } - while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) { + while (zonecfg_getrctlent(snap_hndl, &rctltab) == Z_OK) { struct zone_rctlvaltab *rctlval; uint_t i, count; const char *name = rctltab.zone_rctl_name; @@ -3458,7 +3382,7 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep) nvlv = NULL; rctlcount++; } - (void) zonecfg_endrctlent(handle); + (void) zonecfg_endrctlent(snap_hndl); if (rctlcount == 0) { error = 0; @@ -3483,8 +3407,6 @@ out: nvlist_free(nvl); if (nvlv != NULL) free(nvlv); - if (handle != NULL) - zonecfg_fini_handle(handle); return (error); } @@ -3500,7 +3422,7 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr) > sizeof (cmdbuf)) return (-1); - if (do_subproc(zlogp, cmdbuf, retstr) != 0) + if (do_subproc(zlogp, cmdbuf, retstr, B_FALSE) != 0) return (-1); return (0); @@ -3509,7 +3431,6 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr) static int get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) { - zone_dochandle_t handle; struct zone_dstab dstab; size_t total, offset, len; int error = -1; @@ -3520,30 +3441,20 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) *bufp = NULL; *bufsizep = 0; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - if (get_implicit_datasets(zlogp, &implicit_datasets) != 0) { zerror(zlogp, B_FALSE, "getting implicit datasets failed"); goto out; } - if (zonecfg_setdsent(handle) != Z_OK) { + if (zonecfg_setdsent(snap_hndl) != Z_OK) { zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent"); goto out; } total = 0; - while (zonecfg_getdsent(handle, &dstab) == Z_OK) + while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) total += strlen(dstab.zone_dataset_name) + 1; - (void) zonecfg_enddsent(handle); + (void) zonecfg_enddsent(snap_hndl); if (implicit_datasets != NULL) implicit_len = strlen(implicit_datasets); @@ -3560,12 +3471,12 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) goto out; } - if (zonecfg_setdsent(handle) != Z_OK) { + if (zonecfg_setdsent(snap_hndl) != Z_OK) { zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent"); goto out; } offset = 0; - while (zonecfg_getdsent(handle, &dstab) == Z_OK) { + while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) { len = strlen(dstab.zone_dataset_name); (void) strlcpy(str + offset, dstab.zone_dataset_name, total - offset); @@ -3573,7 +3484,7 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) if (offset < total - 1) str[offset++] = ','; } - (void) zonecfg_enddsent(handle); + (void) zonecfg_enddsent(snap_hndl); if (implicit_len > 0) (void) strlcpy(str + offset, implicit_datasets, total - offset); @@ -3585,8 +3496,6 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep) out: if (error != 0 && str != NULL) free(str); - if (handle != NULL) - zonecfg_fini_handle(handle); if (implicit_datasets != NULL) free(implicit_datasets); @@ -3596,40 +3505,26 @@ out: static int validate_datasets(zlog_t *zlogp) { - zone_dochandle_t handle; struct zone_dstab dstab; zfs_handle_t *zhp; libzfs_handle_t *hdl; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (-1); - } - if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) { + if (zonecfg_setdsent(snap_hndl) != Z_OK) { zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (-1); - } - - if (zonecfg_setdsent(handle) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); return (-1); } if ((hdl = libzfs_init()) == NULL) { zerror(zlogp, B_FALSE, "opening ZFS library"); - zonecfg_fini_handle(handle); return (-1); } - while (zonecfg_getdsent(handle, &dstab) == Z_OK) { + while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) { if ((zhp = zfs_open(hdl, dstab.zone_dataset_name, ZFS_TYPE_FILESYSTEM)) == NULL) { zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'", dstab.zone_dataset_name); - zonecfg_fini_handle(handle); libzfs_fini(hdl); return (-1); } @@ -3644,7 +3539,6 @@ validate_datasets(zlog_t *zlogp) zerror(zlogp, B_FALSE, "cannot set 'zoned' " "property for ZFS dataset '%s'\n", dstab.zone_dataset_name); - zonecfg_fini_handle(handle); zfs_close(zhp); libzfs_fini(hdl); return (-1); @@ -3652,9 +3546,8 @@ validate_datasets(zlog_t *zlogp) zfs_close(zhp); } - (void) zonecfg_enddsent(handle); + (void) zonecfg_enddsent(snap_hndl); - zonecfg_fini_handle(handle); libzfs_fini(hdl); return (0); @@ -4388,62 +4281,25 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath) } /* - * Set memory cap and pool info for the zone's resource management - * configuration. + * Set pool info for the zone's resource management configuration. */ static int setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) { int res; uint64_t tmp; - struct zone_mcaptab mcap; char sched[MAXNAMELEN]; - zone_dochandle_t handle = NULL; char pool_err[128]; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (Z_BAD_HANDLE); - } - - if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); - zonecfg_fini_handle(handle); - return (res); - } - - /* - * If a memory cap is configured, set the cap in the kernel using - * zone_setattr() and make sure the rcapd SMF service is enabled. - */ - if (zonecfg_getmcapent(handle, &mcap) == Z_OK) { - uint64_t num; - char smf_err[128]; - - num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10); - if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) { - zerror(zlogp, B_TRUE, "could not set zone memory cap"); - zonecfg_fini_handle(handle); - return (Z_INVAL); - } - - if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) { - zerror(zlogp, B_FALSE, "enabling system/rcap service " - "failed: %s", smf_err); - zonecfg_fini_handle(handle); - return (Z_INVAL); - } - } - /* Get the scheduling class set in the zone configuration. */ - if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK && + if (zonecfg_get_sched_class(snap_hndl, sched, sizeof (sched)) == Z_OK && strlen(sched) > 0) { if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched, strlen(sched)) == -1) zerror(zlogp, B_TRUE, "WARNING: unable to set the " "default scheduling class"); - } else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp) + } else if (zonecfg_get_aliased_rctl(snap_hndl, ALIAS_SHARES, &tmp) == Z_OK) { /* * If the zone has the zone.cpu-shares rctl set then we want to @@ -4454,7 +4310,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) */ char class_name[PC_CLNMSZ]; - if (zonecfg_get_dflt_sched_class(handle, class_name, + if (zonecfg_get_dflt_sched_class(snap_hndl, class_name, sizeof (class_name)) != Z_OK) { zerror(zlogp, B_FALSE, "WARNING: unable to determine " "the zone's scheduling class"); @@ -4487,7 +4343,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) * right thing in all cases (reuse or create) based on the current * zonecfg. */ - if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err, + if ((res = zonecfg_bind_tmp_pool(snap_hndl, zoneid, pool_err, sizeof (pool_err))) != Z_OK) { if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND) zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting " @@ -4496,14 +4352,13 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) else zerror(zlogp, B_FALSE, "could not bind zone to " "temporary pool: %s", zonecfg_strerror(res)); - zonecfg_fini_handle(handle); return (Z_POOL_BIND); } /* * Check if we need to warn about poold not being enabled. */ - if (zonecfg_warn_poold(handle)) { + if (zonecfg_warn_poold(snap_hndl)) { zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has " "been specified\nbut the dynamic pool service is not " "enabled.\nThe system will not dynamically adjust the\n" @@ -4513,7 +4368,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) } /* The following is a warning, not an error. */ - if ((res = zonecfg_bind_pool(handle, zoneid, pool_err, + if ((res = zonecfg_bind_pool(snap_hndl, zoneid, pool_err, sizeof (pool_err))) != Z_OK) { if (res == Z_POOL_BIND) zerror(zlogp, B_FALSE, "WARNING: unable to bind to " @@ -4527,10 +4382,9 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid) } /* Update saved pool name in case it has changed */ - (void) zonecfg_get_poolname(handle, zone_name, pool_name, + (void) zonecfg_get_poolname(snap_hndl, zone_name, pool_name, sizeof (pool_name)); - zonecfg_fini_handle(handle); return (Z_OK); } @@ -4631,33 +4485,28 @@ setup_zone_fs_allowed(zone_dochandle_t handle, zlog_t *zlogp, zoneid_t zoneid) } static int -setup_zone_attrs(zlog_t *zlogp, char *zone_namep, zoneid_t zoneid) +setup_zone_attrs(zlog_t *zlogp, zoneid_t zoneid) { - zone_dochandle_t handle; int res = Z_OK; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, "getting zone configuration handle"); - return (Z_BAD_HANDLE); - } - if ((res = zonecfg_get_snapshot_handle(zone_namep, handle)) != Z_OK) { - zerror(zlogp, B_FALSE, "invalid configuration"); + if ((res = setup_zone_hostid(snap_hndl, zlogp, zoneid)) != Z_OK) goto out; - } - if ((res = setup_zone_hostid(handle, zlogp, zoneid)) != Z_OK) - goto out; - - if ((res = setup_zone_fs_allowed(handle, zlogp, zoneid)) != Z_OK) + if ((res = setup_zone_fs_allowed(snap_hndl, zlogp, zoneid)) != Z_OK) goto out; out: - zonecfg_fini_handle(handle); return (res); } +/* + * The zone_did is a persistent debug ID. Each zone should have a unique ID + * in the kernel. This is used for things like DTrace which want to monitor + * zones across reboots. They can't use the zoneid since that changes on + * each boot. + */ zoneid_t -vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd) +vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zone_did) { zoneid_t rval = -1; priv_set_t *privs; @@ -4673,7 +4522,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd) tsol_zcent_t *zcent = NULL; int match = 0; int doi = 0; - int flags; + int flags = -1; zone_iptype_t iptype; if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) { @@ -4695,6 +4544,8 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd) flags = ZCF_NET_EXCL; break; } + if (flags == -1) + abort(); if ((privs = priv_allocset()) == NULL) { zerror(zlogp, B_TRUE, "%s failed", "priv_allocset"); @@ -4798,7 +4649,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd) xerr = 0; if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf, rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel, - flags)) == -1) { + flags, zone_did)) == -1) { if (xerr == ZE_AREMOUNTS) { if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) { zerror(zlogp, B_FALSE, @@ -4844,7 +4695,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd) struct brand_attr attr; char modname[MAXPATHLEN]; - if (setup_zone_attrs(zlogp, zone_name, zoneid) != Z_OK) + if (setup_zone_attrs(zlogp, zoneid) != Z_OK) goto error; if ((bh = brand_open(brand_name)) == NULL) { @@ -4902,6 +4753,8 @@ error: } if (rctlbuf != NULL) free(rctlbuf); + if (zfsbuf != NULL) + free(zfsbuf); priv_freeset(privs); if (fp != NULL) zonecfg_close_scratch(fp); @@ -5044,6 +4897,8 @@ vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid) return (-1); } break; + default: + abort(); } } @@ -5119,7 +4974,8 @@ unmounted: } int -vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) +vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, + boolean_t debug) { char *kzone; zoneid_t zoneid; @@ -5158,16 +5014,12 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) goto error; } - if (remove_datalink_pool(zlogp, zoneid) != 0) { + if (remove_datalink_pool(zlogp, zoneid) != 0) zerror(zlogp, B_FALSE, "unable clear datalink pool property"); - goto error; - } - if (remove_datalink_protect(zlogp, zoneid) != 0) { + if (remove_datalink_protect(zlogp, zoneid) != 0) zerror(zlogp, B_FALSE, "unable clear datalink protect property"); - goto error; - } /* * The datalinks assigned to the zone will be removed from the NGZ as @@ -5207,7 +5059,7 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) brand_close(bh); if ((strlen(cmdbuf) > EXEC_LEN) && - (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) { + (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) { zerror(zlogp, B_FALSE, "%s failed", cmdbuf); goto error; } @@ -5239,12 +5091,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) } break; case ZS_EXCLUSIVE: - if (unconfigure_exclusive_network_interfaces(zlogp, - zoneid) != 0) { - zerror(zlogp, B_FALSE, "unable to unconfigure " - "network interfaces in zone"); - goto error; - } status = dladm_zone_halt(dld_handle, zoneid); if (status != DLADM_STATUS_OK) { zerror(zlogp, B_FALSE, "unable to notify " @@ -5281,14 +5127,9 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting) if (rebooting) { struct zone_psettab pset_tab; - zone_dochandle_t handle; - if ((handle = zonecfg_init_handle()) != NULL && - zonecfg_get_handle(zone_name, handle) == Z_OK && - zonecfg_lookup_pset(handle, &pset_tab) == Z_OK) + if (zonecfg_lookup_pset(snap_hndl, &pset_tab) == Z_OK) destroy_tmp_pool = B_FALSE; - - zonecfg_fini_handle(handle); } if (destroy_tmp_pool) { diff --git a/usr/src/cmd/zoneadmd/zcons.c b/usr/src/cmd/zoneadmd/zcons.c index 963bfd3100..d5a33133b7 100644 --- a/usr/src/cmd/zoneadmd/zcons.c +++ b/usr/src/cmd/zoneadmd/zcons.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* @@ -117,9 +117,10 @@ #define CONSOLE_SOCKPATH ZONES_TMPDIR "/%s.console_sock" +#define ZCONS_RETRY 10 + static int serverfd = -1; /* console server unix domain socket fd */ char boot_args[BOOTARGS_MAX]; -char bad_boot_arg[BOOTARGS_MAX]; /* * The eventstream is a simple one-directional flow of messages from the @@ -129,7 +130,10 @@ char bad_boot_arg[BOOTARGS_MAX]; */ static int eventstream[2]; - +/* flag used to cope with race creating master zcons devlink */ +static boolean_t master_zcons_failed = B_FALSE; +/* flag to track if we've seen a state change when there is no master zcons */ +static boolean_t state_changed = B_FALSE; int eventstream_init() @@ -321,7 +325,7 @@ destroy_console_devs(zlog_t *zlogp) * interfaces to instantiate a new zone console node. We do a lot of * sanity checking, and are careful to reuse a console if one exists. * - * Once the device is in the device tree, we kick devfsadm via di_init_devs() + * Once the device is in the device tree, we kick devfsadm via di_devlink_init() * to ensure that the appropriate symlinks (to the master and slave console * devices) are placed in /dev in the global zone. */ @@ -407,43 +411,63 @@ devlinks: * Open the master side of the console and issue the ZC_HOLDSLAVE ioctl, * which will cause the master to retain a reference to the slave. * This prevents ttymon from blowing through the slave's STREAMS anchor. + * + * In very rare cases the open returns ENOENT if devfs doesn't have + * everything setup yet due to heavy zone startup load. Wait for + * 1 sec. and retry a few times. Even if we can't setup the zone's + * console, we still go ahead and boot the zone. */ (void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s", zone_name, ZCONS_MASTER_NAME); - if ((masterfd = open(conspath, O_RDWR | O_NOCTTY)) == -1) { + for (i = 0; i < ZCONS_RETRY; i++) { + masterfd = open(conspath, O_RDWR | O_NOCTTY); + if (masterfd >= 0 || errno != ENOENT) + break; + (void) sleep(1); + } + if (masterfd == -1) { zerror(zlogp, B_TRUE, "ERROR: could not open master side of " "zone console for %s to acquire slave handle", zone_name); - goto error; + master_zcons_failed = B_TRUE; } + (void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s", zone_name, ZCONS_SLAVE_NAME); - if ((slavefd = open(conspath, O_RDWR | O_NOCTTY)) == -1) { + for (i = 0; i < ZCONS_RETRY; i++) { + slavefd = open(conspath, O_RDWR | O_NOCTTY); + if (slavefd >= 0 || errno != ENOENT) + break; + (void) sleep(1); + } + if (slavefd == -1) zerror(zlogp, B_TRUE, "ERROR: could not open slave side of zone" " console for %s to acquire slave handle", zone_name); - (void) close(masterfd); - goto error; - } + /* * This ioctl can occasionally return ENXIO if devfs doesn't have * everything plumbed up yet due to heavy zone startup load. Wait for * 1 sec. and retry a few times before we fail to boot the zone. */ - for (i = 0; i < 5; i++) { - if (ioctl(masterfd, ZC_HOLDSLAVE, (caddr_t)(intptr_t)slavefd) - == 0) { - rv = 0; - break; - } else if (errno != ENXIO) { - break; + if (masterfd != -1 && slavefd != -1) { + for (i = 0; i < ZCONS_RETRY; i++) { + if (ioctl(masterfd, ZC_HOLDSLAVE, + (caddr_t)(intptr_t)slavefd) == 0) { + rv = 0; + break; + } else if (errno != ENXIO) { + break; + } + (void) sleep(1); } - (void) sleep(1); + if (rv != 0) + zerror(zlogp, B_TRUE, "ERROR: error while acquiring " + "slave handle of zone console for %s", zone_name); } - if (rv != 0) - zerror(zlogp, B_TRUE, "ERROR: error while acquiring slave " - "handle of zone console for %s", zone_name); - (void) close(slavefd); - (void) close(masterfd); + if (slavefd != -1) + (void) close(slavefd); + if (masterfd != -1) + (void) close(masterfd); error: if (ddef_hdl) @@ -653,14 +677,6 @@ event_message(int clifd, char *clilocale, zone_evt_t evt) case Z_EVT_ZONE_BOOTFAILED: str = "NOTICE: Zone boot failed"; break; - case Z_EVT_ZONE_BADARGS: - /*LINTED*/ - (void) snprintf(lmsg, sizeof (lmsg), - localize_msg(clilocale, - "WARNING: Ignoring invalid boot arguments: %s"), - bad_boot_arg); - lstr = lmsg; - break; default: return; } @@ -854,7 +870,6 @@ init_console(zlog_t *zlogp) if (init_console_dev(zlogp) == -1) { zerror(zlogp, B_FALSE, "console setup: device initialization failed"); - return (-1); } if ((serverfd = init_console_sock(zlogp)) == -1) { @@ -866,6 +881,17 @@ init_console(zlog_t *zlogp) } /* + * Maintain a simple flag that tracks if we have seen at least one state + * change. This is currently only used to handle the special case where we are + * running without a console device, which is what normally drives shutdown. + */ +void +zcons_statechanged() +{ + state_changed = B_TRUE; +} + +/* * serve_console() is the master loop for driving console I/O. It is also the * routine which is ultimately responsible for "pulling the plug" on zoneadmd * when it realizes that the daemon should shut down. @@ -883,6 +909,7 @@ serve_console(zlog_t *zlogp) int masterfd; zone_state_t zstate; char conspath[MAXPATHLEN]; + static boolean_t cons_warned = B_FALSE; (void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s", zone_name, ZCONS_MASTER_NAME); @@ -890,6 +917,46 @@ serve_console(zlog_t *zlogp) for (;;) { masterfd = open(conspath, O_RDWR|O_NONBLOCK|O_NOCTTY); if (masterfd == -1) { + if (master_zcons_failed) { + /* + * If we don't have a console and the zone is + * not shutting down, there may have been a + * race/failure with devfs while creating the + * console. In this case we want to leave the + * zone up, even without a console, so + * periodically recheck. + */ + int i; + + /* + * In the normal flow of this loop, we use + * do_console_io to give things a chance to get + * going first. However, in this case we can't + * use that, so we have to wait for at least + * one state change before checking the state. + */ + for (i = 0; i < 60; i++) { + if (state_changed) + break; + (void) sleep(1); + } + + if (i < 60 && zone_get_state(zone_name, + &zstate) == Z_OK && + (zstate == ZONE_STATE_READY || + zstate == ZONE_STATE_RUNNING)) { + if (!cons_warned) { + zerror(zlogp, B_FALSE, + "WARNING: missing zone " + "console for %s", + zone_name); + cons_warned = B_TRUE; + } + (void) sleep(ZCONS_RETRY); + continue; + } + } + zerror(zlogp, B_TRUE, "failed to open console master"); (void) mutex_lock(&lock); goto death; diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c index 7a9d88410d..bb53b01d16 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.c +++ b/usr/src/cmd/zoneadmd/zoneadmd.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2014, Joyent, Inc. All rights reserved. */ /* @@ -68,6 +69,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <sys/sysmacros.h> +#include <sys/time.h> #include <bsm/adt.h> #include <bsm/adt_event.h> @@ -108,6 +110,7 @@ static char *progname; char *zone_name; /* zone which we are managing */ +zone_dochandle_t snap_hndl; /* handle for snapshot created when ready */ char pool_name[MAXNAMELEN]; char default_brand[MAXNAMELEN]; char brand_name[MAXNAMELEN]; @@ -116,10 +119,11 @@ boolean_t zone_iscluster; boolean_t zone_islabeled; boolean_t shutdown_in_progress; static zoneid_t zone_id; +static zoneid_t zone_did = 0; dladm_handle_t dld_handle = NULL; -static char pre_statechg_hook[2 * MAXPATHLEN]; -static char post_statechg_hook[2 * MAXPATHLEN]; +char pre_statechg_hook[2 * MAXPATHLEN]; +char post_statechg_hook[2 * MAXPATHLEN]; char query_hook[2 * MAXPATHLEN]; zlog_t logsys; @@ -141,6 +145,9 @@ boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */ #define DEFAULT_LOCALE "C" +#define RSRC_NET "net" +#define RSRC_DEV "device" + static const char * z_cmd_name(zone_cmd_t zcmd) { @@ -257,34 +264,31 @@ zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...) } /* - * Emit a warning for any boot arguments which are unrecognized. Since - * Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we + * Since Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we * put the arguments into an argv style array, use getopt to process them, - * and put the resultant argument string back into outargs. + * and put the resultant argument string back into outargs. Non-Solaris brands + * may support alternate forms of boot arguments so we must handle that as well. * * During the filtering, we pull out any arguments which are truly "boot" * arguments, leaving only those which are to be passed intact to the * progenitor process. The one we support at the moment is -i, which * indicates to the kernel which program should be launched as 'init'. * - * A return of Z_INVAL indicates specifically that the arguments are - * not valid; this is a non-fatal error. Except for Z_OK, all other return - * values are treated as fatal. + * Except for Z_OK, all other return values are treated as fatal. */ static int filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs, - char *init_file, char *badarg) + char *init_file) { int argc = 0, argc_save; int i; - int err; + int err = Z_OK; char *arg, *lasts, **argv = NULL, **argv_save; char zonecfg_args[BOOTARGS_MAX]; char scratchargs[BOOTARGS_MAX], *sargs; char c; bzero(outargs, BOOTARGS_MAX); - bzero(badarg, BOOTARGS_MAX); /* * If the user didn't specify transient boot arguments, check @@ -292,25 +296,10 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs, * and use them if applicable. */ if (inargs == NULL || inargs[0] == '\0') { - zone_dochandle_t handle; - if ((handle = zonecfg_init_handle()) == NULL) { - zerror(zlogp, B_TRUE, - "getting zone configuration handle"); - return (Z_BAD_HANDLE); - } - err = zonecfg_get_snapshot_handle(zone_name, handle); - if (err != Z_OK) { - zerror(zlogp, B_FALSE, - "invalid configuration snapshot"); - zonecfg_fini_handle(handle); - return (Z_BAD_HANDLE); - } - bzero(zonecfg_args, sizeof (zonecfg_args)); - (void) zonecfg_get_bootargs(handle, zonecfg_args, + (void) zonecfg_get_bootargs(snap_hndl, zonecfg_args, sizeof (zonecfg_args)); inargs = zonecfg_args; - zonecfg_fini_handle(handle); } if (strlen(inargs) >= BOOTARGS_MAX) { @@ -390,36 +379,29 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs, break; case '?': /* - * We warn about unknown arguments but pass them - * along anyway-- if someone wants to develop their - * own init replacement, they can pass it whatever - * args they want. + * If a brand has its own init, we need to pass along + * whatever the user provides. We use the entire + * unknown string here so that we correctly handle + * unknown long options (e.g. --debug). */ - err = Z_INVAL; (void) snprintf(outargs, BOOTARGS_MAX, - "%s -%c", outargs, optopt); - (void) snprintf(badarg, BOOTARGS_MAX, - "%s -%c", badarg, optopt); + "%s %s", outargs, argv[optind - 1]); break; } } /* - * For Solaris Zones we warn about and discard non-option arguments. - * Hence 'boot foo bar baz gub' --> 'boot'. However, to be similar - * to the kernel, we concat up all the other remaining boot args. - * and warn on them as a group. + * We need to pass along everything else since we don't know what + * the brand's init is expecting. For example, an argument list like: + * --confdir /foo --debug + * will cause the getopt parsing to stop at '/foo' but we need to pass + * that on, along with the '--debug'. This does mean that we require + * any of our known options (-ifms) to preceed the brand-specific ones. */ - if (optind < argc) { - err = Z_INVAL; - while (optind < argc) { - (void) snprintf(badarg, BOOTARGS_MAX, "%s%s%s", - badarg, strlen(badarg) > 0 ? " " : "", - argv[optind]); - optind++; - } - zerror(zlogp, B_FALSE, "WARNING: Unused or invalid boot " - "arguments `%s'.", badarg); + while (optind < argc) { + (void) snprintf(outargs, BOOTARGS_MAX, "%s %s", outargs, + argv[optind]); + optind++; } done: @@ -458,7 +440,7 @@ mkzonedir(zlog_t *zlogp) * Run the brand's pre-state change callback, if it exists. */ static int -brand_prestatechg(zlog_t *zlogp, int state, int cmd) +brand_prestatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug) { char cmdbuf[2 * MAXPATHLEN]; const char *altroot; @@ -471,7 +453,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd) state, cmd, altroot) > sizeof (cmdbuf)) return (-1); - if (do_subproc(zlogp, cmdbuf, NULL) != 0) + if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0) return (-1); return (0); @@ -481,7 +463,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd) * Run the brand's post-state change callback, if it exists. */ static int -brand_poststatechg(zlog_t *zlogp, int state, int cmd) +brand_poststatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug) { char cmdbuf[2 * MAXPATHLEN]; const char *altroot; @@ -494,7 +476,7 @@ brand_poststatechg(zlog_t *zlogp, int state, int cmd) state, cmd, altroot) > sizeof (cmdbuf)) return (-1); - if (do_subproc(zlogp, cmdbuf, NULL) != 0) + if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0) return (-1); return (0); @@ -533,35 +515,44 @@ notify_zonestatd(zoneid_t zoneid) * subcommand. */ static int -zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate) +zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate, boolean_t debug) { int err; + boolean_t snapped = B_FALSE; - if (brand_prestatechg(zlogp, zstate, Z_READY) != 0) - return (-1); - + if ((snap_hndl = zonecfg_init_handle()) == NULL) { + zerror(zlogp, B_TRUE, "getting zone configuration handle"); + goto bad; + } if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) { zerror(zlogp, B_FALSE, "unable to create snapshot: %s", zonecfg_strerror(err)); goto bad; } + snapped = B_TRUE; - if ((zone_id = vplat_create(zlogp, mount_cmd)) == -1) { - if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) - zerror(zlogp, B_FALSE, "destroying snapshot: %s", - zonecfg_strerror(err)); + if (zonecfg_get_snapshot_handle(zone_name, snap_hndl) != Z_OK) { + zerror(zlogp, B_FALSE, "invalid configuration snapshot"); goto bad; } + + if (zone_did == 0) + zone_did = zone_get_did(zone_name); + + if (brand_prestatechg(zlogp, zstate, Z_READY, debug) != 0) + goto bad; + + if ((zone_id = vplat_create(zlogp, mount_cmd, zone_did)) == -1) + goto bad; + if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) { bringup_failure_recovery = B_TRUE; - (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE); - if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) - zerror(zlogp, B_FALSE, "destroying snapshot: %s", - zonecfg_strerror(err)); + (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE, + debug); goto bad; } - if (brand_poststatechg(zlogp, zstate, Z_READY) != 0) + if (brand_poststatechg(zlogp, zstate, Z_READY, debug) != 0) goto bad; return (0); @@ -571,7 +562,13 @@ bad: * If something goes wrong, we up the zones's state to the target * state, READY, and then invoke the hook as if we're halting. */ - (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT); + (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT, debug); + if (snapped) + if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) + zerror(zlogp, B_FALSE, "destroying snapshot: %s", + zonecfg_strerror(err)); + zonecfg_fini_handle(snap_hndl); + snap_hndl = NULL; return (-1); } @@ -686,6 +683,8 @@ mount_early_fs(void *data, const char *spec, const char *dir, char opt_buf[MAX_MNTOPT_STR]; int optlen = 0; int mflag = MS_DATA; + int i; + int ret; (void) ct_tmpl_clear(tmpl_fd); /* @@ -713,9 +712,26 @@ mount_early_fs(void *data, const char *spec, const char *dir, optlen = MAX_MNTOPT_STR; mflag = MS_OPTIONSTR; } - if (mount(spec, dir, mflag, fstype, NULL, 0, opt, optlen) != 0) - _exit(errno); - _exit(0); + + /* + * There is an obscure race condition which can cause mount + * to return EBUSY. This happens for example on the mount + * of the zone's /etc/svc/volatile file system if there is + * a GZ process running svcs -Z, which will touch the + * mountpoint, just as we're trying to do the mount. To cope + * with this, we retry up to 3 times to let this transient + * process get out of the way. + */ + for (i = 0; i < 3; i++) { + ret = 0; + if (mount(spec, dir, mflag, fstype, NULL, 0, opt, + optlen) != 0) + ret = errno; + if (ret != EBUSY) + break; + (void) sleep(1); + } + _exit(ret); } /* parent */ @@ -739,12 +755,150 @@ mount_early_fs(void *data, const char *spec, const char *dir, } /* + * env variable name format + * _ZONECFG_{resource name}_{identifying attr. name}_{property name} + * Any dashes (-) in the property names are replaced with underscore (_). + */ +static void +set_zonecfg_env(char *rsrc, char *attr, char *name, char *val) +{ + char *p; + char nm[MAXNAMELEN]; + + if (attr == NULL) + (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s", rsrc, + name); + else + (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s_%s", rsrc, + attr, name); + + p = nm; + while ((p = strchr(p, '-')) != NULL) + *p++ = '_'; + + (void) setenv(nm, val, 1); +} + +/* + * Export zonecfg network and device properties into environment for the boot + * and state change hooks. + * If debug is true, export the brand hook debug env. variable as well. + * + * We could export more of the config in the future, as necessary. + */ +static int +setup_subproc_env(boolean_t debug) +{ + int res; + struct zone_nwiftab ntab; + struct zone_devtab dtab; + struct zone_attrtab atab; + char net_resources[MAXNAMELEN * 2]; + char dev_resources[MAXNAMELEN * 2]; + + /* snap_hndl is null when called through the set_brand_env code path */ + if (snap_hndl == NULL) + return (Z_OK); + + net_resources[0] = '\0'; + if ((res = zonecfg_setnwifent(snap_hndl)) != Z_OK) + goto done; + + while (zonecfg_getnwifent(snap_hndl, &ntab) == Z_OK) { + struct zone_res_attrtab *rap; + char *phys; + + phys = ntab.zone_nwif_physical; + + (void) strlcat(net_resources, phys, sizeof (net_resources)); + (void) strlcat(net_resources, " ", sizeof (net_resources)); + + set_zonecfg_env(RSRC_NET, phys, "physical", phys); + + set_zonecfg_env(RSRC_NET, phys, "address", + ntab.zone_nwif_address); + set_zonecfg_env(RSRC_NET, phys, "allowed-address", + ntab.zone_nwif_allowed_address); + set_zonecfg_env(RSRC_NET, phys, "defrouter", + ntab.zone_nwif_defrouter); + set_zonecfg_env(RSRC_NET, phys, "global-nic", + ntab.zone_nwif_gnic); + set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac); + set_zonecfg_env(RSRC_NET, phys, "vlan-id", + ntab.zone_nwif_vlan_id); + + for (rap = ntab.zone_nwif_attrp; rap != NULL; + rap = rap->zone_res_attr_next) + set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name, + rap->zone_res_attr_value); + nwifent_free_attrs(&ntab); + } + + (void) setenv("_ZONECFG_net_resources", net_resources, 1); + + (void) zonecfg_endnwifent(snap_hndl); + + if ((res = zonecfg_setdevent(snap_hndl)) != Z_OK) + goto done; + + while (zonecfg_getdevent(snap_hndl, &dtab) == Z_OK) { + struct zone_res_attrtab *rap; + char *match; + + match = dtab.zone_dev_match; + + (void) strlcat(dev_resources, match, sizeof (dev_resources)); + (void) strlcat(dev_resources, " ", sizeof (dev_resources)); + + for (rap = dtab.zone_dev_attrp; rap != NULL; + rap = rap->zone_res_attr_next) + set_zonecfg_env(RSRC_DEV, match, + rap->zone_res_attr_name, rap->zone_res_attr_value); + } + + (void) zonecfg_enddevent(snap_hndl); + + if ((res = zonecfg_setattrent(snap_hndl)) != Z_OK) + goto done; + + while (zonecfg_getattrent(snap_hndl, &atab) == Z_OK) { + set_zonecfg_env("attr", NULL, atab.zone_attr_name, + atab.zone_attr_value); + } + + (void) zonecfg_endattrent(snap_hndl); + + if (debug) + (void) setenv("_ZONEADMD_brand_debug", "1", 1); + else + (void) setenv("_ZONEADMD_brand_debug", "", 1); + + res = Z_OK; + +done: + return (res); +} + +void +nwifent_free_attrs(struct zone_nwiftab *np) +{ + struct zone_res_attrtab *rap; + + for (rap = np->zone_nwif_attrp; rap != NULL; ) { + struct zone_res_attrtab *tp = rap; + + rap = rap->zone_res_attr_next; + free(tp); + } +} + +/* * If retstr is not NULL, the output of the subproc is returned in the str, * otherwise it is output using zerror(). Any memory allocated for retstr * should be freed by the caller. */ int -do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr) +do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr, boolean_t debug) { char buf[1024]; /* arbitrary large amount */ char *inbuf; @@ -763,6 +917,11 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr) inbuf = buf; } + if (setup_subproc_env(debug) != Z_OK) { + zerror(zlogp, B_FALSE, "failed to setup environment"); + return (-1); + } + file = popen(cmdbuf, "r"); if (file == NULL) { zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf); @@ -771,8 +930,13 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr) while (fgets(inbuf, 1024, file) != NULL) { if (retstr == NULL) { - if (zlogp != &logsys) + if (zlogp != &logsys) { + int last = strlen(inbuf) - 1; + + if (inbuf[last] == '\n') + inbuf[last] = '\0'; zerror(zlogp, B_FALSE, "%s", inbuf); + } } else { char *p; @@ -802,8 +966,51 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr) return (WEXITSTATUS(status)); } +/* + * Get the path for this zone's init(1M) (or equivalent) process. First look + * for a zone-specific init-name attr, then get it from the brand. + */ +static int +get_initname(brand_handle_t bh, char *initname, int len) +{ + struct zone_attrtab a; + + bzero(&a, sizeof (a)); + (void) strlcpy(a.zone_attr_name, "init-name", + sizeof (a.zone_attr_name)); + + if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) { + (void) strlcpy(initname, a.zone_attr_value, len); + return (0); + } + + return (brand_get_initname(bh, initname, len)); +} + +/* + * Get the restart-init flag for this zone's init(1M) (or equivalent) process. + * First look for a zone-specific restart-init attr, then get it from the brand. + */ +static boolean_t +restartinit(brand_handle_t bh) +{ + struct zone_attrtab a; + + bzero(&a, sizeof (a)); + (void) strlcpy(a.zone_attr_name, "restart-init", + sizeof (a.zone_attr_name)); + + if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) { + if (strcmp(a.zone_attr_value, "false") == 0) + return (B_FALSE); + return (B_TRUE); + } + + return (brand_restartinit(bh)); +} + static int -zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) +zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug) { zoneid_t zoneid; struct stat st; @@ -813,12 +1020,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) fs_callback_t cb; brand_handle_t bh; zone_iptype_t iptype; - boolean_t links_loaded = B_FALSE; dladm_status_t status; char errmsg[DLADM_STRSIZE]; int err; + boolean_t restart_init; - if (brand_prestatechg(zlogp, zstate, Z_BOOT) != 0) + if (brand_prestatechg(zlogp, zstate, Z_BOOT, debug) != 0) return (-1); if ((zoneid = getzoneidbyname(zone_name)) == -1) { @@ -866,20 +1073,20 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) } /* Get the path for this zone's init(1M) (or equivalent) process. */ - if (brand_get_initname(bh, init_file, MAXPATHLEN) != 0) { + if (get_initname(bh, init_file, MAXPATHLEN) != 0) { zerror(zlogp, B_FALSE, "unable to determine zone's init(1M) location"); brand_close(bh); goto bad; } + /* See if we should restart init if it dies. */ + restart_init = restartinit(bh); + brand_close(bh); - err = filter_bootargs(zlogp, bootargs, nbootargs, init_file, - bad_boot_arg); - if (err == Z_INVAL) - eventstream_write(Z_EVT_ZONE_BADARGS); - else if (err != Z_OK) + err = filter_bootargs(zlogp, bootargs, nbootargs, init_file); + if (err != Z_OK) goto bad; assert(init_file[0] != '\0'); @@ -915,7 +1122,6 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) " %s", dladm_status2str(status, errmsg)); goto bad; } - links_loaded = B_TRUE; } /* @@ -924,7 +1130,7 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) * is booted. */ if ((strlen(cmdbuf) > EXEC_LEN) && - (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) { + (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) { zerror(zlogp, B_FALSE, "%s failed", cmdbuf); goto bad; } @@ -939,6 +1145,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) goto bad; } + if (!restart_init && zone_setattr(zoneid, ZONE_ATTR_INITNORESTART, + NULL, 0) == -1) { + zerror(zlogp, B_TRUE, "could not set zone init-no-restart"); + goto bad; + } + /* * Inform zonestatd of a new zone so that it can install a door for * the zone to contact it. @@ -950,9 +1162,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate) goto bad; } - if (brand_poststatechg(zlogp, zstate, Z_BOOT) != 0) + if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0) goto bad; + /* Startup a thread to perform memory capping for the zone. */ + create_mcap_thread(zlogp, zone_id); + return (0); bad: @@ -960,32 +1175,38 @@ bad: * If something goes wrong, we up the zones's state to the target * state, RUNNING, and then invoke the hook as if we're halting. */ - (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT); - if (links_loaded) - (void) dladm_zone_halt(dld_handle, zoneid); + (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT, debug); + return (-1); } static int -zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate) +zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate, + boolean_t debug) { int err; - if (brand_prestatechg(zlogp, zstate, Z_HALT) != 0) + if (brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0) return (-1); - if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) { + /* Shutting down, stop the memcap thread */ + destroy_mcap_thread(); + + if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) { if (!bringup_failure_recovery) zerror(zlogp, B_FALSE, "unable to destroy zone"); return (-1); } + if (brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0) + return (-1); + if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK) zerror(zlogp, B_FALSE, "destroying snapshot: %s", zonecfg_strerror(err)); - if (brand_poststatechg(zlogp, zstate, Z_HALT) != 0) - return (-1); + zonecfg_fini_handle(snap_hndl); + snap_hndl = NULL; return (0); } @@ -1167,6 +1388,39 @@ audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val, } /* + * Log the exit time and status of the zone's init process into + * {zonepath}/lastexited. If the zone shutdown normally, the exit status will + * be -1, otherwise it will be the exit status as described in wait.3c. + * If the zone is configured to restart init, then nothing will be logged if + * init exits unexpectedly (the kernel will never upcall in this case). + */ +static void +log_init_exit(int status) +{ + char zpath[MAXPATHLEN]; + char p[MAXPATHLEN]; + char buf[128]; + struct timeval t; + int fd; + + if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) + return; + if (snprintf(p, sizeof (p), "%s/lastexited", zpath) > sizeof (p)) + return; + if (gettimeofday(&t, NULL) != 0) + return; + if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec, + status) > sizeof (buf)) + return; + if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) + return; + + (void) write(fd, buf, strlen(buf)); + + (void) close(fd); +} + +/* * The main routine for the door server that deals with zone state transitions. */ /* ARGSUSED */ @@ -1179,9 +1433,11 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, zone_state_t zstate; zone_cmd_t cmd; + boolean_t debug; + int init_status; zone_cmd_arg_t *zargp; - boolean_t kernelcall; + boolean_t kernelcall = B_TRUE; int rval = -1; uint64_t uniqid; @@ -1231,6 +1487,8 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, goto out; } cmd = zargp->cmd; + debug = zargp->debug; + init_status = zargp->status; if (door_ucred(&uc) != 0) { zerror(&logsys, B_TRUE, "door_ucred"); @@ -1337,23 +1595,25 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case ZONE_STATE_INSTALLED: switch (cmd) { case Z_READY: - rval = zone_ready(zlogp, Z_MNT_BOOT, zstate); + rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, debug); if (rval == 0) eventstream_write(Z_EVT_ZONE_READIED); + zcons_statechanged(); break; case Z_BOOT: case Z_FORCEBOOT: eventstream_write(Z_EVT_ZONE_BOOTING); - if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) - == 0) { + if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, + debug)) == 0) { rval = zone_bootup(zlogp, zargp->bootbuf, - zstate); + zstate, debug); } audit_put_record(zlogp, uc, rval, "boot"); + zcons_statechanged(); if (rval != 0) { bringup_failure_recovery = B_TRUE; (void) zone_halt(zlogp, B_FALSE, B_FALSE, - zstate); + zstate, debug); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } break; @@ -1405,7 +1665,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = zone_ready(zlogp, strcmp(zargp->bootbuf, "-U") == 0 ? - Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate); + Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate, debug); if (rval != 0) break; @@ -1467,15 +1727,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = 0; break; case Z_BOOT: + case Z_FORCEBOOT: (void) strlcpy(boot_args, zargp->bootbuf, sizeof (boot_args)); eventstream_write(Z_EVT_ZONE_BOOTING); - rval = zone_bootup(zlogp, zargp->bootbuf, zstate); + rval = zone_bootup(zlogp, zargp->bootbuf, zstate, + debug); audit_put_record(zlogp, uc, rval, "boot"); + zcons_statechanged(); if (rval != 0) { bringup_failure_recovery = B_TRUE; (void) zone_halt(zlogp, B_FALSE, B_TRUE, - zstate); + zstate, debug); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; @@ -1483,15 +1746,17 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_HALT: if (kernelcall) /* Invalid; can't happen */ abort(); - if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate)) - != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate, + debug)) != 0) break; + zcons_statechanged(); eventstream_write(Z_EVT_ZONE_HALTED); break; case Z_SHUTDOWN: case Z_REBOOT: case Z_NOTE_UNINSTALLING: case Z_MOUNT: + case Z_FORCEMOUNT: case Z_UNMOUNT: if (kernelcall) /* Invalid; can't happen */ abort(); @@ -1508,7 +1773,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case Z_UNMOUNT: if (kernelcall) /* Invalid; can't happen */ abort(); - rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate); + rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate, debug); if (rval == 0) { eventstream_write(Z_EVT_ZONE_HALTED); (void) sema_post(&scratch_sem); @@ -1530,15 +1795,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, case ZONE_STATE_DOWN: switch (cmd) { case Z_READY: - if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate)) - != 0) + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate, + debug)) != 0) break; - if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) == 0) + zcons_statechanged(); + if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, + debug)) == 0) eventstream_write(Z_EVT_ZONE_READIED); else eventstream_write(Z_EVT_ZONE_HALTED); break; case Z_BOOT: + case Z_FORCEBOOT: /* * We could have two clients racing to boot this * zone; the second client loses, but his request @@ -1549,32 +1817,40 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, rval = 0; break; case Z_HALT: - if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate)) - != 0) + if (kernelcall) { + log_init_exit(init_status); + } else { + log_init_exit(-1); + } + if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate, + debug)) != 0) break; eventstream_write(Z_EVT_ZONE_HALTED); + zcons_statechanged(); break; case Z_REBOOT: (void) strlcpy(boot_args, zargp->bootbuf, sizeof (boot_args)); eventstream_write(Z_EVT_ZONE_REBOOTING); - if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate)) - != 0) { + if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate, + debug)) != 0) { eventstream_write(Z_EVT_ZONE_BOOTFAILED); boot_args[0] = '\0'; break; } - if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) - != 0) { + zcons_statechanged(); + if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, + debug)) != 0) { eventstream_write(Z_EVT_ZONE_BOOTFAILED); boot_args[0] = '\0'; break; } - rval = zone_bootup(zlogp, zargp->bootbuf, zstate); + rval = zone_bootup(zlogp, zargp->bootbuf, zstate, + debug); audit_put_record(zlogp, uc, rval, "reboot"); if (rval != 0) { (void) zone_halt(zlogp, B_FALSE, B_TRUE, - zstate); + zstate, debug); eventstream_write(Z_EVT_ZONE_BOOTFAILED); } boot_args[0] = '\0'; @@ -1586,6 +1862,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp, break; case Z_NOTE_UNINSTALLING: case Z_MOUNT: + case Z_FORCEMOUNT: case Z_UNMOUNT: zerror(zlogp, B_FALSE, "%s operation is invalid " "for zones in state '%s'", z_cmd_name(cmd), @@ -1749,11 +2026,35 @@ top: * state. */ if (zstate > ZONE_STATE_INSTALLED) { + static zoneid_t zid; + zerror(zlogp, B_FALSE, "zone '%s': WARNING: zone is in state '%s', but " "zoneadmd does not appear to be available; " "restarted zoneadmd to recover.", zone_name, zone_state_str(zstate)); + + /* + * Startup a thread to perform memory capping for the + * zone. zlogp won't be valid for much longer so use + * logsys. + */ + if ((zid = getzoneidbyname(zone_name)) != -1) + create_mcap_thread(&logsys, zid); + + /* recover the global configuration snapshot */ + if (snap_hndl == NULL) { + if ((snap_hndl = zonecfg_init_handle()) + == NULL || + zonecfg_create_snapshot(zone_name) + != Z_OK || + zonecfg_get_snapshot_handle(zone_name, + snap_hndl) != Z_OK) { + zerror(zlogp, B_FALSE, "recovering " + "zone configuration handle"); + goto out; + } + } } (void) fdetach(zone_door_path); @@ -1767,6 +2068,52 @@ out: } /* + * Run the query hook with the 'env' parameter. It should return a + * string of tab-delimited key-value pairs, each of which should be set + * in the environment. + * + * Because the env_vars string values become part of the environment, the + * string is static and we don't free it. + * + * This function is always called before zoneadmd forks and makes itself + * exclusive, so it is possible there could more than one instance of zoneadmd + * running in parallel at this point. Thus, we have no zonecfg snapshot and + * shouldn't take one yet (i.e. snap_hndl is NULL). Thats ok, since we don't + * need any zonecfg info to query for a brand-specific env value. + */ +static int +set_brand_env(zlog_t *zlogp) +{ + int ret = 0; + static char *env_vars = NULL; + char buf[2 * MAXPATHLEN]; + + if (query_hook[0] == '\0' || env_vars != NULL) + return (0); + + if (snprintf(buf, sizeof (buf), "%s env", query_hook) > sizeof (buf)) + return (-1); + + if (do_subproc(zlogp, buf, &env_vars, B_FALSE) != 0) + return (-1); + + if (env_vars != NULL) { + char *sp; + + sp = strtok(env_vars, "\t"); + while (sp != NULL) { + if (putenv(sp) != 0) { + ret = -1; + break; + } + sp = strtok(NULL, "\t"); + } + } + + return (ret); +} + +/* * Setup the brand's pre and post state change callbacks, as well as the * query callback, if any of these exist. */ @@ -2002,6 +2349,11 @@ main(int argc, char *argv[]) } priv_freeset(privset); + if (set_brand_env(zlogp) != 0) { + zerror(zlogp, B_FALSE, "Unable to setup brand's environment"); + return (1); + } + if (mkzonedir(zlogp) != 0) return (1); @@ -2042,6 +2394,13 @@ main(int argc, char *argv[]) (void) sigaddset(&block_cld, SIGCHLD); (void) sigprocmask(SIG_BLOCK, &block_cld, NULL); + /* + * The parent only needs stderr after the fork, so close other fd's + * that we inherited from zoneadm so that the parent doesn't have those + * open while waiting. The child will close the rest after the fork. + */ + closefrom(3); + if ((ctfd = init_template()) == -1) { zerror(zlogp, B_TRUE, "failed to create contract"); return (1); diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h index d784a303b3..ceab787dab 100644 --- a/usr/src/cmd/zoneadmd/zoneadmd.h +++ b/usr/src/cmd/zoneadmd/zoneadmd.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2014, Joyent, Inc. All rights reserved. */ #ifndef _ZONEADMD_H @@ -90,17 +91,18 @@ extern mutex_t msglock; extern boolean_t in_death_throes; extern boolean_t bringup_failure_recovery; extern char *zone_name; +extern zone_dochandle_t snap_hndl; extern char pool_name[MAXNAMELEN]; extern char brand_name[MAXNAMELEN]; extern char default_brand[MAXNAMELEN]; extern char boot_args[BOOTARGS_MAX]; -extern char bad_boot_arg[BOOTARGS_MAX]; extern boolean_t zone_isnative; extern boolean_t zone_iscluster; extern dladm_handle_t dld_handle; extern void zerror(zlog_t *, boolean_t, const char *, ...); extern char *localize_msg(char *locale, const char *msg); +extern void nwifent_free_attrs(struct zone_nwiftab *); /* * Eventstream interfaces. @@ -112,8 +114,7 @@ typedef enum { Z_EVT_ZONE_HALTED, Z_EVT_ZONE_READIED, Z_EVT_ZONE_UNINSTALLING, - Z_EVT_ZONE_BOOTFAILED, - Z_EVT_ZONE_BADARGS + Z_EVT_ZONE_BOOTFAILED } zone_evt_t; extern int eventstream_init(); @@ -135,9 +136,9 @@ typedef enum { /* * Virtual platform interfaces. */ -extern zoneid_t vplat_create(zlog_t *, zone_mnt_t); +extern zoneid_t vplat_create(zlog_t *, zone_mnt_t, zoneid_t); extern int vplat_bringup(zlog_t *, zone_mnt_t, zoneid_t); -extern int vplat_teardown(zlog_t *, boolean_t, boolean_t); +extern int vplat_teardown(zlog_t *, boolean_t, boolean_t, boolean_t); extern int vplat_get_iptype(zlog_t *, zone_iptype_t *); /* @@ -154,6 +155,13 @@ extern void resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen); */ extern int init_console(zlog_t *); extern void serve_console(zlog_t *); +extern void zcons_statechanged(); + +/* + * Memory capping thread creation. + */ +extern void create_mcap_thread(zlog_t *, zoneid_t); +extern void destroy_mcap_thread(); /* * Contract handling. @@ -163,7 +171,7 @@ extern int init_template(void); /* * Routine to manage child processes. */ -extern int do_subproc(zlog_t *, char *, char **); +extern int do_subproc(zlog_t *, char *, char **, boolean_t); #ifdef __cplusplus } |
