summaryrefslogtreecommitdiff
path: root/usr/src/cmd/zoneadmd
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/cmd/zoneadmd')
-rw-r--r--usr/src/cmd/zoneadmd/Makefile49
-rw-r--r--usr/src/cmd/zoneadmd/Makefile.com70
-rw-r--r--usr/src/cmd/zoneadmd/amd64/Makefile31
-rw-r--r--usr/src/cmd/zoneadmd/i386/Makefile30
-rw-r--r--usr/src/cmd/zoneadmd/mcap.c1193
-rw-r--r--usr/src/cmd/zoneadmd/vplat.c437
-rw-r--r--usr/src/cmd/zoneadmd/zcons.c131
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.c589
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.h20
9 files changed, 2073 insertions, 477 deletions
diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile
index 8324f7fefa..e81e4631aa 100644
--- a/usr/src/cmd/zoneadmd/Makefile
+++ b/usr/src/cmd/zoneadmd/Makefile
@@ -18,57 +18,54 @@
#
# CDDL HEADER END
-
-#
-
#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
#
PROG= zoneadmd
include ../Makefile.cmd
+include ../Makefile.ctf
-ROOTCMDDIR= $(ROOTLIB)/zones
-
-OBJS= zoneadmd.o zcons.o vplat.o
-SRCS = $(OBJS:.o=.c)
-POFILE=zoneadmd_all.po
-POFILES= $(OBJS:%.o=%.po)
+$(64ONLY)SUBDIRS= $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
-CFLAGS += $(CCVERBOSE)
-CERRWARN += -_gcc=-Wno-switch
-CERRWARN += -_gcc=-Wno-parentheses
-CERRWARN += -_gcc=-Wno-uninitialized
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
-LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
- -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \
- -linetutil -lscf
XGETFLAGS += -a -x zoneadmd.xcl
+ROOTUSRLIBZONES = $(ROOT)/usr/lib/zones
+
.KEEP_STATE:
.PARALLEL:
-all: $(PROG)
+all: $(SUBDIRS)
$(PROG): $(OBJS)
$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
$(POST_PROCESS)
-install: all $(ROOTCMD)
-
-$(POFILE): $(POFILES)
- $(RM) $@
- $(CAT) $(POFILES) > $@
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRLIBZONES)/$(PROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRLIBZONES)/$(PROG)
-clean:
- $(RM) $(OBJS)
+$(POFILE):
-lint: lint_SRCS
+clean clobebr lint: $(SUBDIRS)
check:
- $(CSTYLE) -p -P $(SRCS:%=%)
+ $(CSTYLE) -p -P *.c
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
include ../Makefile.targ
diff --git a/usr/src/cmd/zoneadmd/Makefile.com b/usr/src/cmd/zoneadmd/Makefile.com
new file mode 100644
index 0000000000..162d1f0219
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/Makefile.com
@@ -0,0 +1,70 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+#
+
+PROG= zoneadmd
+
+include ../../Makefile.cmd
+include ../../Makefile.ctf
+
+ROOTCMDDIR= $(ROOTLIB)/zones
+
+OBJS= zoneadmd.o zcons.o vplat.o mcap.o
+
+CFLAGS += $(CCVERBOSE)
+LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
+ -lgen -lbsm -lcontract -lzfs -luuid -lbrand -ldladm -ltsnet -ltsol \
+ -linetutil -lproc -lscf
+
+.KEEP_STATE:
+
+%.o: ../%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+ROOTUSRLIBZONES = $(ROOT)/usr/lib/zones
+ROOTUSRLIBZONES32 = $(ROOTUSRLIBZONES)/$(MACH32)
+ROOTUSRLIBZONES64 = $(ROOTUSRLIBZONES)/$(MACH64)
+ROOTUSRLIBZONESPROG32 = $(ROOTUSRLIBZONES32)/$(PROG)
+ROOTUSRLIBZONESPROG64 = $(ROOTUSRLIBZONES64)/$(PROG)
+$(ROOTUSRLIBZONES32)/%: $(ROOTUSRLIBZONES32) %
+ $(INS.file)
+$(ROOTUSRLIBZONES64)/%: $(ROOTUSRLIBZONES64) %
+ $(INS.file)
+$(ROOTUSRLIBZONES32):
+ $(INS.dir)
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+clean:
+ $(RM) $(OBJS)
+
+lint:
+ $(LINT.c) ../*.c $(LDLIBS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/zoneadmd/amd64/Makefile b/usr/src/cmd/zoneadmd/amd64/Makefile
new file mode 100644
index 0000000000..75ac51db32
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/amd64/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+#
+
+.KEEP_STATE:
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRLIBZONES64) $(ROOTUSRLIBZONESPROG64)
diff --git a/usr/src/cmd/zoneadmd/i386/Makefile b/usr/src/cmd/zoneadmd/i386/Makefile
new file mode 100644
index 0000000000..a8764e0638
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/i386/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+#
+
+.KEEP_STATE:
+
+include ../Makefile.com
+
+install: all $(ROOTUSRLIBZONES32) $(ROOTUSRLIBZONESPROG32)
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
new file mode 100644
index 0000000000..44917b0024
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -0,0 +1,1193 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * This file implements the code which runs a thread inside zoneadmd to cap
+ * the associated zone's physical memory. A thread to do this is started
+ * when the zone boots and is halted when the zone shuts down.
+ *
+ * Because of the way that the VM system is currently implemented, there is no
+ * way to go from the bottom up (page to process to zone). Thus, there is no
+ * obvious way to hook an rctl into the kernel's paging code to enforce a hard
+ * memory cap. Instead, we implement a soft physical memory cap which looks
+ * at the zone's overall rss and once it is over the cap, works from the top
+ * down (zone to process to page), looking at zone processes, to determine
+ * what to try to pageout to get the zone under its memory cap.
+ *
+ * The code uses the fast, cheap, but potentially very inaccurate sum of the
+ * rss values from psinfo_t to first approximate the zone's rss and will
+ * fallback to the vm_getusage syscall to determine the zone's rss if needed.
+ * It then checks the rss against the zone's zone.max-physical-memory rctl.
+ * Once the zone goes over its cap, then this thread will work through the
+ * zone's /proc process list, Pgrab-bing each process and stepping through the
+ * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
+ * to pageout pages, until the zone is again under its cap.
+ *
+ * Although zone memory capping is implemented as a soft cap by this user-level
+ * thread, the interfaces around memory caps that are exposed to the user are
+ * the standard ones; an rctl and kstats. This thread uses the rctl value
+ * to obtain the cap and works with the zone kernel code to update the kstats.
+ * If the implementation ever moves into the kernel, these exposed interfaces
+ * do not need to change.
+ *
+ * The thread adaptively sleeps, periodically checking the state of the
+ * zone. As the zone's rss gets closer to the cap, the thread will wake up
+ * more often to check the zone's status. Once the zone is over the cap,
+ * the thread will work to pageout until the zone is under the cap, as shown
+ * by updated vm_usage data.
+ *
+ * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
+ * are set by hrm_setbits() and on x86 that code path is only executed by
+ * segvn_pagelock -> hat_setstat -> hrm_setbits
+ * segvn_softunlock -^
+ * On SPARC there is an additional code path which may make this data
+ * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
+ * maps. If we ever fix this issue, then we could generalize this mcap code to
+ * do more with the data on active pages.
+ *
+ * For debugging, touch the file {zonepath}/mcap_debug.log. This will
+ * cause the thread to start logging its actions into that file (it may take
+ * a minute or two if the thread is currently sleeping). Removing that
+ * file will cause logging to stop.
+ */
+
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libproc.h>
+#include <limits.h>
+#include <procfs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/priocntl.h>
+#include <dirent.h>
+#include <zone.h>
+#include <libzonecfg.h>
+#include <thread.h>
+#include <values.h>
+#include <sys/vm_usage.h>
+#include <sys/resource.h>
+#include <sys/debug.h>
+#include <synch.h>
+#include <wait.h>
+#include <libcontract.h>
+#include <libcontract_priv.h>
+#include <sys/contract/process.h>
+#include "zoneadmd.h"
+
+ /* round up to next y = 2^n */
+#define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
+
+#define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
+
+/*
+ * zonecfg attribute tunables for memory capping.
+ * phys-mcap-cmd
+ * type: string
+ * specifies a command that can be run when over the cap
+ * phys-mcap-no-vmusage
+ * type: boolean
+ * true disables vm_getusage and just uses zone's proc. rss sum
+ * phys-mcap-no-pageout
+ * type: boolean
+ * true disables pageout when over
+ * phys-mcap-no-pf-throttle
+ * type: boolean
+ * true disables page fault throttling when over
+ */
+#define TUNE_CMD "phys-mcap-cmd"
+#define TUNE_NVMU "phys-mcap-no-vmusage"
+#define TUNE_NPAGE "phys-mcap-no-pageout"
+#define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
+
+/*
+ * These are only used in get_mem_info but global. We always need scale_rss and
+ * prev_fast_rss to be persistent but we also have the other two global so we
+ * can easily see these with mdb.
+ */
+uint64_t scale_rss = 0;
+uint64_t prev_fast_rss = 0;
+uint64_t fast_rss = 0;
+uint64_t accurate_rss = 0;
+
+static char zonename[ZONENAME_MAX];
+static char zonepath[MAXPATHLEN];
+static char zoneproc[MAXPATHLEN];
+static char debug_log[MAXPATHLEN];
+static zoneid_t zid;
+static mutex_t shutdown_mx;
+static cond_t shutdown_cv;
+static int shutting_down = 0;
+static thread_t mcap_tid;
+static FILE *debug_log_fp = NULL;
+static uint64_t zone_rss_cap; /* RSS cap(KB) */
+static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
+static boolean_t skip_vmusage = B_FALSE;
+static boolean_t skip_pageout = B_FALSE;
+static boolean_t skip_pf_throttle = B_FALSE;
+
+static zlog_t *logp;
+
+static int64_t check_suspend();
+static void get_mcap_tunables();
+
+/*
+ * Structure to hold current state about a process address space that we're
+ * working on.
+ */
+typedef struct {
+ int pr_curr; /* the # of the mapping we're working on */
+ int pr_nmap; /* number of mappings in address space */
+ prmap_t *pr_mapp; /* process's map array */
+} proc_map_t;
+
+typedef struct zsd_vmusage64 {
+ id_t vmu_zoneid;
+ uint_t vmu_type;
+ id_t vmu_id;
+ /*
+ * An amd64 kernel will align the following uint64_t members, but a
+ * 32bit i386 process will not without help.
+ */
+ int vmu_align_next_members_on_8_bytes;
+ uint64_t vmu_rss_all;
+ uint64_t vmu_rss_private;
+ uint64_t vmu_rss_shared;
+ uint64_t vmu_swap_all;
+ uint64_t vmu_swap_private;
+ uint64_t vmu_swap_shared;
+} zsd_vmusage64_t;
+
+/*
+ * Output a debug log message.
+ */
+/*PRINTFLIKE1*/
+static void
+debug(char *fmt, ...)
+{
+ va_list ap;
+
+ if (debug_log_fp == NULL)
+ return;
+
+ va_start(ap, fmt);
+ (void) vfprintf(debug_log_fp, fmt, ap);
+ va_end(ap);
+ (void) fflush(debug_log_fp);
+}
+
+/*
+ * Like sleep(3C) but can be interupted by cond_signal which is posted when
+ * we're shutting down the mcap thread.
+ */
+static void
+sleep_shutdown(int secs)
+{
+ timestruc_t to;
+
+ to.tv_sec = secs;
+ to.tv_nsec = 0;
+
+ (void) mutex_lock(&shutdown_mx);
+ if (!shutting_down)
+ (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
+ (void) mutex_unlock(&shutdown_mx);
+}
+
+static boolean_t
+proc_issystem(pid_t pid)
+{
+ char pc_clname[PC_CLNMSZ];
+
+ if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
+ PC_KY_NULL) != -1)
+ return (strcmp(pc_clname, "SYS") == 0);
+
+ return (B_TRUE);
+}
+
+/*
+ * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
+ */
+static void
+run_over_cmd()
+{
+ int ctfd;
+ int err;
+ pid_t childpid;
+ siginfo_t info;
+ ctid_t ct;
+
+ /*
+ * Before we enter the zone, we need to create a new process contract
+ * for the child, as required by zone_enter().
+ */
+ if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
+ return;
+ if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
+ ct_tmpl_set_informative(ctfd, 0) != 0 ||
+ ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
+ ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
+ ct_tmpl_activate(ctfd) != 0) {
+ (void) close(ctfd);
+ return;
+ }
+
+ childpid = fork();
+ switch (childpid) {
+ case -1:
+ (void) ct_tmpl_clear(ctfd);
+ (void) close(ctfd);
+ break;
+ case 0: /* Child */
+ (void) ct_tmpl_clear(ctfd);
+ (void) close(ctfd);
+ if (zone_enter(zid) == -1)
+ _exit(errno);
+ err = system(over_cmd);
+ _exit(err);
+ break;
+ default: /* Parent */
+ if (contract_latest(&ct) == -1)
+ ct = -1;
+ (void) ct_tmpl_clear(ctfd);
+ (void) close(ctfd);
+ err = waitid(P_PID, childpid, &info, WEXITED);
+ (void) contract_abandon_id(ct);
+ if (err == -1 || info.si_status != 0)
+ debug("over_cmd failed");
+ break;
+ }
+}
+
+/*
+ * Get the next mapping.
+ */
+static prmap_t *
+nextmapping(proc_map_t *pmp)
+{
+ if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
+ return (NULL);
+
+ return (&pmp->pr_mapp[pmp->pr_curr++]);
+}
+
+/*
+ * Initialize the proc_map_t to access the first mapping of an address space.
+ */
+static prmap_t *
+init_map(proc_map_t *pmp, pid_t pid)
+{
+ int fd;
+ int res;
+ struct stat st;
+ char pathbuf[MAXPATHLEN];
+
+ bzero(pmp, sizeof (proc_map_t));
+ pmp->pr_nmap = -1;
+
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
+ if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
+ return (NULL);
+
+redo:
+ errno = 0;
+ if (fstat(fd, &st) != 0)
+ goto done;
+
+ if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
+ debug("cannot malloc() %ld bytes for xmap", st.st_size);
+ goto done;
+ }
+ (void) bzero(pmp->pr_mapp, st.st_size);
+
+ errno = 0;
+ if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
+ free(pmp->pr_mapp);
+ pmp->pr_mapp = NULL;
+ if (res > 0 || errno == E2BIG) {
+ goto redo;
+ } else {
+ debug("pid %ld cannot read xmap\n", pid);
+ goto done;
+ }
+ }
+
+ pmp->pr_nmap = st.st_size / sizeof (prmap_t);
+
+done:
+ (void) close(fd);
+ return (nextmapping(pmp));
+}
+
+/*
+ * Attempt to invalidate the entire mapping from within the given process's
+ * address space. May return nonzero with errno as:
+ * ESRCH - process not found
+ * ENOMEM - segment not found
+ * EINVAL - mapping exceeds a single segment
+ */
+static int
+pageout_mapping(pid_t pid, prmap_t *pmp)
+{
+ int res;
+
+ if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
+ return (0);
+
+ errno = 0;
+ res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
+ pmp->pr_size);
+
+ return (res);
+}
+
+/*
+ * Work through a process paging out mappings until the whole address space was
+ * examined or the excess is < 0. Return our estimate of the updated excess.
+ */
+static int64_t
+pageout_process(pid_t pid, int64_t excess)
+{
+ int psfd;
+ prmap_t *pmap;
+ proc_map_t cur;
+ int res;
+ int64_t sum_d_rss, d_rss;
+ int64_t old_rss;
+ int map_cnt;
+ psinfo_t psinfo;
+ char pathbuf[MAXPATHLEN];
+
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
+ pid);
+ if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
+ return (excess);
+
+ cur.pr_mapp = NULL;
+
+ if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
+ goto done;
+
+ old_rss = (int64_t)psinfo.pr_rssize;
+ map_cnt = 0;
+
+ /* If unscannable, skip it. */
+ if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
+ debug("pid %ld: system process, skipping %s\n",
+ pid, psinfo.pr_psargs);
+ goto done;
+ }
+
+ /* If tiny RSS (16KB), skip it. */
+ if (old_rss <= 16) {
+ debug("pid %ld: skipping, RSS %lldKB %s\n",
+ pid, old_rss, psinfo.pr_psargs);
+ goto done;
+ }
+
+ /* Get segment residency information. */
+ pmap = init_map(&cur, pid);
+
+ /* Skip process if it has no mappings. */
+ if (pmap == NULL) {
+ debug("pid %ld: map unreadable; ignoring\n", pid);
+ goto done;
+ }
+
+ debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
+ pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
+
+ /*
+ * Within the process's address space, attempt to page out mappings.
+ */
+ sum_d_rss = 0;
+ while (excess > 0 && pmap != NULL && !shutting_down) {
+ /* invalidate the entire mapping */
+ if ((res = pageout_mapping(pid, pmap)) < 0)
+ debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
+ pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
+
+ map_cnt++;
+
+ /*
+ * Re-check the process rss and get the delta.
+ */
+ if (pread(psfd, &psinfo, sizeof (psinfo), 0)
+ != sizeof (psinfo)) {
+ excess -= old_rss;
+ goto done;
+ }
+
+ d_rss = (int64_t)psinfo.pr_rssize - old_rss;
+ old_rss = (int64_t)psinfo.pr_rssize;
+ sum_d_rss += d_rss;
+
+ /*
+ * d_rss hopefully should be negative (or 0 if nothing
+ * invalidated) but can be positive if more got paged in.
+ */
+ excess += d_rss;
+
+ if (excess <= 0) {
+ debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
+ "excess %lldKB\n", pid, map_cnt,
+ (unsigned long long)sum_d_rss, (long long)excess);
+ map_cnt = 0;
+
+ /*
+ * If we're actually under, this will suspend checking
+ * in the middle of this process's address space.
+ */
+ excess = check_suspend();
+ if (shutting_down)
+ goto done;
+
+ /*
+ * since we might have suspended, re-read process's rss
+ */
+ if (pread(psfd, &psinfo, sizeof (psinfo), 0)
+ != sizeof (psinfo)) {
+ excess -= old_rss;
+ goto done;
+ }
+
+ old_rss = (int64_t)psinfo.pr_rssize;
+
+ debug("pid %ld: resume pageout; excess %lld\n", pid,
+ (long long)excess);
+ sum_d_rss = 0;
+ }
+
+ pmap = nextmapping(&cur);
+ }
+
+ debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
+ pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);
+
+done:
+ if (cur.pr_mapp != NULL)
+ free(cur.pr_mapp);
+
+ (void) close(psfd);
+
+ if (shutting_down)
+ return (0);
+
+ return (excess);
+}
+
+/*
+ * Get the zone's RSS data.
+ */
+static uint64_t
+get_mem_info()
+{
+ uint64_t n = 1;
+ zsd_vmusage64_t buf;
+ uint64_t tmp_rss;
+ DIR *pdir = NULL;
+ struct dirent *dent;
+
+ /*
+ * Start by doing the fast, cheap RSS calculation using the rss value
+ * in psinfo_t. Because that's per-process, it can lead to double
+ * counting some memory and overestimating how much is being used, but
+ * as long as that's not over the cap, then we don't need do the
+ * expensive calculation.
+ *
+ * If we have to do the expensive calculation, we remember the scaling
+ * factor so that we can try to use that on subsequent iterations for
+ * the fast rss.
+ */
+ if (shutting_down)
+ return (0);
+
+ if ((pdir = opendir(zoneproc)) == NULL)
+ return (0);
+
+ accurate_rss = 0;
+ fast_rss = 0;
+ while (!shutting_down && (dent = readdir(pdir)) != NULL) {
+ pid_t pid;
+ int psfd;
+ int64_t rss;
+ char pathbuf[MAXPATHLEN];
+ psinfo_t psinfo;
+
+ if (strcmp(".", dent->d_name) == 0 ||
+ strcmp("..", dent->d_name) == 0)
+ continue;
+
+ pid = atoi(dent->d_name);
+ if (pid == 0 || pid == 1)
+ continue;
+
+ (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
+ zoneproc, pid);
+
+ rss = 0;
+ if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
+ if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
+ sizeof (psinfo))
+ rss = (int64_t)psinfo.pr_rssize;
+
+ (void) close(psfd);
+ }
+
+ fast_rss += rss;
+ }
+
+ (void) closedir(pdir);
+
+ if (shutting_down)
+ return (0);
+
+ debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
+ scale_rss, prev_fast_rss);
+
+ /* see if we can get by with a scaled fast rss */
+ tmp_rss = fast_rss;
+ if (scale_rss > 1 && prev_fast_rss > 0) {
+ /*
+ * Only scale the fast value if it hasn't ballooned too much
+ * to trust.
+ */
+ if (fast_rss / prev_fast_rss < 2) {
+ fast_rss /= scale_rss;
+ debug("scaled fast rss: %lluKB\n", fast_rss);
+ }
+ }
+
+ if (fast_rss <= zone_rss_cap || skip_vmusage) {
+ uint64_t zone_rss_bytes;
+
+ zone_rss_bytes = fast_rss * 1024;
+ /* Use the zone's approx. RSS in the kernel */
+ (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
+ return (fast_rss);
+ }
+
+ buf.vmu_id = zid;
+
+ /* get accurate usage (cached data may be up to 5 seconds old) */
+ if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
+ (uintptr_t)&buf, (uintptr_t)&n) != 0) {
+ debug("vmusage failed\n");
+ (void) sleep_shutdown(1);
+ return (0);
+ }
+
+ if (n > 1) {
+ /* This should never happen */
+ debug("vmusage returned more than one result\n");
+ (void) sleep_shutdown(1);
+ return (0);
+ }
+
+ if (buf.vmu_id != zid) {
+ /* This should never happen */
+ debug("vmusage returned the incorrect zone\n");
+ (void) sleep_shutdown(1);
+ return (0);
+ }
+
+ accurate_rss = buf.vmu_rss_all / 1024;
+
+ /* calculate scaling factor to use for fast_rss from now on */
+ if (accurate_rss > 0) {
+ scale_rss = fast_rss / accurate_rss;
+ debug("new scaling factor: %llu\n", scale_rss);
+ /* remember the fast rss when we had to get the accurate rss */
+ prev_fast_rss = tmp_rss;
+ }
+
+ debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
+ scale_rss, prev_fast_rss);
+ return (accurate_rss);
+}
+
+/*
+ * Needed to read the zones physical-memory-cap rctl.
+ */
+static struct ps_prochandle *
+grab_zone_proc()
+{
+ DIR *dirp;
+ struct dirent *dentp;
+ struct ps_prochandle *ph = NULL;
+ int tmp;
+
+ if ((dirp = opendir(zoneproc)) == NULL)
+ return (NULL);
+
+ while (!shutting_down && (dentp = readdir(dirp))) {
+ int pid;
+
+ if (strcmp(".", dentp->d_name) == 0 ||
+ strcmp("..", dentp->d_name) == 0)
+ continue;
+
+ pid = atoi(dentp->d_name);
+ /* attempt to grab process */
+ if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
+ if (Psetflags(ph, PR_RLC) == 0) {
+ if (Pcreate_agent(ph) == 0) {
+ (void) closedir(dirp);
+ return (ph);
+ }
+ }
+ Prelease(ph, 0);
+ }
+ }
+
+ (void) closedir(dirp);
+ return (NULL);
+}
+
+static uint64_t
+get_zone_cap()
+{
+ rctlblk_t *rblk;
+ uint64_t mcap;
+ struct ps_prochandle *ph;
+
+ if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
+ return (UINT64_MAX);
+
+ if ((ph = grab_zone_proc()) == NULL) {
+ free(rblk);
+ return (UINT64_MAX);
+ }
+
+ if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
+ RCTL_FIRST)) {
+ Pdestroy_agent(ph);
+ Prelease(ph, 0);
+ free(rblk);
+ return (UINT64_MAX);
+ }
+
+ Pdestroy_agent(ph);
+ Prelease(ph, 0);
+
+ mcap = rctlblk_get_value(rblk);
+ free(rblk);
+ return (mcap);
+}
+
+/*
+ * check_suspend is invoked at the beginning of every pass through the process
+ * list or after we've paged out enough so that we think the excess is under
+ * the cap. The purpose is to periodically check the zone's rss and return
+ * the excess when the zone is over the cap. The rest of the time this
+ * function will sleep, periodically waking up to check the current rss.
+ *
+ * Depending on the percentage of penetration of the zone's rss into the
+ * cap we sleep for longer or shorter amounts. This reduces the impact of this
+ * work on the system, which is important considering that each zone will be
+ * monitoring its rss.
+ */
+static int64_t
+check_suspend()
+{
+ static hrtime_t last_cap_read = 0;
+ static uint64_t addon;
+ static uint64_t lo_thresh; /* Thresholds for how long to sleep */
+ static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
+ static uint64_t prev_zone_rss = 0;
+ static uint32_t pfdelay = 0; /* usec page fault delay when over */
+
+ /* Wait a second to give the async pageout a chance to catch up. */
+ (void) sleep_shutdown(1);
+
+ while (!shutting_down) {
+ int64_t new_excess;
+ int sleep_time;
+ hrtime_t now;
+ struct stat st;
+ uint64_t zone_rss; /* total RSS(KB) */
+
+ /*
+ * Check if the debug log files exists and enable or disable
+ * debug.
+ */
+ if (debug_log_fp == NULL) {
+ if (stat(debug_log, &st) == 0)
+ debug_log_fp = fopen(debug_log, "w");
+ } else {
+ if (stat(debug_log, &st) == -1) {
+ (void) fclose(debug_log_fp);
+ debug_log_fp = NULL;
+ }
+ }
+
+ /*
+ * If the CAP_REFRESH interval has passed, re-get the current
+ * cap in case it has been dynamically updated.
+ */
+ now = gethrtime();
+ if (now - last_cap_read > CAP_REFRESH) {
+ uint64_t mcap;
+
+ last_cap_read = now;
+
+ mcap = get_zone_cap();
+ if (mcap != 0 && mcap != UINT64_MAX)
+ zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
+ else
+ zone_rss_cap = UINT64_MAX;
+
+ lo_thresh = (uint64_t)(zone_rss_cap * .8);
+ hi_thresh = (uint64_t)(zone_rss_cap * .9);
+ addon = (uint64_t)(zone_rss_cap * 0.05);
+
+ /*
+ * We allow the memory cap tunables to be changed on
+ * the fly.
+ */
+ get_mcap_tunables();
+
+ debug("%s: %s\n", TUNE_CMD, over_cmd);
+ debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
+ debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
+ debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
+ debug("current cap %lluKB lo %lluKB hi %lluKB\n",
+ zone_rss_cap, lo_thresh, hi_thresh);
+ }
+
+ /* No cap, nothing to do. */
+ if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
+ debug("no cap, sleep 120 seconds\n");
+ (void) sleep_shutdown(120);
+ continue;
+ }
+
+ zone_rss = get_mem_info();
+
+ /* calculate excess */
+ new_excess = zone_rss - zone_rss_cap;
+
+ debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
+ zone_rss, zone_rss_cap, new_excess);
+
+ /*
+ * If necessary, updates stats.
+ */
+
+ /*
+ * If it looks like we did some paging out since last over the
+ * cap then update the kstat so we can approximate how much was
+ * paged out.
+ */
+ if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
+ uint64_t diff;
+
+ /* assume diff is num bytes we paged out */
+ diff = (prev_zone_rss - zone_rss) * 1024;
+
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
+ &diff, 0);
+ }
+ prev_zone_rss = zone_rss;
+
+ if (new_excess > 0) {
+ uint64_t n = 1;
+
+ /* Increment "nover" kstat. */
+ (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
+
+ if (!skip_pf_throttle) {
+ /*
+ * Tell the kernel to start throttling page
+ * faults by some number of usecs to help us
+ * catch up. If we are persistently over the
+ * cap the delay ramps up to a max of 2000usecs.
+ * Note that for delays less than 1 tick
+ * (i.e. all of these) we busy-wait in as_fault.
+ * delay faults/sec
+ * 125 8000
+ * 250 4000
+ * 500 2000
+ * 1000 1000
+ * 2000 500
+ */
+ if (pfdelay == 0)
+ pfdelay = 125;
+ else if (pfdelay < 2000)
+ pfdelay *= 2;
+
+ (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
+ &pfdelay, 0);
+ }
+
+ /*
+ * Once we go over the cap, then we want to
+ * page out a little extra instead of stopping
+ * right at the cap. To do this we add 5% to
+ * the excess so that pageout_proces will work
+ * a little longer before stopping.
+ */
+ return ((int64_t)(new_excess + addon));
+ }
+
+ /*
+ * At this point we are under the cap.
+ *
+ * Tell the kernel to stop throttling page faults.
+ *
+ * Scale the amount of time we sleep before rechecking the
+ * zone's memory usage. Also, scale the accpetable age of
+ * cached results from vm_getusage. We do this based on the
+ * penetration into the capped limit.
+ */
+ if (pfdelay > 0) {
+ pfdelay = 0;
+ (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
+ &pfdelay, 0);
+ }
+
+ if (zone_rss <= lo_thresh) {
+ sleep_time = 120;
+ } else if (zone_rss <= hi_thresh) {
+ sleep_time = 60;
+ } else {
+ sleep_time = 30;
+ }
+
+ debug("sleep %d seconds\n", sleep_time);
+ (void) sleep_shutdown(sleep_time);
+ }
+
+ /* Shutting down, tell the kernel so it doesn't throttle */
+ if (pfdelay > 0) {
+ pfdelay = 0;
+ (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
+ }
+
+ return (0);
+}
+
+static void
+get_mcap_tunables()
+{
+ zone_dochandle_t handle;
+ struct zone_attrtab attr;
+
+ over_cmd[0] = '\0';
+ if ((handle = zonecfg_init_handle()) == NULL)
+ return;
+
+ if (zonecfg_get_handle(zonename, handle) != Z_OK)
+ goto done;
+
+ /* Reset to defaults in case rebooting and settings have changed */
+ over_cmd[0] = '\0';
+ skip_vmusage = B_FALSE;
+ skip_pageout = B_FALSE;
+ skip_pf_throttle = B_FALSE;
+
+ if (zonecfg_setattrent(handle) != Z_OK)
+ goto done;
+ while (zonecfg_getattrent(handle, &attr) == Z_OK) {
+ if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
+ (void) strlcpy(over_cmd, attr.zone_attr_value,
+ sizeof (over_cmd));
+ } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
+ if (strcmp("true", attr.zone_attr_value) == 0)
+ skip_vmusage = B_TRUE;
+ } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
+ if (strcmp("true", attr.zone_attr_value) == 0)
+ skip_pageout = B_TRUE;
+ } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
+ if (strcmp("true", attr.zone_attr_value) == 0)
+ skip_pf_throttle = B_TRUE;
+ }
+ }
+ (void) zonecfg_endattrent(handle);
+
+done:
+ zonecfg_fini_handle(handle);
+}
+
+/* ARGSUSED */
+static int
+chk_proc_fs(void *data, const char *spec, const char *dir,
+ const char *fstype, const char *opt)
+{
+ if (fstype != NULL && strcmp(fstype, "proc") == 0)
+ *((boolean_t *)data) = B_TRUE;
+
+ return (0);
+}
+
+static boolean_t
+has_proc()
+{
+ brand_handle_t bh;
+ boolean_t fnd = B_FALSE;
+
+ if ((bh = brand_open(brand_name)) != NULL) {
+ (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
+ }
+
+ brand_close(bh);
+ return (fnd);
+}
+
+/*
+ * We run this loop for brands with no /proc to simply update the RSS, using
+ * the cheap GZ /proc data, every 5 minutes.
+ */
+static void
+no_procfs()
+{
+ DIR *pdir = NULL;
+ struct dirent *dent;
+ uint64_t zone_rss_bytes;
+
+ (void) sleep_shutdown(30);
+ while (!shutting_down) {
+ /*
+ * Just do the fast, cheap RSS calculation using the rss value
+ * in psinfo_t. Because that's per-process, it can lead to
+ * double counting some memory and overestimating how much is
+ * being used. Since there is no /proc in the zone, we use the
+ * GZ /proc and check for the correct zone.
+ */
+ if ((pdir = opendir("/proc")) == NULL)
+ return;
+
+ fast_rss = 0;
+ while (!shutting_down && (dent = readdir(pdir)) != NULL) {
+ pid_t pid;
+ int psfd;
+ int64_t rss;
+ char pathbuf[MAXPATHLEN];
+ psinfo_t psinfo;
+
+ if (strcmp(".", dent->d_name) == 0 ||
+ strcmp("..", dent->d_name) == 0)
+ continue;
+
+ pid = atoi(dent->d_name);
+ if (pid == 0 || pid == 1)
+ continue;
+
+ (void) snprintf(pathbuf, sizeof (pathbuf),
+ "/proc/%d/psinfo", pid);
+
+ rss = 0;
+ if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
+ if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
+ sizeof (psinfo)) {
+ if (psinfo.pr_zoneid == zid)
+ rss = (int64_t)psinfo.pr_rssize;
+ }
+
+ (void) close(psfd);
+ }
+
+ fast_rss += rss;
+ }
+
+ (void) closedir(pdir);
+
+ if (shutting_down)
+ return;
+
+ zone_rss_bytes = fast_rss * 1024;
+ /* Use the zone's approx. RSS in the kernel */
+ (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
+
+ (void) sleep_shutdown(300);
+ }
+}
+
+/*
+ * Thread that checks zone's memory usage and when over the cap, goes through
+ * the zone's process list trying to pageout processes to get under the cap.
+ */
+static void
+mcap_zone()
+{
+ DIR *pdir = NULL;
+ int64_t excess;
+
+ debug("thread startup\n");
+
+ get_mcap_tunables();
+
+ /*
+ * If the zone has no /proc filesystem, we can't use the fast algorithm
+ * to check RSS or pageout any processes. All we can do is periodically
+ * update it's RSS kstat using the expensive sycall.
+ */
+ if (!has_proc()) {
+ no_procfs();
+ debug("thread shutdown\n");
+ return;
+ }
+
+ /*
+ * When first starting it is likely lots of other zones are starting
+ * too because the system is booting. Since we just started the zone
+ * we're not worried about being over the cap right away, so we let
+ * things settle a bit and tolerate some older data here to minimize
+ * the load on the system.
+ */
+ (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
+
+ /* Wait until zone's /proc is mounted */
+ while (!shutting_down) {
+ struct stat st;
+
+ if (stat(zoneproc, &st) == 0 &&
+ strcmp(st.st_fstype, "proc") == 0)
+ break;
+ sleep_shutdown(5);
+ }
+
+ /* Open zone's /proc and walk entries. */
+ while (!shutting_down) {
+ if ((pdir = opendir(zoneproc)) != NULL)
+ break;
+ sleep_shutdown(5);
+ }
+
+ while (!shutting_down) {
+ struct dirent *dirent;
+
+ /* Wait until we've gone over the cap. */
+ excess = check_suspend();
+
+ debug("starting to scan, excess %lldk\n", (long long)excess);
+
+ if (over_cmd[0] != '\0') {
+ uint64_t zone_rss; /* total RSS(KB) */
+
+ debug("run phys_mcap_cmd: %s\n", over_cmd);
+ run_over_cmd();
+
+ zone_rss = get_mem_info();
+ excess = zone_rss - zone_rss_cap;
+ debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
+ zone_rss, zone_rss_cap, excess);
+ if (excess <= 0)
+ continue;
+ }
+
+ while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
+ pid_t pid;
+
+ if (strcmp(".", dirent->d_name) == 0 ||
+ strcmp("..", dirent->d_name) == 0)
+ continue;
+
+ pid = atoi(dirent->d_name);
+ if (pid == 0 || pid == 1)
+ continue;
+
+ if (skip_pageout)
+ (void) sleep_shutdown(2);
+ else
+ excess = pageout_process(pid, excess);
+
+ if (excess <= 0) {
+ debug("apparently under; excess %lld\n",
+ (long long)excess);
+ /* Double check the current excess */
+ excess = check_suspend();
+ }
+ }
+
+ debug("process pass done; excess %lld\n", (long long)excess);
+ rewinddir(pdir);
+
+ if (skip_pageout)
+ (void) sleep_shutdown(120);
+ }
+
+ if (pdir != NULL)
+ (void) closedir(pdir);
+ debug("thread shutdown\n");
+}
+
+void
+create_mcap_thread(zlog_t *zlogp, zoneid_t id)
+{
+ int res;
+ char brandname[MAXNAMELEN];
+
+ shutting_down = 0;
+ zid = id;
+ logp = zlogp;
+ (void) getzonenamebyid(zid, zonename, sizeof (zonename));
+
+ if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0)
+ zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename);
+
+ brandname[0] = '\0';
+ if (zone_get_brand(zonename, brandname, sizeof (brandname)) != 0)
+ zerror(zlogp, B_FALSE, "zone %s missing brand", zonename);
+
+ /* all but the lx brand currently use /proc */
+ if (strcmp(brandname, "lx") == 0) {
+ (void) snprintf(zoneproc, sizeof (zoneproc),
+ "%s/root/native/proc", zonepath);
+ } else {
+ (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
+ zonepath);
+ }
+
+ (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
+ zonepath);
+
+ res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
+ &mcap_tid);
+ if (res != 0) {
+ zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
+ res);
+ mcap_tid = 0;
+ }
+}
+
+void
+destroy_mcap_thread()
+{
+ if (mcap_tid != 0) {
+ shutting_down = 1;
+ (void) cond_signal(&shutdown_cv);
+ (void) thr_join(mcap_tid, NULL, NULL);
+ mcap_tid = 0;
+ }
+}
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index b9954b81b3..e63f87d2a0 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -137,6 +137,9 @@
#define ALT_MOUNT(mount_cmd) ((mount_cmd) != Z_MNT_BOOT)
+/* Number of times to retry unmounting if it fails */
+#define UMOUNT_RETRIES 30
+
/* a reasonable estimate for the number of lwps per process */
#define LWPS_PER_PROCESS 10
@@ -165,6 +168,7 @@ extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);
/* from zoneadmd */
extern char query_hook[];
+extern char post_statechg_hook[];
/*
* For each "net" resource configured in zonecfg, we track a zone_addr_list_t
@@ -201,7 +205,7 @@ autofs_cleanup(zoneid_t zoneid)
/*
* Ask autofs to unmount all trigger nodes in the given zone.
*/
- return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
+ return (_autofssys(AUTOFS_UNMOUNTALL, (void *)((uintptr_t)zoneid)));
}
static void
@@ -592,6 +596,24 @@ root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
}
/*
+ * Perform brand-specific cleanup if we are unable to unmount a FS.
+ */
+static void
+brand_umount_cleanup(zlog_t *zlogp, char *path)
+{
+ char cmdbuf[2 * MAXPATHLEN];
+
+ if (post_statechg_hook[0] == '\0')
+ return;
+
+ if (snprintf(cmdbuf, sizeof (cmdbuf), "%s %d %d %s", post_statechg_hook,
+ ZONE_STATE_DOWN, Z_UNMOUNT, path) > sizeof (cmdbuf))
+ return;
+
+ (void) do_subproc(zlogp, cmdbuf, NULL, B_FALSE);
+}
+
+/*
* The general strategy for unmounting filesystems is as follows:
*
* - Remote filesystems may be dead, and attempting to contact them as
@@ -624,6 +646,7 @@ static int
unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
{
int error = 0;
+ int fail = 0;
FILE *mnttab;
struct mnttab *mnts;
uint_t nmnt;
@@ -711,18 +734,39 @@ unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
if (umount2(path, MS_FORCE) == 0) {
unmounted = B_TRUE;
stuck = B_FALSE;
+ fail = 0;
} else {
/*
- * The first failure indicates a
- * mount we won't be able to get
- * rid of automatically, so we
- * bail.
+ * We may hit a failure here if there
+ * is an app in the GZ with an open
+ * pipe into the zone (commonly into
+ * the zone's /var/run). This type
+ * of app will notice the closed
+ * connection and cleanup, but it may
+ * take a while and we have no easy
+ * way to notice that. To deal with
+ * this case, we will wait and retry
+ * a few times before we give up.
*/
- error++;
- zerror(zlogp, B_FALSE,
- "unable to unmount '%s'", path);
- free_mnttable(mnts, nmnt);
- goto out;
+ fail++;
+ if (fail < (UMOUNT_RETRIES - 1)) {
+ zerror(zlogp, B_FALSE,
+ "unable to unmount '%s', "
+ "retrying in 2 seconds",
+ path);
+ (void) sleep(2);
+ } else if (fail > UMOUNT_RETRIES) {
+ error++;
+ zerror(zlogp, B_FALSE,
+ "unmount of '%s' failed",
+ path);
+ free_mnttable(mnts, nmnt);
+ goto out;
+ } else {
+ /* Try the hook 2 times */
+ brand_umount_cleanup(zlogp,
+ path);
+ }
}
}
/*
@@ -1060,23 +1104,10 @@ mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
int
vplat_get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
{
- zone_dochandle_t handle;
-
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
- if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
+ if (zonecfg_get_iptype(snap_hndl, iptypep) != Z_OK) {
zerror(zlogp, B_FALSE, "invalid ip-type configuration");
- zonecfg_fini_handle(handle);
return (-1);
}
- zonecfg_fini_handle(handle);
return (0);
}
@@ -1089,14 +1120,13 @@ static int
mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
{
char brand[MAXNAMELEN];
- zone_dochandle_t handle = NULL;
brand_handle_t bh = NULL;
struct zone_devtab ztab;
di_prof_t prof = NULL;
int err;
int retval = -1;
zone_iptype_t iptype;
- const char *curr_iptype;
+ const char *curr_iptype = NULL;
if (di_prof_init(devpath, &prof)) {
zerror(zlogp, B_TRUE, "failed to initialize profile");
@@ -1131,6 +1161,8 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
curr_iptype = "exclusive";
break;
}
+ if (curr_iptype == NULL)
+ abort();
if (brand_platform_iter_devices(bh, zone_name,
mount_one_dev_device_cb, prof, curr_iptype) != 0) {
@@ -1145,28 +1177,19 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
}
/* Add user-specified devices and directories */
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_FALSE, "can't initialize zone handle");
- goto cleanup;
- }
- if (err = zonecfg_get_handle(zone_name, handle)) {
- zerror(zlogp, B_FALSE, "can't get handle for zone "
- "%s: %s", zone_name, zonecfg_strerror(err));
- goto cleanup;
- }
- if (err = zonecfg_setdevent(handle)) {
+ if ((err = zonecfg_setdevent(snap_hndl)) != 0) {
zerror(zlogp, B_FALSE, "%s: %s", zone_name,
zonecfg_strerror(err));
goto cleanup;
}
- while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
+ while (zonecfg_getdevent(snap_hndl, &ztab) == Z_OK) {
if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
zerror(zlogp, B_TRUE, "failed to add "
"user-specified device");
goto cleanup;
}
}
- (void) zonecfg_enddevent(handle);
+ (void) zonecfg_enddevent(snap_hndl);
/* Send profile to kernel */
if (di_prof_commit(prof)) {
@@ -1179,8 +1202,6 @@ mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
cleanup:
if (bh != NULL)
brand_close(bh);
- if (handle != NULL)
- zonecfg_fini_handle(handle);
if (prof)
di_prof_fini(prof);
return (retval);
@@ -1675,7 +1696,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
char luroot[MAXPATHLEN];
int i, num_fs = 0;
struct zone_fstab *fs_ptr = NULL;
- zone_dochandle_t handle = NULL;
zone_state_t zstate;
brand_handle_t bh;
plat_gmount_cb_data_t cb;
@@ -1699,12 +1719,7 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
goto bad;
}
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- goto bad;
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
- zonecfg_setfsent(handle) != Z_OK) {
+ if (zonecfg_setfsent(snap_hndl) != Z_OK) {
zerror(zlogp, B_FALSE, "invalid configuration");
goto bad;
}
@@ -1722,7 +1737,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
/* Get a handle to the brand info for this zone */
if ((bh = brand_open(brand)) == NULL) {
zerror(zlogp, B_FALSE, "unable to determine zone brand");
- zonecfg_fini_handle(handle);
return (-1);
}
@@ -1737,7 +1751,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
plat_gmount_cb, &cb) != 0) {
zerror(zlogp, B_FALSE, "unable to mount filesystems");
brand_close(bh);
- zonecfg_fini_handle(handle);
return (-1);
}
brand_close(bh);
@@ -1748,13 +1761,10 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
* higher level directories (e.g., /usr) get mounted before
* any beneath them (e.g., /usr/local).
*/
- if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
+ if (mount_filesystems_fsent(snap_hndl, zlogp, &fs_ptr, &num_fs,
mount_cmd) != 0)
goto bad;
- zonecfg_fini_handle(handle);
- handle = NULL;
-
/*
* Normally when we mount a zone all the zone filesystems
* get mounted relative to rootpath, which is usually
@@ -1833,8 +1843,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
return (0);
bad:
- if (handle != NULL)
- zonecfg_fini_handle(handle);
free_fs_data(fs_ptr, num_fs);
return (-1);
}
@@ -2190,13 +2198,7 @@ configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
/*
* Here, we know that the interface can't be brought up.
- * A similar warning message was already printed out to
- * the console by zoneadm(1M) so instead we log the
- * message to syslog and continue.
*/
- zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
- "'%s' which may not be present/plumbed in the "
- "global zone.", lifr.lifr_name);
(void) close(s);
return (Z_OK);
}
@@ -2409,7 +2411,6 @@ bad:
static int
configure_shared_network_interfaces(zlog_t *zlogp)
{
- zone_dochandle_t handle;
struct zone_nwiftab nwiftab, loopback_iftab;
zoneid_t zoneid;
@@ -2418,29 +2419,19 @@ configure_shared_network_interfaces(zlog_t *zlogp)
return (-1);
}
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
- if (zonecfg_setnwifent(handle) == Z_OK) {
+ if (zonecfg_setnwifent(snap_hndl) == Z_OK) {
for (;;) {
- if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
+ if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK)
break;
+ nwifent_free_attrs(&nwiftab);
if (configure_one_interface(zlogp, zoneid, &nwiftab) !=
Z_OK) {
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
return (-1);
}
}
- (void) zonecfg_endnwifent(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
}
- zonecfg_fini_handle(handle);
if (is_system_labeled()) {
/*
* Labeled zones share the loopback interface
@@ -2894,7 +2885,6 @@ free_ip_interface(zone_addr_list_t *zalist)
static int
configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
{
- zone_dochandle_t handle;
struct zone_nwiftab nwiftab;
char rootpath[MAXPATHLEN];
char path[MAXPATHLEN];
@@ -2903,30 +2893,18 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
boolean_t added = B_FALSE;
zone_addr_list_t *zalist = NULL, *new;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
-
- if (zonecfg_setnwifent(handle) != Z_OK) {
- zonecfg_fini_handle(handle);
+ if (zonecfg_setnwifent(snap_hndl) != Z_OK)
return (0);
- }
for (;;) {
- if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
+ if (zonecfg_getnwifent(snap_hndl, &nwiftab) != Z_OK)
break;
+ nwifent_free_attrs(&nwiftab);
if (prof == NULL) {
if (zone_get_devroot(zone_name, rootpath,
sizeof (rootpath)) != Z_OK) {
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
zerror(zlogp, B_TRUE,
"unable to determine dev root");
return (-1);
@@ -2934,8 +2912,7 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
(void) snprintf(path, sizeof (path), "%s%s", rootpath,
"/dev");
if (di_prof_init(path, &prof) != 0) {
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
zerror(zlogp, B_TRUE,
"failed to initialize profile");
return (-1);
@@ -2959,17 +2936,17 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
nwiftab.zone_nwif_physical) == 0) {
added = B_TRUE;
} else {
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
- zerror(zlogp, B_TRUE, "failed to add network device");
- return (-1);
+ /*
+ * Failed to add network device, but the brand hook
+ * might be doing this for us, so keep silent.
+ */
+ continue;
}
/* set up the new IP interface, and add them all later */
new = malloc(sizeof (*new));
if (new == NULL) {
zerror(zlogp, B_TRUE, "no memory for %s",
nwiftab.zone_nwif_physical);
- zonecfg_fini_handle(handle);
free_ip_interface(zalist);
}
bzero(new, sizeof (*new));
@@ -2979,16 +2956,14 @@ configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
}
if (zalist != NULL) {
if ((errno = add_net(zlogp, zoneid, zalist)) != 0) {
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
zerror(zlogp, B_TRUE, "failed to add address");
free_ip_interface(zalist);
return (-1);
}
free_ip_interface(zalist);
}
- (void) zonecfg_endnwifent(handle);
- zonecfg_fini_handle(handle);
+ (void) zonecfg_endnwifent(snap_hndl);
if (prof != NULL && added) {
if (di_prof_commit(prof) != 0) {
@@ -3124,48 +3099,23 @@ remove_datalink_protect(zlog_t *zlogp, zoneid_t zoneid)
/* datalink does not belong to the GZ */
continue;
}
- if (dlstatus != DLADM_STATUS_OK) {
+ if (dlstatus != DLADM_STATUS_OK)
zerror(zlogp, B_FALSE,
+ "clear 'protection' link property: %s",
dladm_status2str(dlstatus, dlerr));
- free(dllinks);
- return (-1);
- }
+
dlstatus = dladm_set_linkprop(dld_handle, *dllink,
"allowed-ips", NULL, 0, DLADM_OPT_ACTIVE);
- if (dlstatus != DLADM_STATUS_OK) {
+ if (dlstatus != DLADM_STATUS_OK)
zerror(zlogp, B_FALSE,
+ "clear 'allowed-ips' link property: %s",
dladm_status2str(dlstatus, dlerr));
- free(dllinks);
- return (-1);
- }
}
free(dllinks);
return (0);
}
static int
-unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
-{
- int dlnum = 0;
-
- /*
- * The kernel shutdown callback for the dls module should have removed
- * all datalinks from this zone. If any remain, then there's a
- * problem.
- */
- if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
- zerror(zlogp, B_TRUE, "unable to list network interfaces");
- return (-1);
- }
- if (dlnum != 0) {
- zerror(zlogp, B_FALSE,
- "datalinks remain in zone after shutdown");
- return (-1);
- }
- return (0);
-}
-
-static int
tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
{
@@ -3247,26 +3197,14 @@ static int
get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
{
int error = -1;
- zone_dochandle_t handle;
char *privname = NULL;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
-
if (ALT_MOUNT(mount_cmd)) {
zone_iptype_t iptype;
- const char *curr_iptype;
+ const char *curr_iptype = NULL;
- if (zonecfg_get_iptype(handle, &iptype) != Z_OK) {
+ if (zonecfg_get_iptype(snap_hndl, &iptype) != Z_OK) {
zerror(zlogp, B_TRUE, "unable to determine ip-type");
- zonecfg_fini_handle(handle);
return (-1);
}
@@ -3279,17 +3217,15 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
break;
}
- if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) {
- zonecfg_fini_handle(handle);
+ if (zonecfg_default_privset(privs, curr_iptype) == Z_OK)
return (0);
- }
+
zerror(zlogp, B_FALSE,
"failed to determine the zone's default privilege set");
- zonecfg_fini_handle(handle);
return (-1);
}
- switch (zonecfg_get_privset(handle, privs, &privname)) {
+ switch (zonecfg_get_privset(snap_hndl, privs, &privname)) {
case Z_OK:
error = 0;
break;
@@ -3312,7 +3248,6 @@ get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
}
free(privname);
- zonecfg_fini_handle(handle);
return (error);
}
@@ -3325,7 +3260,6 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
nvlist_t **nvlv = NULL;
int rctlcount = 0;
int error = -1;
- zone_dochandle_t handle;
struct zone_rctltab rctltab;
rctlblk_t *rctlblk = NULL;
uint64_t maxlwps;
@@ -3334,16 +3268,6 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
*bufp = NULL;
*bufsizep = 0;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
-
rctltab.zone_rctl_valptr = NULL;
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
@@ -3356,18 +3280,18 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
* max-processes property. If only the max-processes property is set,
* we add a max-lwps property with a limit derived from max-processes.
*/
- if (zonecfg_get_aliased_rctl(handle, ALIAS_MAXPROCS, &maxprocs)
+ if (zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXPROCS, &maxprocs)
== Z_OK &&
- zonecfg_get_aliased_rctl(handle, ALIAS_MAXLWPS, &maxlwps)
+ zonecfg_get_aliased_rctl(snap_hndl, ALIAS_MAXLWPS, &maxlwps)
== Z_NO_ENTRY) {
- if (zonecfg_set_aliased_rctl(handle, ALIAS_MAXLWPS,
+ if (zonecfg_set_aliased_rctl(snap_hndl, ALIAS_MAXLWPS,
maxprocs * LWPS_PER_PROCESS) != Z_OK) {
zerror(zlogp, B_FALSE, "unable to set max-lwps alias");
goto out;
}
}
- if (zonecfg_setrctlent(handle) != Z_OK) {
+ if (zonecfg_setrctlent(snap_hndl) != Z_OK) {
zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
goto out;
}
@@ -3376,7 +3300,7 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
zerror(zlogp, B_TRUE, "memory allocation failed");
goto out;
}
- while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
+ while (zonecfg_getrctlent(snap_hndl, &rctltab) == Z_OK) {
struct zone_rctlvaltab *rctlval;
uint_t i, count;
const char *name = rctltab.zone_rctl_name;
@@ -3458,7 +3382,7 @@ get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
nvlv = NULL;
rctlcount++;
}
- (void) zonecfg_endrctlent(handle);
+ (void) zonecfg_endrctlent(snap_hndl);
if (rctlcount == 0) {
error = 0;
@@ -3483,8 +3407,6 @@ out:
nvlist_free(nvl);
if (nvlv != NULL)
free(nvlv);
- if (handle != NULL)
- zonecfg_fini_handle(handle);
return (error);
}
@@ -3500,7 +3422,7 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr)
> sizeof (cmdbuf))
return (-1);
- if (do_subproc(zlogp, cmdbuf, retstr) != 0)
+ if (do_subproc(zlogp, cmdbuf, retstr, B_FALSE) != 0)
return (-1);
return (0);
@@ -3509,7 +3431,6 @@ get_implicit_datasets(zlog_t *zlogp, char **retstr)
static int
get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
{
- zone_dochandle_t handle;
struct zone_dstab dstab;
size_t total, offset, len;
int error = -1;
@@ -3520,30 +3441,20 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
*bufp = NULL;
*bufsizep = 0;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
-
if (get_implicit_datasets(zlogp, &implicit_datasets) != 0) {
zerror(zlogp, B_FALSE, "getting implicit datasets failed");
goto out;
}
- if (zonecfg_setdsent(handle) != Z_OK) {
+ if (zonecfg_setdsent(snap_hndl) != Z_OK) {
zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
goto out;
}
total = 0;
- while (zonecfg_getdsent(handle, &dstab) == Z_OK)
+ while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK)
total += strlen(dstab.zone_dataset_name) + 1;
- (void) zonecfg_enddsent(handle);
+ (void) zonecfg_enddsent(snap_hndl);
if (implicit_datasets != NULL)
implicit_len = strlen(implicit_datasets);
@@ -3560,12 +3471,12 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
goto out;
}
- if (zonecfg_setdsent(handle) != Z_OK) {
+ if (zonecfg_setdsent(snap_hndl) != Z_OK) {
zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
goto out;
}
offset = 0;
- while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+ while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) {
len = strlen(dstab.zone_dataset_name);
(void) strlcpy(str + offset, dstab.zone_dataset_name,
total - offset);
@@ -3573,7 +3484,7 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
if (offset < total - 1)
str[offset++] = ',';
}
- (void) zonecfg_enddsent(handle);
+ (void) zonecfg_enddsent(snap_hndl);
if (implicit_len > 0)
(void) strlcpy(str + offset, implicit_datasets, total - offset);
@@ -3585,8 +3496,6 @@ get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
out:
if (error != 0 && str != NULL)
free(str);
- if (handle != NULL)
- zonecfg_fini_handle(handle);
if (implicit_datasets != NULL)
free(implicit_datasets);
@@ -3596,40 +3505,26 @@ out:
static int
validate_datasets(zlog_t *zlogp)
{
- zone_dochandle_t handle;
struct zone_dstab dstab;
zfs_handle_t *zhp;
libzfs_handle_t *hdl;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (-1);
- }
- if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
+ if (zonecfg_setdsent(snap_hndl) != Z_OK) {
zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (-1);
- }
-
- if (zonecfg_setdsent(handle) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
return (-1);
}
if ((hdl = libzfs_init()) == NULL) {
zerror(zlogp, B_FALSE, "opening ZFS library");
- zonecfg_fini_handle(handle);
return (-1);
}
- while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+ while (zonecfg_getdsent(snap_hndl, &dstab) == Z_OK) {
if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
ZFS_TYPE_FILESYSTEM)) == NULL) {
zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
dstab.zone_dataset_name);
- zonecfg_fini_handle(handle);
libzfs_fini(hdl);
return (-1);
}
@@ -3644,7 +3539,6 @@ validate_datasets(zlog_t *zlogp)
zerror(zlogp, B_FALSE, "cannot set 'zoned' "
"property for ZFS dataset '%s'\n",
dstab.zone_dataset_name);
- zonecfg_fini_handle(handle);
zfs_close(zhp);
libzfs_fini(hdl);
return (-1);
@@ -3652,9 +3546,8 @@ validate_datasets(zlog_t *zlogp)
zfs_close(zhp);
}
- (void) zonecfg_enddsent(handle);
+ (void) zonecfg_enddsent(snap_hndl);
- zonecfg_fini_handle(handle);
libzfs_fini(hdl);
return (0);
@@ -4388,62 +4281,25 @@ duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
}
/*
- * Set memory cap and pool info for the zone's resource management
- * configuration.
+ * Set pool info for the zone's resource management configuration.
*/
static int
setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
{
int res;
uint64_t tmp;
- struct zone_mcaptab mcap;
char sched[MAXNAMELEN];
- zone_dochandle_t handle = NULL;
char pool_err[128];
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (Z_BAD_HANDLE);
- }
-
- if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
- zonecfg_fini_handle(handle);
- return (res);
- }
-
- /*
- * If a memory cap is configured, set the cap in the kernel using
- * zone_setattr() and make sure the rcapd SMF service is enabled.
- */
- if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
- uint64_t num;
- char smf_err[128];
-
- num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10);
- if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
- zerror(zlogp, B_TRUE, "could not set zone memory cap");
- zonecfg_fini_handle(handle);
- return (Z_INVAL);
- }
-
- if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
- zerror(zlogp, B_FALSE, "enabling system/rcap service "
- "failed: %s", smf_err);
- zonecfg_fini_handle(handle);
- return (Z_INVAL);
- }
- }
-
/* Get the scheduling class set in the zone configuration. */
- if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
+ if (zonecfg_get_sched_class(snap_hndl, sched, sizeof (sched)) == Z_OK &&
strlen(sched) > 0) {
if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched,
strlen(sched)) == -1)
zerror(zlogp, B_TRUE, "WARNING: unable to set the "
"default scheduling class");
- } else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp)
+ } else if (zonecfg_get_aliased_rctl(snap_hndl, ALIAS_SHARES, &tmp)
== Z_OK) {
/*
* If the zone has the zone.cpu-shares rctl set then we want to
@@ -4454,7 +4310,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
*/
char class_name[PC_CLNMSZ];
- if (zonecfg_get_dflt_sched_class(handle, class_name,
+ if (zonecfg_get_dflt_sched_class(snap_hndl, class_name,
sizeof (class_name)) != Z_OK) {
zerror(zlogp, B_FALSE, "WARNING: unable to determine "
"the zone's scheduling class");
@@ -4487,7 +4343,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
* right thing in all cases (reuse or create) based on the current
* zonecfg.
*/
- if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
+ if ((res = zonecfg_bind_tmp_pool(snap_hndl, zoneid, pool_err,
sizeof (pool_err))) != Z_OK) {
if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND)
zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting "
@@ -4496,14 +4352,13 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
else
zerror(zlogp, B_FALSE, "could not bind zone to "
"temporary pool: %s", zonecfg_strerror(res));
- zonecfg_fini_handle(handle);
return (Z_POOL_BIND);
}
/*
* Check if we need to warn about poold not being enabled.
*/
- if (zonecfg_warn_poold(handle)) {
+ if (zonecfg_warn_poold(snap_hndl)) {
zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has "
"been specified\nbut the dynamic pool service is not "
"enabled.\nThe system will not dynamically adjust the\n"
@@ -4513,7 +4368,7 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
}
/* The following is a warning, not an error. */
- if ((res = zonecfg_bind_pool(handle, zoneid, pool_err,
+ if ((res = zonecfg_bind_pool(snap_hndl, zoneid, pool_err,
sizeof (pool_err))) != Z_OK) {
if (res == Z_POOL_BIND)
zerror(zlogp, B_FALSE, "WARNING: unable to bind to "
@@ -4527,10 +4382,9 @@ setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
}
/* Update saved pool name in case it has changed */
- (void) zonecfg_get_poolname(handle, zone_name, pool_name,
+ (void) zonecfg_get_poolname(snap_hndl, zone_name, pool_name,
sizeof (pool_name));
- zonecfg_fini_handle(handle);
return (Z_OK);
}
@@ -4631,33 +4485,28 @@ setup_zone_fs_allowed(zone_dochandle_t handle, zlog_t *zlogp, zoneid_t zoneid)
}
static int
-setup_zone_attrs(zlog_t *zlogp, char *zone_namep, zoneid_t zoneid)
+setup_zone_attrs(zlog_t *zlogp, zoneid_t zoneid)
{
- zone_dochandle_t handle;
int res = Z_OK;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE, "getting zone configuration handle");
- return (Z_BAD_HANDLE);
- }
- if ((res = zonecfg_get_snapshot_handle(zone_namep, handle)) != Z_OK) {
- zerror(zlogp, B_FALSE, "invalid configuration");
+ if ((res = setup_zone_hostid(snap_hndl, zlogp, zoneid)) != Z_OK)
goto out;
- }
- if ((res = setup_zone_hostid(handle, zlogp, zoneid)) != Z_OK)
- goto out;
-
- if ((res = setup_zone_fs_allowed(handle, zlogp, zoneid)) != Z_OK)
+ if ((res = setup_zone_fs_allowed(snap_hndl, zlogp, zoneid)) != Z_OK)
goto out;
out:
- zonecfg_fini_handle(handle);
return (res);
}
+/*
+ * The zone_did is a persistent debug ID. Each zone should have a unique ID
+ * in the kernel. This is used for things like DTrace which want to monitor
+ * zones across reboots. They can't use the zoneid since that changes on
+ * each boot.
+ */
zoneid_t
-vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
+vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zone_did)
{
zoneid_t rval = -1;
priv_set_t *privs;
@@ -4673,7 +4522,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
tsol_zcent_t *zcent = NULL;
int match = 0;
int doi = 0;
- int flags;
+ int flags = -1;
zone_iptype_t iptype;
if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
@@ -4695,6 +4544,8 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
flags = ZCF_NET_EXCL;
break;
}
+ if (flags == -1)
+ abort();
if ((privs = priv_allocset()) == NULL) {
zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
@@ -4798,7 +4649,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
xerr = 0;
if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel,
- flags)) == -1) {
+ flags, zone_did)) == -1) {
if (xerr == ZE_AREMOUNTS) {
if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
zerror(zlogp, B_FALSE,
@@ -4844,7 +4695,7 @@ vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
struct brand_attr attr;
char modname[MAXPATHLEN];
- if (setup_zone_attrs(zlogp, zone_name, zoneid) != Z_OK)
+ if (setup_zone_attrs(zlogp, zoneid) != Z_OK)
goto error;
if ((bh = brand_open(brand_name)) == NULL) {
@@ -4902,6 +4753,8 @@ error:
}
if (rctlbuf != NULL)
free(rctlbuf);
+ if (zfsbuf != NULL)
+ free(zfsbuf);
priv_freeset(privs);
if (fp != NULL)
zonecfg_close_scratch(fp);
@@ -5044,6 +4897,8 @@ vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
return (-1);
}
break;
+ default:
+ abort();
}
}
@@ -5119,7 +4974,8 @@ unmounted:
}
int
-vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
+vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting,
+ boolean_t debug)
{
char *kzone;
zoneid_t zoneid;
@@ -5158,16 +5014,12 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
goto error;
}
- if (remove_datalink_pool(zlogp, zoneid) != 0) {
+ if (remove_datalink_pool(zlogp, zoneid) != 0)
zerror(zlogp, B_FALSE, "unable clear datalink pool property");
- goto error;
- }
- if (remove_datalink_protect(zlogp, zoneid) != 0) {
+ if (remove_datalink_protect(zlogp, zoneid) != 0)
zerror(zlogp, B_FALSE,
"unable clear datalink protect property");
- goto error;
- }
/*
* The datalinks assigned to the zone will be removed from the NGZ as
@@ -5207,7 +5059,7 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
brand_close(bh);
if ((strlen(cmdbuf) > EXEC_LEN) &&
- (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
+ (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
goto error;
}
@@ -5239,12 +5091,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
}
break;
case ZS_EXCLUSIVE:
- if (unconfigure_exclusive_network_interfaces(zlogp,
- zoneid) != 0) {
- zerror(zlogp, B_FALSE, "unable to unconfigure "
- "network interfaces in zone");
- goto error;
- }
status = dladm_zone_halt(dld_handle, zoneid);
if (status != DLADM_STATUS_OK) {
zerror(zlogp, B_FALSE, "unable to notify "
@@ -5281,14 +5127,9 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
if (rebooting) {
struct zone_psettab pset_tab;
- zone_dochandle_t handle;
- if ((handle = zonecfg_init_handle()) != NULL &&
- zonecfg_get_handle(zone_name, handle) == Z_OK &&
- zonecfg_lookup_pset(handle, &pset_tab) == Z_OK)
+ if (zonecfg_lookup_pset(snap_hndl, &pset_tab) == Z_OK)
destroy_tmp_pool = B_FALSE;
-
- zonecfg_fini_handle(handle);
}
if (destroy_tmp_pool) {
diff --git a/usr/src/cmd/zoneadmd/zcons.c b/usr/src/cmd/zoneadmd/zcons.c
index 963bfd3100..d5a33133b7 100644
--- a/usr/src/cmd/zoneadmd/zcons.c
+++ b/usr/src/cmd/zoneadmd/zcons.c
@@ -22,7 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
/*
@@ -117,9 +117,10 @@
#define CONSOLE_SOCKPATH ZONES_TMPDIR "/%s.console_sock"
+#define ZCONS_RETRY 10
+
static int serverfd = -1; /* console server unix domain socket fd */
char boot_args[BOOTARGS_MAX];
-char bad_boot_arg[BOOTARGS_MAX];
/*
* The eventstream is a simple one-directional flow of messages from the
@@ -129,7 +130,10 @@ char bad_boot_arg[BOOTARGS_MAX];
*/
static int eventstream[2];
-
+/* flag used to cope with race creating master zcons devlink */
+static boolean_t master_zcons_failed = B_FALSE;
+/* flag to track if we've seen a state change when there is no master zcons */
+static boolean_t state_changed = B_FALSE;
int
eventstream_init()
@@ -321,7 +325,7 @@ destroy_console_devs(zlog_t *zlogp)
* interfaces to instantiate a new zone console node. We do a lot of
* sanity checking, and are careful to reuse a console if one exists.
*
- * Once the device is in the device tree, we kick devfsadm via di_init_devs()
+ * Once the device is in the device tree, we kick devfsadm via di_devlink_init()
* to ensure that the appropriate symlinks (to the master and slave console
* devices) are placed in /dev in the global zone.
*/
@@ -407,43 +411,63 @@ devlinks:
* Open the master side of the console and issue the ZC_HOLDSLAVE ioctl,
* which will cause the master to retain a reference to the slave.
* This prevents ttymon from blowing through the slave's STREAMS anchor.
+ *
+ * In very rare cases the open returns ENOENT if devfs doesn't have
+ * everything setup yet due to heavy zone startup load. Wait for
+ * 1 sec. and retry a few times. Even if we can't setup the zone's
+ * console, we still go ahead and boot the zone.
*/
(void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s",
zone_name, ZCONS_MASTER_NAME);
- if ((masterfd = open(conspath, O_RDWR | O_NOCTTY)) == -1) {
+ for (i = 0; i < ZCONS_RETRY; i++) {
+ masterfd = open(conspath, O_RDWR | O_NOCTTY);
+ if (masterfd >= 0 || errno != ENOENT)
+ break;
+ (void) sleep(1);
+ }
+ if (masterfd == -1) {
zerror(zlogp, B_TRUE, "ERROR: could not open master side of "
"zone console for %s to acquire slave handle", zone_name);
- goto error;
+ master_zcons_failed = B_TRUE;
}
+
(void) snprintf(conspath, sizeof (conspath), "/dev/zcons/%s/%s",
zone_name, ZCONS_SLAVE_NAME);
- if ((slavefd = open(conspath, O_RDWR | O_NOCTTY)) == -1) {
+ for (i = 0; i < ZCONS_RETRY; i++) {
+ slavefd = open(conspath, O_RDWR | O_NOCTTY);
+ if (slavefd >= 0 || errno != ENOENT)
+ break;
+ (void) sleep(1);
+ }
+ if (slavefd == -1)
zerror(zlogp, B_TRUE, "ERROR: could not open slave side of zone"
" console for %s to acquire slave handle", zone_name);
- (void) close(masterfd);
- goto error;
- }
+
/*
* This ioctl can occasionally return ENXIO if devfs doesn't have
* everything plumbed up yet due to heavy zone startup load. Wait for
* 1 sec. and retry a few times before we fail to boot the zone.
*/
- for (i = 0; i < 5; i++) {
- if (ioctl(masterfd, ZC_HOLDSLAVE, (caddr_t)(intptr_t)slavefd)
- == 0) {
- rv = 0;
- break;
- } else if (errno != ENXIO) {
- break;
+ if (masterfd != -1 && slavefd != -1) {
+ for (i = 0; i < ZCONS_RETRY; i++) {
+ if (ioctl(masterfd, ZC_HOLDSLAVE,
+ (caddr_t)(intptr_t)slavefd) == 0) {
+ rv = 0;
+ break;
+ } else if (errno != ENXIO) {
+ break;
+ }
+ (void) sleep(1);
}
- (void) sleep(1);
+ if (rv != 0)
+ zerror(zlogp, B_TRUE, "ERROR: error while acquiring "
+ "slave handle of zone console for %s", zone_name);
}
- if (rv != 0)
- zerror(zlogp, B_TRUE, "ERROR: error while acquiring slave "
- "handle of zone console for %s", zone_name);
- (void) close(slavefd);
- (void) close(masterfd);
+ if (slavefd != -1)
+ (void) close(slavefd);
+ if (masterfd != -1)
+ (void) close(masterfd);
error:
if (ddef_hdl)
@@ -653,14 +677,6 @@ event_message(int clifd, char *clilocale, zone_evt_t evt)
case Z_EVT_ZONE_BOOTFAILED:
str = "NOTICE: Zone boot failed";
break;
- case Z_EVT_ZONE_BADARGS:
- /*LINTED*/
- (void) snprintf(lmsg, sizeof (lmsg),
- localize_msg(clilocale,
- "WARNING: Ignoring invalid boot arguments: %s"),
- bad_boot_arg);
- lstr = lmsg;
- break;
default:
return;
}
@@ -854,7 +870,6 @@ init_console(zlog_t *zlogp)
if (init_console_dev(zlogp) == -1) {
zerror(zlogp, B_FALSE,
"console setup: device initialization failed");
- return (-1);
}
if ((serverfd = init_console_sock(zlogp)) == -1) {
@@ -866,6 +881,17 @@ init_console(zlog_t *zlogp)
}
/*
+ * Maintain a simple flag that tracks if we have seen at least one state
+ * change. This is currently only used to handle the special case where we are
+ * running without a console device, which is what normally drives shutdown.
+ */
+void
+zcons_statechanged()
+{
+ state_changed = B_TRUE;
+}
+
+/*
* serve_console() is the master loop for driving console I/O. It is also the
* routine which is ultimately responsible for "pulling the plug" on zoneadmd
* when it realizes that the daemon should shut down.
@@ -883,6 +909,7 @@ serve_console(zlog_t *zlogp)
int masterfd;
zone_state_t zstate;
char conspath[MAXPATHLEN];
+ static boolean_t cons_warned = B_FALSE;
(void) snprintf(conspath, sizeof (conspath),
"/dev/zcons/%s/%s", zone_name, ZCONS_MASTER_NAME);
@@ -890,6 +917,46 @@ serve_console(zlog_t *zlogp)
for (;;) {
masterfd = open(conspath, O_RDWR|O_NONBLOCK|O_NOCTTY);
if (masterfd == -1) {
+ if (master_zcons_failed) {
+ /*
+ * If we don't have a console and the zone is
+ * not shutting down, there may have been a
+ * race/failure with devfs while creating the
+ * console. In this case we want to leave the
+ * zone up, even without a console, so
+ * periodically recheck.
+ */
+ int i;
+
+ /*
+ * In the normal flow of this loop, we use
+ * do_console_io to give things a chance to get
+ * going first. However, in this case we can't
+ * use that, so we have to wait for at least
+ * one state change before checking the state.
+ */
+ for (i = 0; i < 60; i++) {
+ if (state_changed)
+ break;
+ (void) sleep(1);
+ }
+
+ if (i < 60 && zone_get_state(zone_name,
+ &zstate) == Z_OK &&
+ (zstate == ZONE_STATE_READY ||
+ zstate == ZONE_STATE_RUNNING)) {
+ if (!cons_warned) {
+ zerror(zlogp, B_FALSE,
+ "WARNING: missing zone "
+ "console for %s",
+ zone_name);
+ cons_warned = B_TRUE;
+ }
+ (void) sleep(ZCONS_RETRY);
+ continue;
+ }
+ }
+
zerror(zlogp, B_TRUE, "failed to open console master");
(void) mutex_lock(&lock);
goto death;
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c
index 7a9d88410d..bb53b01d16 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.c
+++ b/usr/src/cmd/zoneadmd/zoneadmd.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -68,6 +69,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
+#include <sys/time.h>
#include <bsm/adt.h>
#include <bsm/adt_event.h>
@@ -108,6 +110,7 @@
static char *progname;
char *zone_name; /* zone which we are managing */
+zone_dochandle_t snap_hndl; /* handle for snapshot created when ready */
char pool_name[MAXNAMELEN];
char default_brand[MAXNAMELEN];
char brand_name[MAXNAMELEN];
@@ -116,10 +119,11 @@ boolean_t zone_iscluster;
boolean_t zone_islabeled;
boolean_t shutdown_in_progress;
static zoneid_t zone_id;
+static zoneid_t zone_did = 0;
dladm_handle_t dld_handle = NULL;
-static char pre_statechg_hook[2 * MAXPATHLEN];
-static char post_statechg_hook[2 * MAXPATHLEN];
+char pre_statechg_hook[2 * MAXPATHLEN];
+char post_statechg_hook[2 * MAXPATHLEN];
char query_hook[2 * MAXPATHLEN];
zlog_t logsys;
@@ -141,6 +145,9 @@ boolean_t bringup_failure_recovery = B_FALSE; /* ignore certain failures */
#define DEFAULT_LOCALE "C"
+#define RSRC_NET "net"
+#define RSRC_DEV "device"
+
static const char *
z_cmd_name(zone_cmd_t zcmd)
{
@@ -257,34 +264,31 @@ zerror(zlog_t *zlogp, boolean_t use_strerror, const char *fmt, ...)
}
/*
- * Emit a warning for any boot arguments which are unrecognized. Since
- * Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we
+ * Since Solaris boot arguments are getopt(3c) compatible (see kernel(1m)), we
* put the arguments into an argv style array, use getopt to process them,
- * and put the resultant argument string back into outargs.
+ * and put the resultant argument string back into outargs. Non-Solaris brands
+ * may support alternate forms of boot arguments so we must handle that as well.
*
* During the filtering, we pull out any arguments which are truly "boot"
* arguments, leaving only those which are to be passed intact to the
* progenitor process. The one we support at the moment is -i, which
* indicates to the kernel which program should be launched as 'init'.
*
- * A return of Z_INVAL indicates specifically that the arguments are
- * not valid; this is a non-fatal error. Except for Z_OK, all other return
- * values are treated as fatal.
+ * Except for Z_OK, all other return values are treated as fatal.
*/
static int
filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
- char *init_file, char *badarg)
+ char *init_file)
{
int argc = 0, argc_save;
int i;
- int err;
+ int err = Z_OK;
char *arg, *lasts, **argv = NULL, **argv_save;
char zonecfg_args[BOOTARGS_MAX];
char scratchargs[BOOTARGS_MAX], *sargs;
char c;
bzero(outargs, BOOTARGS_MAX);
- bzero(badarg, BOOTARGS_MAX);
/*
* If the user didn't specify transient boot arguments, check
@@ -292,25 +296,10 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
* and use them if applicable.
*/
if (inargs == NULL || inargs[0] == '\0') {
- zone_dochandle_t handle;
- if ((handle = zonecfg_init_handle()) == NULL) {
- zerror(zlogp, B_TRUE,
- "getting zone configuration handle");
- return (Z_BAD_HANDLE);
- }
- err = zonecfg_get_snapshot_handle(zone_name, handle);
- if (err != Z_OK) {
- zerror(zlogp, B_FALSE,
- "invalid configuration snapshot");
- zonecfg_fini_handle(handle);
- return (Z_BAD_HANDLE);
- }
-
bzero(zonecfg_args, sizeof (zonecfg_args));
- (void) zonecfg_get_bootargs(handle, zonecfg_args,
+ (void) zonecfg_get_bootargs(snap_hndl, zonecfg_args,
sizeof (zonecfg_args));
inargs = zonecfg_args;
- zonecfg_fini_handle(handle);
}
if (strlen(inargs) >= BOOTARGS_MAX) {
@@ -390,36 +379,29 @@ filter_bootargs(zlog_t *zlogp, const char *inargs, char *outargs,
break;
case '?':
/*
- * We warn about unknown arguments but pass them
- * along anyway-- if someone wants to develop their
- * own init replacement, they can pass it whatever
- * args they want.
+ * If a brand has its own init, we need to pass along
+ * whatever the user provides. We use the entire
+ * unknown string here so that we correctly handle
+ * unknown long options (e.g. --debug).
*/
- err = Z_INVAL;
(void) snprintf(outargs, BOOTARGS_MAX,
- "%s -%c", outargs, optopt);
- (void) snprintf(badarg, BOOTARGS_MAX,
- "%s -%c", badarg, optopt);
+ "%s %s", outargs, argv[optind - 1]);
break;
}
}
/*
- * For Solaris Zones we warn about and discard non-option arguments.
- * Hence 'boot foo bar baz gub' --> 'boot'. However, to be similar
- * to the kernel, we concat up all the other remaining boot args.
- * and warn on them as a group.
+ * We need to pass along everything else since we don't know what
+ * the brand's init is expecting. For example, an argument list like:
+ * --confdir /foo --debug
+ * will cause the getopt parsing to stop at '/foo' but we need to pass
+ * that on, along with the '--debug'. This does mean that we require
+ * any of our known options (-ifms) to preceed the brand-specific ones.
*/
- if (optind < argc) {
- err = Z_INVAL;
- while (optind < argc) {
- (void) snprintf(badarg, BOOTARGS_MAX, "%s%s%s",
- badarg, strlen(badarg) > 0 ? " " : "",
- argv[optind]);
- optind++;
- }
- zerror(zlogp, B_FALSE, "WARNING: Unused or invalid boot "
- "arguments `%s'.", badarg);
+ while (optind < argc) {
+ (void) snprintf(outargs, BOOTARGS_MAX, "%s %s", outargs,
+ argv[optind]);
+ optind++;
}
done:
@@ -458,7 +440,7 @@ mkzonedir(zlog_t *zlogp)
* Run the brand's pre-state change callback, if it exists.
*/
static int
-brand_prestatechg(zlog_t *zlogp, int state, int cmd)
+brand_prestatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
{
char cmdbuf[2 * MAXPATHLEN];
const char *altroot;
@@ -471,7 +453,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd)
state, cmd, altroot) > sizeof (cmdbuf))
return (-1);
- if (do_subproc(zlogp, cmdbuf, NULL) != 0)
+ if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
return (-1);
return (0);
@@ -481,7 +463,7 @@ brand_prestatechg(zlog_t *zlogp, int state, int cmd)
* Run the brand's post-state change callback, if it exists.
*/
static int
-brand_poststatechg(zlog_t *zlogp, int state, int cmd)
+brand_poststatechg(zlog_t *zlogp, int state, int cmd, boolean_t debug)
{
char cmdbuf[2 * MAXPATHLEN];
const char *altroot;
@@ -494,7 +476,7 @@ brand_poststatechg(zlog_t *zlogp, int state, int cmd)
state, cmd, altroot) > sizeof (cmdbuf))
return (-1);
- if (do_subproc(zlogp, cmdbuf, NULL) != 0)
+ if (do_subproc(zlogp, cmdbuf, NULL, debug) != 0)
return (-1);
return (0);
@@ -533,35 +515,44 @@ notify_zonestatd(zoneid_t zoneid)
* subcommand.
*/
static int
-zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate)
+zone_ready(zlog_t *zlogp, zone_mnt_t mount_cmd, int zstate, boolean_t debug)
{
int err;
+ boolean_t snapped = B_FALSE;
- if (brand_prestatechg(zlogp, zstate, Z_READY) != 0)
- return (-1);
-
+ if ((snap_hndl = zonecfg_init_handle()) == NULL) {
+ zerror(zlogp, B_TRUE, "getting zone configuration handle");
+ goto bad;
+ }
if ((err = zonecfg_create_snapshot(zone_name)) != Z_OK) {
zerror(zlogp, B_FALSE, "unable to create snapshot: %s",
zonecfg_strerror(err));
goto bad;
}
+ snapped = B_TRUE;
- if ((zone_id = vplat_create(zlogp, mount_cmd)) == -1) {
- if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
- zerror(zlogp, B_FALSE, "destroying snapshot: %s",
- zonecfg_strerror(err));
+ if (zonecfg_get_snapshot_handle(zone_name, snap_hndl) != Z_OK) {
+ zerror(zlogp, B_FALSE, "invalid configuration snapshot");
goto bad;
}
+
+ if (zone_did == 0)
+ zone_did = zone_get_did(zone_name);
+
+ if (brand_prestatechg(zlogp, zstate, Z_READY, debug) != 0)
+ goto bad;
+
+ if ((zone_id = vplat_create(zlogp, mount_cmd, zone_did)) == -1)
+ goto bad;
+
if (vplat_bringup(zlogp, mount_cmd, zone_id) != 0) {
bringup_failure_recovery = B_TRUE;
- (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE);
- if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
- zerror(zlogp, B_FALSE, "destroying snapshot: %s",
- zonecfg_strerror(err));
+ (void) vplat_teardown(NULL, (mount_cmd != Z_MNT_BOOT), B_FALSE,
+ debug);
goto bad;
}
- if (brand_poststatechg(zlogp, zstate, Z_READY) != 0)
+ if (brand_poststatechg(zlogp, zstate, Z_READY, debug) != 0)
goto bad;
return (0);
@@ -571,7 +562,13 @@ bad:
* If something goes wrong, we up the zones's state to the target
* state, READY, and then invoke the hook as if we're halting.
*/
- (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT);
+ (void) brand_poststatechg(zlogp, ZONE_STATE_READY, Z_HALT, debug);
+ if (snapped)
+ if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
+ zerror(zlogp, B_FALSE, "destroying snapshot: %s",
+ zonecfg_strerror(err));
+ zonecfg_fini_handle(snap_hndl);
+ snap_hndl = NULL;
return (-1);
}
@@ -686,6 +683,8 @@ mount_early_fs(void *data, const char *spec, const char *dir,
char opt_buf[MAX_MNTOPT_STR];
int optlen = 0;
int mflag = MS_DATA;
+ int i;
+ int ret;
(void) ct_tmpl_clear(tmpl_fd);
/*
@@ -713,9 +712,26 @@ mount_early_fs(void *data, const char *spec, const char *dir,
optlen = MAX_MNTOPT_STR;
mflag = MS_OPTIONSTR;
}
- if (mount(spec, dir, mflag, fstype, NULL, 0, opt, optlen) != 0)
- _exit(errno);
- _exit(0);
+
+ /*
+ * There is an obscure race condition which can cause mount
+ * to return EBUSY. This happens for example on the mount
+ * of the zone's /etc/svc/volatile file system if there is
+ * a GZ process running svcs -Z, which will touch the
+ * mountpoint, just as we're trying to do the mount. To cope
+ * with this, we retry up to 3 times to let this transient
+ * process get out of the way.
+ */
+ for (i = 0; i < 3; i++) {
+ ret = 0;
+ if (mount(spec, dir, mflag, fstype, NULL, 0, opt,
+ optlen) != 0)
+ ret = errno;
+ if (ret != EBUSY)
+ break;
+ (void) sleep(1);
+ }
+ _exit(ret);
}
/* parent */
@@ -739,12 +755,150 @@ mount_early_fs(void *data, const char *spec, const char *dir,
}
/*
+ * env variable name format
+ * _ZONECFG_{resource name}_{identifying attr. name}_{property name}
+ * Any dashes (-) in the property names are replaced with underscore (_).
+ */
+static void
+set_zonecfg_env(char *rsrc, char *attr, char *name, char *val)
+{
+ char *p;
+ char nm[MAXNAMELEN];
+
+ if (attr == NULL)
+ (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s", rsrc,
+ name);
+ else
+ (void) snprintf(nm, sizeof (nm), "_ZONECFG_%s_%s_%s", rsrc,
+ attr, name);
+
+ p = nm;
+ while ((p = strchr(p, '-')) != NULL)
+ *p++ = '_';
+
+ (void) setenv(nm, val, 1);
+}
+
+/*
+ * Export zonecfg network and device properties into environment for the boot
+ * and state change hooks.
+ * If debug is true, export the brand hook debug env. variable as well.
+ *
+ * We could export more of the config in the future, as necessary.
+ */
+static int
+setup_subproc_env(boolean_t debug)
+{
+ int res;
+ struct zone_nwiftab ntab;
+ struct zone_devtab dtab;
+ struct zone_attrtab atab;
+ char net_resources[MAXNAMELEN * 2];
+ char dev_resources[MAXNAMELEN * 2];
+
+ /* snap_hndl is null when called through the set_brand_env code path */
+ if (snap_hndl == NULL)
+ return (Z_OK);
+
+ net_resources[0] = '\0';
+ if ((res = zonecfg_setnwifent(snap_hndl)) != Z_OK)
+ goto done;
+
+ while (zonecfg_getnwifent(snap_hndl, &ntab) == Z_OK) {
+ struct zone_res_attrtab *rap;
+ char *phys;
+
+ phys = ntab.zone_nwif_physical;
+
+ (void) strlcat(net_resources, phys, sizeof (net_resources));
+ (void) strlcat(net_resources, " ", sizeof (net_resources));
+
+ set_zonecfg_env(RSRC_NET, phys, "physical", phys);
+
+ set_zonecfg_env(RSRC_NET, phys, "address",
+ ntab.zone_nwif_address);
+ set_zonecfg_env(RSRC_NET, phys, "allowed-address",
+ ntab.zone_nwif_allowed_address);
+ set_zonecfg_env(RSRC_NET, phys, "defrouter",
+ ntab.zone_nwif_defrouter);
+ set_zonecfg_env(RSRC_NET, phys, "global-nic",
+ ntab.zone_nwif_gnic);
+ set_zonecfg_env(RSRC_NET, phys, "mac-addr", ntab.zone_nwif_mac);
+ set_zonecfg_env(RSRC_NET, phys, "vlan-id",
+ ntab.zone_nwif_vlan_id);
+
+ for (rap = ntab.zone_nwif_attrp; rap != NULL;
+ rap = rap->zone_res_attr_next)
+ set_zonecfg_env(RSRC_NET, phys, rap->zone_res_attr_name,
+ rap->zone_res_attr_value);
+ nwifent_free_attrs(&ntab);
+ }
+
+ (void) setenv("_ZONECFG_net_resources", net_resources, 1);
+
+ (void) zonecfg_endnwifent(snap_hndl);
+
+ if ((res = zonecfg_setdevent(snap_hndl)) != Z_OK)
+ goto done;
+
+ while (zonecfg_getdevent(snap_hndl, &dtab) == Z_OK) {
+ struct zone_res_attrtab *rap;
+ char *match;
+
+ match = dtab.zone_dev_match;
+
+ (void) strlcat(dev_resources, match, sizeof (dev_resources));
+ (void) strlcat(dev_resources, " ", sizeof (dev_resources));
+
+ for (rap = dtab.zone_dev_attrp; rap != NULL;
+ rap = rap->zone_res_attr_next)
+ set_zonecfg_env(RSRC_DEV, match,
+ rap->zone_res_attr_name, rap->zone_res_attr_value);
+ }
+
+ (void) zonecfg_enddevent(snap_hndl);
+
+ if ((res = zonecfg_setattrent(snap_hndl)) != Z_OK)
+ goto done;
+
+ while (zonecfg_getattrent(snap_hndl, &atab) == Z_OK) {
+ set_zonecfg_env("attr", NULL, atab.zone_attr_name,
+ atab.zone_attr_value);
+ }
+
+ (void) zonecfg_endattrent(snap_hndl);
+
+ if (debug)
+ (void) setenv("_ZONEADMD_brand_debug", "1", 1);
+ else
+ (void) setenv("_ZONEADMD_brand_debug", "", 1);
+
+ res = Z_OK;
+
+done:
+ return (res);
+}
+
+void
+nwifent_free_attrs(struct zone_nwiftab *np)
+{
+ struct zone_res_attrtab *rap;
+
+ for (rap = np->zone_nwif_attrp; rap != NULL; ) {
+ struct zone_res_attrtab *tp = rap;
+
+ rap = rap->zone_res_attr_next;
+ free(tp);
+ }
+}
+
+/*
* If retstr is not NULL, the output of the subproc is returned in the str,
* otherwise it is output using zerror(). Any memory allocated for retstr
* should be freed by the caller.
*/
int
-do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
+do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr, boolean_t debug)
{
char buf[1024]; /* arbitrary large amount */
char *inbuf;
@@ -763,6 +917,11 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
inbuf = buf;
}
+ if (setup_subproc_env(debug) != Z_OK) {
+ zerror(zlogp, B_FALSE, "failed to setup environment");
+ return (-1);
+ }
+
file = popen(cmdbuf, "r");
if (file == NULL) {
zerror(zlogp, B_TRUE, "could not launch: %s", cmdbuf);
@@ -771,8 +930,13 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
while (fgets(inbuf, 1024, file) != NULL) {
if (retstr == NULL) {
- if (zlogp != &logsys)
+ if (zlogp != &logsys) {
+ int last = strlen(inbuf) - 1;
+
+ if (inbuf[last] == '\n')
+ inbuf[last] = '\0';
zerror(zlogp, B_FALSE, "%s", inbuf);
+ }
} else {
char *p;
@@ -802,8 +966,51 @@ do_subproc(zlog_t *zlogp, char *cmdbuf, char **retstr)
return (WEXITSTATUS(status));
}
+/*
+ * Get the path for this zone's init(1M) (or equivalent) process. First look
+ * for a zone-specific init-name attr, then get it from the brand.
+ */
+static int
+get_initname(brand_handle_t bh, char *initname, int len)
+{
+ struct zone_attrtab a;
+
+ bzero(&a, sizeof (a));
+ (void) strlcpy(a.zone_attr_name, "init-name",
+ sizeof (a.zone_attr_name));
+
+ if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
+ (void) strlcpy(initname, a.zone_attr_value, len);
+ return (0);
+ }
+
+ return (brand_get_initname(bh, initname, len));
+}
+
+/*
+ * Get the restart-init flag for this zone's init(1M) (or equivalent) process.
+ * First look for a zone-specific restart-init attr, then get it from the brand.
+ */
+static boolean_t
+restartinit(brand_handle_t bh)
+{
+ struct zone_attrtab a;
+
+ bzero(&a, sizeof (a));
+ (void) strlcpy(a.zone_attr_name, "restart-init",
+ sizeof (a.zone_attr_name));
+
+ if (zonecfg_lookup_attr(snap_hndl, &a) == Z_OK) {
+ if (strcmp(a.zone_attr_value, "false") == 0)
+ return (B_FALSE);
+ return (B_TRUE);
+ }
+
+ return (brand_restartinit(bh));
+}
+
static int
-zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
+zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
{
zoneid_t zoneid;
struct stat st;
@@ -813,12 +1020,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
fs_callback_t cb;
brand_handle_t bh;
zone_iptype_t iptype;
- boolean_t links_loaded = B_FALSE;
dladm_status_t status;
char errmsg[DLADM_STRSIZE];
int err;
+ boolean_t restart_init;
- if (brand_prestatechg(zlogp, zstate, Z_BOOT) != 0)
+ if (brand_prestatechg(zlogp, zstate, Z_BOOT, debug) != 0)
return (-1);
if ((zoneid = getzoneidbyname(zone_name)) == -1) {
@@ -866,20 +1073,20 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
}
/* Get the path for this zone's init(1M) (or equivalent) process. */
- if (brand_get_initname(bh, init_file, MAXPATHLEN) != 0) {
+ if (get_initname(bh, init_file, MAXPATHLEN) != 0) {
zerror(zlogp, B_FALSE,
"unable to determine zone's init(1M) location");
brand_close(bh);
goto bad;
}
+ /* See if we should restart init if it dies. */
+ restart_init = restartinit(bh);
+
brand_close(bh);
- err = filter_bootargs(zlogp, bootargs, nbootargs, init_file,
- bad_boot_arg);
- if (err == Z_INVAL)
- eventstream_write(Z_EVT_ZONE_BADARGS);
- else if (err != Z_OK)
+ err = filter_bootargs(zlogp, bootargs, nbootargs, init_file);
+ if (err != Z_OK)
goto bad;
assert(init_file[0] != '\0');
@@ -915,7 +1122,6 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
" %s", dladm_status2str(status, errmsg));
goto bad;
}
- links_loaded = B_TRUE;
}
/*
@@ -924,7 +1130,7 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
* is booted.
*/
if ((strlen(cmdbuf) > EXEC_LEN) &&
- (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
+ (do_subproc(zlogp, cmdbuf, NULL, debug) != Z_OK)) {
zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
goto bad;
}
@@ -939,6 +1145,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
goto bad;
}
+ if (!restart_init && zone_setattr(zoneid, ZONE_ATTR_INITNORESTART,
+ NULL, 0) == -1) {
+ zerror(zlogp, B_TRUE, "could not set zone init-no-restart");
+ goto bad;
+ }
+
/*
* Inform zonestatd of a new zone so that it can install a door for
* the zone to contact it.
@@ -950,9 +1162,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate)
goto bad;
}
- if (brand_poststatechg(zlogp, zstate, Z_BOOT) != 0)
+ if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0)
goto bad;
+ /* Startup a thread to perform memory capping for the zone. */
+ create_mcap_thread(zlogp, zone_id);
+
return (0);
bad:
@@ -960,32 +1175,38 @@ bad:
* If something goes wrong, we up the zones's state to the target
* state, RUNNING, and then invoke the hook as if we're halting.
*/
- (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT);
- if (links_loaded)
- (void) dladm_zone_halt(dld_handle, zoneid);
+ (void) brand_poststatechg(zlogp, ZONE_STATE_RUNNING, Z_HALT, debug);
+
return (-1);
}
static int
-zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate)
+zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate,
+ boolean_t debug)
{
int err;
- if (brand_prestatechg(zlogp, zstate, Z_HALT) != 0)
+ if (brand_prestatechg(zlogp, zstate, Z_HALT, debug) != 0)
return (-1);
- if (vplat_teardown(zlogp, unmount_cmd, rebooting) != 0) {
+ /* Shutting down, stop the memcap thread */
+ destroy_mcap_thread();
+
+ if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) {
if (!bringup_failure_recovery)
zerror(zlogp, B_FALSE, "unable to destroy zone");
return (-1);
}
+ if (brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0)
+ return (-1);
+
if ((err = zonecfg_destroy_snapshot(zone_name)) != Z_OK)
zerror(zlogp, B_FALSE, "destroying snapshot: %s",
zonecfg_strerror(err));
- if (brand_poststatechg(zlogp, zstate, Z_HALT) != 0)
- return (-1);
+ zonecfg_fini_handle(snap_hndl);
+ snap_hndl = NULL;
return (0);
}
@@ -1167,6 +1388,39 @@ audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
}
/*
+ * Log the exit time and status of the zone's init process into
+ * {zonepath}/lastexited. If the zone shutdown normally, the exit status will
+ * be -1, otherwise it will be the exit status as described in wait.3c.
+ * If the zone is configured to restart init, then nothing will be logged if
+ * init exits unexpectedly (the kernel will never upcall in this case).
+ */
+static void
+log_init_exit(int status)
+{
+ char zpath[MAXPATHLEN];
+ char p[MAXPATHLEN];
+ char buf[128];
+ struct timeval t;
+ int fd;
+
+ if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK)
+ return;
+ if (snprintf(p, sizeof (p), "%s/lastexited", zpath) > sizeof (p))
+ return;
+ if (gettimeofday(&t, NULL) != 0)
+ return;
+ if (snprintf(buf, sizeof (buf), "%ld.%ld %d\n", t.tv_sec, t.tv_usec,
+ status) > sizeof (buf))
+ return;
+ if ((fd = open(p, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0)
+ return;
+
+ (void) write(fd, buf, strlen(buf));
+
+ (void) close(fd);
+}
+
+/*
* The main routine for the door server that deals with zone state transitions.
*/
/* ARGSUSED */
@@ -1179,9 +1433,11 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
zone_state_t zstate;
zone_cmd_t cmd;
+ boolean_t debug;
+ int init_status;
zone_cmd_arg_t *zargp;
- boolean_t kernelcall;
+ boolean_t kernelcall = B_TRUE;
int rval = -1;
uint64_t uniqid;
@@ -1231,6 +1487,8 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
goto out;
}
cmd = zargp->cmd;
+ debug = zargp->debug;
+ init_status = zargp->status;
if (door_ucred(&uc) != 0) {
zerror(&logsys, B_TRUE, "door_ucred");
@@ -1337,23 +1595,25 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case ZONE_STATE_INSTALLED:
switch (cmd) {
case Z_READY:
- rval = zone_ready(zlogp, Z_MNT_BOOT, zstate);
+ rval = zone_ready(zlogp, Z_MNT_BOOT, zstate, debug);
if (rval == 0)
eventstream_write(Z_EVT_ZONE_READIED);
+ zcons_statechanged();
break;
case Z_BOOT:
case Z_FORCEBOOT:
eventstream_write(Z_EVT_ZONE_BOOTING);
- if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
- == 0) {
+ if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+ debug)) == 0) {
rval = zone_bootup(zlogp, zargp->bootbuf,
- zstate);
+ zstate, debug);
}
audit_put_record(zlogp, uc, rval, "boot");
+ zcons_statechanged();
if (rval != 0) {
bringup_failure_recovery = B_TRUE;
(void) zone_halt(zlogp, B_FALSE, B_FALSE,
- zstate);
+ zstate, debug);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
break;
@@ -1405,7 +1665,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
rval = zone_ready(zlogp,
strcmp(zargp->bootbuf, "-U") == 0 ?
- Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate);
+ Z_MNT_UPDATE : Z_MNT_SCRATCH, zstate, debug);
if (rval != 0)
break;
@@ -1467,15 +1727,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
rval = 0;
break;
case Z_BOOT:
+ case Z_FORCEBOOT:
(void) strlcpy(boot_args, zargp->bootbuf,
sizeof (boot_args));
eventstream_write(Z_EVT_ZONE_BOOTING);
- rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
+ rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
+ debug);
audit_put_record(zlogp, uc, rval, "boot");
+ zcons_statechanged();
if (rval != 0) {
bringup_failure_recovery = B_TRUE;
(void) zone_halt(zlogp, B_FALSE, B_TRUE,
- zstate);
+ zstate, debug);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
boot_args[0] = '\0';
@@ -1483,15 +1746,17 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case Z_HALT:
if (kernelcall) /* Invalid; can't happen */
abort();
- if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
- != 0)
+ if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
+ debug)) != 0)
break;
+ zcons_statechanged();
eventstream_write(Z_EVT_ZONE_HALTED);
break;
case Z_SHUTDOWN:
case Z_REBOOT:
case Z_NOTE_UNINSTALLING:
case Z_MOUNT:
+ case Z_FORCEMOUNT:
case Z_UNMOUNT:
if (kernelcall) /* Invalid; can't happen */
abort();
@@ -1508,7 +1773,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case Z_UNMOUNT:
if (kernelcall) /* Invalid; can't happen */
abort();
- rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate);
+ rval = zone_halt(zlogp, B_TRUE, B_FALSE, zstate, debug);
if (rval == 0) {
eventstream_write(Z_EVT_ZONE_HALTED);
(void) sema_post(&scratch_sem);
@@ -1530,15 +1795,18 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
case ZONE_STATE_DOWN:
switch (cmd) {
case Z_READY:
- if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
- != 0)
+ if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
+ debug)) != 0)
break;
- if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate)) == 0)
+ zcons_statechanged();
+ if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+ debug)) == 0)
eventstream_write(Z_EVT_ZONE_READIED);
else
eventstream_write(Z_EVT_ZONE_HALTED);
break;
case Z_BOOT:
+ case Z_FORCEBOOT:
/*
* We could have two clients racing to boot this
* zone; the second client loses, but his request
@@ -1549,32 +1817,40 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
rval = 0;
break;
case Z_HALT:
- if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate))
- != 0)
+ if (kernelcall) {
+ log_init_exit(init_status);
+ } else {
+ log_init_exit(-1);
+ }
+ if ((rval = zone_halt(zlogp, B_FALSE, B_FALSE, zstate,
+ debug)) != 0)
break;
eventstream_write(Z_EVT_ZONE_HALTED);
+ zcons_statechanged();
break;
case Z_REBOOT:
(void) strlcpy(boot_args, zargp->bootbuf,
sizeof (boot_args));
eventstream_write(Z_EVT_ZONE_REBOOTING);
- if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate))
- != 0) {
+ if ((rval = zone_halt(zlogp, B_FALSE, B_TRUE, zstate,
+ debug)) != 0) {
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
boot_args[0] = '\0';
break;
}
- if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate))
- != 0) {
+ zcons_statechanged();
+ if ((rval = zone_ready(zlogp, Z_MNT_BOOT, zstate,
+ debug)) != 0) {
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
boot_args[0] = '\0';
break;
}
- rval = zone_bootup(zlogp, zargp->bootbuf, zstate);
+ rval = zone_bootup(zlogp, zargp->bootbuf, zstate,
+ debug);
audit_put_record(zlogp, uc, rval, "reboot");
if (rval != 0) {
(void) zone_halt(zlogp, B_FALSE, B_TRUE,
- zstate);
+ zstate, debug);
eventstream_write(Z_EVT_ZONE_BOOTFAILED);
}
boot_args[0] = '\0';
@@ -1586,6 +1862,7 @@ server(void *cookie, char *args, size_t alen, door_desc_t *dp,
break;
case Z_NOTE_UNINSTALLING:
case Z_MOUNT:
+ case Z_FORCEMOUNT:
case Z_UNMOUNT:
zerror(zlogp, B_FALSE, "%s operation is invalid "
"for zones in state '%s'", z_cmd_name(cmd),
@@ -1749,11 +2026,35 @@ top:
* state.
*/
if (zstate > ZONE_STATE_INSTALLED) {
+ static zoneid_t zid;
+
zerror(zlogp, B_FALSE,
"zone '%s': WARNING: zone is in state '%s', but "
"zoneadmd does not appear to be available; "
"restarted zoneadmd to recover.",
zone_name, zone_state_str(zstate));
+
+ /*
+ * Startup a thread to perform memory capping for the
+ * zone. zlogp won't be valid for much longer so use
+ * logsys.
+ */
+ if ((zid = getzoneidbyname(zone_name)) != -1)
+ create_mcap_thread(&logsys, zid);
+
+ /* recover the global configuration snapshot */
+ if (snap_hndl == NULL) {
+ if ((snap_hndl = zonecfg_init_handle())
+ == NULL ||
+ zonecfg_create_snapshot(zone_name)
+ != Z_OK ||
+ zonecfg_get_snapshot_handle(zone_name,
+ snap_hndl) != Z_OK) {
+ zerror(zlogp, B_FALSE, "recovering "
+ "zone configuration handle");
+ goto out;
+ }
+ }
}
(void) fdetach(zone_door_path);
@@ -1767,6 +2068,52 @@ out:
}
/*
+ * Run the query hook with the 'env' parameter. It should return a
+ * string of tab-delimited key-value pairs, each of which should be set
+ * in the environment.
+ *
+ * Because the env_vars string values become part of the environment, the
+ * string is static and we don't free it.
+ *
+ * This function is always called before zoneadmd forks and makes itself
+ * exclusive, so it is possible there could more than one instance of zoneadmd
+ * running in parallel at this point. Thus, we have no zonecfg snapshot and
+ * shouldn't take one yet (i.e. snap_hndl is NULL). Thats ok, since we don't
+ * need any zonecfg info to query for a brand-specific env value.
+ */
+static int
+set_brand_env(zlog_t *zlogp)
+{
+ int ret = 0;
+ static char *env_vars = NULL;
+ char buf[2 * MAXPATHLEN];
+
+ if (query_hook[0] == '\0' || env_vars != NULL)
+ return (0);
+
+ if (snprintf(buf, sizeof (buf), "%s env", query_hook) > sizeof (buf))
+ return (-1);
+
+ if (do_subproc(zlogp, buf, &env_vars, B_FALSE) != 0)
+ return (-1);
+
+ if (env_vars != NULL) {
+ char *sp;
+
+ sp = strtok(env_vars, "\t");
+ while (sp != NULL) {
+ if (putenv(sp) != 0) {
+ ret = -1;
+ break;
+ }
+ sp = strtok(NULL, "\t");
+ }
+ }
+
+ return (ret);
+}
+
+/*
* Setup the brand's pre and post state change callbacks, as well as the
* query callback, if any of these exist.
*/
@@ -2002,6 +2349,11 @@ main(int argc, char *argv[])
}
priv_freeset(privset);
+ if (set_brand_env(zlogp) != 0) {
+ zerror(zlogp, B_FALSE, "Unable to setup brand's environment");
+ return (1);
+ }
+
if (mkzonedir(zlogp) != 0)
return (1);
@@ -2042,6 +2394,13 @@ main(int argc, char *argv[])
(void) sigaddset(&block_cld, SIGCHLD);
(void) sigprocmask(SIG_BLOCK, &block_cld, NULL);
+ /*
+ * The parent only needs stderr after the fork, so close other fd's
+ * that we inherited from zoneadm so that the parent doesn't have those
+ * open while waiting. The child will close the rest after the fork.
+ */
+ closefrom(3);
+
if ((ctfd = init_template()) == -1) {
zerror(zlogp, B_TRUE, "failed to create contract");
return (1);
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h
index d784a303b3..ceab787dab 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.h
+++ b/usr/src/cmd/zoneadmd/zoneadmd.h
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2014, Joyent, Inc. All rights reserved.
*/
#ifndef _ZONEADMD_H
@@ -90,17 +91,18 @@ extern mutex_t msglock;
extern boolean_t in_death_throes;
extern boolean_t bringup_failure_recovery;
extern char *zone_name;
+extern zone_dochandle_t snap_hndl;
extern char pool_name[MAXNAMELEN];
extern char brand_name[MAXNAMELEN];
extern char default_brand[MAXNAMELEN];
extern char boot_args[BOOTARGS_MAX];
-extern char bad_boot_arg[BOOTARGS_MAX];
extern boolean_t zone_isnative;
extern boolean_t zone_iscluster;
extern dladm_handle_t dld_handle;
extern void zerror(zlog_t *, boolean_t, const char *, ...);
extern char *localize_msg(char *locale, const char *msg);
+extern void nwifent_free_attrs(struct zone_nwiftab *);
/*
* Eventstream interfaces.
@@ -112,8 +114,7 @@ typedef enum {
Z_EVT_ZONE_HALTED,
Z_EVT_ZONE_READIED,
Z_EVT_ZONE_UNINSTALLING,
- Z_EVT_ZONE_BOOTFAILED,
- Z_EVT_ZONE_BADARGS
+ Z_EVT_ZONE_BOOTFAILED
} zone_evt_t;
extern int eventstream_init();
@@ -135,9 +136,9 @@ typedef enum {
/*
* Virtual platform interfaces.
*/
-extern zoneid_t vplat_create(zlog_t *, zone_mnt_t);
+extern zoneid_t vplat_create(zlog_t *, zone_mnt_t, zoneid_t);
extern int vplat_bringup(zlog_t *, zone_mnt_t, zoneid_t);
-extern int vplat_teardown(zlog_t *, boolean_t, boolean_t);
+extern int vplat_teardown(zlog_t *, boolean_t, boolean_t, boolean_t);
extern int vplat_get_iptype(zlog_t *, zone_iptype_t *);
/*
@@ -154,6 +155,13 @@ extern void resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen);
*/
extern int init_console(zlog_t *);
extern void serve_console(zlog_t *);
+extern void zcons_statechanged();
+
+/*
+ * Memory capping thread creation.
+ */
+extern void create_mcap_thread(zlog_t *, zoneid_t);
+extern void destroy_mcap_thread();
/*
* Contract handling.
@@ -163,7 +171,7 @@ extern int init_template(void);
/*
* Routine to manage child processes.
*/
-extern int do_subproc(zlog_t *, char *, char **);
+extern int do_subproc(zlog_t *, char *, char **, boolean_t);
#ifdef __cplusplus
}