summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2014-12-23 15:59:39 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2014-12-23 15:59:39 +0000
commit7559f59bb4c3691722b46b2a0e0ede29e8f4a777 (patch)
tree6cc3316c468e09ff9cc758026a5965b847835caf
parent6cc95dc1252c4c529be1b7da28efe7dc6918924a (diff)
downloadillumos-joyent-7559f59bb4c3691722b46b2a0e0ede29e8f4a777.tar.gz
OS-3524 in order to support interaction with docker containers, need to be able to connect to stdio for init from GZ
OS-3525 in order to support 'docker logs' need to be able to get stdio from zone to log file
-rw-r--r--manifest3
-rw-r--r--usr/src/cmd/devfsadm/misc_link.c37
-rw-r--r--usr/src/cmd/zlogin/zlogin.c110
-rw-r--r--usr/src/cmd/zoneadm/zoneadm.c4
-rw-r--r--usr/src/cmd/zoneadmd/Makefile.com4
-rw-r--r--usr/src/cmd/zoneadmd/mcap.c15
-rw-r--r--usr/src/cmd/zoneadmd/vplat.c35
-rw-r--r--usr/src/cmd/zoneadmd/zfd.c1248
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.c65
-rw-r--r--usr/src/cmd/zoneadmd/zoneadmd.h7
-rw-r--r--usr/src/lib/brand/lx/zone/platform.xml3
-rw-r--r--usr/src/man/man1/zlogin.125
-rw-r--r--usr/src/man/man7d/Makefile4
-rw-r--r--usr/src/man/man7d/zfd.7d39
-rw-r--r--usr/src/uts/common/Makefile.files2
-rw-r--r--usr/src/uts/common/io/pseudo.conf9
-rw-r--r--usr/src/uts/common/io/zfd.c815
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/zfd.h53
-rw-r--r--usr/src/uts/intel/Makefile.intel1
-rw-r--r--usr/src/uts/intel/zfd/Makefile48
-rw-r--r--usr/src/uts/sparc/Makefile.sparc2
-rw-r--r--usr/src/uts/sparc/zfd/Makefile50
23 files changed, 2466 insertions, 114 deletions
diff --git a/manifest b/manifest
index 457e9864c0..16fb00d708 100644
--- a/manifest
+++ b/manifest
@@ -4434,6 +4434,7 @@ f usr/include/sys/xti_inet.h 0644 root bin
f usr/include/sys/xti_osi.h 0644 root bin
f usr/include/sys/xti_xtiopt.h 0644 root bin
f usr/include/sys/zcons.h 0644 root bin
+f usr/include/sys/zfd.h 0644 root bin
f usr/include/sys/zmod.h 0644 root bin
f usr/include/sys/zone.h 0644 root bin
f usr/include/sysexits.h 0644 root bin
@@ -4544,6 +4545,7 @@ f usr/kernel/drv/amd64/smbsrv 0755 root sys
f usr/kernel/drv/amd64/sppp 0755 root sys
f usr/kernel/drv/amd64/sppptun 0755 root sys
f usr/kernel/drv/amd64/zcons 0755 root sys
+f usr/kernel/drv/amd64/zfd 0755 root sys
f usr/kernel/drv/bpf.conf 0644 root sys
f usr/kernel/drv/dump.conf 0644 root sys
f usr/kernel/drv/eventfd.conf 0644 root sys
@@ -18394,6 +18396,7 @@ f usr/share/man/man7d/xge.7d 0444 root bin
f usr/share/man/man7d/yge.7d 0444 root bin
f usr/share/man/man7d/zcons.7d 0444 root bin
f usr/share/man/man7d/zero.7d 0444 root bin
+f usr/share/man/man7d/zfd.7d 0444 root bin
d usr/share/man/man7fs 0755 root bin
f usr/share/man/man7fs/bootfs.7fs 0444 root bin
f usr/share/man/man7fs/ctfs.7fs 0444 root bin
diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c
index 9f4c20ac5e..f37a1227b6 100644
--- a/usr/src/cmd/devfsadm/misc_link.c
+++ b/usr/src/cmd/devfsadm/misc_link.c
@@ -32,6 +32,7 @@
#include <limits.h>
#include <sys/zone.h>
#include <sys/zcons.h>
+#include <sys/zfd.h>
#include <sys/cpuid_drv.h>
static int display(di_minor_t minor, di_node_t node);
@@ -53,6 +54,7 @@ static int av_create(di_minor_t minor, di_node_t node);
static int tsalarm_create(di_minor_t minor, di_node_t node);
static int ntwdt_create(di_minor_t minor, di_node_t node);
static int zcons_create(di_minor_t minor, di_node_t node);
+static int zfd_create(di_minor_t minor, di_node_t node);
static int cpuid(di_minor_t minor, di_node_t node);
static int glvc(di_minor_t minor, di_node_t node);
static int ses_callback(di_minor_t minor, di_node_t node);
@@ -177,6 +179,9 @@ static devfsadm_create_t misc_cbt[] = {
{ "pseudo", "ddi_pseudo", "zcons",
TYPE_EXACT | DRV_EXACT, ILEVEL_0, zcons_create,
},
+ { "pseudo", "ddi_pseudo", "zfd",
+ TYPE_EXACT | DRV_EXACT, ILEVEL_0, zfd_create,
+ },
{ "pseudo", "ddi_pseudo", CPUID_DRIVER_NAME,
TYPE_EXACT | DRV_EXACT, ILEVEL_0, cpuid,
},
@@ -225,6 +230,9 @@ static devfsadm_remove_t misc_remove_cbt[] = {
ZCONS_SLAVE_NAME ")$",
RM_PRE | RM_HOT | RM_ALWAYS, ILEVEL_0, devfsadm_rm_all
},
+ { "pseudo", "^zfd/" ZONENAME_REGEXP "/(master|slave)/[0-9]+$",
+ RM_PRE | RM_HOT | RM_ALWAYS, ILEVEL_0, devfsadm_rm_all
+ },
{ "pseudo", "^" CPUID_SELF_NAME "$", RM_ALWAYS | RM_PRE | RM_HOT,
ILEVEL_0, devfsadm_rm_all
},
@@ -672,6 +680,35 @@ zcons_create(di_minor_t minor, di_node_t node)
return (DEVFSADM_CONTINUE);
}
+/*
+ * Create the /dev link for a zfd minor node:
+ *	zfd/<zonename>/slave/<id>	for minors whose name starts with "slave"
+ *	zfd/<zonename>/master/<id>	for every other minor
+ * The zone name and id are taken from the node's "zfd_zname" and "zfd_id"
+ * properties.  A node without a usable value for either one is skipped.
+ * Always returns DEVFSADM_CONTINUE.
+ */
+static int
+zfd_create(di_minor_t minor, di_node_t node)
+{
+	char *minor_str;
+	char *zonename;
+	int *id;
+	char path[MAXPATHLEN];
+
+	minor_str = di_minor_name(minor);
+
+	/*
+	 * The lookup routines return the number of values found, which may
+	 * be zero; the output pointer cannot be relied on in that case.
+	 * Require at least one value instead of only checking for -1, since
+	 * we dereference both results below.
+	 */
+	if (di_prop_lookup_strings(DDI_DEV_T_ANY, node, "zfd_zname",
+	    &zonename) < 1)
+		return (DEVFSADM_CONTINUE);
+
+	if (di_prop_lookup_ints(DDI_DEV_T_ANY, node, "zfd_id", &id) < 1)
+		return (DEVFSADM_CONTINUE);
+
+	if (strncmp(minor_str, "slave", 5) == 0) {
+		(void) snprintf(path, sizeof (path), "zfd/%s/slave/%d",
+		    zonename, id[0]);
+	} else {
+		(void) snprintf(path, sizeof (path), "zfd/%s/master/%d",
+		    zonename, id[0]);
+	}
+	(void) devfsadm_mklink(path, node, minor, 0);
+
+	return (DEVFSADM_CONTINUE);
+}
+
/*
* /dev/cpu/self/cpuid -> /devices/pseudo/cpuid@0:self
*/
diff --git a/usr/src/cmd/zlogin/zlogin.c b/usr/src/cmd/zlogin/zlogin.c
index a5bd206b11..6a31b9c354 100644
--- a/usr/src/cmd/zlogin/zlogin.c
+++ b/usr/src/cmd/zlogin/zlogin.c
@@ -156,7 +156,7 @@ static boolean_t forced_login = B_FALSE;
static void
usage(void)
{
- (void) fprintf(stderr, gettext("usage: %s [ -inQCES ] [ -e cmdchar ] "
+ (void) fprintf(stderr, gettext("usage: %s [-inQCIES] [-e cmdchar] "
"[-l user] zonename [command [args ...] ]\n"), pname);
exit(2);
}
@@ -256,7 +256,7 @@ postfork_dropprivs()
* with it to determine whether it will allow us to connect.
*/
static int
-get_console_master(const char *zname)
+get_interactive_master(const char *zname, int notcons)
{
int sockfd = -1;
struct sockaddr_un servaddr;
@@ -264,20 +264,32 @@ get_console_master(const char *zname)
char handshake[MAXPATHLEN], c;
int msglen;
int i = 0, err = 0;
+ char *sock_str;
if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
zperror(gettext("could not create socket"));
return (-1);
}
+ if (notcons) {
+ sock_str = "%s/%s.server_sock";
+ } else {
+ sock_str = "%s/%s.console_sock";
+ }
+
bzero(&servaddr, sizeof (servaddr));
servaddr.sun_family = AF_UNIX;
(void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path),
- "%s/%s.console_sock", ZONES_TMPDIR, zname);
+ sock_str, ZONES_TMPDIR, zname);
if (connect(sockfd, (struct sockaddr *)&servaddr,
sizeof (servaddr)) == -1) {
- zperror(gettext("Could not connect to zone console"));
+ if (errno == ENOENT && notcons)
+ (void) fprintf(stderr, "%s: %s\n", pname,
+ gettext("Could not connect to zone (is interactive "
+ "mode enabled?)"));
+ else
+ zperror(gettext("Could not connect to zone"));
goto bad;
}
masterfd = sockfd;
@@ -315,15 +327,14 @@ get_console_master(const char *zname)
* the server died off.
*/
if (err == -1) {
- zperror(gettext("Could not connect to zone console"));
+ zperror(gettext("Could not connect to zone"));
goto bad;
}
if (strncmp(handshake, "OK", sizeof (handshake)) == 0)
return (0);
- zerror(gettext("Console is already in use by process ID %s."),
- handshake);
+ zerror(gettext("Zone is already in use by process ID %s."), handshake);
bad:
(void) close(sockfd);
masterfd = -1;
@@ -1752,10 +1763,40 @@ get_username()
return (nptr->pw_name);
}
+/*
+ * Report whether the named zone is configured for standalone interactive
+ * mode, i.e. it has a "zlog-mode" attribute whose value begins with "int".
+ * Any failure to read the zone configuration yields B_FALSE.
+ */
+static boolean_t
+is_standalone_int_mode(char *zonename)
+{
+	zone_dochandle_t hdl = zonecfg_init_handle();
+	struct zone_attrtab ent;
+	boolean_t interactive = B_FALSE;
+
+	if (hdl == NULL)
+		return (B_FALSE);
+
+	if (zonecfg_get_handle(zonename, hdl) == Z_OK &&
+	    zonecfg_setattrent(hdl) == Z_OK) {
+		while (zonecfg_getattrent(hdl, &ent) == Z_OK) {
+			if (strcmp("zlog-mode", ent.zone_attr_name) != 0)
+				continue;
+
+			/* Only the first zlog-mode attribute is consulted. */
+			if (strncmp("int", ent.zone_attr_value, 3) == 0)
+				interactive = B_TRUE;
+			break;
+		}
+		(void) zonecfg_endattrent(hdl);
+	}
+
+	zonecfg_fini_handle(hdl);
+	return (interactive);
+}
+
+
int
main(int argc, char **argv)
{
- int arg, console = 0;
+ int arg, console = 0, imode = 0;
zoneid_t zoneid;
zone_state_t st;
char *login = "root";
@@ -1784,7 +1825,7 @@ main(int argc, char **argv)
(void) getpname(argv[0]);
username = get_username();
- while ((arg = getopt(argc, argv, "inECR:Se:l:Q")) != EOF) {
+ while ((arg = getopt(argc, argv, "inECIR:Se:l:Q")) != EOF) {
switch (arg) {
case 'C':
console = 1;
@@ -1792,6 +1833,14 @@ main(int argc, char **argv)
case 'E':
nocmdchar = 1;
break;
+ case 'I':
+ /*
+ * interactive mode is just a slight variation on the
+ * console mode.
+ */
+ console = 1;
+ imode = 1;
+ break;
case 'R': /* undocumented */
if (*optarg != '/') {
zerror(gettext("root path must be absolute."));
@@ -1856,7 +1905,7 @@ main(int argc, char **argv)
}
- if (iflag !=0 && nflag != 0) {
+ if (iflag != 0 && nflag != 0) {
zerror(gettext("-i and -n flags are incompatible"));
usage();
}
@@ -1975,10 +2024,15 @@ main(int argc, char **argv)
}
/*
- * The console is a separate case from the rest of the code; handle
- * it first.
+ * The console (or standalone interactive mode) is a separate case from
+ * the rest of the code; handle it first.
*/
if (console) {
+ if (imode && !is_standalone_int_mode(zonename)) {
+ zerror(gettext("the zlog-mode is not interactive"));
+ return (1);
+ }
+
/*
* Ensure that zoneadmd for this zone is running.
*/
@@ -1988,15 +2042,19 @@ main(int argc, char **argv)
/*
* Make contact with zoneadmd.
*/
- if (get_console_master(zonename) == -1)
+ if (get_interactive_master(zonename, imode) == -1)
return (1);
- if (!quiet)
- (void) printf(
- gettext("[Connected to zone '%s' console]\n"),
- zonename);
+ if (!quiet) {
+ if (imode)
+ (void) printf(gettext("[Connected to zone '%s' "
+ "interactively]\n"), zonename);
+ else
+ (void) printf(gettext("[Connected to zone '%s' "
+ "console]\n"), zonename);
+ }
- if (set_tty_rawmode(STDIN_FILENO) == -1) {
+ if (!imode && set_tty_rawmode(STDIN_FILENO) == -1) {
reset_tty();
zperror(gettext("failed to set stdin pty to raw mode"));
return (1);
@@ -2009,11 +2067,17 @@ main(int argc, char **argv)
* Run the I/O loop until we get disconnected.
*/
doio(masterfd, -1, masterfd, -1, -1, B_FALSE);
- reset_tty();
- if (!quiet)
- (void) printf(
- gettext("\n[Connection to zone '%s' console "
- "closed]\n"), zonename);
+ if (!imode)
+ reset_tty();
+ if (!quiet) {
+ if (imode)
+ (void) printf(gettext("\n[Interactive "
+ "connection to zone '%s' closed]\n"),
+ zonename);
+ else
+ (void) printf(gettext("\n[Connection to zone "
+ "'%s' console closed]\n"), zonename);
+ }
return (0);
}
diff --git a/usr/src/cmd/zoneadm/zoneadm.c b/usr/src/cmd/zoneadm/zoneadm.c
index 6fdd00e39c..396fc91699 100644
--- a/usr/src/cmd/zoneadm/zoneadm.c
+++ b/usr/src/cmd/zoneadm/zoneadm.c
@@ -3946,8 +3946,8 @@ cleanup_zonepath(char *zonepath, boolean_t all)
* exist if the zone was force-attached after a
* migration.
*/
- char *std_entries[] = {"dev", "lastexited", "lu", "root",
- "SUNWattached.xml", NULL};
+ char *std_entries[] = {"dev", "lastexited", "logs", "lu",
+ "root", "SUNWattached.xml", NULL};
/* (MAXPATHLEN * 5) is for the 5 std_entries dirs */
char cmdbuf[sizeof (RMCOMMAND) + (MAXPATHLEN * 5) + 64];
diff --git a/usr/src/cmd/zoneadmd/Makefile.com b/usr/src/cmd/zoneadmd/Makefile.com
index 162d1f0219..c8becc3e8c 100644
--- a/usr/src/cmd/zoneadmd/Makefile.com
+++ b/usr/src/cmd/zoneadmd/Makefile.com
@@ -20,7 +20,7 @@
#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2011, Joyent, Inc. All rights reserved.
+# Copyright 2014, Joyent, Inc. All rights reserved.
#
PROG= zoneadmd
@@ -30,7 +30,7 @@ include ../../Makefile.ctf
ROOTCMDDIR= $(ROOTLIB)/zones
-OBJS= zoneadmd.o zcons.o vplat.o mcap.o
+OBJS= zoneadmd.o zcons.o zfd.o vplat.o mcap.o
CFLAGS += $(CCVERBOSE)
LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair \
diff --git a/usr/src/cmd/zoneadmd/mcap.c b/usr/src/cmd/zoneadmd/mcap.c
index 44917b0024..16cd2dd07a 100644
--- a/usr/src/cmd/zoneadmd/mcap.c
+++ b/usr/src/cmd/zoneadmd/mcap.c
@@ -139,8 +139,6 @@ uint64_t prev_fast_rss = 0;
uint64_t fast_rss = 0;
uint64_t accurate_rss = 0;
-static char zonename[ZONENAME_MAX];
-static char zonepath[MAXPATHLEN];
static char zoneproc[MAXPATHLEN];
static char debug_log[MAXPATHLEN];
static zoneid_t zid;
@@ -907,7 +905,7 @@ get_mcap_tunables()
if ((handle = zonecfg_init_handle()) == NULL)
return;
- if (zonecfg_get_handle(zonename, handle) != Z_OK)
+ if (zonecfg_get_handle(zone_name, handle) != Z_OK)
goto done;
/* Reset to defaults in case rebooting and settings have changed */
@@ -1146,22 +1144,13 @@ void
create_mcap_thread(zlog_t *zlogp, zoneid_t id)
{
int res;
- char brandname[MAXNAMELEN];
shutting_down = 0;
zid = id;
logp = zlogp;
- (void) getzonenamebyid(zid, zonename, sizeof (zonename));
-
- if (zone_get_zonepath(zonename, zonepath, sizeof (zonepath)) != 0)
- zerror(zlogp, B_FALSE, "zone %s missing zonepath", zonename);
-
- brandname[0] = '\0';
- if (zone_get_brand(zonename, brandname, sizeof (brandname)) != 0)
- zerror(zlogp, B_FALSE, "zone %s missing brand", zonename);
/* all but the lx brand currently use /proc */
- if (strcmp(brandname, "lx") == 0) {
+ if (strcmp(brand_name, "lx") == 0) {
(void) snprintf(zoneproc, sizeof (zoneproc),
"%s/root/native/proc", zonepath);
} else {
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index e63f87d2a0..5a86b1cf50 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent Inc. All rights reserved.
+ * Copyright 2014, Joyent Inc. All rights reserved.
*/
/*
@@ -1691,7 +1691,6 @@ static int
mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
{
char rootpath[MAXPATHLEN];
- char zonepath[MAXPATHLEN];
char brand[MAXNAMELEN];
char luroot[MAXPATHLEN];
int i, num_fs = 0;
@@ -1709,11 +1708,6 @@ mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
goto bad;
}
- if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
- zerror(zlogp, B_TRUE, "unable to determine zone path");
- goto bad;
- }
-
if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
zerror(zlogp, B_TRUE, "unable to determine zone root");
goto bad;
@@ -3601,17 +3595,11 @@ validate_rootds_label(zlog_t *zlogp, char *rootpath, m_label_t *zone_sl)
zfs_handle_t *zhp;
libzfs_handle_t *hdl;
m_label_t ds_sl;
- char zonepath[MAXPATHLEN];
char ds_hexsl[MAXNAMELEN];
if (!is_system_labeled())
return (0);
- if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
- zerror(zlogp, B_TRUE, "unable to determine zone path");
- return (-1);
- }
-
if (!is_zonepath_zfs(zonepath))
return (0);
@@ -4843,7 +4831,7 @@ write_index_file(zoneid_t zoneid)
int
vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
{
- char zonepath[MAXPATHLEN];
+ char zpath[MAXPATHLEN];
if (mount_cmd == Z_MNT_BOOT && validate_datasets(zlogp) != 0) {
lofs_discard_mnttab();
@@ -4854,15 +4842,11 @@ vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
* Before we try to mount filesystems we need to create the
* attribute backing store for /dev
*/
- if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
- lofs_discard_mnttab();
- return (-1);
- }
- resolve_lofs(zlogp, zonepath, sizeof (zonepath));
+ (void) strlcpy(zpath, zonepath, sizeof (zpath));
+ resolve_lofs(zlogp, zpath, sizeof (zpath));
/* Make /dev directory owned by root, grouped sys */
- if (make_one_dir(zlogp, zonepath, "/dev", DEFAULT_DIR_MODE,
- 0, 3) != 0) {
+ if (make_one_dir(zlogp, zpath, "/dev", DEFAULT_DIR_MODE, 0, 3) != 0) {
lofs_discard_mnttab();
return (-1);
}
@@ -4981,7 +4965,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting,
zoneid_t zoneid;
int res;
char pool_err[128];
- char zpath[MAXPATHLEN];
char cmdbuf[MAXPATHLEN];
brand_handle_t bh = NULL;
dladm_status_t status;
@@ -5033,12 +5016,6 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting,
goto error;
}
- /* Get the zonepath of this zone */
- if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
- zerror(zlogp, B_FALSE, "unable to determine zone path");
- goto error;
- }
-
/* Get a handle to the brand info for this zone */
if ((bh = brand_open(brand_name)) == NULL) {
zerror(zlogp, B_FALSE, "unable to determine zone brand");
@@ -5049,7 +5026,7 @@ vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting,
* brand a chance to cleanup any custom configuration.
*/
(void) strcpy(cmdbuf, EXEC_PREFIX);
- if (brand_get_halt(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+ if (brand_get_halt(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
sizeof (cmdbuf) - EXEC_LEN) < 0) {
brand_close(bh);
zerror(zlogp, B_FALSE, "unable to determine branded zone's "
diff --git a/usr/src/cmd/zoneadmd/zfd.c b/usr/src/cmd/zoneadmd/zfd.c
new file mode 100644
index 0000000000..1e1ac48d15
--- /dev/null
+++ b/usr/src/cmd/zoneadmd/zfd.c
@@ -0,0 +1,1248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Zone file descriptor support is used as a mechanism for a process inside the
+ * zone to either log messages to the GZ zoneadmd or as a way to interact
+ * directly with the process (via zlogin -I). The zfd thread is modeled on
+ * the zcons thread so see the comment header in zcons.c for a general overview.
+ * Unlike with zcons, which has a single endpoint within the zone and a single
+ * endpoint used by zoneadmd, we setup multiple endpoints within the zone.
+ * In the interactive mode we setup fd 0, 1 and 2 for use as stdin, stdout and
+ * stderr. In the logging mode we only setup fd 1 and 2 for use as stdout and
+ * stderr.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/termios.h>
+#include <sys/zfd.h>
+#include <sys/mkdev.h>
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <stropts.h>
+#include <thread.h>
+#include <ucred.h>
+#include <unistd.h>
+#include <zone.h>
+#include <signal.h>
+#include <wchar.h>
+
+#include <libdevinfo.h>
+#include <libdevice.h>
+#include <libzonecfg.h>
+
+#include <syslog.h>
+#include <sys/modctl.h>
+
+#include "zoneadmd.h"
+
+static zlog_t *zlogp;
+static int shutting_down = 0;
+static thread_t logger_tid;
+static int logfd = -1;
+
+/*
+ * The eventstream is a simple one-directional flow of messages implemented
+ * with a pipe. It is used to wake up the poller when it needs to shutdown.
+ */
+static int eventstream[2] = {-1, -1};
+
+#define LOGNAME "stdio.log"
+#define ZLOG_MODE "zlog-mode"
+#define ZFDNEX_DEVTREEPATH "/pseudo/zfdnex@2"
+#define ZFDNEX_FILEPATH "/devices/pseudo/zfdnex@2"
+#define SERVER_SOCKPATH ZONES_TMPDIR "/%s.server_sock"
+#define ZTTY_RETRY 5
+
+typedef enum {
+ ZLOG_NONE = 0,
+ ZLOG_LOG,
+ ZLOG_INTERACTIVE,
+} zlog_mode_t;
+
+/*
+ * count_zfd_devs() and its helper count_cb() do a walk of the subtree of the
+ * device tree where zfd nodes are represented. The goal is to count zfd
+ * instances already setup for a zone with the given name.
+ *
+ * Note: this algorithm is a linear search of nodes in the zfdnex subtree
+ * of the device tree, and could be a scalability problem, but I don't see
+ * how to avoid it.
+ */
+
+/*
+ * cb_data is shared by count_cb and destroy_cb for simplicity.  A pointer to
+ * it is handed to di_walk_node() as the walk argument.
+ */
+struct cb_data {
+ zlog_t *zlogp; /* log handle; destroy_cb reports warnings through it */
+ int found; /* number of zfd nodes seen that belong to this zone */
+ int killed; /* number of nodes destroy_cb managed to remove */
+};
+
+/*
+ * di_walk_node() callback: bump cb->found for every node whose "zfd_zname"
+ * property matches this zoneadmd's zone name.  The walk always continues.
+ */
+static int
+count_cb(di_node_t node, void *arg)
+{
+	struct cb_data *data = arg;
+	char *zname;
+
+	if (di_prop_lookup_strings(DDI_DEV_T_ANY, node, "zfd_zname",
+	    &zname) == -1)
+		return (DI_WALK_CONTINUE);
+
+	assert(zname != NULL);
+	if (strcmp(zname, zone_name) == 0)
+		data->found++;
+
+	return (DI_WALK_CONTINUE);
+}
+
+/*
+ * Walk the zfdnex subtree and return the number of zfd nodes that belong to
+ * this zone, or -1 if the device tree snapshot could not be taken.
+ */
+static int
+count_zfd_devs(zlog_t *zlogp)
+{
+	struct cb_data walk = { 0 };
+	di_node_t nexus;
+
+	walk.zlogp = zlogp;
+
+	nexus = di_init(ZFDNEX_DEVTREEPATH, DINFOCPYALL);
+	if (nexus == DI_NODE_NIL) {
+		zerror(zlogp, B_TRUE, "di_init failed");
+		return (-1);
+	}
+
+	(void) di_walk_node(nexus, DI_WALK_CLDFIRST, &walk, count_cb);
+	di_fini(nexus);
+
+	return (walk.found);
+}
+
+/*
+ * destroy_zfd_devs() and its helper destroy_cb() tear down any zfd instances
+ * associated with this zone.  If things went very wrong, we might have an
+ * incorrect number of instances hanging around.  This routine hunts down and
+ * tries to remove all of them.  Of course, if the fd is open, the instance will
+ * not detach, which is a potential issue.
+ *
+ * destroy_cb() is the di_walk_node() callback.  For each node whose
+ * "zfd_zname" property names this zone it bumps cb->found, tries to remove
+ * the device, and bumps cb->killed on success.  Failures are logged as
+ * warnings and the walk always continues.
+ */
+static int
+destroy_cb(di_node_t node, void *arg)
+{
+	struct cb_data *cb = (struct cb_data *)arg;
+	char *prop_data;
+	char *tmp;
+	char devpath[MAXPATHLEN];
+	devctl_hdl_t hdl;
+
+	if (di_prop_lookup_strings(DDI_DEV_T_ANY, node, "zfd_zname",
+	    &prop_data) == -1)
+		return (DI_WALK_CONTINUE);
+
+	assert(prop_data != NULL);
+	if (strcmp(prop_data, zone_name) != 0) {
+		/* this is a zfd for a different zone */
+		return (DI_WALK_CONTINUE);
+	}
+
+	cb->found++;
+
+	/*
+	 * di_devfs_path() can fail and return NULL; don't hand that to
+	 * snprintf() or di_devfs_path_free().
+	 */
+	if ((tmp = di_devfs_path(node)) == NULL) {
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd found, but its "
+		    "devfs path could not be determined.");
+		return (DI_WALK_CONTINUE);
+	}
+	(void) snprintf(devpath, sizeof (devpath), "/devices/%s", tmp);
+	di_devfs_path_free(tmp);
+
+	if ((hdl = devctl_device_acquire(devpath, 0)) == NULL) {
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd %s found, "
+		    "but it could not be controlled.", devpath);
+		return (DI_WALK_CONTINUE);
+	}
+	if (devctl_device_remove(hdl) == 0) {
+		cb->killed++;
+	} else {
+		zerror(cb->zlogp, B_TRUE, "WARNING: zfd %s found, "
+		    "but it could not be removed.", devpath);
+	}
+	devctl_release(hdl);
+	return (DI_WALK_CONTINUE);
+}
+
+/*
+ * Walk the zfdnex subtree and attempt to remove every zfd node belonging to
+ * this zone.  Returns -1 only if the device tree snapshot could not be taken;
+ * otherwise returns 0, whether or not individual removals succeeded.
+ */
+static int
+destroy_zfd_devs(zlog_t *zlogp)
+{
+	struct cb_data walk = { 0 };
+	di_node_t nexus;
+
+	walk.zlogp = zlogp;
+
+	nexus = di_init(ZFDNEX_DEVTREEPATH, DINFOCPYALL);
+	if (nexus == DI_NODE_NIL) {
+		zerror(zlogp, B_TRUE, "di_init failed");
+		return (-1);
+	}
+
+	(void) di_walk_node(nexus, DI_WALK_CLDFIRST, &walk, destroy_cb);
+	di_fini(nexus);
+
+	return (0);
+}
+
+/*
+ * Put the zfd device with the given id into tty mode.
+ *
+ * We open the master side of the dev and issue the ZFD_MAKETTY ioctl, which
+ * will cause the various tty-related streams modules to be pushed when the
+ * slave opens the device.
+ *
+ * Under heavy zone startup load devfs may not have everything set up yet:
+ * the open can fail with ENOENT and the ioctl with ENXIO.  In either case we
+ * wait 1 second and retry, up to ZTTY_RETRY times (or until shutdown).  Even
+ * if we can't set up tty mode we still move on; only an open failure is
+ * logged.
+ */
+static void
+make_tty(zlog_t *zlogp, int id)
+{
+	char devpath[MAXPATHLEN];
+	int fd = -1;
+	int tries;
+
+	(void) snprintf(devpath, sizeof (devpath), "/dev/zfd/%s/master/%d",
+	    zone_name, id);
+
+	for (tries = 0; tries < ZTTY_RETRY && !shutting_down; tries++) {
+		if ((fd = open(devpath, O_RDWR | O_NOCTTY)) >= 0)
+			break;
+		/* Only a missing node is worth waiting for. */
+		if (errno != ENOENT)
+			break;
+		(void) sleep(1);
+	}
+
+	if (fd == -1) {
+		zerror(zlogp, B_TRUE, "ERROR: could not open zfd %d for "
+		    "zone %s to set tty mode", id, zone_name);
+		return;
+	}
+
+	for (tries = 0; tries < ZTTY_RETRY && !shutting_down; tries++) {
+		/* Stop on success or on any error other than ENXIO. */
+		if (ioctl(fd, ZFD_MAKETTY) == 0 || errno != ENXIO)
+			break;
+		(void) sleep(1);
+	}
+
+	(void) close(fd);
+}
+
+/*
+ * init_zfd_devs() drives the device-tree configuration of the zone fd devices.
+ * The general strategy is to use the libdevice (devctl) interfaces to
+ * instantiate 2 or 3 new zone fd nodes.  We do a lot of sanity checking, and
+ * are careful to reuse a dev if one exists.
+ *
+ * Once the devices are in the device tree, we kick devfsadm via
+ * di_devlink_init() to ensure that the appropriate symlinks (to the master and
+ * slave fd devices) are placed in /dev in the global zone.
+ *
+ * init_zfd_dev() is the per-device helper: it creates the single "zfd" node
+ * with the given id under bus_hdl.  Returns 0 on success, or -1 after logging
+ * the step that failed.
+ */
+static int
+init_zfd_dev(zlog_t *zlogp, devctl_hdl_t bus_hdl, int id)
+{
+	devctl_ddef_t ddef;
+	devctl_hdl_t child = NULL;
+	int ret = -1;
+
+	ddef = devctl_ddef_alloc("zfd", 0);
+	if (ddef == NULL) {
+		zerror(zlogp, B_TRUE, "failed to allocate ddef handle");
+		return (-1);
+	}
+
+	/*
+	 * Set four properties on this node:
+	 *   zfd_zname			the name of the zone;
+	 *   zfd_id			the id our caller assigned to this dev;
+	 *   auto-assign-instance	a flag which lets pseudo know that it
+	 *				is OK to automatically allocate an
+	 *				instance # for this device;
+	 *   ddi-no-autodetach		tells the device framework not to
+	 *				auto-detach this node -- we need the
+	 *				node to still be there when we ask
+	 *				devfsadmd to make links, and when we
+	 *				need to open it.
+	 * The first step that fails is logged and ends the chain.
+	 */
+	if (devctl_ddef_string(ddef, "zfd_zname", zone_name) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd_zname property");
+	} else if (devctl_ddef_int(ddef, "zfd_id", id) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd_id property");
+	} else if (devctl_ddef_int(ddef, "auto-assign-instance", 1) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create auto-assign-instance "
+		    "property");
+	} else if (devctl_ddef_int(ddef, "ddi-no-autodetach", 1) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create ddi-no-auto-detach "
+		    "property");
+	} else if (devctl_bus_dev_create(bus_hdl, ddef, 0, &child) == -1) {
+		zerror(zlogp, B_TRUE, "failed to create zfd node");
+	} else {
+		ret = 0;
+	}
+
+	devctl_ddef_free(ddef);
+	if (child != NULL)
+		devctl_release(child);
+	return (ret);
+}
+
+/*
+ * Make sure this zone has zfd devices with ids start..2 and that their /dev
+ * links exist.  When start is 0 (interactive mode) the devices are also put
+ * into tty mode.  Returns 0 on success and -1 on failure.
+ */
+static int
+init_zfd_devs(zlog_t *zlogp, int start)
+{
+	devctl_hdl_t bus = NULL;
+	di_devlink_handle_t links = NULL;
+	int wanted = 3 - start;
+	int existing;
+	int id;
+	int rv = -1;
+
+	/*
+	 * Don't re-setup zone fd devs if they already exist; in that case we
+	 * go straight to making devlinks, which we do for sanity's sake.
+	 */
+	existing = count_zfd_devs(zlogp);
+	if (existing != wanted) {
+		/* Wrong count (or the count failed): clear out what's there. */
+		if ((existing > 0 || existing == -1) &&
+		    destroy_zfd_devs(zlogp) == -1)
+			goto out;
+
+		/*
+		 * Time to make the devices.
+		 */
+		bus = devctl_bus_acquire(ZFDNEX_FILEPATH, 0);
+		if (bus == NULL) {
+			zerror(zlogp, B_TRUE, "devctl_bus_acquire failed");
+			goto out;
+		}
+
+		for (id = start; id < 3; id++) {
+			if (init_zfd_dev(zlogp, bus, id) != 0)
+				goto out;
+		}
+	}
+
+	links = di_devlink_init("zfd", DI_MAKE_LINK);
+	if (links == NULL) {
+		zerror(zlogp, B_TRUE, "failed to create devlinks");
+		goto out;
+	}
+	(void) di_devlink_fini(&links);
+	rv = 0;
+
+	/*
+	 * We know that start is 0 when we're interactive and that is the
+	 * only time we want to look like a tty.
+	 */
+	if (start == 0) {
+		for (id = 0; id < 3; id++)
+			make_tty(zlogp, id);
+	}
+
+out:
+	if (bus != NULL)
+		devctl_release(bus);
+	return (rv);
+}
+
+/*
+ * Create the zone's server socket: a listening AF_UNIX stream socket bound
+ * to SERVER_SOCKPATH for this zone.  Any stale socket file is removed first.
+ * Returns the listening descriptor, or -1 after logging the failure.
+ */
+static int
+init_server_sock(zlog_t *zlogp)
+{
+	struct sockaddr_un addr;
+	int fd;
+
+	bzero(&addr, sizeof (addr));
+	addr.sun_family = AF_UNIX;
+	(void) snprintf(addr.sun_path, sizeof (addr.sun_path),
+	    SERVER_SOCKPATH, zone_name);
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1) {
+		zerror(zlogp, B_TRUE, "server setup: could not create socket");
+		return (-1);
+	}
+	(void) unlink(addr.sun_path);
+
+	if (bind(fd, (struct sockaddr *)&addr, sizeof (addr)) == -1) {
+		zerror(zlogp, B_TRUE,
+		    "server setup: could not bind to socket");
+	} else if (listen(fd, 4) == -1) {
+		zerror(zlogp, B_TRUE,
+		    "server setup: could not listen on socket");
+	} else {
+		return (fd);
+	}
+
+	/* bind or listen failed: undo what we did. */
+	(void) unlink(addr.sun_path);
+	(void) close(fd);
+	return (-1);
+}
+
+/*
+ * Tear down the server socket: remove its file from the filesystem, then
+ * shut down and close the descriptor.
+ */
+static void
+destroy_server_sock(int servfd)
+{
+	char sockpath[MAXPATHLEN];
+
+	(void) snprintf(sockpath, sizeof (sockpath), SERVER_SOCKPATH,
+	    zone_name);
+	(void) unlink(sockpath);
+
+	(void) shutdown(servfd, SHUT_RDWR);
+	(void) close(servfd);
+}
+
+/*
+ * Read the "ident" string from the client's descriptor; this routine also
+ * tolerates being called with pid=NULL, for times when you want to "eat"
+ * the ident string from a client without saving it (locale must then be NULL
+ * and locale_len 0).
+ *
+ * The expected message is a single line of the form: IDENT <pid> <locale>
+ *
+ * Returns 0 once a complete line has been consumed (and, when pid is
+ * non-NULL, parsed into *pid and locale).  Returns -1 on a read error, on a
+ * line that does not start with "IDENT ", on a pid conversion error, or, in
+ * the pid=NULL case, if the client goes away before sending a newline.
+ */
+static int
+get_client_ident(int clifd, pid_t *pid, char *locale, size_t locale_len)
+{
+	char buf[BUFSIZ], *bufp;
+	size_t buflen = sizeof (buf);
+	char c = '\0';
+	int i = 0, r = 0;
+
+	/* "eat up the ident string" case, for simplicity */
+	if (pid == NULL) {
+		assert(locale == NULL && locale_len == 0);
+		while (read(clifd, &c, 1) == 1) {
+			if (c == '\n')
+				return (0);
+		}
+		/*
+		 * EOF or error before the newline.  We must not fall through
+		 * to the parsing code below, which stores through pid and
+		 * locale.
+		 */
+		return (-1);
+	}
+
+	bzero(buf, sizeof (buf));
+	while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) {
+		buflen--;
+		if (c == '\n')
+			break;
+
+		buf[i] = c;
+		i++;
+	}
+	if (r == -1)
+		return (-1);
+
+	/*
+	 * We've filled the buffer, but still haven't seen \n.  Keep eating
+	 * until we find it; we don't expect this to happen, but this is
+	 * defensive.
+	 */
+	if (c != '\n') {
+		while ((r = read(clifd, &c, sizeof (c))) > 0)
+			if (c == '\n')
+				break;
+	}
+
+	/*
+	 * Parse buffer for message of the form: IDENT <pid> <locale>
+	 */
+	bufp = buf;
+	if (strncmp(bufp, "IDENT ", 6) != 0)
+		return (-1);
+	bufp += 6;
+	errno = 0;
+	*pid = strtoll(bufp, &bufp, 10);
+	if (errno != 0)
+		return (-1);
+
+	/* <ctype.h> routines need a value representable as unsigned char. */
+	while (*bufp != '\0' && isspace((unsigned char)*bufp))
+		bufp++;
+	(void) strlcpy(locale, bufp, locale_len);
+
+	return (0);
+}
+
+/*
+ * Accept a pending connection on servfd and perform the handshake: read the
+ * client's ident line into *pid and locale, then answer "OK".  On success the
+ * new descriptor is returned, marked non-blocking and close-on-exec.  Returns
+ * -1 if the accept fails or the ident line is unusable (the connection is
+ * closed in the latter case).
+ */
+static int
+accept_client(int servfd, pid_t *pid, char *locale, size_t locale_len)
+{
+	int connfd;
+	struct sockaddr_un cliaddr;
+	socklen_t clilen;
+	int flags;
+
+	clilen = sizeof (cliaddr);
+	connfd = accept(servfd, (struct sockaddr *)&cliaddr, &clilen);
+	if (connfd == -1)
+		return (-1);
+	if (get_client_ident(connfd, pid, locale, locale_len) == -1) {
+		(void) shutdown(connfd, SHUT_RDWR);
+		(void) close(connfd);
+		return (-1);
+	}
+	(void) write(connfd, "OK\n", 3);
+
+	/*
+	 * O_NONBLOCK is a file status flag and has to be changed with
+	 * F_GETFL/F_SETFL; FD_CLOEXEC is a descriptor flag and is changed
+	 * with F_GETFD/F_SETFD.  Passing O_NONBLOCK to F_SETFD does not make
+	 * the socket non-blocking.
+	 */
+	flags = fcntl(connfd, F_GETFL, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFL, flags | O_NONBLOCK);
+
+	flags = fcntl(connfd, F_GETFD, 0);
+	if (flags != -1)
+		(void) fcntl(connfd, F_SETFD, flags | FD_CLOEXEC);
+
+	return (connfd);
+}
+
+/*
+ * Turn away a client that tried to connect while another client (clientpid)
+ * already holds the connection.  The pending connection is accepted, its
+ * ident line is consumed, and the pid of the current holder is sent back in
+ * place of "OK" before the connection is closed.
+ */
+static void
+reject_client(int servfd, pid_t clientpid)
+{
+	int connfd;
+	struct sockaddr_un cliaddr;
+	socklen_t clilen;
+	char nak[MAXPATHLEN];
+
+	clilen = sizeof (cliaddr);
+	connfd = accept(servfd, (struct sockaddr *)&cliaddr, &clilen);
+	if (connfd == -1) {
+		/* Nothing was accepted, so there is nobody to talk to. */
+		return;
+	}
+
+	/*
+	 * After getting its ident string, tell client to get lost.
+	 */
+	if (get_client_ident(connfd, NULL, NULL, 0) == 0) {
+		/* pid_t is a signed type; print it through a matching cast. */
+		(void) snprintf(nak, sizeof (nak), "%ld\n",
+		    (long)clientpid);
+		(void) write(connfd, nak, strlen(nak));
+	}
+	(void) shutdown(connfd, SHUT_RDWR);
+	(void) close(connfd);
+}
+
+/*
+ * Probe whether the client at the other end of the socket is still alive by
+ * writing an otherwise harmless 0-length message to it.  The client is
+ * considered gone (-1) only when that write fails with EPIPE; any other
+ * outcome is reported as alive (0).
+ */
+static int
+test_client(int clifd)
+{
+	if (write(clifd, "", 0) != -1)
+		return (0);
+
+	return ((errno == EPIPE) ? -1 : 0);
+}
+
+/*
+ * This routine drives the interactive I/O loop.  It polls for input from the
+ * zone side of the fds (the zone's stdout/stderr) and from the client
+ * (input destined for the zone's stdin).  Additionally, it polls on the
+ * server fd, and disconnects any clients that might try to hook up with the
+ * zone while the fds are in use.
+ *
+ * When the client first calls us up, it is expected to send a line giving its
+ * "identity"; this consists of the string 'IDENT <pid> <locale>'.  This is so
+ * that we can report that the fds are busy, along with some diagnostics
+ * about who has them busy; the locale is ignored here but kept for
+ * compatibility with the zlogin code when running on the zone's console.
+ *
+ * We need to handle the case where there is no server within the zone (or
+ * the server gets stuck) and data that we're writing to the zone server's
+ * stdin fills the pipe.  Because open_fd() always opens non-blocking, our
+ * writes could return -1 with EAGAIN.  Since we ignore errors on the write
+ * to stdin, we won't get blocked.
+ *
+ * The loop ends when shutting_down is set, when poll(2) fails with anything
+ * other than EINTR, when a read hits EOF or a hard error, when a polled fd
+ * reports only an error condition, when the current client is found to be
+ * dead, when accepting a new client fails, or when any event is reported on
+ * the eventstream.  A connected client is shut down and closed on the way
+ * out.
+ */
+static void
+do_zfd_io(int servfd, int stdinfd, int stdoutfd, int stderrfd)
+{
+	struct pollfd pollfds[5];
+	char ibuf[BUFSIZ];
+	int cc, ret;
+	int clifd = -1;
+	int pollerr = 0;
+	char clilocale[MAXPATHLEN];
+	pid_t clipid = 0;
+
+	/* client, watch for read events (fd is -1 until a client connects) */
+	pollfds[0].fd = clifd;
+	pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND |
+	    POLLPRI | POLLERR | POLLHUP | POLLNVAL;
+
+	/* stdout, watch for read events */
+	pollfds[1].fd = stdoutfd;
+	pollfds[1].events = pollfds[0].events;
+
+	/* stderr, watch for read events */
+	pollfds[2].fd = stderrfd;
+	pollfds[2].events = pollfds[0].events;
+
+	/* the server socket; watch for events (new connections) */
+	pollfds[3].fd = servfd;
+	pollfds[3].events = pollfds[0].events;
+
+	/* the eventstream; any input means the zone is halting */
+	pollfds[4].fd = eventstream[1];
+	pollfds[4].events = pollfds[0].events;
+
+	while (!shutting_down) {
+		pollfds[0].revents = pollfds[1].revents = 0;
+		pollfds[2].revents = pollfds[3].revents = 0;
+		pollfds[4].revents = 0;
+
+		ret = poll(pollfds, 5, -1);
+		if (ret == -1 && errno != EINTR) {
+			zerror(zlogp, B_TRUE, "poll failed");
+			/* we are hosed, close connection */
+			break;
+		}
+
+		/* event from client side */
+		if (pollfds[0].revents) {
+			if (pollfds[0].revents &
+			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+				errno = 0;
+				cc = read(clifd, ibuf, BUFSIZ);
+				if (cc <= 0 && (errno != EINTR) &&
+				    (errno != EAGAIN)) {
+					break;
+				}
+				/*
+				 * See comment for this function on what
+				 * happens if there is no reader in the zone.
+				 * Only forward when data was actually read;
+				 * after EINTR/EAGAIN cc is -1 and must not
+				 * be used as a length.
+				 */
+				if (cc > 0)
+					(void) write(stdinfd, ibuf, cc);
+			} else {
+				pollerr = pollfds[0].revents;
+				zerror(zlogp, B_FALSE, "closing connection "
+				    "with client, pollerr %d\n", pollerr);
+				break;
+			}
+		}
+
+		/* event from stdout */
+		if (pollfds[1].revents) {
+			if (pollfds[1].revents &
+			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+				errno = 0;
+				cc = read(stdoutfd, ibuf, BUFSIZ);
+				if (cc <= 0 && (errno != EINTR) &&
+				    (errno != EAGAIN))
+					break;
+				/*
+				 * Lose I/O if no one is listening
+				 */
+				if (clifd != -1 && cc > 0)
+					(void) write(clifd, ibuf, cc);
+			} else {
+				pollerr = pollfds[1].revents;
+				zerror(zlogp, B_FALSE,
+				    "closing connection with stdout zfd, "
+				    "pollerr %d\n", pollerr);
+				break;
+			}
+		}
+
+		/* event from stderr */
+		if (pollfds[2].revents) {
+			if (pollfds[2].revents &
+			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+				errno = 0;
+				cc = read(stderrfd, ibuf, BUFSIZ);
+				if (cc <= 0 && (errno != EINTR) &&
+				    (errno != EAGAIN))
+					break;
+				/*
+				 * Lose I/O if no one is listening
+				 */
+				if (clifd != -1 && cc > 0)
+					(void) write(clifd, ibuf, cc);
+			} else {
+				pollerr = pollfds[2].revents;
+				zerror(zlogp, B_FALSE,
+				    "closing connection with stderr zfd, "
+				    "pollerr %d\n", pollerr);
+				break;
+			}
+		}
+
+		/* event from server socket */
+		if (pollfds[3].revents &&
+		    (pollfds[3].revents & (POLLIN | POLLRDNORM))) {
+			if (clifd != -1) {
+				/*
+				 * Test the client to see if it is really
+				 * still alive.  If it has died but we
+				 * haven't yet detected that, we might
+				 * deny a legitimate connect attempt.  If it
+				 * is dead, we break out; once we tear down
+				 * the old connection, the new connection
+				 * will happen.
+				 */
+				if (test_client(clifd) == -1) {
+					break;
+				}
+				/* we're already handling a client */
+				reject_client(servfd, clipid);
+
+			} else if ((clifd = accept_client(servfd, &clipid,
+			    clilocale, sizeof (clilocale))) != -1) {
+				pollfds[0].fd = clifd;
+
+			} else {
+				break;
+			}
+		}
+
+		/*
+		 * Watch for events on the eventstream.  This is how we get
+		 * notified of the zone halting, etc.  It provides us a
+		 * "wakeup" from poll when important things happen, which
+		 * is good.
+		 */
+		if (pollfds[4].revents) {
+			break;
+		}
+	}
+
+	if (clifd != -1) {
+		(void) shutdown(clifd, SHUT_RDWR);
+		(void) close(clifd);
+	}
+}
+
+/*
+ * Copy the first slen bytes of sbuf into dbuf as the body of a JSON string,
+ * applying JSON escapes.  The input is decoded as multibyte characters in
+ * the current locale and is not modified.
+ *
+ *  - '\\', '"', '\b', '\f', '\n', '\r' and '\t' use their two-character
+ *    escapes.
+ *  - Other characters in the range 0x20-0x7f are copied as-is.
+ *  - Remaining characters up to 0xffff become "\uXXXX"; characters above
+ *    that (up to 0x10ffff) become a UTF-16 surrogate pair of "\uXXXX"
+ *    escapes.
+ *  - A byte sequence that cannot be decoded is replaced with "\ufffd".
+ *  - Conversion stops at an embedded NUL character.
+ *
+ * Since the destination can be larger than the source, the output may get
+ * truncated; truncation always happens between characters, never in the
+ * middle of an escape.  dbuf is always NUL-terminated when dlen > 0, and
+ * nothing is written when dlen <= 0.
+ */
+static void
+escape_json(char *sbuf, int slen, char *dbuf, int dlen)
+{
+	mbstate_t mbr;
+	wchar_t c;
+	size_t sz;
+	unsigned long uc;
+	char esc[16];
+	int elen;
+	int i = 0;
+
+	if (dlen <= 0)
+		return;
+
+	(void) memset(&mbr, 0, sizeof (mbr));
+
+	while (slen > 0) {
+		sz = mbrtowc(&c, sbuf, (size_t)slen, &mbr);
+		if (sz == 0)
+			break;		/* embedded NUL ends the message */
+
+		if (sz == (size_t)-1 || sz == (size_t)-2) {
+			/*
+			 * Invalid or incomplete sequence: emit U+FFFD and
+			 * resynchronize.  An invalid byte is skipped; an
+			 * incomplete trailing sequence consumes the rest.
+			 */
+			(void) memset(&mbr, 0, sizeof (mbr));
+			uc = 0xfffd;
+			sz = (sz == (size_t)-2) ? (size_t)slen : 1;
+		} else {
+			uc = (unsigned long)c;
+		}
+
+		/* Characters that have a dedicated two-character escape. */
+		elen = 2;
+		switch (uc) {
+		case '\\':
+			esc[1] = '\\';
+			break;
+		case '"':
+			esc[1] = '"';
+			break;
+		case '\b':
+			esc[1] = 'b';
+			break;
+		case '\f':
+			esc[1] = 'f';
+			break;
+		case '\n':
+			esc[1] = 'n';
+			break;
+		case '\r':
+			esc[1] = 'r';
+			break;
+		case '\t':
+			esc[1] = 't';
+			break;
+		default:
+			elen = 0;
+			break;
+		}
+
+		if (elen == 2) {
+			esc[0] = '\\';
+		} else if (uc >= 0x20 && uc <= 0x7f) {
+			esc[0] = (char)uc;
+			elen = 1;
+		} else if (uc <= 0xffff) {
+			elen = snprintf(esc, sizeof (esc), "\\u%04lx", uc);
+		} else if (uc <= 0x10ffff) {
+			/* Outside the BMP: encode as a surrogate pair. */
+			uc -= 0x10000;
+			elen = snprintf(esc, sizeof (esc), "\\u%04lx\\u%04lx",
+			    0xd800 + (uc >> 10), 0xdc00 + (uc & 0x3ff));
+		} else {
+			/* Not a valid code point (e.g. negative wchar_t). */
+			elen = snprintf(esc, sizeof (esc), "\\ufffd");
+		}
+
+		/* Keep room for this whole character plus the terminator. */
+		if (elen >= dlen - i)
+			break;
+
+		(void) memcpy(&dbuf[i], esc, (size_t)elen);
+		i += elen;
+		sbuf += sz;
+		slen -= (int)sz;
+	}
+
+	dbuf[i] = '\0';
+}
+
+/*
+ * We output to the log file as json.
+ * ex. for string 'msg\n' on the zone's stdout:
+ * {"log":"msg\n","stream":"stdout","time":"2014-10-24T20:12:11.101973117Z"}
+ *
+ * We use ns in the last field of the timestamp for compatibility.
+ *
+ * buf/len is the raw data read from the zone; from selects the "stream"
+ * value: 1 means "stdout", anything else means "stderr".  The record is
+ * written to logfd and any write error is ignored.  Nothing is written if
+ * the current time cannot be obtained or converted.
+ */
+static void
+wr_log_msg(char *buf, int len, int from)
+{
+	struct timeval tv;
+	struct tm tm;
+	int olen;
+	char ts[64];
+	char nbuf[BUFSIZ * 2];
+	/* room for the escaped message plus the fixed json framing */
+	char obuf[BUFSIZ * 2 + 128];
+
+	escape_json(buf, len, nbuf, sizeof (nbuf));
+
+	if (gettimeofday(&tv, NULL) != 0)
+		return;
+	/* gmtime() returns static storage; this runs in a threaded daemon */
+	if (gmtime_r(&tv.tv_sec, &tm) == NULL)
+		return;
+	(void) strftime(ts, sizeof (ts), "%FT%T", &tm);
+
+	/*
+	 * The fractional part must be zero-padded to nine digits, otherwise
+	 * e.g. 1234 usec would print as ".1234000" instead of ".001234000".
+	 */
+	olen = snprintf(obuf, sizeof (obuf),
+	    "{\"log\":\"%s\",\"stream\":\"%s\",\"time\":\"%s.%09ldZ\"}\n",
+	    nbuf, (from == 1) ? "stdout" : "stderr", ts,
+	    (long)tv.tv_usec * 1000);
+	if (olen < 0)
+		return;
+	/* snprintf reports the untruncated length; never write past obuf */
+	if ((size_t)olen >= sizeof (obuf))
+		olen = sizeof (obuf) - 1;
+
+	(void) write(logfd, obuf, olen);
+}
+
+/*
+ * Handle the poll result for one of the zone's output streams.  If the fd
+ * is readable, one buffer of data is read and passed to wr_log_msg() tagged
+ * with 'from' (1 for stdout, 2 for stderr).  Returns 0 if the logging loop
+ * should keep running, or -1 if it should stop: the read hit EOF or a hard
+ * error, or the fd reported only an error condition (which is logged using
+ * 'name' to identify the stream).
+ */
+static int
+log_zfd_stream(struct pollfd *pfd, int from, const char *name)
+{
+	char ibuf[BUFSIZ];
+	int cc;
+	int pollerr;
+
+	if (pfd->revents == 0)
+		return (0);
+
+	if (!(pfd->revents & (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI))) {
+		pollerr = pfd->revents;
+		zerror(zlogp, B_FALSE, "closing connection "
+		    "with zfd %s, pollerr %d\n", name, pollerr);
+		return (-1);
+	}
+
+	errno = 0;
+	cc = read(pfd->fd, ibuf, BUFSIZ);
+	if (cc <= 0 && errno != EINTR && errno != EAGAIN)
+		return (-1);
+	if (cc > 0)
+		wr_log_msg(ibuf, cc, from);
+
+	return (0);
+}
+
+/*
+ * This routine runs the log file I/O loop.  It polls for input from the
+ * zone's stdout and stderr, formats the msg in json and writes it to the
+ * log file.  The loop ends when shutting_down is set, when poll(2) fails
+ * with anything other than EINTR, when either stream hits EOF or an error,
+ * or when any event is reported on the eventstream.  The log file is closed
+ * (and logfd reset to -1) before returning.
+ */
+static void
+do_zfd_logging(int stdoutfd, int stderrfd)
+{
+	struct pollfd pollfds[3];
+	int ret;
+
+	/* stdout, watch for read events */
+	pollfds[0].fd = stdoutfd;
+	pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND |
+	    POLLPRI | POLLERR | POLLHUP | POLLNVAL;
+
+	/* stderr, watch for read events */
+	pollfds[1].fd = stderrfd;
+	pollfds[1].events = pollfds[0].events;
+
+	/* the eventstream; any input means the zone is halting */
+	pollfds[2].fd = eventstream[1];
+	pollfds[2].events = pollfds[0].events;
+
+	while (!shutting_down) {
+		pollfds[0].revents = 0;
+		pollfds[1].revents = 0;
+		pollfds[2].revents = 0;
+
+		ret = poll(pollfds, 3, -1);
+		if (ret == -1 && errno != EINTR) {
+			zerror(zlogp, B_TRUE, "poll failed");
+			/* we are hosed, shutdown logger */
+			break;
+		}
+
+		/* event from zone's stdout */
+		if (log_zfd_stream(&pollfds[0], 1, "stdout") == -1)
+			break;
+
+		/* event from zone's stderr */
+		if (log_zfd_stream(&pollfds[1], 2, "stderr") == -1)
+			break;
+
+		/*
+		 * Watch for events on the eventstream.  This is how we get
+		 * notified of the zone halting.  It provides us a "wakeup"
+		 * from poll.
+		 */
+		if (pollfds[2].revents)
+			break;
+	}
+
+	(void) close(logfd);
+	logfd = -1;
+}
+
+/*
+ * Open the master side of this zone's zfd device number 'id'
+ * (/dev/zfd/<zone_name>/master/<id>).  Device 0 is the zone's stdin, which
+ * we write to, so it is opened write-only; any other id is a stream we read
+ * from and is opened read-only.  All opens are non-blocking, close-on-exec
+ * and never acquire a controlling tty.
+ *
+ * A failed open is retried once per second until it succeeds, the retry
+ * budget is used up, or shutting_down is set.  Returns the open fd, or -1.
+ */
+static int
+open_fd(int id)
+{
+	char stdpath[MAXPATHLEN];
+	int oflags = O_NONBLOCK | O_NOCTTY | O_CLOEXEC;
+	int attempts;
+	int fd;
+
+	(void) snprintf(stdpath, sizeof (stdpath), "/dev/zfd/%s/master/%d",
+	    zone_name, id);
+
+	/* we feed the zone's stdin and drain its stdout/stderr */
+	oflags |= (id == 0) ? O_WRONLY : O_RDONLY;
+
+	for (attempts = 0; !shutting_down; attempts++) {
+		fd = open(stdpath, oflags);
+		if (fd != -1)
+			return (fd);
+
+		if (attempts > 60)
+			break;
+
+		(void) sleep(1);
+	}
+
+	return (-1);
+}
+
+/*
+ * Body of the worker thread to perform interactive IO to the stdin, stdout and
+ * stderr zfd's.
+ *
+ * The stdin, stdout and stderr are from the perspective of the process inside
+ * the zone, so the zoneadmd view is opposite (i.e. we write to the stdin fd
+ * and read from the stdout/stderr fds).
+ *
+ * Until shutting_down is set, each pass sets up the server socket, opens the
+ * three zfd devices, runs do_zfd_io() and then tears everything down again.
+ * Every descriptor is reset to -1 once released so that a later pass which
+ * bails out early never closes a stale descriptor number a second time.
+ */
+static void
+interactive()
+{
+	int serverfd = -1;
+	int stdinfd = -1;
+	int stdoutfd = -1;
+	int stderrfd = -1;
+
+	if (pipe(eventstream) != 0) {
+		zerror(zlogp, B_TRUE, "failed to open interactive control "
+		    "pipe");
+		return;
+	}
+
+	while (!shutting_down) {
+		if ((serverfd = init_server_sock(zlogp)) == -1) {
+			zerror(zlogp, B_FALSE,
+			    "server setup: socket initialization failed");
+			goto death;
+		}
+
+		if (!shutting_down) {
+			if ((stdinfd = open_fd(0)) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to open stdin zfd");
+				goto death;
+			}
+
+			/*
+			 * Setting RPROTDIS on the stream means that the
+			 * control portion of messages received (which we don't
+			 * care about) will be discarded by the stream head. If
+			 * we allowed such messages, we wouldn't be able to use
+			 * read(2), as it fails (EBADMSG) when a message with a
+			 * control element is received.
+			 */
+			if (ioctl(stdinfd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to set options on stdin zfd");
+				goto death;
+			}
+		}
+
+		if (!shutting_down) {
+			if ((stdoutfd = open_fd(1)) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to open stdout zfd");
+				goto death;
+			}
+
+			if (ioctl(stdoutfd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to set options on stdout zfd");
+				goto death;
+			}
+		}
+
+		if (!shutting_down) {
+			if ((stderrfd = open_fd(2)) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to open stderr zfd");
+				goto death;
+			}
+
+			if (ioctl(stderrfd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+				zerror(zlogp, B_TRUE,
+				    "failed to set options on stderr zfd");
+				goto death;
+			}
+		}
+
+		do_zfd_io(serverfd, stdinfd, stdoutfd, stderrfd);
+death:
+		destroy_server_sock(serverfd);
+		serverfd = -1;
+
+		/*
+		 * Forget each fd as it is closed.  Another thread may be
+		 * handed the same number, so it must never be closed again
+		 * from here.
+		 */
+		if (stdinfd != -1) {
+			(void) close(stdinfd);
+			stdinfd = -1;
+		}
+		if (stdoutfd != -1) {
+			(void) close(stdoutfd);
+			stdoutfd = -1;
+		}
+		if (stderrfd != -1) {
+			(void) close(stderrfd);
+			stderrfd = -1;
+		}
+	}
+
+	(void) close(eventstream[0]);
+	eventstream[0] = -1;
+	(void) close(eventstream[1]);
+	eventstream[1] = -1;
+}
+
+/*
+ * (Re)open the zone's log file, <zonepath>/logs/<LOGNAME>, for appending and
+ * store the descriptor in logfd.  The logs directory is created first if
+ * needed; a failure of that mkdir is ignored here and will surface as an
+ * open failure instead.  On failure logfd is left at -1 and the error is
+ * reported through zerror().
+ */
+static void
+open_logfile()
+{
+	char path[MAXPATHLEN];
+	const int oflags = O_WRONLY | O_APPEND | O_CREAT;
+
+	logfd = -1;
+
+	/* the directory that holds the log file */
+	(void) snprintf(path, sizeof (path), "%s/logs", zonepath);
+	(void) mkdir(path, 0700);
+
+	/* the log file itself */
+	(void) snprintf(path, sizeof (path), "%s/logs/%s", zonepath,
+	    LOGNAME);
+
+	logfd = open(path, oflags, 0600);
+	if (logfd == -1)
+		zerror(zlogp, B_TRUE, "failed to open log file");
+}
+
+/*
+ * SIGHUP handler (installed by logger()) used for log rotation: the current
+ * log file descriptor is closed and the log file is opened again by name, so
+ * that output continues in a fresh file once the old one has been moved
+ * aside.  The signal number is not used.
+ *
+ * NOTE(review): open_logfile() calls snprintf(), mkdir() and zerror() from
+ * signal context, and logfd changes underneath the logging loop -- confirm
+ * this is acceptable or defer the reopen to the logger thread.
+ */
+/* ARGSUSED */
+void
+hup_handler(int i)
+{
+	/* release the old file first; open_logfile() overwrites logfd */
+	(void) close(logfd);
+	open_logfile();
+}
+
+/*
+ * Body of the worker thread to log the zfd's stdout and stderr to a log file.
+ *
+ * The stdout and stderr are from the perspective of the process inside the
+ * zone, so the zoneadmd view is opposite (i.e. we read from the stdout/stderr
+ * fds). Since this is the logger worker we ignore the zone's stdin fd.
+ *
+ * The thread opens the log file, arranges to receive SIGHUP (for log
+ * rotation), creates the eventstream control pipe, opens the stdout and
+ * stderr zfd devices and then runs do_zfd_logging() until told to stop.
+ * On the way out every descriptor it owns is closed, and logfd is reset to
+ * -1 so that a later SIGHUP cannot close a stale descriptor number.
+ */
+static void
+logger()
+{
+	int stdoutfd = -1;
+	int stderrfd = -1;
+	sigset_t blockset;
+
+	if (!shutting_down) {
+		open_logfile();
+	}
+
+	/*
+	 * This thread should receive SIGHUP so that it can close the log
+	 * file, and reopen it, during log rotation.
+	 */
+	sigset(SIGHUP, hup_handler);
+	(void) sigfillset(&blockset);
+	(void) sigdelset(&blockset, SIGHUP);
+	(void) thr_sigsetmask(SIG_BLOCK, &blockset, NULL);
+
+	if (!shutting_down) {
+		if (pipe(eventstream) != 0) {
+			zerror(zlogp, B_TRUE, "failed to open logger control "
+			    "pipe");
+			goto death;
+		}
+	}
+
+	if (!shutting_down) {
+		if ((stdoutfd = open_fd(1)) == -1) {
+			zerror(zlogp, B_TRUE, "failed to open stdout zfd");
+			goto death;
+		}
+
+		/*
+		 * Setting RPROTDIS on the stream means that the control
+		 * portion of messages received (which we don't care about)
+		 * will be discarded by the stream head. If we allowed such
+		 * messages, we wouldn't be able to use read(2), as it fails
+		 * (EBADMSG) when a message with a control element is received.
+		 */
+		if (ioctl(stdoutfd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+			zerror(zlogp, B_TRUE, "failed to set options on "
+			    "stdout zfd");
+			goto death;
+		}
+	}
+
+	if (!shutting_down) {
+		if ((stderrfd = open_fd(2)) == -1) {
+			zerror(zlogp, B_TRUE, "failed to open stderr zfd");
+			goto death;
+		}
+
+		if (ioctl(stderrfd, I_SRDOPT, RNORM|RPROTDIS) == -1) {
+			zerror(zlogp, B_TRUE, "failed to set options on "
+			    "stderr zfd");
+			goto death;
+		}
+	}
+
+	do_zfd_logging(stdoutfd, stderrfd);
+
+death:
+	(void) close(eventstream[0]);
+	eventstream[0] = -1;
+	(void) close(eventstream[1]);
+	eventstream[1] = -1;
+
+	/*
+	 * do_zfd_logging() already closes the log file on its way out, but
+	 * the early-exit paths above do not.  Either way, leave logfd at -1
+	 * afterwards: the SIGHUP handler is process-wide and would otherwise
+	 * close whatever descriptor later reuses this number.
+	 */
+	if (logfd != -1) {
+		(void) close(logfd);
+		logfd = -1;
+	}
+	if (stdoutfd != -1)
+		(void) close(stdoutfd);
+	if (stderrfd != -1)
+		(void) close(stderrfd);
+}
+
+/*
+ * Look up the ZLOG_MODE attribute in this zone's configuration and map it
+ * to a logging mode: a value starting with "log" selects ZLOG_LOG and a
+ * value starting with "int" selects ZLOG_INTERACTIVE.  ZLOG_NONE is returned
+ * when the attribute is absent, has any other value, or the configuration
+ * cannot be read.
+ */
+static zlog_mode_t
+get_logger_mode()
+{
+	zone_dochandle_t handle;
+	struct zone_attrtab attr;
+	zlog_mode_t mode = ZLOG_NONE;
+
+	handle = zonecfg_init_handle();
+	if (handle == NULL)
+		return (mode);
+
+	if (zonecfg_get_handle(zone_name, handle) == Z_OK &&
+	    zonecfg_setattrent(handle) == Z_OK) {
+		while (zonecfg_getattrent(handle, &attr) == Z_OK) {
+			/* skip everything but the log-mode attribute */
+			if (strcmp(ZLOG_MODE, attr.zone_attr_name) != 0)
+				continue;
+
+			if (strncmp("log", attr.zone_attr_value, 3) == 0)
+				mode = ZLOG_LOG;
+			else if (strncmp("int", attr.zone_attr_value, 3) == 0)
+				mode = ZLOG_INTERACTIVE;
+
+			/* only the first matching attribute is considered */
+			break;
+		}
+		(void) zonecfg_endattrent(handle);
+	}
+
+	zonecfg_fini_handle(handle);
+	return (mode);
+}
+
+/*
+ * Start the zfd worker thread for the zone, if the zone is configured for
+ * one.  The mode from get_logger_mode() picks the worker: interactive() for
+ * ZLOG_INTERACTIVE, logger() otherwise; ZLOG_NONE means there is nothing to
+ * do.  The zfd devices are set up via init_zfd_devs() before the thread is
+ * created (the second argument is 0 for interactive mode and 1 for logging
+ * mode -- presumably the first device index needed; confirm against
+ * init_zfd_devs()).  logp is remembered in zlogp for later error reporting.
+ * The zone id argument is currently unused.
+ */
+void
+create_log_thread(zlog_t *logp, zoneid_t id)
+{
+	int res;
+	int zdev_start;
+	zlog_mode_t mode;
+	void *(*worker) (void*);
+
+	shutting_down = 0;
+	zlogp = logp;
+
+	mode = get_logger_mode();
+	if (mode == ZLOG_NONE)
+		return;
+
+	if (mode == ZLOG_INTERACTIVE) {
+		worker = (void *(*)(void *))interactive;
+		zdev_start = 0;
+	} else {
+		worker = (void *(*)(void *))logger;
+		zdev_start = 1;
+	}
+
+	if (init_zfd_devs(zlogp, zdev_start) == -1) {
+		zerror(zlogp, B_FALSE,
+		    "zfd setup: device initialization failed");
+		return;
+	}
+
+	/*
+	 * Default stack (base NULL, size 0) and no creation flags; the size
+	 * and flags arguments are integers, so pass 0 rather than NULL.
+	 */
+	res = thr_create(NULL, 0, worker, NULL, 0, &logger_tid);
+	if (res != 0) {
+		zerror(zlogp, B_FALSE, "error %d creating logger thread", res);
+		logger_tid = 0;
+	}
+}
+
+/*
+ * Stop the zfd worker thread, if one is running, and tear down the zfd
+ * devices.  The worker is asked to stop by setting shutting_down and, when
+ * the control pipe exists, writing to it so that the worker's poll() wakes
+ * up; the thread is then joined.  destroy_zfd_devs() is called whether or
+ * not a worker was running.
+ */
+void
+destroy_log_thread()
+{
+	int wakeup = 1;
+
+	if (logger_tid != 0) {
+		shutting_down = 1;
+
+		/* kick the worker out of poll so it notices the flag */
+		if (eventstream[0] != -1) {
+			(void) write(eventstream[0], &wakeup,
+			    sizeof (wakeup));
+		}
+
+		(void) thr_join(logger_tid, NULL, NULL);
+		logger_tid = 0;
+	}
+
+	(void) destroy_zfd_devs(zlogp);
+}
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.c b/usr/src/cmd/zoneadmd/zoneadmd.c
index bb53b01d16..72c14bc9ff 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.c
+++ b/usr/src/cmd/zoneadmd/zoneadmd.c
@@ -111,6 +111,7 @@
static char *progname;
char *zone_name; /* zone which we are managing */
zone_dochandle_t snap_hndl; /* handle for snapshot created when ready */
+char zonepath[MAXNAMELEN];
char pool_name[MAXNAMELEN];
char default_brand[MAXNAMELEN];
char brand_name[MAXNAMELEN];
@@ -620,15 +621,8 @@ mount_early_fs(void *data, const char *spec, const char *dir,
/* determine the zone rootpath */
if (mount_cmd) {
- char zonepath[MAXPATHLEN];
char luroot[MAXPATHLEN];
- if (zone_get_zonepath(zone_name,
- zonepath, sizeof (zonepath)) != Z_OK) {
- zerror(zlogp, B_FALSE, "unable to determine zone path");
- return (-1);
- }
-
(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
resolve_lofs(zlogp, luroot, sizeof (luroot));
(void) strlcpy(rootpath, luroot, sizeof (rootpath));
@@ -1014,7 +1008,7 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
{
zoneid_t zoneid;
struct stat st;
- char zpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
+ char rpath[MAXPATHLEN], initpath[MAXPATHLEN], init_file[MAXPATHLEN];
char nbootargs[BOOTARGS_MAX];
char cmdbuf[MAXPATHLEN];
fs_callback_t cb;
@@ -1058,13 +1052,8 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
/*
* Get the brand's boot callback if it exists.
*/
- if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
- zerror(zlogp, B_FALSE, "unable to determine zone path");
- brand_close(bh);
- goto bad;
- }
(void) strcpy(cmdbuf, EXEC_PREFIX);
- if (brand_get_boot(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+ if (brand_get_boot(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
sizeof (cmdbuf) - EXEC_LEN) != 0) {
zerror(zlogp, B_FALSE,
"unable to determine branded zone's boot callback");
@@ -1092,12 +1081,12 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
assert(init_file[0] != '\0');
/* Try to anticipate possible problems: Make sure init is executable. */
- if (zone_get_rootpath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
+ if (zone_get_rootpath(zone_name, rpath, sizeof (rpath)) != Z_OK) {
zerror(zlogp, B_FALSE, "unable to determine zone root");
goto bad;
}
- (void) snprintf(initpath, sizeof (initpath), "%s%s", zpath, init_file);
+ (void) snprintf(initpath, sizeof (initpath), "%s%s", rpath, init_file);
if (stat(initpath, &st) == -1) {
zerror(zlogp, B_TRUE, "could not stat %s", initpath);
@@ -1165,6 +1154,9 @@ zone_bootup(zlog_t *zlogp, const char *bootargs, int zstate, boolean_t debug)
if (brand_poststatechg(zlogp, zstate, Z_BOOT, debug) != 0)
goto bad;
+ /* Startup a thread to perform zfd logging/tty svc for the zone. */
+ create_log_thread(zlogp, zone_id);
+
/* Startup a thread to perform memory capping for the zone. */
create_mcap_thread(zlogp, zone_id);
@@ -1195,9 +1187,13 @@ zone_halt(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting, int zstate,
if (vplat_teardown(zlogp, unmount_cmd, rebooting, debug) != 0) {
if (!bringup_failure_recovery)
zerror(zlogp, B_FALSE, "unable to destroy zone");
+ destroy_log_thread();
return (-1);
}
+ /* Shut down is done, stop the log thread */
+ destroy_log_thread();
+
if (brand_poststatechg(zlogp, zstate, Z_HALT, debug) != 0)
return (-1);
@@ -1218,7 +1214,6 @@ zone_graceful_shutdown(zlog_t *zlogp)
pid_t child;
char cmdbuf[MAXPATHLEN];
brand_handle_t bh = NULL;
- char zpath[MAXPATHLEN];
ctid_t ct;
int tmpl_fd;
int child_status;
@@ -1239,18 +1234,12 @@ zone_graceful_shutdown(zlog_t *zlogp)
return (-1);
}
- if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
- zerror(zlogp, B_FALSE, "unable to determine zone path");
- brand_close(bh);
- return (-1);
- }
-
/*
* If there is a brand 'shutdown' callback, execute it now to give the
* brand a chance to cleanup any custom configuration.
*/
(void) strcpy(cmdbuf, EXEC_PREFIX);
- if (brand_get_shutdown(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
+ if (brand_get_shutdown(bh, zone_name, zonepath, cmdbuf + EXEC_LEN,
sizeof (cmdbuf) - EXEC_LEN) != 0 || strlen(cmdbuf) <= EXEC_LEN) {
(void) strcat(cmdbuf, SHUTDOWN_DEFAULT);
}
@@ -1397,15 +1386,12 @@ audit_put_record(zlog_t *zlogp, ucred_t *uc, int return_val,
static void
log_init_exit(int status)
{
- char zpath[MAXPATHLEN];
char p[MAXPATHLEN];
char buf[128];
struct timeval t;
int fd;
- if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK)
- return;
- if (snprintf(p, sizeof (p), "%s/lastexited", zpath) > sizeof (p))
+ if (snprintf(p, sizeof (p), "%s/lastexited", zonepath) > sizeof (p))
return;
if (gettimeofday(&t, NULL) != 0)
return;
@@ -2035,12 +2021,15 @@ top:
zone_name, zone_state_str(zstate));
/*
- * Startup a thread to perform memory capping for the
+ * Startup a thread to perform the zfd logging/tty svc
+ * and a thread to perform memory capping for the
* zone. zlogp won't be valid for much longer so use
* logsys.
*/
- if ((zid = getzoneidbyname(zone_name)) != -1)
+ if ((zid = getzoneidbyname(zone_name)) != -1) {
+ create_log_thread(&logsys, zid);
create_mcap_thread(&logsys, zid);
+ }
/* recover the global configuration snapshot */
if (snap_hndl == NULL) {
@@ -2120,15 +2109,10 @@ set_brand_env(zlog_t *zlogp)
static int
brand_callback_init(brand_handle_t bh, char *zone_name)
{
- char zpath[MAXPATHLEN];
-
- if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK)
- return (-1);
-
(void) strlcpy(pre_statechg_hook, EXEC_PREFIX,
sizeof (pre_statechg_hook));
- if (brand_get_prestatechange(bh, zone_name, zpath,
+ if (brand_get_prestatechange(bh, zone_name, zonepath,
pre_statechg_hook + EXEC_LEN,
sizeof (pre_statechg_hook) - EXEC_LEN) != 0)
return (-1);
@@ -2139,7 +2123,7 @@ brand_callback_init(brand_handle_t bh, char *zone_name)
(void) strlcpy(post_statechg_hook, EXEC_PREFIX,
sizeof (post_statechg_hook));
- if (brand_get_poststatechange(bh, zone_name, zpath,
+ if (brand_get_poststatechange(bh, zone_name, zonepath,
post_statechg_hook + EXEC_LEN,
sizeof (post_statechg_hook) - EXEC_LEN) != 0)
return (-1);
@@ -2150,7 +2134,7 @@ brand_callback_init(brand_handle_t bh, char *zone_name)
(void) strlcpy(query_hook, EXEC_PREFIX,
sizeof (query_hook));
- if (brand_get_query(bh, zone_name, zpath, query_hook + EXEC_LEN,
+ if (brand_get_query(bh, zone_name, zonepath, query_hook + EXEC_LEN,
sizeof (query_hook) - EXEC_LEN) != 0)
return (-1);
@@ -2278,6 +2262,11 @@ main(int argc, char *argv[])
return (1);
}
+ if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
+ zerror(zlogp, B_FALSE, "unable to determine zone path");
+ return (-1);
+ }
+
if (zonecfg_default_brand(default_brand,
sizeof (default_brand)) != Z_OK) {
zerror(zlogp, B_FALSE, "unable to determine default brand");
diff --git a/usr/src/cmd/zoneadmd/zoneadmd.h b/usr/src/cmd/zoneadmd/zoneadmd.h
index ceab787dab..7e5dcea432 100644
--- a/usr/src/cmd/zoneadmd/zoneadmd.h
+++ b/usr/src/cmd/zoneadmd/zoneadmd.h
@@ -91,6 +91,7 @@ extern mutex_t msglock;
extern boolean_t in_death_throes;
extern boolean_t bringup_failure_recovery;
extern char *zone_name;
+extern char zonepath[MAXNAMELEN];
extern zone_dochandle_t snap_hndl;
extern char pool_name[MAXNAMELEN];
extern char brand_name[MAXNAMELEN];
@@ -164,6 +165,12 @@ extern void create_mcap_thread(zlog_t *, zoneid_t);
extern void destroy_mcap_thread();
/*
+ * Zone FD log thread creation.
+ */
+extern void create_log_thread(zlog_t *, zoneid_t);
+extern void destroy_log_thread();
+
+/*
* Contract handling.
*/
extern int init_template(void);
diff --git a/usr/src/lib/brand/lx/zone/platform.xml b/usr/src/lib/brand/lx/zone/platform.xml
index 3df4b62922..e6a2ef46e3 100644
--- a/usr/src/lib/brand/lx/zone/platform.xml
+++ b/usr/src/lib/brand/lx/zone/platform.xml
@@ -121,6 +121,9 @@
<!-- Renamed devices to create under /dev -->
<device match="brand/lx/ptmx" name="ptmx" />
<device match="zcons/%z/zoneconsole" name="console" />
+ <device match="zfd/%z/slave/0" name="zfd/0" />
+ <device match="zfd/%z/slave/1" name="zfd/1" />
+ <device match="zfd/%z/slave/2" name="zfd/2" />
<!-- Audio devices to create under /dev -->
<device match="brand/lx/dsp" name="dsp" />
diff --git a/usr/src/man/man1/zlogin.1 b/usr/src/man/man1/zlogin.1
index 129718e11e..4c9c8734a5 100644
--- a/usr/src/man/man1/zlogin.1
+++ b/usr/src/man/man1/zlogin.1
@@ -14,13 +14,13 @@
.\" Copyright 2013 DEY Storage Systems, Inc.
.\" Copyright (c) 2014 Gary Mills
.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved
-.TH ZLOGIN 1 "Jan 22, 2014"
+.TH ZLOGIN 1 "Dec 22, 2014"
.SH NAME
zlogin \- enter a zone
.SH SYNOPSIS
.LP
.nf
-\fBzlogin\fR [\fB-CEQ\fR] [\fB-e\fR \fIc\fR] [\fB-l\fR \fIusername\fR] \fIzonename\fR
+\fBzlogin\fR [\fB-CEIQ\fR] [\fB-e\fR \fIc\fR] [\fB-l\fR \fIusername\fR] \fIzonename\fR
.fi
.LP
@@ -37,7 +37,7 @@ system zone. Only a superuser operating in the global system zone can use this
utility.
.sp
.LP
-\fBzlogin\fR operates in one of three modes:
+\fBzlogin\fR operates in one of four modes:
.sp
.ne 2
.na
@@ -81,6 +81,16 @@ available once the zone is in the installed state. Connections to the console
are persistent across reboot of the zone.
.RE
+.sp
+.ne 2
+.na
+\fBStandalone-process Interactive Mode\fR
+.ad
+.RS 24n
+If the \fB-I\fR option is specified the user is connected to the zone's stdin,
+stdout and stderr \fBzfd(7D)\fR devices.
+.RE
+
.SH OPTIONS
.sp
.LP
@@ -125,6 +135,15 @@ Forces interactive mode when a utility argument is specified.
.RE
.sp
+.ne 2
+.na
+\fB\fB-I\fR\fR
+.ad
+.RS 15n
+Connects to the zone's \fBzfd(7D)\fR devices.
+.RE
+
+.sp
.sp
.ne 2
.na
diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile
index 5140f21c65..ddcb5ed45b 100644
--- a/usr/src/man/man7d/Makefile
+++ b/usr/src/man/man7d/Makefile
@@ -13,6 +13,7 @@
# Copyright 2011, Richard Lowe
# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
# Copyright 2014 Garrett D'Amore <garrett@damore.org>
+# Copyright 2014 Joyent, Inc. All rights reserved.
#
include $(SRC)/Makefile.master
@@ -151,7 +152,8 @@ _MANFILES= aac.7d \
xge.7d \
yge.7d \
zcons.7d \
- zero.7d
+ zero.7d \
+ zfd.7d
sparc_MANFILES= audiocs.7d \
bbc_beep.7d \
diff --git a/usr/src/man/man7d/zfd.7d b/usr/src/man/man7d/zfd.7d
new file mode 100644
index 0000000000..f06777fee8
--- /dev/null
+++ b/usr/src/man/man7d/zfd.7d
@@ -0,0 +1,39 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2014, Joyent, Inc. All rights reserved.
+.\"
+.Dd "Dec 22, 2014"
+.Dt ZFD 7D
+.Os
+.Sh NAME
+.Nm zfd
+.Nd Zone file descriptor driver
+.Sh DESCRIPTION
+The
+.Nm zfd
+character driver exports up to three file descriptors into the zone. These can
+be used by a standalone process within the zone as
+.Vt stdin ,
+.Vt stdout ,
+and
+.Vt stderr .
+The
+.Nm zfd
+driver behaves in a similar manner as the
+.Xr zcons 7D
+device.
+Inside a zone, the slave side file descriptors appear as
+.Pa /dev/zfd/[0-2] .
+.Sh SEE ALSO
+.Xr zlogin 1 ,
+.Xr zoneadmd 1M ,
+.Xr zcons 7D
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 517a7a6363..03c50355d7 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1083,6 +1083,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o
ZCONS_OBJS += zcons.o
+ZFD_OBJS += zfd.o
+
NV_SATA_OBJS += nv_sata.o
SI3124_OBJS += si3124.o
diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf
index 42248e93d6..08affec609 100644
--- a/usr/src/uts/common/io/pseudo.conf
+++ b/usr/src/uts/common/io/pseudo.conf
@@ -22,8 +22,7 @@
#
# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2014 Joyent, Inc. All rights reserved.
#
# This file is private to the pseudonex driver. It should not be edited.
#
@@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0;
# /pseudo; it has as its children the zone console pseudo nodes.
#
name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons";
+
+#
+# zfdnex is an alias for pseudo; this node is instantiated as a child of
+# /pseudo; it has as its children the zone fd pseudo nodes.
+#
+name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd";
diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c
new file mode 100644
index 0000000000..f70115653f
--- /dev/null
+++ b/usr/src/uts/common/io/zfd.c
@@ -0,0 +1,815 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Zone File Descriptor Driver.
+ *
+ * This driver is derived from the zcons driver which is in turn derived from
+ * the pts/ptm drivers. The purpose is to expose file descriptors within the
+ * zone which are connected to zoneadmd and used for logging or an interactive
+ * connection to a process within the zone.
+ *
+ * Its implementation is straightforward. Each instance of the driver
+ * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd
+ * uses these devices unidirectionally to provide stdin, stdout and stderr to
+ * the process within the zone.
+ *
+ * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd,
+ * using the devctl framework; thus the driver does not need to maintain any
+ * sort of "admin" node.
+ *
+ * The driver shuttles I/O from master side to slave side and back. In a break
+ * from the pts/ptm semantics, if one side is not open, I/O directed towards
+ * it will simply be discarded. This is so that if zoneadmd is not holding the
+ * master side fd open (i.e. it has died somehow), processes in the zone do not
+ * experience any errors and I/O to the fd does not cause the process to hang.
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/debug.h>
+#include <sys/devops.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kstr.h>
+#include <sys/modctl.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/stream.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/zfd.h>
+#include <sys/vnode.h>
+#include <sys/fs/snode.h>
+#include <sys/zone.h>
+
+static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int zfd_attach(dev_info_t *, ddi_attach_cmd_t);
+static int zfd_detach(dev_info_t *, ddi_detach_cmd_t);
+
+static int zfd_open(queue_t *, dev_t *, int, int, cred_t *);
+static int zfd_close(queue_t *, int, cred_t *);
+static void zfd_wput(queue_t *, mblk_t *);
+static void zfd_rsrv(queue_t *);
+static void zfd_wsrv(queue_t *);
+
+/*
+ * The instance number is encoded in the dev_t in the minor number; the lowest
+ * bit of the minor number is used to track the master vs. slave side of the
+ * fd. The rest of the bits in the minor number are the instance.
+ */
+#define ZFD_MASTER_MINOR 0
+#define ZFD_SLAVE_MINOR 1
+
+#define ZFD_INSTANCE(x) (getminor((x)) >> 1)
+#define ZFD_NODE(x) (getminor((x)) & 0x01)
+
+/*
+ * This macro converts a zfd_state_t pointer to the associated slave minor
+ * node's dev_t.
+ */
+#define ZFD_STATE_TO_SLAVEDEV(x) \
+ (makedevice(ddi_driver_major((x)->zfd_devinfo), \
+ (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR)))
+
+int zfd_debug = 0;
+#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a)
+#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b)
+
+/*
+ * ZFD Pseudo Terminal Module: stream data structure definitions,
+ * based on zcons.
+ */
+static struct module_info zfd_info = {
+ 0x20FD, /* ZOFD - 8445 */
+ "zfd",
+ 0, /* min packet size */
+ INFPSZ, /* max packet size - infinity */
+ 2048, /* high water */
+ 128 /* low water */
+};
+
+static struct qinit zfd_rinit = {
+ NULL,
+ (int (*)()) zfd_rsrv,
+ zfd_open,
+ zfd_close,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct qinit zfd_winit = {
+ (int (*)()) zfd_wput,
+ (int (*)()) zfd_wsrv,
+ NULL,
+ NULL,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct streamtab zfd_tab_info = {
+ &zfd_rinit,
+ &zfd_winit,
+ NULL,
+ NULL
+};
+
+#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL)
+
+/*
+ * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops)
+ */
+DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \
+ nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \
+ ddi_quiesce_not_needed);
+
+/*
+ * Module linkage information for the kernel.
+ */
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* Type of module (this is a pseudo driver) */
+ "Zone FD driver", /* description of module */
+ &zfd_ops /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modldrv,
+ NULL
+};
+
+/*
+ * Per-instance soft state; one is allocated for each attached zfd instance
+ * and is shared by that instance's master and slave minor nodes.
+ */
+typedef struct zfd_state {
+ dev_info_t *zfd_devinfo; /* devinfo node, recorded by zfd_attach() */
+ queue_t *zfd_master_rdq; /* master read queue; NULL when not open */
+ queue_t *zfd_slave_rdq; /* slave read queue; NULL when not open */
+ vnode_t *zfd_slave_vnode; /* NOTE(review): unused in this file? */
+ int zfd_state; /* bitmask of ZFD_STATE_* flags below */
+ int zfd_tty; /* set to 1 by the ZFD_MAKETTY ioctl */
+} zfd_state_t;
+
+/* zfd_state flags: which sides of the instance are currently open. */
+#define ZFD_STATE_MOPEN 0x01
+#define ZFD_STATE_SOPEN 0x02
+
+static void *zfd_soft_state;
+
+/*
+ * List of STREAMS modules that is pushed onto a slave instance after the
+ * ZFD_MAKETTY ioctl has been received.
+ */
+static char *zfd_mods[] = {
+ "ptem",
+ "ldterm",
+ "ttcompat",
+ NULL
+};
+
+/*
+ * _init(9E): initialize the soft state anchor and install the module.
+ *
+ * Returns 0 on success, or the error from ddi_soft_state_init() or
+ * mod_install().  If mod_install() fails, the soft state anchor is torn
+ * down again before returning.
+ */
+int
+_init(void)
+{
+	int err;
+
+	err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), 0);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * ddi_soft_state_fini() takes the address of the anchor (as _fini()
+	 * passes it), not the anchor's value.
+	 */
+	if ((err = mod_install(&modlinkage)) != 0)
+		ddi_soft_state_fini(&zfd_soft_state);
+
+	return (err);
+}
+
+
+/*
+ * _fini(9E): remove the module and, only once that has succeeded, release
+ * the soft state anchor.  Returns the result of mod_remove().
+ */
+int
+_fini(void)
+{
+	int rv = mod_remove(&modlinkage);
+
+	if (rv == 0)
+		ddi_soft_state_fini(&zfd_soft_state);
+
+	return (rv);
+}
+
+/*
+ * _info(9E): report module information via mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	int rv;
+
+	rv = mod_info(&modlinkage, modinfop);
+	return (rv);
+}
+
+/*
+ * attach(9E) entrypoint.  Only DDI_ATTACH is supported.  Allocates the
+ * per-instance soft state and creates the slave and master minor nodes,
+ * whose minor numbers are (instance << 1) with the low bit selecting the
+ * side.  On any failure everything created so far is undone.
+ */
+static int
+zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	char master_name[ZFD_NAME_LEN];
+	char slave_name[ZFD_NAME_LEN];
+	zfd_state_t *zfds;
+	int inst;
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	inst = ddi_get_instance(dip);
+	if (ddi_soft_state_zalloc(zfd_soft_state, inst) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	(void) snprintf(master_name, sizeof (master_name), "%s%d",
+	    ZFD_MASTER_NAME, inst);
+	(void) snprintf(slave_name, sizeof (slave_name), "%s%d",
+	    ZFD_SLAVE_NAME, inst);
+
+	/* The slave node is created first, then the master node. */
+	if (ddi_create_minor_node(dip, slave_name, S_IFCHR,
+	    inst << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)
+		goto fail;
+	if (ddi_create_minor_node(dip, master_name, S_IFCHR,
+	    inst << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)
+		goto fail;
+
+	zfds = ddi_get_soft_state(zfd_soft_state, inst);
+	VERIFY(zfds != NULL);
+	zfds->zfd_devinfo = dip;
+	zfds->zfd_tty = 0;
+	return (DDI_SUCCESS);
+
+fail:
+	ddi_remove_minor_node(dip, NULL);
+	ddi_soft_state_free(zfd_soft_state, inst);
+	return (DDI_FAILURE);
+}
+
+/*
+ * detach(9E) entrypoint.  Only DDI_DETACH is supported, and the detach is
+ * refused while either side of the instance is still open.  Otherwise the
+ * minor nodes and the soft state are released.
+ */
+static int
+zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int inst;
+	int open_mask = ZFD_STATE_MOPEN | ZFD_STATE_SOPEN;
+	zfd_state_t *zfds;
+
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	inst = ddi_get_instance(dip);
+	zfds = ddi_get_soft_state(zfd_soft_state, inst);
+	if (zfds == NULL)
+		return (DDI_FAILURE);
+
+	if ((zfds->zfd_state & open_mask) != 0) {
+		DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip);
+		return (DDI_FAILURE);
+	}
+
+	ddi_remove_minor_node(dip, NULL);
+	ddi_soft_state_free(zfd_soft_state, inst);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * zfd_getinfo()
+ *	getinfo(9e) entrypoint.  The instance is decoded from the dev_t
+ *	passed in arg.  DDI_INFO_DEVT2INSTANCE always succeeds;
+ *	DDI_INFO_DEVT2DEVINFO succeeds only if soft state exists for that
+ *	instance.  Any other command fails.
+ */
+/*ARGSUSED*/
+static int
+zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	int inst = ZFD_INSTANCE((dev_t)arg);
+	zfd_state_t *zfds;
+
+	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
+		*result = (void *)(uintptr_t)inst;
+		return (DDI_SUCCESS);
+	}
+
+	if (infocmd != DDI_INFO_DEVT2DEVINFO)
+		return (DDI_FAILURE);
+
+	zfds = ddi_get_soft_state(zfd_soft_state, inst);
+	if (zfds == NULL)
+		return (DDI_FAILURE);
+
+	*result = zfds->zfd_devinfo;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Map a queue to its counterpart on the other side of the instance: a read
+ * queue maps to the peer's read queue and a write queue to the peer's write
+ * queue (e.g. the slave's write queue yields the master's write queue).
+ * Returns NULL when the peer side is not open or qp belongs to neither
+ * side.
+ */
+static queue_t *
+zfd_switch(queue_t *qp)
+{
+	zfd_state_t *zfds = qp->q_ptr;
+	queue_t *mate = OTHERQ(qp);
+
+	ASSERT(zfds != NULL);
+
+	/* qp is the master's read queue */
+	if (qp == zfds->zfd_master_rdq)
+		return (zfds->zfd_slave_rdq);
+
+	/* qp is the master's write queue and the slave is open */
+	if (mate == zfds->zfd_master_rdq && zfds->zfd_slave_rdq != NULL)
+		return (OTHERQ(zfds->zfd_slave_rdq));
+
+	/* qp is the slave's read queue */
+	if (qp == zfds->zfd_slave_rdq)
+		return (zfds->zfd_master_rdq);
+
+	/* qp is the slave's write queue and the master is open */
+	if (mate == zfds->zfd_slave_rdq && zfds->zfd_master_rdq != NULL)
+		return (OTHERQ(zfds->zfd_master_rdq));
+
+	return (NULL);
+}
+
+/*
+ * Debugging aid: name the side ("master" or "slave") that the given queue,
+ * read or write, belongs to.
+ */
+static const char *
+zfd_side(queue_t *qp)
+{
+	zfd_state_t *zfds = qp->q_ptr;
+	queue_t *mate = OTHERQ(qp);
+
+	ASSERT(zfds != NULL);
+
+	if (zfds->zfd_master_rdq == qp || zfds->zfd_master_rdq == mate)
+		return ("master");
+
+	/* Anything that is not the master must be the slave. */
+	ASSERT(zfds->zfd_slave_rdq == qp || zfds->zfd_slave_rdq == mate);
+	return ("slave");
+}
+
+/*
+ * Open the master side of an instance.
+ *
+ * Returns EBUSY if the master side is already open, ENOMEM if the
+ * M_SETOPTS message cannot be allocated, and 0 on success. On success the
+ * instance is marked ZFD_STATE_MOPEN, the soft state is stored in q_ptr of
+ * both queues, the queues are enabled and an M_SETOPTS message is sent
+ * upstream.
+ */
+/*ARGSUSED*/
+static int
+zfd_master_open(zfd_state_t *zfds,
+ queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ mblk_t *mop;
+ struct stroptions *sop;
+
+ /*
+ * Enforce exclusivity on the master side; the only consumer should
+ * be the zoneadmd for the zone.
+ */
+ if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0)
+ return (EBUSY);
+
+ /* Allocate before changing any state so failure needs no unwinding. */
+ if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+ DBG("zfd_master_open(): mop allocation failed\n");
+ return (ENOMEM);
+ }
+
+ zfds->zfd_state |= ZFD_STATE_MOPEN;
+
+ /*
+ * q_ptr stores driver private data; stash the soft state data on both
+ * read and write sides of the queue.
+ */
+ WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+ qprocson(rqp);
+
+ /*
+ * Following qprocson(), the master side is fully plumbed into the
+ * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq
+ * will allow the slave to send messages to us (the master).
+ * This cannot occur before qprocson() because the master is not
+ * ready to process them until that point.
+ */
+ zfds->zfd_master_rdq = rqp;
+
+ /*
+ * set up hi/lo water marks on stream head read queue and add
+ * controlling tty as needed.
+ */
+ mop->b_datap->db_type = M_SETOPTS;
+ mop->b_wptr += sizeof (struct stroptions);
+ sop = (struct stroptions *)(void *)mop->b_rptr;
+ /* SO_ISTTY is requested unless the opener passed FNOCTTY. */
+ if (oflag & FNOCTTY)
+ sop->so_flags = SO_HIWAT | SO_LOWAT;
+ else
+ sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+ sop->so_hiwat = 512;
+ sop->so_lowat = 256;
+ putnext(rqp, mop);
+
+ return (0);
+}
+
+/*
+ * Open the slave side of an instance.
+ *
+ * The slave may be opened any number of times; opens after the first
+ * return 0 immediately.  On the first open, if ZFD_MAKETTY has been issued
+ * (zfd_tty == 1), a sad(7D) autopush configuration for the slave minor is
+ * installed so that the modules in zfd_mods are pushed.  The instance is
+ * then marked ZFD_STATE_SOPEN, the queues are enabled and an M_SETOPTS
+ * message is sent upstream.
+ *
+ * Returns 0 on success, ENOMEM if the M_SETOPTS message cannot be
+ * allocated, or EIO if the autopush configuration cannot be installed.
+ * Neither failure leaves any state behind.
+ */
+/*ARGSUSED*/
+static int
+zfd_slave_open(zfd_state_t *zfds,
+    queue_t *rqp,	/* pointer to the read side queue */
+    dev_t *devp,	/* pointer to stream tail's dev */
+    int oflag,		/* the user open(2) supplied flags */
+    int sflag,		/* open state flag */
+    cred_t *credp)	/* credentials */
+{
+	mblk_t *mop;
+	struct stroptions *sop;
+
+	/*
+	 * The slave side can be opened as many times as needed.
+	 */
+	if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+		ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds));
+		return (0);
+	}
+
+	/*
+	 * Allocate the M_SETOPTS message before touching the sad(7D)
+	 * configuration.  If the allocation failed after SET_AUTOPUSH had
+	 * succeeded, the autopush entry would be left in place with no open
+	 * (and so no close) to clear it; zfd_close() clears the entry
+	 * precisely because a leftover one makes the next open fail.
+	 */
+	if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+		DBG("zfd_slave_open(): mop allocation failed\n");
+		return (ENOMEM);
+	}
+
+	if (zfds->zfd_tty == 1) {
+		major_t major;
+		minor_t minor;
+		minor_t lastminor;
+		uint_t anchorindex;
+
+		/*
+		 * Set up sad(7D) so that the necessary STREAMS modules will
+		 * be in place.  A wrinkle is that 'ptem' must be anchored
+		 * in place (see streamio(7i)) because we always want the
+		 * fd to have terminal semantics.
+		 */
+		minor =
+		    ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR;
+		major = ddi_driver_major(zfds->zfd_devinfo);
+		lastminor = 0;
+		anchorindex = 1;
+		if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor,
+		    &anchorindex, zfd_mods) != 0) {
+			DBG("zfd_slave_open(): kstr_autopush() failed\n");
+			freemsg(mop);
+			return (EIO);
+		}
+	}
+
+	zfds->zfd_state |= ZFD_STATE_SOPEN;
+
+	/*
+	 * q_ptr stores driver private data; stash the soft state data on both
+	 * read and write sides of the queue.
+	 */
+	WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+
+	qprocson(rqp);
+
+	/*
+	 * Must follow qprocson(), since we aren't ready to process until then.
+	 */
+	zfds->zfd_slave_rdq = rqp;
+
+	/*
+	 * set up hi/lo water marks on stream head read queue and add
+	 * controlling tty as needed.
+	 */
+	mop->b_datap->db_type = M_SETOPTS;
+	mop->b_wptr += sizeof (struct stroptions);
+	sop = (struct stroptions *)(void *)mop->b_rptr;
+	sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+	sop->so_hiwat = 512;
+	sop->so_lowat = 256;
+	putnext(rqp, mop);
+
+	return (0);
+}
+
+/*
+ * open(9e) entrypoint.  Rejects any non-zero sflag with EINVAL, looks up
+ * the soft state for the instance encoded in the dev_t (ENXIO if there is
+ * none), and hands off to the master or slave open routine according to the
+ * low bit of the minor number.
+ */
+static int
+zfd_open(queue_t *rqp,	/* pointer to the read side queue */
+    dev_t *devp,	/* pointer to stream tail's dev */
+    int oflag,		/* the user open(2) supplied flags */
+    int sflag,		/* open state flag */
+    cred_t *credp)	/* credentials */
+{
+	zfd_state_t *zfds;
+	int node;
+
+	if (sflag != 0)
+		return (EINVAL);
+
+	zfds = ddi_get_soft_state(zfd_soft_state, ZFD_INSTANCE(*devp));
+	if (zfds == NULL)
+		return (ENXIO);
+
+	node = ZFD_NODE(*devp);
+	if (node == ZFD_MASTER_MINOR)
+		return (zfd_master_open(zfds, rqp, devp, oflag, sflag, credp));
+	if (node == ZFD_SLAVE_MINOR)
+		return (zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp));
+
+	return (ENXIO);
+}
+
+/*
+ * close(9e) entrypoint.
+ *
+ * The side being closed is determined by comparing rqp against the read
+ * queues recorded in the soft state. In both cases the side is unpublished
+ * (its read queue pointer and open flag are cleared) before qprocsoff(),
+ * and the other side's write queue, if open, is enabled so it can flush.
+ * Closing the slave additionally drains the slave's write queue and, for a
+ * tty instance, clears the sad(7D) autopush configuration. If rqp matches
+ * neither side nothing is done. Always returns 0.
+ */
+/*ARGSUSED1*/
+static int
+zfd_close(queue_t *rqp, int flag, cred_t *credp)
+{
+ queue_t *wqp;
+ mblk_t *bp;
+ zfd_state_t *zfds;
+ major_t major;
+ minor_t minor;
+
+ zfds = (zfd_state_t *)rqp->q_ptr;
+
+ if (rqp == zfds->zfd_master_rdq) {
+ DBG("Closing master side");
+
+ zfds->zfd_master_rdq = NULL;
+ zfds->zfd_state &= ~ZFD_STATE_MOPEN;
+
+ /*
+ * qenable slave side write queue so that it can flush
+ * its messages as master's read queue is going away
+ */
+ if (zfds->zfd_slave_rdq != NULL) {
+ qenable(WR(zfds->zfd_slave_rdq));
+ }
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ } else if (rqp == zfds->zfd_slave_rdq) {
+
+ DBG("Closing slave side");
+ zfds->zfd_state &= ~ZFD_STATE_SOPEN;
+ zfds->zfd_slave_rdq = NULL;
+
+ /*
+ * Drain whatever is still queued on the slave's write side:
+ * forward it to the master if the master is open, otherwise
+ * nak ioctls and free everything else.
+ */
+ wqp = WR(rqp);
+ while ((bp = getq(wqp)) != NULL) {
+ if (zfds->zfd_master_rdq != NULL)
+ putnext(zfds->zfd_master_rdq, bp);
+ else if (bp->b_datap->db_type == M_IOCTL)
+ miocnak(wqp, bp, 0, 0);
+ else
+ freemsg(bp);
+ }
+
+ /*
+ * Qenable master side write queue so that it can flush its
+ * messages as slaves's read queue is going away.
+ */
+ if (zfds->zfd_master_rdq != NULL)
+ qenable(WR(zfds->zfd_master_rdq));
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ if (zfds->zfd_tty == 1) {
+ /*
+ * Clear the sad configuration so that reopening
+ * doesn't fail to set up sad configuration.
+ */
+ major = ddi_driver_major(zfds->zfd_devinfo);
+ minor = ddi_get_instance(zfds->zfd_devinfo) << 1 |
+ ZFD_SLAVE_MINOR;
+ (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor,
+ NULL, NULL, NULL);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Process an M_FLUSH message (mp) that arrived on write queue qp.
+ *
+ * If FLUSHW is set, the data on qp is flushed and FLUSHW is cleared from
+ * the message. What happens next depends on FLUSHR:
+ * - FLUSHW only: the message is rewritten as FLUSHR and passed to the
+ * other side if that side is open, which completes processing; if it is
+ * not open, the (now FLUSHR) message falls through to the qreply() below.
+ * - FLUSHR also set: a copy is passed to the other side, when that side is
+ * open and copyb() succeeds, and the original continues below.
+ * Finally, a message with FLUSHR set is turned around with qreply(); any
+ * other message is freed. The message is always consumed.
+ */
+static void
+handle_mflush(queue_t *qp, mblk_t *mp)
+{
+ mblk_t *nmp;
+ DBG1("M_FLUSH on %s side", zfd_side(qp));
+
+ if (*mp->b_rptr & FLUSHW) {
+ DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp));
+ flushq(qp, FLUSHDATA);
+ *mp->b_rptr &= ~FLUSHW;
+ if ((*mp->b_rptr & FLUSHR) == 0) {
+ /*
+ * FLUSHW only. Change to FLUSHR and putnext other side,
+ * then we are done.
+ */
+ *mp->b_rptr |= FLUSHR;
+ if (zfd_switch(RD(qp)) != NULL) {
+ putnext(zfd_switch(RD(qp)), mp);
+ return;
+ }
+ } else if ((zfd_switch(RD(qp)) != NULL) &&
+ (nmp = copyb(mp)) != NULL) {
+ /*
+ * It is a FLUSHRW; we copy the mblk and send
+ * it to the other side, since we still need to use
+ * the mblk in FLUSHR processing, below.
+ */
+ putnext(zfd_switch(RD(qp)), nmp);
+ }
+ }
+
+ if (*mp->b_rptr & FLUSHR) {
+ DBG("qreply(qp) turning FLUSHR around\n");
+ qreply(qp, mp);
+ return;
+ }
+ freemsg(mp);
+}
+
+/*
+ * wput(9E) is symmetric for master and slave sides, so this handles both
+ * without splitting the codepath.  (The only exception to this is the
+ * processing of zfd ioctls, which is restricted to the master side.)
+ *
+ * A ZFD_MAKETTY ioctl on the master's write queue is consumed here: it is
+ * acknowledged after setting zfd_tty when issued with global zone
+ * credentials, and negatively acknowledged with EINVAL otherwise.
+ *
+ * zfd_wput() looks at the other side; if there is no process holding that
+ * side open, it frees the message (ioctls are nak'd and M_FLUSH is still
+ * processed).  This prevents processes from hanging if no one is holding
+ * open the fd.  Otherwise, it putnext's high priority messages, putnext's
+ * normal messages if possible, and otherwise enqueues the messages; in the
+ * case that something is enqueued, wsrv(9E) will take care of eventually
+ * shuttling I/O to the other side.
+ */
+static void
+zfd_wput(queue_t *qp, mblk_t *mp)
+{
+	unsigned char type = mp->b_datap->db_type;
+	zfd_state_t *zfds;
+	struct iocblk *iocbp;
+
+	ASSERT(qp->q_ptr);
+
+	DBG1("entering zfd_wput, %s side", zfd_side(qp));
+
+	/*
+	 * Process zfd ioctl messages if qp is the master side's write queue.
+	 */
+	zfds = (zfd_state_t *)qp->q_ptr;
+	if (zfds->zfd_master_rdq != NULL && qp == WR(zfds->zfd_master_rdq) &&
+	    type == M_IOCTL) {
+		iocbp = (struct iocblk *)(void *)mp->b_rptr;
+		switch (iocbp->ioc_cmd) {
+		case ZFD_MAKETTY:
+			/*
+			 * The process that passed the ioctl must be running in
+			 * the global zone.  The rejection has to be a negative
+			 * acknowledgement: miocack()'s last argument is the
+			 * ioctl return value, not an errno, so using it here
+			 * would report success to the caller.
+			 */
+			if (crgetzoneid(iocbp->ioc_cr) != GLOBAL_ZONEID) {
+				miocnak(qp, mp, 0, EINVAL);
+				return;
+			}
+			zfds->zfd_tty = 1;
+			miocack(qp, mp, 0, 0);
+			return;
+		default:
+			break;
+		}
+	}
+
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG1("wput to %s side (no one listening)", zfd_side(qp));
+		switch (type) {
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		case M_IOCTL:
+			miocnak(qp, mp, 0, 0);
+			break;
+		default:
+			freemsg(mp);
+			break;
+		}
+		return;
+	}
+
+	if (type >= QPCTL) {
+		DBG1("(hipri) wput, %s side", zfd_side(qp));
+		switch (type) {
+		case M_READ:		/* supposedly from ldterm? */
+			DBG("zfd_wput: tossing M_READ\n");
+			freemsg(mp);
+			break;
+		case M_FLUSH:
+			handle_mflush(qp, mp);
+			break;
+		default:
+			/*
+			 * Put this to the other side.
+			 */
+			ASSERT(zfd_switch(RD(qp)) != NULL);
+			putnext(zfd_switch(RD(qp)), mp);
+			break;
+		}
+		DBG1("done (hipri) wput, %s side", zfd_side(qp));
+		return;
+	}
+
+	/*
+	 * Only putnext if there isn't already something in the queue.
+	 * otherwise things would wind up out of order.
+	 */
+	if (qp->q_first == NULL &&
+	    bcanputnext(RD(zfd_switch(qp)), mp->b_band)) {
+		DBG("wput: putting message to other side\n");
+		putnext(RD(zfd_switch(qp)), mp);
+	} else {
+		DBG("wput: putting msg onto queue\n");
+		(void) putq(qp, mp);
+	}
+	DBG1("done wput, %s side", zfd_side(qp));
+}
+
+/*
+ * rsrv(9E), shared by the master and slave sides.
+ *
+ * Enables the write queue of the partner side, if that side is open, so
+ * that messages queued there can be sent on to this queue's read side.
+ */
+static void
+zfd_rsrv(queue_t *qp)
+{
+	zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+	queue_t *partner;
+
+	/*
+	 * Either side's read queue pointer may be NULL, so the partner has
+	 * to be looked up and checked rather than assumed.
+	 */
+	ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq);
+
+	partner = zfd_switch(qp);
+	if (partner == NULL) {
+		DBG("zfd_rsrv: other side isn't listening\n");
+		return;
+	}
+
+	qenable(WR(partner));
+}
+
+/*
+ * wsrv(9E), shared by the master and slave sides.
+ *
+ * With no partner, everything queued here is discarded (ioctls are nak'd).
+ * Otherwise queued messages are passed to the partner's read side with
+ * putnext() for as long as it will accept them; the first message that
+ * cannot be sent is put back and the remainder stay queued.
+ */
+static void
+zfd_wsrv(queue_t *qp)
+{
+	mblk_t *msg;
+
+	DBG1("zfd_wsrv master (%s) side", zfd_side(qp));
+
+	/*
+	 * Partner has no read queue, so take the data, and throw it away.
+	 */
+	if (zfd_switch(RD(qp)) == NULL) {
+		DBG("zfd_wsrv: other side isn't listening");
+		for (msg = getq(qp); msg != NULL; msg = getq(qp)) {
+			if (msg->b_datap->db_type == M_IOCTL)
+				miocnak(qp, msg, 0, 0);
+			else
+				freemsg(msg);
+		}
+		flushq(qp, FLUSHALL);
+		return;
+	}
+
+	for (msg = getq(qp); msg != NULL; msg = getq(qp)) {
+		/*
+		 * zfd_wput never queues high priority messages, so none
+		 * should turn up here.
+		 */
+		ASSERT(msg->b_datap->db_type < QPCTL);
+
+		if (!bcanputnext(RD(zfd_switch(qp)), msg->b_band)) {
+			DBG("wsrv: putting msg back on queue\n");
+			(void) putbq(qp, msg);
+			break;
+		}
+
+		DBG("wsrv: send message to other side\n");
+		putnext(RD(zfd_switch(qp)), msg);
+	}
+}
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index b52be94c90..6f28942704 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -659,6 +659,7 @@ CHKHDRS= \
watchpoint.h \
winlockio.h \
zcons.h \
+ zfd.h \
zone.h \
xti_inet.h \
xti_osi.h \
diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h
new file mode 100644
index 0000000000..c676f143ac
--- /dev/null
+++ b/usr/src/uts/common/sys/zfd.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_ZFD_H
+#define _SYS_ZFD_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Minor node name of the global zone side (often called the "master" side)
+ * of the zfd dev.
+ */
+#define ZFD_MASTER_NAME "master"
+
+/*
+ * Minor node name of the non-global zone side (often called the "slave"
+ * side) of the zfd dev.
+ */
+#define ZFD_SLAVE_NAME "slave"
+
+#define ZFD_NAME_LEN 16
+
+/*
+ * ZFD_IOC forms the base for all zfd ioctls.
+ */
+#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8))
+
+/*
+ * This ioctl tells the slave side it should push the TTY stream modules
+ * so that the fd looks like a tty.
+ */
+#define ZFD_MAKETTY (ZFD_IOC | 0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFD_H */
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index 49ff0b8a2e..830580e390 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -364,6 +364,7 @@ DRV_KMODS += wpi
DRV_KMODS += xge
DRV_KMODS += yge
DRV_KMODS += zcons
+DRV_KMODS += zfd
DRV_KMODS += zyd
DRV_KMODS += simnet
DRV_KMODS += stmf
diff --git a/usr/src/uts/intel/zfd/Makefile b/usr/src/uts/intel/zfd/Makefile
new file mode 100644
index 0000000000..c270466d08
--- /dev/null
+++ b/usr/src/uts/intel/zfd/Makefile
@@ -0,0 +1,48 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Joyent, Inc. All rights reserved.
+#
+# uts/intel/zfd/Makefile
+
+UTSBASE = ../..
+
+MODULE = zfd
+OBJECTS = $(ZFD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(ZFD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
+
+include $(UTSBASE)/intel/Makefile.intel
+
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc
index cd7206a353..a5b719ae80 100644
--- a/usr/src/uts/sparc/Makefile.sparc
+++ b/usr/src/uts/sparc/Makefile.sparc
@@ -217,7 +217,7 @@ DRV_KMODS += log logindmux kssl mm nca physmem pm poll pool
DRV_KMODS += pseudo ptc ptm pts ptsl ramdisk random rsm rts sad
DRV_KMODS += simnet softmac sppp sppptun sy sysevent sysmsg
DRV_KMODS += spdsock
-DRV_KMODS += tcp tcp6 tl tnf ttymux udp udp6 wc winlock zcons
+DRV_KMODS += tcp tcp6 tl tnf ttymux udp udp6 wc winlock zcons zfd
DRV_KMODS += ippctl
DRV_KMODS += dld
DRV_KMODS += ipd
diff --git a/usr/src/uts/sparc/zfd/Makefile b/usr/src/uts/sparc/zfd/Makefile
new file mode 100644
index 0000000000..ebdba686b4
--- /dev/null
+++ b/usr/src/uts/sparc/zfd/Makefile
@@ -0,0 +1,50 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Joyent, Inc. All rights reserved.
+#
+# uts/sparc/zfd/Makefile
+
+UTSBASE = ../..
+
+MODULE = zfd
+OBJECTS = $(ZFD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(ZFD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
+
+include $(UTSBASE)/sparc/Makefile.sparc
+
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+CFLAGS += $(CCVERBOSE)
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/sparc/Makefile.targ