summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/Makefile4
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile1
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.c10
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.h2
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c8
-rw-r--r--usr/src/cmd/vndadm/Makefile65
-rw-r--r--usr/src/cmd/vndadm/test/Makefile19
-rw-r--r--usr/src/cmd/vndadm/test/Makefile.com43
-rw-r--r--usr/src/cmd/vndadm/test/Makefile.subdirs29
-rw-r--r--usr/src/cmd/vndadm/test/Makefile.targ59
-rw-r--r--usr/src/cmd/vndadm/test/scripts/Makefile28
-rwxr-xr-xusr/src/cmd/vndadm/test/scripts/vndtest.ksh300
-rw-r--r--usr/src/cmd/vndadm/test/tst/Makefile18
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/Makefile34
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/cmd.common.ksh33
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh30
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh.out2
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/create.sdev.ksh25
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/create.setbuf.ksh34
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/ecreate.destroy.ksh25
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadprop.ksh24
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadvalue.ksh24
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbuftoobig.ksh24
-rw-r--r--usr/src/cmd/vndadm/test/tst/cmd/ecreate.setrdonlyprop.ksh24
-rw-r--r--usr/src/cmd/vndadm/test/tst/dld/Makefile27
-rw-r--r--usr/src/cmd/vndadm/test/tst/dld/create.reuse.ksh31
-rw-r--r--usr/src/cmd/vndadm/test/tst/dld/dld.common.ksh29
-rw-r--r--usr/src/cmd/vndadm/test/tst/dld/ecreate.ipfirst.ksh27
-rw-r--r--usr/src/cmd/vndadm/test/tst/dld/ecreate.vndfirst.ksh27
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/Makefile49
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.attach.c63
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.attachnolink.c67
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.badlinkname.c119
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.doublelink.c82
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.gioctlattach.c69
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.link.c76
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.linkexists.c90
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.ngioctlfault.c96
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv1.c69
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv2.c69
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv3.c70
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv4.c75
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv5.c77
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.olink.c77
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.olinknopriv.c83
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/create.rmenolink.c69
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.attachrdonly.c63
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.badioctl.c79
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.basicopenctl.c76
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlfault.c78
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlnattach.c100
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.iocsize.ksh54
-rw-r--r--usr/src/cmd/vndadm/test/tst/ioctl/tst.openctlbadflags.c88
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/Makefile44
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.badlink.c39
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.badpropid.c76
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.badpropsize.c63
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.badzone.c43
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.basic.c49
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.enomem.c91
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.frameioeagain.c80
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.open.c56
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.propiter.c79
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/create.proprdonly.c63
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/err.badclose.c33
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/tst.badopen.c49
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/tst.strerror.c30
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/tst.strerror.exe.out37
-rw-r--r--usr/src/cmd/vndadm/test/tst/lib/tst.strsyserror.c50
-rw-r--r--usr/src/cmd/vndadm/vndadm.c872
-rw-r--r--usr/src/cmd/vndstat/Makefile33
-rw-r--r--usr/src/cmd/vndstat/vndstat.c542
-rw-r--r--usr/src/lib/Makefile4
-rw-r--r--usr/src/lib/libdlpi/common/libdlpi.c61
-rw-r--r--usr/src/lib/libdlpi/common/libdlpi.h2
-rw-r--r--usr/src/lib/libdlpi/common/libdlpi_impl.h3
-rw-r--r--usr/src/lib/libdlpi/common/mapfile-vers5
-rw-r--r--usr/src/lib/libdtrace/Makefile.com8
-rw-r--r--usr/src/lib/libdtrace/common/mac.d.in66
-rw-r--r--usr/src/lib/libdtrace/common/mac.sed.in45
-rw-r--r--usr/src/lib/libdtrace/common/vnd.d28
-rw-r--r--usr/src/lib/libvnd/Makefile42
-rw-r--r--usr/src/lib/libvnd/Makefile.com39
-rw-r--r--usr/src/lib/libvnd/amd64/Makefile19
-rw-r--r--usr/src/lib/libvnd/common/libvnd.c550
-rw-r--r--usr/src/lib/libvnd/common/libvnd.h84
-rw-r--r--usr/src/lib/libvnd/common/llib-lvnd19
-rw-r--r--usr/src/lib/libvnd/common/mapfile-vers55
-rw-r--r--usr/src/lib/libvnd/i386/Makefile18
-rw-r--r--usr/src/man/Makefile1
-rw-r--r--usr/src/man/Makefile.man5
-rw-r--r--usr/src/man/man1m/Makefile4
-rw-r--r--usr/src/man/man1m/snoop.1m22
-rw-r--r--usr/src/man/man1m/vndadm.1m652
-rw-r--r--usr/src/man/man1m/vndstat.1m163
-rw-r--r--usr/src/man/man3dlpi/Makefile2
-rw-r--r--usr/src/man/man3dlpi/dlpi_open.3dlpi31
-rw-r--r--usr/src/man/man3lib/Makefile1
-rw-r--r--usr/src/man/man3lib/libvnd.3lib690
-rw-r--r--usr/src/man/man3vnd/Makefile70
-rw-r--r--usr/src/man/man3vnd/vnd_create.3vnd280
-rw-r--r--usr/src/man/man3vnd/vnd_errno.3vnd170
-rw-r--r--usr/src/man/man3vnd/vnd_frameio_read.3vnd705
-rw-r--r--usr/src/man/man3vnd/vnd_pollfd.3vnd155
-rw-r--r--usr/src/man/man3vnd/vnd_prop_get.3vnd243
-rw-r--r--usr/src/man/man3vnd/vnd_prop_iter.3vnd148
-rw-r--r--usr/src/man/man3vnd/vnd_prop_writeable.3vnd101
-rw-r--r--usr/src/man/man3vnd/vnd_walk.3vnd155
-rw-r--r--usr/src/man/man7d/Makefile1
-rw-r--r--usr/src/man/man7d/vnd.7d119
-rw-r--r--usr/src/uts/common/Makefile.files9
-rw-r--r--usr/src/uts/common/Makefile.rules10
-rw-r--r--usr/src/uts/common/dtrace/sdt_subr.c33
-rw-r--r--usr/src/uts/common/fs/dev/sdev_netops.c257
-rw-r--r--usr/src/uts/common/fs/dev/sdev_plugin.c913
-rw-r--r--usr/src/uts/common/fs/dev/sdev_subr.c210
-rw-r--r--usr/src/uts/common/fs/dev/sdev_vfsops.c23
-rw-r--r--usr/src/uts/common/inet/ip/ip_squeue.c2
-rw-r--r--usr/src/uts/common/inet/ipf/ip_fil_solaris.c119
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ipf_stack.h10
-rw-r--r--usr/src/uts/common/inet/squeue.c100
-rw-r--r--usr/src/uts/common/io/dld/dld_proto.c79
-rw-r--r--usr/src/uts/common/io/dld/dld_str.c104
-rw-r--r--usr/src/uts/common/io/dls/dls.c84
-rw-r--r--usr/src/uts/common/io/dls/dls_link.c1
-rw-r--r--usr/src/uts/common/io/dls/dls_mgmt.c41
-rw-r--r--usr/src/uts/common/io/gsqueue/gsqueue.c607
-rw-r--r--usr/src/uts/common/io/vnd/frameio.c464
-rw-r--r--usr/src/uts/common/io/vnd/vnd.c5469
-rw-r--r--usr/src/uts/common/io/vnd/vnd.conf16
-rw-r--r--usr/src/uts/common/sys/Makefile6
-rw-r--r--usr/src/uts/common/sys/dld.h4
-rw-r--r--usr/src/uts/common/sys/dld_impl.h5
-rw-r--r--usr/src/uts/common/sys/dlpi.h11
-rw-r--r--usr/src/uts/common/sys/dls.h5
-rw-r--r--usr/src/uts/common/sys/dls_impl.h2
-rw-r--r--usr/src/uts/common/sys/frameio.h107
-rw-r--r--usr/src/uts/common/sys/fs/sdev_impl.h61
-rw-r--r--usr/src/uts/common/sys/fs/sdev_plugin.h106
-rw-r--r--usr/src/uts/common/sys/gsqueue.h65
-rw-r--r--usr/src/uts/common/sys/neti.h2
-rw-r--r--usr/src/uts/common/sys/netstack.h3
-rw-r--r--usr/src/uts/common/sys/squeue.h14
-rw-r--r--usr/src/uts/common/sys/squeue_impl.h2
-rw-r--r--usr/src/uts/common/sys/vnd.h141
-rw-r--r--usr/src/uts/common/sys/vnd_errno.h72
-rw-r--r--usr/src/uts/intel/Makefile.intel4
-rw-r--r--usr/src/uts/intel/dev/Makefile1
-rw-r--r--usr/src/uts/intel/gsqueue/Makefile49
-rw-r--r--usr/src/uts/intel/ipf/Makefile2
-rw-r--r--usr/src/uts/intel/ipf/ipf.global-objs.debug644
-rw-r--r--usr/src/uts/intel/vnd/Makefile56
-rw-r--r--usr/src/uts/sparc/ipf/ipf.global-objs.debug644
153 files changed, 18865 insertions, 351 deletions
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 5aa255deb0..fcd087bae6 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -21,7 +21,7 @@
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2013 Joyent, Inc. All rights reserved.
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
# Copyright (c) 2012 by Delphix. All rights reserved.
# Copyright (c) 2013 DEY Storage Systems, Inc. All rights reserved.
@@ -431,6 +431,8 @@ COMMON_SUBDIRS= \
valtools \
vgrind \
vi \
+ vndadm \
+ vndstat \
volcheck \
volrmmount \
vrrpadm \
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
index 1d408bccba..e285aa09d3 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/Makefile
@@ -45,6 +45,7 @@ SRCS= $(OBJS:.o=.c)
HDRS= snoop.h snoop_mip.h at.h snoop_ospf.h snoop_ospf6.h
include ../../../Makefile.cmd
+include ../../../Makefile.ctf
CPPFLAGS += -I. -I$(SRC)/common/net/dhcp \
-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.c
index 097dd6ee90..6d586ab9b5 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.c
@@ -121,6 +121,7 @@ main(int argc, char **argv)
char *output_area;
int nbytes;
char *datalink = NULL;
+ char *zonename = NULL;
dlpi_handle_t dh;
names[0] = '\0';
@@ -227,7 +228,7 @@ main(int argc, char **argv)
}
(void) setvbuf(stdout, NULL, _IOLBF, BUFSIZ);
- while ((c = getopt(argc, argv, "at:CPDSi:o:Nn:s:d:I:vVp:f:c:x:U?rqz"))
+ while ((c = getopt(argc, argv, "at:CPDSi:o:Nn:s:d:I:vVp:f:c:x:U?rqz:Z"))
!= EOF) {
switch (c) {
case 'a':
@@ -348,8 +349,11 @@ main(int argc, char **argv)
case 'U':
Uflg = B_TRUE;
break;
-#ifdef DEBUG
case 'z':
+ zonename = optarg;
+ break;
+#ifdef DEBUG
+ case 'Z':
zflg = B_TRUE;
break;
#endif /* DEBUG */
@@ -371,7 +375,7 @@ main(int argc, char **argv)
* requested was chosen, but that's too hard.
*/
if (!icapfile) {
- use_kern_pf = open_datalink(&dh, datalink);
+ use_kern_pf = open_datalink(&dh, datalink, zonename);
} else {
use_kern_pf = B_FALSE;
cap_open_read(icapfile);
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.h b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.h
index e4f182572b..40cefa2c59 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.h
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop.h
@@ -182,7 +182,7 @@ extern void cap_open_read(const char *);
extern void cap_open_write(const char *);
extern void cap_read(int, int, int, void (*)(), int);
extern void cap_close(void);
-extern boolean_t open_datalink(dlpi_handle_t *, const char *);
+extern boolean_t open_datalink(dlpi_handle_t *, const char *, const char *);
extern void init_datalink(dlpi_handle_t, ulong_t, ulong_t, struct timeval *,
struct Pf_ext_packetfilt *);
extern void net_read(dlpi_handle_t, size_t, int, void (*)(), int);
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
index ab6bc292ac..54fbdc844b 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/snoop/snoop_capture.c
@@ -114,7 +114,7 @@ select_datalink(const char *linkname, void *arg)
* about the datalink useful for building the proper packet filters.
*/
boolean_t
-open_datalink(dlpi_handle_t *dhp, const char *linkname)
+open_datalink(dlpi_handle_t *dhp, const char *linkname, const char *zonename)
{
int retval;
int flags = DLPI_PASSIVE | DLPI_RAW;
@@ -122,6 +122,9 @@ open_datalink(dlpi_handle_t *dhp, const char *linkname)
dlpi_info_t dlinfo;
if (linkname == NULL) {
+ if (zonename != NULL)
+ pr_err("a datalink must be specified with a zone name");
+
/*
* Select a datalink to use by default. Prefer datalinks that
* are plumbed by IP.
@@ -145,7 +148,8 @@ open_datalink(dlpi_handle_t *dhp, const char *linkname)
flags |= DLPI_DEVIPNET;
if (Iflg || strcmp(linkname, "lo0") == 0)
flags |= DLPI_IPNETINFO;
- if ((retval = dlpi_open(linkname, dhp, flags)) != DLPI_SUCCESS) {
+ if ((retval = dlpi_open_zone(linkname, zonename, dhp,
+ flags)) != DLPI_SUCCESS) {
pr_err("cannot open \"%s\": %s", linkname,
dlpi_strerror(retval));
}
diff --git a/usr/src/cmd/vndadm/Makefile b/usr/src/cmd/vndadm/Makefile
new file mode 100644
index 0000000000..aa9c22d296
--- /dev/null
+++ b/usr/src/cmd/vndadm/Makefile
@@ -0,0 +1,65 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Build rules for the vndadm command and its test suite (SUBDIRS).
+#
+# NOTE(review): SRCS and the %.o pattern rule below both name ../%.c, i.e.
+# a source file in the parent directory, whereas this change adds vndadm.c
+# in the same directory as this Makefile. Confirm whether the "../" is
+# intended or whether an implicit rule is what actually compiles the file.
+#
+PROG= vndadm
+OBJS = vndadm.o
+SRCS = $(OBJS:%.o=../%.c)
+
+
+include ../Makefile.cmd
+include ../Makefile.ctf
+
+CLEANFILES += $(OBJS)
+CFLAGS += $(CCVERBOSE)
+LDLIBS += -lvnd
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+#
+# Conditional macros: record the top-level target being built in TARGET so
+# that the $(SUBDIRS) rule can pass the same target to the recursive make.
+#
+all := TARGET += all
+clean := TARGET += clean
+clobber := TARGET += clobber
+install := TARGET += install
+lint := TARGET += lint
+
+SUBDIRS = test
+
+.KEEP_STATE:
+
+#
+# NOTE(review): there is no explicit "all:" rule in this file even though
+# TARGET is set for it above; presumably an included makefile provides one.
+# Verify that "make all" builds $(PROG) and descends into $(SUBDIRS).
+#
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+clean: $(SUBDIRS)
+ -$(RM) $(CLEANFILES)
+
+lint: lint_PROG $(SUBDIRS)
+
+%.o: ../%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+clobber: clean $(SUBDIRS)
+ $(RM) $(PROG)
+
+install: $(PROG) $(ROOTUSRSBINPROG) $(SUBDIRS)
+
+
+#
+# Recurse into each subdirectory with the current target. FRC has no
+# recipe or prerequisites; it is listed so the recursion always runs.
+#
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/Makefile b/usr/src/cmd/vndadm/test/Makefile
new file mode 100644
index 0000000000..12ef2c3a3c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+SUBDIRS = scripts tst
+
+include Makefile.subdirs
+include Makefile.com
diff --git a/usr/src/cmd/vndadm/test/Makefile.com b/usr/src/cmd/vndadm/test/Makefile.com
new file mode 100644
index 0000000000..cb096952ca
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/Makefile.com
@@ -0,0 +1,43 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+include $(SRC)/Makefile.master
+include $(SRC)/cmd/Makefile.cmd
+
+#
+# Force c99 for everything
+#
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+#
+# Deal with odd lint bits.
+#
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2
+
+#
+# Install related definitions
+#
+ROOTOPTPKG = $(ROOT)/opt/vndtest
+ROOTBIN = $(ROOTOPTPKG)/bin
+ROOTTST = $(ROOTOPTPKG)/tst
+ROOTTSTDIR = $(ROOTTST)/$(TSTDIR)
+ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%)
+ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%)
+ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%)
+ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT)
+FILEMODE = 0555
+LDLIBS = $(LDLIBS.cmd)
+LINTEXE = $(EXETESTS:%.exe=%.exe.ln)
diff --git a/usr/src/cmd/vndadm/test/Makefile.subdirs b/usr/src/cmd/vndadm/test/Makefile.subdirs
new file mode 100644
index 0000000000..957448c23b
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/Makefile.subdirs
@@ -0,0 +1,29 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+.KEEP_STATE:
+
+all := TARGET += all
+clean := TARGET += clean
+clobber := TARGET += clobber
+install := TARGET += install
+lint := TARGET += lint
+
+all clean clobber install lint: $(SUBDIRS)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/cmd/vndadm/test/Makefile.targ b/usr/src/cmd/vndadm/test/Makefile.targ
new file mode 100644
index 0000000000..bcbd3c8f35
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/Makefile.targ
@@ -0,0 +1,59 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+$(ROOTOPTPKG):
+ $(INS.dir)
+
+$(ROOTBIN): $(ROOTOPTPKG)
+ $(INS.dir)
+
+$(ROOTBIN)/%: %.ksh $(ROOTBIN)
+ $(INS.rename)
+
+$(ROOTTST): $(ROOTOPTPKG)
+ $(INS.dir)
+
+$(ROOTTSTDIR): $(ROOTTST)
+ $(INS.dir)
+
+$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR)
+ $(INS.file)
+
+$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR)
+ $(INS.file)
+
+%.o: %.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+%.exe: %.o $(SUPOBJS)
+ $(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS)
+ $(POST_PROCESS)
+
+$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR)
+ $(INS.file)
+
+all: install
+
+%.exe.ln: %.c $(SUPOBJS)
+ $(LINT.c) $< $(LDLIBS)
+
+lint: $(LINTEXE)
+
+clean:
+ -$(RM) *.o $(CLEANFILES)
+
+clobber: clean
+ -$(RM) $(CLOBBERFILES)
diff --git a/usr/src/cmd/vndadm/test/scripts/Makefile b/usr/src/cmd/vndadm/test/scripts/Makefile
new file mode 100644
index 0000000000..d0f58918f9
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/scripts/Makefile
@@ -0,0 +1,28 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+include ../Makefile.com
+
+SRCS = vndtest
+SCRIPTS = $(SRCS:%=$(ROOTBIN)/%)
+
+SCRIPTS := FILEMODE = 0555
+CLOBBERFILES = $(SCRIPTS)
+
+install: $(SCRIPTS)
+
+lint:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/scripts/vndtest.ksh b/usr/src/cmd/vndadm/test/scripts/vndtest.ksh
new file mode 100755
index 0000000000..1167a64802
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/scripts/vndtest.ksh
@@ -0,0 +1,300 @@
+#!/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc.
+#
+
+#
+# vnd test suite driver
+#
+#
+# Global state for the driver. All names carry the vt_ prefix.
+#
+# vt_arg0	script name; prefixes diagnostics and names the output dir
+# vt_root	parent of the directory holding this script; run_all looks
+#		for tests beneath it
+# vt_ksh	interpreter used to run *.ksh tests
+# vt_outdir	output directory (-o); defaults to $PWD later in the script
+# vt_keep	non-empty when -k was given
+# vt_all	non-empty when -a was given
+# vt_vnics	VNIC names created for create/ecreate tests and passed as
+#		arguments to every test
+# vt_tnum, vt_tfail, vt_tsuc	counts of tests run, failed and passed
+#
+# NOTE(review): vt_tests and vt_stub are initialized here but never
+# referenced again in this script; the etherstub name is kept in vt_ether,
+# which setup_etherstub assigns.
+#
+unalias -a
+
+vt_arg0=$(basename $0)
+vt_root="$(dirname $0)/.."
+vt_ksh="/usr/bin/ksh"
+vt_outdir=
+vt_keep=
+vt_all=
+vt_tests=
+vt_stub=
+vt_vnics="vndtest1 vndtest2 vndtest3 vndtest4 vndtest5"
+vt_tnum=0
+vt_tfail=0
+vt_tsuc=0
+
+#
+# Print an optional error message followed by the usage summary, both on
+# stderr, and exit with status 2.
+#
+function usage
+{
+	typeset msg="$*"
+
+	# The message is a diagnostic: send it to stderr like the usage text.
+	[[ -z "$msg" ]] || echo "$msg" >&2
+	cat <<USAGE >&2
+Usage: $vt_arg0 [ -o dir ] [ -k ] [ -a | test ... ]
+
+ -o dir Sets 'dir' as the output directory
+ -a Runs all tests, ignores tests passed in
+ -k Keep output from all tests, not just failures
+ -h Print this usage message
+USAGE
+	exit 2
+}
+
+#
+# Report a fatal error on stderr, prefixed with the script name, and exit
+# with status 1. When called without arguments the message is "failed".
+#
+function fatal
+{
+	typeset text="$*"
+
+	if [[ -z "$text" ]]; then
+		text="failed"
+	fi
+	echo "$vt_arg0: $text" >&2
+	exit 1
+}
+
+#
+# Create the per-run output directory "<vt_outdir>/<script>.<pid>" and
+# leave its absolute path in vt_outdir.
+#
+function setup_outdir
+{
+	vt_outdir="$vt_outdir/$vt_arg0.$$"
+	mkdir -p "$vt_outdir" || fatal "failed to make output dir $vt_outdir"
+
+	#
+	# run_single changes into the test's directory before redirecting
+	# output into vt_outdir, so a relative -o path must be made absolute.
+	#
+	vt_outdir=$(cd "$vt_outdir" && pwd) || \
+	    fatal "failed to resolve output dir $vt_outdir"
+}
+
+#
+# Create the etherstub that the test VNICs are built over. Its name,
+# "vndstub<pid>", is stored in the global vt_ether.
+#
+function setup_etherstub
+{
+	vt_ether="vndstub$$"
+
+	if ! dladm create-etherstub -t $vt_ether; then
+		fatal "failed to create etherstub"
+	fi
+}
+
+#
+# Destroy every vnd device whose datalink is $1.
+#
+# The device list is read as "datalink:name" pairs; awk prints the name of
+# each pair whose datalink field equals the requested one.
+#
+function cleanup_vnd
+{
+	typeset over=$1
+	typeset vnddevs v vn
+
+	vnddevs=$(vndadm list -p -d: -o datalink,name)
+	[[ $? -eq 0 ]] || fatal "failed to list vnd devices"
+	for v in $vnddevs; do
+		vn=$(echo $v | awk 'BEGIN{ FS=":"}
+		    { if ($1 == targ) { print $2 } }' targ="$over")
+		[[ -z "$vn" ]] && continue
+		vndadm destroy $vn || fatal "failed to destroy $vn"
+	done
+}
+
+#
+# Create each VNIC named in vt_vnics over the etherstub vt_ether; any
+# failure is fatal.
+#
+function create_vnics
+{
+	for n in $vt_vnics; do
+		if ! dladm create-vnic -t -l $vt_ether $n; then
+			fatal "failed to create vnic $n over $vt_ether"
+		fi
+	done
+}
+
+#
+# Remove every VNIC that sits over the etherstub vt_ether, first destroying
+# any vnd device and unplumbing any IP interface on it.
+#
+# The VNIC list is read as "over:link" pairs; awk prints the link name of
+# each pair whose "over" field equals vt_ether.
+#
+function cleanup_vnics
+{
+	typeset nics n vn
+
+	nics=$(dladm show-vnic -p -o over,link)
+	[[ $? -eq 0 ]] || fatal "failed to list vnics"
+	for n in $nics; do
+		vn=$(echo $n | awk 'BEGIN{ FS=":"}
+		    { if ($1 == targ) { print $2 } }' targ=$vt_ether )
+		[[ -z "$vn" ]] && continue
+		cleanup_vnd $vn
+		#
+		# There may or may not be an IP device on our nics...
+		#
+		ifconfig $vn down unplumb 2>/dev/null || /bin/true
+		# Report the VNIC name, not the raw "over:link" pair.
+		dladm delete-vnic $vn || fatal "failed to delete vnic $vn"
+	done
+}
+
+#
+# Tear down everything built on the test etherstub, then delete the
+# etherstub itself.
+#
+function cleanup_etherstub
+{
+	cleanup_vnics
+	if ! dladm delete-etherstub -t $vt_ether; then
+		fatal "failed to delete etherstub"
+	fi
+}
+
+#
+# Run one test and record the outcome.
+#
+# $1 is the path to the test. Its basename has the form
+# <expect>.<...>.<ext>:
+#   expect  "create" and "ecreate" tests get the vt_vnics VNICs created
+#           before the run and removed afterwards; "err" and "ecreate"
+#           tests must exit non-zero, all others must exit zero.
+#   ext     "ksh" tests run under vt_ksh, "exe" tests run directly, "out"
+#           files (expected stdout) and unknown extensions are skipped.
+#
+# The test runs from its own directory with the VNIC names as arguments.
+# If "<test>.out" exists, stdout must match it exactly. Output is kept in
+# vt_outdir/failure.N or vt_outdir/success.N, and the vt_tfail, vt_tsuc
+# and vt_tnum counters are updated.
+#
+function run_single
+{
+	typeset name=$1
+	typeset expect base ext command odir res reason
+	typeset iserr
+
+	[[ -z "$name" ]] && fatal "missing test to run"
+	base=${name##*/}
+	ext=${base##*.}
+	expect=${base%%.*}
+	odir="$vt_outdir/current"
+	[[ -z "$ext" ]] && fatal "found test without ext: $name"
+	[[ -z "$expect" ]] && fatal "found test without prefix: $name"
+
+	if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then
+		iserr="yup"
+	else
+		iserr=""
+	fi
+
+	case "$ext" in
+	"ksh")
+		command="$vt_ksh ./$base"
+		;;
+	"exe")
+		command="./$base"
+		;;
+	"out")
+		#
+		# This is the file format for checking output against.
+		#
+		return 0
+		;;
+	*)
+		echo "skipping test $name (unknown extension)"
+		return 0
+		;;
+	esac
+
+	#
+	# Only create the VNICs once we know the test will actually run, so
+	# the early returns above cannot leave them behind.
+	#
+	[[ "$expect" == "create" || "$expect" == "ecreate" ]] && create_vnics
+
+	echo "Executing test $name ... \c"
+	mkdir -p "$odir" >/dev/null || fatal "can't make output directory"
+	cd "$(dirname "$name")" || fatal "failed to enter test directory"
+	# $command and $vt_vnics are deliberately unquoted: both are word lists.
+	$command $vt_vnics > "$odir/stdout" 2>"$odir/stderr"
+	res=$?
+	cd - > /dev/null || fatal "failed to leave test directory"
+
+	if [[ -f "$name.out" ]] && \
+	    ! diff "$name.out" "$odir/stdout" >/dev/null; then
+		cp "$name.out" "$odir/$base.out"
+		reason="stdout mismatch"
+	elif [[ -n "$iserr" && $res -eq 0 ]]; then
+		reason="test exited $res, not non-zero"
+	elif [[ -z "$iserr" && $res -ne 0 ]]; then
+		reason="test exited $res, not zero"
+	fi
+
+	if [[ -n "$reason" ]]; then
+		echo "$reason"
+		((vt_tfail++))
+		mv "$odir" "$vt_outdir/failure.$vt_tfail" || fatal \
+		    "failed to move test output directory"
+		cp "$name" "$vt_outdir/failure.$vt_tfail/$(basename $name)" || \
+		    fatal "failed to copy test into output directory"
+	else
+		echo "passed"
+		((vt_tsuc++))
+		mv "$odir" "$vt_outdir/success.$vt_tsuc" || fatal \
+		    "failed to move test directory"
+	fi
+
+	[[ "$expect" == "create" || "$expect" == "ecreate" ]] && cleanup_vnics
+
+	((vt_tnum++))
+}
+
+#
+# Run every test found two directory levels below vt_root whose name
+# matches (ecreate|create|tst|err).*.(ksh|exe).
+#
+# The paths handed to run_single include the vt_root prefix so that they
+# resolve from the caller's working directory, not only when the driver is
+# started from vt_root itself.
+#
+function run_all
+{
+	typeset tests t
+
+	[[ -d "$vt_root" ]] || fatal "cannot find root test directory $vt_root"
+	tests=$(ls -1 "$vt_root"/*/*/@(ecreate|create|tst|err).*.@(ksh|exe))
+	for t in $tests; do
+		run_single $t
+	done
+}
+
+#
+# Print the start-of-run banner, including the output directory in use.
+#
+function welcome
+{
+	printf '%s\n' "Starting tests..." "output directory: $vt_outdir"
+}
+
+#
+# Prune the output directory unless -k was given: drop the output of the
+# passing tests and, if nothing failed, remove the directory itself.
+#
+function cleanup
+{
+	if [[ -n "$vt_keep" ]]; then
+		return
+	fi
+
+	if ! rm -rf "$vt_outdir"/success.*; then
+		fatal "failed to remove successful test cases"
+	fi
+
+	# Failure output stays behind for inspection.
+	[[ $vt_tfail -eq 0 ]] || return 0
+	rmdir "$vt_outdir" || fatal "failed to remove test output directory"
+}
+
+#
+# Print the results table (passed, failed and total counts) followed by a
+# one-line verdict.
+#
+function goodbye
+{
+	typeset verdict
+
+	if [[ $vt_tfail -ne 0 ]]; then
+		verdict="Some tests failed, you have some work to do."
+	else
+		verdict="Congrats, vnd isn't completely broken, the tests pass."
+	fi
+
+	cat <<EOF
+
+-------------
+Results
+-------------
+
+Tests passed: $vt_tsuc
+Tests failed: $vt_tfail
+Tests ran: $vt_tnum
+
+EOF
+	echo "$verdict"
+}
+
+#
+# Parse command-line options. The leading ':' in the option string selects
+# silent error reporting, so missing arguments arrive as ':' and unknown
+# options as '?', each with the offending option letter in OPTARG.
+#
+# Only options that have a case arm below are listed in the option string;
+# anything else is reported as invalid by the catch-all arm.
+#
+while getopts ":ahko:" c "$@"; do
+	case "$c" in
+	a)
+		vt_all="y"
+		;;
+	k)
+		vt_keep="y"
+		;;
+	o)
+		vt_outdir="$OPTARG"
+		;;
+	h)
+		usage
+		;;
+	:)
+		usage "option requires an argument -- $OPTARG"
+		;;
+	*)
+		usage "invalid option -- $OPTARG"
+		;;
+	esac
+done
+
+#
+# Main flow: validate the environment and arguments, build the shared
+# fixtures, run the requested tests, tear everything down and report.
+#
+shift $((OPTIND-1))
+
+[[ $(zonename) != "global" ]] && fatal "vndtest only runs in the global zone"
+
+[[ -z "$vt_all" && $# == 0 ]] && usage "no tests to run"
+
+[[ -z "$vt_outdir" ]] && vt_outdir="$PWD"
+
+setup_outdir
+setup_etherstub
+welcome
+
+if [[ -n "$vt_all" ]]; then
+	run_all
+else
+	for t in "$@"; do
+		[[ -f "$t" ]] || fatal "cannot find test $t"
+		run_single "$t"
+	done
+fi
+
+cleanup_etherstub
+goodbye
+cleanup
+
+#
+# The status of this final test is the script's exit status: non-zero when
+# any test failed.
+#
+[[ $vt_tfail -eq 0 ]]
diff --git a/usr/src/cmd/vndadm/test/tst/Makefile b/usr/src/cmd/vndadm/test/tst/Makefile
new file mode 100644
index 0000000000..9b1ba29429
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+SUBDIRS = cmd dld ioctl lib
+
+include ../Makefile.subdirs
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/Makefile b/usr/src/cmd/vndadm/test/tst/cmd/Makefile
new file mode 100644
index 0000000000..1ca20bf749
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/Makefile
@@ -0,0 +1,34 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+TSTDIR = cmd
+COMMONSH = cmd.common.ksh
+SHTESTS = $(COMMONSH) \
+ create.list.ksh \
+ create.sdev.ksh \
+ create.setbuf.ksh \
+ ecreate.destroy.ksh \
+ ecreate.setbadprop.ksh \
+ ecreate.setbadvalue.ksh \
+ ecreate.setbuftoobig.ksh \
+ ecreate.setrdonlyprop.ksh
+
+OUTFILES = create.list.ksh.out
+
+include ../../Makefile.com
+
+install: $(ROOTTESTS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/cmd.common.ksh b/usr/src/cmd/vndadm/test/tst/cmd/cmd.common.ksh
new file mode 100644
index 0000000000..31e4e8bf5c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/cmd.common.ksh
@@ -0,0 +1,33 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Common ksh-based utilities
+#
+
+vt_arg0=$(basename $0)
+
+#
+# Print "<script>: <message>" on stderr and exit 1; the message defaults
+# to "failed" when none is supplied.
+#
+function fatal
+{
+	typeset text="$*"
+
+	if [[ -z "$text" ]]; then
+		text="failed"
+	fi
+	echo "$vt_arg0: $text" >&2
+	exit 1
+}
+
+[[ -z "$1" ]] && fatal "missing required vnic"
+[[ -z "$2" ]] && fatal "missing required vnic"
+[[ -z "$3" ]] && fatal "missing required vnic"
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh b/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh
new file mode 100644
index 0000000000..fdec9a85be
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh
@@ -0,0 +1,30 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Basic device listing
+#
+
+. ./cmd.common.ksh
+
+#
+# Use what we hope is a relatively unique name
+#
+cl_name="triforceofcourage0"
+vndadm create -l $1 $cl_name || fatal "failed to create vnd device"
+vndadm list -p -o name,zone $cl_name
+vndadm list -p -d: -o zone,name $cl_name
+vndadm destroy $cl_name || fatal "failed to destroy vnd device"
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh.out b/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh.out
new file mode 100644
index 0000000000..d208b38aab
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/create.list.ksh.out
@@ -0,0 +1,2 @@
+triforceofcourage0 global
+global:triforceofcourage0
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/create.sdev.ksh b/usr/src/cmd/vndadm/test/tst/cmd/create.sdev.ksh
new file mode 100644
index 0000000000..b816ade1de
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/create.sdev.ksh
@@ -0,0 +1,25 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Verify that our sdev links exist
+#
+
+. ./cmd.common.ksh
+
+vndadm create $1 || fatal "failed to bring up vnd"
+[[ -c /dev/vnd/$1 ]] || fatal "missing link"
+[[ -c /dev/vnd/zone/$(zonename)/$1 ]] || fatal "missing per-zone link"
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/create.setbuf.ksh b/usr/src/cmd/vndadm/test/tst/cmd/create.setbuf.ksh
new file mode 100644
index 0000000000..d50edbead4
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/create.setbuf.ksh
@@ -0,0 +1,34 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Set and validate the buffer size properties. Validate that we can set
+# the value using the various number analogues, eg. 1024K, etc.
+#
+# pipefail makes $? after each "vndadm get | nawk" pipeline reflect a
+# failure of vndadm, not just of nawk.
+set -o pipefail
+
+. ./cmd.common.ksh
+
+vndadm create $1 || fatal "failed to bring up vnd device"
+
+# rxbuf, set with an 'M' suffix, must read back as 1048576.
+vndadm set $1 rxbuf=1M || fatal "failed to set rxbuf"
+cur=$(vndadm get -p $1 rxbuf | nawk '{ print $4 }')
+[[ $? -eq 0 ]] || fatal "failed to get rxbuf"
+[[ $cur -eq 1048576 ]] || fatal "rxbuf is $cur, not 1M"
+
+# txbuf, set with a 'K' suffix, must read back as the same value. Read
+# txbuf itself here so that this check covers the property just set.
+vndadm set $1 txbuf=1024K || fatal "failed to set txbuf"
+cur=$(vndadm get -p $1 txbuf | nawk '{ print $4 }')
+[[ $? -eq 0 ]] || fatal "failed to get txbuf"
+[[ $cur -eq 1048576 ]] || fatal "txbuf is $cur, not 1M"
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/ecreate.destroy.ksh b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.destroy.ksh
new file mode 100644
index 0000000000..e3c4931018
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.destroy.ksh
@@ -0,0 +1,25 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that destroy on a previously destroyed link fails
+#
+
+. ./cmd.common.ksh
+
+# Set-up: the device must come up and go away cleanly once.
+if ! vndadm create $1; then
+	fatal "failed to bring up vnd device"
+fi
+if ! vndadm destroy $1; then
+	fatal "failed to destroy vnd device"
+fi
+
+# Destroy it a second time. As the last command, its exit status becomes
+# the exit status of the script.
+vndadm destroy $1
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadprop.ksh b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadprop.ksh
new file mode 100644
index 0000000000..30c27575b1
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadprop.ksh
@@ -0,0 +1,24 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that we can't set a non-existent property
+#
+
+. ./cmd.common.ksh
+
+# A device has to exist before a property can be set on it.
+if ! vndadm create $1; then
+	fatal "failed to bring up vnd device"
+fi
+
+# Try the bogus property. As the last command, its exit status becomes
+# the exit status of the script.
+vndadm set $1 ganon=ganondorf
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadvalue.ksh b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadvalue.ksh
new file mode 100644
index 0000000000..056b24a817
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbadvalue.ksh
@@ -0,0 +1,24 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that we can't set something to a garbage value
+#
+
+. ./cmd.common.ksh
+
+# A device has to exist before a property can be set on it.
+if ! vndadm create $1; then
+	fatal "failed to bring up vnd device"
+fi
+
+# Hand rxbuf a non-numeric value. As the last command, its exit status
+# becomes the exit status of the script.
+vndadm set $1 rxbuf=hello
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbuftoobig.ksh b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbuftoobig.ksh
new file mode 100644
index 0000000000..551e20461c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setbuftoobig.ksh
@@ -0,0 +1,24 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that we can't set a buffer value to a ridiculous size
+#
+
+. ./cmd.common.ksh
+
+vndadm create $1 || fatal "failed to bring up vnd device"
+
+# Ask for an absurdly large receive buffer. The property must be a real
+# buffer property (rxbuf): with an unknown name the command would be
+# rejected for the name rather than for the size.
+# NOTE(review): presumably the harness expects this last command to exit
+# non-zero, as for the other ecreate.* tests -- confirm in vndtest.ksh.
+vndadm set $1 rxbuf=1T
diff --git a/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setrdonlyprop.ksh b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setrdonlyprop.ksh
new file mode 100644
index 0000000000..4beb53e227
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/cmd/ecreate.setrdonlyprop.ksh
@@ -0,0 +1,24 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that we can't set a read only property.
+#
+
+. ./cmd.common.ksh
+
+# A device has to exist before a property can be set on it.
+if ! vndadm create $1; then
+	fatal "failed to bring up vnd device"
+fi
+
+# Attempt to write the read-only mintu property. As the last command, its
+# exit status becomes the exit status of the script.
+vndadm set $1 mintu=100
diff --git a/usr/src/cmd/vndadm/test/tst/dld/Makefile b/usr/src/cmd/vndadm/test/tst/dld/Makefile
new file mode 100644
index 0000000000..3088812630
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/dld/Makefile
@@ -0,0 +1,27 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Name of this test group. It is consumed by the included Makefile.com /
+# Makefile.targ, which are not visible here (presumably it selects the
+# install subdirectory -- verify there).
+TSTDIR = dld
+# Helper script that the dld test scripts source.
+COMMONSH = dld.common.ksh
+# Shell scripts handled by this directory: the shared helper followed by
+# the individual tests.
+SHTESTS = $(COMMONSH) \
+ ecreate.ipfirst.ksh \
+ ecreate.vndfirst.ksh \
+ create.reuse.ksh
+
+# Must follow the assignments above so that the shared definitions can use
+# them.
+include ../../Makefile.com
+
+# ROOTTESTS is not defined in this file; presumably Makefile.com derives it
+# from TSTDIR and the test lists -- verify there.
+install: $(ROOTTESTS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/tst/dld/create.reuse.ksh b/usr/src/cmd/vndadm/test/tst/dld/create.reuse.ksh
new file mode 100644
index 0000000000..bc2ffde7f6
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/dld/create.reuse.ksh
@@ -0,0 +1,31 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure that we can reuse a data link
+#
+
+. ./dld.common.ksh
+
+dld_nic=$1
+if [[ -z "$1" ]]; then
+	fatal "missing required vnic"
+fi
+
+# First consumer: vnd
+if ! vndadm create $dld_nic; then
+	fatal "failed to bring up vnd"
+fi
+if ! vndadm destroy $dld_nic; then
+	fatal "failed to bring down vnd"
+fi
+
+# Second consumer: IP
+if ! ifconfig $dld_nic plumb up; then
+	fatal "failed to bring up IP"
+fi
+if ! ifconfig $dld_nic down unplumb; then
+	fatal "failed to bring down IP"
+fi
+
+# And vnd once more, now that IP has released the link
+if ! vndadm create $dld_nic; then
+	fatal "failed to bring up vnd"
+fi
+if ! vndadm destroy $dld_nic; then
+	fatal "failed to bring down vnd"
+fi
diff --git a/usr/src/cmd/vndadm/test/tst/dld/dld.common.ksh b/usr/src/cmd/vndadm/test/tst/dld/dld.common.ksh
new file mode 100644
index 0000000000..7a2e0a8e2b
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/dld/dld.common.ksh
@@ -0,0 +1,29 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Common ksh-based utilities
+#
+
+# Name of the running script; prefixes every diagnostic.
+vt_arg0=$(basename $0)
+
+#
+# fatal [message ...]
+#
+# Write "<script>: <message>" to stderr and exit with status 1. If the
+# joined arguments are empty, the message defaults to "failed".
+#
+function fatal
+{
+	typeset text="$*"
+
+	if [[ -z "$text" ]]; then
+		text="failed"
+	fi
+	echo "$vt_arg0: $text" 1>&2
+	exit 1
+}
diff --git a/usr/src/cmd/vndadm/test/tst/dld/ecreate.ipfirst.ksh b/usr/src/cmd/vndadm/test/tst/dld/ecreate.ipfirst.ksh
new file mode 100644
index 0000000000..e6409781cb
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/dld/ecreate.ipfirst.ksh
@@ -0,0 +1,27 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure vnd fails to come up when IP is up
+#
+
+. ./dld.common.ksh
+
+dld_nic=$1
+if [[ -z "$1" ]]; then
+	fatal "missing required vnic"
+fi
+
+# Give the link to IP first.
+if ! ifconfig $dld_nic plumb up; then
+	fatal "failed to bring up IP"
+fi
+
+# Now try vnd on the same link. As the last command, its exit status
+# becomes the exit status of the script.
+vndadm create $dld_nic
diff --git a/usr/src/cmd/vndadm/test/tst/dld/ecreate.vndfirst.ksh b/usr/src/cmd/vndadm/test/tst/dld/ecreate.vndfirst.ksh
new file mode 100644
index 0000000000..ee7a13c09c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/dld/ecreate.vndfirst.ksh
@@ -0,0 +1,27 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Make sure IP fails to come up when vnd is up
+#
+
+. ./dld.common.ksh
+
+dld_nic=$1
+if [[ -z "$1" ]]; then
+	fatal "missing required vnic"
+fi
+
+# Give the link to vnd first.
+if ! vndadm create $dld_nic; then
+	fatal "failed to bring up vnd"
+fi
+
+# Now try IP on the same link. As the last command, its exit status
+# becomes the exit status of the script.
+ifconfig $dld_nic plumb up
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/Makefile b/usr/src/cmd/vndadm/test/tst/ioctl/Makefile
new file mode 100644
index 0000000000..fe074f32b0
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/Makefile
@@ -0,0 +1,49 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Name of this test group. It is consumed by the included Makefile.com /
+# Makefile.targ, which are not visible here (presumably it selects the
+# install subdirectory -- verify there).
+TSTDIR = ioctl
+# Compiled tests, one executable per C source file in this directory.
+# NOTE(review): the .c -> .exe build rule is not in this file; presumably
+# it comes from Makefile.targ.
+EXETESTS = \
+ create.attach.exe \
+ create.attachnolink.exe \
+ create.badlinkname.exe \
+ create.doublelink.exe \
+ create.gioctlattach.exe \
+ create.link.exe \
+ create.linkexists.exe \
+ create.ngioctlfault.exe \
+ create.nopriv1.exe \
+ create.nopriv2.exe \
+ create.nopriv3.exe \
+ create.nopriv4.exe \
+ create.olink.exe \
+ create.olinknopriv.exe \
+ create.rmenolink.exe \
+ tst.attachrdonly.exe \
+ tst.basicopenctl.exe \
+ tst.badioctl.exe \
+ tst.gioctlfault.exe \
+ tst.gioctlnattach.exe \
+ tst.openctlbadflags.exe
+# Script-based tests.
+SHTESTS = \
+ tst.iocsize.ksh
+# Intentionally empty here.
+# NOTE(review): the name may be a typo for SUPOBJS -- check which spelling
+# Makefile.com / Makefile.targ actually reference.
+SUPBOBJS =
+
+# The built executables are what "clobber" should remove.
+CLOBBERFILES = $(EXETESTS)
+
+# Must follow the assignments above so that the shared definitions can use
+# them.
+include ../../Makefile.com
+
+# ROOTTESTS is not defined in this file; presumably Makefile.com derives it
+# from TSTDIR and the test lists -- verify there.
+install: $(ROOTTESTS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.attach.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.attach.c
new file mode 100644
index 0000000000..d7bca5cce3
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.attach.c
@@ -0,0 +1,63 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Simply attach a nic
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Open the vnd control node and attach it to the datalink named by argv[1].
+ *
+ * Returns 1 if the datalink argument is missing or is VND_NAMELEN
+ * characters or longer. Every later step is checked with assert(3C); 0 is
+ * returned once the attach has succeeded and the handle has been closed.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.attachnolink.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.attachnolink.c
new file mode 100644
index 0000000000..43c6c99af5
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.attachnolink.c
@@ -0,0 +1,67 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Try to attach to a non-existent vnic
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Verify that attaching to a datalink that does not exist fails with
+ * VND_E_NODATALINK.
+ *
+ * argv[1] is required and length-checked like in the sibling tests, but the
+ * attach itself always uses the fixed name "enolink". Returns 1 on bad
+ * usage; every later step is checked with assert(3C) and 0 is returned on
+ * success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	/*
+	 * All datalink names have numbers, so we can pick a datalink which
+	 * doesn't exist by not using numbers...
+	 */
+	(void) strlcpy(via.via_name, "enolink", VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == -1);
+	assert(via.via_errno == VND_E_NODATALINK);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.badlinkname.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.badlinkname.c
new file mode 100644
index 0000000000..e3a067d5ce
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.badlinkname.c
@@ -0,0 +1,119 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Test that we can't link a nic with invalid names
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * NULL-terminated table of link names that main() feeds to VND_IOC_LINK;
+ * each one is expected to be rejected with VND_E_BADNAME.
+ */
+static const char *names[] = {
+ /* Reserved names */
+ "ctl",
+ "zone",
+ /* Invalid characters */
+ "The fight of the century",
+ "Link/Ganon",
+ "happens@7pm",
+ "#testing",
+ "asdf!!",
+ "power&courage&wisdom",
+ "over9000?",
+ "you're",
+ "100$",
+ "(function",
+ "x)",
+ "2^128",
+ "1++",
+ "No.",
+ "99%",
+ "*****",
+ "r|m",
+ "=0",
+ "`p0",
+ "goodbye~",
+ "however;",
+ "\"hesaid",
+ "shesaid\'",
+ /* emoji pile of poo */
+ "\xF0\x9F\x92\xA9",
+ /* end-of-table sentinel */
+ NULL
+};
+
+/*
+ * Attach to the datalink named by argv[1], then verify that VND_IOC_LINK
+ * rejects every entry of names[], and also a name without a terminating
+ * NUL, with VND_E_BADNAME. Because no link is ever created, the closing
+ * VND_IOC_UNLINK must fail with VND_E_NOTLINKED.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret, i;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	for (i = 0; names[i] != NULL; i++) {
+		(void) strlcpy(vil.vil_name, names[i], VND_NAMELEN);
+		(void) fprintf(stderr, "Trying to create [%s]\n", names[i]);
+		vil.vil_errno = 0;
+		ret = ioctl(fd, VND_IOC_LINK, &vil);
+		assert(ret == -1);
+		assert(vil.vil_errno == VND_E_BADNAME);
+	}
+
+	/*
+	 * Finally, the missing null terminator. vil_errno still holds
+	 * VND_E_BADNAME from the loop above, so clear it first; otherwise
+	 * the check below could be satisfied by the stale value.
+	 * NOTE(review): assumes vil_name holds at least VND_NAMELEN bytes.
+	 */
+	for (i = 0; i < VND_NAMELEN; i++)
+		vil.vil_name[i] = 'a';
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == -1);
+	assert(vil.vil_errno == VND_E_BADNAME);
+
+	/* None of the links above was created, so there is nothing to remove */
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == -1);
+	assert(viu.viu_errno == VND_E_NOTLINKED);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.doublelink.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.doublelink.c
new file mode 100644
index 0000000000..dcf4f311e9
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.doublelink.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Link a nic, first should work, second will fail.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach to the datalink named by argv[1] and link it under that same
+ * name, which must succeed. A second VND_IOC_LINK on the same handle (as
+ * "dup") must then fail with VND_E_LINKED. The link is removed again
+ * before the handle is closed.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* First link: expected to succeed */
+	(void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	/* Second link on the same handle: expected to be refused */
+	(void) strlcpy(vil.vil_name, "dup", VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == -1);
+	assert(vil.vil_errno == VND_E_LINKED);
+
+	/* Clean up the link created by the first request */
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == 0);
+	assert(viu.viu_errno == 0);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.gioctlattach.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.gioctlattach.c
new file mode 100644
index 0000000000..3d6f43377b
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.gioctlattach.c
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Ensure that we can't run global ioctls on an attached handle
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach to the datalink named by argv[1], then issue VND_IOC_ATTACH a
+ * second time on the same handle (with the name "a") and verify that it
+ * is refused with VND_E_ATTACHED.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/*
+	 * Second attach on the now-attached handle. via_errno is still 0
+	 * from the successful request above, so it is not reset here.
+	 */
+	via.via_name[0] = 'a';
+	via.via_name[1] = '\0';
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == -1);
+	assert(via.via_errno == VND_E_ATTACHED);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.link.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.link.c
new file mode 100644
index 0000000000..16569d58cd
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.link.c
@@ -0,0 +1,76 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Link a nic
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach to the datalink named by argv[1], link the device under that
+ * same name, then unlink it again. All three requests must succeed.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	(void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == 0);
+	assert(viu.viu_errno == 0);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.linkexists.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.linkexists.c
new file mode 100644
index 0000000000..4e3be0db5d
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.linkexists.c
@@ -0,0 +1,90 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Try to create two devices with the same link name.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach two control handles to two different datalinks (argv[1] and
+ * argv[2]), link the first one as "dup", and verify that linking the
+ * second one under the same name fails with VND_E_LINKEXISTS. The first
+ * link is removed again before both handles are closed.
+ *
+ * Returns 1 if either datalink argument is missing or is VND_NAMELEN
+ * characters or longer; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, fd2, ret;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 3) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* Both names are copied into via_name below, so check both. */
+	if (strlen(argv[1]) >= VND_NAMELEN ||
+	    strlen(argv[2]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+	fd2 = open(VND_PATH, O_RDWR);
+	assert(fd2 >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	(void) strlcpy(via.via_name, argv[2], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+	ret = ioctl(fd2, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* The first device claims the name... */
+	(void) strlcpy(vil.vil_name, "dup", VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	/* ...so the second device must not be able to take it as well. */
+	(void) strlcpy(vil.vil_name, "dup", VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd2, VND_IOC_LINK, &vil);
+	assert(ret == -1);
+	assert(vil.vil_errno == VND_E_LINKEXISTS);
+
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == 0);
+	assert(viu.viu_errno == 0);
+
+	/*
+	 * Release both handles. Keep close() outside of assert() so it still
+	 * runs under NDEBUG.
+	 */
+	ret = close(fd2);
+	assert(ret == 0);
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.ngioctlfault.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.ngioctlfault.c
new file mode 100644
index 0000000000..bf174f1a8f
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.ngioctlfault.c
@@ -0,0 +1,96 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Pass bad addresses to all of our non-global ioctls
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * ioctl requests that main() issues with a bogus argument pointer; each
+ * one is expected to fail with EFAULT. The table is terminated by -1.
+ */
+static int requests[] = {
+ VND_IOC_LINK,
+ VND_IOC_UNLINK,
+ VND_IOC_GETRXBUF,
+ VND_IOC_SETRXBUF,
+ VND_IOC_GETTXBUF,
+ VND_IOC_SETTXBUF,
+ VND_IOC_GETMINTU,
+ VND_IOC_GETMAXTU,
+ VND_IOC_GETMAXBUF,
+ /* end-of-table sentinel */
+ -1
+};
+
+/*
+ * Attach to and link the datalink named by argv[1], then issue every
+ * request in requests[] with an invalid argument address and verify that
+ * each one fails with EFAULT. The link is removed again before the handle
+ * is closed.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret, i;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * open(2) reports failure with -1; descriptor 0 is a valid result
+	 * (for example when stdin has been closed), so test for >= 0.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	(void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	/*
+	 * The loop index doubles as the bogus user address: NULL for the
+	 * first request, then 1, 2, ... for the following ones.
+	 */
+	for (i = 0; requests[i] != -1; i++) {
+		ret = ioctl(fd, requests[i], (void *)(uintptr_t)i);
+		assert(ret == -1);
+		assert(errno == EFAULT);
+	}
+
+	/* The faulting requests must have left the link in place. */
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == 0);
+	assert(viu.viu_errno == 0);
+
+	/* Keep close() outside of assert() so it still runs under NDEBUG. */
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv1.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv1.c
new file mode 100644
index 0000000000..6d5ad0eec2
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv1.c
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to attach a device without PRIV_SYS_NET_CONFIG
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <priv.h>
+#include <string.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdio.h>
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Drop PRIV_SYS_NET_CONFIG from the permitted set, then verify that
+ * VND_IOC_ATTACH on the datalink named by argv[1] fails with EPERM.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	priv_set_t *ps;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * The contents of a freshly allocated set are indeterminate, so
+	 * empty it before adding the privilege to drop; otherwise unrelated
+	 * privileges could be dropped as well. The calls are kept outside
+	 * of assert() so that they still run under NDEBUG.
+	 */
+	ps = priv_allocset();
+	assert(ps != NULL);
+	priv_emptyset(ps);
+	ret = priv_addset(ps, PRIV_SYS_NET_CONFIG);
+	assert(ret == 0);
+	ret = setppriv(PRIV_OFF, PRIV_PERMITTED, ps);
+	assert(ret == 0);
+	priv_freeset(ps);
+
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == -1);
+	assert(errno == EPERM);
+
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv2.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv2.c
new file mode 100644
index 0000000000..6b38f159a0
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv2.c
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to attach a device without PRIV_NET_RAWACCESS
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <priv.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Drop PRIV_NET_RAWACCESS from the permitted set, then verify that
+ * VND_IOC_ATTACH on the datalink named by argv[1] fails with EPERM.
+ *
+ * Returns 1 on bad usage; every later step is checked with assert(3C) and
+ * 0 is returned on success.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	priv_set_t *ps;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * The contents of a freshly allocated set are indeterminate, so
+	 * empty it before adding the privilege to drop; otherwise unrelated
+	 * privileges could be dropped as well. The calls are kept outside
+	 * of assert() so that they still run under NDEBUG.
+	 */
+	ps = priv_allocset();
+	assert(ps != NULL);
+	priv_emptyset(ps);
+	ret = priv_addset(ps, PRIV_NET_RAWACCESS);
+	assert(ret == 0);
+	ret = setppriv(PRIV_OFF, PRIV_PERMITTED, ps);
+	assert(ret == 0);
+	priv_freeset(ps);
+
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == -1);
+	assert(errno == EPERM);
+
+	ret = close(fd);
+	assert(ret == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv3.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv3.c
new file mode 100644
index 0000000000..a8c43fc46d
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv3.c
@@ -0,0 +1,70 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to attach a device without PRIV_SYS_NET_CONFIG and PRIV_NET_RAWACCESS
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <priv.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Remove both PRIV_SYS_NET_CONFIG and PRIV_NET_RAWACCESS from the permitted
+ * privilege set and then verify that VND_IOC_ATTACH of the datalink named by
+ * argv[1] fails with EPERM. Returns 1 on a usage error and 0 once every
+ * assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+ int fd, ret;
+ priv_set_t *ps;
+ vnd_ioc_attach_t via;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "missing arguments...\n");
+ return (1);
+ }
+
+ /* The name must fit, with its terminator, in via_name below. */
+ if (strlen(argv[1]) >= VND_NAMELEN) {
+ (void) fprintf(stderr, "vnic name too long...\n");
+ return (1);
+ }
+
+ /*
+ * NOTE(review): the privilege calls live inside assert(), so they would
+ * not run at all if this were ever compiled with NDEBUG.
+ */
+ ps = priv_allocset();
+ assert(ps != NULL);
+ assert(priv_addset(ps, PRIV_SYS_NET_CONFIG) == 0);
+ assert(priv_addset(ps, PRIV_NET_RAWACCESS) == 0);
+ assert(setppriv(PRIV_OFF, PRIV_PERMITTED, ps) == 0);
+
+ /* Opening the control device is still expected to succeed. */
+ fd = open(VND_PATH, O_RDWR);
+ assert(fd >= 0);
+
+ (void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+ via.via_zoneid = 0;
+ via.via_errno = 0;
+ ret = ioctl(fd, VND_IOC_ATTACH, &via);
+ /* errno is only meaningful because the ioctl has just failed. */
+ assert(ret == -1);
+ assert(errno == EPERM);
+
+ assert(close(fd) == 0);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv4.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv4.c
new file mode 100644
index 0000000000..aed0204544
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv4.c
@@ -0,0 +1,75 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to link a device without PRIV_SYS_NET_CONFIG
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <priv.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach the datalink named by argv[1] with full privileges, then remove
+ * PRIV_SYS_NET_CONFIG from the permitted set and verify that VND_IOC_LINK on
+ * the same descriptor fails with EPERM. Returns 1 on a usage error and 0 once
+ * every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+ int fd, ret;
+ priv_set_t *ps;
+ vnd_ioc_attach_t via;
+ vnd_ioc_link_t vil;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "missing arguments...\n");
+ return (1);
+ }
+
+ /* The name must fit, with its terminator, in via_name/vil_name below. */
+ if (strlen(argv[1]) >= VND_NAMELEN) {
+ (void) fprintf(stderr, "vnic name too long...\n");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDWR);
+ assert(fd >= 0);
+
+ /* The attach happens before any privilege is dropped and must work. */
+ (void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+ via.via_zoneid = 0;
+ via.via_errno = 0;
+ ret = ioctl(fd, VND_IOC_ATTACH, &via);
+ assert(ret == 0);
+
+ /*
+ * NOTE(review): the privilege calls live inside assert(), so they would
+ * not run at all if this were ever compiled with NDEBUG.
+ */
+ ps = priv_allocset();
+ assert(ps != NULL);
+ assert(priv_addset(ps, PRIV_SYS_NET_CONFIG) == 0);
+ assert(setppriv(PRIV_OFF, PRIV_PERMITTED, ps) == 0);
+
+ /* The link name reuses the datalink name. */
+ (void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+ vil.vil_errno = 0;
+ ret = ioctl(fd, VND_IOC_LINK, &vil);
+ assert(ret == -1);
+ assert(errno == EPERM);
+
+ assert(close(fd) == 0);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv5.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv5.c
new file mode 100644
index 0000000000..2db8ecc95f
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.nopriv5.c
@@ -0,0 +1,77 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to open a device without PRIV_NET_RAWACCESS
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <priv.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach the datalink named by argv[1] with full privileges, then remove
+ * PRIV_NET_RAWACCESS from the permitted set and verify that opening
+ * /dev/vnd/<name> fails with EPERM. Returns 1 on a usage error and 0 once
+ * every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, fd2, ret;
+	priv_set_t *ps;
+	char *path;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* The name must fit, with its terminator, in via_name below. */
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	/* The attach happens before any privilege is dropped and must work. */
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+
+	/*
+	 * The privilege is named PRIV_NET_RAWACCESS, as used by the other
+	 * nopriv tests; there is no PRIV_SYS_NET_RAWACCESS.
+	 */
+	ps = priv_allocset();
+	assert(ps != NULL);
+	assert(priv_addset(ps, PRIV_NET_RAWACCESS) == 0);
+	assert(setppriv(PRIV_OFF, PRIV_PERMITTED, ps) == 0);
+
+	/*
+	 * Check asprintf()'s return value rather than path: the pointer is
+	 * not guaranteed to be NULL when the allocation fails.
+	 *
+	 * NOTE(review): unlike create.olinknopriv.c, no VND_IOC_LINK is issued
+	 * before this open. Confirm that /dev/vnd/<name> exists at this point
+	 * and that the open really fails with EPERM rather than ENOENT.
+	 */
+	ret = asprintf(&path, "/dev/vnd/%s", argv[1]);
+	assert(ret != -1);
+	fd2 = open(path, O_RDWR);
+	assert(fd2 == -1);
+	assert(errno == EPERM);
+
+	free(path);
+	assert(close(fd) == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.olink.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.olink.c
new file mode 100644
index 0000000000..0f9292bbae
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.olink.c
@@ -0,0 +1,77 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Open a /dev/vnd/%s link
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach and link the datalink named by argv[1] through the control device,
+ * then verify that the resulting /dev/vnd/<name> node can be opened read
+ * only. Returns 1 on a usage error and 0 once every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	char *path;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* The name must fit, with its terminator, in via_name/vil_name. */
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * Descriptor 0 is a perfectly good result when the harness runs us
+	 * with stdin closed, so only a negative value means failure.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* The link name reuses the datalink name. */
+	(void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	ret = asprintf(&path, "/dev/vnd/%s", argv[1]);
+	assert(ret != -1);
+
+	/* From here on ret holds the descriptor of the linked device. */
+	ret = open(path, O_RDONLY);
+	assert(ret >= 0);
+	assert(close(ret) == 0);
+	assert(close(fd) == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.olinknopriv.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.olinknopriv.c
new file mode 100644
index 0000000000..338218e751
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.olinknopriv.c
@@ -0,0 +1,83 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to open a /dev/vnd/%s without PRIV_NET_RAWACCESS
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <priv.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach and link the datalink named by argv[1] with full privileges, then
+ * remove PRIV_NET_RAWACCESS from the permitted set and verify that opening
+ * /dev/vnd/<name> fails with EPERM. Returns 1 on a usage error and 0 once
+ * every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	char *path;
+	priv_set_t *ps;
+	vnd_ioc_attach_t via;
+	vnd_ioc_link_t vil;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* The name must fit, with its terminator, in via_name/vil_name. */
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * Descriptor 0 is a perfectly good result when the harness runs us
+	 * with stdin closed, so only a negative value means failure.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* The link name reuses the datalink name. */
+	(void) strlcpy(vil.vil_name, argv[1], VND_NAMELEN);
+	vil.vil_errno = 0;
+	ret = ioctl(fd, VND_IOC_LINK, &vil);
+	assert(ret == 0);
+	assert(vil.vil_errno == 0);
+
+	ret = asprintf(&path, "/dev/vnd/%s", argv[1]);
+	assert(ret != -1);
+
+	/* Everything above ran privileged; only the open below must fail. */
+	ps = priv_allocset();
+	assert(ps != NULL);
+	assert(priv_addset(ps, PRIV_NET_RAWACCESS) == 0);
+	assert(setppriv(PRIV_OFF, PRIV_PERMITTED, ps) == 0);
+
+	ret = open(path, O_RDWR);
+	assert(ret == -1);
+	assert(errno == EPERM);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/create.rmenolink.c b/usr/src/cmd/vndadm/test/tst/ioctl/create.rmenolink.c
new file mode 100644
index 0000000000..d44e6512a7
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/create.rmenolink.c
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Verify that unlink fails when we're not linked.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Attach the datalink named by argv[1] without linking it, then verify that
+ * VND_IOC_UNLINK fails and reports VND_E_NOTLINKED. Returns 1 on a usage
+ * error and 0 once every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+	vnd_ioc_unlink_t viu;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* The name must fit, with its terminator, in via_name below. */
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * Descriptor 0 is a perfectly good result when the harness runs us
+	 * with stdin closed, so only a negative value means failure.
+	 */
+	fd = open(VND_PATH, O_RDWR);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == 0);
+	assert(via.via_errno == 0);
+
+	/* The failure reason comes back in viu_errno, not in errno. */
+	viu.viu_errno = 0;
+	ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+	assert(ret == -1);
+	assert(viu.viu_errno == VND_E_NOTLINKED);
+
+	assert(close(fd) == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.attachrdonly.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.attachrdonly.c
new file mode 100644
index 0000000000..29def6182d
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.attachrdonly.c
@@ -0,0 +1,63 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Fail to attach when /dev/vnd/ctl is opened read only.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Open the control device read only and verify that VND_IOC_ATTACH of the
+ * datalink named by argv[1] is refused with EBADF. Returns 1 on a usage error
+ * and 0 once every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+	int fd, ret;
+	vnd_ioc_attach_t via;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	/* The name must fit, with its terminator, in via_name below. */
+	if (strlen(argv[1]) >= VND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/*
+	 * Descriptor 0 is a perfectly good result when the harness runs us
+	 * with stdin closed, so only a negative value means failure.
+	 */
+	fd = open(VND_PATH, O_RDONLY);
+	assert(fd >= 0);
+
+	(void) strlcpy(via.via_name, argv[1], VND_NAMELEN);
+	via.via_zoneid = 0;
+	via.via_errno = 0;
+
+	/* The descriptor lacks write access, which the attach is refused for. */
+	ret = ioctl(fd, VND_IOC_ATTACH, &via);
+	assert(ret == -1);
+	assert(errno == EBADF);
+
+	assert(close(fd) == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.badioctl.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.badioctl.c
new file mode 100644
index 0000000000..f26722f035
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.badioctl.c
@@ -0,0 +1,79 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Throw a bunch of bad ioctls at us and make sure that we get ENOTTY.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <limits.h>
+#include <assert.h>
+
+/*
+ * We're including a bunch of unrelated header files whose ioctl numbers we
+ * know vnd should not accept.
+ */
+#include <sys/ipd.h>
+#include <sys/dtrace.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * A series of bad requests
+ */
+/*
+ * Request numbers that main() issues one by one. The list mixes arbitrary
+ * integers with ioctl numbers taken from the ipd and dtrace headers, and it
+ * ends with the -1 sentinel that main()'s loop stops on.
+ */
+static int requests[] = {
+ 0,
+ 1,
+ 42,
+ 169,
+ 4096,
+ INT_MAX,
+ IPDIOC_CORRUPT,
+ IPDIOC_REMOVE,
+ DTRACEIOC_CONF,
+ DTRACEIOC_REPLICATE,
+ -1
+};
+
+/*
+ * Open the control device read only and fire every entry of requests[] at it,
+ * insisting that each one is rejected with ENOTTY. Returns 1 if the device
+ * cannot be opened and 0 once every assertion has held.
+ */
+int
+main(void)
+{
+	int ctl, rval;
+	int *reqp;
+
+	ctl = open(VND_PATH, O_RDONLY);
+	if (ctl < 0) {
+		(void) fprintf(stderr, "failed to open %s read only: %s\n",
+		    VND_PATH, strerror(errno));
+		return (1);
+	}
+
+	/* Walk the table until its -1 sentinel. */
+	for (reqp = requests; *reqp != -1; reqp++) {
+		rval = ioctl(ctl, *reqp, NULL);
+		assert(rval == -1);
+		assert(errno == ENOTTY);
+	}
+
+	assert(close(ctl) == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.basicopenctl.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.basicopenctl.c
new file mode 100644
index 0000000000..852ad5550f
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.basicopenctl.c
@@ -0,0 +1,76 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Ensure that we can do a basic open of the device for read, write, and
+ * read/write.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <unistd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Open and close the control device three times in a row: read only, then
+ * read/write, then write only. The first step that fails is reported on
+ * stderr and ends the program with 1; otherwise the result is 0.
+ */
+int
+main(void)
+{
+	int ctl;
+
+	/* Pass 1: read only. */
+	if ((ctl = open(VND_PATH, O_RDONLY)) < 0) {
+		(void) fprintf(stderr, "failed to open %s read only: %s\n",
+		    VND_PATH, strerror(errno));
+		return (1);
+	}
+	if (close(ctl) != 0) {
+		(void) fprintf(stderr, "failed to close vnd fd: %s\n",
+		    strerror(errno));
+		return (1);
+	}
+
+	/* Pass 2: read/write. */
+	if ((ctl = open(VND_PATH, O_RDWR)) < 0) {
+		(void) fprintf(stderr, "failed to open %s read/write: %s\n",
+		    VND_PATH, strerror(errno));
+		return (1);
+	}
+	if (close(ctl) != 0) {
+		(void) fprintf(stderr, "failed to close vnd fd: %s\n",
+		    strerror(errno));
+		return (1);
+	}
+
+	/* Pass 3: write only. */
+	if ((ctl = open(VND_PATH, O_WRONLY)) < 0) {
+		(void) fprintf(stderr, "failed to open %s write only: %s\n",
+		    VND_PATH, strerror(errno));
+		return (1);
+	}
+	if (close(ctl) != 0) {
+		(void) fprintf(stderr, "failed to close vnd fd: %s\n",
+		    strerror(errno));
+		return (1);
+	}
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlfault.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlfault.c
new file mode 100644
index 0000000000..b581b5dd4c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlfault.c
@@ -0,0 +1,78 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Pass pointers to arbitrary addresses and make sure we properly get EFAULT for
+ * all the global ioctls.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <limits.h>
+#include <assert.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * Issue VND_IOC_ATTACH, VND_IOC_LIST and VND_IOC_GETMAXBUF on the control
+ * device, first with a NULL argument and then with small integers cast to
+ * pointers, and insist that each call fails with EFAULT. Returns 1 if the
+ * device cannot be opened and 0 once every assertion has held.
+ */
+int
+main(void)
+{
+ int fd, ret;
+ vnd_ioc_attach_t *via;
+ vnd_ioc_list_t *vil;
+ vnd_ioc_buf_t *vib;
+
+ fd = open(VND_PATH, O_RDWR);
+ if (fd < 0) {
+ (void) fprintf(stderr, "failed to open %s r/w: %s\n", VND_PATH,
+ strerror(errno));
+ return (1);
+ }
+
+ /* Deliberately bogus addresses; they are never dereferenced here. */
+ via = (vnd_ioc_attach_t *)(uintptr_t)23;
+ vil = (vnd_ioc_list_t *)(uintptr_t)42;
+ vib = (vnd_ioc_buf_t *)(uintptr_t)169;
+
+ /* Round one: NULL arguments. */
+ ret = ioctl(fd, VND_IOC_ATTACH, NULL);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+ ret = ioctl(fd, VND_IOC_LIST, NULL);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+ ret = ioctl(fd, VND_IOC_GETMAXBUF, NULL);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+
+ /* Round two: the non-NULL bogus pointers set up above. */
+ ret = ioctl(fd, VND_IOC_ATTACH, via);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+ ret = ioctl(fd, VND_IOC_LIST, vil);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+ ret = ioctl(fd, VND_IOC_GETMAXBUF, vib);
+ assert(ret == -1);
+ assert(errno == EFAULT);
+
+ assert(close(fd) == 0);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlnattach.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlnattach.c
new file mode 100644
index 0000000000..98acffa194
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.gioctlnattach.c
@@ -0,0 +1,100 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
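+ * Here we test that all the ioctls which require us to be on a local device
+ * fail to work on the control device. Link and the vnd_ioc_buf_t ioctls should
+ * report VND_E_NOTATTACHED, unlink should report VND_E_NOTLINKED, and the
+ * frameio ioctls should fail with ENXIO.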
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include <sys/vnd.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+/*
+ * The ioctls that main() issues with a vnd_ioc_buf_t argument. The list ends
+ * with the -1 sentinel that main()'s loop stops on.
+ */
+static int vib_ioc[] = {
+ VND_IOC_GETRXBUF,
+ VND_IOC_SETRXBUF,
+ VND_IOC_GETTXBUF,
+ VND_IOC_SETTXBUF,
+ VND_IOC_GETMINTU,
+ VND_IOC_GETMAXTU,
+ -1
+};
+
+/*
+ * Issue the link, unlink, vnd_ioc_buf_t and frameio ioctls on a freshly
+ * opened control device that was never attached, and check the error each one
+ * reports. Returns 1 if the device cannot be opened and 0 once every
+ * assertion has held.
+ */
+int
+main(void)
+{
+ int fd, ret, i;
+ vnd_ioc_link_t vil;
+ vnd_ioc_unlink_t viu;
+ vnd_ioc_buf_t vib;
+ frameio_t *fio;
+ char buf[1];
+
+ fd = open(VND_PATH, O_RDWR);
+ if (fd < 0) {
+ (void) fprintf(stderr, "failed to open %s r/w: %s\n", VND_PATH,
+ strerror(errno));
+ return (1);
+ }
+
+ /* Zeroed requests; the link name is the one-character string "a". */
+ bzero(&vil, sizeof (vnd_ioc_link_t));
+ vil.vil_name[0] = 'a';
+ bzero(&viu, sizeof (vnd_ioc_unlink_t));
+ bzero(&vib, sizeof (vnd_ioc_buf_t));
+ /* A frameio request carrying one vector that points at buf. */
+ fio = malloc(sizeof (frameio_t) + sizeof (framevec_t));
+ assert(fio != NULL);
+ fio->fio_version = FRAMEIO_CURRENT_VERSION;
+ fio->fio_nvpf = 1;
+ fio->fio_nvecs = 1;
+ fio->fio_vecs[0].fv_buf = buf;
+ fio->fio_vecs[0].fv_buflen = 1;
+
+ /*
+ * NOTE(review): for the link, unlink and vnd_ioc_buf_t ioctls only the
+ * per-structure errno is checked; ret is stored but never examined.
+ */
+ ret = ioctl(fd, VND_IOC_LINK, &vil);
+ assert(vil.vil_errno == VND_E_NOTATTACHED);
+ ret = ioctl(fd, VND_IOC_UNLINK, &viu);
+ assert(viu.viu_errno == VND_E_NOTLINKED);
+
+ /* vib is zeroed again before every request in the table. */
+ for (i = 0; vib_ioc[i] != -1; i++) {
+ bzero(&vib, sizeof (vib));
+ ret = ioctl(fd, vib_ioc[i], &vib);
+ assert(vib.vib_errno == VND_E_NOTATTACHED);
+ }
+
+ /* The frameio ioctls only use standard errnos */
+ ret = ioctl(fd, VND_IOC_FRAMEIO_READ, fio);
+ assert(ret == -1);
+ assert(errno == ENXIO);
+ ret = ioctl(fd, VND_IOC_FRAMEIO_WRITE, fio);
+ assert(ret == -1);
+ assert(errno == ENXIO);
+
+ free(fio);
+ assert(close(fd) == 0);
+
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.iocsize.ksh b/usr/src/cmd/vndadm/test/tst/ioctl/tst.iocsize.ksh
new file mode 100644
index 0000000000..9b30043d47
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.iocsize.ksh
@@ -0,0 +1,54 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# Ensure structure sizes for both ILP32 and LP64 are the same
+#
+
+# Script name, used as the prefix of error messages.
+vt_arg0=$(basename $0)
+# The vnd ioctl structure types this test is about, built up in two steps.
+vt_structs="vnd_ioc_attach_t vnd_ioc_link_t vnd_ioc_unlink_t"
+vt_structs="$vt_structs vnd_ioc_nonblock_t vnd_ioc_buf_t vnd_ioc_info_t"
+
+# Per-process scratch files: one for the 32-bit dump, one for the 64-bit dump.
+vt_t32="/tmp/vnd.iocsize.32.$$"
+vt_t64="/tmp/vnd.iocsize.64.$$"
+
+#
+# Print "<script name>: <message>" on stderr and exit with status 1. The
+# message is all of the arguments joined together, or "failed" when that
+# would be empty.
+#
+function fatal
+{
+	typeset text="$*"
+
+	if [[ -z "$text" ]]; then
+		text="failed"
+	fi
+	echo "$vt_arg0: $text" >&2
+	exit 1
+}
+
+#
+# Append mdb's "::print -at" output for every type named in $vt_structs, as
+# found in the object $2, to the file $1. Bails out through fatal as soon as
+# one mdb invocation fails.
+#
+function dump_types
+{
+	typeset file=$1
+	typeset lib=$2
+	typeset t
+
+	#
+	# Iterate over vt_structs, the list defined at the top of the script.
+	# The dcmd is passed to mdb as a single double-quoted word so that
+	# "::print -at <type>" arrives intact as the argument of -e.
+	#
+	for t in $vt_structs; do
+		mdb -e "::print -at $t" $lib >> $file || fatal \
+		    "failed to dump type $t from $lib"
+	done
+}
+
+# Start from empty scratch files so that dump_types can append to them.
+rm -f $vt_t32 $vt_t64 || fatal "failed to cleanup old temp files"
+touch $vt_t32 $vt_t64 || fatal "failed to create temp files"
+
+# Dump from the 32-bit library first, then from the 64-bit one.
+dump_types $vt_t32 /usr/lib/libvnd.so.1
+dump_types $vt_t64 /usr/lib/64/libvnd.so.1
+
+# diff is the last command, so its exit status is the script's exit status.
+# NOTE(review): the scratch files are not removed afterwards.
+diff $vt_t32 $vt_t64
diff --git a/usr/src/cmd/vndadm/test/tst/ioctl/tst.openctlbadflags.c b/usr/src/cmd/vndadm/test/tst/ioctl/tst.openctlbadflags.c
new file mode 100644
index 0000000000..65e48029b7
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/ioctl/tst.openctlbadflags.c
@@ -0,0 +1,88 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure that we can't open the vnd control device with invalid flags.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+
+#define VND_PATH "/dev/vnd/ctl"
+
+int
+main(void)
+{
+ int fd;
+
+ fd = open(VND_PATH, O_RDONLY | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_EXCL!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDWR | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_EXCL!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_WRONLY | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_EXCL!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDONLY | O_NDELAY);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDWR | O_NDELAY);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_WRONLY | O_NDELAY);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDONLY | O_NDELAY | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY | O_EXCL!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_RDWR | O_NDELAY | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY | O_EXCL!");
+ return (1);
+ }
+
+ fd = open(VND_PATH, O_WRONLY | O_NDELAY | O_EXCL);
+ if (fd != -1) {
+ (void) fprintf(stderr, "somehow opened vnd O_NDELAY | O_EXCL!");
+ return (1);
+ }
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/Makefile b/usr/src/cmd/vndadm/test/tst/lib/Makefile
new file mode 100644
index 0000000000..d7a1ed8fa5
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/Makefile
@@ -0,0 +1,44 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Name of this test directory.
+# NOTE(review): presumably consumed by ../../Makefile.com -- confirm.
+TSTDIR = lib
+# The test executables of this directory, one per line.
+EXETESTS = \
+ create.basic.exe \
+ create.badlink.exe \
+ create.badpropid.exe \
+ create.badpropsize.exe \
+ create.badzone.exe \
+ create.enomem.exe \
+ create.frameioeagain.exe \
+ create.open.exe \
+ create.propiter.exe \
+ create.proprdonly.exe \
+ err.badclose.exe \
+ tst.badopen.exe \
+ tst.strerror.exe \
+ tst.strsyserror.exe
+# Only tst.strerror.exe has an accompanying .out file.
+OUTFILES = tst.strerror.exe.out
+# There are no shell tests in this directory.
+SHTESTS =
+# NOTE(review): spelled SUPBOBJS here; confirm this is the name that
+# Makefile.com and Makefile.targ actually look at.
+SUPBOBJS =
+
+# The executables listed above are removed on clobber.
+CLOBBERFILES = $(EXETESTS)
+
+# Included after the lists above have been set.
+include ../../Makefile.com
+
+# Link against libvnd.
+LDLIBS += -lvnd
+
+# ROOTTESTS is not set in this file; presumably Makefile.com provides it.
+install: $(ROOTTESTS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.badlink.c b/usr/src/cmd/vndadm/test/tst/lib/create.badlink.c
new file mode 100644
index 0000000000..aefec3ed44
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.badlink.c
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure that we can't create something in the context of a datalink that
+ * doesn't exist.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <libvnd.h>
+
+/*
+ * Call vnd_create() with a NULL zone and the name "foobar" and verify that it
+ * returns NULL with VND_E_NODATALINK and a system errno of 0. Returns 0 once
+ * every assertion has held.
+ */
+int
+main(void)
+{
+ int syserr;
+ vnd_errno_t vnderr;
+ vnd_handle_t *vhp;
+
+ vhp = vnd_create(NULL, "foobar", "foobar", &vnderr, &syserr);
+ /* The two error values are printed before any of them is checked. */
+ (void) printf("%d, %d\n", vnderr, syserr);
+ assert(vhp == NULL);
+ assert(vnderr == VND_E_NODATALINK);
+ assert(syserr == 0);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.badpropid.c b/usr/src/cmd/vndadm/test/tst/lib/create.badpropid.c
new file mode 100644
index 0000000000..15334fa31c
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.badpropid.c
@@ -0,0 +1,76 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure that we can't get and set nonexisting properties.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device named after argv[1] and verify that getting, setting
+ * and querying writeability of the property ids VND_PROP_MAX and
+ * VND_PROP_MAX + 5 all fail. Returns 1 on a usage error and 0 once every
+ * assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+ int syserr, ret;
+ vnd_errno_t vnderr;
+ vnd_handle_t *vhp;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "missing arguments...\n");
+ return (1);
+ }
+
+ if (strlen(argv[1]) >= LIBVND_NAMELEN) {
+ (void) fprintf(stderr, "vnic name too long...\n");
+ return (1);
+ }
+
+ /* argv[1] serves as both name arguments of vnd_create(). */
+ vhp = vnd_create(NULL, argv[1], argv[1], &vnderr, &syserr);
+ assert(vhp != NULL);
+ assert(vnderr == 0);
+ assert(syserr == 0);
+
+ /* Get: VND_E_BADPROP with a system errno of 0 is expected each time. */
+ ret = vnd_prop_get(vhp, VND_PROP_MAX, NULL, 0);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROP);
+ assert(vnd_syserrno(vhp) == 0);
+
+ ret = vnd_prop_get(vhp, VND_PROP_MAX + 5, NULL, 0);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROP);
+ assert(vnd_syserrno(vhp) == 0);
+
+ /* Set: same expectations as for get. */
+ ret = vnd_prop_set(vhp, VND_PROP_MAX, NULL, 0);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROP);
+ assert(vnd_syserrno(vhp) == 0);
+
+ ret = vnd_prop_set(vhp, VND_PROP_MAX + 5, NULL, 0);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROP);
+ assert(vnd_syserrno(vhp) == 0);
+
+ /* vnd_prop_writeable() takes no handle; only its return is checked. */
+ ret = vnd_prop_writeable(VND_PROP_MAX, NULL);
+ assert(ret == -1);
+
+ ret = vnd_prop_writeable(VND_PROP_MAX + 5, NULL);
+ assert(ret == -1);
+
+ vnd_close(vhp);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.badpropsize.c b/usr/src/cmd/vndadm/test/tst/lib/create.badpropsize.c
new file mode 100644
index 0000000000..d5fefd3764
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.badpropsize.c
@@ -0,0 +1,63 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Validate that we can't set properties with bogus sizes.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <limits.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device named after argv[1] and verify that, for every property
+ * id below VND_PROP_MAX, a get or set with a NULL buffer and a size of
+ * INT32_MAX fails with VND_E_BADPROPSIZE. Returns 1 on a usage error and 0
+ * once every assertion has held.
+ */
+int
+main(int argc, const char *argv[])
+{
+ int syserr, ret, i;
+ vnd_errno_t vnderr;
+ vnd_handle_t *vhp;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "missing arguments...\n");
+ return (1);
+ }
+
+ if (strlen(argv[1]) >= LIBVND_NAMELEN) {
+ (void) fprintf(stderr, "vnic name too long...\n");
+ return (1);
+ }
+
+ /* argv[1] serves as both name arguments of vnd_create(). */
+ vhp = vnd_create(NULL, argv[1], argv[1], &vnderr, &syserr);
+ assert(vhp != NULL);
+ assert(vnderr == 0);
+ assert(syserr == 0);
+
+ /* The system errno is expected to stay 0 for these failures. */
+ for (i = 0; i < VND_PROP_MAX; i++) {
+ ret = vnd_prop_get(vhp, i, NULL, INT32_MAX);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROPSIZE);
+ assert(vnd_syserrno(vhp) == 0);
+
+ ret = vnd_prop_set(vhp, i, NULL, INT32_MAX);
+ assert(ret == -1);
+ assert(vnd_errno(vhp) == VND_E_BADPROPSIZE);
+ assert(vnd_syserrno(vhp) == 0);
+ }
+
+ vnd_close(vhp);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.badzone.c b/usr/src/cmd/vndadm/test/tst/lib/create.badzone.c
new file mode 100644
index 0000000000..30f9612963
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.badzone.c
@@ -0,0 +1,43 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure that we can't create something in the context of a zone that
+ * doesn't exist.
+ */
+
+#include <assert.h>
+#include <sys/zone.h>
+#include <string.h>
+#include <libvnd.h>
+
+/*
+ * Ask vnd_create() to work in a zone whose name is ZONENAME_MAX + 3 'a'
+ * characters long and verify that it fails with VND_E_NOZONE and no system
+ * error.
+ */
+int
+main(void)
+{
+	char zname[ZONENAME_MAX+4];
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+
+	/* Fill all but the final byte, which becomes the terminator. */
+	(void) memset(zname, 'a', sizeof (zname) - 1);
+	zname[sizeof (zname) - 1] = '\0';
+
+	vhp = vnd_create(zname, "foobar", "foobar", &vnderr, &syserr);
+	assert(vhp == NULL);
+	assert(vnderr == VND_E_NOZONE);
+	assert(syserr == 0);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.basic.c b/usr/src/cmd/vndadm/test/tst/lib/create.basic.c
new file mode 100644
index 0000000000..5335f8cbb4
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.basic.c
@@ -0,0 +1,49 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Simple create and destroy.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device over the datalink named by argv[1] (using the same name
+ * for the vnd link) and close it again.
+ *
+ * Returns 0 on success and 1 on bad arguments; a failed check aborts via
+ * assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	const char *link;
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	link = argv[1];
+	if (strlen(link) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	vhp = vnd_create(NULL, link, link, &vnderr, &syserr);
+	assert(vhp != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+	vnd_close(vhp);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.enomem.c b/usr/src/cmd/vndadm/test/tst/lib/create.enomem.c
new file mode 100644
index 0000000000..9203e369ae
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.enomem.c
@@ -0,0 +1,91 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Verify that we can't allocate a handle when in an ENOMEM situation.
+ */
+
+#include <procfs.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/sysmacros.h>
+#include <assert.h>
+#include <strings.h>
+
+#include <libvnd.h>
+
+/*
+ * Exhaust malloc() and then verify that vnd_create() fails with VND_E_NOMEM
+ * and no system error. argv[1] is used as both the datalink and the link name.
+ *
+ * Returns 0 on success and 1 on bad arguments; setup failures exit(1) and a
+ * failed check aborts via assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+ int fd;
+ int syserr;
+ vnd_errno_t vnderr;
+ vnd_handle_t *vhp;
+ pstatus_t status;
+ void *addr;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "missing arguments...\n");
+ return (1);
+ }
+
+ if (strlen(argv[1]) >= LIBVND_NAMELEN) {
+ (void) fprintf(stderr, "vnic name too long...\n");
+ return (1);
+ }
+
+ /* Read our own pstatus_t to learn where the break currently ends. */
+ fd = open("/proc/self/status", O_RDONLY);
+ if (fd < 0)
+ exit(1);
+ if (read(fd, &status, sizeof (status)) != sizeof (status))
+ exit(1);
+
+ /*
+ * Map one read-only anonymous page at a fixed address: the end of the
+ * break (pr_brkbase + pr_brksize) rounded up to 0x1000. Presumably this
+ * stops the heap from growing beyond that point -- TODO confirm.
+ */
+ addr = mmap((caddr_t)P2ROUNDUP(status.pr_brkbase +
+ status.pr_brksize, 0x1000), 0x1000,
+ PROT_READ, MAP_ANON | MAP_FIXED | MAP_PRIVATE, -1, 0);
+ if (addr == (void *)-1) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* malloc an approximate size of the vnd_handle_t */
+ /* Allocate 8-byte blocks until malloc() fails; none are freed. */
+ for (;;) {
+ void *buf;
+
+ buf = malloc(8);
+ if (buf == NULL)
+ break;
+ }
+
+ /* Repeat with 4-byte blocks until malloc() fails again. */
+ for (;;) {
+ void *buf;
+
+ buf = malloc(4);
+ if (buf == NULL)
+ break;
+ }
+
+ /* With malloc() failing, the create must report VND_E_NOMEM. */
+ vhp = vnd_create(NULL, argv[1], argv[1], &vnderr, &syserr);
+ assert(vhp == NULL);
+ assert(vnderr == VND_E_NOMEM);
+ assert(syserr == 0);
+
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.frameioeagain.c b/usr/src/cmd/vndadm/test/tst/lib/create.frameioeagain.c
new file mode 100644
index 0000000000..6cb14fb7df
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.frameioeagain.c
@@ -0,0 +1,80 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Create a datalink, set it to non-blocking mode and ensure that we get EAGAIN
+ * from frame I/O calls. Note that if this test is not plumbed up over an
+ * etherstub, then it is likely that other traffic will appear on the device and
+ * this will fail. Note that the test suite always creates these devices over an
+ * etherstub.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device named by argv[1], switch its pollable descriptor to
+ * non-blocking mode and verify that a frame I/O read fails with EAGAIN.
+ *
+ * Returns 0 on success and 1 on bad arguments; a failed check aborts via
+ * assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	int syserr, ret, fd, rerrno;
+	vnd_errno_t vnderr;
+	vnd_handle_t *vhp;
+	frameio_t *fio;
+	char buf[1520];
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	vhp = vnd_create(NULL, argv[1], argv[1], &vnderr, &syserr);
+	assert(vhp != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+
+	fd = vnd_pollfd(vhp);
+	ret = fcntl(fd, F_SETFL, O_NONBLOCK);
+	assert(ret == 0);
+
+	/* One frameio_t header followed by a single vector. */
+	fio = malloc(sizeof (frameio_t) +
+	    sizeof (framevec_t));
+	assert(fio != NULL);
+	fio->fio_version = FRAMEIO_CURRENT_VERSION;
+	fio->fio_nvpf = 1;
+	fio->fio_nvecs = 1;
+
+	fio->fio_vecs[0].fv_buf = buf;
+	fio->fio_vecs[0].fv_buflen = sizeof (buf);
+
+	ret = vnd_frameio_read(vhp, fio);
+	/*
+	 * Capture errno right away: printf() is allowed to modify it, so
+	 * checking errno after the printf() could test the wrong value.
+	 */
+	rerrno = errno;
+	(void) printf("%d, %d\n", ret, rerrno);
+	assert(ret == -1);
+	assert(rerrno == EAGAIN);
+
+	vnd_close(vhp);
+	free(fio);
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.open.c b/usr/src/cmd/vndadm/test/tst/lib/create.open.c
new file mode 100644
index 0000000000..9cb1d7e40e
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.open.c
@@ -0,0 +1,56 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure we can open a created datalink.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device named by argv[1] and verify that the freshly created
+ * link can also be opened with vnd_open() while the first handle is held.
+ *
+ * Returns 0 on success and 1 on bad arguments; a failed check aborts via
+ * assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	const char *link;
+	vnd_handle_t *vhp, *vhp2;
+	vnd_errno_t vnderr;
+	int syserr;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	link = argv[1];
+	if (strlen(link) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	vhp = vnd_create(NULL, link, link, &vnderr, &syserr);
+	assert(vhp != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+
+	vhp2 = vnd_open(NULL, link, &vnderr, &syserr);
+	assert(vhp2 != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+
+	/* Release the handles in the reverse order of acquisition. */
+	vnd_close(vhp2);
+	vnd_close(vhp);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.propiter.c b/usr/src/cmd/vndadm/test/tst/lib/create.propiter.c
new file mode 100644
index 0000000000..a0b46180f7
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.propiter.c
@@ -0,0 +1,79 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Ensure that vnd_prop_iter sees all props.
+ */
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <libvnd.h>
+
+static boolean_t *g_props;
+
+/*
+ * vnd_prop_iter() callback: record in g_props that 'prop' was visited.
+ * Asserts that 'prop' is below VND_PROP_MAX before using it as an index.
+ * Always returns 0; 'vhp' and 'unused' are not examined.
+ */
+/* ARGSUSED */
+static int
+prop_cb(vnd_handle_t *vhp, vnd_prop_t prop, void *unused)
+{
+ assert(prop < VND_PROP_MAX);
+ g_props[prop] = B_TRUE;
+
+ return (0);
+}
+
+/*
+ * Create a vnd device named by argv[1], iterate its properties with
+ * vnd_prop_iter() and verify that the callback was invoked for every property
+ * index below VND_PROP_MAX.
+ *
+ * Returns 0 on success and 1 on bad arguments or allocation failure; a failed
+ * check aborts via assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	const char *link;
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr, ret, i;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	link = argv[1];
+	if (strlen(link) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	/* One "seen" flag per property, all initially clear. */
+	g_props = malloc(sizeof (boolean_t) * VND_PROP_MAX);
+	if (g_props == NULL) {
+		(void) fprintf(stderr, "failed to alloc memory for %d "
+		    "boolean_t\n", VND_PROP_MAX);
+		return (1);
+	}
+	i = 0;
+	while (i < VND_PROP_MAX) {
+		g_props[i] = B_FALSE;
+		i++;
+	}
+
+	vhp = vnd_create(NULL, link, link, &vnderr, &syserr);
+	assert(vhp != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+
+	ret = vnd_prop_iter(vhp, prop_cb, NULL);
+	assert(ret == 0);
+
+	/* Every flag must have been set by the callback. */
+	i = 0;
+	while (i < VND_PROP_MAX) {
+		assert(g_props[i] == B_TRUE);
+		i++;
+	}
+
+	free(g_props);
+	vnd_close(vhp);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/create.proprdonly.c b/usr/src/cmd/vndadm/test/tst/lib/create.proprdonly.c
new file mode 100644
index 0000000000..18b1f7d58d
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/create.proprdonly.c
@@ -0,0 +1,63 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Validate that we can't set read only properties
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <limits.h>
+#include <libvnd.h>
+
+/*
+ * Create a vnd device named by argv[1], read VND_PROP_MINTU and verify that
+ * writing the same value back is refused with VND_E_PROPRDONLY and no system
+ * error.
+ *
+ * Returns 0 on success and 1 on bad arguments; a failed check aborts via
+ * assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	const char *link;
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	vnd_prop_buf_t vpb;
+	int syserr, ret;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	link = argv[1];
+	if (strlen(link) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	vhp = vnd_create(NULL, link, link, &vnderr, &syserr);
+	assert(vhp != NULL);
+	assert(vnderr == 0);
+	assert(syserr == 0);
+
+	/* Reading the property is expected to work ... */
+	ret = vnd_prop_get(vhp, VND_PROP_MINTU, &vpb, sizeof (vpb));
+	assert(ret == 0);
+
+	/* ... but setting it, even to its current value, must not. */
+	ret = vnd_prop_set(vhp, VND_PROP_MINTU, &vpb, sizeof (vpb));
+	assert(ret == -1);
+	assert(vnd_errno(vhp) == VND_E_PROPRDONLY);
+	assert(vnd_syserrno(vhp) == 0);
+
+	vnd_close(vhp);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/err.badclose.c b/usr/src/cmd/vndadm/test/tst/lib/err.badclose.c
new file mode 100644
index 0000000000..8c832506a0
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/err.badclose.c
@@ -0,0 +1,33 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * This program should segfault.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <libvnd.h>
+
+/*
+ * Pass a bogus, non-NULL handle (0x42) to vnd_close(). As this file's header
+ * comment states, the program is expected to segfault; returning 0 therefore
+ * means the expected crash did not happen.
+ */
+int
+main(void)
+{
+ vnd_handle_t *vhp = (void *)0x42;
+ vnd_close(vhp);
+ /* This should not be reached */
+ return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/tst.badopen.c b/usr/src/cmd/vndadm/test/tst/lib/tst.badopen.c
new file mode 100644
index 0000000000..4f67ce79ed
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/tst.badopen.c
@@ -0,0 +1,49 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Make sure we can't open a vnd device that doesn't exist
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <strings.h>
+#include <assert.h>
+#include <libvnd.h>
+
+/*
+ * Try to open the vnd link named by argv[1], which is expected not to exist,
+ * and verify that vnd_open() fails with VND_E_SYS and a system error of
+ * ENOENT.
+ *
+ * Returns 0 on success and 1 on bad arguments; a failed check aborts via
+ * assert().
+ */
+int
+main(int argc, const char *argv[])
+{
+	const char *link;
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "missing arguments...\n");
+		return (1);
+	}
+
+	link = argv[1];
+	if (strlen(link) >= LIBVND_NAMELEN) {
+		(void) fprintf(stderr, "vnic name too long...\n");
+		return (1);
+	}
+
+	vhp = vnd_open(NULL, link, &vnderr, &syserr);
+	assert(vhp == NULL);
+	assert(vnderr == VND_E_SYS);
+	assert(syserr == ENOENT);
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.c b/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.c
new file mode 100644
index 0000000000..a99a9ecbf6
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.c
@@ -0,0 +1,30 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Verify that all the error strings we care about match what we expect.
+ */
+
+#include <stdio.h>
+#include <libvnd.h>
+
+/*
+ * Print the vnd_strerror() string for every error number from 0 through
+ * VND_E_UNKNOWN + 1 inclusive, one per line and wrapped in square brackets.
+ */
+int
+main(void)
+{
+	int err = 0;
+
+	while (err <= VND_E_UNKNOWN + 1) {
+		(void) printf("[%s]\n", vnd_strerror(err));
+		err++;
+	}
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.exe.out b/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.exe.out
new file mode 100644
index 0000000000..83dbcdfdb4
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/tst.strerror.exe.out
@@ -0,0 +1,37 @@
+[no error]
+[not enough memory available]
+[no such datalink]
+[datalink not of type DL_ETHER]
+[unknown dlpi failure]
+[DL_ATTACH_REQ failed]
+[DL_BIND_REQ failed]
+[DL_PROMISCON_REQ failed]
+[DLD_CAPAB_DIRECT enable failed]
+[bad datalink capability]
+[bad datalink subcapability]
+[bad dld version]
+[failed to create kstats]
+[no such vnd link]
+[netstack doesn't exist]
+[device already associated]
+[device already attached]
+[device already linked]
+[invalid name]
+[permission denied]
+[no such zone]
+[failed to initialize vnd stream module]
+[device not attached]
+[device not linked]
+[another device has the same link name]
+[failed to create minor node]
+[requested buffer size is too large]
+[requested buffer size is too small]
+[unable to obtain exclusive access to dlpi link, link busy]
+[DLD direct capability not supported over data link]
+[invalid property size]
+[invalid property]
+[property is read only]
+[unexpected system error]
+[capabilities invalid, pass-through module detected]
+[unknown error]
+[unknown error]
diff --git a/usr/src/cmd/vndadm/test/tst/lib/tst.strsyserror.c b/usr/src/cmd/vndadm/test/tst/lib/tst.strsyserror.c
new file mode 100644
index 0000000000..b95e6372e4
--- /dev/null
+++ b/usr/src/cmd/vndadm/test/tst/lib/tst.strsyserror.c
@@ -0,0 +1,50 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Verify that the error message from libvnd's vnd_strsyserror is the same as
+ * the underlying strerror function's. It should be. We'll just check an
+ * assortment of errnos.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <libvnd.h>
+
+/*
+ * Compare vnd_strsyserror() with strerror() for errno values 0 through 41.
+ *
+ * Returns 0 when the two agree for every value. Returns 1, after describing
+ * the mismatch on stderr, when exactly one of them returns NULL or when the
+ * two strings differ.
+ */
+int
+main(void)
+{
+	int i;
+	const char *vnd, *libc;
+
+	for (i = 0; i < 42; i++) {
+		vnd = vnd_strsyserror(i);
+		libc = strerror(i);
+
+		/* Exactly one side being NULL is already a disagreement. */
+		if ((vnd == NULL) != (libc == NULL)) {
+			/* Newline-terminated like every other diagnostic. */
+			(void) fprintf(stderr, "errno %d, vnd: %p, libc: %p\n",
+			    i, (void *)vnd, (void *)libc);
+			return (1);
+		}
+
+		if (vnd != NULL && strcmp(vnd, libc) != 0) {
+			(void) fprintf(stderr,
+			    "errno %d: libc and vnd disagree.\n", i);
+			(void) fprintf(stderr, "vnd: %s\n", vnd);
+			(void) fprintf(stderr, "libc: %s\n", libc);
+			return (1);
+		}
+	}
+
+	return (0);
+}
diff --git a/usr/src/cmd/vndadm/vndadm.c b/usr/src/cmd/vndadm/vndadm.c
new file mode 100644
index 0000000000..6811663696
--- /dev/null
+++ b/usr/src/cmd/vndadm/vndadm.c
@@ -0,0 +1,872 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <assert.h>
+#include <libgen.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <zone.h>
+
+#include <libvnd.h>
+
+/*
+ * Property printer: writes the value of the given property of an open handle.
+ * The implementations in this file return 0 on success and 1 on failure.
+ */
+typedef int (*vndadm_print_t)(vnd_handle_t *, vnd_prop_t);
+/*
+ * Property parser: converts a user-supplied string into a buffer and a size,
+ * stored through the two output pointers. The implementation in this file
+ * returns 0 on success and 1 on failure.
+ */
+typedef int (*vndadm_parse_t)(char *, void **, size_t *);
+
+/* One entry of the property name table. */
+typedef struct vndadm_proptbl {
+ const char *vp_name; /* property name */
+ vndadm_print_t vp_print; /* value printer */
+ vndadm_parse_t vp_parse; /* value parser, NULL for some entries */
+} vndadm_proptbl_t;
+
+/*
+ * Forwards
+ */
+static int usage(const char *, ...);
+static int vndadm_print_size(vnd_handle_t *, vnd_prop_t);
+static int vndadm_print_number(vnd_handle_t *, vnd_prop_t);
+static int vndadm_parse_size(char *, void **, size_t *);
+
+/*
+ * Globals
+ */
+static char *vnd_pname;
+
+/*
+ * Core warning routine: print "<program name>: <formatted message>" on stderr.
+ *
+ * When 'format' contains no newline, the message is completed with ": " and an
+ * error string followed by a newline. The string comes from vnd_strsyserror()
+ * for 'syserr' when 'verr' is VND_E_SYS and from vnd_strerror() for 'verr'
+ * otherwise.
+ */
+static void
+vnd_vwarn(vnd_errno_t verr, int syserr, const char *format, va_list alist)
+{
+	const char *reason;
+
+	(void) fprintf(stderr, "%s: ", vnd_pname);
+	(void) vfprintf(stderr, format, alist);
+
+	/* A newline in the format means the caller wrote the whole line. */
+	if (strchr(format, '\n') != NULL)
+		return;
+
+	if (verr == VND_E_SYS)
+		reason = vnd_strsyserror(syserr);
+	else
+		reason = vnd_strerror(verr);
+
+	(void) fprintf(stderr, ": %s\n", reason);
+}
+
+/*
+ * Warn about a libvnd failure: printf-style front end to vnd_vwarn() that
+ * forwards the libvnd error 'verr' and the system error 'syserr'.
+ */
+static void
+vnd_libwarn(vnd_errno_t verr, int syserr, const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	vnd_vwarn(verr, syserr, format, ap);
+	va_end(ap);
+}
+
+/*
+ * Warn without an associated library error: printf-style front end to
+ * vnd_vwarn() that passes 0 for both the libvnd and the system error.
+ */
+static void
+vnd_warn(const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	vnd_vwarn(0, 0, format, ap);
+	va_end(ap);
+}
+
+/*
+ * Property table, indexed by vnd_prop_t as noted on each row. Each row gives
+ * the property name, its printer and its parser; only rxbuf and txbuf have a
+ * parser. The trailing NULL row occupies index VND_PROP_MAX, so its vp_name is
+ * NULL.
+ */
+static vndadm_proptbl_t vndadm_propname_tbl[] = {
+ { "rxbuf", vndadm_print_size,
+ vndadm_parse_size }, /* VND_PROP_RXBUF */
+ { "txbuf", vndadm_print_size,
+ vndadm_parse_size }, /* VND_PROP_TXBUF */
+ { "maxsize", vndadm_print_size, NULL }, /* VND_PROP_MAXBUF */
+ { "mintu", vndadm_print_number, NULL }, /* VND_PROP_MINTU */
+ { "maxtu", vndadm_print_number, NULL }, /* VND_PROP_MAXTU */
+ NULL /* VND_PROP_MAX */
+};
+
+/*
+ * Map a property to its name in vndadm_propname_tbl.
+ *
+ * Returns NULL for any value above VND_PROP_MAX; otherwise returns the table
+ * entry's vp_name.
+ */
+static const char *
+vndadm_prop_to_name(vnd_prop_t prop)
+{
+	return (prop > VND_PROP_MAX ? NULL :
+	    vndadm_propname_tbl[prop].vp_name);
+}
+
+/*
+ * Map a property name to its index in vndadm_propname_tbl using an exact,
+ * case-sensitive comparison.
+ *
+ * Returns the matching index, or VND_PROP_MAX when no entry has that name.
+ */
+static vnd_prop_t
+vndadm_name_to_prop(const char *name)
+{
+	int idx = 0;
+
+	while (idx < VND_PROP_MAX) {
+		if (strcmp(name, vndadm_propname_tbl[idx].vp_name) == 0)
+			return (idx);
+		idx++;
+	}
+
+	return (VND_PROP_MAX);
+}
+
+/*
+ * Fetch property 'prop' from 'vhp' and print its vpb_size member to stdout as
+ * a decimal number without a trailing newline.
+ *
+ * Returns 0 on success; on failure a warning is issued and 1 is returned.
+ */
+static int
+vndadm_print_size(vnd_handle_t *vhp, vnd_prop_t prop)
+{
+	vnd_prop_buf_t vpb;
+
+	if (vnd_prop_get(vhp, prop, &vpb, sizeof (vpb)) == 0) {
+		(void) printf("%lld", vpb.vpb_size);
+		return (0);
+	}
+
+	vnd_libwarn(vnd_errno(vhp), vnd_syserrno(vhp),
+	    "failed to get property %s", vndadm_prop_to_name(prop));
+	return (1);
+}
+
+/*
+ * Fetch property 'prop' from 'vhp' and print its vpb_size member to stdout as
+ * a decimal number without a trailing newline.
+ *
+ * Returns 0 on success; on failure a warning is issued and 1 is returned.
+ */
+static int
+vndadm_print_number(vnd_handle_t *vhp, vnd_prop_t prop)
+{
+	vnd_prop_buf_t vpb;
+
+	if (vnd_prop_get(vhp, prop, &vpb, sizeof (vpb)) == 0) {
+		(void) printf("%lld", vpb.vpb_size);
+		return (0);
+	}
+
+	vnd_libwarn(vnd_errno(vhp), vnd_syserrno(vhp),
+	    "failed to get property %s", vndadm_prop_to_name(prop));
+	return (1);
+}
+
+/*
+ * Parse a size such as "4096", "64k", "2M" or "1gb" into a freshly allocated
+ * vnd_prop_buf_t.
+ *
+ * The string is a decimal number optionally followed by one of k, m or g (in
+ * either case, each a further factor of 1024) and then an optional 'b' or
+ * 'B'. Anything else, including a string with no digits at all, is rejected.
+ *
+ * str    - text to parse
+ * bufp   - on success, set to the malloc()ed buffer, which the caller owns
+ * sizep  - on success, set to the size of that buffer
+ *
+ * Returns 0 on success. On failure a warning is printed, 1 is returned and
+ * neither output is touched.
+ */
+static int
+vndadm_parse_size(char *str, void **bufp, size_t *sizep)
+{
+	char *end;
+	unsigned long long val;
+	/* Largest value that can still be multiplied by 1024 safely. */
+	const unsigned long long limit = ~0ULL / 1024;
+	int scale = 0;
+	vnd_prop_buf_t *buf;
+
+	errno = 0;
+	val = strtoull(str, &end, 10);
+	/* end == str means no digits were consumed, e.g. "" or "k". */
+	if (errno != 0 || end == str) {
+		vnd_warn("%s: not a number\n", str);
+		return (1);
+	}
+
+	/* The suffix tells us how many times to multiply by 1024. */
+	switch (*end) {
+	case 'g':
+	case 'G':
+		scale = 3;
+		break;
+	case 'm':
+	case 'M':
+		scale = 2;
+		break;
+	case 'k':
+	case 'K':
+		scale = 1;
+		break;
+	default:
+		break;
+	}
+	if (scale != 0)
+		end++;
+
+	/*
+	 * Check against the limit before each multiplication. Comparing the
+	 * product with the original value afterwards misses wraps that land
+	 * above the original, e.g. 3 * 2^53 scaled by 1024.
+	 */
+	while (scale-- > 0) {
+		if (val > limit)
+			goto overflow;
+		val *= 1024;
+	}
+
+	if (*end == 'b' || *end == 'B')
+		end++;
+	if (*end != '\0') {
+		/*
+		 * The format must end in a newline, otherwise vnd_vwarn()
+		 * appends the string for error 0 to the message.
+		 */
+		vnd_warn("%s: not a number\n", str);
+		return (1);
+	}
+
+	buf = malloc(sizeof (vnd_prop_buf_t));
+	if (buf == NULL) {
+		vnd_warn("failed to allocate memory for setting a property\n");
+		return (1);
+	}
+
+	buf->vpb_size = val;
+	*bufp = buf;
+	*sizep = sizeof (vnd_prop_buf_t);
+
+	return (0);
+
+overflow:
+	vnd_warn("value overflowed: %s\n", str);
+	return (1);
+}
+
+/*
+ * Print the usage line for the create subcommand to 'out'.
+ *
+ * Both options are shown as optional and the link name as the single required
+ * operand, matching vndadm_create(), which falls back to the link name when
+ * -l is not given.
+ */
+static void
+vndadm_create_usage(FILE *out)
+{
+	(void) fprintf(out, "\tcreate:\t\t[-z zonename] [-l datalink] link\n");
+}
+
+/*
+ * Implement the create subcommand.
+ *
+ * Options: -l names the datalink to create the device over and -z names the
+ * zone. Exactly one operand, the link name, is required; when -l is absent the
+ * link name doubles as the datalink name.
+ *
+ * Returns 0 on success, 1 when the device cannot be created, and the value of
+ * usage() for malformed command lines.
+ */
+static int
+vndadm_create(int argc, char *argv[])
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int c, syserr;
+	const char *zonename = NULL;
+	const char *datalink = NULL;
+	const char *linkname = NULL;
+
+	optind = 0;
+	for (;;) {
+		c = getopt(argc, argv, ":z:l:");
+		if (c == -1)
+			break;
+
+		if (c == 'l') {
+			datalink = optarg;
+		} else if (c == 'z') {
+			zonename = optarg;
+		} else if (c == ':') {
+			return (usage("-%c requires an operand\n", optopt));
+		} else if (c == '?') {
+			return (usage("unknown option: -%c\n", optopt));
+		} else {
+			abort();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1)
+		return (usage("missing required link name\n"));
+	if (argc > 1) {
+		return (usage("create: too many arguments for link name, "
+		    "pick one\n"));
+	}
+
+	linkname = argv[0];
+	if (datalink == NULL)
+		datalink = linkname;
+
+	vhp = vnd_create(zonename, datalink, linkname, &vnderr, &syserr);
+	if (vhp != NULL) {
+		/* Only the creation matters; the handle is not kept. */
+		vnd_close(vhp);
+		return (0);
+	}
+
+	vnd_libwarn(vnderr, syserr,
+	    "failed to create datapath link %s", linkname);
+	return (1);
+}
+
+/* Write the usage line for the destroy subcommand to 'out'. */
+static void
+vndadm_destroy_usage(FILE *out)
+{
+	(void) fputs("\tdestroy:\t[-z zonename] [link]...\n", out);
+}
+
+/*
+ * Implement the destroy subcommand: open the named link and unlink it.
+ *
+ * Options: -z names the zone to look the link up in. Exactly one operand, the
+ * link name, is accepted.
+ *
+ * Returns 0 on success, 1 when the link cannot be opened or destroyed, and the
+ * value of usage() for malformed command lines.
+ */
+static int
+vndadm_destroy(int argc, char *argv[])
+{
+	vnd_handle_t *vhp;
+	int c, syserr;
+	vnd_errno_t vnderr;
+	const char *zonename = NULL;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, ":z:")) != -1) {
+		switch (c) {
+		case 'z':
+			zonename = optarg;
+			break;
+		case ':':
+			return (usage("-%c requires an operand\n", optopt));
+		case '?':
+			return (usage("unknown option: -%c\n", optopt));
+		default:
+			abort();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* Report a missing operand as such, not as an extra argument. */
+	if (argc < 1) {
+		return (usage("missing required link name\n"));
+	} else if (argc > 1) {
+		return (usage("extraneous arguments\n"));
+	}
+
+	vhp = vnd_open(zonename, argv[0], &vnderr, &syserr);
+	if (vhp == NULL) {
+		vnd_libwarn(vnderr, syserr, "failed to open link: %s", argv[0]);
+		return (1);
+	}
+
+	if (vnd_unlink(vhp) != 0) {
+		/* Read the error state before the handle is closed. */
+		vnd_libwarn(vnd_errno(vhp), vnd_syserrno(vhp),
+		    "failed to destroy link %s", argv[0]);
+		vnd_close(vhp);
+		return (1);
+	}
+
+	vnd_close(vhp);
+	return (0);
+}
+
+/* Write the usage line for the list subcommand to 'out'. */
+static void
+vndadm_list_usage(FILE *out)
+{
+	(void) fputs("\tlist:\t\t[-p] [-d delim] [-o field,...] "
+	    "[-z zonename] [link]...\n", out);
+}
+
+/* Number of fields that the list subcommand can display. */
+#define VNDADM_LIST_NFIELDS 3
+
+/* State shared between vndadm_list() and its walker, vndadm_list_f(). */
+typedef struct vndadm_list_cb {
+ int vsc_argc; /* number of link names to filter on; 0 for none */
+ char **vsc_argv; /* the link names to filter on */
+ int vsc_found; /* count of walked links that matched a name */
+ boolean_t vsc_parse; /* use the parseable field printers */
+ const char *vsc_delim; /* text printed between fields */
+ int vsc_order[VNDADM_LIST_NFIELDS]; /* vlf_tbl indices; -1 ends list */
+ int vsc_last; /* position in vsc_order of the last field to print */
+ zoneid_t vsc_zid; /* zone to restrict output to, or ALL_ZONES */
+} vndadm_list_cb_t;
+
+/*
+ * Description of one field of the list subcommand. Both callbacks receive the
+ * field itself, the link being shown and whether this is the last field on
+ * the line.
+ */
+typedef struct vndadm_list_field {
+ const char *vlf_name; /* name matched against -o arguments */
+ const char *vlf_header; /* column heading */
+ int vlf_size; /* column width used when padding */
+ void (*vlf_print)(struct vndadm_list_field *, vnd_info_t *, boolean_t);
+ void (*vlf_parse)(struct vndadm_list_field *, vnd_info_t *, boolean_t);
+} vndadm_list_field_t;
+
+/*
+ * Human-readable printer for the link name. The name is left-justified and
+ * padded to the field width unless it is the last field on the line, in which
+ * case it is printed as is.
+ */
+static void
+vlf_print_link(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+    boolean_t last)
+{
+	if (last != B_TRUE) {
+		(void) printf("%-*s", vlfp->vlf_size, viip->vi_name);
+		return;
+	}
+
+	(void) printf("%s", viip->vi_name);
+}
+
+/*
+ * Parseable printer for the link name: prints vi_name with no padding.
+ * 'vlfp' and 'last' are not used.
+ */
+/* ARGSUSED */
+static void
+vlf_parse_link(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+ boolean_t last)
+{
+ (void) printf("%s", viip->vi_name);
+}
+
+/*
+ * Human-readable printer for the datalink name. The name is left-justified
+ * and padded to the field width unless it is the last field on the line, in
+ * which case it is printed as is.
+ */
+static void
+vlf_print_datalink(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+    boolean_t last)
+{
+	if (last != B_TRUE) {
+		(void) printf("%-*s", vlfp->vlf_size, viip->vi_datalink);
+		return;
+	}
+
+	(void) printf("%s", viip->vi_datalink);
+}
+
+/*
+ * Parseable printer for the datalink name: prints vi_datalink with no
+ * padding. 'vlfp' and 'last' are not used.
+ */
+/* ARGSUSED */
+static void
+vlf_parse_datalink(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+ boolean_t last)
+{
+ (void) printf("%s", viip->vi_datalink);
+}
+
+/*
+ * Human-readable printer for the zone name. The zone id in vi_zone is
+ * translated with getzonenamebyid(); when that returns a value <= 0 the text
+ * "<unknown>" is shown instead. The name is padded to the field width unless
+ * it is the last field on the line.
+ */
+static void
+vlf_print_zone(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+    boolean_t last)
+{
+	char zname[ZONENAME_MAX];
+	ssize_t len;
+
+	len = getzonenamebyid(viip->vi_zone, zname, sizeof (zname));
+	if (len <= 0)
+		(void) strlcpy(zname, "<unknown>", sizeof (zname));
+
+	if (last != B_TRUE) {
+		(void) printf("%-*s", vlfp->vlf_size, zname);
+		return;
+	}
+
+	(void) printf("%s", zname);
+}
+
+/*
+ * Parseable printer for the zone name. The zone id in vi_zone is translated
+ * with getzonenamebyid(); when that returns a value <= 0 the text "<unknown>"
+ * is printed instead. No padding is applied; 'vlfp' and 'last' are not used.
+ */
+/* ARGSUSED */
+static void
+vlf_parse_zone(vndadm_list_field_t *vlfp, vnd_info_t *viip,
+    boolean_t last)
+{
+	char zname[ZONENAME_MAX];
+	ssize_t len;
+
+	len = getzonenamebyid(viip->vi_zone, zname, sizeof (zname));
+	if (len <= 0)
+		(void) strlcpy(zname, "<unknown>", sizeof (zname));
+
+	(void) printf("%s", zname);
+}
+
+/*
+ * Fields known to the list subcommand. Each row holds the -o name, the column
+ * heading, the column width, the human-readable printer and the parseable
+ * printer. Positions in this table are the values stored in vsc_order, and
+ * the first VNDADM_LIST_NFIELDS rows are the ones searched for -o names. The
+ * table ends with an entry whose name is NULL.
+ */
+static vndadm_list_field_t vlf_tbl[] = {
+ { "name", "NAME", 16, vlf_print_link, vlf_parse_link },
+ { "datalink", "DATALINK", 16, vlf_print_datalink, vlf_parse_datalink },
+ { "zone", "ZONENAME", 32, vlf_print_zone, vlf_parse_zone },
+ { NULL }
+};
+
+
+/*
+ * vnd_walk() callback for the list subcommand: print one line for the link
+ * described by 'viip', subject to the filters in the vndadm_list_cb_t passed
+ * as 'arg'.
+ *
+ * A link is skipped when a zone filter is set and does not match, or when
+ * link names were given and none equals the link's name. Each link that
+ * passes a name filter bumps vsc_found. Fields are printed in vsc_order,
+ * separated by vsc_delim, with no delimiter after the last one.
+ *
+ * Always returns 0.
+ */
+static int
+vndadm_list_f(vnd_info_t *viip, void *arg)
+{
+	vndadm_list_cb_t *vscp = arg;
+	int i;
+
+	if (vscp->vsc_zid != ALL_ZONES && vscp->vsc_zid != viip->vi_zone)
+		return (0);
+
+	if (vscp->vsc_argc != 0) {
+		for (i = 0; i < vscp->vsc_argc; i++) {
+			if (strcmp(viip->vi_name, vscp->vsc_argv[i]) == 0)
+				break;
+		}
+		/* Running off the end means no requested name matched. */
+		if (i >= vscp->vsc_argc)
+			return (0);
+		vscp->vsc_found++;
+	}
+
+	for (i = 0; i < VNDADM_LIST_NFIELDS; i++) {
+		vndadm_list_field_t *vlfp;
+		boolean_t last;
+
+		if (vscp->vsc_order[i] == -1)
+			break;
+
+		vlfp = &vlf_tbl[vscp->vsc_order[i]];
+		last = i == vscp->vsc_last;
+
+		if (vscp->vsc_parse == B_TRUE)
+			vlfp->vlf_parse(vlfp, viip, last);
+		else
+			vlfp->vlf_print(vlfp, viip, last);
+
+		if (last == B_FALSE)
+			(void) printf("%s", vscp->vsc_delim);
+	}
+	(void) printf("\n");
+
+	return (0);
+}
+
+/*
+ * Implement the list subcommand.
+ *
+ * Options:
+ *   -p          parseable output (requires -o)
+ *   -d delim    field delimiter for parseable output (requires -p)
+ *   -o fields   comma-separated list of fields to show, matched
+ *               case-insensitively against vlf_tbl
+ *   -z zone     only show links in the named zone
+ * Any operands are link names; when present only those links are shown.
+ *
+ * Unless -p is given, a header line is printed first. Returns 0 on success,
+ * 1 when the zone is unknown, the walk fails or no link matched the requested
+ * names, and the value of usage() for malformed command lines.
+ */
+static int
+vndadm_list(int argc, char *argv[])
+{
+	int c, i, syserr;
+	vnd_errno_t vnderr;
+	boolean_t parse = B_FALSE;
+	const char *zonename = NULL, *delim = NULL;
+	char *fields = NULL;
+	vndadm_list_cb_t vsc;
+
+	optind = 0;
+	while ((c = getopt(argc, argv, ":pd:o:z:")) != -1) {
+		switch (c) {
+		case 'p':
+			parse = B_TRUE;
+			break;
+		case 'd':
+			delim = optarg;
+			break;
+		case 'o':
+			fields = optarg;
+			break;
+		case 'z':
+			zonename = optarg;
+			break;
+		case ':':
+			return (usage("-%c requires an operand\n", optopt));
+		case '?':
+			return (usage("unknown option: -%c\n", optopt));
+		default:
+			abort();
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	vsc.vsc_argc = argc;
+	vsc.vsc_argv = argv;
+	vsc.vsc_found = 0;
+	if (zonename != NULL) {
+		vsc.vsc_zid = getzoneidbyname(zonename);
+		if (vsc.vsc_zid == -1) {
+			vnd_warn("no such zone: %s\n", zonename);
+			return (1);
+		}
+	} else {
+		vsc.vsc_zid = ALL_ZONES;
+	}
+
+	/* Sanity check parseable related stuff */
+	if (delim != NULL && parse == B_FALSE) {
+		return (usage("-d cannot be used without -p\n"));
+	}
+
+	if (parse == B_TRUE && fields == NULL) {
+		return (usage("-p cannot be used without -o\n"));
+	}
+
+	/* validate our fields, if any */
+	if (fields != NULL) {
+		char *cur, *next;
+		int floc = 0;
+
+		cur = fields;
+		for (;;) {
+			if (floc >= VNDADM_LIST_NFIELDS) {
+				return (usage("too many fields specified "
+				    "for -o\n"));
+			}
+
+			/* Split the list in place at the next comma. */
+			next = strchr(cur, ',');
+			if (next != NULL)
+				*next = '\0';
+
+			for (i = 0; i < VNDADM_LIST_NFIELDS; i++) {
+				if (strcasecmp(cur, vlf_tbl[i].vlf_name) == 0)
+					break;
+			}
+			if (i == VNDADM_LIST_NFIELDS) {
+				vnd_warn("invalid field for -o: %s\n", cur);
+				/*
+				 * Write the list of names directly: each
+				 * vnd_warn() call would add the program name
+				 * and, lacking a newline, an error string.
+				 */
+				(void) fprintf(stderr, "valid fields are:");
+				for (i = 0; i < VNDADM_LIST_NFIELDS; i++) {
+					(void) fprintf(stderr, " %s",
+					    vlf_tbl[i].vlf_name);
+				}
+				(void) fprintf(stderr, "\n");
+				return (usage(NULL));
+			}
+			vsc.vsc_order[floc] = i;
+			floc++;
+
+			if (next == NULL)
+				break;
+			cur = next + 1;
+		}
+
+		vsc.vsc_last = floc - 1;
+		while (floc < VNDADM_LIST_NFIELDS)
+			vsc.vsc_order[floc++] = -1;
+	} else {
+		/* Default: every field, in table order. */
+		for (i = 0; i < VNDADM_LIST_NFIELDS; i++)
+			vsc.vsc_order[i] = i;
+		/* vndadm_list_f() reads vsc_last, so it must be set here. */
+		vsc.vsc_last = VNDADM_LIST_NFIELDS - 1;
+	}
+
+	vsc.vsc_parse = parse;
+	vsc.vsc_delim = delim;
+	if (vsc.vsc_delim == NULL)
+		vsc.vsc_delim = " ";
+
+	if (vsc.vsc_parse != B_TRUE) {
+		for (i = 0; i < VNDADM_LIST_NFIELDS && vsc.vsc_order[i] != -1;
+		    i++) {
+			const vndadm_list_field_t *vlfp =
+			    &vlf_tbl[vsc.vsc_order[i]];
+
+			/*
+			 * End the header after the last selected field, which
+			 * need not be the last field of the table.
+			 */
+			if (i == vsc.vsc_last) {
+				(void) printf("%s\n", vlfp->vlf_header);
+				continue;
+			}
+			(void) printf("%-*s ", vlfp->vlf_size,
+			    vlfp->vlf_header);
+		}
+	}
+
+	if (vnd_walk(vndadm_list_f, &vsc, &vnderr, &syserr) != 0) {
+		vnd_libwarn(vnderr, syserr, "failed to walk vnd links");
+		return (1);
+	}
+
+	if (argc > 0 && vsc.vsc_found == 0) {
+		vnd_warn("no links matched requested names\n");
+		return (1);
+	}
+
+	return (0);
+}
+
+/* State shared between vndadm_get() and its callback, vndadm_get_cb(). */
+typedef struct vndadm_get {
+ boolean_t vg_parse; /* print delimiter-separated output */
+ const char *vg_delim; /* delimiter used for that output */
+ const char *vg_link; /* link name shown on every line */
+ int vg_argc; /* number of property names to filter on; 0 for all */
+ char **vg_argv; /* the property names to filter on */
+} vndadm_get_t;
+
+/*
+ * vnd_prop_iter() callback for the get subcommand: print one line describing
+ * property 'prop' of 'vhp'.
+ *
+ * When property names were given on the command line, properties whose name
+ * is not among them are skipped. Each line shows the link, the property name,
+ * "rw" or "r-" depending on vnd_prop_writeable(), and the value, or "-" when
+ * the property has no printer.
+ *
+ * Returns 0 on success and 1 when the value printer fails; aborts if
+ * vnd_prop_writeable() fails.
+ */
+static int
+vndadm_get_cb(vnd_handle_t *vhp, vnd_prop_t prop, void *arg)
+{
+	vndadm_get_t *vgp = arg;
+	const char *name = vndadm_prop_to_name(prop);
+	const char *perm;
+	boolean_t writeable;
+	vndadm_print_t printer;
+
+	/* Verify if this is a prop we're supposed to print */
+	if (vgp->vg_argc > 0) {
+		int i;
+
+		for (i = 0; i < vgp->vg_argc; i++) {
+			if (strcmp(name, vgp->vg_argv[i]) == 0)
+				break;
+		}
+		/* Running off the end means it was not asked for. */
+		if (i >= vgp->vg_argc)
+			return (0);
+	}
+
+	if (vnd_prop_writeable(prop, &writeable) != 0)
+		abort();
+
+	if (writeable)
+		perm = "rw";
+	else
+		perm = "r-";
+
+	if (vgp->vg_parse != B_TRUE) {
+		(void) printf("%-13s %-16s %-5s ", vgp->vg_link, name, perm);
+	} else {
+		(void) printf("%s%s%s%s%s%s", vgp->vg_link, vgp->vg_delim,
+		    name, vgp->vg_delim, perm, vgp->vg_delim);
+	}
+
+	printer = vndadm_propname_tbl[prop].vp_print;
+	if (printer == NULL) {
+		(void) printf("-");
+	} else if (printer(vhp, prop) != 0) {
+		return (1);
+	}
+	(void) printf("\n");
+	return (0);
+}
+
+/*
+ * Implements "vndadm get": prints the properties of a single vnd link.
+ *
+ * Options: -p selects parseable output, -d sets the delimiter used for
+ * parseable output (a single space by default) and -z names the zone passed
+ * to vnd_open(). The first operand is the link name; any further operands
+ * restrict the output to the named properties.
+ *
+ * Returns 0 on success, 1 if the link could not be opened or iteration
+ * failed, and the result of usage() for invalid arguments.
+ */
+static int
+vndadm_get(int argc, char *argv[])
+{
+ vnd_handle_t *vhp;
+ boolean_t parse = B_FALSE;
+ vndadm_get_t vg;
+ int c, syserr;
+ vnd_errno_t vnderr;
+ const char *zonename = NULL, *delim = NULL;
+
+ if (argc <= 0) {
+ return (usage("get requires a link name\n"));
+ }
+
+ /*
+ * main() has already stripped the program name and the subcommand, so
+ * argv[0] is the first real argument; restart getopt from index 0.
+ */
+ optind = 0;
+ while ((c = getopt(argc, argv, ":pd:z:")) != -1) {
+ switch (c) {
+ case 'p':
+ parse = B_TRUE;
+ break;
+ case 'd':
+ delim = optarg;
+ break;
+ case 'z':
+ zonename = optarg;
+ break;
+ case ':':
+ return (usage("-%c requires an operand\n", optopt));
+ case '?':
+ return (usage("unknown option: -%c\n", optopt));
+ default:
+ abort();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ return (usage("missing required link\n"));
+ }
+
+ /*
+ * NOTE(review): no call releasing vhp is visible on any return path of
+ * this function; confirm whether libvnd expects the handle to be closed.
+ */
+ vhp = vnd_open(zonename, argv[0], &vnderr, &syserr);
+ if (vhp == NULL) {
+ vnd_libwarn(vnderr, syserr, "failed to open link: %s", argv[0]);
+ return (1);
+ }
+
+ /* Everything after the link name is treated as a property filter. */
+ vg.vg_argc = argc - 1;
+ vg.vg_argv = argv + 1;
+ vg.vg_link = argv[0];
+ vg.vg_parse = parse;
+ vg.vg_delim = delim != NULL ? delim : " ";
+ /* The column header is only printed for human-readable output. */
+ if (vg.vg_parse == B_FALSE)
+ (void) printf("%-13s %-16s %-5s %s\n", "LINK", "PROPERTY",
+ "PERM", "VALUE");
+
+ if (vnd_prop_iter(vhp, vndadm_get_cb, &vg) != 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Writes the one-line synopsis of the "get" subcommand to 'out'.
+ */
+static void
+vndadm_get_usage(FILE *out)
+{
+	static const char synopsis[] =
+	    "\tget:\t\t[-p] [-d delim] [-z zonename] link [prop]...\n";
+
+	(void) fputs(synopsis, out);
+}
+
+/*
+ * Implements "vndadm set": applies one or more prop=value assignments to a
+ * single vnd link. The only option is -z, which names the zone passed to
+ * vnd_open(). The first operand is the link name and every following operand
+ * must have the form name=value.
+ *
+ * Assignments are applied in order and processing stops at the first failure,
+ * so earlier assignments remain in effect when a later one is rejected.
+ *
+ * Returns 0 on success, 1 on failure, and the result of usage() for invalid
+ * arguments.
+ */
+static int
+vndadm_set(int argc, char *argv[])
+{
+ vnd_handle_t *vhp;
+ int c, i, syserr;
+ vnd_errno_t vnderr;
+ const char *zonename = NULL;
+
+ /*
+ * main() has already stripped the program name and the subcommand, so
+ * argv[0] is the first real argument; restart getopt from index 0.
+ */
+ optind = 0;
+ while ((c = getopt(argc, argv, ":z:")) != -1) {
+ switch (c) {
+ case 'z':
+ zonename = optarg;
+ break;
+ case ':':
+ return (usage("-%c requires an operand\n", optopt));
+ case '?':
+ return (usage("unknown option: -%c\n", optopt));
+ default:
+ abort();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* A link name plus at least one prop=value pair are required. */
+ if (argc < 2) {
+ return (usage("missing arguments to set\n"));
+ }
+
+ /*
+ * NOTE(review): no call releasing vhp is visible on any return path of
+ * this function; confirm whether libvnd expects the handle to be closed.
+ */
+ vhp = vnd_open(zonename, argv[0], &vnderr, &syserr);
+ if (vhp == NULL) {
+ vnd_libwarn(vnderr, syserr, "failed to open link: %s", argv[0]);
+ return (1);
+ }
+
+ for (i = 1; i < argc; i++) {
+ char *eq, *key, *value;
+ boolean_t writeable;
+ vnd_prop_t prop;
+ void *buf;
+ size_t psize;
+ int ret;
+
+ key = argv[i];
+ eq = strchr(key, '=');
+ if (eq == NULL) {
+ vnd_warn("invalid property name=value: %s\n", key);
+ return (1);
+ }
+ /* Split the argument in place: key is now NUL-terminated. */
+ *eq = '\0';
+ value = eq + 1;
+ if (*value == '\0') {
+ vnd_warn("property value required for %s\n", key);
+ return (1);
+ }
+ /* VND_PROP_MAX is the "no such property" result of the lookup. */
+ prop = vndadm_name_to_prop(key);
+ if (prop == VND_PROP_MAX) {
+ vnd_warn("unknown property: %s\n", key);
+ return (1);
+ }
+
+ if (vnd_prop_writeable(prop, &writeable) != 0)
+ abort();
+ if (writeable != B_TRUE) {
+ vnd_warn("property %s is read-only\n", key);
+ return (1);
+ }
+ assert(vndadm_propname_tbl[prop].vp_parse != NULL);
+
+ /*
+ * vp_parse functions should say what explicitly is invalid. We
+ * should indicate that the property failed.
+ */
+ ret = vndadm_propname_tbl[prop].vp_parse(value, &buf, &psize);
+ if (ret != 0) {
+ vnd_warn("failed to set property %s\n", key);
+ return (1);
+ }
+
+ /* buf was produced by vp_parse and is released here once used. */
+ ret = vnd_prop_set(vhp, prop, buf, psize);
+ free(buf);
+ if (ret != 0) {
+ vnd_libwarn(vnd_errno(vhp), vnd_syserrno(vhp),
+ "failed to set property %s", key);
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Writes the one-line synopsis of the "set" subcommand to 'out'.
+ */
+static void
+vndadm_set_usage(FILE *out)
+{
+	static const char synopsis[] = "\tset:\t\t[-z zonename] link prop=val...\n";
+
+	(void) fputs(synopsis, out);
+}
+
+/*
+ * Subcommand dispatch table entry: the subcommand name as typed by the user,
+ * the function implementing it and the function printing its usage line.
+ */
+typedef struct vnd_cmdtab {
+ const char *vc_name;
+ int (*vc_op)(int, char *[]);
+ void (*vc_usage)(FILE *);
+} vnd_cmdtab_t;
+
+/*
+ * All supported subcommands. The table is terminated by an entry whose
+ * vc_name is NULL; both main() and usage() walk it until they reach that
+ * sentinel.
+ */
+static vnd_cmdtab_t vnd_tab[] = {
+ { "create", vndadm_create, vndadm_create_usage },
+ { "destroy", vndadm_destroy, vndadm_destroy_usage },
+ { "list", vndadm_list, vndadm_list_usage },
+ { "get", vndadm_get, vndadm_get_usage },
+ { "set", vndadm_set, vndadm_set_usage },
+ { NULL, NULL }
+};
+
+/*
+ * Prints an optional printf-style error message (prefixed with the program
+ * name) followed by the general usage line and the synopsis of every
+ * subcommand, all on stderr. Pass a NULL format to print only the usage text.
+ *
+ * Always returns 2 so that callers can write "return (usage(...))".
+ */
+static int
+usage(const char *format, ...)
+{
+	int idx;
+
+	if (format != NULL) {
+		va_list args;
+
+		(void) fprintf(stderr, "%s: ", vnd_pname);
+		va_start(args, format);
+		(void) vfprintf(stderr, format, args);
+		va_end(args);
+	}
+
+	(void) fprintf(stderr, "usage: %s <subcommand> <args> ...\n",
+	    vnd_pname);
+
+	/* The table is terminated by an entry with a NULL name. */
+	for (idx = 0; vnd_tab[idx].vc_name != NULL; idx++)
+		vnd_tab[idx].vc_usage(stderr);
+
+	return (2);
+}
+
+/*
+ * Entry point for vndadm: looks up argv[1] in the subcommand table and hands
+ * the remaining arguments (with the program name and subcommand removed) to
+ * the matching handler. The handler's return value becomes the exit status.
+ */
+int
+main(int argc, char *argv[])
+{
+	int idx;
+
+	vnd_pname = basename(argv[0]);
+
+	if (argc < 2)
+		return (usage(NULL));
+
+	for (idx = 0; vnd_tab[idx].vc_name != NULL; idx++) {
+		if (strcmp(argv[1], vnd_tab[idx].vc_name) != 0)
+			continue;
+
+		/* Drop the program name and the subcommand itself. */
+		argc -= 2;
+		argv += 2;
+		assert(argc >= 0);
+		return (vnd_tab[idx].vc_op(argc, argv));
+	}
+
+	return (usage("unknown sub-command '%s'\n", argv[1]));
+}
diff --git a/usr/src/cmd/vndstat/Makefile b/usr/src/cmd/vndstat/Makefile
new file mode 100644
index 0000000000..c77eef3887
--- /dev/null
+++ b/usr/src/cmd/vndstat/Makefile
@@ -0,0 +1,33 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Name of the single program built in this directory.
+PROG= vndstat
+
+include ../Makefile.cmd
+
+# vndstat.c reads its statistics through the kstat library.
+LDLIBS += -lkstat
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+# ROOTPROG is not defined in this file; presumably it comes from
+# ../Makefile.cmd and names the installed location of $(PROG).
+install: all $(ROOTPROG)
+
+clean:
+ $(RM) $(PROG)
+
+# lint_PROG is not defined here; presumably provided by an included makefile.
+lint: lint_PROG
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/vndstat/vndstat.c b/usr/src/cmd/vndstat/vndstat.c
new file mode 100644
index 0000000000..6f6c76fc12
--- /dev/null
+++ b/usr/src/cmd/vndstat/vndstat.c
@@ -0,0 +1,542 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/kstat.h>
+#include <kstat.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <alloca.h>
+#include <signal.h>
+#include <sys/varargs.h>
+#include <sys/int_limits.h>
+#include <sys/sysmacros.h>
+
+/*
+ * Flags for kstat_field_t.ksf_flags, as interpreted by
+ * kstat_instances_read() and kstat_instances_print():
+ *
+ * USEINSTANCE	print the kstat instance number instead of a statistic
+ * NODELTA	print the most recent raw value, not a per-second delta
+ * FILLER	print the header text itself as the column content
+ * STRING	the statistic is a string; its pointer is stored and printed
+ * UNIT		print a scaled value followed by ksf_suffix
+ * LJUST	print the column header without width padding
+ */
+#define KSTAT_FIELD_USEINSTANCE 0x01
+#define KSTAT_FIELD_NODELTA 0x02
+#define KSTAT_FIELD_FILLER 0x04
+#define KSTAT_FIELD_STRING 0x08
+#define KSTAT_FIELD_UNIT 0x10
+#define KSTAT_FIELD_LJUST 0x20
+
+/*
+ * Description of one output column. Arrays of these are terminated by an
+ * entry whose ksf_header is NULL.
+ */
+typedef struct kstat_field {
+ char *ksf_header; /* header for field */
+ char *ksf_name; /* name of stat, if any */
+ int ksf_width; /* width for field in output line */
+ uint32_t ksf_flags; /* flags for this field, if any */
+ char *ksf_suffix; /* optional suffix for units */
+ int ksf_hint; /* index hint for field in kstat */
+} kstat_field_t;
+
+/*
+ * One tracked kstat. Instances are kept on a singly linked list and hold two
+ * generations of sampled values so that a per-second rate can be computed
+ * from consecutive samples. ksi_data[0] and ksi_data[1] point into a single
+ * allocation made by kstat_instances_read().
+ */
+typedef struct kstat_instance {
+ char ksi_name[KSTAT_STRLEN]; /* name of the underlying kstat */
+ int ksi_instance; /* instance identifier of this kstat */
+ kstat_t *ksi_ksp; /* pointer to the kstat */
+ uint64_t *ksi_data[2]; /* pointer to two generations of data */
+ hrtime_t ksi_snaptime[2]; /* hrtime for data generations */
+ int ksi_gen; /* current generation */
+ struct kstat_instance *ksi_next; /* next in instance list */
+} kstat_instance_t;
+
+const char *g_cmd = "vndstat";
+
+/*
+ * Formats 'num' into 'buf' (of size 'buflen') using binary unit suffixes:
+ * values below 1024 are printed as-is, exact multiples of the chosen unit are
+ * printed without decimals (e.g. "1K"), and everything else is printed with
+ * as many decimal places (at most two) as still fit in five characters
+ * (e.g. "1.50K", "10.0K").
+ */
+static void
+kstat_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	static const char units[] = " KMGTPE";
+	uint64_t scaled;
+	uint64_t divisor;
+	int exp = 0;
+	int prec;
+
+	/* Find the largest power of 1024 that does not exceed num. */
+	for (scaled = num; scaled >= 1024; scaled /= 1024)
+		exp++;
+
+	if (exp == 0) {
+		(void) snprintf(buf, buflen, "%llu", scaled);
+		return;
+	}
+
+	divisor = 1ULL << (10 * exp);
+
+	/* Exact multiples of the unit never need decimal places. */
+	if ((num & (divisor - 1)) == 0) {
+		(void) snprintf(buf, buflen, "%llu%c", scaled, units[exp]);
+		return;
+	}
+
+	/*
+	 * Rounding can add a digit (10239 is 9.999K but prints as 10.0K), so
+	 * rather than predicting the width, try two, one and then zero
+	 * decimal places and keep the first result that fits.
+	 */
+	prec = 2;
+	do {
+		if (snprintf(buf, buflen, "%.*f%c", prec,
+		    (double)num / divisor, units[exp]) <= 5)
+			break;
+	} while (--prec >= 0);
+}
+
+/*
+ * Prints "<g_cmd>: <formatted message>" on stderr and exits with
+ * EXIT_FAILURE. If the format does not end in a newline, the description of
+ * the errno value that was current on entry is appended, perror-style.
+ *
+ * This function does not return.
+ */
+static void
+fatal(char *fmt, ...)
+{
+	va_list ap;
+	int error = errno;	/* capture before stdio can overwrite it */
+	size_t len = strlen(fmt);
+
+	(void) fprintf(stderr, "%s: ", g_cmd);
+
+	va_start(ap, fmt);
+	/*LINTED*/
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+
+	/* Guard the empty format so we never index fmt[-1]. */
+	if (len == 0 || fmt[len - 1] != '\n')
+		(void) fprintf(stderr, ": %s\n", strerror(error));
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Searches the named kstat 'ksp' for the statistic called field->ksf_name.
+ * On success the index is cached in field->ksf_hint and returned. If the
+ * statistic does not exist the program is terminated through fatal().
+ */
+int
+kstat_field_hint(kstat_t *ksp, kstat_field_t *field)
+{
+	kstat_named_t *named = KSTAT_NAMED_PTR(ksp);
+	int idx = 0;
+
+	assert(ksp->ks_type == KSTAT_TYPE_NAMED);
+
+	while (idx < ksp->ks_ndata) {
+		if (strcmp(field->ksf_name, named[idx].name) == 0) {
+			field->ksf_hint = idx;
+			return (idx);
+		}
+		idx++;
+	}
+
+	fatal("could not find field '%s' in %s:%d\n",
+	    field->ksf_name, ksp->ks_name, ksp->ks_instance);
+
+	return (0);
+}
+
+/*
+ * qsort() comparator for an array of kstat_instance_t pointers: orders by
+ * kstat name first and by instance number second. Returns the strcmp()
+ * result when the names differ, otherwise -1, 0 or 1.
+ */
+int
+kstat_instances_compare(const void *lhs, const void *rhs)
+{
+	const kstat_instance_t *left = *(kstat_instance_t *const *)lhs;
+	const kstat_instance_t *right = *(kstat_instance_t *const *)rhs;
+	int order = strcmp(left->ksi_name, right->ksi_name);
+
+	if (order == 0) {
+		order = (left->ksi_instance > right->ksi_instance) -
+		    (left->ksi_instance < right->ksi_instance);
+	}
+
+	return (order);
+}
+
+/*
+ * Brings the instance list at *head in sync with the current kstat chain.
+ *
+ * Every kstat accepted by 'interested' is matched (by name and instance
+ * number) against the existing list; unknown kstats get a new, zeroed
+ * instance appended. Instances whose kstat has disappeared are freed together
+ * with their sample buffer, and the surviving instances are relinked in the
+ * order defined by kstat_instances_compare(). *head is set to NULL when no
+ * instance remains.
+ *
+ * Nothing is done when the chain is unchanged and the list is non-empty.
+ * Failure to update the chain or to allocate memory is fatal.
+ */
+void
+kstat_instances_update(kstat_ctl_t *kcp, kstat_instance_t **head,
+    boolean_t (*interested)(kstat_t *))
+{
+	int ninstances = 0, i;
+	kstat_instance_t **sorted, *ksi, *next;
+	kstat_t *ksp;
+	kid_t kid;
+
+	if ((kid = kstat_chain_update(kcp)) == 0 && *head != NULL)
+		return;
+
+	if (kid == -1)
+		fatal("failed to update kstat chain");
+
+	/* Mark everything stale; matches found below clear the mark. */
+	for (ksi = *head; ksi != NULL; ksi = ksi->ksi_next)
+		ksi->ksi_ksp = NULL;
+
+	for (ksp = kcp->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		kstat_instance_t *last = NULL;
+
+		if (!interested(ksp))
+			continue;
+
+		/*
+		 * Now look to see if we have this instance and name. (Yes,
+		 * this is a linear search; we're assuming that this list is
+		 * modest in size.)
+		 */
+		for (ksi = *head; ksi != NULL; ksi = ksi->ksi_next) {
+			last = ksi;
+
+			if (ksi->ksi_instance != ksp->ks_instance)
+				continue;
+
+			if (strcmp(ksi->ksi_name, ksp->ks_name) != 0)
+				continue;
+
+			ksi->ksi_ksp = ksp;
+			ninstances++;
+			break;
+		}
+
+		if (ksi != NULL)
+			continue;
+
+		if ((ksi = malloc(sizeof (kstat_instance_t))) == NULL)
+			fatal("could not allocate memory for stat instance");
+
+		bzero(ksi, sizeof (kstat_instance_t));
+		(void) strlcpy(ksi->ksi_name, ksp->ks_name, KSTAT_STRLEN);
+		ksi->ksi_instance = ksp->ks_instance;
+		ksi->ksi_ksp = ksp;
+		ksi->ksi_next = NULL;
+
+		/* 'last' is the tail of the list after the failed search. */
+		if (last == NULL) {
+			assert(*head == NULL);
+			*head = ksi;
+		} else {
+			last->ksi_next = ksi;
+		}
+
+		ninstances++;
+	}
+
+	/*
+	 * Now we know how many instances we have; iterate back over them,
+	 * pruning the stale ones and adding the active ones to a holding
+	 * array in which to sort them.
+	 */
+	sorted = (void *)alloca(ninstances * sizeof (kstat_instance_t *));
+	ninstances = 0;
+
+	for (ksi = *head; ksi != NULL; ksi = next) {
+		next = ksi->ksi_next;
+
+		if (ksi->ksi_ksp == NULL) {
+			/*
+			 * ksi_data[0] is the start of the single sample
+			 * buffer allocated by kstat_instances_read() (and is
+			 * NULL if the instance was never read); release it
+			 * along with the instance so it is not leaked.
+			 */
+			free(ksi->ksi_data[0]);
+			free(ksi);
+		} else {
+			sorted[ninstances++] = ksi;
+		}
+	}
+
+	if (ninstances == 0) {
+		*head = NULL;
+		return;
+	}
+
+	qsort(sorted, ninstances, sizeof (kstat_instance_t *),
+	    kstat_instances_compare);
+
+	*head = sorted[0];
+
+	/* Rebuild the linked list in sorted order. */
+	for (i = 0; i < ninstances; i++) {
+		ksi = sorted[i];
+		ksi->ksi_next = i < ninstances - 1 ? sorted[i + 1] : NULL;
+	}
+}
+
+/*
+ * Takes a new sample of every instance on the list.
+ *
+ * For each instance with a live kstat the kstat is re-read and, for every
+ * field that names a statistic, the current value is stored in the current
+ * generation of ksi_data. The snapshot time is recorded and the generation is
+ * flipped, so after this call the newest sample is at index ksi_gen ^ 1.
+ *
+ * A kstat that vanished since the last update (ENXIO) is detached from its
+ * instance and skipped; any other read failure, or a kstat that is not of
+ * the named type, is fatal.
+ */
+void
+kstat_instances_read(kstat_ctl_t *kcp, kstat_instance_t *instances,
+ kstat_field_t *fields)
+{
+ kstat_instance_t *ksi;
+ int i, nfields;
+
+ /* The field array is terminated by an entry with a NULL header. */
+ for (nfields = 0; fields[nfields].ksf_header != NULL; nfields++)
+ continue;
+
+ for (ksi = instances; ksi != NULL; ksi = ksi->ksi_next) {
+ kstat_t *ksp = ksi->ksi_ksp;
+
+ if (ksp == NULL)
+ continue;
+
+ if (kstat_read(kcp, ksp, NULL) == -1) {
+ if (errno == ENXIO) {
+ /*
+ * Our kstat has been removed since the update;
+ * NULL it out to prevent us from trying to read
+ * it again (and to indicate that it should not
+ * be displayed) and drive on.
+ */
+ ksi->ksi_ksp = NULL;
+ continue;
+ }
+
+ fatal("failed to read kstat %s:%d",
+ ksi->ksi_name, ksi->ksi_instance);
+ }
+
+ if (ksp->ks_type != KSTAT_TYPE_NAMED) {
+ fatal("%s:%d is not a named kstat", ksi->ksi_name,
+ ksi->ksi_instance);
+ }
+
+ /*
+ * First read of this instance: allocate both generations as one
+ * zeroed buffer, with generation 1 starting nfields entries in.
+ */
+ if (ksi->ksi_data[0] == NULL) {
+ size_t size = nfields * sizeof (uint64_t) * 2;
+ uint64_t *data;
+
+ if ((data = malloc(size)) == NULL)
+ fatal("could not allocate memory");
+
+ bzero(data, size);
+ ksi->ksi_data[0] = data;
+ ksi->ksi_data[1] = &data[nfields];
+ }
+
+ for (i = 0; i < nfields; i++) {
+ kstat_named_t *nm = KSTAT_NAMED_PTR(ksp);
+ kstat_field_t *field = &fields[i];
+ int hint = field->ksf_hint;
+
+ /* Fields without a statistic name carry no data. */
+ if (field->ksf_name == NULL)
+ continue;
+
+ /*
+ * The cached index is only a hint: fall back to a search
+ * by name when it is out of range or names another stat.
+ */
+ if (hint < 0 || hint >= ksp->ks_ndata ||
+ strcmp(field->ksf_name, nm[hint].name) != 0) {
+ hint = kstat_field_hint(ksp, field);
+ }
+
+ /*
+ * String statistics are stored as the pointer itself; the
+ * string is not copied.
+ */
+ if (field->ksf_flags & KSTAT_FIELD_STRING)
+ ksi->ksi_data[ksi->ksi_gen][i] =
+ (uint64_t)(uintptr_t)
+ nm[hint].value.str.addr.ptr;
+ else
+ ksi->ksi_data[ksi->ksi_gen][i] =
+ nm[hint].value.ui64;
+ }
+
+ ksi->ksi_snaptime[ksi->ksi_gen] = ksp->ks_snaptime;
+ ksi->ksi_gen ^= 1;
+ }
+}
+
+/*
+ * Returns the per-second rate of change of field 'i' between the two most
+ * recent samples of 'ksi', rounded to the nearest integer. The newest sample
+ * lives at index ksi_gen ^ 1 because kstat_instances_read() flips the
+ * generation after storing it.
+ *
+ * Returns 0 when both samples carry the same snapshot time, since no rate
+ * can be derived from a zero-length interval.
+ *
+ * NOTE(review): delta * NANOSEC is computed in 64 bits and wraps once delta
+ * exceeds roughly 1.8e10 between samples; confirm whether that is reachable
+ * for the counters displayed.
+ */
+uint64_t
+kstat_instances_delta(kstat_instance_t *ksi, int i)
+{
+	int gen = ksi->ksi_gen;
+	uint64_t delta = ksi->ksi_data[gen ^ 1][i] - ksi->ksi_data[gen][i];
+	uint64_t tdelta = ksi->ksi_snaptime[gen ^ 1] - ksi->ksi_snaptime[gen];
+
+	/* Avoid dividing by zero on identical snapshot times. */
+	if (tdelta == 0)
+		return (0);
+
+	return (((delta * (uint64_t)NANOSEC) + (tdelta / 2)) / tdelta);
+}
+
+/*
+ * Prints one line per instance, with the columns described by 'fields'
+ * (terminated by an entry with a NULL header). When 'header' is set, a header
+ * line is printed first.
+ *
+ * Instances that have not yet been sampled twice (ksi_snaptime[1] is still
+ * zero) or whose kstat has gone away are skipped. How each column is rendered
+ * is selected by its KSTAT_FIELD_* flags; columns without any flag show the
+ * per-second delta of the statistic.
+ */
+void
+kstat_instances_print(kstat_instance_t *instances, kstat_field_t *fields,
+    boolean_t header)
+{
+	kstat_instance_t *ksi;
+	int i, nfields;
+
+	for (nfields = 0; fields[nfields].ksf_header != NULL; nfields++)
+		continue;
+
+	if (header) {
+		for (i = 0; i < nfields; i++) {
+			if (fields[i].ksf_flags & KSTAT_FIELD_LJUST) {
+				(void) printf("%s%c", fields[i].ksf_header,
+				    i < nfields - 1 ? ' ' : '\n');
+				continue;
+			}
+			(void) printf("%*s%c", fields[i].ksf_width,
+			    fields[i].ksf_header, i < nfields - 1 ? ' ' : '\n');
+		}
+	}
+
+	for (ksi = instances; ksi != NULL; ksi = ksi->ksi_next) {
+		if (ksi->ksi_snaptime[1] == 0 || ksi->ksi_ksp == NULL)
+			continue;
+
+		for (i = 0; i < nfields; i++) {
+			char trailer = i < nfields - 1 ? ' ' : '\n';
+			uint64_t value;
+
+			if (fields[i].ksf_flags & KSTAT_FIELD_FILLER) {
+				(void) printf("%*s%c", fields[i].ksf_width,
+				    fields[i].ksf_header, trailer);
+				continue;
+			}
+
+			if (fields[i].ksf_flags & KSTAT_FIELD_STRING) {
+				(void) printf("%*s%c", fields[i].ksf_width,
+				    (char *)(uintptr_t)ksi->ksi_data[
+				    ksi->ksi_gen ^ 1][i],
+				    trailer);
+				continue;
+			}
+
+			if (fields[i].ksf_flags & KSTAT_FIELD_UNIT) {
+				char buf[128];
+				size_t flen = fields[i].ksf_width + 1;
+				const char *suffix = "";
+
+				/*
+				 * The suffix shares the column with the
+				 * number. Never let the subtraction wrap if
+				 * the suffix is wider than the column.
+				 */
+				if (fields[i].ksf_suffix != NULL) {
+					size_t slen;
+
+					suffix = fields[i].ksf_suffix;
+					slen = strlen(suffix);
+					flen = slen < flen ? flen - slen : 1;
+				}
+
+				kstat_nicenum(fields[i].ksf_flags &
+				    KSTAT_FIELD_NODELTA ?
+				    ksi->ksi_data[ksi->ksi_gen ^ 1][i] :
+				    kstat_instances_delta(ksi, i), buf,
+				    MIN(sizeof (buf), flen));
+				/* A '*' field width must be passed as an int. */
+				(void) printf("%*s%s%c", (int)(flen - 1), buf,
+				    suffix, trailer);
+				continue;
+			}
+
+			if (fields[i].ksf_flags & KSTAT_FIELD_USEINSTANCE)
+				value = ksi->ksi_instance;
+			else if (fields[i].ksf_flags & KSTAT_FIELD_NODELTA)
+				value = ksi->ksi_data[ksi->ksi_gen ^ 1][i];
+			else
+				value = kstat_instances_delta(ksi, i);
+
+			/* %lld takes a long long; convert explicitly. */
+			(void) printf("%*lld%c", fields[i].ksf_width,
+			    (long long)value, trailer);
+		}
+	}
+}
+
+/*
+ * Filter passed to kstat_instances_update(): accepts only kstats whose
+ * module is "vnd" and whose class is "net".
+ */
+static boolean_t
+interested(kstat_t *ksp)
+{
+	if (strcmp(ksp->ks_module, "vnd") == 0 &&
+	    strcmp(ksp->ks_class, "net") == 0)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Complete help text for vndstat, printed verbatim by usage().
+ */
+/* BEGIN CSTYLED */
+char *g_usage = "Usage: vndstat [interval [count]]\n"
+ "\n"
+ " Displays statistics for active vnd devices, with one line per device.\n"
+ " All statistics are reported as per-second rates.\n"
+ "\n"
+ " The columns are as follows:\n"
+ "\n"
+ " zone => name of the zone with the device\n"
+ " name => name of the vnd device\n"
+ " rx => bytes received\n"
+ " tx => bytes transmitted\n"
+ " drops => number of dropped packets\n"
+ " txfc => number of transmit flow control events\n"
+ "\n";
+/* END CSTYLED */
+
+/*
+ * Prints the help text on stderr and exits with EXIT_FAILURE; does not
+ * return.
+ */
+void
+usage()
+{
+ (void) fprintf(stderr, "%s", g_usage);
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * SIGALRM handler installed by main(). It intentionally does nothing: it only
+ * needs to exist so that delivery of the interval timer's signal makes
+ * sigsuspend() in main()'s loop return.
+ */
+/*ARGSUSED*/
+void
+intr(int sig)
+{}
+
+/*
+ * vndstat [interval [count]]
+ *
+ * Samples the vnd kstats every 'interval' seconds (default 1) and prints one
+ * line per device after each sample except the first, which only serves as
+ * the baseline for the per-second rates. Stops after 'count' reports, or
+ * runs (practically) forever when no count is given.
+ *
+ * Both operands must be positive decimal integers that fit in an int;
+ * anything else prints the usage message and exits.
+ */
+/*ARGSUSED*/
+int
+main(int argc, char **argv)
+{
+	kstat_ctl_t *kcp;
+	kstat_instance_t *instances = NULL;
+	int i = 0;
+	int interval = 1;
+	int count = INT32_MAX;
+	struct itimerval itimer;
+	struct sigaction act;
+	sigset_t set;
+	char *endp;
+	long val;
+
+	kstat_field_t fields[] = {
+		{ "name", "linkname", 6, KSTAT_FIELD_STRING },
+		{ "|", NULL, 1, KSTAT_FIELD_FILLER },
+		{ "rx B/s", "rbytes", 8, KSTAT_FIELD_UNIT, "B/s" },
+		{ "|", NULL, 1, KSTAT_FIELD_FILLER },
+		{ "tx B/s", "obytes", 8, KSTAT_FIELD_UNIT, "B/s" },
+		{ "|", NULL, 1, KSTAT_FIELD_FILLER },
+		{ "drops", "total_drops", 5 },
+		{ "txfc", "flowcontrol_events", 4 },
+		{ "|", NULL, 1, KSTAT_FIELD_FILLER },
+		{ "zone", "zonename", 36,
+		    KSTAT_FIELD_STRING | KSTAT_FIELD_LJUST },
+		{ NULL }
+	};
+
+	/*
+	 * Parse into a long and range-check before narrowing to int, so that
+	 * an out-of-range operand is rejected instead of being silently
+	 * truncated into a seemingly valid value.
+	 */
+	if (argc > 1) {
+		errno = 0;
+		val = strtol(argv[1], &endp, 10);
+
+		if (errno != 0 || *endp != '\0' || val <= 0 ||
+		    val > INT32_MAX)
+			usage();
+
+		interval = (int)val;
+	}
+
+	if (argc > 2) {
+		errno = 0;
+		val = strtol(argv[2], &endp, 10);
+
+		if (errno != 0 || *endp != '\0' || val <= 0 ||
+		    val > INT32_MAX)
+			usage();
+
+		count = (int)val;
+	}
+
+	if ((kcp = kstat_open()) == NULL)
+		fatal("could not open /dev/kstat");
+
+	/* Install a no-op SIGALRM handler so the timer can wake us up. */
+	(void) sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = intr;
+	(void) sigaction(SIGALRM, &act, NULL);
+
+	/*
+	 * Keep SIGALRM blocked outside of sigsuspend() so that a tick which
+	 * fires while we are busy sampling stays pending instead of being
+	 * lost.
+	 */
+	(void) sigemptyset(&set);
+	(void) sigaddset(&set, SIGALRM);
+	(void) sigprocmask(SIG_BLOCK, &set, NULL);
+
+	bzero(&itimer, sizeof (itimer));
+	itimer.it_value.tv_sec = interval;
+	itimer.it_interval.tv_sec = interval;
+
+	if (setitimer(ITIMER_REAL, &itimer, NULL) != 0) {
+		fatal("could not set timer to %d second%s", interval,
+		    interval == 1 ? "" : "s");
+	}
+
+	/* Empty mask for sigsuspend(): every signal is allowed in there. */
+	(void) sigemptyset(&set);
+
+	for (;;) {
+		kstat_instances_update(kcp, &instances, interested);
+		kstat_instances_read(kcp, instances, fields);
+
+		/*
+		 * The first pass only establishes the baseline. With exactly
+		 * one device the header is repeated every 20 reports;
+		 * otherwise it precedes every report.
+		 */
+		if (i++ > 0) {
+			kstat_instances_print(instances, fields,
+			    instances != NULL && instances->ksi_next == NULL ?
+			    (((i - 2) % 20) == 0) : B_TRUE);
+		}
+
+		if (i > count)
+			break;
+
+		(void) sigsuspend(&set);
+	}
+
+	/* Reached once the requested number of reports has been printed. */
+	return (0);
+}
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index 265d45030c..291f8c5a83 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -157,7 +157,8 @@ SUBDIRS += \
librdc \
libinstzones \
libpkg \
- libpcidb
+ libpcidb \
+ libvnd
SUBDIRS += \
passwdutil \
@@ -463,6 +464,7 @@ HDRSUBDIRS= \
libumem \
libunistat \
libuutil \
+ libvnd \
libwanboot \
libwanbootutil \
libwrap \
diff --git a/usr/src/lib/libdlpi/common/libdlpi.c b/usr/src/lib/libdlpi/common/libdlpi.c
index bda2648955..1ec147270a 100644
--- a/usr/src/lib/libdlpi/common/libdlpi.c
+++ b/usr/src/lib/libdlpi/common/libdlpi.c
@@ -22,6 +22,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc.
+ */
/*
* Data-Link Provider Interface (Version 2)
@@ -51,7 +54,7 @@
#include "libdlpi_impl.h"
-static int i_dlpi_open(const char *, int *, uint_t, boolean_t);
+static int i_dlpi_open(const char *, const char *, int *, uint_t, boolean_t);
static int i_dlpi_style1_open(dlpi_impl_t *);
static int i_dlpi_style2_open(dlpi_impl_t *);
static int i_dlpi_checkstyle(dlpi_impl_t *, t_uscalar_t);
@@ -130,7 +133,8 @@ dlpi_walk(dlpi_walkfunc_t *fn, void *arg, uint_t flags)
}
int
-dlpi_open(const char *linkname, dlpi_handle_t *dhp, uint_t flags)
+dlpi_open_zone(const char *linkname, const char *zonename, dlpi_handle_t *dhp,
+ uint_t flags)
{
int retval, on = 1;
ifspec_t ifsp;
@@ -164,6 +168,16 @@ dlpi_open(const char *linkname, dlpi_handle_t *dhp, uint_t flags)
if (getenv("DLPI_DEVONLY") != NULL)
dip->dli_oflags |= DLPI_DEVONLY;
+ if (zonename == NULL) {
+ dip->dli_zonename[0] = '\0';
+ } else {
+ if (strlcpy(dip->dli_zonename, zonename,
+ sizeof (dip->dli_zonename)) >= sizeof (dip->dli_zonename)) {
+ free(dip);
+ return (DLPI_EZONENAMEINVAL);
+ }
+ }
+
/* Copy linkname provided to the function. */
if (strlcpy(dip->dli_linkname, linkname, sizeof (dip->dli_linkname)) >=
sizeof (dip->dli_linkname)) {
@@ -237,6 +251,12 @@ dlpi_open(const char *linkname, dlpi_handle_t *dhp, uint_t flags)
return (DLPI_SUCCESS);
}
+int
+dlpi_open(const char *linkname, dlpi_handle_t *dhp, uint_t flags)
+{
+ return (dlpi_open_zone(linkname, NULL, dhp, flags));
+}
+
void
dlpi_close(dlpi_handle_t dh)
{
@@ -1013,6 +1033,15 @@ dlpi_iftype(uint_t dlpitype)
* /dev - if DLPI_DEVONLY is specified, or if there is no
* data-link with the specified name (could be /dev/ip)
*
+ * If a zone's name has been specified, eg. via dlpi_open_zone, then we instead
+ * will check in:
+ *
+ * /dev/ipnet/zone/%z/ - if DLPI_DEVIPNET is specified
+ * /dev/net/zone/%z/ - if a data-link with the specified name exists.
+ *
+ * When a zone name is specified, all of the fallback procedures that we opt for
+ * in the normal case are not used.
+ *
* In particular, if DLPI_DEVIPNET is not specified, this function is used to
* open a data-link node, or "/dev/ip" node. It is usually be called firstly
* with style1 being B_TRUE, and if that fails and the return value is not
@@ -1040,7 +1069,8 @@ dlpi_iftype(uint_t dlpitype)
* the second style-2 open attempt.
*/
static int
-i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1)
+i_dlpi_open(const char *provider, const char *zonename, int *fd, uint_t flags,
+ boolean_t style1)
{
char path[MAXPATHLEN];
int oflags;
@@ -1051,7 +1081,13 @@ i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1)
oflags |= O_EXCL;
if (flags & DLPI_DEVIPNET) {
- (void) snprintf(path, sizeof (path), "/dev/ipnet/%s", provider);
+ if (*zonename != '\0') {
+ (void) snprintf(path, sizeof (path),
+ "/dev/ipnet/zone/%s/%s", zonename, provider);
+ } else {
+ (void) snprintf(path, sizeof (path), "/dev/ipnet/%s",
+ provider);
+ }
if ((*fd = open(path, oflags)) != -1)
return (DLPI_SUCCESS);
else
@@ -1070,7 +1106,13 @@ i_dlpi_open(const char *provider, int *fd, uint_t flags, boolean_t style1)
if (dlpi_parselink(provider, driver, &ppa) != DLPI_SUCCESS)
goto fallback;
- (void) snprintf(path, sizeof (path), "/dev/net/%s", provider);
+ if (*zonename != '\0') {
+ (void) snprintf(path, sizeof (path),
+ "/dev/net/zone/%s/%s", zonename, provider);
+ } else {
+ (void) snprintf(path, sizeof (path), "/dev/net/%s",
+ provider);
+ }
if ((*fd = open(path, oflags)) != -1)
return (DLPI_SUCCESS);
@@ -1130,7 +1172,8 @@ i_dlpi_style1_open(dlpi_impl_t *dip)
int retval, save_errno;
int fd;
- retval = i_dlpi_open(dip->dli_linkname, &fd, dip->dli_oflags, B_TRUE);
+ retval = i_dlpi_open(dip->dli_linkname, dip->dli_zonename, &fd,
+ dip->dli_oflags, B_TRUE);
if (retval != DLPI_SUCCESS)
return (retval);
dip->dli_fd = fd;
@@ -1153,7 +1196,8 @@ i_dlpi_style2_open(dlpi_impl_t *dip)
int fd;
int retval, save_errno;
- retval = i_dlpi_open(dip->dli_provider, &fd, dip->dli_oflags, B_FALSE);
+ retval = i_dlpi_open(dip->dli_provider, dip->dli_zonename, &fd,
+ dip->dli_oflags, B_FALSE);
if (retval != DLPI_SUCCESS)
return (retval);
dip->dli_fd = fd;
@@ -1571,7 +1615,8 @@ static const char *libdlpi_errlist[] = {
/* DLPI_ENOTENOTSUP */
"invalid DLPI notification type", /* DLPI_ENOTEINVAL */
"invalid DLPI notification id", /* DLPI_ENOTEIDINVAL */
- "DLPI_IPNETINFO not supported" /* DLPI_EIPNETINFONOTSUP */
+ "DLPI_IPNETINFO not supported", /* DLPI_EIPNETINFONOTSUP */
+ "invalid zone name" /* DLPI_EZONENAMEINVAL */
};
const char *
diff --git a/usr/src/lib/libdlpi/common/libdlpi.h b/usr/src/lib/libdlpi/common/libdlpi.h
index 993ac1b7a4..364413ee3a 100644
--- a/usr/src/lib/libdlpi/common/libdlpi.h
+++ b/usr/src/lib/libdlpi/common/libdlpi.h
@@ -93,6 +93,7 @@ enum {
DLPI_ENOTENOTSUP, /* DLPI notification not supported by link */
DLPI_ENOTEIDINVAL, /* invalid DLPI notification id */
DLPI_EIPNETINFONOTSUP, /* DLPI_IPNETINFO not supported */
+ DLPI_EZONENAMEINVAL, /* invalid zone name */
DLPI_ERRMAX /* Highest + 1 libdlpi error code */
};
@@ -184,6 +185,7 @@ typedef boolean_t dlpi_walkfunc_t(const char *, void *);
extern void dlpi_walk(dlpi_walkfunc_t *, void *, uint_t);
extern int dlpi_open(const char *, dlpi_handle_t *, uint_t);
+extern int dlpi_open_zone(const char *, const char *, dlpi_handle_t *, uint_t);
extern void dlpi_close(dlpi_handle_t);
extern int dlpi_info(dlpi_handle_t, dlpi_info_t *, uint_t);
extern int dlpi_bind(dlpi_handle_t, uint_t, uint_t *);
diff --git a/usr/src/lib/libdlpi/common/libdlpi_impl.h b/usr/src/lib/libdlpi/common/libdlpi_impl.h
index 70708ff5af..8969cce7cb 100644
--- a/usr/src/lib/libdlpi/common/libdlpi_impl.h
+++ b/usr/src/lib/libdlpi/common/libdlpi_impl.h
@@ -28,6 +28,7 @@
#include <libdlpi.h>
#include <sys/sysmacros.h>
+#include <sys/zone.h>
#ifdef __cplusplus
extern "C" {
@@ -112,6 +113,8 @@ typedef struct dlpi_impl_s {
/* full linkname including PPA */
char dli_provider[DLPI_LINKNAME_MAX];
/* only provider name */
+ char dli_zonename[ZONENAME_MAX];
+ /* optionally specified zone */
t_uscalar_t dli_style; /* style 1 or 2 */
uint_t dli_saplen; /* bound SAP length */
uint_t dli_sap; /* bound SAP value */
diff --git a/usr/src/lib/libdlpi/common/mapfile-vers b/usr/src/lib/libdlpi/common/mapfile-vers
index ed3231dc92..c818e5e660 100644
--- a/usr/src/lib/libdlpi/common/mapfile-vers
+++ b/usr/src/lib/libdlpi/common/mapfile-vers
@@ -67,6 +67,11 @@ SYMBOL_VERSION SUNW_1.1 { # first release of libdlpi, Solaris 11
SYMBOL_VERSION SUNWprivate {
global:
+ #
+ # dlpi_open_zone should be moved to a new public section once it is
+ # upstreamed into illumos-gate.
+ #
+ dlpi_open_zone;
dlpi_parselink;
dlpi_makelink;
dlpi_style;
diff --git a/usr/src/lib/libdtrace/Makefile.com b/usr/src/lib/libdtrace/Makefile.com
index f10edcf878..42ab0e4b78 100644
--- a/usr/src/lib/libdtrace/Makefile.com
+++ b/usr/src/lib/libdtrace/Makefile.com
@@ -86,6 +86,7 @@ DLIBSRCS += \
io.d \
ip.d \
iscsit.d \
+ mac.d \
net.d \
nfs.d \
nfssrv.d \
@@ -98,7 +99,8 @@ DLIBSRCS += \
sysevent.d \
tcp.d \
udp.d \
- unistd.d
+ unistd.d \
+ vnd.d
include ../../Makefile.lib
@@ -111,6 +113,7 @@ CLEANFILES += dt_lex.c dt_grammar.c dt_grammar.h y.output
CLEANFILES += ../common/procfs.sed ../common/procfs.d
CLEANFILES += ../common/io.sed ../common/io.d
CLEANFILES += ../common/ip.sed ../common/ip.d
+CLEANFILES += ../common/mac.sed ../common/mac.d
CLEANFILES += ../common/net.sed ../common/net.d
CLEANFILES += ../common/errno.d ../common/signal.d
CLEANFILES += ../common/dt_errtags.c ../common/dt_names.c
@@ -203,6 +206,9 @@ pics/dt_lex.o pics/dt_grammar.o := CCVERBOSE =
../common/ip.d: ../common/ip.sed ../common/ip.d.in
sed -f ../common/ip.sed < ../common/ip.d.in > $@
+../common/mac.d: ../common/mac.sed ../common/mac.d.in
+ sed -f ../common/mac.sed < ../common/mac.d.in > $@
+
../common/net.d: ../common/net.sed ../common/net.d.in
sed -f ../common/net.sed < ../common/net.d.in > $@
diff --git a/usr/src/lib/libdtrace/common/mac.d.in b/usr/src/lib/libdtrace/common/mac.d.in
new file mode 100644
index 0000000000..6263d51bdd
--- /dev/null
+++ b/usr/src/lib/libdtrace/common/mac.d.in
@@ -0,0 +1,66 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#pragma D depends_on library ip.d
+
+/*
+ * Ethertype constants. Each @NAME@ placeholder is replaced with the value of
+ * the corresponding C macro when this file is run through mac.sed to produce
+ * mac.d.
+ */
+inline int ETHERTYPE_PUP = @ETHERTYPE_PUP@;
+inline int ETHERTYPE_802_MIN = @ETHERTYPE_802_MIN@;
+inline int ETHERTYPE_IP = @ETHERTYPE_IP@;
+inline int ETHERTYPE_ARP = @ETHERTYPE_ARP@;
+inline int ETHERTYPE_REVARP = @ETHERTYPE_REVARP@;
+inline int ETHERTYPE_AT = @ETHERTYPE_AT@;
+inline int ETHERTYPE_AARP = @ETHERTYPE_AARP@;
+inline int ETHERTYPE_VLAN = @ETHERTYPE_VLAN@;
+inline int ETHERTYPE_IPV6 = @ETHERTYPE_IPV6@;
+inline int ETHERTYPE_SLOW = @ETHERTYPE_SLOW@;
+inline int ETHERTYPE_PPPOED = @ETHERTYPE_PPPOED@;
+inline int ETHERTYPE_PPPOES = @ETHERTYPE_PPPOES@;
+inline int ETHERTYPE_EAPOL = @ETHERTYPE_EAPOL@;
+inline int ETHERTYPE_RSN_PREAUTH = @ETHERTYPE_RSN_PREAUTH@;
+inline int ETHERTYPE_TRILL = @ETHERTYPE_TRILL@;
+inline int ETHERTYPE_FCOE = @ETHERTYPE_FCOE@;
+inline int ETHERTYPE_MAX = @ETHERTYPE_MAX@;
+
+
+/*
+ * Decoded view of an Ethernet header, produced from an mblk_t by the
+ * translator that follows. The VLAN-related members are only meaningful when
+ * eth_istagged is set; the translator sets them to 0 otherwise.
+ */
+typedef struct etherinfo {
+ uint8_t eth_dst[6]; /* Destination MAC addr */
+ uint8_t eth_src[6]; /* Source MAC addr */
+ uint16_t eth_type; /* Ethertype */
+ boolean_t eth_istagged; /* Is the VLAN tag present */
+ uint8_t eth_priority; /* Priority tag */
+ uint8_t eth_dei; /* drop eligible indicator */
+ uint16_t eth_vlanid; /* VLAN ID */
+ uintptr_t eth_header; /* Pointer to start of header */
+ uintptr_t eth_mblk; /* Pointer to the mblk containing header */
+} etherinfo_t;
+
+/*
+ * Translates an mblk_t whose b_rptr points at the start of an Ethernet header
+ * into an etherinfo_t.
+ *
+ * Offsets from b_rptr: destination address at 0, source address at 6 and the
+ * type/TPID field at 12. When that field equals ETHERTYPE_VLAN the frame is
+ * tagged: the 16-bit tag control word is at offset 14 and the real ethertype
+ * at offset 16. The tag control word holds the priority in its top three
+ * bits (0xe000), the drop eligible indicator in the next bit (0x1000) and
+ * the VLAN ID in the low twelve bits (0x0fff).
+ *
+ * eth_priority and eth_dei are uint8_t members, so their bits are shifted
+ * down to the least significant position; storing the masked value unshifted
+ * would always yield 0 once it is narrowed to eight bits.
+ */
+#pragma D binding "1.12.1" translator
+translator etherinfo_t < mblk_t *mp > {
+	eth_dst = mp->b_rptr;
+	eth_src = mp->b_rptr + 6;
+	eth_type = ntohs(*(uint16_t *)(mp->b_rptr + 12)) == ETHERTYPE_VLAN ?
+	    ntohs(*(uint16_t *)(mp->b_rptr + 16)) :
+	    ntohs(*(uint16_t *)(mp->b_rptr + 12));
+	eth_istagged = ntohs(*(uint16_t *)(mp->b_rptr + 12)) == ETHERTYPE_VLAN ?
+	    1 : 0;
+	eth_priority = ntohs(*(uint16_t *)(mp->b_rptr + 12)) == ETHERTYPE_VLAN ?
+	    (ntohs(*(uint16_t *)(mp->b_rptr + 14)) & 0xe000) >> 13 : 0;
+	eth_dei = ntohs(*(uint16_t *)(mp->b_rptr + 12)) == ETHERTYPE_VLAN ?
+	    (ntohs(*(uint16_t *)(mp->b_rptr + 14)) & 0x1000) >> 12 : 0;
+	eth_vlanid = ntohs(*(uint16_t *)(mp->b_rptr + 12)) == ETHERTYPE_VLAN ?
+	    ntohs(*(uint16_t *)(mp->b_rptr + 14)) & 0x0fff : 0;
+	eth_header = (uintptr_t)mp->b_rptr;
+	eth_mblk = (uintptr_t)mp;
+};
diff --git a/usr/src/lib/libdtrace/common/mac.sed.in b/usr/src/lib/libdtrace/common/mac.sed.in
new file mode 100644
index 0000000000..00e149d000
--- /dev/null
+++ b/usr/src/lib/libdtrace/common/mac.sed.in
@@ -0,0 +1,45 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+
+/*
+ * This file is a sed script which is first preprocessed by cpp or cc -E to
+ * define a set of sed directives which replace #define tokens with their
+ * values. After preprocessing, the sed script is run over mac.d.in to
+ * replace the #define tokens listed below to create the finished mac.d.
+ * Refer to the rules in libdtrace/Makefile.com for more information.
+ */
+
+#include <sys/ethernet.h>
+
+/*
+ * SED_REPLACE(x) expands to a sed substitute command whose pattern is the
+ * stringified macro name (#x) and whose replacement is the macro's expanded
+ * value (x). The ETHERTYPE_* names below are not defined in this file;
+ * presumably they come from <sys/ethernet.h>.
+ * NOTE(review): mac.d.in spells its placeholders as @NAME@ while #x yields
+ * "NAME"; presumably the build rule that preprocesses this file rewrites the
+ * quotes -- confirm against libdtrace/Makefile.com.
+ */
+#define SED_REPLACE(x) s/#x/x/g
+
+SED_REPLACE(ETHERTYPE_PUP)
+SED_REPLACE(ETHERTYPE_802_MIN)
+SED_REPLACE(ETHERTYPE_IP)
+SED_REPLACE(ETHERTYPE_ARP)
+SED_REPLACE(ETHERTYPE_REVARP)
+SED_REPLACE(ETHERTYPE_AT)
+SED_REPLACE(ETHERTYPE_AARP)
+SED_REPLACE(ETHERTYPE_VLAN)
+SED_REPLACE(ETHERTYPE_IPV6)
+SED_REPLACE(ETHERTYPE_SLOW)
+SED_REPLACE(ETHERTYPE_PPPOED)
+SED_REPLACE(ETHERTYPE_PPPOES)
+SED_REPLACE(ETHERTYPE_EAPOL)
+SED_REPLACE(ETHERTYPE_RSN_PREAUTH)
+SED_REPLACE(ETHERTYPE_TRILL)
+SED_REPLACE(ETHERTYPE_FCOE)
+SED_REPLACE(ETHERTYPE_MAX)
diff --git a/usr/src/lib/libdtrace/common/vnd.d b/usr/src/lib/libdtrace/common/vnd.d
new file mode 100644
index 0000000000..356c412150
--- /dev/null
+++ b/usr/src/lib/libdtrace/common/vnd.d
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+
+#pragma D depends_on module vnd
+#pragma D depends_on provider vnd
+#pragma D depends_on library ip.d
+#pragma D depends_on library mac.d
+
+/*
+ * Present a vnd_str_t as an ifinfo_t. A NULL vsp yields the name "<null>"
+ * and an ipstack of 0; if_local is always 0 and if_addr always carries the
+ * raw vsp pointer.
+ */
+#pragma D binding "1.6.3" translator
+translator ifinfo_t < vnd_str_t *vsp > {
+ if_name = vsp != NULL ? stringof(vsp->vns_dev->vdd_lname) : "<null>";
+ if_local = 0;
+ if_ipstack = vsp != NULL ? vsp->vns_nsd->vpnd_nsid : 0;
+ if_addr = (uintptr_t)vsp;
+};
diff --git a/usr/src/lib/libvnd/Makefile b/usr/src/lib/libvnd/Makefile
new file mode 100644
index 0000000000..0b3f923806
--- /dev/null
+++ b/usr/src/lib/libvnd/Makefile
@@ -0,0 +1,42 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+include ../Makefile.lib
+
+HDRS = libvnd.h
+HDRDIR = common
+# $(MACH) is always built. $(MACH64) is appended through the $(BUILD64)
+# prefix -- presumably empty on 64-bit-capable builds and a comment
+# character otherwise; confirm in Makefile.master.
+SUBDIRS = $(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+# Conditional macros: record which target to hand down to the subdirectories.
+all := TARGET = all
+clean := TARGET = clean
+clobber := TARGET = clobber
+install := TARGET = install
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber install lint: $(SUBDIRS)
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+# Recurse into each subdirectory with the selected $(TARGET). FRC has no
+# prerequisites or recipe, so depending on it makes this rule run every time.
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/lib/libvnd/Makefile.com b/usr/src/lib/libvnd/Makefile.com
new file mode 100644
index 0000000000..3c6896de11
--- /dev/null
+++ b/usr/src/lib/libvnd/Makefile.com
@@ -0,0 +1,39 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# LIBRARY, VERS and OBJECTS are defined first so that Makefile.lib, which is
+# included exactly once, sees them.
+#
+LIBRARY = libvnd.a
+VERS = .1
+OBJECTS = libvnd.o
+
+include ../../Makefile.lib
+
+# Build the $(DYNLIB) and $(LINTLIB) flavors and link against libc.
+LIBS = $(DYNLIB) $(LINTLIB)
+LDLIBS += -lc
+CPPFLAGS += -I../common
+
+SRCDIR = ../common
+
+# C99 mode flags for the compiler (C99MODE) and for lint (C99LMODE).
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: lintcheck
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/libvnd/amd64/Makefile b/usr/src/lib/libvnd/amd64/Makefile
new file mode 100644
index 0000000000..15d904c616
--- /dev/null
+++ b/usr/src/lib/libvnd/amd64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Shared libvnd definitions first, then the 64-bit library settings.
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+# install depends on all plus the 64-bit library, link and lint targets.
+install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT64)
diff --git a/usr/src/lib/libvnd/common/libvnd.c b/usr/src/lib/libvnd/common/libvnd.c
new file mode 100644
index 0000000000..8972f6cf5a
--- /dev/null
+++ b/usr/src/lib/libvnd/common/libvnd.c
@@ -0,0 +1,550 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdio.h>
+#include <zone.h>
+#include <assert.h>
+#include <sys/sysmacros.h>
+
+#include <sys/vnd.h>
+#include <libvnd.h>
+
+/*
+ * Library handle: the open vnd device descriptor together with the error
+ * state recorded by the most recent failing operation on it.
+ */
+struct vnd_handle {
+ int vh_fd; /* descriptor used for ioctls; closed by vnd_close() */
+ uint32_t vh_errno; /* vnd error code, reported by vnd_errno() */
+ int vh_syserr; /* saved errno, reported by vnd_syserrno() */
+};
+
+/*
+ * Messages indexed directly by vnd_errno_t value, so the order here has to
+ * match the order of the error codes. vnd_strerror() clamps anything at or
+ * beyond VND_E_UNKNOWN to the final entry.
+ */
+static const char *vnd_strerror_tbl[] = {
+ "no error", /* VND_E_SUCCESS */
+ "not enough memory available", /* VND_E_NOMEM */
+ "no such datalink", /* VND_E_NODATALINK */
+ "datalink not of type DL_ETHER", /* VND_E_NOTETHER */
+ "unknown dlpi failure", /* VND_E_DLPIINVAL */
+ "DL_ATTACH_REQ failed", /* VND_E_ATTACHFAIL */
+ "DL_BIND_REQ failed", /* VND_E_BINDFAIL */
+ "DL_PROMISCON_REQ failed", /* VND_E_PROMISCFAIL */
+ "DLD_CAPAB_DIRECT enable failed", /* VND_E_DIRECTFAIL */
+ "bad datalink capability", /* VND_E_CAPACKINVAL */
+ "bad datalink subcapability", /* VND_E_SUBCAPINVAL */
+ "bad dld version", /* VND_E_DLDBADVERS */
+ "failed to create kstats", /* VND_E_KSTATCREATE */
+ "no such vnd link", /* VND_E_NODEV */
+ "netstack doesn't exist", /* VND_E_NONETSTACK */
+ "device already associated", /* VND_E_ASSOCIATED */
+ "device already attached", /* VND_E_ATTACHED */
+ "device already linked", /* VND_E_LINKED */
+ "invalid name", /* VND_E_BADNAME */
+ "permission denied", /* VND_E_PERM */
+ "no such zone", /* VND_E_NOZONE */
+ "failed to initialize vnd stream module", /* VND_E_STRINIT */
+ "device not attached", /* VND_E_NOTATTACHED */
+ "device not linked", /* VND_E_NOTLINKED */
+ "another device has the same link name", /* VND_E_LINKEXISTS */
+ "failed to create minor node", /* VND_E_MINORNODE */
+ "requested buffer size is too large", /* VND_E_BUFTOOBIG */
+ "requested buffer size is too small", /* VND_E_TOOSMALL */
+ "unable to obtain exclusive access to dlpi link, link busy",
+ /* VND_E_DLEXCL */
+ "DLD direct capability not supported over data link",
+ /* VND_E_DIRECTNOTSUP */
+ "invalid property size", /* VND_E_BADPROPSIZE */
+ "invalid property", /* VND_E_BADPROP */
+ "property is read only", /* VND_E_PROPRDONLY */
+ "unexpected system error", /* VND_E_SYS */
+ "capabilities invalid, pass-through module detected",
+ /* VND_E_CAPABPASS */
+ "unknown error" /* VND_E_UNKNOWN */
+};
+
+/* Return the vnd error code currently recorded on the handle. */
+vnd_errno_t
+vnd_errno(vnd_handle_t *vhp)
+{
+ return (vhp->vh_errno);
+}
+
+/*
+ * Map a vnd error code to its message. Any value at or beyond
+ * VND_E_UNKNOWN yields the VND_E_UNKNOWN entry of the table.
+ */
+const char *
+vnd_strerror(vnd_errno_t err)
+{
+ if (err >= VND_E_UNKNOWN)
+ err = VND_E_UNKNOWN;
+ return (vnd_strerror_tbl[err]);
+}
+
+/* Return the system errno value currently recorded on the handle. */
+int
+vnd_syserrno(vnd_handle_t *vhp)
+{
+ return (vhp->vh_syserr);
+}
+
+/* Return the strerror() message for a system errno value. */
+const char *
+vnd_strsyserror(int err)
+{
+ return (strerror(err));
+}
+
+/*
+ * Record the outcome of a failed ioctl on the handle; always returns -1.
+ *
+ * err is the error field the caller read back from the ioctl argument. If
+ * it is not VND_E_SUCCESS it is stored as the vnd error with no system
+ * error. Otherwise the failure is recorded as VND_E_SYS together with the
+ * current errno, except that EFAULT aborts the process instead.
+ */
+static int
+vnd_ioc_return(vnd_handle_t *vhp, uint32_t err)
+{
+ if (err != VND_E_SUCCESS) {
+ vhp->vh_errno = err;
+ vhp->vh_syserr = 0;
+ } else {
+ if (errno == EFAULT)
+ abort();
+ vhp->vh_errno = VND_E_SYS;
+ vhp->vh_syserr = errno;
+ }
+ return (-1);
+}
+
+/*
+ * Close the handle's descriptor (when it is non-negative) and free the
+ * handle. close() is asserted to succeed. The handle must not be used
+ * after this call.
+ */
+void
+vnd_close(vnd_handle_t *vhp)
+{
+ int ret;
+
+ if (vhp->vh_fd >= 0) {
+ ret = close(vhp->vh_fd);
+ assert(ret == 0);
+ }
+ free(vhp);
+}
+
+/*
+ * Issue VND_IOC_LINK with the given name on the handle's descriptor.
+ *
+ * Returns 0 on success and -1 on failure. On every failure path the
+ * handle's vnd/system error values describe what went wrong, so callers can
+ * report them via vhp->vh_errno and vhp->vh_syserr.
+ */
+static int
+vnd_link(vnd_handle_t *vhp, const char *name)
+{
+	vnd_ioc_link_t vil;
+
+	if (strlen(name) >= VND_NAMELEN) {
+		/*
+		 * Record the failure on the handle: vnd_create() copies its
+		 * error out of the handle, and an untouched (zeroed) handle
+		 * would be reported as VND_E_SUCCESS. errno is still set as
+		 * before for any caller that looks at it.
+		 */
+		vhp->vh_errno = VND_E_BADNAME;
+		vhp->vh_syserr = 0;
+		errno = ENAMETOOLONG;
+		return (-1);
+	}
+
+	(void) strlcpy(vil.vil_name, name, sizeof (vil.vil_name));
+	/* Start from "no vnd error" so a generic errno failure is detectable. */
+	vil.vil_errno = VND_E_SUCCESS;
+	if (ioctl(vhp->vh_fd, VND_IOC_LINK, &vil) != 0)
+		return (vnd_ioc_return(vhp, vil.vil_errno));
+
+	return (0);
+}
+
+/*
+ * Allocate a zeroed handle and open /dev/vnd/ctl read-write on it.
+ *
+ * Returns the handle, or NULL on failure. On failure *vnderr / *syserr
+ * (each only when non-NULL) are set to VND_E_NOMEM / 0 if the allocation
+ * failed, or VND_E_SYS / errno if the open failed. They are not written on
+ * success.
+ */
+static vnd_handle_t *
+vnd_open_ctl(vnd_errno_t *vnderr, int *syserr)
+{
+ int fd;
+ vnd_handle_t *vhp;
+
+ vhp = malloc(sizeof (vnd_handle_t));
+ if (vhp == NULL) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_NOMEM;
+ if (syserr != NULL)
+ *syserr = 0;
+ return (NULL);
+ }
+ bzero(vhp, sizeof (vnd_handle_t));
+
+ fd = open("/dev/vnd/ctl", O_RDWR);
+ if (fd < 0) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_SYS;
+ if (syserr != NULL)
+ *syserr = errno;
+ free(vhp);
+ return (NULL);
+ }
+
+ vhp->vh_fd = fd;
+ return (vhp);
+}
+
+/*
+ * Create a vnd device: open the control node, attach it to `datalink` with
+ * VND_IOC_ATTACH (using the id of zone `zonename`, or a zone id of -1 when
+ * zonename is NULL), then link it under `linkname`.
+ *
+ * Returns the new handle, or NULL on failure. When non-NULL, *vnderr and
+ * *syserr receive the vnd and system error values; on success they are set
+ * to VND_E_SUCCESS and 0.
+ */
+vnd_handle_t *
+vnd_create(const char *zonename, const char *datalink, const char *linkname,
+ vnd_errno_t *vnderr, int *syserr)
+{
+ int ret;
+ vnd_handle_t *vhp;
+ vnd_ioc_attach_t via;
+ zoneid_t zid;
+
+ if (strlen(datalink) >= VND_NAMELEN) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_BADNAME;
+ if (syserr != NULL)
+ *syserr = 0;
+ return (NULL);
+ }
+
+ vhp = vnd_open_ctl(vnderr, syserr);
+ if (vhp == NULL)
+ return (NULL); /* *vnderr and *syserr set for us */
+
+ if (zonename != NULL) {
+ zid = getzoneidbyname(zonename);
+ if (zid == -1) {
+ vnd_close(vhp);
+ if (vnderr != NULL)
+ *vnderr = VND_E_NOZONE;
+ if (syserr != NULL)
+ *syserr = 0;
+ return (NULL);
+ }
+ via.via_zoneid = zid;
+ } else {
+ via.via_zoneid = -1;
+ }
+
+ (void) strlcpy(via.via_name, datalink, sizeof (via.via_name));
+ via.via_errno = VND_E_SUCCESS;
+ if (ioctl(vhp->vh_fd, VND_IOC_ATTACH, &via) != 0) {
+ /* A vnd error from the ioctl wins; otherwise report errno. */
+ if (via.via_errno != VND_E_SUCCESS) {
+ if (vnderr != NULL)
+ *vnderr = via.via_errno;
+ if (syserr != NULL)
+ *syserr = 0;
+ } else {
+ if (vnderr != NULL)
+ *vnderr = VND_E_SYS;
+ if (syserr != NULL)
+ *syserr = errno;
+ }
+ vnd_close(vhp);
+ return (NULL);
+ }
+
+ ret = vnd_link(vhp, linkname);
+ if (ret != 0) {
+ /* The link failure is reported from the handle's error state. */
+ if (vnderr != NULL)
+ *vnderr = vhp->vh_errno;
+ if (syserr != NULL)
+ *syserr = vhp->vh_syserr;
+ vnd_close(vhp);
+ return (NULL);
+ }
+
+ if (vnderr != NULL)
+ *vnderr = VND_E_SUCCESS;
+ if (syserr != NULL)
+ *syserr = 0;
+
+ return (vhp);
+}
+
+/*
+ * Open the existing vnd device `link`, looked up under
+ * /dev/vnd/zone/<zone>/ when zone is non-NULL and under /dev/vnd/ otherwise.
+ *
+ * Returns a new handle, or NULL on failure. When non-NULL, *vnderr and
+ * *syserr receive the vnd and system error values: VND_E_BADNAME / 0 if the
+ * path does not fit, VND_E_SYS / errno if open() fails, VND_E_NOMEM / 0 if
+ * the handle cannot be allocated, and VND_E_SUCCESS / 0 on success (the
+ * same convention vnd_create() follows).
+ */
+vnd_handle_t *
+vnd_open(const char *zone, const char *link, vnd_errno_t *vnderr, int *syserr)
+{
+	int fd, ret;
+	char path[MAXPATHLEN];
+	vnd_handle_t *vhp;
+
+	if (zone != NULL)
+		ret = snprintf(path, sizeof (path), "/dev/vnd/zone/%s/%s",
+		    zone, link);
+	else
+		ret = snprintf(path, sizeof (path), "/dev/vnd/%s", link);
+
+	/* Reject both an snprintf() failure and a truncated path. */
+	if (ret < 0 || (size_t)ret >= sizeof (path)) {
+		if (vnderr != NULL)
+			*vnderr = VND_E_BADNAME;
+		if (syserr != NULL)
+			*syserr = 0;
+		return (NULL);
+	}
+
+	fd = open(path, O_RDWR);
+	if (fd < 0) {
+		if (vnderr != NULL)
+			*vnderr = VND_E_SYS;
+		if (syserr != NULL)
+			*syserr = errno;
+		return (NULL);
+	}
+
+	vhp = malloc(sizeof (vnd_handle_t));
+	if (vhp == NULL) {
+		if (vnderr != NULL)
+			*vnderr = VND_E_NOMEM;
+		if (syserr != NULL)
+			*syserr = 0;
+		/* Do not leak the descriptor when no handle can own it. */
+		ret = close(fd);
+		assert(ret == 0);
+		return (NULL);
+	}
+
+	bzero(vhp, sizeof (vnd_handle_t));
+	vhp->vh_fd = fd;
+
+	/* Leave the out-parameters defined on success as well. */
+	if (vnderr != NULL)
+		*vnderr = VND_E_SUCCESS;
+	if (syserr != NULL)
+		*syserr = 0;
+
+	return (vhp);
+}
+
+/*
+ * Issue VND_IOC_UNLINK on the handle's descriptor. Returns 0 on success;
+ * on failure returns -1 with the error recorded on the handle.
+ */
+int
+vnd_unlink(vnd_handle_t *vhp)
+{
+ vnd_ioc_unlink_t viu;
+ viu.viu_errno = VND_E_SUCCESS;
+
+ if (ioctl(vhp->vh_fd, VND_IOC_UNLINK, &viu) != 0)
+ return (vnd_ioc_return(vhp, viu.viu_errno));
+
+ return (0);
+}
+
+/* Return the handle's underlying file descriptor. */
+int
+vnd_pollfd(vnd_handle_t *vhp)
+{
+ return (vhp->vh_fd);
+}
+
+/*
+ * Invoke func(info, arg) for each device reported by VND_IOC_LIST on the
+ * control node. The ioctl is issued twice: first with no entry buffer to
+ * learn the count (vl_actents), then with a buffer sized for that many
+ * entries. The walk stops at the first callback that returns non-zero.
+ *
+ * Returns 0 when every callback returned 0 or there were no devices, 1 when
+ * a callback stopped the walk, and -1 on error. Only the error paths write
+ * *vnderr and *syserr (each when non-NULL).
+ */
+int
+vnd_walk(vnd_walk_cb_t func, void *arg, vnd_errno_t *vnderr, int *syserr)
+{
+ vnd_handle_t *vhp;
+ vnd_ioc_list_t vl;
+ vnd_ioc_info_t *viip;
+ int i, ret;
+
+ vl.vl_nents = 0;
+ vl.vl_ents = NULL;
+
+ vhp = vnd_open_ctl(vnderr, syserr);
+ if (vhp == NULL)
+ return (-1); /* *vnderr and *syserr are set for us */
+
+ /* VND_IOC_LIST only returns generic errnos */
+ if (ioctl(vhp->vh_fd, VND_IOC_LIST, &vl) != 0) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_SYS;
+ if (syserr != NULL)
+ *syserr = errno;
+ /*
+ * Called only for its abort-on-EFAULT check: the handle it
+ * updates is closed right below and never read again.
+ */
+ (void) vnd_ioc_return(vhp, VND_E_SUCCESS);
+ vnd_close(vhp);
+
+ return (-1);
+ }
+
+ if (vl.vl_actents == 0) {
+ vnd_close(vhp);
+ return (0);
+ }
+
+ viip = malloc(sizeof (vnd_ioc_info_t) * vl.vl_actents);
+ if (viip == NULL) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_NOMEM;
+ if (syserr != NULL)
+ *syserr = 0;
+ vnd_close(vhp);
+ return (-1);
+ }
+
+ vl.vl_nents = vl.vl_actents;
+ vl.vl_ents = viip;
+
+ if (ioctl(vhp->vh_fd, VND_IOC_LIST, &vl) != 0) {
+ if (vnderr != NULL)
+ *vnderr = VND_E_SYS;
+ if (syserr != NULL)
+ *syserr = errno;
+ /* As above: kept for the EFAULT abort only. */
+ (void) vnd_ioc_return(vhp, VND_E_SUCCESS);
+ free(viip);
+ vnd_close(vhp);
+ return (-1);
+ }
+
+ ret = 0;
+ /*
+ * Visit only entries that were both requested and reported; presumably
+ * vl_actents can differ from the first call if devices came or went.
+ */
+ for (i = 0; i < MIN(vl.vl_nents, vl.vl_actents); i++) {
+ if (func((vnd_info_t *)(viip + i), arg) != 0) {
+ ret = 1;
+ break;
+ }
+ }
+
+ free(viip);
+ vnd_close(vhp);
+
+ return (ret);
+}
+
+/* Record VND_E_PROPRDONLY (no system error) on the handle and return -1. */
+static int
+vnd_prop_readonly(vnd_handle_t *vhp)
+{
+ vhp->vh_syserr = 0;
+ vhp->vh_errno = VND_E_PROPRDONLY;
+ return (-1);
+}
+
+/*
+ * Getter for the buffer-size style properties: issue ioctl `cmd` with a
+ * vnd_ioc_buf_t and copy the returned size into the caller's
+ * vnd_prop_buf_t. len is unused here. Returns 0, or -1 with the error
+ * recorded on the handle.
+ */
+/*ARGSUSED*/
+static int
+vnd_prop_getbuf(vnd_handle_t *vhp, int cmd, void *buf, size_t len)
+{
+ vnd_ioc_buf_t vib;
+ vnd_prop_buf_t *vpbp = (vnd_prop_buf_t *)buf;
+ vib.vib_errno = 0;
+
+ if (ioctl(vhp->vh_fd, cmd, &vib) != 0)
+ return (vnd_ioc_return(vhp, vib.vib_errno));
+
+ vpbp->vpb_size = vib.vib_size;
+ return (0);
+}
+
+/*
+ * Setter for the buffer-size style properties: pass the size from the
+ * caller's vnd_prop_buf_t to ioctl `cmd` in a vnd_ioc_buf_t. len is unused
+ * here. Returns 0, or -1 with the error recorded on the handle.
+ */
+/*ARGSUSED*/
+static int
+vnd_prop_setbuf(vnd_handle_t *vhp, int cmd, void *buf, size_t len)
+{
+ vnd_ioc_buf_t vib;
+ vnd_prop_buf_t *vpbp = (vnd_prop_buf_t *)buf;
+
+ vib.vib_errno = 0;
+ vib.vib_size = vpbp->vpb_size;
+ if (ioctl(vhp->vh_fd, cmd, &vib) != 0)
+ return (vnd_ioc_return(vhp, vib.vib_errno));
+
+ return (0);
+}
+
+/*
+ * Property dispatch table. Each entry names a property, the exact buffer
+ * size callers must pass, the ioctl commands for get and set, and the
+ * handlers that issue them. Read-only properties have a NULL vpt_set (and a
+ * set ioctl of -1). The VND_PROP_MAX entry terminates the table.
+ */
+typedef int (*vpt_prop_f)(vnd_handle_t *, int, void *, size_t);
+typedef struct vnd_prop_tab {
+ vnd_prop_t vpt_prop;
+ size_t vpt_size;
+ int vpt_ioctl_get;
+ int vpt_ioctl_set;
+ vpt_prop_f vpt_get;
+ vpt_prop_f vpt_set;
+} vnd_prop_tab_t;
+
+static vnd_prop_tab_t vnd_props[] = {
+ { VND_PROP_RXBUF, sizeof (vnd_prop_buf_t), VND_IOC_GETRXBUF,
+ VND_IOC_SETRXBUF, vnd_prop_getbuf, vnd_prop_setbuf},
+ { VND_PROP_TXBUF, sizeof (vnd_prop_buf_t), VND_IOC_GETTXBUF,
+ VND_IOC_SETTXBUF, vnd_prop_getbuf, vnd_prop_setbuf },
+ { VND_PROP_MAXBUF, sizeof (vnd_prop_buf_t), VND_IOC_GETMAXBUF,
+ -1, vnd_prop_getbuf, NULL },
+ { VND_PROP_MINTU, sizeof (vnd_prop_buf_t), VND_IOC_GETMINTU,
+ -1, vnd_prop_getbuf, NULL },
+ { VND_PROP_MAXTU, sizeof (vnd_prop_buf_t), VND_IOC_GETMAXTU,
+ -1, vnd_prop_getbuf, NULL },
+ { VND_PROP_MAX }
+};
+
+/*
+ * Look prop up in vnd_props and dispatch to its getter (get == B_TRUE) or
+ * setter. Returns the handler's result, or -1 with the handle's error set
+ * to VND_E_BADPROPSIZE when len differs from the table's size,
+ * VND_E_PROPRDONLY when setting a property that has no setter, or
+ * VND_E_BADPROP when prop is not in the table.
+ */
+static int
+vnd_prop(vnd_handle_t *vhp, vnd_prop_t prop, void *buf, size_t len,
+ boolean_t get)
+{
+ vnd_prop_tab_t *vpt;
+
+ for (vpt = vnd_props; vpt->vpt_prop != VND_PROP_MAX; vpt++) {
+ if (vpt->vpt_prop != prop)
+ continue;
+
+ if (len != vpt->vpt_size) {
+ vhp->vh_errno = VND_E_BADPROPSIZE;
+ vhp->vh_syserr = 0;
+ return (-1);
+ }
+
+ if (get == B_TRUE) {
+ return (vpt->vpt_get(vhp, vpt->vpt_ioctl_get, buf,
+ len));
+ } else {
+ if (vpt->vpt_set == NULL)
+ return (vnd_prop_readonly(vhp));
+ return (vpt->vpt_set(vhp, vpt->vpt_ioctl_set, buf,
+ len));
+ }
+ }
+
+ vhp->vh_errno = VND_E_BADPROP;
+ vhp->vh_syserr = 0;
+ return (-1);
+}
+
+/* Read property prop into buf; len must equal the property's buffer size. */
+int
+vnd_prop_get(vnd_handle_t *vhp, vnd_prop_t prop, void *buf, size_t len)
+{
+ return (vnd_prop(vhp, prop, buf, len, B_TRUE));
+}
+
+/* Set property prop from buf; len must equal the property's buffer size. */
+int
+vnd_prop_set(vnd_handle_t *vhp, vnd_prop_t prop, void *buf, size_t len)
+{
+ return (vnd_prop(vhp, prop, buf, len, B_FALSE));
+}
+
+/*
+ * Report through *write whether prop has a setter in vnd_props. Returns 0
+ * when prop is found; returns -1 and leaves *write untouched otherwise.
+ */
+int
+vnd_prop_writeable(vnd_prop_t prop, boolean_t *write)
+{
+ vnd_prop_tab_t *vpt;
+
+ for (vpt = vnd_props; vpt->vpt_prop != VND_PROP_MAX; vpt++) {
+ if (vpt->vpt_prop != prop)
+ continue;
+
+ *write = (vpt->vpt_set != NULL);
+ return (0);
+ }
+
+ return (-1);
+}
+
+/*
+ * Call func(vhp, prop, arg) for every property value from 0 up to, but not
+ * including, VND_PROP_MAX. Returns 1 as soon as a callback returns
+ * non-zero, and 0 if all of them return 0.
+ */
+int
+vnd_prop_iter(vnd_handle_t *vhp, vnd_prop_iter_f func, void *arg)
+{
+ int i;
+
+ for (i = 0; i < VND_PROP_MAX; i++) {
+ if (func(vhp, i, arg) != 0)
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Issue VND_IOC_FRAMEIO_READ with fiop and return the ioctl's result. When
+ * that result is -1 the handle records VND_E_SYS and the current errno.
+ */
+int
+vnd_frameio_read(vnd_handle_t *vhp, frameio_t *fiop)
+{
+ int ret;
+
+ ret = ioctl(vhp->vh_fd, VND_IOC_FRAMEIO_READ, fiop);
+ if (ret == -1) {
+ vhp->vh_errno = VND_E_SYS;
+ vhp->vh_syserr = errno;
+ }
+
+ return (ret);
+}
+
+/*
+ * Issue VND_IOC_FRAMEIO_WRITE with fiop and return the ioctl's result. When
+ * that result is -1 the handle records VND_E_SYS and the current errno.
+ */
+int
+vnd_frameio_write(vnd_handle_t *vhp, frameio_t *fiop)
+{
+ int ret;
+
+ ret = ioctl(vhp->vh_fd, VND_IOC_FRAMEIO_WRITE, fiop);
+ if (ret == -1) {
+ vhp->vh_errno = VND_E_SYS;
+ vhp->vh_syserr = errno;
+ }
+
+ return (ret);
+}
diff --git a/usr/src/lib/libvnd/common/libvnd.h b/usr/src/lib/libvnd/common/libvnd.h
new file mode 100644
index 0000000000..ea92f113b6
--- /dev/null
+++ b/usr/src/lib/libvnd/common/libvnd.h
@@ -0,0 +1,84 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _LIBVND_H
+#define _LIBVND_H
+
+/*
+ * libvnd interfaces
+ */
+
+#include <stdint.h>
+#include <sys/vnd_errno.h>
+#include <sys/frameio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBVND_NAMELEN 32
+
+typedef struct vnd_handle vnd_handle_t;
+
+extern vnd_handle_t *vnd_create(const char *, const char *, const char *,
+ vnd_errno_t *, int *);
+extern vnd_handle_t *vnd_open(const char *, const char *, vnd_errno_t *, int *);
+extern int vnd_unlink(vnd_handle_t *);
+extern void vnd_close(vnd_handle_t *);
+extern vnd_errno_t vnd_errno(vnd_handle_t *);
+extern int vnd_syserrno(vnd_handle_t *);
+extern const char *vnd_strerror(vnd_errno_t);
+extern const char *vnd_strsyserror(int);
+
+extern int vnd_pollfd(vnd_handle_t *);
+
+/*
+ * Description of one vnd device as handed to a vnd_walk() callback.
+ * NOTE(review): libvnd.c casts its vnd_ioc_info_t entries straight to this
+ * type, so the two layouts presumably have to stay identical -- confirm
+ * against <sys/vnd.h> before changing either.
+ */
+typedef struct vnd_info {
+ uint32_t vi_version;
+ zoneid_t vi_zone;
+ char vi_name[LIBVND_NAMELEN];
+ char vi_datalink[LIBVND_NAMELEN];
+} vnd_info_t;
+
+typedef int (*vnd_walk_cb_t)(vnd_info_t *, void *);
+extern int vnd_walk(vnd_walk_cb_t, void *, vnd_errno_t *, int *);
+
+/*
+ * Property identifiers accepted by vnd_prop_get(), vnd_prop_set() and
+ * vnd_prop_writeable(). VND_PROP_MAX is not a property: it is the count of
+ * properties and the end marker used when iterating.
+ */
+typedef enum vnd_prop {
+ VND_PROP_RXBUF = 0,
+ VND_PROP_TXBUF,
+ VND_PROP_MAXBUF,
+ VND_PROP_MINTU,
+ VND_PROP_MAXTU,
+ VND_PROP_MAX
+} vnd_prop_t;
+
+/* Value buffer for the size-valued properties; pass its sizeof as the length. */
+typedef struct vnd_prop_buf {
+ uint64_t vpb_size;
+} vnd_prop_buf_t;
+
+extern int vnd_prop_get(vnd_handle_t *, vnd_prop_t, void *, size_t);
+extern int vnd_prop_set(vnd_handle_t *, vnd_prop_t, void *, size_t);
+extern int vnd_prop_writeable(vnd_prop_t, boolean_t *);
+
+typedef int (*vnd_prop_iter_f)(vnd_handle_t *, vnd_prop_t, void *);
+extern int vnd_prop_iter(vnd_handle_t *, vnd_prop_iter_f, void *);
+
+extern int vnd_frameio_read(vnd_handle_t *, frameio_t *);
+extern int vnd_frameio_write(vnd_handle_t *, frameio_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBVND_H */
diff --git a/usr/src/lib/libvnd/common/llib-lvnd b/usr/src/lib/libvnd/common/llib-lvnd
new file mode 100644
index 0000000000..80a4229e32
--- /dev/null
+++ b/usr/src/lib/libvnd/common/llib-lvnd
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/* LINTLIBRARY */
+/* PROTOLIB1 */
+
+#include <libvnd.h>
diff --git a/usr/src/lib/libvnd/common/mapfile-vers b/usr/src/lib/libvnd/common/mapfile-vers
new file mode 100644
index 0000000000..0eb862ab60
--- /dev/null
+++ b/usr/src/lib/libvnd/common/mapfile-vers
@@ -0,0 +1,55 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+#
+# TODO When this makes it into illumos we should make it a public interface
+#
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ vnd_create;
+ vnd_close;
+ vnd_errno;
+ vnd_frameio_read;
+ vnd_frameio_write;
+ vnd_open;
+ vnd_pollfd;
+ vnd_prop_get;
+ vnd_prop_iter;
+ vnd_prop_set;
+ vnd_prop_writeable;
+ vnd_strerror;
+ vnd_strsyserror;
+ vnd_syserrno;
+ vnd_unlink;
+ vnd_walk;
+ local:
+ *;
+};
diff --git a/usr/src/lib/libvnd/i386/Makefile b/usr/src/lib/libvnd/i386/Makefile
new file mode 100644
index 0000000000..41e699e8f8
--- /dev/null
+++ b/usr/src/lib/libvnd/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+# Shared libvnd definitions; this directory adds only the install rule.
+include ../Makefile.com
+
+# install depends on all plus the library, link and lint targets.
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/man/Makefile b/usr/src/man/Makefile
index 4435991c35..56d72ee3c0 100644
--- a/usr/src/man/Makefile
+++ b/usr/src/man/Makefile
@@ -79,6 +79,7 @@ SUBDIRS= man1 \
man3tsol \
man3uuid \
man3volmgt \
+ man3vnd \
man3xcurses \
man3xnet \
man4 \
diff --git a/usr/src/man/Makefile.man b/usr/src/man/Makefile.man
index 64c3d10284..0f348011a3 100644
--- a/usr/src/man/Makefile.man
+++ b/usr/src/man/Makefile.man
@@ -27,7 +27,10 @@ FILEMODE= 0444
ROOTMANFILES= $(MANFILES:%=$(ROOTMAN)/man$(MANSECT)/%)
ROOTMANLINKS= $(MANLINKS:%=$(ROOTMAN)/man$(MANSECT)/%)
-$(ROOTMAN)/man$(MANSECT)/% $(ROOTHASMAN)/man$(MANSECT)/%: %
+$(ROOTMAN)/man$(MANSECT) $(ROOTHASMAN)/man$(MANSECT):
+ $(INS.dir)
+
+$(ROOTMAN)/man$(MANSECT)/% $(ROOTHASMAN)/man$(MANSECT)/%: % $(ROOTMAN)/man$(MANSECT) $(ROOTHASMAN)/man$(MANSECT)
$(INS.file)
$(MANLINKS):
diff --git a/usr/src/man/man1m/Makefile b/usr/src/man/man1m/Makefile
index 7de1f80e73..d9d901e7c7 100644
--- a/usr/src/man/man1m/Makefile
+++ b/usr/src/man/man1m/Makefile
@@ -13,7 +13,7 @@
# Copyright 2011, Richard Lowe
# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
#
include $(SRC)//Makefile.master
@@ -566,6 +566,8 @@ _MANFILES= 6to4relay.1m \
uusched.1m \
uuxqt.1m \
vmstat.1m \
+ vndadm.1m \
+ vndstat.1m \
volcopy.1m \
volcopy_ufs.1m \
vscanadm.1m \
diff --git a/usr/src/man/man1m/snoop.1m b/usr/src/man/man1m/snoop.1m
index ca969e22b4..29d9485a9c 100644
--- a/usr/src/man/man1m/snoop.1m
+++ b/usr/src/man/man1m/snoop.1m
@@ -1,9 +1,10 @@
'\" te
.\" Copyright (C) 2009, Sun Microsystems, Inc. All Rights Reserved
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH SNOOP 1M "Feb 18, 2009"
+.TH SNOOP 1M "Feb 24, 2014"
.SH NAME
snoop \- capture and inspect network packets
.SH SYNOPSIS
@@ -12,7 +13,7 @@ snoop \- capture and inspect network packets
\fBsnoop\fR [\fB-aqrCDINPSvV\fR] [\fB-t\fR [r | a | d]] [\fB-c\fR \fImaxcount\fR]
[\fB-d\fR \fIdevice\fR] [\fB-i\fR \fIfilename\fR] [\fB-n\fR \fIfilename\fR] [\fB-o\fR \fIfilename\fR]
[\fB-p\fR \fIfirst\fR [, \fIlast\fR]] [\fB-s\fR \fIsnaplen\fR] [\fB-x\fR \fIoffset\fR [, \fIlength\fR]]
- [\fIexpression\fR]
+ [\fB-z\fR \fIzonename\fR] [\fIexpression\fR]
.fi
.SH DESCRIPTION
@@ -298,6 +299,23 @@ the whole packet, use an \fIoffset\fR of 0. If a \fIlength\fR value is not
provided, the rest of the packet is displayed.
.RE
+.sp
+.ne 2
+.na
+\fB-z\fR \fIzonename\fR
+.ad
+.sp .6
+.RS 4n
+Open an earlier datalink specified via
+.B -d
+or
+.B -I
+in the specified zone
+.IR zonename .
+This option is only meaningful in the global zone and
+allows the global zone to inspect datalinks of non-global zones.
+.RE
+
.SH OPERANDS
.sp
.ne 2
diff --git a/usr/src/man/man1m/vndadm.1m b/usr/src/man/man1m/vndadm.1m
new file mode 100644
index 0000000000..63fb63f9df
--- /dev/null
+++ b/usr/src/man/man1m/vndadm.1m
@@ -0,0 +1,652 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VNDADM 1M "Mar 06, 2014"
+.SH NAME
+vndadm \- administer vnd devices
+
+.SH SYNOPSIS
+
+.nf
+vndadm create [-z zonename] [-l datalink] device
+vndadm destroy [-z zonename] device...
+vndadm list [-p] [-d delim] [-o field,...] [-z zonename] [device]...
+vndadm get [-p] [-d delim] [-z zonename] device [prop]...
+vndadm set [-z zonename] device prop=val...
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The vndadm command is used to administer vnd devices. A vnd device is
+similar to an IP network interface, except that the vnd device operates
+at layer two. A vnd device is created over a data link (see dladm(1M))
+and its address is that of the underlying data link. For ethernet based
+devices, that address would be the MAC address of the data link. vnd
+devices are character devices which may be used to send and receive
+layer two packets. When reading or writing to a vnd device, the full
+frame must be present. This is useful for working with virtual machines,
+or other environments where you need to manipulate the entire layer two
+frame.
+
+.sp
+.LP
+Every command takes a device as an argument. To specify a vnd device,
+you just use the name of the device. Devices are scoped to zones. If no
+zone is specified, the current zone is assumed. A device name can be any
+series of alphanumeric ascii characters which typically match the name
+of the underlying data link. A given vnd device name must be unique in a
+given zone, but the same name can be used across zones.
+.sp
+.LP
+.SH OPTIONS
+.sp
+.LP
+All vndadm subcommands have the following common option:
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+Operate in the context of the specified zone. When creating a vnd
+device, the named device is created in the specified zone. All other
+operations scope the device lookup to the specified zone. If the user is
+not in the global zone, the use of -z will not work.
+
+.sp
+.LP
+When -z is used and multiple devices are specified, then
+the use of -z applies to all of the devices.
+.RE
+
+.SH SUBCOMMANDS
+.sp
+.ne 2
+.na
+vndadm create [-z zonename] [-l datalink] device
+.ad
+.sp
+.RS 4n
+Creates a vnd device with the specified name device. If -l datalink is
+not specified, it is assumed that the data link and the device share the
+same name. The created device will exist for as long as the zone exists
+or until a call to vndadm destroy. vnd devices do not persist across
+system reboots. Note, if an IP interface or another libdlpi(3LIB)
+consumer is already using the data link, then vnd will fail.
+
+.sp
+The maximum length of the name of device is 31 characters. The allowed
+set of characters is alphanumeric characters, ':', \'-', and \'_'. The
+names 'zone' and 'ctl' are reserved and may not be used.
+
+.sp
+.ne 2
+.na
+-l datalink
+.ad
+.sp .6
+.RS 4n
+Specifies the name of the data link to create the device over. This
+allows the vnd device name to be different from the data link's name.
+.RE
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+See OPTIONS above.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.na
+vndadm destroy [-z zonename] device...
+.ad
+.sp
+.RS 4n
+Destroys the specified device. The destruction is analogous to
+unlink(2). If the device is still open and used by applications, the
+device will continue to exist, but it will no longer be accessible by
+the name device.
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+See OPTIONS above.
+.RE
+.RE
+
+.sp
+.ne 2
+.na
+vndadm list [-p] [-d delim] [-o field,...] [-z zonename] [device]...
+.ad
+.sp
+.RS 4n
+Lists active vnd devices. By default, vndadm list lists all devices in
+every zone that the caller is allowed to see; the current zone if in the
+non-global zone, and all zones in the global zone. If device is
+specified one or more times, then output will be limited to the
+specified devices.
+.sp
+.ne 2
+.na
+-o field[,...]
+.ad
+.sp .6
+.RS 4n
+A case-insensitive, comma-separated list of output fields. When -o is
+not used, all of the fields listed below are shown. The field name must
+be one of the following fields:
+
+.sp
+.ne 2
+.na
+NAME
+.ad
+.sp .6
+.RS 4n
+The name of the vnd device.
+.RE
+
+.sp
+.ne 2
+.na
+DATALINK
+.ad
+.sp .6
+.RS 4n
+The name of the data link the vnd device was created over.
+.RE
+
+.sp
+.ne 2
+.na
+ZONENAME
+.ad
+.sp .6
+.RS 4n
+The name of the zone that the vnd device exists in.
+.RE
+.RE
+
+.sp
+.ne 2
+.na
+-p
+.ad
+.sp .6
+.RS 4n
+Display the output in a stable machine parseable format. The -o option
+is required with the -p option. See "Parseable Output Format" below.
+.RE
+
+.sp
+.ne 2
+.na
+-d delim
+.ad
+.sp .6
+.RS 4n
+Change the delimiter used in conjunction with generating parseable
+output. This option may only be specified when -p is also specified.
+.RE
+
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+See OPTIONS above.
+.RE
+
+.RE
+
+
+.sp
+.ne 2
+.na
+vndadm get [-p] [-d delim] [-z zonename] device [prop]...
+.ad
+.sp
+.RS 4n
+Displays the properties for the specified device. By default, all
+properties of a given device are displayed. If prop is specified one or
+more times, then only the specified properties will be displayed for
+device. For a list of properties, see the section "Properties" below.
+The property output consists of the following four columns:
+.sp
+.ne 2
+.na
+LINK
+.ad
+.sp .6
+.RS 4n
+The name of the device
+.RE
+
+.sp
+.ne 2
+.na
+PROPERTY
+.ad
+.sp .6
+.RS 4n
+The name of the property. Note that some properties that are private to
+the implementation may be displayed. Those properties begin with a
+leading underscore.
+.RE
+
+.sp
+.ne 2
+.na
+PERM
+.ad
+.sp .6
+.RS 4n
+Describes whether the property is read-only or
+if it is read-write. This field does not
+indicate if the current user has permission, but
+lists permissions for a privileged user.
+.RE
+
+.sp
+.ne 2
+.na
+VALUE
+.ad
+.sp .6
+.RS 4n
+The value of the property.
+.RE
+
+.sp
+.ne 2
+.na
+-p
+.ad
+.sp .6
+.RS 4n
+Display the output in a stable machine parseable format. See "Parseable
+Output Format" below.
+.RE
+
+.sp
+.ne 2
+.na
+-d delim
+.ad
+.sp .6
+.RS 4n
+Change the delimiter used in conjunction with generating parseable
+output. This option may only be specified when -p is also specified.
+.RE
+
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+See OPTIONS above.
+.RE
+.RE
+
+.sp
+.ne 2
+.na
+vndadm set [-z zonename] device prop=val...
+.ad
+.sp
+.RS 4n
+Sets properties on the named device. Setting a property takes effect for
+all operations on the device, after the program returns. Multiple
+properties can be set at once; however, properties are applied one at a
+time to the device. Property names and values must be separated with an
+equals sign. Additional property and value pairs should be separated by
+white space. For a list of properties, see the section "Properties"
+below.
+
+.sp
+.ne 2
+.na
+-z zonename
+.ad
+.sp .6
+.RS 4n
+See OPTIONS above.
+.RE
+.RE
+
+.SS Parseable Output Format
+.sp
+.LP
+By default, fields in parseable output are separated by a single ASCII
+space character. The delimiter may be changed with the -d option. When
+parseable output is requested, numbers that represent sizes are not
+displayed in human readable form; they are fully expanded. For example,
+the number 42K will instead be displayed as 43008.
+
+.SS Properties
+.sp
+.LP
+The following are supported and stable properties. Note that any
+property that starts with a leading underscore is not a stable
+property and may be removed at any time.
+
+.sp
+.ne 2
+.na
+rxbuf
+.ad
+.sp .6
+.RS 4n
+A read/write property that controls the size of the receive buffer for
+the device. All received data enters the receive buffer until a consumer
+consumes it. If adding a received frame would exceed the size of the
+receive buffer, then that frame will be dropped. The maximum size of the
+buffer is limited by the 'maxsize' property. The minimum size of the
+buffer is the value of the 'maxtu' property. The property's value may be
+anything between that maximum and minimum. When setting this property,
+standard size suffixes such as 'K' and 'M' may be used.
+.RE
+
+.sp
+.ne 2
+.na
+txbuf
+.ad
+.sp .6
+.RS 4n
+A read/write property that controls the size of the transmit buffer. All
+in-flight transmitted data must be able to fit into the transmit buffer
+to account for potential flow control events. If there is not enough
+space in the transmit buffer, transmit related I/O operations will
+either block or fail based on whether the file has been put into
+non-blocking mode by setting O_NONBLOCK or O_NDELAY with fcntl(2). The
+maximum size of the buffer is limited by the 'maxsize' property. The
+minimum size of the buffer is the value of the 'maxtu' property. The
+property's value may be anything between that maximum and minimum. When
+setting this property, standard size suffixes such as 'K' and 'M' may be
+used.
+
+.RE
+
+.sp
+.ne 2
+.na
+maxsize
+.ad
+.sp .6
+.RS 4n
+A read-only property that describes the maximum size of buffers in the
+system. Properties such as rxbuf and txbuf cannot be set beyond this.
+.RE
+
+.sp
+.ne 2
+.na
+mintu
+.ad
+.sp .6
+.RS 4n
+A read-only property that describes the minimum size of a frame
+transmitted to the underlying data link. Note that the minimum listed
+here may be less than the size of a valid layer two frame and therefore
+may be dropped. A frame smaller than this value will be rejected by vnd.
+.RE
+
+.sp
+.ne 2
+.na
+maxtu
+.ad
+.sp .6
+.RS 4n
+A read-only property that describes the maximum size of a frame
+transmitted to the underlying data link. A frame larger than this value
+will be rejected by vnd.
+.RE
+
+.SH EXAMPLES
+.LP
+Example 1 Creating a vnd device
+.sp
+.LP
+To create a vnd device over the VNIC named net0, enter the following
+command:
+
+.sp
+.in +2
+.nf
+# vndadm create net0
+.fi
+.in -2
+.sp
+
+.LP
+Example 2 Creating a vnd device in another zone
+.sp
+.LP
+
+To create a vnd device over the VNIC named net1 in the zone
+1b7155a4-aef9-e7f0-d33c-9705e4b8b525, enter the following command:
+
+.sp
+.in +2
+.nf
+# vndadm create -z 1b7155a4-aef9-e7f0-d33c-9705e4b8b525 net1
+.fi
+.in -2
+.sp
+
+.LP
+Example 3 Destroying a vnd device
+.sp
+.LP
+
+To destroy the vnd device named net0, enter the following command:
+
+.sp
+.in +2
+.nf
+# vndadm destroy net0
+.fi
+.in -2
+.sp
+
+.LP
+Example 4 Destroying a vnd device in another zone
+.sp
+.LP
+
+To destroy the vnd device named net1 in the zone
+1b7155a4-aef9-e7f0-d33c-9705e4b8b525, enter the following command:
+
+.sp
+.in +2
+.nf
+# vndadm destroy -z 1b7155a4-aef9-e7f0-d33c-9705e4b8b525 net1
+.fi
+.in -2
+.sp
+
+.LP
+Example 5 List all vnd devices
+.sp
+.LP
+
+To list all devices, run the following command:
+
+.sp
+.in +2
+.nf
+# vndadm list
+NAME DATALINK ZONENAME
+net0 net0 global
+net0 net0 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+.fi
+.in -2
+.sp
+
+.LP
+Example 6 Listing devices in a specific zone
+.sp
+.LP
+
+To list devices in a specific zone, run the following command:
+
+.sp
+.in +2
+.nf
+# vndadm list -z 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+
+NAME DATALINK ZONENAME
+net0 net0 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+.fi
+.in -2
+.sp
+
+.LP
+Example 7 List all devices in a parseable format
+.sp
+.LP
+
+To list all devices in a parseable format with the delimiter of ':', run
+the following command:
+
+.sp
+.in +2
+.nf
+# vndadm list -p -d: -o name,datalink,zone
+net0:net0:global
+net0:net0:1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+.fi
+.in -2
+.sp
+
+.LP
+Example 8 Retrieving all properties for a device
+.sp
+.LP
+
+To retrieve all of the properties for the vnd device foo0, run the
+following command:
+
+.sp
+.in +2
+.nf
+# vndadm get foo0
+LINK PROPERTY PERM VALUE
+foo0 rxbuf rw 65536
+foo0 txbuf rw 65536
+foo0 maxsize r- 4194304
+foo0 mintu r- 0
+foo0 maxtu r- 1518
+foo0 _nflush rw 10
+foo0 _burstsz rw 10
+.fi
+.in -2
+.sp
+
+.LP
+Example 9 Retrieving specific properties for a device
+.sp
+.LP
+
+To retrieve just the rxbuf and txbuf properties for the vnd device foo0,
+run the following command:
+
+.sp
+.in +2
+.nf
+# vndadm get foo0 rxbuf txbuf
+LINK PROPERTY PERM VALUE
+foo0 rxbuf rw 65536
+foo0 txbuf rw 65536
+.fi
+.in -2
+.sp
+
+.LP
+Example 10 Retrieving properties for a device in a parseable format
+.sp
+.LP
+
+To retrieve all properties for the vnd device foo0 in a parseable
+format, run the following command:
+
+.sp
+.in +2
+.nf
+# vndadm get -p foo0
+foo0 rxbuf rw 65536
+foo0 txbuf rw 65536
+foo0 maxsize r- 4194304
+foo0 mintu r- 0
+foo0 maxtu r- 1518
+foo0 _nflush rw 10
+foo0 _burstsz rw 10
+.fi
+.in -2
+.sp
+
+.LP
+Example 11 Setting a property on a device
+.sp
+.LP
+
+To set the receive buffer size to one megabyte on the device foo0, run
+the following command:
+
+.sp
+.in +2
+.nf
+# vndadm set foo0 rxbuf=1M
+.fi
+.in -2
+.sp
+
+.LP
+Example 12 Setting multiple properties on a device
+.sp
+.LP
+
+To set the receive buffer to 300 KB and the transmit buffer to 1 MB, run
+the following command:
+
+.sp
+.in +2
+.nf
+# vndadm set foo0 rxbuf=300K txbuf=1M
+.fi
+.in -2
+.sp
+
+.SH SEE ALSO
+
+dladm(1M), ipadm(1M), fcntl(2), fcntl.h(3HEAD), libvnd(3LIB),
+vndstat(1M), vnd(7D)
diff --git a/usr/src/man/man1m/vndstat.1m b/usr/src/man/man1m/vndstat.1m
new file mode 100644
index 0000000000..a7f843e228
--- /dev/null
+++ b/usr/src/man/man1m/vndstat.1m
@@ -0,0 +1,163 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VNDSTAT 1M "Mar 06, 2014"
+.SH NAME
+vndstat \- report vnd activity
+
+.SH SYNOPSIS
+
+vndstat [interval [count]]
+
+.SH DESCRIPTION
+.sp
+.LP
+The vndstat command reports a summary of per-device vnd
+activity. Once per interval it prints a table of statistics per
+device. In the global zone, vndstat reports on all devices in the
+system. From the non-global zone, it only reports on devices that are
+present in that zone. vndstat reports on all vnd devices
+that exist, including anonymous devices which are not linked into the
+file system.
+.sp
+.LP
+The vndstat command's output includes the following information:
+.sp
+.ne 2
+.na
+.B name
+.ad
+.RS 14n
+The name of the device, if bound. If a given vnd device is not
+bound into the file system, hence considered anonymous, then there will
+be no name for the device.
+.RE
+
+.sp
+.ne 2
+.na
+.B rx B/s
+.ad
+.RS 14n
+The number of bytes received by the device during interval.
+.RE
+
+.sp
+.ne 2
+.na
+.B tx B/s
+.ad
+.RS 14n
+The number of bytes transmitted by the device during interval.
+.RE
+
+.sp
+.ne 2
+.na
+.B drops
+.ad
+.RS 14n
+The number of packets and messages which have been dropped. This
+includes all drops due to insufficient buffer space, IP hooks, and
+unknown or malformed DLPI messages.
+.RE
+
+.sp
+.ne 2
+.na
+.B txfc
+.ad
+.RS 14n
+The number of flow control events that have occurred. A flow control
+event occurs when the layers below vnd request that all transmits
+be paused until a future call resumes the flow. This statistic is
+incremented when the flow is resumed. It is not incremented when it is
+first paused.
+.RE
+
+.sp
+.ne 2
+.na
+.B zone
+.ad
+.RS 14n
+The name of the zone the device is located in.
+.RE
+
+.SH OPTIONS
+
+.sp
+.ne 2
+.na
+interval
+.ad
+.RS 13n
+Report once each interval seconds. interval may not be
+fractional.
+.RE
+
+.sp
+.ne 2
+.na
+count
+.ad
+.RS 13n
+Only print count reports, then exit.
+.RE
+.sp
+.LP
+When no arguments are given to vndstat, it will always print at an
+interval of one second. Reports will continue until vndstat
+is terminated.
+
+.SH EXAMPLES
+.LP
+Example 1 Print five seconds of data
+
+.sp
+.in +2
+.nf
+example% vndstat 1 5
+ name | rx B/s | tx B/s | drops txfc | zone
+ net0 | 1.45MB/s | 14.1KB/s | 0 0 | 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+ net0 | 3.50MB/s | 19.5KB/s | 0 0 | 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+ net0 | 2.83MB/s | 30.8KB/s | 0 0 | 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+ net0 | 3.08MB/s | 30.6KB/s | 0 0 | 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+ net0 | 3.21MB/s | 30.6KB/s | 0 0 | 1b7155a4-aef9-e7f0-d33c-9705e4b8b525
+.fi
+.in -2
+.sp
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Interface Stability See below.
+.TE
+
+.sp
+.LP
+Invocation is evolving. Human readable output is unstable.
+.SH SEE ALSO
+
+dlstat(1M), nicstat(1M), vndadm(1M), vnd(7D)
diff --git a/usr/src/man/man3dlpi/Makefile b/usr/src/man/man3dlpi/Makefile
index cdd24216bd..4c5448f0be 100644
--- a/usr/src/man/man3dlpi/Makefile
+++ b/usr/src/man/man3dlpi/Makefile
@@ -41,10 +41,12 @@ MANFILES= dlpi_arptype.3dlpi \
dlpi_walk.3dlpi
MANLINKS= dlpi_disabmulti.3dlpi \
+ dlpi_open_zone.3dlpi \
dlpi_promiscoff.3dlpi
dlpi_disabmulti.3dlpi := LINKSRC = dlpi_enabmulti.3dlpi
+# Like its neighbours, LINKSRC is given as a bare file name (no man3dlpi/ prefix).
+dlpi_open_zone.3dlpi := LINKSRC = dlpi_open.3dlpi
dlpi_promiscoff.3dlpi := LINKSRC = dlpi_promiscon.3dlpi
.KEEP_STATE:
diff --git a/usr/src/man/man3dlpi/dlpi_open.3dlpi b/usr/src/man/man3dlpi/dlpi_open.3dlpi
index 8129a75404..489f66066a 100644
--- a/usr/src/man/man3dlpi/dlpi_open.3dlpi
+++ b/usr/src/man/man3dlpi/dlpi_open.3dlpi
@@ -1,9 +1,10 @@
'\" te
.\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH DLPI_OPEN 3DLPI "Nov 17, 2008"
+.TH DLPI_OPEN 3DLPI "Feb 24, 2014"
.SH NAME
dlpi_open \- open DLPI link
.SH SYNOPSIS
@@ -14,6 +15,9 @@ dlpi_open \- open DLPI link
\fBint\fR \fBdlpi_open\fR(\fBconst char *\fR\fIlinkname\fR, \fBdlpi_handle_t *\fR\fIdhp\fR,
\fBuint_t\fR \fIflags\fR);
+
+\fBint\fR \fBdlpi_open_zone\fR(\fBconst char *\fR\fIlinkname\fR, \fBconst char *\fR
+ \fIzonename\fR, \fBdlpi_handle_t *\fR\fIdhp\fR, \fBuint_t\fR \fIflags\fR);
.fi
.SH DESCRIPTION
@@ -114,6 +118,18 @@ value ensures that \fBDLPI_ETIMEDOUT\fR is returned from a \fBlibdlpi\fR
operation only in the event that the \fBDLPI\fR link becomes unresponsive. The
timeout value can be changed with \fBdlpi_set_timeout\fR(3DLPI), although this
should seldom be necessary.
+
+.sp
+.LP
+The \fBdlpi_open_zone()\fR function behaves as \fBdlpi_open()\fR, except that it
+looks for the link specified by \fBlinkname\fR in the specified zone
+\fBzonename\fR as opposed to the current zone. This function is only meaningful
+from the global zone. Instead of scanning \fB/dev/net\fR, \fBdlpi_open_zone()\fR
+scans \fB/dev/net/zone/<\fIzonename\fR> for the data link and
+\fB/dev/ipnet/zone/<\fIzonename\fR> when DLPI_DEVIPNET is present in
+\fBflags\fR. If \fIzonename\fR is NULL or an empty string,
+\fBdlpi_open_zone()\fR will behave as though \fBdlpi_open()\fR had been called.
+
.SH RETURN VALUES
.sp
.LP
@@ -124,7 +140,7 @@ section is returned.
.SH ERRORS
.sp
.LP
-The \fBdlpi_open()\fR function will fail if:
+The \fBdlpi_open()\fR and \fBdlpi_open_zone()\fR functions will fail if:
.sp
.ne 2
.na
@@ -195,6 +211,17 @@ DLPI operation failed
See \fBattributes\fR(5) for description of the following attributes:
.sp
+.LP
+The \fBdlpi_open_zone()\fR function will fail if:
+.sp
+.ne 2
+.na
+\fB\fBDLPI_EZONENAMEINVAL\fR\fR
+.ad
+.RS 25n
+Invalid \fIzonename\fR argument
+.RE
+
.sp
.TS
box;
diff --git a/usr/src/man/man3lib/Makefile b/usr/src/man/man3lib/Makefile
index fe9acebc3a..e36ee916b9 100644
--- a/usr/src/man/man3lib/Makefile
+++ b/usr/src/man/man3lib/Makefile
@@ -103,6 +103,7 @@ MANFILES= libMPAPI.3lib \
libumem.3lib \
libuuid.3lib \
libvolmgt.3lib \
+ libvnd.3lib \
libw.3lib \
libxnet.3lib \
liby.3lib
diff --git a/usr/src/man/man3lib/libvnd.3lib b/usr/src/man/man3lib/libvnd.3lib
new file mode 100644
index 0000000000..a4d47670f0
--- /dev/null
+++ b/usr/src/man/man3lib/libvnd.3lib
@@ -0,0 +1,690 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH LIBVND 3LIB "Mar 06, 2014"
+.SH NAME
+libvnd \- vnd library
+
+.SH SYNOPSIS
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+.fi
+
+.SH DESCRIPTION
+.LP
+The libvnd library provides a stable and programmatic interface to
+vnd(7D) devices. vnd devices provide the means for creating a layer two
+interface over a data link, similar to the use of libdlpi(3LIB) and
+IP(7P). In dlpi parlance, a vnd device obtains data from all service
+attachment points (SAP). For ethernet devices, this means that a vnd
+device sends and receives traffic for all ethertypes. It is intended to
+be used for services such as virtual machines which emulate layer two
+devices.
+
+.LP
+Handles to vnd(7D) devices are obtained through the use of vnd_create
+and vnd_open. With a handle, I/O can be performed and properties on the
+device can be set and retrieved. I/O on devices should be performed
+through the vnd_frameio_read and vnd_frameio_write functions. A file
+descriptor suitable for use with event ports and polling may be obtained
+through vnd_pollfd. Handles are relinquished through calls to vnd_close;
+however, devices will persist until vnd_unlink has been called.
+
+.LP
+The rest of this manual documents the interfaces, properties, errors,
+and threading model for libvnd. The in-depth description of individual
+interfaces, their arguments, and examples, are in manual pages for each
+provided interface.
+
+
+.SH INTERFACES
+.sp
+.LP
+
+The shared object libvnd.so.1 provides the public interfaces defined
+below. See Intro(3) for additional information on shared object
+interfaces. Individual functions are documented in their own manual
+pages.
+
+.sp
+.TS
+l l
+l l .
+vnd_create vnd_errno
+vnd_open vnd_syserrno
+vnd_unlink vnd_strerror
+vnd_close vnd_strsyserror
+vnd_pollfd vnd_walk
+vnd_prop_get vnd_prop_set
+vnd_prop_iter vnd_prop_writeable
+vnd_frameio_read vnd_frameio_write
+.TE
+
+.SH PROPERTIES
+
+.LP
+The following table summarizes properties of a vnd device. The
+properties can be retrieved and set with the functions
+vnd_prop_get(3VND) and vnd_prop_set(3VND). Following the table, the
+structures and properties are described in greater detail.
+
+.nf
+ +-------------------+---------------------+-------+
+ | PROPERTY | STRUCTURE | PERM |
+ +-------------------+---------------------+-------+
+ | VND_PROP_RXBUF | vnd_prop_buf_t | R/W |
+ +-------------------+---------------------+-------+
+ | VND_PROP_TXBUF | vnd_prop_buf_t | R/W |
+ +-------------------+---------------------+-------+
+ | VND_PROP_MAXBUF | vnd_prop_buf_t | R/- |
+ +-------------------+---------------------+-------+
+ | VND_PROP_MINTU | vnd_prop_buf_t | R/- |
+ +-------------------+---------------------+-------+
+ | VND_PROP_MAXTU | vnd_prop_buf_t | R/- |
+ +-------------------+---------------------+-------+
+.fi
+
+.SS Structures
+
+.LP
+The vnd_prop_buf_t structure has the following members:
+
+.in +2
+.nf
+uint64_t vpb_size;
+.fi
+.in -2
+
+.LP
+The vpb_size member refers to a size in bytes. When getting a property,
+it represents the size of that property; when setting a property, it is
+the size to set the property to.
+
+
+.SS Property Descriptions
+.sp
+.ne 2
+.na
+rxbuf
+.ad
+.sp .6
+.RS 4n
+A read/write property that controls the size of the receive buffer for
+the device. All received data enters the receive buffer until a consumer
+consumes it. If adding a received frame would exceed the size of the
+receive buffer, then that frame will be dropped. The maximum size of the
+buffer is limited by the 'maxsize' property.
+.RE
+
+.sp
+.ne 2
+.na
+txbuf
+.ad
+.sp .6
+.RS 4n
+A read/write property that controls the size of the transmit buffer. All
+in-flight transmitted data must be able to fit into the transmit buffer
+to deal with potential flow control events. If there is not enough space
+in the transmit buffer, transmit related I/O operations will either
+block or fail based on whether or not O_NONBLOCK or O_NDELAY were set
+with fcntl(2).
+.RE
+
+.sp
+.ne 2
+.na
+maxsize
+.ad
+.sp .6
+.RS 4n
+A read only property that describes the maximum size of buffers in the
+system. Properties such as rxbuf and txbuf cannot be set beyond this.
+.RE
+
+.sp
+.ne 2
+.na
+mintu
+.ad
+.sp .6
+.RS 4n
+A read only property that describes the minimum size of a frame
+transmitted to the underlying data link. Note that the minimum listed
+here may be less than the size of a valid layer two frame and therefore
+may be dropped. A frame smaller than this value will be rejected by vnd.
+.RE
+
+.sp
+.ne 2
+.na
+maxtu
+.ad
+.sp .6
+.RS 4n
+A read only property that describes the maximum size of
+a frame transmitted to the underlying data link. A frame
+larger than this value will be rejected by vnd.
+.RE
+
+
+.SH ERRORS
+.sp
+.LP
+Most interfaces provided by libvnd provide a means to retrieve a
+vnd_errno_t that describes an error that has occurred. The manuals for
+individual interfaces describe whether or not this additional error
+information is available and how to retrieve it. The following is a
+complete list of the error numbers and their names as defined in
+<sys/vnd_errno.h>. Any entries not listed here are private to the
+implementation and may change at any time.
+
+.sp
+.ne 2
+.na
+0 VND_E_SUCCESS
+.ad
+.RS 23n
+no error
+.sp
+This indicates that the operation completed successfully.
+.RE
+
+.sp
+.ne 2
+.na
+1 VND_E_NOMEM
+.ad
+.RS 23n
+not enough memory available
+.sp
+Insufficient memory was available. This is the equivalent of the
+standard system errno ENOMEM.
+.RE
+
+.sp
+.ne 2
+.na
+2 VND_E_NODATALINK
+.ad
+.RS 23n
+no such datalink
+.sp
+The data link requested to be used as part of vnd_create does not exist
+in the requested zone.
+.RE
+
+.sp
+.ne 2
+.na
+3 VND_E_NOTETHER
+.ad
+.RS 23n
+datalink not of type DL_ETHER
+.sp
+The data link used as part of a call to vnd_create is not an Ethernet
+device. vnd_create only works with Ethernet devices at this time.
+.RE
+
+.sp
+.ne 2
+.na
+4 VND_E_DLPIINVAL
+.ad
+.RS 23n
+unknown dlpi failure
+.sp
+An unexpected DLPI message was received during vnd device
+initialization.
+.RE
+
+.sp
+.ne 2
+.na
+5 VND_E_ATTACHFAIL
+.ad
+.RS 23n
+DL_ATTACH_REQ failed
+.sp
+During vnd device initialization, the dlpi call to attach to the
+requested data link failed.
+.RE
+
+.sp
+.ne 2
+.na
+6 VND_E_BINDFAIL
+.ad
+.RS 23n
+DL_BIND_REQ failed
+.sp
+
+During vnd device initialization, the dlpi call to bind to a service
+attachment point on the data link failed.
+.RE
+
+.sp
+.ne 2
+.na
+7 VND_E_PROMISCFAIL
+.ad
+.RS 23n
+DL_PROMISCON_REQ failed
+.sp
+
+During vnd device initialization, the dlpi call to enable promiscuous
+mode on the underlying device failed.
+.RE
+
+.sp
+.ne 2
+.na
+8 VND_E_DIRECTFAIL
+.ad
+.RS 23n
+DLD_CAPAB_DIRECT enable failed
+.sp
+During vnd device initialization, the dlpi call to enable the DLD fast
+path failed.
+.RE
+
+.sp
+.ne 2
+.na
+9 VND_E_CAPACKINVAL
+.ad
+.RS 23n
+bad datalink capability
+.sp
+During vnd device initialization, the kernel responded with an invalid
+capability acknowledgement.
+.RE
+
+.sp
+.ne 2
+.na
+10 VND_E_SUBCAPINVAL
+.ad
+.RS 23n
+bad datalink subcapability
+.sp
+During vnd device initialization, the kernel responded with an invalid
+sub-capability.
+.RE
+
+.sp
+.ne 2
+.na
+11 VND_E_DLDBADVERS
+.ad
+.RS 23n
+bad dld version
+.sp
+The vnd(7D) module does not support the version of the dld capability
+that the kernel sent. As such, the data path could not be brought up and
+the device could not be fully initialized.
+.RE
+
+.sp
+.ne 2
+.na
+12 VND_E_KSTATCREATE
+.ad
+.RS 23n
+failed to create kstats
+.sp
+During vnd device initialization, the necessary kstats could not be
+created.
+.RE
+
+.sp
+.ne 2
+.na
+13 VND_E_NODEV
+.ad
+.RS 23n
+no such vnd link
+.sp
+During device initialization, the requested character device did not
+exist.
+.RE
+
+.sp
+.ne 2
+.na
+14 VND_E_NONETSTACK
+.ad
+.RS 23n
+netstack doesn't exist
+.sp
+During device initialization, the networking stack for the device did
+not exist.
+.RE
+
+.sp
+.ne 2
+.na
+15 VND_E_ASSOCIATED
+.ad
+.RS 23n
+device already associated
+.sp
+During vnd device initialization, the vnd STREAMS device was already
+associated with another vnd device.
+.RE
+
+.sp
+.ne 2
+.na
+16 VND_E_ATTACHED
+.ad
+.RS 23n
+device already attached
+.sp
+The given vnd device has already been created over a data link and
+cannot be created over another one.
+.RE
+
+.sp
+.ne 2
+.na
+17 VND_E_LINKED
+.ad
+.RS 23n
+device already linked
+.sp
+The given vnd device has already been given a name and bound into the
+file system name space.
+.RE
+
+.sp
+.ne 2
+.na
+18 VND_E_BADNAME
+.ad
+.RS 23n
+invalid name
+.sp
+The requested name is not a valid name. Valid names are alphanumeric
+ascii names, along with the following ascii characters: ':', '\-', and
+\'_'. Names must be less than LIBVND_NAMELEN bytes including the null
+terminator.
+.RE
+
+.sp
+.ne 2
+.na
+19 VND_E_PERM
+.ad
+.RS 23n
+permission denied
+.sp
+A request was made from a non-global zone to manipulate a vnd device
+that belongs to a different zone.
+.RE
+
+.sp
+.ne 2
+.na
+20 VND_E_NOZONE
+.ad
+.RS 23n
+no such zone
+.sp
+A request was made which targeted a zone that did not exist.
+.RE
+
+.sp
+.ne 2
+.na
+21 VND_E_STRINIT
+.ad
+.RS 23n
+failed to initialize vnd stream module
+.sp
+During vnd device initialization, the vnd STREAMS module could not be
+pushed onto the data link's stream head.
+.RE
+
+.sp
+.ne 2
+.na
+22 VND_E_NOTATTACHED
+.ad
+.RS 23n
+device not attached
+.sp
+A request was made that requires a vnd device be attached to a data
+link, such as a call to change a property. The device was not attached
+to a data link.
+.RE
+
+.sp
+.ne 2
+.na
+23 VND_E_NOTLINKED
+.ad
+.RS 23n
+device not linked
+.sp
+A request was made to a vnd device that requires the vnd device to be
+named and present in /dev. The given device was not linked into /dev at
+the time of the call.
+.RE
+
+.sp
+.ne 2
+.na
+24 VND_E_LINKEXISTS
+.ad
+.RS 23n
+another device has the same link name
+.sp
+When trying to link a given vnd device into a zone's /dev name space,
+another device already exists with the same name.
+.RE
+
+.sp
+.ne 2
+.na
+25 VND_E_MINORNODE
+.ad
+.RS 23n
+failed to create minor node
+.sp
+While trying to link a vnd device into the /devices and /dev name space,
+the call to ddi_create_minor_node() failed.
+.RE
+
+.sp
+.ne 2
+.na
+26 VND_E_BUFTOOBIG
+.ad
+.RS 23n
+requested buffer size is too large
+.sp
+The requested buffer size exceeds the maximum valid value for the given
+property.
+.RE
+
+.sp
+.ne 2
+.na
+27 VND_E_BUFTOOSMALL
+.ad
+.RS 23n
+requested buffer size is too small
+.sp
+The requested buffer size is less than the minimum buffer size. This
+generally occurs when making the buffer size less than the maximum
+transmission unit.
+.RE
+
+.sp
+.ne 2
+.na
+28 VND_E_DLEXCL
+.ad
+.RS 23n
+unable to obtain exclusive access to dlpi link, link busy
+.sp
+When a vnd device is created, it expects exclusive active access to the
+device. If any other active dlpi consumers, such as IP, are already
+using the device, then the vnd device will not be created. Passive
+consumers, such as snoop, can still use a device that has been
+exclusively opened.
+.RE
+
+.sp
+.ne 2
+.na
+29 VND_E_DIRECTNOTSUP
+.ad
+.RS 23n
+DLD direct capability not supported over data link
+.sp
+The data link that the vnd device was created over does not support
+the DLD Direct capability. As such, the data path could not be
+initialized.
+.RE
+
+.sp
+.ne 2
+.na
+30 VND_E_BADPROPSIZE
+.ad
+.RS 23n
+invalid property size
+.sp
+The size of the data passed into vnd_prop_get or vnd_prop_set is
+incorrect and does not match the expected data size.
+.RE
+
+.sp
+.ne 2
+.na
+31 VND_E_BADPROP
+.ad
+.RS 23n
+invalid property
+.sp
+An unknown property identifier was specified. For a list of valid
+properties, see the section above entitled "PROPERTIES".
+.RE
+
+.sp
+.ne 2
+.na
+32 VND_E_PROPRDONLY
+.ad
+.RS 23n
+property is read only
+.sp
+An operation tried to update the value of a read only property. For a
+list of which properties are read only and which are readable and
+writeable, see the section above entitled "PROPERTIES".
+.RE
+
+.sp
+.ne 2
+.na
+33 VND_E_SYS
+.ad
+.RS 23n
+unexpected system error
+.sp
+This indicates that there is no vnd specific error available and that
+the system errno is valid. The system errno can be obtained and printed
+through vnd_syserrno and vnd_strsyserror. The possible values and their
+meanings are documented in Intro(2).
+.RE
+
+.sp
+.ne 2
+.na
+34 VND_E_CAPABPASS
+.ad
+.RS 23n
+capabilities invalid, pass-through module detected
+.sp
+While negotiating capabilities, a pass-through module was detected and
+the capability had to be discarded. Because of this, the data path could
+not be initialized.
+.RE
+
+
+.SH THREADING
+
+.LP
+The libvnd library is not truly MT-safe. MT-safety is provided at
+the granularity of a given vnd_handle_t. Concurrent operations on a
+single vnd_handle_t are unsafe; however, concurrent operations on
+different handles are MT-safe. If a single vnd_handle_t is used by
+multiple threads, it is the caller's responsibility to provide locking
+to ensure that multiple threads aren't simultaneously calling into
+libvnd on a single handle.
+
+
+.SH FILES
+.sp
+.ne 2
+.na
+/usr/lib/libvnd.so.1
+.ad
+.RS 27n
+shared object
+.RE
+
+.sp
+.ne 2
+.na
+/usr/lib/64/libvnd.so.1
+.ad
+.RS 27n
+64-bit shared object
+.RE
+
+.SH ATTRIBUTES
+
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING"
+.TE
+
+.SH SEE ALSO
+
+.sp
+.LP
+attributes(5), Intro(2), fcntl(2), Intro(3), fcntl.h(3HEAD), libdlpi(3LIB), port_create(3C), vnd(7D)
+.sp
+.LP
+vnd_close(3VND), vnd_create(3VND), vnd_errno(3VND),
+vnd_frameio_read(3VND), vnd_frameio_write(3VND), vnd_open(3VND),
+vnd_pollfd(3VND), vnd_prop_get(3VND), vnd_prop_iter(3VND),
+vnd_prop_set(3VND),
+vnd_prop_writeable(3VND), vnd_walk(3VND)
diff --git a/usr/src/man/man3vnd/Makefile b/usr/src/man/man3vnd/Makefile
new file mode 100644
index 0000000000..64abf9dcd6
--- /dev/null
+++ b/usr/src/man/man3vnd/Makefile
@@ -0,0 +1,70 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet
+# at http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
+#
+
+include $(SRC)/Makefile.master
+
+MANSECT= 3vnd
+
+MANFILES= vnd_create.3vnd \
+ vnd_errno.3vnd \
+ vnd_frameio_read.3vnd \
+ vnd_pollfd.3vnd \
+ vnd_prop_get.3vnd \
+ vnd_prop_iter.3vnd \
+ vnd_prop_writeable.3vnd \
+ vnd_walk.3vnd
+
+MANLINKS= frameio_t.3vnd \
+ framevec_t.3vnd \
+ vnd_close.3vnd \
+ vnd_frameio_write.3vnd \
+ vnd_open.3vnd \
+ vnd_prop_set.3vnd \
+ vnd_prop_iter_f.3vnd \
+ vnd_strerror.3vnd \
+ vnd_strsyserror.3vnd \
+ vnd_syserrno.3vnd \
+ vnd_unlink.3vnd \
+ vnd_walk_cb_f.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_create.3vnd
+vnd_open.3vnd := LINKSRC = vnd_create.3vnd
+vnd_unlink.3vnd := LINKSRC = vnd_create.3vnd
+vnd_close.3vnd := LINKSRC = vnd_create.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_errno.3vnd
+vnd_strerror.3vnd := LINKSRC = vnd_errno.3vnd
+vnd_syserrno.3vnd := LINKSRC = vnd_errno.3vnd
+vnd_strsyserror.3vnd := LINKSRC = vnd_errno.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_frameio_read.3vnd
+vnd_frameio_write.3vnd := LINKSRC = vnd_frameio_read.3vnd
+framevec_t.3vnd := LINKSRC = vnd_frameio_read.3vnd
+frameio_t.3vnd := LINKSRC = vnd_frameio_read.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_prop_get.3vnd
+vnd_prop_set.3vnd := LINKSRC = vnd_prop_get.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_prop_iter.3vnd
+vnd_prop_iter_f.3vnd := LINKSRC = vnd_prop_iter.3vnd
+
+# MANLINKS entries whose LINKSRC is vnd_walk.3vnd
+vnd_walk_cb_f.3vnd := LINKSRC = vnd_walk.3vnd
+
+.KEEP_STATE:
+
+include $(SRC)/man/Makefile.man
+
+install: $(ROOTMANFILES) $(ROOTMANLINKS)
diff --git a/usr/src/man/man3vnd/vnd_create.3vnd b/usr/src/man/man3vnd/vnd_create.3vnd
new file mode 100644
index 0000000000..d29237a60c
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_create.3vnd
@@ -0,0 +1,280 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_CREATE 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_create, vnd_open, vnd_unlink, vnd_close \- create, open, and destroy
+vnd devices
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+vnd_handle_t *vnd_create(const char *zonename, const char *datalink,
+ const char *linkname, vnd_errno_t *vnderr, int *syserr);
+
+vnd_handle_t *vnd_open(const char *zonename, const char *linkname,
+ vnd_errno_t *vnderr, int *syserr);
+
+int vnd_unlink(vnd_handle_t *vhp);
+
+void vnd_close(vnd_handle_t *vhp);
+.fi
+
+
+.SH DESCRIPTION
+.LP
+These functions create vnd devices, obtain handles to extant vnd
+devices, and close handles to vnd devices, for use with the rest of
+libvnd(3LIB).
+
+.LP
+The vnd_create function creates a new vnd device in the zone specified
+by zonename. The zone name argument may be null, in which case the
+caller's current zone is used instead. The vnd device and data link it
+is created over must both be in the same zone. The datalink argument
+indicates the name of the DLPI data link to create the vnd device over.
+The linkname argument indicates the name of the new vnd device. The
+linkname argument must be less than VND_NAMELEN characters long,
+excluding the null terminator. It should be an alphanumeric string. The
+only non-alphanumeric characters allowed are ':', '-', and '_'.
+Neither the datalink argument nor linkname argument may be NULL. A
+handle to the created device is returned to the caller. Once the
+vnd_create function returns, the device can be subsequently opened with
+a call to vnd_open. The named device persists until a call to vnd_unlink
+or the containing zone is halted. Creating a vnd device requires
+PRIV_SYS_NET_CONFIG as well as PRIV_RAWACCESS. The arguments vnderr and
+syserr are used to obtain errors in the cases where the call to
+vnd_create fails. Both arguments may be NULL pointers, in which case the
+more detailed error information is discarded.
+
+.LP
+The vnd_open function opens an existing vnd device and returns a
+unique handle to that device. The vnd device to open is specified by
+both zonename and linkname. The zonename argument specifies what zone
+to look for the vnd device in. The linkname specifies the name of the
+link. The zonename argument may be NULL. If it is, the current zone is
+used. Similar to vnd_create, the integer values pointed to by the
+arguments vnderr and syserr will be filled in with additional error
+information in the cases where a call to vnd_open fails. Both
+arguments may be NULL to indicate that the error information is not
+requested, though this is not recommended.
+
+.LP
+The vnd_unlink function unlinks the vnd device specified by the vnd
+handle vhp. This unlink is similar to the use of unlink in a file
+system. After a call to vnd_unlink, the vnd device will no longer be
+accessible by callers to vnd_open and the name will be available for
+use in vnd_create. However, the device will continue to exist until
+all handles to the device have been closed.
+
+.LP
+The vnd_close function relinquishes the vnd device referenced by the
+handle vhp. After a call to vnd_close, the handle is invalidated and
+must not be used by the consumer again. The act of calling vnd_close
+on a handle does not remove the device. The device persists until
+vnd_unlink has been called on the device or the containing zone has
+been halted.
+
+.SH RETURN VALUES
+
+.LP
+Upon successful completion, the functions vnd_create and vnd_open
+return a pointer to a vnd_handle_t. This handle is used for all
+subsequent library operations. If either function fails, then a NULL
+pointer is returned and more detailed error information is filled into
+the integers pointed to by vnderr and syserr. The vnderr and syserr
+correspond to the values that would normally be returned by a call to
+vnd_errno(3VND) and vnd_syserrno(3VND). For the full list of possible
+errors see libvnd(3LIB).
+
+.LP
+The vnd_unlink function returns zero on success and -1 on failure. On
+failure, the vnd and system errnos are updated and available through
+the vnd_errno(3VND) and vnd_syserrno(3VND) functions.
+
+.LP
+The vnd_close function does not return any values nor does it set
+vnderr or syserr. The handle passed to vnd_close can no longer be
+used.
+
+.SH EXAMPLES
+.LP
+Example 1 Creating a device
+.sp
+.LP
+
+The following sample C program shows how to create a vnd device over
+an existing datalink named "net0" that other applications can open
+and use as "vnd0".
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr;
+
+ /* Errors are considered fatal */
+ vhp = vnd_create(NULL, "net0", "vnd0", &vnderr, &syserr);
+
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to create device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to create device: %s",
+ vnd_strerror(vnderr));
+ return (1);
+ }
+
+ (void) printf("successfully created vnd0\n");
+ vnd_close(vhp);
+ return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 2 Opening an existing device in another zone
+.sp
+.LP
+
+The following sample C program opens the device named "vnd1" in the zone
+named "turin" for further use.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr, ret;
+
+ vhp = vnd_open("turin", "vnd1", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ return (1);
+ }
+
+ /*
+ * Use the device vnd1 with the handle vhp with any of
+ * the other interfaces documented in libvnd(3LIB) here.
+ *
+ * After an arbitrary amount of code, the program will
+ * set the variable ret with the exit code for the
+ * program and should execute the following code before
+ * returning.
+ */
+ vnd_close(vhp);
+ return (ret);
+}
+.fi
+.in -2
+
+
+.LP
+Example 3 Removing a device
+.sp
+.LP
+
+The following sample C program removes a vnd device named vnd0. This
+program makes it so no additional programs can access the device.
+However, if anyone is actively using it, it will still exist,
+similar to calling unlink(2).
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr, ret;
+
+ vhp = vnd_open(NULL, "vnd0", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ return (1);
+ }
+
+ if (vnd_unlink(vhp) != 0) {
+ if ((vnderr = vnd_errno(vhp)) == VND_E_SYS)
+ (void) fprintf(stderr, "failed to unlink device: %s",
+ vnd_strsyserror(vnd_syserrno(vhp)));
+ else
+ (void) fprintf(stderr, "failed to unlink device: %s",
+ vnd_strerror(vnderr));
+ ret = 1;
+ } else {
+ (void) printf("successfully unlinked vnd0!\n");
+ ret = 0;
+ }
+
+ vnd_close(vhp);
+ return (ret);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING" in libvnd(3LIB)
+.TE
+
+.SH SEE ALSO
+
+libvnd(3LIB), vnd_errno(3VND), vnd_syserrno(3VND), attributes(5), privileges(5)
diff --git a/usr/src/man/man3vnd/vnd_errno.3vnd b/usr/src/man/man3vnd/vnd_errno.3vnd
new file mode 100644
index 0000000000..ddd6126dd1
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_errno.3vnd
@@ -0,0 +1,170 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_ERRNO 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_errno, vnd_syserrno, vnd_strerror, vnd_strsyserror \- obtain and
+translate vnd errors
+
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+vnd_errno_t vnd_errno(vnd_handle_t *vhp);
+
+const char *vnd_strerror(vnd_errno_t err);
+
+int vnd_syserrno(vnd_handle_t *vhp);
+
+const char *vnd_strsyserror(int syserr);
+.fi
+
+.SH DESCRIPTION
+
+.LP
+The libvnd(3LIB) library supports a complementary array of errors that
+give more specific error information than the traditional set of
+system errors available via errno(3C). When an error occurs, consumers
+should call the vnd_errno function first and check its value. If the
+value of the vnd_errno_t is VND_E_SYS, then the system errno should be
+checked. If the vnd_errno_t is not VND_E_SYS, then the contents of the
+system errno returned from vnd_syserrno are undefined. Both the vnd
+and system errors are only valid for a given handle after a libvnd
+library function returned an error and before another libvnd library
+function is called on the same handle. The act of making an additional
+function call with the same vnd_handle_t invalidates any prior vnd or
+system error numbers. For the full list of valid vnd errors see
+libvnd(3LIB). For the full list of valid system errors, see Intro(2).
+
+.LP
+The vnd_errno and vnd_syserrno functions retrieve the most recent vnd
+and system error numbers, respectively, from a vnd handle vhp.
+
+.LP
+The vnd_strerror function translates a vnd_errno_t err to a
+corresponding string and returns a pointer to that constant string.
+
+.LP
+The vnd_strsyserror function is analogous to the vnd_strerror function,
+except that it translates a system error back to a string.
+
+
+.SH RETURN VALUES
+
+.LP
+The vnd_errno function returns a vnd_errno_t which contains the vnd
+error information.
+
+.LP
+The vnd_syserrno function returns an integer which contains the system
+error information. These values are the same as those returned by
+errno(3C).
+
+.LP
+The vnd_strerror function returns a pointer to a constant string. If
+the error passed in is unknown, the string "unknown error" is
+returned.
+
+.LP
+The vnd_strsyserror function returns a pointer to the translated
+constant string. If an unknown error number is passed, it returns the
+string "Unknown error". If an error occurs, it returns a NULL pointer.
+
+.SH EXAMPLES
+
+.LP
+Example 1 Obtaining errors from a vnd_handle_t
+
+.sp
+.LP
+The following sample C function, which can be incorporated into a larger
+program, shows how to obtain the vnd and system errors from a
+vnd_handle_t after a vnd interface on a handle failed.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+
+static void
+print_errnos(vnd_handle_t *vhp)
+{
+ vnd_errno_t vnderr;
+ int syserr;
+
+ vnderr = vnd_errno(vhp);
+ syserr = vnd_syserrno(vhp);
+
+ (void) printf("vnd err: %d, sys err: %d\n",
+ vnderr, syserr);
+}
+.fi
+.in -2
+
+.LP
+Example 2 A perror-like function
+
+.sp
+.LP
+The following sample C function which can be incorporated into a
+larger program shows how to write a perror-like function to print
+out error messages for a vnd device.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+
+static void
+sample_perror(const char *msg, vnd_errno_t vnderr, int syserr)
+{
+ (void) fprintf(stderr, "%s: %s", msg,
+ vnderr != VND_E_SYS ? vnd_strerror(vnderr) :
+ vnd_strsyserror(syserr));
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See below
+.TE
+
+.LP
+The MT-Level of the functions vnd_strerror and vnd_strsyserror is
+MT-Safe. See "THREADING" in libvnd(3LIB) for a discussion of the
+MT-Level of vnd_errno and vnd_syserrno.
+
+
+.SH SEE ALSO
+
+Intro(2), errno(3C), libvnd(3LIB), attributes(5)
diff --git a/usr/src/man/man3vnd/vnd_frameio_read.3vnd b/usr/src/man/man3vnd/vnd_frameio_read.3vnd
new file mode 100644
index 0000000000..df317c7cf3
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_frameio_read.3vnd
@@ -0,0 +1,705 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_FRAMEIO_READ 3VND "Mar 06, 2014"
+
+.SH NAME
+
+vnd_frameio_read, vnd_frameio_write \- perform framed I/O to a vnd device
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+int vnd_frameio_read(vnd_handle_t *vhp, frameio_t *fiop);
+
+int vnd_frameio_write(vnd_handle_t *vhp, frameio_t *fiop);
+.fi
+
+.SH DESCRIPTION
+.LP
+Framed I/O is a general means to manipulate data that is inherently
+framed, meaning that there is a maximum frame size, but the data may
+often be less than that size. As an example, an Ethernet device's MTU
+describes the maximum frame size, but the size of an individual frame
+is often much less. You can read a single frame at a time, or you can
+read multiple frames in a single call.
+
+In addition, framed I/O allows the consumer to break individual frames
+into a series of vectors. This is analogous to the use of an iovec(9S)
+with readv(2) and writev(2).
+
+vnd_frameio_read performs a framed I/O read of the device represented by
+the handle vhp, with the framed I/O data described by fiop.
+vnd_frameio_write works in the same manner, except performing a write
+instead of a read.
+
+.LP
+The basic vector component of the frameio_t is the framevec_t. Each
+framevec_t represents a single vector entry. An array of these is
+present in the frameio_t. The framevec_t structure has the following
+members:
+
+.in +2
+.nf
+void *fv_buf; /* data buffer */
+size_t fv_buflen; /* total size of buffer */
+size_t fv_actlen; /* amount of buffer consumed */
+.fi
+.in -2
+
+.LP
+The fv_buf member points to a buffer which contains the data for this
+individual vector. When reading, data is written into fv_buf. When
+writing, data is consumed from fv_buf.
+
+The fv_buflen should indicate the total amount of data that is in the
+buffer. When reading, it indicates the size of the buffer. It must be
+set prior to calling vnd_frameio_read(). When writing, it indicates the
+amount of data that is valid in the buffer.
+
+The fv_actlen is a read-only member. It is set on successful return of
+the functions vnd_frameio_read and vnd_frameio_write. When reading, it
+is updated with the amount of data that was read into fv_buf. When
+writing, it is instead updated with the amount of data from fv_buf that
+was actually consumed. Generally when writing data, a framevec_t will
+either be entirely consumed or it will not be consumed at all.
+
+
+.LP
+A series of framevec_t's is encapsulated in a frameio_t. The frameio_t
+structure has the following members:
+
+.in +2
+.nf
+uint_t fio_version; /* current version */
+uint_t fio_nvpf; /* number of vectors in one frame */
+uint_t fio_nvecs; /* The total number of vectors */
+framevec_t fio_vecs[]; /* vectors */
+.fi
+.in -2
+
+.LP
+The fio_version member represents the current version of the frameio_t.
+The fio_version should be set to the macro FRAMEIO_CURRENT_VERSION,
+which is currently 1.
+
+The members fio_nvpf and fio_nvecs describe the number of frames that
+exist. fio_nvecs describes the total number of vectors that are present
+in fio_vecs. The upper bound on this is described by FRAMEIO_NVECS_MAX
+which is currently 32. fio_nvpf describes the number of vectors that
+should be used to make up each frame. By setting fio_nvecs to be an even
+multiple of fio_nvpf, multiple frames can be read or written in a single
+call.
+
+After a call to vnd_frameio_read or vnd_frameio_write, fio_nvecs is
+updated with the total number of vectors read or written to. This value can
+be divided by fio_nvpf to determine the total number of frames that were
+written or read.
+
+.LP
+Each frame can be broken down into a series of multiple vectors. As an
+example, someone might want to break Ethernet frames into mac headers
+and payloads. The value of fio_nvpf would be set to two, to indicate
+that a single frame consists of two different vector components. The
+member fio_nvecs describes the total number of vectors. As such, the
+value of fio_nvecs divided by fio_nvpf describes the total number of
+frames that can be consumed in one call. As a result of this, fio_nvpf
+must evenly divide fio_nvecs. If fio_nvpf is set to two and
+fio_nvecs is set to ten, then a total of five frames can be processed
+at once, each frame being broken down into two different vector
+components.
+
+A given frame will never overflow the number of vectors described by
+fio_nvpf. Consider the case where each vector component has a buffer
+sized to 1518 bytes, fio_nvpf is set to one, and fio_nvecs is set to
+three. If a call to vnd_frameio_read is made and four 500 byte Ethernet
+frames come in, then each frame will be mapped to a single vector. The
+500 bytes will be copied into fio_vecs[i].fv_buf and
+fio_vecs[i].fv_actlen will be set to 500. To contrast this, if
+readv(2) had been called, the first three frames would all be in the
+first iov and the fourth frame's first 18 bytes would be in the first
+iov and the remaining in the second.
+
+.LP
+The user must properly initialize fio_nvecs framevec_t's worth of the
+fio_vecs array. When multiple vectors comprise a frame, fv_buflen data
+is consumed before moving onto the next vector. Consider the case
+where the user wants to break a frame into three different
+components, an 18 byte vector for an Ethernet VLAN header, a 20 byte
+vector for an IPv4 header, and a third 1500 byte vector for the
+remaining payload. If a frame was received that only had 30 bytes,
+then the first 18 bytes would fill up the first vector, and the
+remaining 12 bytes would be placed in the second vector. If instead a
+524 byte frame came in, then the first 18 bytes would be placed in the
+first vector, the next 20 bytes would be placed in the second vector,
+and the remaining 486 bytes in the third.
+
+.LP
+The functions vnd_frameio_read and vnd_frameio_write operate in both
+blocking and non-blocking mode. If either O_NONBLOCK or O_NDELAY have
+been set on the file descriptor, then the I/O will behave in
+non-blocking mode. When in non-blocking mode, if no data is available
+when vnd_frameio_read is called, EAGAIN is returned. When
+vnd_frameio_write is called in non-blocking mode, if sufficient buffer
+space to hold all of the output frames is not available, then
+vnd_frameio_write will return EAGAIN. To know when the given vnd device
+has sufficient space, the device fires POLLIN/POLLRDNORM when data is
+available for read and POLLOUT/POLLWRNORM when space in the buffer has
+opened up for write. These events can be watched for through
+port_associate(3C) and similar routines with a file descriptor returned
+from vnd_pollfd(3VND).
+
+.LP
+When non-blocking mode is disabled, calls to vnd_frameio_read will
+block until some amount of data is available. Calls to
+vnd_frameio_write will block until sufficient buffer space is
+available.
+
+.LP
+Similar to read(2) and write(2), vnd_frameio_read and
+vnd_frameio_write make no guarantees about the ordering of data when
+multiple threads simultaneously call the interface. While the data
+itself will be atomic, the ordering of multiple simultaneous calls is
+not defined.
+
+.SH RETURN VALUES
+
+.LP
+The vnd_frameio_read function returns zero on success. The member
+fio_nvecs of fiop is updated with the total number of vectors that had
+data read into them. Each updated framevec_t will have the buffer
+pointed to by fv_buf filled in with data, and fv_actlen will be
+updated with the amount of valid data in fv_buf.
+
+.LP
+The vnd_frameio_write function returns zero on success. The member
+fio_nvecs of fiop is updated with the total number of vectors that
+were written out to the underlying datalink. The fv_actlen of each
+vector is updated to indicate the amount of data that was written from
+that buffer.
+
+.LP
+On failure, both vnd_frameio_read and vnd_frameio_write return -1. The
+vnd and system error numbers are updated and available via
+vnd_errno(3VND) and vnd_syserrno(3VND). See ERRORS below for a list of
+errors and their meaning.
+
+
+.SH ERRORS
+.LP
+The functions vnd_frameio_read and vnd_frameio_write always set the
+vnd error to VND_E_SYS. The following system errors will be
+encountered:
+
+.sp
+.ne 2
+.na
+EAGAIN
+.ad
+.RS 10n
+Insufficient system memory was available for the operation.
+.sp
+Non-blocking mode was enabled and during the call to vnd_frameio_read,
+no data was available. Non-blocking mode was enabled and during the call
+to vnd_frameio_write, insufficient buffer space was available.
+.RE
+
+.sp
+.ne 2
+.na
+ENXIO
+.ad
+.RS 10n
+The vnd device referred to by vhp is not currently attached to an
+underlying data link and cannot send data.
+.RE
+
+.sp
+.ne 2
+.na
+EFAULT
+.ad
+.RS 10n
+The fiop argument points to an illegal address or the fv_buf members of
+the framevec_t's associated with the fiop member fio_vecs point to
+illegal addresses.
+.RE
+
+.sp
+.ne 2
+.na
+EINVAL
+.ad
+.RS 10n
+The fio_version member of fiop was unknown, the number of vectors
+specified by fio_nvecs is zero or greater than FRAMEIO_NVECS_MAX,
+fio_nvpf equals zero, fio_nvecs is not evenly divisible by fio_nvpf, or
+a buffer in fio_vecs[] has set fv_buf or fv_buflen to zero.
+.RE
+
+
+.sp
+.ne 2
+.na
+EINTR
+.ad
+.RS 10n
+A signal was caught during vnd_frameio_read or vnd_frameio_write, and no
+data was transferred.
+.RE
+
+
+.sp
+.ne 2
+.na
+EOVERFLOW
+.ad
+.RS 10n
+During vnd_frameio_read, the size of a frame specified by fiop->fio_nvpf
+and fiop->fio_vecs[].fv_buflen cannot contain a frame.
+.sp
+In an ILP32 environment, more data than UINT_MAX would be set in
+fv_actlen.
+.RE
+
+
+.sp
+.ne 2
+.na
+ERANGE
+.ad
+.RS 10n
+During vnd_frameio_write, the size of a frame is less than the device's
+minimum transmission unit or it is larger than the size of the maximum
+transmission unit.
+.RE
+
+
+.SH EXAMPLES
+
+.LP
+Example 1 Read a single frame with a single vector
+
+.sp
+.LP
+The following sample C program opens an existing vnd device named
+"vnd0" in the current zone and performs a blocking read of a single
+frame from it.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr, i;
+ frameio_t *fiop;
+
+ fiop = malloc(sizeof (frameio_t) + sizeof (framevec_t));
+ if (fiop == NULL) {
+ perror("malloc frameio_t");
+ return (1);
+ }
+ fiop->fio_version = FRAMEIO_CURRENT_VERSION;
+ fiop->fio_nvpf = 1;
+ fiop->fio_nvecs = 1;
+ fiop->fio_vecs[0].fv_buf = malloc(1518);
+ fiop->fio_vecs[0].fv_buflen = 1518;
+ if (fiop->fio_vecs[0].fv_buf == NULL) {
+ perror("malloc framevec_t.fv_buf");
+ free(fiop);
+ return (1);
+ }
+
+ vhp = vnd_open(NULL, "vnd0", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (1);
+ }
+
+ if (vnd_frameio_read(vhp, fiop) != 0) {
+ vnd_errno_t vnderr = vnd_errno(vhp);
+ int syserr = vnd_syserrno(vhp);
+
+ /* Most consumers should retry on EINTR */
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to read: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to read: %s",
+ vnd_strerror(vnderr));
+ vnd_close(vhp);
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (1);
+ }
+
+
+ /* Consume the data however it's desired */
+ (void) printf("received %d bytes\n", fiop->fio_vecs[0].fv_actlen);
+ for (i = 0; i < fiop->fio_vecs[0].fv_actlen; i++)
+ (void) printf("%x ", ((unsigned char *)fiop->fio_vecs[0].fv_buf)[i]);
+
+ vnd_close(vhp);
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 2 Write a single frame with a single vector
+.sp
+.LP
+The following sample C program opens an existing vnd device named
+"vnd0" in the current zone and performs a blocking write of a single
+frame to it.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+#include <string.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr;
+ frameio_t *fiop;
+
+ fiop = malloc(sizeof (frameio_t) + sizeof (framevec_t));
+ if (fiop == NULL) {
+ perror("malloc frameio_t");
+ return (1);
+ }
+ fiop->fio_version = FRAMEIO_CURRENT_VERSION;
+ fiop->fio_nvpf = 1;
+ fiop->fio_nvecs = 1;
+ fiop->fio_vecs[0].fv_buf = malloc(1518);
+ if (fiop->fio_vecs[0].fv_buf == NULL) {
+ perror("malloc framevec_t.fv_buf");
+ free(fiop);
+ return (1);
+ }
+
+ /*
+ * Fill in your data however you desire. This is an entirely
+ * invalid frame and while the frameio write may succeed, the
+ * networking stack will almost certainly drop it.
+ */
+ (void) memset(fiop->fio_vecs[0].fv_buf, 'r', 1518);
+ fiop->fio_vecs[0].fv_buflen = 1518;
+
+ vhp = vnd_open(NULL, "vnd0", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (1);
+ }
+
+ if (vnd_frameio_write(vhp, fiop) != 0) {
+ /* Most consumers should retry on EINTR */
+ if ((vnderr = vnd_errno(vhp)) == VND_E_SYS)
+ (void) fprintf(stderr, "failed to write: %s",
+ vnd_strsyserror(vnd_syserrno(vhp)));
+ else
+ (void) fprintf(stderr, "failed to write: %s",
+ vnd_strerror(vnderr));
+ vnd_close(vhp);
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (1);
+ }
+
+
+ (void) printf("wrote %d bytes\n", fiop->fio_vecs[0].fv_actlen);
+
+ vnd_close(vhp);
+ free(fiop->fio_vecs[0].fv_buf);
+ free(fiop);
+ return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 3 Read frames comprised of multiple vectors
+.sp
+.LP
+The following sample C program is similar to example 1, except instead
+of reading a single frame consisting of a single vector it reads
+multiple frames consisting of two vectors. The first vector has room for
+an 18 byte VLAN enabled Ethernet header and the second vector has room
+for a 1500 byte payload.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr, i, nframes;
+ frameio_t *fiop;
+
+ /* Allocate enough framevec_t's for 5 frames */
+ fiop = malloc(sizeof (frameio_t) + sizeof (framevec_t) * 10);
+ if (fiop == NULL) {
+ perror("malloc frameio_t");
+ return (1);
+ }
+ fiop->fio_version = FRAMEIO_CURRENT_VERSION;
+ fiop->fio_nvpf = 2;
+ fiop->fio_nvecs = 10;
+ for (i = 0; i < 10; i += 2) {
+ fiop->fio_vecs[i].fv_buf = malloc(18);
+ fiop->fio_vecs[i].fv_buflen = 18;
+ if (fiop->fio_vecs[i].fv_buf == NULL) {
+ perror("malloc framevec_t.fv_buf");
+ /* Perform appropriate memory cleanup */
+ return (1);
+ }
+ fiop->fio_vecs[i+1].fv_buf = malloc(1500);
+ fiop->fio_vecs[i+1].fv_buflen = 1500;
+ if (fiop->fio_vecs[i+1].fv_buf == NULL) {
+ perror("malloc framevec_t.fv_buf");
+ /* Perform appropriate memory cleanup */
+ return (1);
+ }
+ }
+
+ vhp = vnd_open(NULL, "vnd1", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ /* Perform appropriate memory cleanup */
+ return (1);
+ }
+
+ if (vnd_frameio_read(vhp, fiop) != 0) {
+ /* Most consumers should retry on EINTR */
+ if ((vnderr = vnd_errno(vhp)) == VND_E_SYS)
+ (void) fprintf(stderr, "failed to read: %s",
+ vnd_strsyserror(vnd_syserrno(vhp)));
+ else
+ (void) fprintf(stderr, "failed to read: %s",
+ vnd_strerror(vnderr));
+ vnd_close(vhp);
+ /* Perform appropriate memory cleanup */
+ return (1);
+ }
+
+ /* Consume the data however it's desired */
+ nframes = fiop->fio_nvecs / fiop->fio_nvpf;
+ (void) printf("consumed %d frames!\n", nframes);
+ for (i = 0; i < nframes; i++) {
+ (void) printf("received %d bytes of Ethernet Header\n",
+ fiop->fio_vecs[i * 2].fv_actlen);
+ (void) printf("received %d bytes of payload\n",
+ fiop->fio_vecs[i * 2 + 1].fv_actlen);
+ }
+
+ vnd_close(vhp);
+ /* Do proper memory cleanup */
+ return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 4 Perform non-blocking reads of multiple frames with a
+single vector
+.sp
+.LP
+This sample C program opens an existing vnd device named "vnd0" in
+the current zone, ensures that it is in non-blocking mode, and uses
+event ports to do device reads.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+#include <port.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+int
+main(void)
+{
+ vnd_handle_t *vhp;
+ vnd_errno_t vnderr;
+ int syserr, i, nframes, port, vfd;
+ frameio_t *fiop;
+
+ port = port_create();
+ if (port < 0) {
+ perror("port_create");
+ return (1);
+ }
+ /* Allocate enough framevec_t's for 10 frames */
+ fiop = malloc(sizeof (frameio_t) + sizeof (framevec_t) * 10);
+ if (fiop == NULL) {
+ perror("malloc frameio_t");
+ (void) close(port);
+ return (1);
+ }
+ fiop->fio_version = FRAMEIO_CURRENT_VERSION;
+ fiop->fio_nvpf = 1;
+ fiop->fio_nvecs = 10;
+ for (i = 0; i < 10; i++) {
+ fiop->fio_vecs[i].fv_buf = malloc(1518);
+ fiop->fio_vecs[i].fv_buflen = 1518;
+ if (fiop->fio_vecs[i].fv_buf == NULL) {
+ perror("malloc framevec_t.fv_buf");
+ /* Perform appropriate memory cleanup */
+ (void) close(port);
+ return (1);
+ }
+ }
+
+ vhp = vnd_open(NULL, "vnd0", &vnderr, &syserr);
+ if (vhp == NULL) {
+ if (vnderr == VND_E_SYS)
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strsyserror(syserr));
+ else
+ (void) fprintf(stderr, "failed to open device: %s",
+ vnd_strerror(vnderr));
+ /* Perform appropriate memory cleanup */
+ (void) close(port);
+ return (1);
+ }
+ vfd = vnd_pollfd(vhp);
+ if (fcntl(vfd, F_SETFL, O_NONBLOCK) != 0) {
+ (void) fprintf(stderr, "failed to enable non-blocking mode: %s",
+ strerror(errno));
+ }
+
+ for (;;) {
+ port_event_t pe;
+
+ if (port_associate(port, PORT_SOURCE_FD, vfd, POLLIN,
+ vhp) != 0) {
+ perror("port_associate");
+ vnd_close(vhp);
+ /* Perform appropriate memory cleanup */
+ (void) close(port);
+ return (1);
+ }
+
+ if (port_get(port, &pe, NULL) != 0) {
+ if (errno == EINTR)
+ continue;
+ perror("port_get");
+ vnd_close(vhp);
+ /* Perform appropriate memory cleanup */
+ (void) close(port);
+ return (1);
+ }
+
+ /*
+ * Most real applications will need to compare the file
+ * descriptor and switch on it. In this case, assume
+ * that the fd in question that is readable is 'vfd'.
+ */
+ if (vnd_frameio_read(pe.portev_user, fiop) != 0) {
+ vnd_errno_t vnderr = vnd_errno(vhp);
+ int syserr = vnd_syserrno(vhp);
+
+ if (vnderr == VND_E_SYS && (syserr == EINTR ||
+ syserr == EAGAIN))
+ continue;
+ (void) fprintf(stderr, "failed to get read: %s",
+ vnderr == VND_E_SYS ? vnd_strsyserror(syserr) : vnd_strerror(vnderr));
+ vnd_close(vhp);
+ /* Perform appropriate memory cleanup */
+ (void) close(port);
+ return (1);
+ }
+
+ /* Consume the data however it's desired */
+ nframes = fiop->fio_nvecs / fiop->fio_nvpf;
+ for (i = 0; i < nframes; i++) {
+ (void) printf("frame %d is %d bytes large\n", i,
+ fiop->fio_vecs[i].fv_actlen);
+ }
+
+ }
+
+ vnd_close(vhp);
+ /* Do proper memory cleanup */
+ return (0);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING" in libvnd(3LIB)
+.TE
+
+
+.SH SEE ALSO
+
+Intro(2), getmsg(2), read(2), readv(2), write(2), writev(2),
+libvnd(3LIB), vnd_errno(3VND), vnd_pollfd(3VND), vnd_syserrno(3VND),
+iovec(9S)
diff --git a/usr/src/man/man3vnd/vnd_pollfd.3vnd b/usr/src/man/man3vnd/vnd_pollfd.3vnd
new file mode 100644
index 0000000000..500d3bac99
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_pollfd.3vnd
@@ -0,0 +1,155 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_POLLFD 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_pollfd \- get file descriptor for polling
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+int vnd_pollfd(vnd_handle_t *vhp);
+.fi
+
+.SH DESCRIPTION
+.LP
+The vnd_pollfd() function returns an integer id which corresponds to
+the file descriptor that represents the underlying device that is
+associated with the vnd handle vhp. This file descriptor is suitable
+for use with port_associate(3C) and similar polling techniques such as
+poll(2). Use of the file descriptor outside of these uses may cause
+undocumented behavior from the rest of the library.
+
+.LP
+The file descriptor in question is still managed by libvnd. The caller
+must not call close(2) on it. Once vnd_close(3VND) has been called,
+any further use of the file descriptor is undefined behavior.
+
+
+.SH RETURN VALUES
+.LP
+The function returns the integer id of the file descriptor that
+corresponds to the underlying vnd device.
+
+.SH EXAMPLES
+
+.LP
+Example 1 Use event ports for vnd notifications
+.sp
+.LP
+The following sample C program shows how to use the vnd_pollfd
+function with event ports to be notified whenever there is data
+available to be read. This program assumes that a vnd device named
+"vnd0" exists in the current zone. For an example of creating the
+device, see Example 1 in vnd_create(3VND).
+
+.sp
+.in +2
+.nf
+#include <errno.h>
+#include <fcntl.h>
+#include <libvnd.h>
+#include <poll.h>
+#include <port.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/*
+ * Open the vnd device "vnd0" and wait on an event port until its file
+ * descriptor becomes readable.
+ */
+int
+main(void)
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int port, syserr, vfd;
+
+	port = port_create();
+	if (port < 0) {
+		perror("port_create");
+		return (1);
+	}
+
+	vhp = vnd_open(NULL, "vnd0", &vnderr, &syserr);
+	if (vhp == NULL) {
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strerror(vnderr));
+		(void) close(port);
+		return (1);
+	}
+
+	/* The descriptor is still managed by libvnd; never close(2) it. */
+	vfd = vnd_pollfd(vhp);
+	if (fcntl(vfd, F_SETFL, O_NONBLOCK) != 0) {
+		perror("fcntl");
+		vnd_close(vhp);
+		(void) close(port);
+		return (1);
+	}
+
+	for (;;) {
+		port_event_t pe;
+
+		/*
+		 * A file descriptor association is removed from the port
+		 * once an event for it has been retrieved, so vfd must be
+		 * associated again before every call to port_get(3C).
+		 */
+		if (port_associate(port, PORT_SOURCE_FD, vfd, POLLIN,
+		    NULL) != 0) {
+			perror("port_associate");
+			vnd_close(vhp);
+			(void) close(port);
+			return (1);
+		}
+
+		if (port_get(port, &pe, NULL) != 0) {
+			if (errno == EINTR)
+				continue;
+			perror("port_get");
+			vnd_close(vhp);
+			(void) close(port);
+			return (1);
+		}
+
+		/*
+		 * Read the data with vnd_frameio_read(3VND) and
+		 * optionally break out of the loop or continue to the
+		 * next iteration to wait for more data.
+		 */
+	}
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING" in libvnd(3LIB)
+.TE
+
+.SH SEE ALSO
+
+close(2), poll(2), port_create(3C), libvnd(3LIB), vnd_close(3VND)
diff --git a/usr/src/man/man3vnd/vnd_prop_get.3vnd b/usr/src/man/man3vnd/vnd_prop_get.3vnd
new file mode 100644
index 0000000000..4170a4aca5
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_prop_get.3vnd
@@ -0,0 +1,243 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_PROP_GET 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_prop_get, vnd_prop_set \- get and set vnd properties
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+int vnd_prop_get(vnd_handle_t *vhp, vnd_prop_t prop, void *buf, size_t len);
+
+int vnd_prop_set(vnd_handle_t *vhp, vnd_prop_t prop, void *buf, size_t len);
+.fi
+
+.SH DESCRIPTION
+.LP
+The vnd_prop_get and vnd_prop_set functions are used to retrieve
+and set property values on the vnd_handle_t referred to by vhp. The
+property to get or set is specified by the argument prop. The
+argument buf and the size of buf, in len, should be a pointer to the
+appropriate structure for the property as defined in libvnd(3LIB).
+
+.LP
+All of the supported properties are listed and described in the
+libvnd(3LIB) manual page.
+
+
+.SH RETURN VALUES
+.LP
+On success, the vnd_prop_get and vnd_prop_set functions return zero.
+On failure, they return -1 and additional error information is
+available through vnd_errno(3VND) and vnd_syserrno(3VND).
+
+.LP
+When vnd_prop_get returns successfully, the contents of buf are
+filled in with the value of the corresponding property. The contents
+of buf should not change across a call to vnd_prop_set.
+
+.SH EXAMPLES
+
+.LP
+Example 1 Getting the value of the rxbuf property
+.LP
+The following sample C program retrieves the value of the
+rxbuf property and prints it to standard out.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+/*
+ * Open the vnd device "vnd1", fetch its VND_PROP_RXBUF property and print
+ * the value to standard out.
+ */
+int
+main(void)
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+	vnd_prop_buf_t vpb;
+
+	/* vnd_open() reports failure by returning NULL. */
+	vhp = vnd_open(NULL, "vnd1", &vnderr, &syserr);
+	if (vhp == NULL) {
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strerror(vnderr));
+		return (1);
+	}
+
+	if (vnd_prop_get(vhp, VND_PROP_RXBUF, &vpb, sizeof (vpb)) != 0) {
+		/* Collect the error state before the handle is closed. */
+		vnderr = vnd_errno(vhp);
+		syserr = vnd_syserrno(vhp);
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr,
+			    "failed to get VND_PROP_RXBUF: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr,
+			    "failed to get VND_PROP_RXBUF: %s\n",
+			    vnd_strerror(vnderr));
+		vnd_close(vhp);
+		return (1);
+	}
+
+	(void) printf("receive buffer size is %d bytes\n", vpb.vpb_size);
+
+	vnd_close(vhp);
+	return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 2 Setting a property
+.LP
+This sample C program sets the property VND_PROP_RXBUF to the value of
+4200 bytes.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+/*
+ * Open the vnd device "vnd1" and set its VND_PROP_RXBUF property to
+ * 4200 bytes.
+ */
+int
+main(void)
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+	vnd_prop_buf_t vpb;
+
+	/* vnd_open() reports failure by returning NULL. */
+	vhp = vnd_open(NULL, "vnd1", &vnderr, &syserr);
+	if (vhp == NULL) {
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strerror(vnderr));
+		return (1);
+	}
+
+	vpb.vpb_size = 4200;
+	if (vnd_prop_set(vhp, VND_PROP_RXBUF, &vpb, sizeof (vpb)) != 0) {
+		/* Collect the error state before the handle is closed. */
+		vnderr = vnd_errno(vhp);
+		syserr = vnd_syserrno(vhp);
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr,
+			    "failed to set VND_PROP_RXBUF: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr,
+			    "failed to set VND_PROP_RXBUF: %s\n",
+			    vnd_strerror(vnderr));
+		vnd_close(vhp);
+		return (1);
+	}
+
+	(void) printf("successfully set VND_PROP_RXBUF to 4200\n");
+
+	vnd_close(vhp);
+	return (0);
+}
+.fi
+.in -2
+
+.LP
+Example 3 Setting a property to the value of another.
+.LP
+In this sample C program, we set the VND_PROP_TXBUF to the maximum
+allowable size as determined by the read-only property VND_PROP_MAXBUF.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+/*
+ * Open the vnd device "vnd1", read the read-only VND_PROP_MAXBUF property
+ * and use that value as the new VND_PROP_TXBUF.
+ */
+int
+main(void)
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+	vnd_prop_buf_t vpb;
+
+	/* vnd_open() reports failure by returning NULL. */
+	vhp = vnd_open(NULL, "vnd1", &vnderr, &syserr);
+	if (vhp == NULL) {
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strerror(vnderr));
+		return (1);
+	}
+
+	if (vnd_prop_get(vhp, VND_PROP_MAXBUF, &vpb, sizeof (vpb)) != 0) {
+		/* Collect the error state before the handle is closed. */
+		vnderr = vnd_errno(vhp);
+		syserr = vnd_syserrno(vhp);
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr,
+			    "failed to get VND_PROP_MAXBUF: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr,
+			    "failed to get VND_PROP_MAXBUF: %s\n",
+			    vnd_strerror(vnderr));
+		vnd_close(vhp);
+		return (1);
+	}
+
+	/* vpb still holds the maximum size; hand it straight back. */
+	if (vnd_prop_set(vhp, VND_PROP_TXBUF, &vpb, sizeof (vpb)) != 0) {
+		vnderr = vnd_errno(vhp);
+		syserr = vnd_syserrno(vhp);
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr,
+			    "failed to set VND_PROP_TXBUF: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr,
+			    "failed to set VND_PROP_TXBUF: %s\n",
+			    vnd_strerror(vnderr));
+		vnd_close(vhp);
+		return (1);
+	}
+
+	(void) printf("successfully set VND_PROP_TXBUF to %d\n", vpb.vpb_size);
+
+	vnd_close(vhp);
+	return (0);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING" in libvnd(3LIB)
+.TE
+
+.SH SEE ALSO
+libvnd(3LIB), vnd_errno(3VND), vnd_syserrno(3VND)
diff --git a/usr/src/man/man3vnd/vnd_prop_iter.3vnd b/usr/src/man/man3vnd/vnd_prop_iter.3vnd
new file mode 100644
index 0000000000..18485950cf
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_prop_iter.3vnd
@@ -0,0 +1,148 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_PROP_ITER 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_prop_iter \- iterate vnd properties
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+typedef int (vnd_prop_iter_f)(vnd_handle_t *vhp, vnd_prop_t prop,
+ void *cbarg);
+
+int vnd_prop_iter(vnd_handle_t *vhp, vnd_prop_iter_f cb,
+ void *arg);
+.fi
+
+.SH DESCRIPTION
+.LP
+The vnd_prop_iter function iterates over all the available properties
+for the vnd handle vhp and calls the user supplied callback function
+cb. The argument arg is passed directly to the callback function.
+
+.LP
+The function specified by cb receives three arguments. The first, vhp,
+is the same vnd library handle that was passed to vnd_prop_iter. During
+the callback, the consumer should not call vnd_close(3VND). Doing so
+will lead to undefined and undocumented behavior. The second argument,
+prop, is the current property. While vnd_prop_iter guarantees that all
+properties will be received, it does not guarantee the order of them.
+The final argument, cbarg, is the same argument that the caller passed
+in as arg.
+
+.LP
+The return value of the callback function cb indicates whether or not
+property iteration should continue. To continue iteration, the
+function cb should return zero. Otherwise, to stop property iteration
+it should return non-zero.
+
+.SH RETURN VALUES
+
+.LP
+On success, the function vnd_prop_iter returns zero. If the callback
+function returned non-zero to terminate iteration, vnd_prop_iter will
+instead return one. In the case of library failure, vnd_prop_iter will
+return -1. In such cases, the vnd and system errors will be updated
+and available via vnd_errno(3VND) and vnd_syserrno(3VND).
+
+.SH EXAMPLES
+
+.LP
+Example 1 Print writeable properties
+
+.LP
+The following sample C program walks over every vnd property and
+prints out whether the property is read-only or read-write for the
+vnd device "vnd1" in the current zone.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Callback for vnd_prop_iter(): print whether prop is read-write or
+ * read-only. Always returns zero so that iteration continues.
+ */
+static int
+print_prop(vnd_handle_t *vhp, vnd_prop_t prop, void *unused)
+{
+	boolean_t canwrite;
+
+	/* vnd_prop_writeable() takes the property, not the handle. */
+	if (vnd_prop_writeable(prop, &canwrite) != 0)
+		abort();
+
+	(void) printf("prop %d is %s\n", prop,
+	    canwrite == B_TRUE ? "rw" : "r-");
+	return (0);
+}
+
+/*
+ * Open the vnd device "vnd1" and report the writeability of each of its
+ * properties.
+ */
+int
+main(void)
+{
+	vnd_handle_t *vhp;
+	vnd_errno_t vnderr;
+	int syserr;
+
+	/* vnd_open() reports failure by returning NULL. */
+	vhp = vnd_open(NULL, "vnd1", &vnderr, &syserr);
+	if (vhp == NULL) {
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr, "failed to open device: %s\n",
+			    vnd_strerror(vnderr));
+		return (1);
+	}
+
+	/*
+	 * print_prop never stops the iteration early, so any non-zero
+	 * return here is a library failure.
+	 */
+	if (vnd_prop_iter(vhp, print_prop, NULL) != 0) {
+		vnderr = vnd_errno(vhp);
+		syserr = vnd_syserrno(vhp);
+		if (vnderr == VND_E_SYS)
+			(void) fprintf(stderr,
+			    "failed to iterate properties: %s\n",
+			    vnd_strsyserror(syserr));
+		else
+			(void) fprintf(stderr,
+			    "failed to iterate properties: %s\n",
+			    vnd_strerror(vnderr));
+		vnd_close(vhp);
+		return (1);
+	}
+
+	vnd_close(vhp);
+	return (0);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level See "THREADING" in libvnd(3LIB)
+.TE
+
+.SH SEE ALSO
+
+libvnd(3LIB), vnd_close(3VND), vnd_errno(3VND), vnd_syserrno(3VND)
diff --git a/usr/src/man/man3vnd/vnd_prop_writeable.3vnd b/usr/src/man/man3vnd/vnd_prop_writeable.3vnd
new file mode 100644
index 0000000000..c23414718b
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_prop_writeable.3vnd
@@ -0,0 +1,101 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_PROP_WRITEABLE 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_prop_writeable \- determine if a vnd property can be updated
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+int vnd_prop_writeable(vnd_prop_t prop, boolean_t *wp);
+.fi
+
+
+.SH DESCRIPTION
+.LP
+The vnd_prop_writeable function is used as a programmatic means to
+determine whether a given vnd property is writeable or not. The
+property to check is specified in prop and should be from the list
+described in libvnd(3LIB). The argument wp is a pointer to a boolean_t
+which will be updated upon the successful completion of the function.
+The argument wp must be a valid pointer. If a property is writeable
+then the value pointed to by wp is set to B_TRUE. If the property is
+read-only, then the value is set to B_FALSE.
+
+
+.SH RETURN VALUES
+.LP
+On success, vnd_prop_writeable returns zero and the value pointed to
+by wp is updated with whether the property is writeable. If the
+property prop does not exist, then vnd_prop_writeable will return -1.
+
+.SH EXAMPLES
+.LP
+Example 1 Check whether the property VND_PROP_TXBUF is writable
+.LP
+The following sample C program checks whether the vnd property
+VND_PROP_TXBUF is writeable or not.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Report whether the vnd property VND_PROP_TXBUF is writeable.
+ */
+int
+main(void)
+{
+	boolean_t canwrite;
+
+	/* On success canwrite is updated with the answer. */
+	if (vnd_prop_writeable(VND_PROP_TXBUF, &canwrite) != 0)
+		abort();
+
+	if (canwrite == B_TRUE)
+		(void) printf("VND_PROP_TXBUF is writeable\n");
+	else
+		(void) printf("VND_PROP_TXBUF is read only\n");
+
+	return (0);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level MT-Safe
+.TE
+
+.SH SEE ALSO
+
+vndadm(1M), libvnd(3LIB)
diff --git a/usr/src/man/man3vnd/vnd_walk.3vnd b/usr/src/man/man3vnd/vnd_walk.3vnd
new file mode 100644
index 0000000000..0c7197f656
--- /dev/null
+++ b/usr/src/man/man3vnd/vnd_walk.3vnd
@@ -0,0 +1,155 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND_WALK 3VND "Feb 21, 2014"
+
+.SH NAME
+
+vnd_walk \- walk all vnd devices
+
+
+.SH SYNOPSIS
+
+.LP
+.nf
+cc [ flag... ] file... -lvnd [ library... ]
+#include <libvnd.h>
+
+typedef int (*vnd_walk_cb_t)(vnd_info_t *viip, void *cbarg);
+
+int vnd_walk(vnd_walk_cb_t cb, void *arg, vnd_errno_t *vnderr, int *syserr);
+.fi
+
+
+.SH DESCRIPTION
+.LP
+The vnd_walk() function fires the callback function cb once for every
+vnd device that is visible in the current zone. If the caller is in
+the global zone, then all vnd devices in all zones will be walked. If
+the caller is in a non-global zone, then only the devices in that zone
+will be visible.
+
+.LP
+The function cb will be called with two arguments. The first argument,
+viip, is a pointer to a structure that contains information about the
+link. The second argument to the function cb, cbarg, is the same
+argument that is passed to the function vnd_walk as arg. To continue
+the walk, the function cb should return zero. If the function cb
+returns non-zero, the walk will terminate.
+
+.LP
+As the vnd_walk function does not have a handle, errors are returned
+in vnderr and syserr. Both vnderr and syserr are allowed to be NULL
+pointers. If either one is a NULL pointer, then error information for
+that class of error will not be returned. It is not recommended that
+consumers supply NULL pointers.
+
+.LP
+The vnd_info_t structure contains the following members:
+
+.in +2
+.nf
+uint32_t vi_version;
+zoneid_t vi_zone;
+char vi_name[LIBVND_NAMELEN];
+char vi_datalink[LIBVND_NAMELEN];
+.fi
+.in -2
+
+.LP
+The member vi_version is guaranteed to be the first member of the
+structure. This number indicates the current revision of the structure
+and is set to the integer value 1. More properties may be added in
+future releases. Those properties will be tied to a greater version
+number so software knows whether or not it is legal to access them.
+
+.LP
+The vi_zone field indicates the zone id that the vnd device exists in.
+The vi_name field is the name of the vnd device. If the vnd_device is
+not linked, the name field is set to "<unknown>". The vi_datalink
+field is filled in with the name of the data link the vnd device is on
+top of.
+
+
+.SH RETURN VALUES
+
+.LP
+The vnd_walk function will return zero on success. If the consumer
+supplied callback function returned non-zero, then the vnd_walk
+function will return 1. If an error occurred, -1 is returned, and if
+vnderr and syserr are non-null, they are filled in with their
+respective error values. See vnd_errno(3VND) for more information on
+these errors.
+
+.SH EXAMPLES
+
+.LP
+Example 1 Walk all devices and print information about them
+
+.LP
+The following sample C program walks every vnd device and prints out
+information about them.
+
+.sp
+.in +2
+.nf
+#include <libvnd.h>
+#include <stdio.h>
+
+/*
+ * Callback for vnd_walk(): print one line describing the device and
+ * return zero so that the walk continues.
+ */
+static int
+print_entry(vnd_info_t *viip, void *unused)
+{
+	(void) printf("device %s over data link %s in zone %d\n",
+	    viip->vi_name, viip->vi_datalink, viip->vi_zone);
+	return (0);
+}
+
+/*
+ * Walk every visible vnd device and print information about it.
+ */
+int
+main(void)
+{
+	vnd_errno_t vnderr;
+	int syserr;
+	const char *reason;
+
+	if (vnd_walk(print_entry, NULL, &vnderr, &syserr) == 0)
+		return (0);
+
+	/* Pick the error string that matches the class of failure. */
+	if (vnderr != VND_E_SYS)
+		reason = vnd_strerror(vnderr);
+	else
+		reason = vnd_strsyserror(syserr);
+
+	(void) fprintf(stderr, "failed to walk vnd devices: %s\n", reason);
+	return (1);
+}
+.fi
+.in -2
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Stability Committed
+_
+MT-Level MT-Safe
+.TE
+
+.SH SEE ALSO
+
+libvnd(3LIB), vnd_errno(3VND), attributes(5), zones(5)
diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile
index 806d6521b6..b6258f7661 100644
--- a/usr/src/man/man7d/Makefile
+++ b/usr/src/man/man7d/Makefile
@@ -142,6 +142,7 @@ _MANFILES= aac.7d \
virtualkm.7d \
vni.7d \
vr.7d \
+ vnd.7d \
wscons.7d \
wusb_ca.7d \
wusb_df.7d \
diff --git a/usr/src/man/man7d/vnd.7d b/usr/src/man/man7d/vnd.7d
new file mode 100644
index 0000000000..4cc0d8f9bf
--- /dev/null
+++ b/usr/src/man/man7d/vnd.7d
@@ -0,0 +1,119 @@
+'\" te
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\"
+.TH VND 7D "Feb 11, 2014"
+.SH NAME
+vnd \- virtual layer two network driver
+
+.SH SYNOPSIS
+.nf
+.LP
+/dev/vnd/ctl
+.LP
+/dev/vnd/*
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The vnd driver provides support for a layer two datapath in a way
+analogous to how IP(7P) provides support for an IP-based layer
+three datapath. Both devices operate exclusively on datalinks. A
+datalink that has been plumbed up with IP via ifconfig(1M) or
+ipadm(1M) cannot be used with vnd or vice-versa.
+.sp
+.LP
+The vnd driver supports and takes advantage of the following
+illumos features:
+.RS
+.sp
+.LP
+Supports dld/dls feature negotiation of GLDv3 features, such
+as direct calls, flow control, checksum offloading, and more.
+.sp
+.LP
+All IP and IPv6 based traffic is sent through ipfilter(5),
+allowing packet filtering.
+.sp
+.LP
+Better control over vectored reads and writes in a frame-centric manner
+through framed I/O. See libvnd(3LIB) for more information on these
+interfaces.
+.RE
+.sp
+.LP
+The vnd driver exposes two different kinds of device nodes. The first is
+a self-cloning control node which can be used to create vnd devices on
+top of datalinks. Those devices can optionally be bound into the file
+system namespace under /dev/vnd. Control operations on the control node
+or named devices are private to the implementation. Instead,
+libvnd(3LIB) provides stable interfaces for using, creating, and
+manipulating vnd devices.
+.sp
+.LP
+.SH FILES
+.sp
+.ne 2
+.na
+/dev/vnd/ctl
+.ad
+.RS 16n
+vnd self-cloning control node
+.RE
+
+.sp
+.ne 2
+.na
+/dev/vnd/%link
+.ad
+.RS 16n
+Character device that corresponds to the vnd device of the given
+name (%link). A given device will appear for each actively linked device
+in the current zone.
+.RE
+
+.sp
+.ne 2
+.na
+/dev/vnd/zone/%zone/%link
+.ad
+.RS 16n
+These are character devices that correspond to the vnd device of
+the given name (%link). They are organized based on the zone that they
+appear in. Thus if a zone named foo has a vnd device named
+bar, then the global zone will have the file
+/dev/vnd/zone/foo/bar. Note, these only occur in the global zone.
+.RE
+
+.SH ATTRIBUTES
+.sp
+.LP
+See attributes(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+box;
+c | c
+l | l .
+ATTRIBUTE TYPE ATTRIBUTE VALUE
+_
+Interface Stability Evolving
+.TE
+
+.SH SEE ALSO
+.sp
+.LP
+dladm(1M), ipfilter(5), libvnd(3LIB), vndadm(1M),
+vndstat(1)
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 1a00392914..09550a587c 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -21,7 +21,7 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2013 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@@ -674,6 +674,10 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
VNIC_OBJS += vnic_ctl.o vnic_dev.o
+VND_OBJS += vnd.o frameio.o
+
+GSQUEUE_OBJS += gsqueue.o
+
SIMNET_OBJS += simnet.o
IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o
@@ -1129,8 +1133,7 @@ DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o
DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \
sdev_ptsops.o sdev_zvolops.o sdev_comm.o \
sdev_profile.o sdev_ncache.o sdev_netops.o \
- sdev_ipnetops.o \
- sdev_vtops.o
+ sdev_ipnetops.o sdev_vtops.o sdev_plugin.o
CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 08c4faefda..e4d94382b2 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -24,7 +24,7 @@
#
#
-# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
# Copyright 2013 Garrett D'Amore <garrett@damore.org>
#
@@ -1125,6 +1125,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1137,6 +1141,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c
index 157acc25fc..3d350ff278 100644
--- a/usr/src/uts/common/dtrace/sdt_subr.c
+++ b/usr/src/uts/common/dtrace/sdt_subr.c
@@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
};
+/*
+ * When adding a new provider you must add it before sdt as sdt is a catch all
+ * for remaining probes.
+ */
sdt_provider_t sdt_providers[] = {
{ "vtrace", "__vtrace_", &vtrace_attr },
{ "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER },
@@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = {
{ "fc", "__fc_", &fc_attr },
{ "srp", "__srp_", &fc_attr },
{ "sysevent", "__sysevent_", &stab_attr },
+ { "vnd", "__vnd_", &stab_attr },
{ "sdt", NULL, &sdt_attr },
{ NULL }
};
@@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = {
{ "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *",
"fc_port_info_t *" },
+ { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" },
+ { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" },
+ { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" },
+ { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" },
+ { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-in", 3, 3, "const char *", "const char *" },
+ { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-out", 3, 3, "const char *", "const char *" },
+ { "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" },
+ { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "send", 1, 1, "void *", "csinfo_t *" },
+ { "vnd", "send", 2, 2, "void *", "ipinfo_t *" },
+ { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" },
+ { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" },
+ { "vnd", "recv", 1, 1, "void *", "csinfo_t *" },
+ { "vnd", "recv", 2, 2, "void *", "ipinfo_t *" },
+ { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" },
+ { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" },
{ NULL }
};
diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c
index 4eaf38f484..3f637f4cf5 100644
--- a/usr/src/uts/common/fs/dev/sdev_netops.c
+++ b/usr/src/uts/common/fs/dev/sdev_netops.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -41,8 +42,102 @@
#include <sys/zone.h>
#include <sys/dls.h>
+static const char *devnet_zpath = "/dev/net/zone/";
struct vnodeops *devnet_vnodeops;
+/*
+ * Determine which zone id a /dev/net node refers to. Callers in a
+ * non-global zone always get their own zone id. In the global zone, nodes
+ * under /dev/net/zone/ are resolved to the zone named in their path, and
+ * anything else, including a name that is not a known zone, resolves to
+ * GLOBAL_ZONEID.
+ */
+static zoneid_t
+devnet_nodetozone(sdev_node_t *dv)
+{
+	char *zname, *dup;
+	zone_t *zone;
+	size_t duplen;
+	zoneid_t zid;
+
+	/*
+	 * If in a non-global zone, always return its zid no matter what the
+	 * node is.
+	 */
+	zid = getzoneid();
+	if (zid != GLOBAL_ZONEID)
+		return (zid);
+
+	/*
+	 * If it doesn't have /dev/net/zone/ then it can't be a specific zone
+	 * we're targeting.
+	 */
+	if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0)
+		return (GLOBAL_ZONEID);
+
+	if (dv->sdev_vnode->v_type == VDIR) {
+		zone = zone_find_by_name(dv->sdev_name);
+	} else {
+		/*
+		 * Non directories have the form /dev/net/zone/%z/%s. Work on
+		 * a copy of the path: cut off the trailing /%s and the last
+		 * remaining component is the zone name. Both strrchr() calls
+		 * find a '/' because the path was just checked to begin with
+		 * devnet_zpath.
+		 */
+		dup = strdup(dv->sdev_path);
+		/* Record the length now; the copy is truncated below. */
+		duplen = strlen(dup);
+		zname = strrchr(dup, '/');
+		*zname = '\0';
+		zname = strrchr(dup, '/');
+		zname++;
+		zone = zone_find_by_name(zname);
+		kmem_free(dup, duplen + 1);
+	}
+	if (zone == NULL)
+		return (GLOBAL_ZONEID);
+	/* Copy the id out before releasing the zone. */
+	zid = zone->zone_id;
+	zone_rele(zone);
+	return (zid);
+}
+
+/*
+ * Create the directory node 'name' under ddv. The caller must hold
+ * ddv->sdev_contents as writer. Returns EEXIST if a node of that name is
+ * already cached, otherwise the result of sdev_mknode().
+ */
+static int
+devnet_mkdir(struct sdev_node *ddv, char *name)
+{
+	struct vattr dirattr;
+	sdev_node_t *node;
+	int err;
+
+	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+	if ((node = sdev_cache_lookup(ddv, name)) != NULL) {
+		SDEV_SIMPLE_RELE(node);
+		return (EEXIST);
+	}
+
+	/* Default directory attributes with all three times set to now. */
+	dirattr = *sdev_getdefault_attr(VDIR);
+	gethrestime(&dirattr.va_atime);
+	dirattr.va_mtime = dirattr.va_atime;
+	dirattr.va_ctime = dirattr.va_atime;
+
+	err = sdev_mknode(ddv, name, &node, &dirattr, NULL, NULL, kcred,
+	    SDEV_READY);
+	if (err == 0)
+		SDEV_SIMPLE_RELE(node);
+	return (err);
+}
+
+/*
+ * We basically need to walk down the directory path to determine what we should
+ * do. At the top level of /dev/net, only the directory /dev/net/zone is valid,
+ * and it is always valid. Following on that, /dev/net/zone/%zonename is valid
+ * if and only if we can look up that zone name. If it's not, or it's some other
+ * name, then it's SDEV_VTOR_INVALID.
+ */
+static int
+devnet_dirvalidate(struct sdev_node *dv)
+{
+	zone_t *zone;
+
+	/* The /dev/net/zone directory itself is always valid. */
+	if (strcmp("/dev/net/zone", dv->sdev_path) == 0)
+		return (SDEV_VTOR_VALID);
+
+	/* Any other directory is valid only if it names an existing zone. */
+	if ((zone = zone_find_by_name(dv->sdev_name)) == NULL)
+		return (SDEV_VTOR_INVALID);
+
+	zone_rele(zone);
+	return (SDEV_VTOR_VALID);
+}
+
/*
* Check if a net sdev_node is still valid - i.e. it represents a current
* network link.
@@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv)
ASSERT(dv->sdev_state == SDEV_READY);
- if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0)
+ if (dv->sdev_vnode->v_type == VDIR)
+ return (devnet_dirvalidate(dv));
+
+ if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) {
+ ASSERT(SDEV_IS_GLOBAL(dv));
+ zoneid = devnet_nodetozone(dv);
+ } else {
+ zoneid = getzoneid();
+ }
+
+ if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0)
return (SDEV_VTOR_INVALID);
- if (SDEV_IS_GLOBAL(dv))
+ if (zoneid == GLOBAL_ZONEID)
return (SDEV_VTOR_VALID);
- zoneid = getzoneid();
return (zone_check_datalink(&zoneid, linkid) == 0 ?
SDEV_VTOR_VALID : SDEV_VTOR_INVALID);
}
@@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv)
* a net entry when the node is not found in the cache.
*/
static int
-devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp)
+devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp,
+ zoneid_t zid)
{
timestruc_t now;
dev_t dev;
int error;
- if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) {
+ if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) {
sdcmn_err12(("devnet_create_rvp: not a valid vanity name "
"network node: %s\n", nm));
return (error);
@@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct sdev_node *ddv = VTOSDEV(dvp);
struct sdev_node *dv = NULL;
dls_dl_handle_t ddh = NULL;
+ zone_t *zone;
struct vattr vattr;
int nmlen;
int error = ENOENT;
@@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
if (SDEVTOV(ddv)->v_type != VDIR)
return (ENOTDIR);
+ if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID)
+ return (EPERM);
+
/*
* Empty name or ., return node itself.
*/
@@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
rw_enter(&ddv->sdev_contents, RW_WRITER);
/*
+ * ZOMBIED parent does not allow new node creation, bail out early.
+ */
+ if (ddv->sdev_state == SDEV_ZOMBIE)
+ goto failed;
+
+ /*
* directory cache lookup:
*/
if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) {
@@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
goto found;
}
+ if (SDEV_IS_GLOBAL(ddv)) {
+ /*
+ * Check for /dev/net/zone
+ */
+ if (strcmp("zone", nm) == 0 && strcmp("/dev/net",
+ ddv->sdev_path) == 0) {
+ (void) devnet_mkdir(ddv, nm);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+
+ /*
+ * Check for /dev/net/zone/%z. We can't use devnet_zpath due to
+ * its trailing slash.
+ */
+ if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) {
+ zone = zone_find_by_name(nm);
+ if (zone == NULL)
+ goto failed;
+ (void) devnet_mkdir(ddv, nm);
+ zone_rele(zone);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+ } else if (strcmp("/dev/net", ddv->sdev_path) != 0) {
+ goto failed;
+ }
+
/*
- * ZOMBIED parent does not allow new node creation, bail out early.
+ * We didn't find what we were looking for. What that is depends a lot
+ * on what directory we're in.
*/
- if (ddv->sdev_state == SDEV_ZOMBIE)
- goto failed;
- error = devnet_create_rvp(nm, &vattr, &ddh);
+ error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv));
if (error != 0)
goto failed;
@@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg)
if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL)
goto found;
- if (devnet_create_rvp(link, &vattr, &ddh) != 0)
+ if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0)
return (0);
ASSERT(ddh != NULL);
@@ -244,16 +388,77 @@ found:
return (0);
}
+/*
+ * Fill in all the datalink entries for the zone identified by zid.
+ */
static void
-devnet_filldir(struct sdev_node *ddv)
+devnet_fillzone(struct sdev_node *ddv, zoneid_t zid)
{
- sdev_node_t *dv, *next;
datalink_id_t linkid;
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ if (zid == GLOBAL_ZONEID) {
+ ASSERT(SDEV_IS_GLOBAL(ddv));
+ linkid = DATALINK_INVALID_LINKID;
+ do {
+ linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
+ DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
+ if (linkid != DATALINK_INVALID_LINKID)
+ (void) devnet_filldir_datalink(linkid, ddv);
+ } while (linkid != DATALINK_INVALID_LINKID);
+ } else {
+ (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv);
+ }
+}
+
+/*
+ * zone_walk() callback used while populating /dev/net/zone: make sure that a
+ * directory named after the zone exists beneath the node passed in arg. Any
+ * error from devnet_mkdir() (including EEXIST) is ignored and the callback
+ * always returns 0.
+ */
+static int
+devnet_fillzdir_cb(zone_t *zonep, void *arg)
+{
+	sdev_node_t *zonedir = (sdev_node_t *)arg;
+
+	ASSERT(RW_WRITE_HELD(&zonedir->sdev_contents));
+	(void) devnet_mkdir(zonedir, zonep->zone_name);
+
+	return (0);
+}
+
+/*
+ * Fill in a directory that isn't the top level /dev/net. For /dev/net/zone
+ * that means one sub-directory per zone; for any other directory the node's
+ * name is looked up as a zone name and, if that zone exists, its datalinks are
+ * filled in.
+ */
+static void
+devnet_fillzdir(struct sdev_node *ddv)
+{
+	zone_t *zp;
+
+	if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) {
+		(void) zone_walk(devnet_fillzdir_cb, ddv);
+	} else if ((zp = zone_find_by_name(ddv->sdev_name)) != NULL) {
+		devnet_fillzone(ddv, zp->zone_id);
+		zone_rele(zp);
+	}
+}
+
+static void
+devnet_filldir(struct sdev_node *ddv)
+{
+ int ret;
+ sdev_node_t *dv, *next;
+
ASSERT(RW_READ_HELD(&ddv->sdev_contents));
if (rw_tryupgrade(&ddv->sdev_contents) == NULL) {
rw_exit(&ddv->sdev_contents);
rw_enter(&ddv->sdev_contents, RW_WRITER);
+ if (ddv->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&ddv->sdev_contents);
+ return;
+ }
}
for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
@@ -276,31 +481,36 @@ devnet_filldir(struct sdev_node *ddv)
if (SDEVTOV(dv)->v_count > 0)
continue;
+
SDEV_HOLD(dv);
+
+ /*
+ * Clean out everything underneath before we remove ourselves.
+ */
+ ret = sdev_cleandir(dv, NULL, 0);
+ ASSERT(ret == 0);
/* remove the cache node */
(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
SDEV_CACHE_DELETE);
SDEV_RELE(dv);
}
+ if (strcmp(ddv->sdev_path, "/dev/net") != 0) {
+ devnet_fillzdir(ddv);
+ goto done;
+ }
+
if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild())
goto done;
if (SDEV_IS_GLOBAL(ddv)) {
- linkid = DATALINK_INVALID_LINKID;
- do {
- linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
- DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
- if (linkid != DATALINK_INVALID_LINKID)
- (void) devnet_filldir_datalink(linkid, ddv);
- } while (linkid != DATALINK_INVALID_LINKID);
+ devnet_fillzone(ddv, GLOBAL_ZONEID);
+ (void) devnet_mkdir(ddv, "zone");
} else {
- (void) zone_datalink_walk(getzoneid(),
- devnet_filldir_datalink, ddv);
+ devnet_fillzone(ddv, getzoneid());
}
ddv->sdev_flags &= ~SDEV_BUILD;
-
done:
rw_downgrade(&ddv->sdev_contents);
}
@@ -319,6 +529,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
ASSERT(sdvp);
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
if (uiop->uio_offset == 0)
devnet_filldir(sdvp);
diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c
new file mode 100644
index 0000000000..885191175f
--- /dev/null
+++ b/usr/src/uts/common/fs/dev/sdev_plugin.c
@@ -0,0 +1,913 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Dynamic directory plugin interface for sdev.
+ *
+ * The sdev plugin interface provides a means for a dynamic directory based on
+ * in-kernel state to be simply created. Traditionally, dynamic directories were
+ * built into sdev itself. While these legacy plugins are useful, it makes more
+ * sense for these pieces of functionality to live with the individual drivers.
+ *
+ * The plugin interface requires folks to implement three interfaces and
+ * provides a series of callbacks that can be made in the context of those
+ * interfaces to interrogate the sdev_node_t without having to leak
+ * implementation details of the sdev_node_t. These interfaces are:
+ *
+ * o spo_validate
+ *
+ * Given a particular node, answer the question as to whether or not this
+ * entry is still valid. Here, plugins should use the name and the dev_t
+ * associated with the node to verify that it matches something that still
+ * exists.
+ *
+ * o spo_filldir
+ *
+ * Fill all the entries inside of a directory. Note that some of these entries
+ * may already exist.
+ *
+ * o spo_inactive
+ *
+ * The given node is no longer being used. This allows the consumer to
+ * potentially tear down anything that was being held open related to this.
+ * Note that this only fires when the given sdev_node_t becomes a zombie.
+ *
+ * During these callbacks a consumer is not allowed to register or unregister a
+ * plugin, especially their own. They may call the sdev_ctx style functions. All
+ * callbacks fire in a context where blocking is allowed (eg. the spl is below
+ * LOCK_LEVEL).
+ *
+ * When a plugin is added, we create its directory in the global zone. By doing
+ * that, we ensure that something isn't already there and that nothing else can
+ * come along and try and create something without our knowledge. We only have
+ * to create it in the GZ and not for all other instances of sdev for two
+ * reasons: first, an instance of sdev that isn't at /dev does not have dynamic
+ * directories; second, any instance of sdev present in a non-global zone cannot
+ * create anything. Therefore, if the directory is not in the global zone's
+ * instance of sdev, we know that we're good to go.
+ *
+ * Lock Ordering
+ * -------------
+ *
+ * The global sdev_plugin_lock must be held before any of the individual
+ * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
+ * it is not legal to take any holds on any sdev_node_t or to grab the
+ * sdev_node_t`contents_lock in any way.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fs/sdev_impl.h>
+#include <sys/fs/sdev_plugin.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+
+kmutex_t sdev_plugin_lock;
+list_t sdev_plugin_list;
+kmem_cache_t *sdev_plugin_cache;
+struct vnodeops *sdev_plugin_vnops;
+
+#define SDEV_PLUGIN_NAMELEN 64
+
+/*
+ * Per-plugin state. One of these exists for every registered plugin,
+ * including the legacy vtab entries, and all of them are linked onto
+ * sdev_plugin_list through sp_link. sp_nnodes counts the sdev nodes that are
+ * currently associated with a non-legacy plugin; sdev_plugin_unregister()
+ * waits on sp_nodecv until that count drops to zero.
+ *
+ * NOTE(review): the "E" annotations presumably mark members that are set once
+ * at registration time and never modified afterwards -- confirm.
+ */
+typedef struct sdev_plugin {
+ list_node_t sp_link;
+ char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */
+ int sp_nflags; /* E */
+ struct vnodeops *sp_vnops; /* E */
+ sdev_plugin_ops_t *sp_pops; /* E */
+ boolean_t sp_islegacy; /* E */
+ int (*sp_lvtor)(sdev_node_t *); /* E */
+ kmutex_t sp_lock; /* Protects everything below */
+ kcondvar_t sp_nodecv;
+ size_t sp_nnodes;
+} sdev_plugin_t;
+
+/*
+ * kmem cache constructor for sdev_plugin_t: initialize the per-plugin mutex
+ * and condition variable. Always succeeds.
+ */
+/* ARGSUSED */
+static int
+sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
+{
+	sdev_plugin_t *plugin = (sdev_plugin_t *)buf;
+
+	mutex_init(&plugin->sp_lock, NULL, MUTEX_DRIVER, 0);
+	cv_init(&plugin->sp_nodecv, NULL, CV_DRIVER, NULL);
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for sdev_plugin_t: tear down what the constructor set
+ * up, in the reverse order.
+ */
+/* ARGSUSED */
+static void
+sdev_plugin_cache_destructor(void *buf, void *arg)
+{
+	sdev_plugin_t *plugin = (sdev_plugin_t *)buf;
+
+	cv_destroy(&plugin->sp_nodecv);
+	mutex_destroy(&plugin->sp_lock);
+}
+
+/*
+ * Return the vnode type of the node behind ctx. The node's sdev_contents lock
+ * must be held.
+ */
+enum vtype
+sdev_ctx_vtype(sdev_ctx_t ctx)
+{
+	sdev_node_t *node = (sdev_node_t *)ctx;
+	enum vtype type;
+
+	ASSERT(RW_LOCK_HELD(&node->sdev_contents));
+	type = node->sdev_vnode->v_type;
+
+	return (type);
+}
+
+/*
+ * Return the sdev_path string of the node behind ctx. The node's
+ * sdev_contents lock must be held.
+ */
+const char *
+sdev_ctx_path(sdev_ctx_t ctx)
+{
+	sdev_node_t *node = (sdev_node_t *)ctx;
+	const char *fullpath;
+
+	ASSERT(RW_LOCK_HELD(&node->sdev_contents));
+	fullpath = node->sdev_path;
+
+	return (fullpath);
+}
+
+/*
+ * Return the sdev_name string of the node behind ctx. The node's
+ * sdev_contents lock must be held.
+ */
+const char *
+sdev_ctx_name(sdev_ctx_t ctx)
+{
+	sdev_node_t *node = (sdev_node_t *)ctx;
+	const char *leaf;
+
+	ASSERT(RW_LOCK_HELD(&node->sdev_contents));
+	leaf = node->sdev_name;
+
+	return (leaf);
+}
+
+/*
+ * Return the node flags that plugins are allowed to see. Currently we only
+ * support passing through a single flag -- SDEV_GLOBAL. The node's
+ * sdev_contents lock must be held.
+ */
+sdev_ctx_flags_t
+sdev_ctx_flags(sdev_ctx_t ctx)
+{
+	sdev_node_t *node = (sdev_node_t *)ctx;
+
+	ASSERT(RW_LOCK_HELD(&node->sdev_contents));
+
+	/* Mask off everything except the one flag we expose. */
+	return (node->sdev_flags & SDEV_GLOBAL);
+}
+
+/*
+ * Return private data that depends on the node's vnode type. For character
+ * and block devices this is the device number (v_rdev) smuggled through a
+ * pointer; for every other type it is NULL. The node's sdev_contents lock must
+ * be held.
+ */
+const void *
+sdev_ctx_vtype_data(sdev_ctx_t ctx)
+{
+	sdev_node_t *node = (sdev_node_t *)ctx;
+	vnode_t *vp;
+
+	ASSERT(RW_LOCK_HELD(&node->sdev_contents));
+	vp = node->sdev_vnode;
+
+	if (vp->v_type == VCHR || vp->v_type == VBLK)
+		return ((void *)(uintptr_t)(vp->v_rdev));
+
+	return (NULL);
+}
+
+/*
+ * Check whether `c' is an acceptable plugin or node name.
+ *
+ * We use the same character rules as zones do for a name: isalnum() plus
+ * '-', '_', and '.'. The name must be NUL terminated within the first buflen
+ * bytes. In addition the name may not be empty and may not be "." or "..",
+ * as none of those can ever name a new directory entry.
+ *
+ * Returns 1 if the name is valid and 0 otherwise.
+ */
+static int
+sdev_plugin_name_isvalid(const char *c, int buflen)
+{
+	int len;
+
+	for (len = 0; len < buflen; len++) {
+		if (c[len] == '\0')
+			break;
+
+		if (!isalnum(c[len]) && c[len] != '-' && c[len] != '_' &&
+		    c[len] != '.')
+			return (0);
+	}
+
+	/* Never found a null terminator */
+	if (len >= buflen)
+		return (0);
+
+	/* The empty string is not a name */
+	if (len == 0)
+		return (0);
+
+	/* "." and ".." are reserved directory entries */
+	if (c[0] == '.' && (len == 1 || (len == 2 && c[1] == '.')))
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Common back end for the plugin node creation callbacks: create `name' with
+ * the attributes in vap underneath sdvp. The caller must hold sdvp's
+ * sdev_contents lock as writer. Returns EEXIST if the name is already in the
+ * directory cache, the error from sdev_mknode() if creation fails, and 0 on
+ * success.
+ */
+static int
+sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
+    vattr_t *vap)
+{
+	sdev_node_t *node;
+	int err;
+
+	ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+	ASSERT(spp != NULL);
+
+	/* An existing entry only needs its lookup hold dropped again. */
+	if ((node = sdev_cache_lookup(sdvp, name)) != NULL) {
+		SDEV_SIMPLE_RELE(node);
+		return (EEXIST);
+	}
+
+	err = sdev_mknode(sdvp, name, &node, vap, NULL, NULL, kcred,
+	    SDEV_READY);
+	if (err == 0)
+		SDEV_SIMPLE_RELE(node);
+
+	return (err);
+}
+
+/*
+ * Plugin node creation callbacks
+ */
+
+/*
+ * Create a directory called `name' beneath the node behind ctx, using the
+ * default directory attributes with all timestamps set to the current time.
+ * Returns EINVAL if the name does not pass sdev_plugin_name_isvalid(),
+ * otherwise whatever sdev_plugin_mknode() returns.
+ */
+int
+sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
+{
+	sdev_node_t *parent = (sdev_node_t *)ctx;
+	struct vattr attrs;
+
+	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+		return (EINVAL);
+
+	ASSERT(parent->sdev_private != NULL);
+	ASSERT(RW_WRITE_HELD(&parent->sdev_contents));
+
+	attrs = *sdev_getdefault_attr(VDIR);
+	gethrestime(&attrs.va_atime);
+	attrs.va_mtime = attrs.va_atime;
+	attrs.va_ctime = attrs.va_atime;
+
+	return (sdev_plugin_mknode(parent->sdev_private, parent, name,
+	    &attrs));
+}
+
+/*
+ * Create a character or block device node called `name' beneath the node
+ * behind ctx. `mode' must be exactly S_IFCHR or S_IFBLK and `dev' is the
+ * device number the new node refers to; the node is created with permission
+ * bits 0666. Returns EINVAL for an invalid name or mode, otherwise whatever
+ * sdev_plugin_mknode() returns.
+ */
+int
+sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
+{
+	sdev_node_t *parent = (sdev_node_t *)ctx;
+	struct vattr attrs;
+	enum vtype type;
+
+	if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+		return (EINVAL);
+
+	ASSERT(RW_WRITE_HELD(&parent->sdev_contents));
+
+	switch (mode) {
+	case S_IFCHR:
+		type = VCHR;
+		break;
+	case S_IFBLK:
+		type = VBLK;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	ASSERT(parent->sdev_private != NULL);
+
+	attrs = *sdev_getdefault_attr(type);
+	gethrestime(&attrs.va_atime);
+	attrs.va_mtime = attrs.va_atime;
+	attrs.va_ctime = attrs.va_atime;
+	attrs.va_rdev = dev;
+	attrs.va_mode = mode | 0666;
+
+	/* Note: sdev_plugin_mknode(), not a recursive call to ourselves. */
+	return (sdev_plugin_mknode(parent->sdev_private, parent, name,
+	    &attrs));
+}
+
+/*
+ * Validator for nodes that belong to a non-legacy plugin: take the node's
+ * sdev_contents lock as reader and hand the question to the plugin's
+ * spo_validate entry point, returning its answer unchanged.
+ */
+static int
+sdev_plugin_validate(sdev_node_t *sdp)
+{
+	sdev_plugin_t *plugin = sdp->sdev_private;
+	int verdict;
+
+	ASSERT(plugin != NULL);
+	ASSERT(plugin->sp_islegacy == B_FALSE);
+	ASSERT(plugin->sp_pops != NULL);
+
+	rw_enter(&sdp->sdev_contents, RW_READER);
+	verdict = plugin->sp_pops->spo_validate((uintptr_t)sdp);
+	rw_exit(&sdp->sdev_contents);
+
+	return (verdict);
+}
+
+/*
+ * Walk every entry cached in the directory sdvp and ask the plugin whether it
+ * is still valid. Entries reported as invalid or stale are removed from the
+ * directory cache; directories are cleaned out first. The caller must hold
+ * sdvp's sdev_contents lock as writer.
+ */
+static void
+sdev_plugin_validate_dir(sdev_node_t *sdvp)
+{
+ int ret;
+ sdev_node_t *svp, *next;
+
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {
+
+ /* Fetch the successor now; svp may be removed from the cache below. */
+ next = SDEV_NEXT_ENTRY(sdvp, svp);
+ ASSERT(svp->sdev_state != SDEV_ZOMBIE);
+ /* skip nodes that aren't ready */
+ if (svp->sdev_state == SDEV_INIT)
+ continue;
+
+ /* Valid and skipped nodes are left alone; the rest fall through. */
+ switch (sdev_plugin_validate(svp)) {
+ case SDEV_VTOR_VALID:
+ case SDEV_VTOR_SKIP:
+ continue;
+ case SDEV_VTOR_INVALID:
+ case SDEV_VTOR_STALE:
+ break;
+ }
+
+ SDEV_HOLD(svp);
+
+ /*
+ * Clean out everything underneath this node before we
+ * remove it.
+ */
+ if (svp->sdev_vnode->v_type == VDIR) {
+ ret = sdev_cleandir(svp, NULL, 0);
+ ASSERT(ret == 0);
+ }
+ /* remove the cache node */
+ (void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
+ SDEV_CACHE_DELETE);
+ SDEV_RELE(svp);
+ }
+}
+
+/*
+ * VOP_READDIR for plugin directories. Entered with the directory's
+ * sdev_contents lock held as reader, and every return path below leaves it
+ * held as reader again. When reading from offset zero the directory is first
+ * revalidated and refilled through the plugin's spo_filldir entry point.
+ *
+ * Returns ENOENT if the directory has become a zombie, EPERM if a global zone
+ * credential is used on a node that is not a global one, the error from
+ * spo_filldir if that fails, and otherwise the result of
+ * devname_readdir_func().
+ */
+/* ARGSUSED */
+static int
+sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
+ int *eofp, caller_context_t *ct_unused, int flags_unused)
+{
+ int ret;
+ sdev_node_t *sdvp = VTOSDEV(dvp);
+ sdev_plugin_t *spp;
+
+ ASSERT(RW_READ_HELD(&sdvp->sdev_contents));
+
+ /* Sanity check we're not a zombie before we do anything else */
+ if (sdvp->sdev_state == SDEV_ZOMBIE)
+ return (ENOENT);
+
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ if (uiop->uio_offset == 0) {
+ /*
+ * We upgrade to a write lock and grab the plugin's lock along
+ * the way. We're almost certainly going to get creation
+ * callbacks, so this is the only safe way to go.
+ */
+ if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
+ /*
+ * The lock was dropped to get here, so the node may have
+ * become a zombie in the meantime; check again.
+ */
+ rw_exit(&sdvp->sdev_contents);
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_downgrade(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ }
+
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_downgrade(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+}
+
+/*
+ * Lookup callback handed to devname_lookup_func() by sdev_plugin_vop_lookup().
+ *
+ * If we did not supply a callback that fails, sdev would try to create a node
+ * for us, which violates all of our basic assertions. To work around that we
+ * supply this callback, which always returns ENOENT: by the time it is called,
+ * the node was either created by the filldir callback or it does not exist.
+ */
+/*ARGSUSED*/
+static int
+sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
+ void *unused, char *unused2)
+{
+ return (ENOENT);
+}
+
+/*
+ * VOP_LOOKUP for plugin directories. Before the name is looked up, the
+ * directory is revalidated and refilled through the plugin's spo_filldir
+ * entry point while holding sdev_contents as writer; the lock is dropped again
+ * before devname_lookup_func() is called.
+ *
+ * Returns the VOP_ACCESS() error if the caller lacks execute access, EPERM if
+ * a global zone credential is used on a node that is not a global one, ENOENT
+ * if the directory is a zombie, the error from spo_filldir if that fails, and
+ * otherwise the result of devname_lookup_func().
+ */
+/* ARGSUSED */
+static int
+sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+ struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+ caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+ int ret;
+ sdev_node_t *sdvp;
+ sdev_plugin_t *spp;
+
+ /* execute access is required to search the directory */
+ if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+ return (ret);
+
+ sdvp = VTOSDEV(dvp);
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ /*
+ * Go straight for the write lock.
+ */
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_exit(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+
+ /* The callback always fails, so only nodes made by filldir are found. */
+ return (devname_lookup_func(sdvp, nm, vpp, cred,
+ sdev_plugin_vop_lookup_cb, SDEV_VATTR));
+}
+
+/*
+ * sdev is not a good citizen. We get inactive callbacks whenever a vnode's
+ * count goes to zero, even though the node isn't necessarily a zombie yet. As
+ * such, to make things easier for users, we only fire the plugin's inactive
+ * callback when the node has become a zombie and thus will be torn down here.
+ *
+ * For a zombie node we call spo_inactive, drop the plugin's node count and
+ * signal sp_nodecv, which sdev_plugin_unregister() waits on.
+ */
+static void
+sdev_plugin_vop_inactive_cb(struct vnode *dvp)
+{
+ sdev_node_t *sdp = VTOSDEV(dvp);
+ sdev_plugin_t *spp = sdp->sdev_private;
+
+ rw_enter(&sdp->sdev_contents, RW_READER);
+ if (sdp->sdev_state != SDEV_ZOMBIE) {
+ rw_exit(&sdp->sdev_contents);
+ return;
+ }
+ spp->sp_pops->spo_inactive((uintptr_t)sdp);
+ mutex_enter(&spp->sp_lock);
+ VERIFY(spp->sp_nnodes > 0);
+ spp->sp_nnodes--;
+ cv_signal(&spp->sp_nodecv);
+ mutex_exit(&spp->sp_lock);
+ rw_exit(&sdp->sdev_contents);
+}
+
+/*
+ * VOP_INACTIVE for plugin nodes: defer to devname_inactive_func(), passing
+ * our own callback so that the plugin gets told about zombie nodes.
+ */
+/*ARGSUSED*/
+static void
+sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
+    caller_context_t *ct)
+{
+	sdev_node_t *node = VTOSDEV(dvp);
+	sdev_plugin_t *plugin = node->sdev_private;
+
+	ASSERT(node->sdev_private != NULL);
+	ASSERT(plugin->sp_islegacy == B_FALSE);
+
+	devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
+}
+
+/*
+ * Vnode operation overrides for nodes that belong to a non-legacy plugin.
+ * sdev_plugin_init() merges this table over the standard sdev table. Readdir,
+ * lookup and inactive are routed through the plugin; the operations that would
+ * create or remove entries, and setsecattr, are set to fs_nosys.
+ */
+const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
+ VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir },
+ VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup },
+ VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive },
+ VOPNAME_CREATE, { .error = fs_nosys },
+ VOPNAME_REMOVE, { .error = fs_nosys },
+ VOPNAME_MKDIR, { .error = fs_nosys },
+ VOPNAME_RMDIR, { .error = fs_nosys },
+ VOPNAME_SYMLINK, { .error = fs_nosys },
+ VOPNAME_SETSECATTR, { .error = fs_nosys },
+ NULL, NULL
+};
+
+/*
+ * Build a new vnode operations template: a copy of the standard sdev table in
+ * which every operation that is named in `tab' is replaced by tab's version.
+ * Entries of `tab' whose name does not appear in the standard table are
+ * ignored. The result must be released with sdev_free_vtab().
+ */
+static fs_operation_def_t *
+sdev_merge_vtab(const fs_operation_def_t tab[])
+{
+	fs_operation_def_t *merged, *slot;
+	const fs_operation_def_t *override;
+
+	/* start out from a private copy of the standard vnode ops table */
+	merged = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
+	bcopy((void *)sdev_vnodeops_tbl, merged, sdev_vnodeops_tbl_size);
+
+	/* patch in each override at the slot that carries the same name */
+	for (override = tab; override->name != NULL; override++) {
+		for (slot = merged; slot->name != NULL; slot++) {
+			if (strcmp(override->name, slot->name) == 0) {
+				slot->func = override->func;
+				break;
+			}
+		}
+	}
+
+	return (merged);
+}
+
+/*
+ * Free a template allocated by sdev_merge_vtab(). The size passed to
+ * kmem_free() is the same sdev_vnodeops_tbl_size that was used to allocate it.
+ */
+static void
+sdev_free_vtab(fs_operation_def_t *new)
+{
+ kmem_free(new, sdev_vnodeops_tbl_size);
+}
+
+/*
+ * Register a new plugin.
+ *
+ * `name' is the name of the plugin and of the directory that is created for
+ * it directly under the global zone's /dev. `ops' supplies the plugin's entry
+ * points; its version must be 1, all three callbacks must be set and its
+ * flags must be within SDEV_PLUGIN_FLAGS_MASK. The ops pointer itself is
+ * stored in the plugin, not copied.
+ *
+ * Returns an opaque handle on success. On failure NULL is returned and, if
+ * errp is not NULL, *errp is set to EINVAL for bad arguments, to the
+ * vn_openat() error if /dev cannot be opened, or to EEXIST if the name is
+ * already present in /dev's directory cache or in the plugin list. *errp is
+ * not written on success.
+ *
+ * NOTE(review): the result of creating the top level directory is ignored, so
+ * registration reports success even if that creation failed -- confirm that
+ * this is intended.
+ */
+sdev_plugin_hdl_t
+sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
+{
+ int ret, err;
+ sdev_plugin_t *spp, *iter;
+ vnode_t *vp, *nvp;
+ sdev_node_t *sdp, *slp;
+ timestruc_t now;
+ struct vattr vap;
+
+ /*
+ * Some consumers don't care about why they failed. To keep the code
+ * simple, we'll just pretend they gave us something.
+ */
+ if (errp == NULL)
+ errp = &err;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_version != 1) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
+ ops->spo_inactive == NULL) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);
+
+ /* Translate the plugin's flags into the sdev node flags they imply. */
+ spp->sp_pops = ops;
+ spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
+ if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
+ spp->sp_nflags |= SDEV_NO_NCACHE;
+ if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
+ spp->sp_nflags |= SDEV_SUBDIR;
+ spp->sp_vnops = sdev_plugin_vnops;
+ spp->sp_islegacy = B_FALSE;
+ spp->sp_lvtor = NULL;
+ spp->sp_nnodes = 0;
+
+ /*
+ * Make sure it's unique, nothing exists with this name already, and add
+ * it to the list. We also need to go through and grab the sdev
+ * root node as we cannot grab any sdev node locks once we've grabbed
+ * the sdev_plugin_lock. We effectively assert that if a directory is
+ * not present in the GZ's /dev, then it doesn't exist in any of the
+ * local zones.
+ */
+ ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1);
+ if (ret != 0) {
+ *errp = ret;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ /* Make sure we have the real vnode */
+ if (VOP_REALVP(vp, &nvp, NULL) == 0) {
+ VN_HOLD(nvp);
+ VN_RELE(vp);
+ vp = nvp;
+ nvp = NULL;
+ }
+ VERIFY(vp->v_op == sdev_vnodeops);
+ sdp = VTOSDEV(vp);
+ rw_enter(&sdp->sdev_contents, RW_WRITER);
+ slp = sdev_cache_lookup(sdp, spp->sp_name);
+ if (slp != NULL) {
+ SDEV_RELE(slp);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+
+ /* /dev's contents lock is taken before sdev_plugin_lock, never after. */
+ mutex_enter(&sdev_plugin_lock);
+ for (iter = list_head(&sdev_plugin_list); iter != NULL;
+ iter = list_next(&sdev_plugin_list, iter)) {
+ if (strcmp(spp->sp_name, iter->sp_name) == 0) {
+ mutex_exit(&sdev_plugin_lock);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ }
+
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ /*
+ * Now go ahead and create the top level directory for the global zone.
+ */
+ vap = *sdev_getdefault_attr(VDIR);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+
+ (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);
+
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+
+ return ((sdev_plugin_hdl_t)spp);
+}
+
+/*
+ * sdev_mnt_walk() callback used by sdev_plugin_unregister(). rdp is the node
+ * handed to us by the walk and arg is the plugin being removed. If rdp has an
+ * entry named after the plugin, that entry is deleted from rdp's directory
+ * cache and then marked stale.
+ */
+static void
+sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
+{
+ sdev_plugin_t *spp = arg;
+ sdev_node_t *sdp;
+
+ rw_enter(&rdp->sdev_contents, RW_WRITER);
+ sdp = sdev_cache_lookup(rdp, spp->sp_name);
+ /* If it doesn't exist, we're done here */
+ if (sdp == NULL) {
+ rw_exit(&rdp->sdev_contents);
+ return;
+ }
+
+ /*
+ * We first delete the directory before recursively marking everything
+ * else stale. This ordering should ensure that we don't accidentally
+ * miss anything.
+ */
+ sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
+ sdev_stale(sdp);
+ SDEV_RELE(sdp);
+ rw_exit(&rdp->sdev_contents);
+}
+
+/*
+ * Remove a plugin. This will block until everything has become a zombie, thus
+ * guaranteeing the caller that nothing will call into them again once this call
+ * returns. While the call is ongoing, it could be called into. Note that while
+ * this is ongoing, it will block other mounts.
+ *
+ * Returns EINVAL if the handle refers to a legacy plugin and 0 otherwise. The
+ * plugin structure is freed before returning 0, so the handle must not be
+ * used afterwards.
+ */
+int
+sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
+{
+ sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
+ if (spp->sp_islegacy)
+ return (EINVAL);
+
+ /* Take it off the list first so that no new node can match it. */
+ mutex_enter(&sdev_plugin_lock);
+ list_remove(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
+ /* Wait for the inactive callback to account for every remaining node. */
+ mutex_enter(&spp->sp_lock);
+ while (spp->sp_nnodes > 0)
+ cv_wait(&spp->sp_nodecv, &spp->sp_lock);
+ mutex_exit(&spp->sp_lock);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (0);
+}
+
+/*
+ * Register an old sdev style plugin to deal with what used to be in the vtab.
+ *
+ * The plugin takes its name, node flags and validator from the vtab entry. If
+ * the entry has its own vnode operations table, it is merged over the standard
+ * sdev table and turned into a vnodeops with vn_make_ops(); otherwise the
+ * standard sdev_vnodeops is used. Returns 0 on success and 1 if vn_make_ops()
+ * fails.
+ */
+static int
+sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
+{
+ sdev_plugin_t *spp;
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
+ spp->sp_islegacy = B_TRUE;
+ spp->sp_pops = NULL;
+ spp->sp_nflags = vtp->vt_flags;
+ spp->sp_lvtor = vtp->vt_vtor;
+ spp->sp_nnodes = 0;
+
+ if (vtp->vt_service != NULL) {
+ fs_operation_def_t *templ;
+ templ = sdev_merge_vtab(vtp->vt_service);
+ if (vn_make_ops(vtp->vt_name,
+ (const fs_operation_def_t *)templ,
+ &spp->sp_vnops) != 0) {
+ cmn_err(CE_WARN, "%s: malformed vnode ops\n",
+ vtp->vt_name);
+ sdev_free_vtab(templ);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (1);
+ }
+
+ /* Publish the new vnodeops through the entry's global pointer. */
+ if (vtp->vt_global_vops) {
+ *(vtp->vt_global_vops) = spp->sp_vnops;
+ }
+
+ sdev_free_vtab(templ);
+ } else {
+ spp->sp_vnops = sdev_vnodeops;
+ }
+
+ /*
+ * No need to check for EEXIST here. These are loaded as a part of the
+ * sdev's initialization function. Further, we don't have to create them
+ * as that's taken care of in sdev's mount for the GZ.
+ */
+ mutex_enter(&sdev_plugin_lock);
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ return (0);
+}
+
+/*
+ * Find the plugin, if any, that is responsible for the node dv.
+ *
+ * We need to match off of the sdev_path, not the sdev_name. We are only allowed
+ * to exist directly under /dev. A plugin matches if the part of the path that
+ * follows "/dev/" is exactly the plugin's name or, for plugins that have
+ * SDEV_SUBDIR set, if it consists of the plugin's name followed by a '/'.
+ *
+ * Returns the first matching plugin on sdev_plugin_list or NULL if there is
+ * none. The list lock is not held on return.
+ */
+static sdev_plugin_t *
+sdev_match(sdev_node_t *dv)
+{
+	size_t nlen;
+	const char *path;
+	sdev_plugin_t *spp;
+
+	if (strlen(dv->sdev_path) <= 5)
+		return (NULL);
+
+	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+		return (NULL);
+	path = dv->sdev_path + 5;
+
+	mutex_enter(&sdev_plugin_lock);
+
+	for (spp = list_head(&sdev_plugin_list); spp != NULL;
+	    spp = list_next(&sdev_plugin_list, spp)) {
+		if (strcmp(spp->sp_name, path) == 0)
+			break;
+
+		if ((spp->sp_nflags & SDEV_SUBDIR) == 0)
+			continue;
+
+		/*
+		 * Compare the whole plugin name. Once all nlen characters are
+		 * known to match, path[nlen] is at worst the terminating NUL,
+		 * so it is always safe to look at.
+		 */
+		nlen = strlen(spp->sp_name);
+		if (strncmp(spp->sp_name, path, nlen) == 0 &&
+		    path[nlen] == '/')
+			break;
+	}
+
+	mutex_exit(&sdev_plugin_lock);
+
+	/* spp is NULL here if we fell off the end of the list */
+	return (spp);
+}
+
+/*
+ * Set SDEV_NO_NCACHE on dv if the plugin whose name equals dv's path relative
+ * to "/dev/" has that flag set. Only the first plugin with a matching name is
+ * considered.
+ */
+void
+sdev_set_no_negcache(sdev_node_t *dv)
+{
+	const char *relpath;
+	sdev_plugin_t *cur;
+
+	ASSERT(dv->sdev_path);
+	relpath = dv->sdev_path + strlen("/dev/");
+
+	mutex_enter(&sdev_plugin_lock);
+
+	cur = list_head(&sdev_plugin_list);
+	while (cur != NULL && strcmp(cur->sp_name, relpath) != 0)
+		cur = list_next(&sdev_plugin_list, cur);
+
+	if (cur != NULL && (cur->sp_nflags & SDEV_NO_NCACHE) != 0)
+		dv->sdev_flags |= SDEV_NO_NCACHE;
+
+	mutex_exit(&sdev_plugin_lock);
+}
+
+/*
+ * Pick the vnode operations for the node dv and set up its flags.
+ *
+ * If a plugin matches the node, the plugin's node flags are added to dv and
+ * the plugin's vnodeops are returned. Otherwise the standard sdev_vnodeops are
+ * returned. In both cases SDEV_PERSIST is inherited from the parent; for a
+ * plugin node that only happens if the node is itself persistent or is not
+ * dynamic.
+ */
+struct vnodeops *
+sdev_get_vop(sdev_node_t *dv)
+{
+	sdev_plugin_t *spp;
+
+	ASSERT(dv->sdev_path);
+
+	if ((spp = sdev_match(dv)) != NULL) {
+		dv->sdev_flags |= spp->sp_nflags;
+		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+			dv->sdev_flags |= SDEV_PERSIST;
+		return (spp->sp_vnops);
+	}
+
+	/* child inherits the persistence of the parent */
+	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
+		dv->sdev_flags |= SDEV_PERSIST;
+	return (sdev_vnodeops);
+}
+
+/*
+ * Return the validator function for the node dv, or NULL if no plugin is
+ * responsible for it. The plugin is taken from the node's sdev_private if
+ * that is set and is otherwise found by matching the node's path. Legacy
+ * plugins supply their own validator; all others share
+ * sdev_plugin_validate().
+ */
+void *
+sdev_get_vtor(sdev_node_t *dv)
+{
+	sdev_plugin_t *plugin = dv->sdev_private;
+
+	if (plugin == NULL && (plugin = sdev_match(dv)) == NULL)
+		return (NULL);
+
+	if (!plugin->sp_islegacy)
+		return ((void *)sdev_plugin_validate);
+
+	return ((void *)plugin->sp_lvtor);
+}
+
+/*
+ * Called when the node sdp is made ready. If a non-legacy plugin matches the
+ * node, remember the plugin in the node's sdev_private and count the node
+ * against the plugin. The caller must hold sdp's sdev_contents lock as
+ * writer.
+ */
+void
+sdev_plugin_nodeready(sdev_node_t *sdp)
+{
+	sdev_plugin_t *plugin;
+
+	ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
+	ASSERT(sdp->sdev_private == NULL);
+
+	plugin = sdev_match(sdp);
+	if (plugin == NULL || plugin->sp_islegacy)
+		return;
+
+	sdp->sdev_private = plugin;
+
+	mutex_enter(&plugin->sp_lock);
+	plugin->sp_nnodes++;
+	mutex_exit(&plugin->sp_lock);
+}
+
+/*
+ * Set up the plugin framework: create the kmem cache for sdev_plugin_t, the
+ * global plugin lock and list, register every entry of the legacy vtab and
+ * build the vnodeops shared by all non-legacy plugins. Returns 0 on success
+ * and 1 on failure.
+ *
+ * NOTE(review): the failure paths return without undoing what was set up
+ * before them (cache, lock, list, already registered legacy entries) --
+ * confirm that callers treat a failure here as fatal.
+ */
+int
+sdev_plugin_init(void)
+{
+ sdev_vop_table_t *vtp;
+ fs_operation_def_t *templ;
+
+ sdev_plugin_cache = kmem_cache_create("sdev_plugin",
+ sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
+ sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ if (sdev_plugin_cache == NULL)
+ return (1);
+ mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
+ offsetof(sdev_plugin_t, sp_link));
+
+ /*
+ * Register all of the legacy vnops
+ */
+ for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
+ if (sdev_plugin_register_legacy(vtp) != 0)
+ return (1);
+
+ /* The merged template is only needed until vn_make_ops() has run. */
+ templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
+ if (vn_make_ops("sdev_plugin",
+ (const fs_operation_def_t *)templ,
+ &sdev_plugin_vnops) != 0) {
+ sdev_free_vtab(templ);
+ return (1);
+ }
+
+ sdev_free_vtab(templ);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c
index b4b27e6285..a9f10b55a8 100644
--- a/usr/src/uts/common/fs/dev/sdev_subr.c
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = {
kmem_cache_t *sdev_node_cache; /* sdev_node cache */
int devtype; /* fstype */
-/* static */
-static struct vnodeops *sdev_get_vop(struct sdev_node *);
-static void sdev_set_no_negcache(struct sdev_node *);
-static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
-static void sdev_free_vtab(fs_operation_def_t *);
-
static void
sdev_prof_free(struct sdev_node *dv)
{
@@ -318,6 +312,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
/* overwritten for VLNK nodes */
dv->sdev_symlink = NULL;
+ list_link_init(&dv->sdev_plist);
vp = SDEVTOV(dv);
vn_reinit(vp);
@@ -406,6 +401,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
} else {
dv->sdev_nlink = 1;
}
+ sdev_plugin_nodeready(dv);
if (!(SDEV_IS_GLOBAL(dv))) {
dv->sdev_origin = (struct sdev_node *)args;
@@ -502,37 +498,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
return (dv);
}
-/* directory dependent vop table */
-struct sdev_vop_table {
- char *vt_name; /* subdirectory name */
- const fs_operation_def_t *vt_service; /* vnodeops table */
- struct vnodeops *vt_vops; /* constructed vop */
- struct vnodeops **vt_global_vops; /* global container for vop */
- int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */
- int vt_flags;
-};
-
-/*
- * A nice improvement would be to provide a plug-in mechanism
- * for this table instead of a const table.
- */
-static struct sdev_vop_table vtab[] =
-{
- { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
+struct sdev_vop_table vtab[] = {
+ { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
+ { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
+ { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops,
devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
+ { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE },
- { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
- SDEV_DYNAMIC | SDEV_VTOR },
+ { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate,
+ SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
+ { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops,
devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
/*
@@ -547,132 +528,14 @@ static struct sdev_vop_table vtab[] =
* preventing a mkdir.
*/
- { "lofi", NULL, NULL, NULL, NULL,
+ { "lofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { "rlofi", NULL, NULL, NULL, NULL,
+ { "rlofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { NULL, NULL, NULL, NULL, NULL, 0}
+ { NULL, NULL, NULL, NULL, 0}
};
-/*
- * We need to match off of the sdev_path, not the sdev_name. We are only allowed
- * to exist directly under /dev.
- */
-struct sdev_vop_table *
-sdev_match(struct sdev_node *dv)
-{
- int vlen;
- int i;
- const char *path;
-
- if (strlen(dv->sdev_path) <= 5)
- return (NULL);
-
- if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
- return (NULL);
- path = dv->sdev_path + 5;
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0)
- return (&vtab[i]);
- if (vtab[i].vt_flags & SDEV_SUBDIR) {
- vlen = strlen(vtab[i].vt_name);
- if ((strncmp(vtab[i].vt_name, path,
- vlen - 1) == 0) && path[vlen] == '/')
- return (&vtab[i]);
- }
-
- }
- return (NULL);
-}
-
-/*
- * sets a directory's vnodeops if the directory is in the vtab;
- */
-static struct vnodeops *
-sdev_get_vop(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
- char *path;
-
- path = dv->sdev_path;
- ASSERT(path);
-
- /* gets the relative path to /dev/ */
- path += 5;
-
- /* gets the vtab entry it matches */
- if ((vtp = sdev_match(dv)) != NULL) {
- dv->sdev_flags |= vtp->vt_flags;
- if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
- (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
- dv->sdev_flags |= SDEV_PERSIST;
-
- if (vtp->vt_vops) {
- if (vtp->vt_global_vops)
- *(vtp->vt_global_vops) = vtp->vt_vops;
-
- return (vtp->vt_vops);
- }
-
- if (vtp->vt_service) {
- fs_operation_def_t *templ;
- templ = sdev_merge_vtab(vtp->vt_service);
- if (vn_make_ops(vtp->vt_name,
- (const fs_operation_def_t *)templ,
- &vtp->vt_vops) != 0) {
- cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
- vtp->vt_name);
- /*NOTREACHED*/
- }
- if (vtp->vt_global_vops) {
- *(vtp->vt_global_vops) = vtp->vt_vops;
- }
- sdev_free_vtab(templ);
-
- return (vtp->vt_vops);
- }
-
- return (sdev_vnodeops);
- }
-
- /* child inherits the persistence of the parent */
- if (SDEV_IS_PERSIST(dv->sdev_dotdot))
- dv->sdev_flags |= SDEV_PERSIST;
-
- return (sdev_vnodeops);
-}
-
-static void
-sdev_set_no_negcache(struct sdev_node *dv)
-{
- int i;
- char *path;
-
- ASSERT(dv->sdev_path);
- path = dv->sdev_path + strlen("/dev/");
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0) {
- if (vtab[i].vt_flags & SDEV_NO_NCACHE)
- dv->sdev_flags |= SDEV_NO_NCACHE;
- break;
- }
- }
-}
-
-void *
-sdev_get_vtor(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
-
- vtp = sdev_match(dv);
- if (vtp)
- return ((void *)vtp->vt_vtor);
- else
- return (NULL);
-}
/*
* Build the base root inode
@@ -952,8 +815,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
dv->sdev_path = NULL;
}
- if (!SDEV_IS_GLOBAL(dv))
+ if (!SDEV_IS_GLOBAL(dv)) {
sdev_prof_free(dv);
+ if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL)
+ SDEV_RELE(dv->sdev_origin);
+ }
if (SDEVTOV(dv)->v_type == VDIR) {
ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
@@ -2948,46 +2814,6 @@ sdev_modctl_devexists(const char *path)
return (error);
}
-extern int sdev_vnodeops_tbl_size;
-
-/*
- * construct a new template with overrides from vtab
- */
-static fs_operation_def_t *
-sdev_merge_vtab(const fs_operation_def_t tab[])
-{
- fs_operation_def_t *new;
- const fs_operation_def_t *tab_entry;
-
- /* make a copy of standard vnode ops table */
- new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
- bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
-
- /* replace the overrides from tab */
- for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
- fs_operation_def_t *std_entry = new;
- while (std_entry->name) {
- if (strcmp(tab_entry->name, std_entry->name) == 0) {
- std_entry->func = tab_entry->func;
- break;
- }
- std_entry++;
- }
- if (std_entry->name == NULL)
- cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
- tab_entry->name);
- }
-
- return (new);
-}
-
-/* free memory allocated by sdev_merge_vtab */
-static void
-sdev_free_vtab(fs_operation_def_t *new)
-{
- kmem_free(new, sdev_vnodeops_tbl_size);
-}
-
/*
* a generic setattr() function
*
diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c
index ea9cb6374a..6f32f47635 100644
--- a/usr/src/uts/common/fs/dev/sdev_vfsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c
@@ -169,7 +169,13 @@ devinit(int fstype, char *name)
if ((devmajor = getudev()) == (major_t)-1) {
cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name);
- return (1);
+ return (ENXIO);
+ }
+
+ if (sdev_plugin_init() != 0) {
+ cmn_err(CE_WARN, "%s: failed to set init plugin subsystem",
+ sdev_vfssw.name);
+ return (EIO);
}
/* initialize negative cache */
@@ -332,6 +338,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
ASSERT(sdev_origins);
dv->sdev_flags &= ~SDEV_GLOBAL;
dv->sdev_origin = sdev_origins->sdev_root;
+ SDEV_HOLD(dv->sdev_origin);
} else {
sdev_ncache_setup();
rw_enter(&dv->sdev_contents, RW_WRITER);
@@ -504,3 +511,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo)
SDEVTOV(mntinfo->sdev_root)->v_count--;
mutex_exit(&sdev_lock);
}
+
+/*
+ * Call func(root, arg) on the sdev_root of every entry on the sdev_mntinfo
+ * list. sdev_lock is held across the whole walk, so func runs with it held.
+ */
+void
+sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg)
+{
+	struct sdev_data *mnt;
+
+	mutex_enter(&sdev_lock);
+	for (mnt = sdev_mntinfo; mnt != NULL; mnt = mnt->sdev_next)
+		func(mnt->sdev_root, arg);
+	mutex_exit(&sdev_lock);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 33a2fa5935..dedb4dadcc 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(ip_squeue_worker_wait, pri);
+ sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index e93e0570db..0de9c6fa18 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#if !defined(lint)
@@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
@@ -152,6 +160,12 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
char *hook6_loop_out = "ipfilter_hook6_loop_out";
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -249,6 +263,31 @@ ipf_stack_t *ifs;
ifs->ifs_ipf_ipv4 = NULL;
}
+ /*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
#undef UNDO_HOOK
#ifdef IPFDEBUG
@@ -442,6 +481,48 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, ifs);
+ HOOK_INIT(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, ifs);
+ HOOK_INIT(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+ /*
* Reacquire ipf_global, now it is safe.
*/
WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -2136,6 +2217,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represent a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/* vnd IPv4 inbound: forwards all three arguments unchanged to ipf_hook4_in(). */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+/* vnd IPv6 inbound: forwards all three arguments unchanged to ipf_hook6_in(). */
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/* vnd IPv4 outbound: forwards all three arguments unchanged to ipf_hook4_out(). */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+/* vnd IPv6 outbound: forwards all three arguments unchanged to ipf_hook6_out(). */
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hook4_loop_in */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index 260f50dff4..f13d363bbc 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -125,6 +125,10 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_in;
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
/* flags to indicate whether hooks are registered. */
boolean_t ifs_hook4_physical_in;
@@ -137,10 +141,16 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
/* ip_auth.c */
int ifs_fr_authsize;
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 2e08dc359b..1009f0700f 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,7 +23,7 @@
*/
/*
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
*/
/*
@@ -61,6 +61,10 @@
* connection are processed on that squeue. The connection ("conn") to
* squeue mapping is stored in "conn_t" member "conn_sqp".
*
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
* Since the processing of the connection cuts across multiple layers
* but still allows packets for different connnection to be processed on
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -244,7 +248,7 @@ squeue_init(void)
/* ARGSUSED */
squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri, boolean_t isip)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri)
sqp->sq_enter = squeue_enter;
sqp->sq_drain = squeue_drain;
+ sqp->sq_isip = isip;
return (sqp);
}
/*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+	kt_did_t poll_did, worker_did;
+
+	mutex_enter(&sqp->sq_lock);
+	VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+	    SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+
+	/* Record both thread ids under sq_lock, before waking the threads. */
+	poll_did = sqp->sq_poll_thr->t_did;
+	worker_did = sqp->sq_worker->t_did;
+
+	/* Flag the squeue as exiting and wake both threads to notice it. */
+	sqp->sq_state |= SQS_EXIT;
+	cv_signal(&sqp->sq_poll_cv);
+	cv_signal(&sqp->sq_worker_cv);
+	mutex_exit(&sqp->sq_lock);
+
+	/* Wait for both threads to finish before releasing the squeue. */
+	thread_join(poll_did);
+	thread_join(worker_did);
+	kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
* Bind squeue worker thread to the specified CPU, given by CPU id.
* If the CPU id value is -1, bind the worker thread to the value
* specified in sq_bind field. If a thread is already bound to a
@@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
SQUEUE_DBG_CLEAR(sqp);
- CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
return;
}
} else {
- if (ira != NULL) {
+ if (sqp->sq_isip == B_TRUE && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
if (!(sqp->sq_state & SQS_REENTER) &&
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
(sqp->sq_run == curthread) && (cnt == 1) &&
- (connp->conn_on_sqp == B_FALSE)) {
+ (sqp->sq_isip == B_FALSE ||
+ connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
@@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
- if (ira != NULL) {
+ if (sqp->sq_isip && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -779,7 +818,7 @@ again:
mp->b_prev = NULL;
/* Is there an ip_recv_attr_t to handle? */
- if (ip_recv_attr_is_mblk(mp)) {
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
mblk_t *attrmp = mp;
ASSERT(attrmp->b_cont != NULL);
@@ -804,20 +843,25 @@ again:
/*
- * Handle squeue switching. More details in the
- * block comment at the top of the file
+ * Handle squeue switching. More details in the block comment at
+ * the top of the file. non-IP squeues cannot switch, as there
+ * is no conn_t.
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
mp->b_tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp)
cv_wait(async, lock);
CALLB_CPR_SAFE_END(&cprinfo, lock);
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
SQS_POLL_THR_QUIESCED);
if (ctl_state != 0) {
@@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp)
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+ /* Only IP related squeues should reach this point */
+ VERIFY(sqp->sq_isip == B_TRUE);
+
poll_again:
sq_rx_ring = sqp->sq_rx_ring;
sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp)
ill_rx_ring_t *rx_ring;
ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ VERIFY(sqp->sq_isip == B_TRUE);
if (sqp->sq_state & SQS_POLL_RESTART) {
/* Restart implies a previous quiesce. */
@@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp)
for (;;) {
for (;;) {
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
/*
* If the poll thread has handed control to us
* we need to break out of the wait.
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
again:
sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1487,6 +1546,7 @@ void
squeue_synch_exit(conn_t *connp)
{
squeue_t *sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_run == curthread) {
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index a438e43d91..79c3d8260a 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -41,7 +41,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req,
proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req,
proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req,
- proto_notify_req, proto_passive_req;
+ proto_notify_req, proto_passive_req, proto_exclusive_req;
static void proto_capability_advertise(dld_str_t *, mblk_t *);
static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *);
@@ -121,6 +121,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp)
case DL_PASSIVE_REQ:
proto_passive_req(dsp, mp);
break;
+ case DL_EXCLUSIVE_REQ:
+ proto_exclusive_req(dsp, mp);
+ break;
default:
proto_req(dsp, mp);
break;
@@ -605,6 +608,10 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp)
new_flags |= DLS_PROMISC_PHYS;
break;
+ case DL_PROMISC_RX_ONLY:
+ new_flags |= DLS_PROMISC_RX_ONLY;
+ break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed2;
@@ -692,12 +699,24 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp)
new_flags &= ~DLS_PROMISC_PHYS;
break;
+	case DL_PROMISC_RX_ONLY:
+		if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) {
+			/*
+			 * RX-only mode was never enabled on this stream. The
+			 * default case of this switch exits the MAC perimeter
+			 * before jumping to failed, so this error path must do
+			 * the same or it would leave the perimeter held.
+			 */
+			dl_err = DL_NOTENAB;
+			mac_perim_exit(mph);
+			goto failed;
+		}
+		new_flags &= ~DLS_PROMISC_RX_ONLY;
+		break;
+
default:
dl_err = DL_NOTSUPPORTED;
mac_perim_exit(mph);
goto failed;
}
+ /* DLS_PROMISC_RX_ONLY can't be a solo flag */
+ if (new_flags == DLS_PROMISC_RX_ONLY)
+ new_flags = 0;
+
/*
* Adjust channel promiscuity.
*/
@@ -1295,7 +1314,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp)
* If we've already become active by issuing an active primitive,
* then it's too late to try to become passive.
*/
- if (dsp->ds_passivestate == DLD_ACTIVE) {
+ if (dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE) {
dl_err = DL_OUTSTATE;
goto failed;
}
@@ -1354,7 +1374,12 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
direct->di_rx_ch);
- direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ if (direct->di_flags & DI_DIRECT_RAW) {
+ direct->di_tx_df =
+ (uintptr_t)str_mdata_raw_fastpath_put;
+ } else {
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ }
direct->di_tx_dh = dsp;
direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
direct->di_tx_cb_dh = dsp->ds_mch;
@@ -1516,8 +1541,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
* completes. So we limit the check to DLD_ENABLE case.
*/
if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) &&
- (dsp->ds_sap != ETHERTYPE_IP ||
- !check_mod_above(dsp->ds_rq, "ip"))) {
+ ((dsp->ds_sap != ETHERTYPE_IP ||
+ !check_mod_above(dsp->ds_rq, "ip")) &&
+ !check_mod_above(dsp->ds_rq, "vnd"))) {
return (ENOTSUP);
}
@@ -1599,9 +1625,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
- * Direct capability negotiation interface between IP and DLD
+ * Direct capability negotiation interface between IP/VND and DLD. Note
+ * that for vnd we only allow the case where the media type is the
+ * native media type so we know that there are no transformations that
+ * would have to happen to the mac header that it receives.
*/
- if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) {
+ if ((dsp->ds_sap == ETHERTYPE_IP &&
+ check_mod_above(dsp->ds_rq, "ip")) ||
+ (check_mod_above(dsp->ds_rq, "vnd") &&
+ dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) {
dld_capable = B_TRUE;
subsize += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
@@ -1720,3 +1752,36 @@ dld_capabilities_disable(dld_str_t *dsp)
if (dsp->ds_polling)
(void) dld_capab_poll_disable(dsp, NULL);
}
+
+/*
+ * DL_EXCLUSIVE_REQ handler. The request is only accepted while the stream's
+ * passive state is still DLD_UNINITIALIZED and the message is at least
+ * DL_EXCLUSIVE_REQ_SIZE bytes long. On success the stream is marked
+ * DLD_EXCLUSIVE and a DL_OK_ACK is sent; otherwise a DL_ERROR_ACK is sent
+ * carrying the DLPI error and, for DL_SYSERR, the errno returned by
+ * dls_exclusive_set().
+ */
+static void
+proto_exclusive_req(dld_str_t *dsp, mblk_t *mp)
+{
+	mac_perim_handle_t perim;
+	t_uscalar_t dl_err;
+	int err = 0;
+
+	if (dsp->ds_passivestate != DLD_UNINITIALIZED) {
+		dl_err = DL_OUTSTATE;
+	} else if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) {
+		dl_err = DL_BADPRIM;
+	} else {
+		/* dls_exclusive_set() is called inside the MAC perimeter. */
+		mac_perim_enter_by_mh(dsp->ds_mh, &perim);
+		err = dls_exclusive_set(dsp, B_TRUE);
+		mac_perim_exit(perim);
+
+		if (err == 0) {
+			dsp->ds_passivestate = DLD_EXCLUSIVE;
+			dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ);
+			return;
+		}
+		dl_err = DL_SYSERR;
+	}
+
+	dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)err);
+}
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index 4e693c3a2a..12100b8a74 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -854,6 +854,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
return (mp);
}
+/*
+ * VLAN sanity checks and tag fix-up for a raw-mode ethernet packet, shared by
+ * the raw put paths. mhip holds the already-parsed header info for *mpp.
+ *
+ * Returns B_FALSE when the packet must be dropped; *mpp is left untouched and
+ * the callers free the message themselves. Returns B_TRUE otherwise, in which
+ * case *mpp may have been replaced by the message that
+ * i_dld_ether_header_update_tag() returned.
+ */
+static boolean_t
+i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp)
+{
+	uint_t stream_vid = mac_client_vid(dsp->ds_mch);
+	uint_t pkt_vid = VLAN_ID(mhip->mhi_tci);
+	uint_t pkt_pri = VLAN_PRI(mhip->mhi_tci);
+	uint_t new_pri;
+	mblk_t *tagged;
+
+	/*
+	 * A stream that has its own VID may not send a packet that already
+	 * carries a VID.
+	 */
+	if (stream_vid != VLAN_ID_NONE && pkt_vid != VLAN_ID_NONE)
+		return (B_FALSE);
+
+	/*
+	 * A tagged (non-PVID) packet whose priority and VID are both zero is
+	 * rejected.
+	 */
+	if (mhip->mhi_istagged && !mhip->mhi_ispvid && pkt_pri == 0 &&
+	    pkt_vid == VLAN_ID_NONE)
+		return (B_FALSE);
+
+	/*
+	 * When the packet carries no priority, fall back to the per-stream
+	 * priority; otherwise hand 0 to the tag-update routine.
+	 */
+	new_pri = (pkt_pri == 0) ? dsp->ds_pri : 0;
+
+	/* Nothing to rewrite: no priority to apply and no stream VID. */
+	if (new_pri == 0 && stream_vid == VLAN_ID_NONE)
+		return (B_TRUE);
+
+	tagged = i_dld_ether_header_update_tag(*mpp, new_pri, stream_vid,
+	    dsp->ds_dlp->dl_tagmode);
+	if (tagged == NULL)
+		return (B_FALSE);
+
+	*mpp = tagged;
+	return (B_TRUE);
+}
+
+/*
+ * Raw-mode transmit entry point for the direct-call path. Parses the MAC
+ * header of mp, runs the ethernet VLAN checks when the media is DL_ETHER, and
+ * hands the packet to DLD_TX().
+ *
+ * Returns the cookie from DLD_TX(); a non-NULL cookie also marks the stream
+ * full via DLD_SETQFULL(). If the header cannot be parsed or the ethernet
+ * check rejects the packet, the message is freed and NULL is returned.
+ */
+mac_tx_cookie_t
+str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
+    uint16_t flag)
+{
+	mac_header_info_t hdr;
+	mac_tx_cookie_t tx_cookie;
+
+	if (mac_vlan_header_info(dsp->ds_mh, mp, &hdr) != 0 ||
+	    (dsp->ds_mip->mi_media == DL_ETHER &&
+	    i_dld_raw_ether_check(dsp, &hdr, &mp) == B_FALSE)) {
+		/* TODO: bump kstat? */
+		freemsg(mp);
+		return (NULL);
+	}
+
+	tx_cookie = DLD_TX(dsp, mp, f_hint, flag);
+	if (tx_cookie != NULL) {
+		DLD_SETQFULL(dsp);
+	}
+	return (tx_cookie);
+}
+
+
+
/*
* M_DATA put (IP fast-path mode)
*/
@@ -902,7 +973,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
mblk_t *bp, *newmp;
size_t size;
mac_header_info_t mhi;
- uint_t pri, vid, dvid;
uint_t max_sdu;
/*
@@ -948,38 +1018,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
goto discard;
if (is_ethernet) {
- dvid = mac_client_vid(dsp->ds_mch);
-
- /*
- * Discard the packet if this is a VLAN stream but the VID in
- * the packet is not correct.
- */
- vid = VLAN_ID(mhi.mhi_tci);
- if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
- goto discard;
-
- /*
- * Discard the packet if this packet is a tagged packet
- * but both pri and VID are 0.
- */
- pri = VLAN_PRI(mhi.mhi_tci);
- if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
- vid == VLAN_ID_NONE)
+ if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
goto discard;
-
- /*
- * Update the priority bits to the per-stream priority if
- * priority is not set in the packet. Update the VID for
- * packets on a VLAN stream.
- */
- pri = (pri == 0) ? dsp->ds_pri : 0;
- if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
- if ((newmp = i_dld_ether_header_update_tag(mp, pri,
- dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
- goto discard;
- }
- mp = newmp;
- }
}
if (DLD_TX(dsp, mp, 0, 0) != NULL) {
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 61e4afb5b6..9fa649943c 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
*/
/*
@@ -248,19 +248,44 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
{
int err = 0;
uint32_t old_flags = dsp->ds_promisc;
+ uint32_t new_type = new_flags & ~DLS_PROMISC_RX_ONLY;
mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL;
+ uint16_t mac_flags = 0;
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
- DLS_PROMISC_PHYS)));
+ DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY)));
+
+ /*
+ * Asking us just to turn on DLS_PROMISC_RX_ONLY is not valid.
+ */
+ if (new_flags == DLS_PROMISC_RX_ONLY)
+ return (EINVAL);
/*
* If the user has only requested DLS_PROMISC_MULTI then we need to make
* sure that they don't see all packets.
*/
- if (new_flags == DLS_PROMISC_MULTI)
+ if (new_type == DLS_PROMISC_MULTI)
mptype = MAC_CLIENT_PROMISC_MULTI;
+ /*
+ * Look at new flags and figure out the correct mac promisc flags.
+ * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS,
+ * don't turn on physical promisc mode.
+ */
+ if (new_flags & DLS_PROMISC_RX_ONLY)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
+ if (new_type == DLS_PROMISC_SAP)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;
+
+ /*
+ * There are three cases we care about here with respect to MAC. Going
+ * from nothing to something, something to nothing, something to
+ * something where we need to change how we're getting stuff from mac.
+ * In the last case, as long as they're not equal, we need to assume
+ * something has changed and do something about it.
+ */
if (dsp->ds_promisc == 0 && new_flags != 0) {
/*
* If only DLS_PROMISC_SAP, we don't turn on the
@@ -268,9 +293,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
*/
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph,
- (new_flags != DLS_PROMISC_SAP) ? 0 :
- MAC_PROMISC_FLAGS_NO_PHYS);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0) {
dsp->ds_promisc = old_flags;
return (err);
@@ -296,19 +319,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
&dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
}
- } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 &&
- new_flags != dsp->ds_promisc) {
- /*
- * If the old flag is PROMISC_SAP, but the current flag has
- * changed to some new non-zero value, we need to turn the
- * physical promiscuous mode.
- */
+ } else if (new_flags != 0 && new_flags != old_flags) {
ASSERT(dsp->ds_mph != NULL);
mac_promisc_remove(dsp->ds_mph);
/* Honors both after-remove and before-add semantics! */
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph, 0);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0)
dsp->ds_promisc = old_flags;
} else {
@@ -675,7 +692,10 @@ dls_mac_active_set(dls_link_t *dlp)
* Set the function to start receiving packets.
*/
mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
+ } else if (dlp->dl_exclusive == B_TRUE) {
+ return (EBUSY);
}
+
dlp->dl_nactive++;
return (0);
}
@@ -701,7 +721,11 @@ dls_active_set(dld_str_t *dsp)
if (dsp->ds_passivestate == DLD_PASSIVE)
return (0);
- /* If we're already active, then there's nothing more to do. */
+ if (dsp->ds_dlp->dl_exclusive == B_TRUE &&
+ dsp->ds_passivestate != DLD_EXCLUSIVE)
+ return (EBUSY);
+
+ /* If we're already active, we need to check the link's exclusivity */
if ((dsp->ds_nactive == 0) &&
((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) {
/* except for ENXIO all other errors are mapped to EBUSY */
@@ -710,7 +734,8 @@ dls_active_set(dld_str_t *dsp)
return (err);
}
- dsp->ds_passivestate = DLD_ACTIVE;
+ dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ?
+ DLD_EXCLUSIVE : DLD_ACTIVE;
dsp->ds_nactive++;
return (0);
}
@@ -741,7 +766,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all)
if (dsp->ds_nactive != 0)
return;
- ASSERT(dsp->ds_passivestate == DLD_ACTIVE);
+ ASSERT(dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE);
dls_mac_active_clear(dsp->ds_dlp);
+ /*
+ * We verify below to ensure that no other part of DLS has mucked with
+ * our exclusive state.
+ */
+ if (dsp->ds_passivestate == DLD_EXCLUSIVE)
+ VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0);
dsp->ds_passivestate = DLD_UNINITIALIZED;
}
+
+/*
+ * Set or clear the exclusive flag on the link underneath dsp. Clearing always
+ * succeeds. Setting fails with EBUSY if the link already has active users.
+ * The caller must hold the MAC perimeter.
+ */
+int
+dls_exclusive_set(dld_str_t *dsp, boolean_t enable)
+{
+	dls_link_t *dlp;
+
+	ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+	dlp = dsp->ds_dlp;
+
+	/* Exclusivity may only be claimed while nobody else is active. */
+	if (enable != B_FALSE && dlp->dl_nactive != 0)
+		return (EBUSY);
+
+	dlp->dl_exclusive = (enable == B_FALSE) ? B_FALSE : B_TRUE;
+	return (0);
+}
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 9e4d6fdad5..82dc7cbd2e 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -602,6 +602,7 @@ i_dls_link_destroy(dls_link_t *dlp)
dlp->dl_mip = NULL;
dlp->dl_unknowns = 0;
dlp->dl_nonip_cnt = 0;
+ dlp->dl_exclusive = B_FALSE;
kmem_cache_free(i_dls_link_cachep, dlp);
}
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index 27a808f211..6111d62475 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -542,6 +542,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
return (err);
}
+/*
+ * Look up the datalink id of `link' as named in zone `zid' by making a
+ * DLMGMT_CMD_GETLINKID upcall. On success the id is stored in *linkid and 0
+ * is returned; otherwise the upcall's error is returned and *linkid is left
+ * untouched. The caller must be in the global zone or in zone `zid'.
+ */
+int
+dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid,
+    zoneid_t zid)
+{
+	dlmgmt_door_getlinkid_t req;
+	dlmgmt_getlinkid_retval_t resp;
+	int ret;
+
+	ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid());
+
+	req.ld_cmd = DLMGMT_CMD_GETLINKID;
+	req.ld_zoneid = zid;
+	(void) strlcpy(req.ld_link, link, MAXLINKNAMELEN);
+
+	ret = i_dls_mgmt_upcall(&req, sizeof (req), &resp, sizeof (resp));
+	if (ret != 0)
+		return (ret);
+
+	*linkid = resp.lr_linkid;
+	return (0);
+}
+
+
datalink_id_t
dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class,
datalink_media_t dmedia, uint32_t flags)
@@ -1209,7 +1230,7 @@ dls_devnet_rele(dls_devnet_t *ddp)
}
static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
{
char drv[MAXLINKNAMELEN];
uint_t ppa;
@@ -1219,7 +1240,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
dls_dev_handle_t ddh;
int err;
- if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0)
+ if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0)
return (dls_devnet_hold(linkid, ddpp));
/*
@@ -1662,15 +1683,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid)
* Access a vanity naming node.
*/
int
-dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp,
+ zoneid_t zid)
{
dls_devnet_t *ddp;
dls_link_t *dlp;
- zoneid_t zid = getzoneid();
+ zoneid_t czid = getzoneid();
int err;
mac_perim_handle_t mph;
- if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
+ if (czid != GLOBAL_ZONEID && czid != zid)
+ return (ENOENT);
+
+ if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0)
return (err);
dls_devnet_prop_task_wait(ddp);
@@ -1703,6 +1728,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
return (0);
}
+/*
+ * Open a vanity naming node, resolving `link' in the caller's own zone.
+ */
+int
+dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+{
+	zoneid_t zid = getzoneid();
+
+	return (dls_devnet_open_in_zone(link, dhp, devp, zid));
+}
+
/*
* Close access to a vanity naming node.
*/
diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c
new file mode 100644
index 0000000000..381273a2e5
--- /dev/null
+++ b/usr/src/uts/common/io/gsqueue/gsqueue.c
@@ -0,0 +1,607 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Serialization queues are a technique used in illumos to provide what's
+ * commonly known as a 'vertical' perimeter. The idea (described a bit in
+ * uts/common/inet/squeue.c) is to provide a means to make sure that message
+ * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd
+ * consume these on different policies, ip on a conn_t basis, vnd on a per
+ * device basis, and use this to ensure that only one packet is being processed
+ * at a given time.
+ *
+ * Serialization queues were originally used by ip. As part of that
+ * implementation, many of the details of ip were baked into it. That includes
+ * things like conn_t, ip receive attributes, and the notion of sets. While an
+ * individual serialization queue, or gsqueue_t, is a useful level of
+ * abstraction, it isn't the basis on which most consumers want to manage them.
+ * Instead, we have the notion of a set of serialization queues. These sets are
+ * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a
+ * gsqueue_t per CPU to fanout on without managing them all itself. In the
+ * original implementation, this existed, but they were heavily tied into the
+ * infrastructure of IP, and its notion of polling on the underlying MAC
+ * devices.
+ *
+ * The result of that past is a new interface to serialization queues and a
+ * similar, but slightly different, abstraction to sets of these
+ * (gsqueue_set_t). When designing this there are two different approaches that
+ * one could consider. The first is that the system has one gsqueue_set_t that
+ * the entire world shares, whether IP or some other consumer. The other is that
+ * every consumer has their own set.
+ *
+ * The trade offs between these two approaches are the pathological failure
+ * modes. There is no guarantee that any two consumers here are equivalent. In
+ * fact, they very likely have very different latency profiles. If they are
+ * being processed in the same queue, that can lead to very odd behaviors. More
+ * generally, if we have a series of processing functions from one consumer
+ * which are generally short, and another which are generally long, that'll
+ * cause undue latency that's harder to observe. If we instead take the approach
+ * that each consumer should have its own set that it fans out over then we
+ * won't end up with the problem that a given serialization queue will have
+ * multiple latency profiles, but instead we'll see cpu contention for the bound
+ * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker
+ * thread is bound and it is in fact possible for it to be processed by other
+ * threads on other CPUs.
+ *
+ * We've opted to go down the second path, so each consumer has its own
+ * independent set of serialization queues that it is bound over.
+ *
+ * Structure Hierarchies
+ * ---------------------
+ *
+ * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t
+ * encapsulates all the per-CPU gsqueue_t that exist in the form of
+ * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could
+ * accommodate more than one gsqueue_t, but today there is a one to one mapping.
+ *
+ * We maintain two different lists of gsqueue_cpu_t, the active and defunct
+ * sets. The active set is maintained in the array `gs_cpus`. There are NCPU
+ * entries available in `gs_cpus` with the total number of currently active cpus
+ * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When
+ * there is no longer a need for a given binding (see the following section for
+ * more explanation on when this is the case) then we move the entry to the
+ * `gs_defunct` list which is just a singly linked list of gsqueue_cpu_t.
+ *
+ * In addition, each gsqueue_set_t can have a series of callbacks registered
+ * with it. These are described in the following section. Graphically, a given
+ * gsqueue_set_t looks roughly like the following:
+ *
+ * +---------------+
+ * | gsqueue_set_t |
+ * +---------------+
+ * | |
+ * | * . . . gs_cpus
+ * | |
+ * | | +-------------------------------------------------+
+ * | +----->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |...
+ * | +-------------------------------------------------+
+ * |
+ * * . . . gs_defunct
+ * |
+ * | +--------------+ +--------------+ +--------------+
+ * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |...
+ * +--------------+ +--------------+ +--------------+
+ *
+ * CPU DR, gsqueue_cpu_t, and gsqueue_t
+ * ------------------------------------
+ *
+ * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker
+ * thread that may end up doing work. As part of supporting fanout, we have one
+ * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of
+ * this binding, we need to deal with CPU DR changes.
+ *
+ * The gsqueue driver maintains a single CPU DR callback that is used for the
+ * entire sub-system. We break down CPU DR events into three groups. Offline
+ * events, online events, and events we can ignore. When the first group occurs,
+ * we need to go through every gsqueue_set_t, find the gsqueue_cpu_t that
+ * corresponds to that processor id, and unbind all of its gsqueue_t's. It's
+ * rather important that we only unbind the gsqueue_t's and not actually destroy
+ * them. When this happens, they could very easily have data queued inside of
+ * them and it's unreasonable to just throw out everything in them at this
+ * point. The data remains intact and service continues uninterrupted.
+ *
+ * When we receive an online event, we do the opposite. We try to find a
+ * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid
+ * field intact) in the defunct list. If we find one, we remove it from the
+ * defunct list and add it to the active list as well as binding the gsqueue_t
+ * to the CPU in question. If we don't find one, then we create a new one.
+ *
+ * To deal with these kinds of situations, we allow a consumer to register
+ * callbacks for the gsqueue_t that they are interested in. These callbacks will
+ * fire whenever we are handling a topology change. The design of the callbacks
+ * is not that the user can take any administrative action during them, but
+ * rather set something for them to do asynchronously. It is illegal to make any
+ * calls into the gsqueue system while you are in a callback.
+ *
+ * Locking
+ * -------
+ *
+ * The lock ordering here is fairly straightforward. Due to our use of CPU
+ * binding and the CPU DR callbacks, we have an additional lock to consider
+ * cpu_lock. Because of that, the following are the rules for locking:
+ *
+ *
+ * o If performing binding operations, you must grab cpu_lock. cpu_lock is
+ * also at the top of the order.
+ *
+ * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock
+ * If you need to take multiple locks, you must take the greatest
+ * (left-most) one first.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/cpuvar.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+
+#include <sys/gsqueue.h>
+#include <sys/squeue_impl.h>
+
+/*
+ * One registered topology-change callback. Entries form a singly linked list
+ * headed by the owning set's gs_cbs; the entry's address doubles as the id
+ * returned by gsqueue_set_cb_add().
+ */
+typedef struct gsqueue_cb {
+ /* Next callback on the set's gs_cbs list. */
+ struct gsqueue_cb *gcb_next;
+ /* Function invoked by gsqueue_notify(). */
+ gsqueue_cb_f gcb_func;
+ /* Caller-supplied argument handed back to gcb_func. */
+ void *gcb_arg;
+} gsqueue_cb_t;
+
+/*
+ * Per-CPU state for a set: the squeue created for a given processor.
+ */
+typedef struct gsqueue_cpu {
+ /* Link used while the entry sits on a set's gs_defunct list. */
+ struct gsqueue_cpu *gqc_next;
+ /* The serialization queue created for this CPU. */
+ squeue_t *gqc_head;
+ /* Id of the CPU the squeue was created for. */
+ processorid_t gqc_cpuid;
+} gsqueue_cpu_t;
+
+/*
+ * A consumer's set of per-CPU serialization queues.
+ */
+struct gsqueue_set {
+ /* Linkage on the global gsqueue_list. */
+ list_node_t gs_next;
+ /* wwait value passed to squeue_create() for each squeue in the set. */
+ uint_t gs_wwait;
+ /* Worker priority passed to squeue_create() for each squeue in the set. */
+ pri_t gs_wpri;
+ /* Taken around accesses to the fields below once the set is published. */
+ kmutex_t gs_lock;
+ /* Number of valid entries at the front of gs_cpus. */
+ int gs_ncpus;
+ /* Array of NCPU slots holding the active (bound) entries, unordered. */
+ gsqueue_cpu_t **gs_cpus;
+ /* Singly linked list of unbound entries, chained through gqc_next. */
+ gsqueue_cpu_t *gs_defunct;
+ /* Singly linked list of registered callbacks. */
+ gsqueue_cb_t *gs_cbs;
+};
+
+/* Taken around accesses to gsqueue_list; ordered after cpu_lock. */
+static kmutex_t gsqueue_lock;
+/* Every gsqueue_set_t currently in existence. */
+static list_t gsqueue_list;
+/* kmem caches for the three structure types used by this file. */
+static kmem_cache_t *gsqueue_cb_cache;
+static kmem_cache_t *gsqueue_cpu_cache;
+static kmem_cache_t *gsqueue_set_cache;
+
+/*
+ * Allocate a gsqueue_cpu_t for `cpuid': create a squeue with the given wait
+ * and priority values, put it in the SQS_DEFAULT state and bind it to the
+ * CPU. Allocation sleeps, so this does not return NULL.
+ */
+static gsqueue_cpu_t *
+gsqueue_cpu_create(uint_t wwait, pri_t wpri, processorid_t cpuid)
+{
+	gsqueue_cpu_t *gcp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP);
+	squeue_t *sqp = squeue_create(wwait, wpri, B_FALSE);
+
+	sqp->sq_state = SQS_DEFAULT;
+	squeue_bind(sqp, cpuid);
+
+	gcp->gqc_head = sqp;
+	gcp->gqc_cpuid = cpuid;
+	gcp->gqc_next = NULL;
+
+	return (gcp);
+}
+
+/*
+ * Destroy the squeue owned by `scp' and return `scp' to its cache.
+ */
+static void
+gsqueue_cpu_destroy(gsqueue_cpu_t *scp)
+{
+	squeue_t *sqp = scp->gqc_head;
+
+	squeue_destroy(sqp);
+	kmem_cache_free(gsqueue_cpu_cache, scp);
+}
+
+/*
+ * Create a new set of serialization queues with one bound squeue for every
+ * CPU that is currently active and exists, then publish the set on the
+ * global list.
+ */
+gsqueue_set_t *
+gsqueue_set_create(uint_t wwait, pri_t wpri)
+{
+	int id;
+	gsqueue_set_t *setp;
+
+	setp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP);
+	setp->gs_ncpus = 0;
+	setp->gs_wpri = wpri;
+	setp->gs_wwait = wwait;
+
+	/*
+	 * cpu_lock is held from before the first binding until the set has
+	 * been inserted on the global list, so the set is fully populated by
+	 * the time the CPU DR callback can find it.
+	 */
+	mutex_enter(&cpu_lock);
+
+	for (id = 0; id < NCPU; id++) {
+		gsqueue_cpu_t *gcp;
+		cpu_t *cp = cpu_get(id);
+
+		if (cp == NULL || !CPU_ACTIVE(cp) ||
+		    (cp->cpu_flags & CPU_EXISTS) == 0)
+			continue;
+
+		gcp = gsqueue_cpu_create(wwait, wpri, cp->cpu_id);
+		setp->gs_cpus[setp->gs_ncpus] = gcp;
+		setp->gs_ncpus++;
+	}
+
+	/* Publish the set on the global list. */
+	mutex_enter(&gsqueue_lock);
+	list_insert_tail(&gsqueue_list, setp);
+	mutex_exit(&gsqueue_lock);
+
+	mutex_exit(&cpu_lock);
+
+	return (setp);
+}
+
+/*
+ * Tear down a set: take it off the global list, unbind every active squeue,
+ * then destroy all squeues and free any callbacks still registered.
+ */
+void
+gsqueue_set_destroy(gsqueue_set_t *gssp)
+{
+	int idx;
+	gsqueue_cpu_t *gcp;
+	gsqueue_cb_t *cbp;
+
+	/*
+	 * Unbinding needs cpu_lock. Every active entry is moved onto the
+	 * defunct list while it is held; after that cpu_lock is not needed.
+	 */
+	mutex_enter(&cpu_lock);
+
+	mutex_enter(&gsqueue_lock);
+	list_remove(&gsqueue_list, gssp);
+	mutex_exit(&gsqueue_lock);
+
+	mutex_enter(&gssp->gs_lock);
+	for (idx = 0; idx < gssp->gs_ncpus; idx++) {
+		gcp = gssp->gs_cpus[idx];
+		squeue_unbind(gcp->gqc_head);
+		gcp->gqc_next = gssp->gs_defunct;
+		gssp->gs_defunct = gcp;
+		gssp->gs_cpus[idx] = NULL;
+	}
+	gssp->gs_ncpus = 0;
+	mutex_exit(&gssp->gs_lock);
+
+	mutex_exit(&cpu_lock);
+
+	/* Destroy every squeue, all of which are now on the defunct list. */
+	while ((gcp = gssp->gs_defunct) != NULL) {
+		gssp->gs_defunct = gcp->gqc_next;
+		gsqueue_cpu_destroy(gcp);
+	}
+
+	/* Free callbacks the consumer never removed. */
+	while ((cbp = gssp->gs_cbs) != NULL) {
+		gssp->gs_cbs = cbp->gcb_next;
+		kmem_cache_free(gsqueue_cb_cache, cbp);
+	}
+
+	ASSERT(gssp->gs_ncpus == 0);
+	ASSERT(gssp->gs_defunct == NULL);
+	ASSERT(gssp->gs_cbs == NULL);
+	kmem_cache_free(gsqueue_set_cache, gssp);
+}
+
+/*
+ * Return the serialization queue that `index' maps to. The index is reduced
+ * modulo the number of active CPUs in the set, so any value is accepted.
+ * NOTE(review): a set with no active entries would divide by zero here.
+ */
+gsqueue_t *
+gsqueue_set_get(gsqueue_set_t *gssp, uint_t index)
+{
+	uint_t slot;
+	squeue_t *sqp;
+
+	mutex_enter(&gssp->gs_lock);
+	slot = index % gssp->gs_ncpus;
+	sqp = gssp->gs_cpus[slot]->gqc_head;
+	mutex_exit(&gssp->gs_lock);
+
+	return ((gsqueue_t *)sqp);
+}
+
+/*
+ * Register `cb' to be called with `arg' on every topology change of the set.
+ * The return value identifies the registration for gsqueue_set_cb_remove().
+ */
+uintptr_t
+gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg)
+{
+	gsqueue_cb_t *newcb = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP);
+
+	newcb->gcb_arg = arg;
+	newcb->gcb_func = cb;
+
+	/* Push onto the head of the set's callback list. */
+	mutex_enter(&gssp->gs_lock);
+	newcb->gcb_next = gssp->gs_cbs;
+	gssp->gs_cbs = newcb;
+	mutex_exit(&gssp->gs_lock);
+
+	return ((uintptr_t)newcb);
+}
+
+/*
+ * Remove the callback registration identified by `id'. Returns 0 if it was
+ * found and freed, or -1 if the set has no such registration.
+ */
+int
+gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id)
+{
+	gsqueue_cb_t **linkp, *cbp;
+
+	mutex_enter(&gssp->gs_lock);
+	/* Walk the link pointers so head and interior removal look alike. */
+	for (linkp = &gssp->gs_cbs; (cbp = *linkp) != NULL;
+	    linkp = &cbp->gcb_next) {
+		if ((uintptr_t)cbp == id)
+			break;
+	}
+	if (cbp != NULL)
+		*linkp = cbp->gcb_next;
+	mutex_exit(&gssp->gs_lock);
+
+	if (cbp == NULL)
+		return (-1);
+
+	kmem_cache_free(gsqueue_cb_cache, cbp);
+	return (0);
+}
+
+/*
+ * Hand a single, unchained message to the serialization queue. The
+ * processing function and its argument travel with the message in its
+ * b_queue and b_prev fields, which is why the message must arrive with
+ * b_next and b_prev clear.
+ */
+void
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg,
+    int flags, uint8_t tag)
+{
+	squeue_t *sqp;
+
+	ASSERT(mp->b_next == NULL);
+	ASSERT(mp->b_prev == NULL);
+
+	mp->b_prev = arg;
+	mp->b_queue = (queue_t *)func;
+
+	sqp = (squeue_t *)gsp;
+	sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag);
+}
+
+/*
+ * Invoke every callback registered on the set, telling it whether `sqp' has
+ * just come online or gone offline. The caller must hold the set's gs_lock.
+ */
+static void
+gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online)
+{
+	gsqueue_cb_t *cur;
+
+	ASSERT(MUTEX_HELD(&gssp->gs_lock));
+
+	for (cur = gssp->gs_cbs; cur != NULL; cur = cur->gcb_next)
+		cur->gcb_func(gssp, (gsqueue_t *)sqp, cur->gcb_arg, online);
+}
+
+/*
+ * When we online a processor we need to go through and either bind a defunct
+ * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the
+ * defunct list that used to be on that processor. If no such gsqueue_cpu_t
+ * exists, then we'll create a new one. We'd rather avoid taking over an
+ * existing defunct one that used to be on another CPU, as it's not
+ * unreasonable to believe that its CPU will come back. More CPUs are offlined
+ * and onlined by the administrator or by creating cpu sets than actually get
+ * offlined by FMA.
+ *
+ * A reused entry is unlinked from the defunct list before it is placed in the
+ * active array, so an entry is only ever on one of the two. Registered
+ * callbacks are notified with the set's gs_lock held.
+ *
+ * The caller must hold cpu_lock.
+ */
+static void
+gsqueue_handle_online(processorid_t id)
+{
+	gsqueue_set_t *gssp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	mutex_enter(&gsqueue_lock);
+	for (gssp = list_head(&gsqueue_list); gssp != NULL;
+	    gssp = list_next(&gsqueue_list, gssp)) {
+		gsqueue_cpu_t *scp, **scpp;
+
+		mutex_enter(&gssp->gs_lock);
+
+		/* Look for a defunct entry that used to belong to this CPU. */
+		for (scpp = &gssp->gs_defunct; (scp = *scpp) != NULL;
+		    scpp = &scp->gqc_next) {
+			if (scp->gqc_cpuid == id)
+				break;
+		}
+
+		if (scp == NULL) {
+			scp = gsqueue_cpu_create(gssp->gs_wwait,
+			    gssp->gs_wpri, id);
+		} else {
+			/*
+			 * Take the entry off the defunct list before making
+			 * it active again; leaving it linked would put it on
+			 * both lists and corrupt the defunct list the next
+			 * time it is pushed there.
+			 */
+			*scpp = scp->gqc_next;
+			scp->gqc_next = NULL;
+			squeue_bind(scp->gqc_head, id);
+		}
+
+		ASSERT(gssp->gs_ncpus < NCPU);
+		gssp->gs_cpus[gssp->gs_ncpus] = scp;
+		gssp->gs_ncpus++;
+		gsqueue_notify(gssp, scp->gqc_head, B_TRUE);
+		mutex_exit(&gssp->gs_lock);
+	}
+	mutex_exit(&gsqueue_lock);
+}
+
+/*
+ * A processor is going away: in every set, unbind the squeue that belongs to
+ * it, move its gsqueue_cpu_t from the active array to the defunct list and
+ * notify the registered callbacks. The squeue itself is kept. Sets with no
+ * active entry for the CPU are left untouched. The caller must hold
+ * cpu_lock.
+ */
+static void
+gsqueue_handle_offline(processorid_t id)
+{
+	gsqueue_set_t *gssp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	mutex_enter(&gsqueue_lock);
+	for (gssp = list_head(&gsqueue_list); gssp != NULL;
+	    gssp = list_next(&gsqueue_list, gssp)) {
+		int idx, last;
+		gsqueue_cpu_t *gcp;
+
+		mutex_enter(&gssp->gs_lock);
+
+		for (idx = 0; idx < gssp->gs_ncpus; idx++) {
+			if (gssp->gs_cpus[idx]->gqc_cpuid == id)
+				break;
+		}
+
+		if (idx == gssp->gs_ncpus) {
+			/* This set has nothing bound to the CPU. */
+			mutex_exit(&gssp->gs_lock);
+			continue;
+		}
+
+		gcp = gssp->gs_cpus[idx];
+		squeue_unbind(gcp->gqc_head);
+		gcp->gqc_next = gssp->gs_defunct;
+		gssp->gs_defunct = gcp;
+
+		/* Order is unimportant, so fill the hole with the tail. */
+		last = gssp->gs_ncpus - 1;
+		gssp->gs_cpus[idx] = gssp->gs_cpus[last];
+		gssp->gs_ncpus = last;
+
+		gsqueue_notify(gssp, gcp->gqc_head, B_FALSE);
+		mutex_exit(&gssp->gs_lock);
+	}
+	mutex_exit(&gsqueue_lock);
+}
+
+/*
+ * CPU DR callback registered with register_cpu_setup_func(). Events fall
+ * into three groups: those that bring a CPU into service, those that take it
+ * out, and everything else, which is ignored. Called with cpu_lock held and
+ * always returns 0.
+ */
+/* ARGSUSED */
+static int
+gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused)
+{
+	cpu_t *cp;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	cp = cpu_get(id);
+	switch (what) {
+	case CPU_CONFIG:
+	case CPU_ON:
+	case CPU_INIT:
+	case CPU_CPUPART_IN:
+		/* Only bind to CPUs that are really able to run threads. */
+		if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS)
+			gsqueue_handle_online(cp->cpu_id);
+		break;
+	case CPU_UNCONFIG:
+	case CPU_OFF:
+	case CPU_CPUPART_OUT:
+		/*
+		 * Use the id we were handed rather than going through cp,
+		 * which is not checked for NULL on this path. The offline
+		 * handler only needs the processor id.
+		 */
+		gsqueue_handle_offline(id);
+		break;
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+
+/*
+ * kmem cache constructor for gsqueue_set_t: allocate the NCPU-slot array of
+ * active entries and initialise the lock and list heads. Returns -1 if the
+ * array cannot be allocated under `kmflags'.
+ */
+/* ARGSUSED */
+static int
+gsqueue_set_cache_construct(void *buf, void *arg, int kmflags)
+{
+	gsqueue_set_t *setp = buf;
+	gsqueue_cpu_t **slots;
+
+	slots = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags);
+	setp->gs_cpus = slots;
+	if (slots == NULL)
+		return (-1);
+
+	mutex_init(&setp->gs_lock, NULL, MUTEX_DRIVER, NULL);
+	setp->gs_cbs = NULL;
+	setp->gs_defunct = NULL;
+	setp->gs_ncpus = 0;
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for gsqueue_set_t: release what the constructor set
+ * up.
+ */
+static void
+gsqueue_set_cache_destruct(void *buf, void *arg)
+{
+	gsqueue_set_t *setp = buf;
+	size_t slotsz = sizeof (gsqueue_cpu_t *) * NCPU;
+
+	kmem_free(setp->gs_cpus, slotsz);
+	setp->gs_cpus = NULL;
+	mutex_destroy(&setp->gs_lock);
+}
+
+/*
+ * Set up module-global state: the list of sets and its lock, the three kmem
+ * caches, and finally the CPU DR callback. The callback is registered last so
+ * that everything it touches already exists.
+ */
+static void
+gsqueue_ddiinit(void)
+{
+ list_create(&gsqueue_list, sizeof (gsqueue_set_t),
+ offsetof(gsqueue_set_t, gs_next));
+ mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL);
+
+ gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache",
+ sizeof (gsqueue_cb_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache",
+ sizeof (gsqueue_cpu_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ /*
+ * NOTE(review): this cache is named "squeue_set_cache" while its
+ * siblings carry a "gsqueue_" prefix; confirm whether that is intended.
+ */
+ gsqueue_set_cache = kmem_cache_create("squeue_set_cache",
+ sizeof (gsqueue_set_t),
+ 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct,
+ NULL, NULL, NULL, 0);
+
+
+ /* Registration is done with cpu_lock held. */
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(gsqueue_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * Tear down module-global state. Returns EBUSY, changing nothing, while any
+ * gsqueue_set_t still exists; otherwise removes the CPU DR callback, destroys
+ * the list, the caches and the global lock, and returns 0.
+ *
+ * cpu_lock is taken first (it is at the top of the lock order) and held
+ * across both the emptiness check and the removal of the callback, so no CPU
+ * DR event can walk gsqueue_list once it has been destroyed.
+ */
+static int
+gsqueue_ddifini(void)
+{
+	mutex_enter(&cpu_lock);
+	mutex_enter(&gsqueue_lock);
+	if (list_is_empty(&gsqueue_list) == 0) {
+		mutex_exit(&gsqueue_lock);
+		mutex_exit(&cpu_lock);
+		return (EBUSY);
+	}
+
+	/* Undo the registration made in gsqueue_ddiinit(). */
+	unregister_cpu_setup_func(gsqueue_cpu_setup, NULL);
+
+	list_destroy(&gsqueue_list);
+	mutex_exit(&gsqueue_lock);
+	mutex_exit(&cpu_lock);
+
+	kmem_cache_destroy(gsqueue_set_cache);
+	kmem_cache_destroy(gsqueue_cpu_cache);
+	kmem_cache_destroy(gsqueue_cb_cache);
+
+	mutex_destroy(&gsqueue_lock);
+
+	return (0);
+}
+
+/* Module description: gsqueue is a miscellaneous module. */
+static struct modlmisc gsqueue_modmisc = {
+ &mod_miscops,
+ "gsqueue"
+};
+
+/* Linkage handed to mod_install(), mod_info() and mod_remove(). */
+static struct modlinkage gsqueue_modlinkage = {
+ MODREV_1,
+ &gsqueue_modmisc,
+ NULL
+};
+
+/*
+ * Module load entry point: set up global state, then install the module. If
+ * installation fails the global state is torn down again, which must succeed
+ * since no set can exist yet.
+ */
+int
+_init(void)
+{
+	int ret;
+
+	gsqueue_ddiinit();
+
+	ret = mod_install(&gsqueue_modlinkage);
+	if (ret != 0)
+		VERIFY(gsqueue_ddifini() == 0);
+
+	return (ret);
+}
+
+/*
+ * Module information entry point; defers entirely to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&gsqueue_modlinkage, modinfop));
+}
+
+/*
+ * Module unload entry point. Global state is torn down first; only if that
+ * succeeds is the module removed. Returns the first error hit, or 0.
+ */
+int
+_fini(void)
+{
+	int ret = gsqueue_ddifini();
+
+	if (ret == 0)
+		ret = mod_remove(&gsqueue_modlinkage);
+
+	return (ret);
+}
diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c
new file mode 100644
index 0000000000..e4e700fa12
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/frameio.c
@@ -0,0 +1,464 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Frame I/O utility functions
+ */
+
+#include <sys/frameio.h>
+
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/inttypes.h>
+
+static kmem_cache_t *frameio_cache;
+
+/*
+ * Create the kmem cache behind frameio_alloc(). Each buffer holds a frameio_t
+ * plus room for FRAMEIO_NVECS_MAX framevec_t entries. Returns 0 on success
+ * and 1 if the cache could not be created.
+ */
+int
+frameio_init(void)
+{
+	size_t bufsz = sizeof (frameio_t) +
+	    sizeof (framevec_t) * FRAMEIO_NVECS_MAX;
+
+	frameio_cache = kmem_cache_create("frameio_cache", bufsz,
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	return (frameio_cache == NULL ? 1 : 0);
+}
+
+/*
+ * Destroy the frameio cache if frameio_init() managed to create it.
+ */
+void
+frameio_fini(void)
+{
+	if (frameio_cache == NULL)
+		return;
+
+	kmem_cache_destroy(frameio_cache);
+}
+
+/*
+ * Allocate a frameio_t from the frameio cache. kmflags is passed straight to
+ * kmem_cache_alloc(), whose result is returned unchanged.
+ */
+frameio_t *
+frameio_alloc(int kmflags)
+{
+ return (kmem_cache_alloc(frameio_cache, kmflags));
+}
+
+/*
+ * Return a frameio_t obtained from frameio_alloc() to the frameio cache.
+ */
+void
+frameio_free(frameio_t *fio)
+{
+	/* A void function must not return an expression. */
+	kmem_cache_free(frameio_cache, fio);
+}
+
+/*
+ * Ensure that we don't see any garbage in the framevecs that we're nominally
+ * supposed to work with. Specifically we want to make sure that the buflen and
+ * the address are not zero.
+ */
+static int
+frameio_hdr_check_vecs(frameio_t *fio)
+{
+	int idx;
+
+	for (idx = 0; idx < fio->fio_nvecs; idx++) {
+		framevec_t *fv = &fio->fio_vecs[idx];
+
+		/* Reject a vector with no buffer or no room in it. */
+		if (fv->fv_buf == NULL || fv->fv_buflen == 0)
+			return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * We have to copy in framevec32_t's. To work around the data model issues and
+ * trying not to copy memory we first copy in the framevec32_t data into the
+ * standard fio_vec space. Next we work backwards copying a given framevec32_t
+ * to a temporary framevec_t and then overwrite the frameio_t's data. Note that
+ * it is important that we do this in reverse so as to ensure that we don't
+ * clobber data as the framevec_t is larger than the framevec32_t.
+ */
+static int
+frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr)
+{
+ framevec32_t *vec32p;
+ framevec_t fv;
+ int i;
+
+ /* The 32-bit vectors are staged at the start of fio_vecs itself. */
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+
+ if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs,
+ 0) != 0)
+ return (EFAULT);
+
+ /*
+ * Widen in place from the last entry to the first. Each 32-bit entry is
+ * read completely into the temporary `fv' before the native entry at the
+ * same index is written, because the two share storage.
+ */
+ for (i = fio->fio_nvecs - 1; i >= 0; i--) {
+ fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf;
+ fv.fv_buflen = vec32p[i].fv_buflen;
+ fv.fv_actlen = vec32p[i].fv_actlen;
+ fio->fio_vecs[i].fv_buf = fv.fv_buf;
+ fio->fio_vecs[i].fv_buflen = fv.fv_buflen;
+ fio->fio_vecs[i].fv_actlen = fv.fv_actlen;
+ }
+
+ return (frameio_hdr_check_vecs(fio));
+}
+
+/*
+ * Copy in a frame I/O header and its vectors from addr into fio, which has
+ * room for up to max_vecs vectors. mode carries the caller's data model
+ * (FMODELS) and whether the request came from the kernel (FKIOCTL).
+ *
+ * Returns 0 on success, or:
+ *   EFAULT	a copyin failed
+ *   EINVAL	wrong version; fio_nvecs is zero or above FRAMEIO_NVECS_MAX;
+ *		fio_nvpf is zero or does not evenly divide fio_nvecs; an ILP32
+ *		request arrived with FKIOCTL set; or a vector has a NULL
+ *		buffer or a zero length
+ *   EOVERFLOW	the request holds more vectors than max_vecs
+ */
+int
+frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode)
+{
+ int model = ddi_model_convert_from(mode & FMODELS);
+ int cpf = mode & FKIOCTL ? FKIOCTL : 0;
+ size_t fsize = model == DDI_MODEL_ILP32 ?
+ sizeof (frameio32_t) : sizeof (frameio_t);
+
+ /*
+ * The start of the header is the same in all data models for the
+ * current version.
+ */
+ if (ddi_copyin(addr, fio, fsize, cpf) != 0)
+ return (EFAULT);
+
+ if (fio->fio_version != FRAMEIO_VERSION_ONE)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0)
+ return (EINVAL);
+
+ if (fio->fio_nvpf == 0)
+ return (EINVAL);
+
+ /* Every frame must be made of the same whole number of vectors. */
+ if (fio->fio_nvecs % fio->fio_nvpf != 0)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > max_vecs)
+ return (EOVERFLOW);
+
+ /* The vectors follow immediately after the header. */
+ addr = (void *)((uintptr_t)addr + fsize);
+ if (model == DDI_MODEL_ILP32) {
+ if (cpf != 0)
+ return (EINVAL);
+ return (frameio_hdr_copyin_ilp32(fio, addr));
+ }
+
+ if (ddi_copyin(addr, &fio->fio_vecs[0],
+ sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0)
+ return (EFAULT);
+
+ return (frameio_hdr_check_vecs(fio));
+}
+
+/*
+ * Allocate a message block of `sz' bytes marked as M_DATA. Returns NULL if
+ * allocb() fails.
+ */
+static mblk_t *
+frameio_allocb(size_t sz)
+{
+	mblk_t *mp = allocb(sz, 0);
+
+	if (mp != NULL)
+		mp->b_datap->db_type = M_DATA;
+
+	return (mp);
+}
+
+/*
+ * Copy the fv_buflen bytes described by `fv' into a newly allocated message
+ * block, returned through *mpp. A non-zero cpf means the buffer is a kernel
+ * address (FKIOCTL).
+ *
+ * Returns 0 on success, EAGAIN if no message block could be allocated, or
+ * EFAULT if the copy failed. *mpp is only written on success.
+ */
+static int
+framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	mblk_t *mp;
+
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	mp = frameio_allocb(fv->fv_buflen);
+	if (mp == NULL)
+		return (EAGAIN);
+
+	if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen,
+	    cpf) != 0) {
+		freemsg(mp);
+		return (EFAULT);
+	}
+
+	mp->b_wptr += fv->fv_buflen;
+	*mpp = mp;
+	return (0);
+}
+
+/*
+ * Read a set of frame vectors that make up a single message boundary and return
+ * that as a single message in *mpp that consists of multiple data parts.
+ */
+static int
+frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	int left, ret;
+	mblk_t *blk;
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	/*
+	 * Read one vector at a time, chaining each new block onto the frame
+	 * built so far. A failure throws away the partial frame.
+	 */
+	for (left = fio->fio_nvpf; left > 0; left--, fv++) {
+		ret = framevec_mblk_read(fv, &blk, cpf);
+		if (ret != 0) {
+			freemsg(*mpp);
+			return (ret);
+		}
+
+		if (*mpp != NULL)
+			linkb(*mpp, blk);
+		else
+			*mpp = blk;
+	}
+
+	return (0);
+}
+
+/*
+ * Read data from a series of frameio vectors into a message block chain. A
+ * given frameio request has a number of discrete messages, each made up of
+ * fio->fio_nvpf consecutive vectors. Each discrete message is built as one
+ * message (its parts linked through b_cont) and the messages are chained
+ * together through b_next, with the first returned in *mpp.
+ *
+ * If we get an EAGAIN while trying to construct a given message, what we
+ * return depends on what else we've done so far. If we have successfully
+ * completed at least one message then we return 0 along with the messages
+ * completed so far. If no messages have been completed we return EAGAIN. If
+ * instead we encounter a different error, say EFAULT, then everything built
+ * so far is freed, *mpp is set to NULL and the error is returned.
+ *
+ * If nvecs is not NULL it receives the number of messages returned in *mpp.
+ * A non-zero cpf means the buffers are kernel addresses.
+ */
+int
+frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf)
+{
+	int error = ENOTSUP;
+	int nframes = fio->fio_nvecs / fio->fio_nvpf;
+	int frame;
+	framevec_t *fv;
+	mblk_t *mp, *bmp = NULL;
+
+	/*
+	 * Protect against bogus kernel subsystems.
+	 */
+	VERIFY(fio->fio_nvecs > 0);
+	VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0);
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	/*
+	 * Step through the vectors one frame at a time. frameio_mblk_read()
+	 * only advances its own copy of the pointer, so we move ours on by a
+	 * whole frame for every iteration.
+	 */
+	fv = &fio->fio_vecs[0];
+	for (frame = 0; frame < nframes; frame++, fv += fio->fio_nvpf) {
+		error = frameio_mblk_read(fio, fv, &mp, cpf);
+		if (error != 0)
+			goto failed;
+
+		if (bmp != NULL)
+			bmp->b_next = mp;
+		else
+			*mpp = mp;
+		bmp = mp;
+	}
+
+	if (nvecs != NULL)
+		*nvecs = nframes;
+	return (0);
+failed:
+	/*
+	 * Running out of memory after at least one complete frame is a
+	 * partial success: the failed frame was never linked in, so the chain
+	 * in *mpp holds exactly `frame' good messages.
+	 */
+	if (error == EAGAIN && frame != 0) {
+		ASSERT(*mpp != NULL);
+		if (nvecs != NULL)
+			*nvecs = frame;
+		return (0);
+	}
+
+	/*
+	 * Otherwise nothing is returned. For an EAGAIN on the very first
+	 * frame the chain is already empty and the loop below does nothing.
+	 */
+	for (mp = *mpp; mp != NULL; mp = bmp) {
+		bmp = mp->b_next;
+		freemsg(mp);
+	}
+	if (nvecs != NULL)
+		*nvecs = 0;
+	*mpp = NULL;
+	return (error);
+}
+
+/*
+ * Return the total buffer space in the frame starting at `fv', that is, the
+ * sum of fv_buflen over its fio_nvpf vectors.
+ */
+size_t
+frameio_frame_length(frameio_t *fio, framevec_t *fv)
+{
+	int idx = 0;
+	size_t total = 0;
+
+	while (idx < fio->fio_nvpf) {
+		total += fv[idx].fv_buflen;
+		idx++;
+	}
+
+	return (total);
+}
+
+/*
+ * Write a portion of an mblk to the current framevec: copy `len' bytes
+ * starting `moff' bytes into mp's data out to the vector's buffer at offset
+ * `foff', and add `len' to the vector's fv_actlen. A non-zero cpf means the
+ * buffer is a kernel address. Returns 0 on success or EFAULT if the copy
+ * failed, in which case fv_actlen is left alone.
+ */
+static int
+framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff,
+    size_t foff, int cpf)
+{
+	void *dst;
+
+	ASSERT(len <= MBLKL(mp) - moff);
+	ASSERT(len <= fv->fv_buflen - fv->fv_actlen);
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	/* fv_buf is a void pointer, so compute the offset as an integer. */
+	dst = (void *)((uintptr_t)fv->fv_buf + foff);
+	if (ddi_copyout(mp->b_rptr + moff, dst, len, cpf) != 0)
+		return (EFAULT);
+	fv->fv_actlen += len;
+
+	return (0);
+}
+
+/*
+ * Copy one message (mp and its b_cont chain) out into the frame that starts
+ * at fv, spilling from one vector to the next as each fills up. Returns
+ * EOVERFLOW if the message is larger than the whole frame, or the error from
+ * the copy-out.
+ *
+ * Because copying this out to the user might fail we don't want to update the
+ * b_rptr in case we need to copy it out again.
+ */
+static int
+framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf)
+{
+ int err;
+ size_t msize, blksize, len, moff, foff;
+
+ msize = msgsize(mp);
+ if (msize > frameio_frame_length(fio, fv))
+ return (EOVERFLOW);
+
+ /*
+ * msize: bytes of the message still to copy
+ * blksize: bytes left in the current mblk
+ * moff: read offset into the current mblk
+ * foff: write offset into the current vector
+ */
+ moff = 0;
+ foff = 0;
+ blksize = MBLKL(mp);
+ fv->fv_actlen = 0;
+ while (msize != 0) {
+ /* Copy as much as both the mblk and the vector allow. */
+ len = MIN(blksize, fv->fv_buflen - fv->fv_actlen);
+ err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf);
+ if (err != 0)
+ return (err);
+
+ msize -= len;
+ blksize -= len;
+ moff += len;
+ foff += len;
+
+ /* Current mblk drained: move to the next one in the message. */
+ if (blksize == 0 && msize != 0) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ moff = 0;
+ blksize = MBLKL(mp);
+ }
+
+ /* Current vector full: continue in the next vector of the frame. */
+ if (fv->fv_buflen == fv->fv_actlen && msize != 0) {
+ fv++;
+ fv->fv_actlen = 0;
+ foff = 0;
+ }
+ }
+
+ /*
+ * NOTE(review): vectors of the frame that the message never reaches keep
+ * whatever fv_actlen they had on entry; confirm callers expect that.
+ */
+ return (0);
+}
+
+/*
+ * Copy a b_next chain of messages out into the frames of fio, one message per
+ * frame, stopping when either the messages or the frames run out. Only the
+ * MAP_BLK_FRAME mapping is supported; anything else gets EINVAL.
+ *
+ * If the very first message fails, its error is returned. A failure after at
+ * least one message has been written is reported as success. When nwrite is
+ * not NULL it receives the number of frames written.
+ */
+int
+frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map,
+    mblk_t *mp, int *nwrite, int cpf)
+{
+	int used, err = 0;
+
+	if (map != MAP_BLK_FRAME)
+		return (EINVAL);
+
+	/* `used' counts vectors and so advances a whole frame at a time. */
+	for (used = 0; mp != NULL && used < fio->fio_nvecs;
+	    used += fio->fio_nvpf, mp = mp->b_next) {
+		err = framevec_map_blk(fio, &fio->fio_vecs[used], mp, cpf);
+		if (err != 0)
+			break;
+	}
+
+	if (err != 0 && used == 0) {
+		if (nwrite != NULL)
+			*nwrite = 0;
+		return (err);
+	}
+
+	if (nwrite != NULL)
+		*nwrite = used / fio->fio_nvpf;
+
+	return (0);
+}
+
+/*
+ * Copy out nframes worth of frameio header data back to userland.
+ *
+ * fio_nvecs is first reduced to nframes * fio_nvpf, so only the vectors that
+ * belong to those frames are copied. mode carries the caller's data model and
+ * FKIOCTL. Returns 0 on success, or:
+ *   EINVAL	nframes is more than fio holds
+ *   EOVERFLOW	ILP32 caller and a vector length does not fit in 32 bits
+ *   EFAULT	the copyout failed
+ */
+int
+frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode)
+{
+ int i;
+ int model = ddi_model_convert_from(mode & FMODELS);
+ framevec32_t *vec32p;
+ framevec32_t f;
+
+ if (fio->fio_nvecs / fio->fio_nvpf < nframes)
+ return (EINVAL);
+
+ fio->fio_nvecs = nframes * fio->fio_nvpf;
+
+ /* Native data model: header and vectors go out in one copy. */
+ if (model == DDI_MODEL_NONE) {
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+ }
+
+ ASSERT(model == DDI_MODEL_ILP32);
+
+ /*
+ * Narrow the vectors in place, first to last. Each native entry is read
+ * into the temporary `f' before the 32-bit entry at the same index is
+ * written. Note that an EOVERFLOW part way through leaves the earlier
+ * entries already rewritten in 32-bit form.
+ */
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+ for (i = 0; i < fio->fio_nvecs; i++) {
+ f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf;
+ if (fio->fio_vecs[i].fv_buflen > UINT_MAX ||
+ fio->fio_vecs[i].fv_actlen > UINT_MAX)
+ return (EOVERFLOW);
+ f.fv_buflen = fio->fio_vecs[i].fv_buflen;
+ f.fv_actlen = fio->fio_vecs[i].fv_actlen;
+ vec32p[i].fv_buf = f.fv_buf;
+ vec32p[i].fv_buflen = f.fv_buflen;
+ vec32p[i].fv_actlen = f.fv_actlen;
+ }
+
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+/*
+ * Mark the first nframes frames of fio as fully consumed by setting each of
+ * their vectors' fv_actlen to its fv_buflen.
+ */
+void
+frameio_mark_consumed(frameio_t *fio, int nframes)
+{
+	int idx;
+
+	ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes);
+
+	for (idx = 0; idx < nframes * fio->fio_nvpf; idx++) {
+		framevec_t *fv = &fio->fio_vecs[idx];
+
+		fv->fv_actlen = fv->fv_buflen;
+	}
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
new file mode 100644
index 0000000000..8589965a4b
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -0,0 +1,5469 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * vnd - virtual (machine) networking datapath
+ *
+ * vnd's purpose is to provide a highly performant data path for Layer 2 network
+ * traffic and exist side by side with an active IP netstack, each servicing
+ * different datalinks. vnd provides many of the same capabilities as the
+ * current TCP/IP stack does and some specific to layer two. Specifically:
+ *
+ * o Use of the DLD fastpath
+ * o Packet capture hooks
+ * o Ability to use hardware capabilities
+ * o Useful interfaces for handling multiple frames
+ *
+ * The following image shows where vnd fits into today's networking stack:
+ *
+ * +---------+----------+----------+
+ * | libdlpi | libvnd | libsocket|
+ * +---------+----------+----------+
+ * | · · VFS |
+ * | VFS · VFS +----------+
+ * | · | sockfs |
+ * +---------+----------+----------+
+ * | | VND | IP |
+ * | +----------+----------+
+ * | DLD/DLS |
+ * +-------------------------------+
+ * | MAC |
+ * +-------------------------------+
+ * | GLDv3 |
+ * +-------------------------------+
+ *
+ * -----------------------------------------
+ * A Tale of Two Devices - DDI Device Basics
+ * -----------------------------------------
+ *
+ * vnd presents itself to userland as a character device; however, it also is a
+ * STREAMS device so that it can interface with dld and the rest of the
+ * networking stack. Users never interface with the STREAMS devices directly and
+ * they are purely an implementation detail of vnd. Opening the STREAMS device
+ * requires kcred and as such userland cannot interact with it or push it onto
+ * the stream head.
+ *
+ * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
+ * clone gets its own minor number; however, minor nodes are not created in the
+ * devices tree for these instances. In this state a user may do two different
+ * things. They may issue ioctls that affect global state or they may issue
+ * ioctls that try to attach it to a given datalink. Once a minor device has
+ * been attached to a datalink, all operations on it are scoped to that context,
+ * therefore subsequent global operations are not permitted.
+ *
+ * A given device can be linked into the /devices and /dev name space via a link
+ * ioctl. That ioctl causes a minor node to be created in /devices and then it
+ * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
+ * to, but simpler than, IP's persistence mechanism.
+ *
+ * ---------------------
+ * Binding to a datalink
+ * ---------------------
+ *
+ * Datalinks are backed by the dld (datalink device) and dls (datalink services)
+ * drivers. These drivers provide a STREAMS device for datalinks on the system
+ * which are exposed through /dev/net. Userland generally manipulates datalinks
+ * through libdlpi. When an IP interface is being plumbed up what actually
+ * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
+ * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may
+ * then negotiate with dld and dls to obtain access to various capabilities
+ * and fast paths via a series of STREAMS messages.
+ *
+ * In vnd, we do the same thing, but we leave our STREAMS module as an
+ * implementation detail of the system. We don't want users to be able to
+ * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require
+ * kcred to manipulate it. Thus, when a user issues a request to attach a
+ * datalink to a minor instance of the character device, that vnd minor instance
+ * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink.
+ * vnd does that open using the passed in credentials from the ioctl, not kcred.
+ * This ensures that users who don't have permission to open the device
+ * cannot. Once that's been opened, we push on the vnd streams module.
+ *
+ * Once the vnd STREAMS instance has been created for this device, eg. the
+ * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
+ * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
+ * This association begins the STREAM device's initialization. We start up an
+ * asynchronous state machine that takes care of all the different aspects of
+ * plumbing up the device with dld and dls and enabling the MAC fast path. We
+ * need to guarantee to consumers of the character device that by the time their
+ * ioctl returns, the data path has been fully initialized.
+ *
+ * The state progression is fairly linear. There are two general steady states.
+ * The first is VNS_S_ONLINE, which means that everything is jacked up and good
+ * to go. The alternative is VNS_S_ZOMBIE, which means that the streams device
+ * encountered an error or we have finished tearing it down and the character
+ * device can clean it up. The following is our state progression and the
+ * meaning of each state:
+ *
+ * |
+ * |
+ * V
+ * +---------------+
+ * | VNS_S_INITIAL | This is our initial state. Every
+ * +---------------+ vnd STREAMS device starts here.
+ * | While in this state, only dlpi
+ * | M_PROTO and M_IOCTL messages can be
+ * | sent or received. All STREAMS based
+ * | data messages are dropped.
+ * | We transition out of this state by
+ * | sending a DL_INFO_REQ to obtain
+ * | information about the underlying
+ * | link.
+ * v
+ * +-----------------+
+ * +--<-| VNS_S_INFO_SENT | In this state, we verify and
+ * | +-----------------+ record information about the
+ * | | underlying device. If the device is
+ * | | not suitable, eg. not of type
+ * v | DL_ETHER, then we immediately
+ * | | become a ZOMBIE. To leave this
+ * | | state we request exclusive active
+ * | | access to the device via
+ * v | DL_EXCLUSIVE_REQ.
+ * | v
+ * | +----------------------+
+ * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether
+ * | +----------------------+ or not we were able to obtain
+ * | | | exclusive access to the device. If
+ * | | | we were not able to, then we leave,
+ * v | | as that means that something like
+ * | | | IP is already plumbed up on top of
+ * | | | the datalink. We leave this state
+ * | | | by progressing through to the
+ * | | | appropriate DLPI primitive, either
+ * v | | DL_ATTACH_REQ or DL_BIND_REQ
+ * | | | depending on the style of the
+ * | | | datalink.
+ * | | v
+ * | | +-------------------+
+ * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were
+ * | | +-------------------+ able to perform a standard DLPI
+ * | | | attach and if so, go ahead and
+ * v | | send a DL_BIND_REQ.
+ * | v v
+ * | +-------------------+
+ * +--<-| VNS_S_BIND_SENT | In this state we see the result of
+ * | +-------------------+ our attempt to bind to PPA 0 of the
+ * v | underlying device. Because we're
+ * | | trying to be a layer two datapath,
+ * | | the specific attachment point isn't
+ * | | too important as we're going to
+ * v | have to enable promiscuous mode. We
+ * | | transition out of this by sending
+ * | | our first of three promiscuous mode
+ * | | requests.
+ * v v
+ * | +------------------------+
+ * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we
+ * | +------------------------+ were able to enable promiscuous
+ * | | mode at the physical level. We
+ * | | transition out of this by enabling
+ * | | multicast and broadcast promiscuous
+ * v | mode.
+ * | v
+ * | +--------------------------+
+ * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we
+ * | +--------------------------+ have enabled DL_PROMISC_MULTI and
+ * v | move onto the final promiscuous
+ * | | mode request.
+ * | v
+ * | +----------------------------+
+ * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we
+ * | +----------------------------+ enabled RX_ONLY promiscuous mode.
+ * | | We specifically do this as we don't
+ * v | want to receive our own traffic
+ * | | that we'll send out. We leave this
+ * | | state by requesting the set of
+ * | | dld/dls capabilities that we can
+ * v | process.
+ * | |
+ * | v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of
+ * | +--------------------+ capabilities that dld advertised
+ * | | and enable the ones that we currently
+ * v | support for use. See the section
+ * | | later on regarding capabilities
+ * | | for more information. We leave this
+ * | | state by sending an enable request.
+ * v v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability
+ * | +--------------------+ initialization. Once finished, we
+ * | | transition to the next state. If
+ * v | the dld fast path is not available,
+ * | | we become a zombie.
+ * | v
+ * | +--------------+
+ * +--<-| VNS_S_ONLINE | This is a vnd STREAMS device's
+ * | +--------------+ steady state. It will normally
+ * | | reside in this state while it is in
+ * | | active use. It will only transition
+ * v | to the next state when the STREAMS
+ * | | device is closed by the character
+ * | | device. In this state, all data
+ * | | flows over the dld fast path.
+ * | v
+ * | +---------------------+
+ * +--<-| VNS_S_SHUTTING_DOWN | This vnd state takes care of
+ * | +---------------------+ disabling capabilities and then
+ * | | transitions to zombie state to
+ * v | indicate that it is finished.
+ * | v
+ * | +--------------+
+ * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS
+ * +--------------+ device is waiting to finish being
+ * reaped.
+ *
+ * If the stream association fails for any reason the state machine reaches
+ * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
+ * STREAMS ioctl to the character device. That will fail the user ioctl and
+ * propagate the vnd_errno_t back to userland. If, on the other hand, the
+ * association succeeds, then the vnd STREAMS device will be fully plumbed up
+ * and ready to transmit and receive message blocks. Consumers will be able to
+ * start using the other cbops(9E) entry points once the attach has fully
+ * finished, which will occur after the original user attach ioctl to the
+ * character device returns.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * There are several different devices and structures in the vnd driver. There
+ * is a per-netstack component, pieces related to the character device that
+ * consumers see, the internal STREAMS device state, and the data queues
+ * themselves. The following ASCII art picture describes their relationships and
+ * some of the major pieces of data that contain them. These are not exhaustive,
+ * eg. synchronization primitives are left out.
+ *
+ * +----------------+ +-----------------+
+ * | global | | global |
+ * | device list | | netstack list |
+ * | vnd_dev_list | | vnd_nsd_list |
+ * +----------------+ +-----------------+
+ * | |
+ * | v
+ * | +-------------------+ +-------------------+
+ * | | per-netstack data | ---> | per-netstack data | --> ...
+ * | | vnd_pnsd_t | | vnd_pnsd_t |
+ * | | | +-------------------+
+ * | | |
+ * | | netstackid_t --+----> Netstack ID
+ * | | vnd_pnsd_flags_t -+----> Status flags
+ * | | zoneid_t ---+----> Zone ID for this netstack
+ * | | hook_family_t ---+----> VND IPv4 Hooks
+ * | | hook_family_t ---+----> VND IPv6 Hooks
+ * | | list_t ----+ |
+ * | +------------+------+
+ * | |
+ * | v
+ * | +------------------+ +------------------+
+ * | | character device | ---> | character device | -> ...
+ * +---------->| vnd_dev_t | | vnd_dev_t |
+ * | | +------------------+
+ * | |
+ * | minor_t ---+--> device minor number
+ * | ldi_handle_t ---+--> handle to /dev/net/%datalink
+ * | vnd_dev_flags_t -+--> device flags, non blocking, etc.
+ * | char[] ---+--> name if linked
+ * | vnd_str_t * -+ |
+ * +--------------+---+
+ * |
+ * v
+ * +-------------------------+
+ * | STREAMS device |
+ * | vnd_str_t |
+ * | |
+ * | vnd_str_state_t ---+---> State machine state
+ * | gsqueue_t * ---+---> mblk_t Serialization queue
+ * | vnd_str_stat_t ---+---> per-device kstats
+ * | vnd_str_capab_t ---+----------------------------+
+ * | vnd_data_queue_t ---+ | |
+ * | vnd_data_queue_t -+ | | v
+ * +-------------------+-+---+ +---------------------+
+ * | | | Stream capabilities |
+ * | | | vnd_str_capab_t |
+ * | | | |
+ * | | supported caps <--+-- vnd_capab_flags_t |
+ * | | dld cap handle <--+-- void * |
+ * | | direct tx func <--+-- vnd_dld_tx_t |
+ * | | +---------------------+
+ * | |
+ * +----------------+ +-------------+
+ * | |
+ * v v
+ * +-------------------+ +-------------------+
+ * | Read data queue | | Write data queue |
+ * | vnd_data_queue_t | | vnd_data_queue_t |
+ * | | | |
+ * | size_t ----+--> Current size | size_t ----+--> Current size
+ * | size_t ----+--> Max size | size_t ----+--> Max size
+ * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head
+ * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail
+ * +-------------------+ +-------------------+
+ *
+ *
+ * Globally, we maintain two lists. One list contains all of the character
+ * device soft states. The other maintains a list of all our netstack soft
+ * states. Each netstack maintains a list of active devices that have been
+ * associated with a datalink in its netstack.
+ *
+ * Recall that a given minor instance of the character device exists in one of
+ * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
+ * or it can be associated with a given datalink. When minor instances are in
+ * the former state, they do not exist in a given vnd_pnsd_t's list of devices.
+ * As part of attaching to a datalink, the given vnd_dev_t will be inserted into
+ * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
+ * vnd_str_t, to be created and associated to a vnd_dev_t.
+ *
+ * The character device, and its vnd_dev_t, is the interface to the rest of the
+ * system. The vnd_dev_t keeps track of various aspects like whether various
+ * operations, such as read, write and the frameio ioctls, are considered
+ * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
+ * keeping track of things like the name of the device, if any, in /dev. The
+ * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual
+ * data queues. However, ioctls that manipulate these properties all go through
+ * the vnd_dev_t to its associated vnd_str_t.
+ *
+ * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
+ * for frames to transmit (write queue) and one for frames received (read
+ * queue). These data queues have a maximum size and attempting to add data
+ * beyond that maximum size will result in data being dropped. The sizes are
+ * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
+ * in those buffers or has a reservation in those buffers while they are in vnd
+ * and waiting to be consumed by the user or by mac.
+ *
+ * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
+ * available, negotiated, and currently active features.
+ *
+ * ----------------------
+ * Data Path and gsqueues
+ * ----------------------
+ *
+ * There's a lot of plumbing in vnd to get to the point where we can send data,
+ * but vnd's bread and butter is the data path, so it's worth diving into it in
+ * more detail. Data enters and exits the system from two ends.
+ *
+ * The first end is the vnd consumer. This comes in the form of read and write
+ * system calls as well as the frame I/O ioctls. The read and write system calls
+ * operate on a single frame at a time. Think of a frame as a single message
+ * that has come in off the wire, which may itself comprise multiple mblk_t's
+ * linked together in the kernel. readv(2) and writev(2) have the same
+ * limitations as read(2) and write(2). We enforce this as the system is
+ * required to fill up every uio(9S) buffer before moving onto the next one.
+ * This means that if you have a MTU sized buffer and two frames come in which
+ * are less than half of the MTU they must fill up the given iovec. Even if we
+ * didn't want to do this, we have no way of informing the supplier of the
+ * iovecs that they were only partially filled or where one frame ends and
+ * another begins. That's life, as such we have frame I/O which solves this
+ * problem. It allows for multiple frames to be consumed as well as for frames
+ * to be broken down into multiple vector components.
+ *
+ * The second end is the mac direct calls. As part of negotiating capabilities
+ * via dld, we give mac a function of ours to call when packets are received
+ * [vnd_mac_input()] and a callback to indicate that flow has been restored
+ * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
+ * transmit data with. As part of the contract with mac, mac is allowed to flow
+ * control us by returning a cookie to the transmit function. When that happens,
+ * all outbound traffic is halted until our callback function is called and we
+ * can schedule drains.
+ *
+ * It's worth looking at these in further detail. We'll start with the rx path.
+ *
+ *
+ * |
+ * * . . . packets from gld
+ * |
+ * v
+ * +-------------+
+ * | mac |
+ * +-------------+
+ * |
+ * v
+ * +-------------+
+ * | dld |
+ * +-------------+
+ * |
+ * * . . . dld direct callback
+ * |
+ * v
+ * +---------------+
+ * | vnd_mac_input |
+ * +---------------+
+ * |
+ * v
+ * +---------+ +-------------+
+ * | dropped |<--*---------| vnd_hooks |
+ * | by | . +-------------+
+ * | hooks | . drop probe |
+ * +---------+ kstat bump * . . . Do we have free
+ * | buffer space?
+ * |
+ * no . | . yes
+ * . + .
+ * +---*--+------*-------+
+ * | |
+ * * . . drop probe * . . recv probe
+ * | kstat bump | kstat bump
+ * v |
+ * +---------+ * . . fire pollin
+ * | freemsg | v
+ * +---------+ +-----------------------+
+ * | vnd_str_t`vns_dq_read |
+ * +-----------------------+
+ * ^ ^
+ * +----------+ | | +---------+
+ * | read(9E) |-->-+ +--<--| frameio |
+ * +----------+ +---------+
+ *
+ * The rx path is rather linear. Packets come into us from mac. We always run
+ * them through the various hooks, and if they come out of that, we inspect the
+ * read data queue. If there is not enough space for a packet, we drop it.
+ * Otherwise, we append it to the data queue, and fire read notifications
+ * targeting anyone polling or doing blocking I/O on this device. Those
+ * consumers then drain the head of the data queue.
+ *
+ * The tx path is more complicated due to mac flow control. After any call into
+ * mac, we may have to potentially suspend writes and buffer data for an
+ * arbitrary amount of time. As such, we need to carefully track the total
+ * amount of outstanding data so that we don't waste kernel memory. This is
+ * further complicated by the fact that mac will asynchronously tell us when our
+ * flow has been resumed.
+ *
+ * For data to be able to enter the system, it needs to be able to take a
+ * reservation from the write data queue. Once the reservation has been
+ * obtained, we enter the gsqueue so that we can actually append it. We use
+ * gsqueues (serialization queues) to ensure that packets are manipulated in
+ * order as we deal with the draining and appending packets. We also leverage
+ * its worker thread to help us do draining after mac has restored our flow.
+ *
+ * The following image describes the flow:
+ *
+ * +-----------+ +--------------+ +-------------------------+ +------+
+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done |
+ * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+
+ * +-----------+ +--------------+ . +-------------------------+
+ * | ^ .
+ * | | . reserve space from gsqueue
+ * | | |
+ * queue . . . * | space v
+ * full | * . . . avail +------------------------+
+ * v | | vnd_squeue_tx_append() |
+ * +--------+ +------------+ +------------------------+
+ * | EAGAIN |<--*------| Non-block? |<-+ |
+ * +--------+ . +------------+ | v
+ * . yes v | wait +--------------+
+ * no . .* * . . for | append chain |
+ * +----+ space | to outgoing |
+ * | mblk chain |
+ * from gsqueue +--------------+
+ * | |
+ * | +-------------------------------------------------+
+ * | |
+ * | | yes . . .
+ * v v .
+ * +-----------------------+ +--------------+ . +------+
+ * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
+ * +-----------------------+ +--------------+ +------+
+ * | |
+ * +---------------------------------|---------------------+
+ * | | tx |
+ * | no . . * queue . . *
+ * | flow controlled . | empty * . fire pollout
+ * | . v | if mblk_t's
+ * +-------------+ . +---------------------+ | sent
+ * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
+ * | flags | +---------------------+ |
+ * +-------------+ More data | | | More data |
+ * and limit ^ v * . . and limit ^
+ * not reached . . * | | reached |
+ * +----+ | |
+ * v |
+ * +----------+ +-------------+ +---------------------------+
+ * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with |
+ * | control | | block flags | | vnd_squeue_tx_drain() and |
+ * | callback | +-------------+ | GSQUEUE_FILL flag, iff |
+ * +----------+ | not already scheduled |
+ * +---------------------------+
+ *
+ * The final path taken for a given write(9E)/frameio ioctl depends on whether
+ * or not the vnd_dev_t is non-blocking. That controls the initial path of
+ * trying to take a reservation in write data queue. If the device is in
+ * non-blocking mode, we'll return EAGAIN when there is not enough space
+ * available, otherwise, the calling thread blocks on the data queue.
+ *
+ * Today when we call into vnd_squeue_tx_drain() we will not try to drain the
+ * entire queue, as that could be quite large and we don't want to necessarily
+ * keep the thread that's doing the drain until it's been finished. Not only
+ * could more data be coming in, but the draining thread could be a userland
+ * thread that has more work to do. We have two limits today. There is an upper
+ * bound on the total amount of data and the total number of mblk_t chains. If
+ * we hit either limit, then we will schedule another drain in the gsqueue and
+ * go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in the
+ * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
+ * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at random
+ * is likely to be just as good as any fancy algorithm we might come up with,
+ * especially as any two devices could have radically different transmit
+ * profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf via
+ * the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system which
+ * is in the form of a barrier. The barrier is an internal mechanism used for
+ * ensuring that a gsqueue is drained for a given device. We use this as part
+ * of tearing down. Specifically we disable the write path so nothing new can be
+ * inserted into the gsqueue and then insert a barrier block. Once the barrier
+ * block comes out of the gsqueue, then we know nothing else in the gsqueue that
+ * could refer to the vnd_str_t, being destroyed, exists.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
+ * Because of that, vnd is also a netstack module. It registers with the
+ * netstack sub-system and receives callbacks every time a netstack is created,
+ * shut down, and destroyed. The netstack callbacks drive the creation and
+ * destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is scoped
+ * to a netstack and known about by a given vnd_pnsd_t. When that netstack is
+ * torn down, we also tear down any vnd devices that are hanging around. When
+ * the netstack is torn down, we know that any zones that are scoped to that
+ * netstack are being shut down and have no processes remaining. This is going
+ * to be the case whether they are shared or exclusive stack zones. We have to
+ * perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down, the first is a
+ * shutdown callback, the second is a destroy callback. When the shutdown
+ * callback is fired we need to prepare for the netstack to go away and ensure
+ * that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shutdown we first
+ * remove the netstack from the global netstack list to ensure that no one new
+ * can come in and find the netstack and get a reference to it. After that, we
+ * notify the neti hooks that they're going away. Once that's all done, we get
+ * to the heart of the matter.
+ *
+ * When shutting down there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However, we
+ * know that no one new will be able to find the vnd_pnsd_t. To account for
+ * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
+ * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
+ * to the netstack's list. If this is set, then they must not append to it.
+ * Once this is set, we know that the netstack's list of devices can never grow,
+ * only shrink.
+ *
+ * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
+ * the container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open, or
+ * linking. The presence of this bit also allows things like the list ioctl and
+ * sdev to know not to consider its existence. At the conclusion of this being
+ * set, we know that no one else should be able to obtain a new reference to the
+ * device.
+ *
+ * Once that has been set for all devices, we go through and remove any existing
+ * links that have been established in sdev. Because doing that may cause the
+ * final reference for the device to be dropped, which still has a reference to
+ * the netstack, we have to restart our walk due to dropped locks. We know that
+ * this walk will eventually complete because the device cannot be relinked and
+ * no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
+ * Once that's finished, the shutdown callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes in
+ * it that have open references have been terminated and reaped. Any threads
+ * that are newly trying to reference it will fail. However, there is one thing
+ * that can halt this that we have no control over, which is the global zone
+ * holding open a reference to the device. In this case the zone halt will hang
+ * in vnd_stack_destroy. Once the last reference is dropped we finish
+ * destroying the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
+ * for both the global and non-global zones. In any given zone we always supply
+ * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
+ * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
+ * if a link was named net0, there would be a /dev/vnd/net0. The global zone can
+ * also see every link for every zone, ala /dev/net, under
+ * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
+ * named net0, the global zone would have /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is to
+ * validate that a given node is still valid. The next is a callback from sdev
+ * to say that it is no longer using the node. The third and final one is from
+ * sdev where it asks us to fill a directory. All of the heavy lifting is done
+ * in directory filling and in validation. We opt not to maintain a reference
+ * on the device while there is an sdev node present. This makes the removal of
+ * nodes much simpler and most of the possible failure modes shouldn't cause any
+ * real problems. For example, the open path has to handle both dev_t's which no
+ * longer exist and which are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
+ * provides these for L3 IP and IPv6 traffic. Each netstack provides these hooks
+ * in a minimal fashion. While we will allow traffic to be filtered through the
+ * hooks, we do not provide means for packet injection or additional inspection
+ * at this time. There are a total of four different events created:
+ *
+ * o IPv4 physical in
+ * o IPv4 physical out
+ * o IPv6 physical in
+ * o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each major
+ * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
+ * annotated with the protection that its members receive. The following
+ * annotations are used:
+ *
+ * A Atomics; these values are only modified using atomic operations.
+ * Currently this only applies to kstat values.
+ * E Existence; no lock is needed to access this member, it does not
+ * change while the structure is valid.
+ * GL Global Lock; these members are protected by the global
+ * vnd_dev_lock.
+ * L Locked; access to the member is controlled by a lock that is in
+ * the structure.
+ * NSL netstack lock; this member is protected by the containing
+ * netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ * X This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
+ * With rare exception, once a reference count is decremented, the consumer
+ * should not assume that the data is valid any more. The only exception to this
+ * is the case where we're removing an extant reference count from a link into
+ * /devices or /dev. Reference counts are obtained on these structures as a part
+ * of looking them up.
+ *
+ * # Global Lock Ordering
+ * ######################
+ *
+ * The following is the order that you must take locks in vnd:
+ *
+ * 1) vnd`vnd_dev_lock
+ * 2) vnd_pnsd_t`vpnd_lock
+ * 3) vnd_dev_t`vdd_lock
+ * 4) vnd_str_t`vns_lock
+ * 5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ * o You must acquire a lower numbered lock before a higher numbered lock.
+ * o It is NOT legal to hold two locks of the same level concurrently, eg. you
+ * cannot hold two different vnd_dev_t's vdd_lock at the same time.
+ * o You may release locks in any order.
+ * o If you release a lock, you must honor the locking rules before acquiring
+ * it again.
+ * o You should not hold any locks when calling any of the rele functions.
+ *
+ * # Special Considerations
+ * ########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold. Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how you
+ * should interact with it.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If the code path that goes through
+ * requires an attached vnd_dev_t, eg. the data path and tear down path, then it
+ * is always legal to dereference that member without a lock held. When they are
+ * set, it must be done under the vdd_lock and as part of setting the
+ * VND_D_ATTACH_INFLIGHT flag. These should not change during the lifetime of
+ * the vnd_dev_t.
+ *
+ * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
+ * always exists as it is a part of the structure. The only time that it's valid
+ * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
+ * set or during tear down. Outside of those paths which are naturally
+ * serialized, there is no explicit locking around the member.
+ *
+ * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
+ * initially set as part of creating the structure, but are set as part of
+ * responding to the association ioctl. Anything in the data path or metadata
+ * path that requires association may assume that they exist, as we do not kick
+ * off the state machine until they're set.
+ *
+ * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
+ * members are designed to be used as part of various operations with the
+ * gsqueues. A lock isn't needed to use them, but to work with them, the
+ * appropriate flag in the vnd_str_t`vns_flags must have been set by the current
+ * thread. Otherwise, it is always fair game to refer to their addresses. Their
+ * contents are ignored by vnd, but some members are manipulated by the gsqueue
+ * subsystem.
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/cred.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+/* Cache that vnd_dev_t structures are returned to; see vnd_dev_free(). */
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+/* ID space that a vnd_dev_t's minor number is released back to. */
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+/* Global lock; level 1 in the lock ordering of the big theory statement. */
+static kmutex_t vnd_dev_lock;
+/* Every vnd_dev_t; searched by minor number in vnd_dev_lookup(). */
+static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */
+/* Every vnd_pnsd_t; searched by netstack id in vnd_nsd_lookup(). */
+static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMs ioctls
+ *
+ * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such
+ * they aren't a part of the header file.
+ */
+#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given streams instance with a minor instance of
+ * the character device.
+ */
+#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1)
+
+/*
+ * NOTE(review): presumably the payload of VND_STRIOC_ASSOCIATE; the consumer
+ * is not visible in this part of the file -- confirm.
+ */
+typedef struct vnd_strioc_associate {
+ minor_t vsa_minor; /* minor device node */
+ netstackid_t vsa_nsid; /* netstack id */
+ vnd_errno_t vsa_errno; /* errno */
+} vnd_strioc_associate_t;
+
+/*
+ * Progress marker kept in a vnd_strioc_t.
+ * NOTE(review): the names suggest the copyin/copyout phases of a STREAMS
+ * ioctl; the code that drives this is outside this section -- confirm.
+ */
+typedef enum vnd_strioc_state {
+ VSS_UNKNOWN = 0,
+ VSS_COPYIN = 1,
+ VSS_COPYOUT = 2,
+} vnd_strioc_state_t;
+
+/* Per-ioctl bookkeeping: the current state plus an associated address. */
+typedef struct vnd_strioc {
+ vnd_strioc_state_t vs_state;
+ caddr_t vs_addr;
+} vnd_strioc_t;
+
+/*
+ * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though
+ * really, overlap is at the end of the day, inevitable.
+ */
+#define VND_SQUEUE_TAG_TX_DRAIN 0x42
+#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43
+#define VND_SQUEUE_TAG_VND_WRITE 0x44
+#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45
+#define VND_SQUEUE_TAG_STRBARRIER 0x46
+
+/*
+ * vnd reserved names. These are names which are reserved by vnd and thus
+ * shouldn't be used by some external program.
+ */
+static char *vnd_reserved_names[] = {
+ "ctl",
+ "zone",
+ NULL
+};
+
+/*
+ * vnd's DTrace probe macros
+ *
+ * DTRACE_VND* are all for a stable provider. We also have an unstable internal
+ * set of probes for reference count manipulation.
+ */
+/*
+ * Each DTRACE_VNDn wraps DTRACE_PROBEn and prefixes the probe name with
+ * "__vnd_".
+ *
+ * NOTE(review): every macro below expands with a trailing semicolon, so a use
+ * written as "DTRACE_VND3(...);" produces an extra empty statement. Do not use
+ * them as the sole body of an unbraced if that is followed by an else.
+ */
+#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4);
+
+#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5) \
+ DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5);
+
+/*
+ * Reference count probes. Both pass the vnd_dev_t and the value of vdd_ref at
+ * the time the probe fires; vdp is expanded without parentheses, so pass a
+ * plain pointer variable.
+ */
+#define DTRACE_VND_REFINC(vdp) \
+ DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+#define DTRACE_VND_REFDEC(vdp) \
+ DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+
+
+/*
+ * Tunables
+ */
+size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */
+size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */
+
+/*
+ * These numbers are designed as per-device tunables that are applied when a new
+ * vnd device is attached. They're a rough stab at what may be a reasonable
+ * amount of work to do in one burst in an squeue.
+ */
+size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */
+size_t vnd_flush_nburst = 10; /* 10 frames */
+
+/*
+ * Constants related to our sdev plugins
+ */
+#define VND_SDEV_NAME "vnd"
+#define VND_SDEV_ROOT "/dev/vnd"
+#define VND_SDEV_ZROOT "/dev/vnd/zone"
+
+/*
+ * Statistic macros
+ */
+/*
+ * Atomically add val to the 64-bit value of the kstat named by field inside
+ * the vnd_str_t's vns_ksdata.
+ */
+#define VND_STAT_INC(vsp, field, val) \
+ atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
+/*
+ * Latency bucket boundaries.
+ * NOTE(review): the values read as nanoseconds (1MS == 1,000,000) -- confirm
+ * against the code that compares them. VND_LATENCY_10S does not fit in 32
+ * bits, so it must only be used in 64-bit arithmetic.
+ */
+#define VND_LATENCY_1MS 1000000
+#define VND_LATENCY_10MS 10000000
+#define VND_LATENCY_100MS 100000000
+#define VND_LATENCY_1S 1000000000
+#define VND_LATENCY_10S 10000000000
+
+/*
+ * Constants for vnd hooks
+ */
+/* Ethernet broadcast address; compared against the full destination MAC. */
+static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+/* Leading bytes of an IPv4 multicast MAC address (01:00:5E). */
+#define IPV4_MCAST_LEN 3
+static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+/* Leading bytes of an IPv6 multicast MAC address (33:33). */
+#define IPV6_MCAST_LEN 2
+static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/*
+ * vnd internal data structures and types
+ */
+
+/*
+ * As part of opening the device stream we need to properly communicate with our
+ * underlying stream. This is a bit of an asynchronous dance and we need to
+ * properly work with dld to get everything set up. We have to initiate the
+ * conversation with dld and as such we keep track of our state here.
+ */
+/* Stored in vnd_str_t`vns_state and vnd_str_t`vns_laststate. */
+typedef enum vnd_str_state {
+ VNS_S_INITIAL = 0,
+ VNS_S_INFO_SENT, /* set by vnd_st_sinfo() */
+ VNS_S_EXCLUSIVE_SENT, /* set by vnd_st_sexclusive() */
+ VNS_S_ATTACH_SENT, /* set by vnd_st_sattach() */
+ VNS_S_BIND_SENT, /* set by vnd_st_sbind() */
+ VNS_S_SAP_PROMISC_SENT,
+ VNS_S_MULTI_PROMISC_SENT,
+ VNS_S_RX_ONLY_PROMISC_SENT,
+ VNS_S_CAPAB_Q_SENT,
+ VNS_S_CAPAB_E_SENT,
+ VNS_S_ONLINE,
+ VNS_S_SHUTTING_DOWN,
+ VNS_S_ZOMBIE
+} vnd_str_state_t;
+
+/* Bit flags stored in vnd_str_t`vns_flags. */
+typedef enum vnd_str_flags {
+ VNS_F_NEED_ZONE = 0x1,
+ VNS_F_TASKQ_DISPATCHED = 0x2,
+ VNS_F_CONDEMNED = 0x4,
+ VNS_F_FLOW_CONTROLLED = 0x8,
+ VNS_F_DRAIN_SCHEDULED = 0x10,
+ VNS_F_BARRIER = 0x20,
+ VNS_F_BARRIER_DONE = 0x40
+} vnd_str_flags_t;
+
+/* Bit flags stored in vnd_str_capab_t`vsc_flags. */
+typedef enum vnd_capab_flags {
+ VNS_C_HCKSUM = 0x1,
+ VNS_C_DLD = 0x2,
+ VNS_C_DIRECT = 0x4,
+ VNS_C_HCKSUM_BADVERS = 0x8
+} vnd_capab_flags_t;
+
+/*
+ * Definitions to interact with direct callbacks
+ */
+typedef uintptr_t vnd_mac_cookie_t;
+/* DLD Direct capability function */
+typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
+/* DLD Direct tx function */
+typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+/* DLD Direct function to set flow control callback */
+typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
+ void *);
+/* DLD Direct function to see if flow controlled still */
+typedef int (*vnd_dld_is_fc_t)(void *, uint_t, void *, uint_t);
+
+/*
+ * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
+ */
+/*
+ * Each vsc_*_f function pointer is paired with the vsc_*_hdl member that
+ * follows it.
+ */
+typedef struct vnd_str_capab {
+ vnd_capab_flags_t vsc_flags; /* VNS_C_* bits */
+ t_uscalar_t vsc_hcksum_opts;
+ vnd_dld_cap_t vsc_capab_f; /* DLD direct capability function */
+ void *vsc_capab_hdl;
+ vnd_dld_tx_t vsc_tx_f; /* DLD direct tx function */
+ void *vsc_tx_hdl;
+ vnd_dld_set_fcb_t vsc_set_fcb_f; /* sets flow control callback */
+ void *vsc_set_fcb_hdl;
+ vnd_dld_is_fc_t vsc_is_fc_f; /* asks if still flow controlled */
+ void *vsc_is_fc_hdl;
+ vnd_mac_cookie_t vsc_fc_cookie;
+ void *vsc_tx_fc_hdl;
+} vnd_str_capab_t;
+
+struct vnd_str;
+struct vnd_dev;
+struct vnd_pnsd;
+
+/*
+ * The vnd_data_queue is a simple construct for storing a series of messages in
+ * a queue.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_data_queue {
+ struct vnd_str *vdq_vns; /* E; stream passed to drop functions */
+ kmutex_t vdq_lock; /* Protects members marked L */
+ kcondvar_t vdq_ready; /* Uses vdq_lock */
+ ssize_t vdq_max; /* L; limit that vdq_cur may not exceed */
+ ssize_t vdq_cur; /* L; bytes queued plus bytes reserved */
+ mblk_t *vdq_head; /* L; oldest message, linked by b_next */
+ mblk_t *vdq_tail; /* L; newest message */
+} vnd_data_queue_t;
+
+/*
+ * Named kstats kept per vnd_str_t (vnd_str_t`vns_ksdata). Numeric members are
+ * bumped with VND_STAT_INC(); the drop counters are maintained by the
+ * vnd_drop_*() functions, each of which also bumps vks_tdrops.
+ */
+typedef struct vnd_str_stat {
+ kstat_named_t vks_rbytes;
+ kstat_named_t vks_rpackets;
+ kstat_named_t vks_obytes;
+ kstat_named_t vks_opackets;
+ kstat_named_t vks_nhookindrops; /* vnd_drop_hook_in() */
+ kstat_named_t vks_nhookoutdrops; /* vnd_drop_hook_out() */
+ kstat_named_t vks_ndlpidrops; /* vnd_drop_ctl() */
+ kstat_named_t vks_ndataindrops; /* vnd_drop_in() */
+ kstat_named_t vks_ndataoutdrops; /* vnd_drop_out() */
+ kstat_named_t vks_tdrops; /* total of all drops */
+ kstat_named_t vks_linkname;
+ kstat_named_t vks_zonename;
+ kstat_named_t vks_nmacflow;
+ kstat_named_t vks_tmacflow;
+ kstat_named_t vks_mac_flow_1ms;
+ kstat_named_t vks_mac_flow_10ms;
+ kstat_named_t vks_mac_flow_100ms;
+ kstat_named_t vks_mac_flow_1s;
+ kstat_named_t vks_mac_flow_10s;
+} vnd_str_stat_t;
+
+/*
+ * vnd stream structure
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_str {
+ kmutex_t vns_lock;
+ kcondvar_t vns_cancelcv; /* Uses vns_lock */
+ kcondvar_t vns_barriercv; /* Uses vns_lock */
+ kcondvar_t vns_stcv; /* Uses vns_lock; broadcast on state change */
+ vnd_str_state_t vns_state; /* L */
+ vnd_str_state_t vns_laststate; /* L */
+ vnd_errno_t vns_errno; /* L; why a setup step failed */
+ vnd_str_flags_t vns_flags; /* L */
+ vnd_str_capab_t vns_caps; /* L */
+ taskq_ent_t vns_tqe; /* L */
+ vnd_data_queue_t vns_dq_read; /* E */
+ vnd_data_queue_t vns_dq_write; /* E */
+ mblk_t *vns_dlpi_inc; /* L; pending DLPI replies, b_next linked */
+ queue_t *vns_rq; /* E */
+ queue_t *vns_wq; /* E; DLPI requests are putnext()ed here */
+ queue_t *vns_lrq; /* E */
+ t_uscalar_t vns_dlpi_style; /* L; from DL_INFO_ACK */
+ t_uscalar_t vns_minwrite; /* L; from DL_INFO_ACK dl_min_sdu */
+ t_uscalar_t vns_maxwrite; /* L; dl_max_sdu plus ethernet header */
+ hrtime_t vns_fclatch; /* L */
+ kstat_t *vns_kstat; /* E */
+ gsqueue_t *vns_squeue; /* E */
+ mblk_t vns_drainblk; /* E + X */
+ mblk_t vns_barrierblk; /* E + X */
+ vnd_str_stat_t vns_ksdata; /* A */
+ size_t vns_nflush; /* L */
+ size_t vns_bsize; /* L */
+ struct vnd_dev *vns_dev; /* E + X */
+ struct vnd_pnsd *vns_nsd; /* E + X */
+} vnd_str_t;
+
+/* Bit flags stored in vnd_dev_t`vdd_flags. */
+typedef enum vnd_dev_flags {
+ VND_D_ATTACH_INFLIGHT = 0x001,
+ VND_D_ATTACHED = 0x002, /* vnd_dev_free() tears down the stream */
+ VND_D_LINK_INFLIGHT = 0x004,
+ VND_D_LINKED = 0x008,
+ VND_D_CONDEMNED = 0x010, /* set when the last reference is dropped */
+ VND_D_ZONE_DYING = 0x020,
+ VND_D_OPENED = 0x040
+} vnd_dev_flags_t;
+
+/*
+ * This represents the data associated with a minor device instance.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_dev {
+ kmutex_t vdd_lock;
+ list_node_t vdd_link; /* GL; linkage on vnd_dev_list */
+ list_node_t vdd_nslink; /* NSL */
+ int vdd_ref; /* L; device is freed when this hits zero */
+ vnd_dev_flags_t vdd_flags; /* L */
+ minor_t vdd_minor; /* E; allocated from vnd_minors */
+ dev_t vdd_devid; /* E */
+ ldi_ident_t vdd_ldiid; /* E */
+ ldi_handle_t vdd_ldih; /* X; closed in vnd_dev_free() */
+ cred_t *vdd_cr; /* X; released in vnd_dev_free() */
+ vnd_str_t *vdd_str; /* L */
+ struct pollhead vdd_ph; /* E */
+ struct vnd_pnsd *vdd_nsd; /* E + X */
+ char vdd_datalink[VND_NAMELEN]; /* L */
+ char vdd_lname[VND_NAMELEN]; /* L */
+} vnd_dev_t;
+
+/* Bit flags stored in vnd_pnsd_t`vpnd_flags. */
+typedef enum vnd_pnsd_flags {
+ VND_NS_CONDEMNED = 0x1
+} vnd_pnsd_flags_t;
+
+/*
+ * Per netstack data structure.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_pnsd {
+ list_node_t vpnd_link; /* GL; linkage on vnd_nsd_list */
+ zoneid_t vpnd_zid; /* E */
+ netstackid_t vpnd_nsid; /* E; key used by vnd_nsd_lookup() */
+ boolean_t vpnd_hooked; /* E */
+ net_handle_t vpnd_neti_v4; /* E */
+ hook_family_t vpnd_family_v4; /* E */
+ hook_event_t vpnd_event_in_v4; /* E */
+ hook_event_t vpnd_event_out_v4; /* E */
+ hook_event_token_t vpnd_token_in_v4; /* E */
+ hook_event_token_t vpnd_token_out_v4; /* E */
+ net_handle_t vpnd_neti_v6; /* E */
+ hook_family_t vpnd_family_v6; /* E */
+ hook_event_t vpnd_event_in_v6; /* E */
+ hook_event_t vpnd_event_out_v6; /* E */
+ hook_event_token_t vpnd_token_in_v6; /* E */
+ hook_event_token_t vpnd_token_out_v6; /* E */
+ kmutex_t vpnd_lock; /* Protects remaining members */
+ kcondvar_t vpnd_ref_change; /* Uses vpnd_lock; broadcast on ref change */
+ int vpnd_ref; /* L */
+ vnd_pnsd_flags_t vpnd_flags; /* L */
+ list_t vpnd_dev_list; /* L; devices attached in this netstack */
+} vnd_pnsd_t;
+
+static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
+
+/*
+ * Drop function signature. A dropper is handed the stream the message belongs
+ * to, the message itself, and a string describing why it is being dropped.
+ * The vnd_drop_*() implementations that free the message tolerate a NULL
+ * message.
+ */
+typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
+
+/*
+ * Drop a control (DLPI) message: fire the drop-ctl probe, free the message if
+ * one was supplied, and charge it to the DLPI and total drop kstats.
+ */
+static void
+vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+
+	/* A NULL message is legal; there is simply nothing to free. */
+	if (mp != NULL)
+		freemsg(mp);
+
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+}
+
+/*
+ * Drop an inbound data message: fire the drop-in probe, free the message if
+ * one was supplied, and charge it to the data-in and total drop kstats.
+ */
+static void
+vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+
+	/* A NULL message is legal; there is simply nothing to free. */
+	if (mp != NULL)
+		freemsg(mp);
+
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	VND_STAT_INC(vsp, vks_ndataindrops, 1);
+}
+
+/*
+ * Drop an outbound data message: fire the drop-out probe, free the message if
+ * one was supplied, and charge it to the data-out and total drop kstats.
+ */
+static void
+vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+
+	/* A NULL message is legal; there is simply nothing to free. */
+	if (mp != NULL)
+		freemsg(mp);
+
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
+}
+
+/*
+ * Drop an inbound message on behalf of the packet hooks. This fires the same
+ * drop-in probe as vnd_drop_in(), but is charged to the hook-in drop kstat
+ * (plus the total).
+ */
+static void
+vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+
+	/* A NULL message is legal; there is simply nothing to free. */
+	if (mp != NULL)
+		freemsg(mp);
+
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	VND_STAT_INC(vsp, vks_nhookindrops, 1);
+}
+
+/*
+ * Drop an outbound message on behalf of the packet hooks. This fires the same
+ * drop-out probe as vnd_drop_out(), but is charged to the hook-out drop kstat
+ * (plus the total).
+ */
+static void
+vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+
+	/* A NULL message is legal; there is simply nothing to free. */
+	if (mp != NULL)
+		freemsg(mp);
+
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+	VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
+}
+
+/*
+ * Dropper for paths on which a drop must never occur: being called at all is
+ * fatal. None of the arguments are examined.
+ */
+static void
+vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	panic("illegal vnd drop");
+}
+
+/*
+ * Find the per-netstack data for the given netstack id. On success the
+ * structure is returned with an additional reference held; NULL is returned
+ * if no entry matches.
+ */
+static vnd_pnsd_t *
+vnd_nsd_lookup(netstackid_t nsid)
+{
+	vnd_pnsd_t *cur;
+
+	mutex_enter(&vnd_dev_lock);
+	cur = list_head(&vnd_nsd_list);
+	while (cur != NULL && cur->vpnd_nsid != nsid)
+		cur = list_next(&vnd_nsd_list, cur);
+
+	if (cur != NULL) {
+		/* Take the hold before the global lock is dropped. */
+		mutex_enter(&cur->vpnd_lock);
+		VERIFY(cur->vpnd_ref >= 0);
+		cur->vpnd_ref++;
+		mutex_exit(&cur->vpnd_lock);
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	return (cur);
+}
+
+/*
+ * Look up the per-netstack data belonging to the netstack of zone zid.
+ * Returns NULL if the zone has no netstack or no entry matches; otherwise the
+ * result carries the reference taken by vnd_nsd_lookup().
+ */
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zid(zoneid_t zid)
+{
+	vnd_pnsd_t *found = NULL;
+	netstack_t *stack = netstack_find_by_zoneid(zid);
+
+	if (stack != NULL) {
+		found = vnd_nsd_lookup(stack->netstack_stackid);
+		/* The netstack hold was only needed to read its id. */
+		netstack_rele(stack);
+	}
+
+	return (found);
+}
+
+/*
+ * Look up the per-netstack data for the zone named zname. Returns NULL if no
+ * such zone exists or vnd_nsd_lookup_by_zid() finds nothing for it.
+ */
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zonename(char *zname)
+{
+	vnd_pnsd_t *found = NULL;
+	zone_t *zp = zone_find_by_name(zname);
+
+	if (zp != NULL) {
+		found = vnd_nsd_lookup_by_zid(zp->zone_id);
+		/* Drop the zone hold taken by zone_find_by_name(). */
+		zone_rele(zp);
+	}
+
+	return (found);
+}
+
+/*
+ * Take an additional hold on a per-netstack structure. The caller must
+ * already own a reference obtained some other way; this is never the means of
+ * creating the first one, which is why a count of zero is fatal here.
+ */
+static void
+vnd_nsd_ref(vnd_pnsd_t *nsp)
+{
+	kmutex_t *lockp = &nsp->vpnd_lock;
+
+	mutex_enter(lockp);
+	VERIFY(nsp->vpnd_ref > 0);
+	++nsp->vpnd_ref;
+	mutex_exit(lockp);
+
+	/* Let anyone waiting on the reference count re-evaluate it. */
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+/*
+ * Drop a hold on a per-netstack structure and wake anyone waiting for the
+ * reference count to change. Nothing is freed here.
+ */
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+	kmutex_t *lockp = &nsp->vpnd_lock;
+
+	mutex_enter(lockp);
+	VERIFY(nsp->vpnd_ref > 0);
+	--nsp->vpnd_ref;
+	mutex_exit(lockp);
+
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+/*
+ * Find the device with minor number m. On success the device is returned
+ * with an additional reference held; NULL is returned if there is no match.
+ */
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+	vnd_dev_t *cur;
+
+	mutex_enter(&vnd_dev_lock);
+	cur = list_head(&vnd_dev_list);
+	while (cur != NULL && cur->vdd_minor != m)
+		cur = list_next(&vnd_dev_list, cur);
+
+	if (cur != NULL) {
+		/* Anything still on the list must have a live reference. */
+		mutex_enter(&cur->vdd_lock);
+		VERIFY(cur->vdd_ref > 0);
+		cur->vdd_ref++;
+		DTRACE_VND_REFINC(cur);
+		mutex_exit(&cur->vdd_lock);
+	}
+	mutex_exit(&vnd_dev_lock);
+
+	return (cur);
+}
+
+/*
+ * Tear down and free a condemned vnd_dev_t. For an attached device this closes
+ * the underlying stream, releases the credential, and detaches the device
+ * from its netstack before the minor number and memory are given back.
+ */
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+	vnd_pnsd_t *nsp;
+
+	if ((vdp->vdd_flags & VND_D_ATTACHED) != 0) {
+		/*
+		 * Closing the stream tears down communication. Once it is
+		 * closed nothing else can enter the stream layer, i.e. no one
+		 * can call read(), write() or one of the frameio ioctls.
+		 */
+		ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		crfree(vdp->vdd_cr);
+		vdp->vdd_cr = NULL;
+
+		/*
+		 * Leave our parent netstack's list. The condemned flag must
+		 * already be set so that the netstack knows we are on the way
+		 * down and does not wait for us, and it must be set before we
+		 * drop our hold on the netstack, as that is the point at which
+		 * it checks again.
+		 */
+		nsp = vdp->vdd_nsd;
+		mutex_enter(&nsp->vpnd_lock);
+		list_remove(&nsp->vpnd_dev_list, vdp);
+		mutex_exit(&nsp->vpnd_lock);
+		vnd_nsd_rele(nsp);
+		vdp->vdd_nsd = NULL;
+	}
+
+	ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+	id_free(vnd_minors, vdp->vdd_minor);
+	mutex_destroy(&vdp->vdd_lock);
+	kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+/*
+ * Take an additional hold on a device. The caller must already hold a
+ * reference; a count of zero is fatal.
+ */
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+	kmutex_t *lockp = &vdp->vdd_lock;
+
+	mutex_enter(lockp);
+	VERIFY(vdp->vdd_ref > 0);
+	++vdp->vdd_ref;
+	DTRACE_VND_REFINC(vdp);
+	mutex_exit(lockp);
+}
+
+/*
+ * Drop a hold on a device. Releasing the last hold tears the vnd_dev_t down,
+ * so the global list lock is always taken before the device's own lock to
+ * honor the lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+	boolean_t last;
+
+	mutex_enter(&vnd_dev_lock);
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref--;
+	DTRACE_VND_REFDEC(vdp);
+
+	last = (vdp->vdd_ref > 0) ? B_FALSE : B_TRUE;
+	if (last == B_TRUE) {
+		/*
+		 * Nobody else holds the device. Condemn it and take it off the
+		 * global list so that it can no longer be looked up; with a
+		 * reference count of zero there are, by definition, no entries
+		 * left in /devices that could lead back to it.
+		 */
+		vdp->vdd_flags |= VND_D_CONDEMNED;
+		list_remove(&vnd_dev_list, vdp);
+	}
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vnd_dev_lock);
+
+	/* The actual teardown happens with no locks held. */
+	if (last == B_TRUE)
+		vnd_dev_free(vdp);
+}
+
+/*
+ * Append a message to the tail of the data queue. Unless the caller already
+ * reserved space (reserved != B_FALSE), the message is only accepted if it
+ * fits under vdq_max; otherwise it is handed to dropf and 0 is returned.
+ * Returns 1 when the message was queued, meaning that someone waiting for
+ * data would now find it, i.e. the caller should consider a broadcast.
+ * The queue lock must be held.
+ */
+static int
+vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
+    vnd_dropper_f dropf)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+
+	if (reserved == B_FALSE) {
+		size_t msize = msgsize(mp);
+
+		if (vqp->vdq_cur + msize > vqp->vdq_max) {
+			dropf(vqp->vdq_vns, mp, "buffer full");
+			return (0);
+		}
+		vqp->vdq_cur += msize;
+	}
+
+	if (vqp->vdq_head != NULL) {
+		vqp->vdq_tail->b_next = mp;
+	} else {
+		ASSERT(vqp->vdq_tail == NULL);
+		vqp->vdq_head = mp;
+	}
+	vqp->vdq_tail = mp;
+
+	return (1);
+}
+
+/*
+ * Remove the message at the head of the data queue and hand it back through
+ * mpp. Returns 0 (with *mpp set to NULL) if the queue is empty. Otherwise
+ * returns 1: space has been freed, and since we cannot know whether it is
+ * enough for any particular blocked writer, consumers should always consider
+ * signalling. The queue lock must be held.
+ */
+static int
+vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
+{
+	mblk_t *first, *rest;
+
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(mpp != NULL);
+
+	first = vqp->vdq_head;
+	if (first == NULL) {
+		ASSERT(vqp->vdq_tail == NULL);
+		*mpp = NULL;
+		return (0);
+	}
+
+	rest = first->b_next;
+	vqp->vdq_cur -= msgsize(first);
+	vqp->vdq_head = rest;
+	if (rest == NULL) {
+		vqp->vdq_tail = NULL;
+		/*
+		 * The queue is empty, but vdq_cur need not be zero: someone
+		 * may hold a reservation, i.e. has claimed space but not yet
+		 * pushed anything onto the queue.
+		 */
+		ASSERT(vqp->vdq_cur >= 0);
+	} else {
+		ASSERT(vqp->vdq_cur > 0);
+	}
+
+	first->b_next = NULL;
+	*mpp = first;
+	return (1);
+}
+
+/*
+ * Reserve size bytes in the queue. A successful reservation (return of 1)
+ * bumps the queue's current size and entitles the caller to push that much
+ * later without it being counted again. Returns 0 if size is zero or the
+ * reservation would exceed vdq_max. The queue lock must be held.
+ */
+static int
+vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size >= 0);
+
+	if (size == 0 || size + vqp->vdq_cur > vqp->vdq_max)
+		return (0);
+
+	vqp->vdq_cur += size;
+	return (1);
+}
+
+/*
+ * Give back size bytes that were previously reserved on the queue. The queue
+ * lock must be held.
+ */
+static void
+vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size > 0);
+	ASSERT(size <= vqp->vdq_cur);
+
+	vqp->vdq_cur = vqp->vdq_cur - size;
+}
+
+/*
+ * Empty the data queue, handing every queued message to dropf, and reset the
+ * accounted size to zero. Takes and releases the queue lock itself.
+ */
+static void
+vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
+{
+	mblk_t *cur, *after;
+
+	mutex_enter(&vqp->vdq_lock);
+	cur = vqp->vdq_head;
+	while (cur != NULL) {
+		/* Unlink before dropping; dropf consumes the message. */
+		after = cur->b_next;
+		cur->b_next = NULL;
+		dropf(vqp->vdq_vns, cur, "vnd_dq_flush");
+		cur = after;
+	}
+	vqp->vdq_head = NULL;
+	vqp->vdq_tail = NULL;
+	vqp->vdq_cur = 0;
+	mutex_exit(&vqp->vdq_lock);
+}
+
+/*
+ * Report whether the data queue currently holds no messages. Takes and
+ * releases the queue lock itself, so the answer may be stale on return.
+ */
+static boolean_t
+vnd_dq_is_empty(vnd_data_queue_t *vqp)
+{
+	boolean_t empty;
+
+	mutex_enter(&vqp->vdq_lock);
+	empty = (vqp->vdq_head == NULL) ? B_TRUE : B_FALSE;
+	mutex_exit(&vqp->vdq_lock);
+
+	return (empty);
+}
+
+/*
+ * Read the 16-bit network-order value at byte offset off of the message and
+ * store it in *out in host order. The two bytes may straddle mblks.
+ *
+ * Returns 0 on success. Returns 1, leaving *out untouched, if off is negative
+ * or the message does not contain two bytes at that offset.
+ */
+static int
+vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
+{
+	size_t mpsize;
+	uint16_t val;
+
+	/*
+	 * A negative offset would be converted to a huge unsigned value in the
+	 * comparisons below and could wrap straight past the length check.
+	 */
+	if (off < 0)
+		return (1);
+
+	mpsize = msgsize(mp);
+	/* Both bytes must lie inside the message. */
+	if (off + sizeof (uint16_t) > mpsize)
+		return (1);
+
+	/* Find the mblk that holds the first byte. */
+	mpsize = MBLKL(mp);
+	while (off >= mpsize) {
+		mp = mp->b_cont;
+		off -= mpsize;
+		mpsize = MBLKL(mp);
+	}
+
+	/* Data is in network order: the first byte is the high byte. */
+	val = (uint16_t)(mp->b_rptr[off] << 8);
+	if (off + 1 == mpsize) {
+		/*
+		 * The second byte lives in a later mblk. Skip over any
+		 * zero-length mblks rather than reading from an empty one.
+		 */
+		do {
+			mp = mp->b_cont;
+		} while (mp != NULL && MBLKL(mp) == 0);
+		if (mp == NULL)
+			return (1);
+		val |= mp->b_rptr[0];
+	} else {
+		val |= mp->b_rptr[off + 1];
+	}
+
+	*out = val;
+	return (0);
+}
+
+/*
+ * Translate a byte offset into an mblk chain into the mblk that contains it
+ * (*mpp) and the address of that byte (*offp). Returns 1, without touching
+ * the outputs, if the offset is not inside the message; 0 otherwise.
+ */
+static int
+vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp)
+{
+	mblk_t *cur = mp;
+	size_t len;
+
+	if (off >= msgsize(mp))
+		return (1);
+
+	/* Consume whole mblks until the offset falls inside the current one. */
+	for (len = MBLKL(cur); off >= len; len = MBLKL(cur)) {
+		off -= len;
+		cur = cur->b_cont;
+	}
+
+	*mpp = cur;
+	*offp = (uintptr_t)cur->b_rptr + off;
+
+	return (0);
+}
+
+/*
+ * Fetch the destination mac address, which is the first ETHERADDRL bytes of
+ * the message. If those bytes are contiguous in the first mblk_t, *dstpp is
+ * pointed directly at them. Otherwise they are gathered into datap, which must
+ * have room for ETHERADDRL bytes, and *dstpp is set to datap.
+ *
+ * Returns 0 on success and 1 if the message is too short.
+ */
+static int
+vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap)
+{
+	int i;
+	uint16_t pair;
+
+	if (MBLKL(mp) >= ETHERADDRL) {
+		*dstpp = mp->b_rptr;
+		return (0);
+	}
+
+	*dstpp = datap;
+	for (i = 0; i < ETHERADDRL; i += 2) {
+		if (vnd_mbc_getu16(mp, i, &pair) != 0)
+			return (1);
+		/*
+		 * vnd_mbc_getu16() hands back a host-order value. Store it
+		 * byte by byte so that the address stays in wire order
+		 * regardless of endianness, and so that we never do a 16-bit
+		 * store through a possibly unaligned byte pointer.
+		 */
+		datap[i] = (uint8_t)(pair >> 8);
+		datap[i + 1] = (uint8_t)(pair & 0xff);
+	}
+
+	return (0);
+}
+
+/*
+ * Run the packet hooks over a frame, if it is one that they care about.
+ *
+ * Only IPv4 and IPv6 frames (optionally VLAN tagged) are handed to the hooks,
+ * using the v4 or v6 handle/event/token triple as appropriate, and only if
+ * the corresponding event has interested consumers. ddrop is used to drop
+ * frames that are malformed; hdrop is used when the hooks reject the frame.
+ *
+ * Returns 0 if the caller should carry on with *mpp, which the hooks were
+ * given the address of. Returns 1 if the frame was dropped, in which case
+ * *mpp has been set to NULL and the caller must not touch the message again.
+ */
+static int
+vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4,
+    hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6,
+    hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop)
+{
+	uint16_t etype;
+	hook_pkt_event_t info;
+	size_t offset, mblen;
+	uint8_t *dstp;
+	uint8_t dstaddr[6];
+	hook_event_t he;
+	hook_event_token_t het;
+	net_handle_t neti;
+
+	/*
+	 * Before we can ask if we're interested we have to do enough work to
+	 * determine the ethertype.
+	 */
+
+	/* Byte 12 is either the VLAN tag or the ethertype */
+	if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) {
+		ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	if (etype == ETHERTYPE_VLAN) {
+		/* Actual ethertype is another four bytes in */
+		if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) {
+			ddrop(vsp, *mpp,
+			    "packet has incomplete ethernet vlan header");
+			*mpp = NULL;
+			return (1);
+		}
+		offset = sizeof (struct ether_vlan_header);
+	} else {
+		offset = sizeof (struct ether_header);
+	}
+
+	/*
+	 * At the moment we only hook on the kinds of things that the IP module
+	 * would normally.
+	 */
+	if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
+		return (0);
+
+	if (etype == ETHERTYPE_IP) {
+		neti = netiv4;
+		he = hev4;
+		het = hetv4;
+	} else {
+		neti = netiv6;
+		he = hev6;
+		het = hetv6;
+	}
+
+	if (!he.he_interested)
+		return (0);
+
+	if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) {
+		ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	/*
+	 * Now that we know we're interested, we have to do some additional
+	 * sanity checking for IPF's sake, ala ip_check_length(). Specifically
+	 * we need to check to make sure that the remaining packet size,
+	 * excluding MAC, is at least the size of an IP header. The ethertype
+	 * reads above already guarantee that mblen is at least offset.
+	 */
+	mblen = msgsize(*mpp);
+	if ((etype == ETHERTYPE_IP &&
+	    mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
+	    (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
+		ddrop(vsp, *mpp, "packet has invalid IP header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	info.hpe_protocol = neti;
+	info.hpe_ifp = (phy_if_t)vsp;
+	info.hpe_ofp = (phy_if_t)vsp;
+	info.hpe_mp = mpp;
+	info.hpe_flags = 0;
+
+	/*
+	 * Classify the frame by its destination MAC address. The multicast
+	 * prefixes must be compared against the packet's destination, not
+	 * against the broadcast constant.
+	 */
+	if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
+		info.hpe_flags |= HPE_BROADCAST;
+	else if (etype == ETHERTYPE_IP &&
+	    bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+	else if (etype == ETHERTYPE_IPV6 &&
+	    bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+
+	if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
+	    (uintptr_t *)&info.hpe_hdr) != 0) {
+		ddrop(vsp, *mpp, "packet too small -- "
+		    "unable to find payload");
+		*mpp = NULL;
+		return (1);
+	}
+
+	if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
+		/*
+		 * Whatever is left in *mpp (possibly nothing) is freed by the
+		 * dropper; clear the caller's pointer as every other drop path
+		 * does so that it cannot be used after free.
+		 */
+		hdrop(vsp, *mpp, "dropped by hooks");
+		*mpp = NULL;
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate a zeroed M_PROTO message of len bytes whose leading dl_primitive
+ * field is set to prim. Returns NULL if the allocation fails.
+ *
+ * This should not be used for DL_INFO_REQ.
+ */
+static mblk_t *
+vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
+{
+	mblk_t *mp = allocb(len, BPRI_MED);
+
+	if (mp != NULL) {
+		bzero(mp->b_rptr, len);
+		mp->b_wptr = mp->b_rptr + len;
+		mp->b_datap->db_type = M_PROTO;
+		/* Every DLPI request begins with its dl_primitive. */
+		((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
+	}
+
+	return (mp);
+}
+
+/*
+ * Append a message to the end of the stream's list of incoming DLPI
+ * messages, which is linked through b_next. vns_lock must be held.
+ */
+static void
+vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
+{
+	mblk_t *last;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	ASSERT(mp->b_next == NULL);
+
+	if (vsp->vns_dlpi_inc == NULL) {
+		vsp->vns_dlpi_inc = mp;
+		return;
+	}
+
+	for (last = vsp->vns_dlpi_inc; last->b_next != NULL;
+	    last = last->b_next)
+		;
+	last->b_next = mp;
+}
+
+/*
+ * Remove and return the oldest message on the stream's list of incoming DLPI
+ * messages, or NULL if the list is empty. vns_lock must be held.
+ */
+static mblk_t *
+vnd_dlpi_inc_pop(vnd_str_t *vsp)
+{
+	mblk_t *first;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	first = vsp->vns_dlpi_inc;
+	if (first == NULL)
+		return (NULL);
+
+	/* A message that links to itself means the list is corrupt. */
+	VERIFY(first->b_next == NULL || first->b_next != first);
+	vsp->vns_dlpi_inc = first->b_next;
+	first->b_next = NULL;
+
+	return (first);
+}
+
+/*
+ * Send a DL_INFO_REQ down the stream and move to VNS_S_INFO_SENT. Returns 0
+ * on success; on allocation failure sets vns_errno to VND_E_NOMEM and returns
+ * 1 without changing state. vns_lock must be held.
+ */
+static int
+vnd_st_sinfo(vnd_str_t *vsp)
+{
+	dl_info_req_t *req;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+	/* The block is sized to hold the larger of the request and its ack. */
+	mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+	    BPRI_HI);
+	if (mp != NULL) {
+		req = (dl_info_req_t *)mp->b_rptr;
+		req->dl_primitive = DL_INFO_REQ;
+		mp->b_wptr = mp->b_rptr + sizeof (dl_info_req_t);
+		mp->b_datap->db_type = M_PCPROTO;
+
+		vsp->vns_state = VNS_S_INFO_SENT;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Consume the pending DL_INFO_ACK: record the provider style and the min/max
+ * SDU. Returns 0 on success. If the device is not DL_ETHER, sets vns_errno to
+ * VND_E_NOTETHER and returns 1. The message is freed either way. vns_lock
+ * must be held and a message must be pending.
+ */
+static int
+vnd_st_info(vnd_str_t *vsp)
+{
+	dl_info_ack_t *ack;
+	boolean_t ether;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	ack = (dl_info_ack_t *)mp->b_rptr;
+
+	vsp->vns_dlpi_style = ack->dl_provider_style;
+	vsp->vns_minwrite = ack->dl_min_sdu;
+	vsp->vns_maxwrite = ack->dl_max_sdu;
+
+	/* At this time we only support DL_ETHER devices. */
+	ether = (ack->dl_mac_type == DL_ETHER) ? B_TRUE : B_FALSE;
+	freemsg(mp);
+
+	if (ether == B_FALSE) {
+		vsp->vns_errno = VND_E_NOTETHER;
+		return (1);
+	}
+
+	/*
+	 * vnd operates on entire packets, so the ethernet header has to be
+	 * accounted for by hand. We always add the size of the
+	 * ether_vlan_header, whether or not vlans are in use.
+	 */
+	vsp->vns_maxwrite += sizeof (struct ether_vlan_header);
+
+	return (0);
+}
+
+/*
+ * Send a DL_EXCLUSIVE_REQ down the stream and move to VNS_S_EXCLUSIVE_SENT.
+ * Returns 0 on success; on allocation failure sets vns_errno to VND_E_NOMEM
+ * and returns 1 without changing state. vns_lock must be held.
+ */
+static int
+vnd_st_sexclusive(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	/* Size the message for the primitive actually being sent. */
+	mp = vnd_dlpi_alloc(sizeof (dl_exclusive_req_t), DL_EXCLUSIVE_REQ);
+	if (mp == NULL) {
+		vsp->vns_errno = VND_E_NOMEM;
+		return (1);
+	}
+
+	vsp->vns_state = VNS_S_EXCLUSIVE_SENT;
+	cv_broadcast(&vsp->vns_stcv);
+	putnext(vsp->vns_wq, mp);
+	return (0);
+}
+
+/*
+ * Consume the reply to our DL_EXCLUSIVE_REQ. Returns 0 on DL_OK_ACK. Returns
+ * 1 with vns_errno set to VND_E_DLEXCL on DL_ERROR_ACK, or to
+ * VND_E_DLPIINVAL if the reply is some other primitive or acknowledges a
+ * different request (in which case the message is dropped as a control
+ * drop). vns_lock must be held and a message must be pending.
+ */
+static int
+vnd_st_exclusive(vnd_str_t *vsp)
+{
+	const char *why = NULL;
+	dl_ok_ack_t *ack;
+	t_uscalar_t prim, cprim;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+
+	/* The first two words line up for both the ok and the error ack. */
+	ack = (dl_ok_ack_t *)mp->b_rptr;
+	prim = ack->dl_primitive;
+	cprim = ack->dl_correct_primitive;
+
+	if (prim != DL_OK_ACK && prim != DL_ERROR_ACK)
+		why = "wrong dlpi primitive for vnd_st_exclusive";
+	else if (cprim != DL_EXCLUSIVE_REQ)
+		why = "vnd_st_exclusive: got ack/nack for wrong primitive";
+
+	if (why != NULL) {
+		vnd_drop_ctl(vsp, mp, why);
+		vsp->vns_errno = VND_E_DLPIINVAL;
+		return (1);
+	}
+
+	freemsg(mp);
+	if (prim == DL_ERROR_ACK) {
+		vsp->vns_errno = VND_E_DLEXCL;
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Send a DL_ATTACH_REQ for PPA 0 down the stream and move to
+ * VNS_S_ATTACH_SENT. Returns 0 on success; on allocation failure sets
+ * vns_errno to VND_E_NOMEM and returns 1 without changing state. vns_lock
+ * must be held.
+ */
+static int
+vnd_st_sattach(vnd_str_t *vsp)
+{
+	dl_attach_req_t *req;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ);
+	if (mp != NULL) {
+		req = (dl_attach_req_t *)mp->b_rptr;
+		req->dl_ppa = 0;
+
+		vsp->vns_state = VNS_S_ATTACH_SENT;
+		cv_broadcast(&vsp->vns_stcv);
+		putnext(vsp->vns_wq, mp);
+		return (0);
+	}
+
+	vsp->vns_errno = VND_E_NOMEM;
+	return (1);
+}
+
+/*
+ * Consume the reply to our DL_ATTACH_REQ. Returns 0 on DL_OK_ACK. Returns 1
+ * with vns_errno set to VND_E_ATTACHFAIL on DL_ERROR_ACK, or to
+ * VND_E_DLPIINVAL if the reply is some other primitive or acknowledges a
+ * different request (in which case the message is dropped as a control
+ * drop). vns_lock must be held and a message must be pending.
+ */
+static int
+vnd_st_attach(vnd_str_t *vsp)
+{
+	const char *why = NULL;
+	dl_ok_ack_t *ack;
+	t_uscalar_t prim, cprim;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+
+	ack = (dl_ok_ack_t *)mp->b_rptr;
+	prim = ack->dl_primitive;
+	cprim = ack->dl_correct_primitive;
+
+	if (prim != DL_OK_ACK && prim != DL_ERROR_ACK)
+		why = "vnd_st_attach: unknown primitive type";
+	else if (cprim != DL_ATTACH_REQ)
+		why = "vnd_st_attach: Got ack/nack for wrong primitive";
+
+	if (why != NULL) {
+		vnd_drop_ctl(vsp, mp, why);
+		vsp->vns_errno = VND_E_DLPIINVAL;
+		return (1);
+	}
+
+	freemsg(mp);
+	if (prim == DL_ERROR_ACK) {
+		vsp->vns_errno = VND_E_ATTACHFAIL;
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Send a DL_BIND_REQ (SAP 0, DL_CLDLS service mode) down the stream and
+ * advance the state machine to VNS_S_BIND_SENT. Must be called with vns_lock
+ * held.
+ *
+ * Returns 0 on success. If the message cannot be allocated, vns_errno is set
+ * to VND_E_NOMEM and 1 is returned.
+ */
+static int
+vnd_st_sbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_bind_req_t *dbrp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ /*
+ * NOTE(review): the extra sizeof (long) requested here is never written
+ * by this function -- confirm why the padding is needed.
+ */
+ mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
+ DL_BIND_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+ dbrp = (dl_bind_req_t *)(mp->b_rptr);
+ dbrp->dl_sap = 0;
+ dbrp->dl_service_mode = DL_CLDLS;
+
+ /* Publish the new state before the request can be answered. */
+ vsp->vns_state = VNS_S_BIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+/*
+ * Consume the reply to our DL_BIND_REQ from the incoming DLPI list.
+ * Must be called with vns_lock held.
+ *
+ * Returns 0 on a DL_BIND_ACK. Returns 1 with vns_errno set if the reply was
+ * neither a DL_BIND_ACK nor a DL_ERROR_ACK (VND_E_DLPIINVAL) or if it was a
+ * DL_ERROR_ACK (VND_E_BINDFAIL).
+ */
+static int
+vnd_st_bind(vnd_str_t *vsp)
+{
+	mblk_t *ackmp;
+	t_uscalar_t prim;
+	int failed;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	ackmp = vnd_dlpi_inc_pop(vsp);
+	prim = ((dl_error_ack_t *)ackmp->b_rptr)->dl_primitive;
+
+	if (!(prim == DL_BIND_ACK || prim == DL_ERROR_ACK)) {
+		vnd_drop_ctl(vsp, ackmp,
+		    "wrong dlpi primitive for vnd_st_bind");
+		vsp->vns_errno = VND_E_DLPIINVAL;
+		return (1);
+	}
+
+	failed = (prim == DL_ERROR_ACK);
+	if (failed)
+		vsp->vns_errno = VND_E_BINDFAIL;
+
+	freemsg(ackmp);
+	return (failed);
+}
+
+/*
+ * Send a DL_PROMISCON_REQ for the promiscuous level 'type' down the stream
+ * and move the state machine to 'next', the state in which the reply is
+ * expected. Must be called with vns_lock held.
+ *
+ * Returns 0 on success. If the message cannot be allocated, vns_errno is set
+ * to VND_E_NOMEM and 1 is returned.
+ */
+static int
+vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+ mblk_t *mp;
+ dl_promiscon_req_t *dprp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ dprp = (dl_promiscon_req_t *)mp->b_rptr;
+ dprp->dl_level = type;
+
+ /* Publish the new state before the request can be answered. */
+ vsp->vns_state = next;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+/*
+ * Consume the reply to one of our DL_PROMISCON_REQ messages from the incoming
+ * DLPI list. Must be called with vns_lock held.
+ *
+ * Returns 0 if the request was acknowledged with a DL_OK_ACK. Returns 1 with
+ * vns_errno set if the reply was not an ok/error ack (VND_E_DLPIINVAL), was
+ * an ack for some other primitive (VND_E_DLPIINVAL), or was a DL_ERROR_ACK
+ * (VND_E_PROMISCFAIL).
+ */
+static int
+vnd_st_promisc(vnd_str_t *vsp)
+{
+	mblk_t *ackmp;
+	dl_ok_ack_t *ackp;
+	t_uscalar_t prim, cprim;
+	int failed;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	ackmp = vnd_dlpi_inc_pop(vsp);
+	ackp = (dl_ok_ack_t *)ackmp->b_rptr;
+	prim = ackp->dl_primitive;
+	cprim = ackp->dl_correct_primitive;
+
+	if (!(prim == DL_OK_ACK || prim == DL_ERROR_ACK)) {
+		vnd_drop_ctl(vsp, ackmp,
+		    "wrong dlpi primitive for vnd_st_promisc");
+		vsp->vns_errno = VND_E_DLPIINVAL;
+		return (1);
+	}
+
+	if (cprim != DL_PROMISCON_REQ) {
+		vnd_drop_ctl(vsp, ackmp,
+		    "vnd_st_promisc: Got ack/nack for wrong primitive");
+		vsp->vns_errno = VND_E_DLPIINVAL;
+		return (1);
+	}
+
+	failed = (prim == DL_ERROR_ACK);
+	if (failed)
+		vsp->vns_errno = VND_E_PROMISCFAIL;
+
+	freemsg(ackmp);
+	return (failed);
+}
+
+/*
+ * Send a DL_CAPABILITY_REQ down the stream and advance the state machine to
+ * VNS_S_CAPAB_Q_SENT. Must be called with vns_lock held.
+ *
+ * Returns 0 on success. If the message cannot be allocated, vns_errno is set
+ * to VND_E_NOMEM and 1 is returned.
+ */
+static int
+vnd_st_scapabq(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ /* Publish the new state before the request can be answered. */
+ vsp->vns_state = VNS_S_CAPAB_Q_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+/*
+ * Receive callback registered as di_rx_cf by vnd_dld_cap_enable(). Walks the
+ * b_next chain of packets in mp_chain; for each one it rewinds b_rptr over
+ * the MAC header, runs the inbound hooks when the netstack has them enabled,
+ * updates the receive statistics and pushes the packet onto the read data
+ * queue. Readers and pollers are woken once at the end if any push asked for
+ * a signal.
+ *
+ * The 'wtf' argument is not used by this function.
+ */
+static void
+vnd_mac_input(vnd_str_t *vsp, mac_resource_t *wtf, mblk_t *mp_chain,
+ mac_header_info_t *mhip)
+{
+ int signal = 0;
+ mblk_t *mp;
+ /* NOTE(review): vsp is dereferenced here, before the ASSERT below. */
+ vnd_pnsd_t *nsp = vsp->vns_nsd;
+
+ ASSERT(vsp != NULL);
+ ASSERT(mp_chain != NULL);
+
+ for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+ uint16_t vid;
+ /* Unlink this packet from the chain before processing it. */
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * If we were operating in a traditional dlpi context then we
+ * would have enabled DLIOCRAW and rather than the fast path we
+ * would come through dld_str_rx_raw. That function does two
+ * things that we have to consider doing ourselves. The first is
+ * that it adjusts the b_rptr back to account for dld bumping us
+ * past the mac header. It also tries to account for cases where
+ * mac provides an illusion of the mac header. Fortunately, dld
+ * only allows the fastpath when the media type is the same as
+ * the native type. Therefore all we have to do here is adjust
+ * the b_rptr.
+ */
+ ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
+ mp->b_rptr -= mhip->mhi_hdrsize;
+ vid = VLAN_ID(mhip->mhi_tci);
+ if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
+ /*
+ * Slide the first 12 bytes of the header forward by 4 and start
+ * the packet there, dropping the 4 bytes that followed them.
+ * Presumably these are the two MAC addresses and the VLAN tag --
+ * TODO confirm against the ethernet header layout.
+ */
+ bcopy(mp->b_rptr, mp->b_rptr + 4, 12);
+ mp->b_rptr += 4;
+ }
+
+ /*
+ * A non-zero return from vnd_hook() means this packet goes no
+ * further; presumably the hook path has already disposed of it.
+ */
+ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
+ nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
+ nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
+ continue;
+
+ VND_STAT_INC(vsp, vks_rpackets, 1);
+ VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
+ DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
+ vnd_str_t *, vsp, mblk_t *, mp);
+ mutex_enter(&vsp->vns_dq_read.vdq_lock);
+ signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
+ vnd_drop_in);
+ mutex_exit(&vsp->vns_dq_read.vdq_lock);
+
+ }
+
+ /* Wake readers and pollers once for the whole chain. */
+ if (signal != 0) {
+ cv_broadcast(&vsp->vns_dq_read.vdq_ready);
+ pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
+ }
+
+}
+
+/*
+ * This is a callback from MAC that indicates that we are allowed to send
+ * packets again.
+ *
+ * It clears the flow-controlled flag and cookie, schedules a transmit drain
+ * on the squeue unless one is already scheduled or a barrier is in place, and
+ * records how long we were flow controlled (the time elapsed since
+ * vns_fclatch) in the kstats, including the latency buckets.
+ */
+static void
+vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
+{
+ vnd_str_t *vsp = arg;
+ hrtime_t diff;
+
+ mutex_enter(&vsp->vns_lock);
+ ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
+ ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
+ vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
+ vsp->vns_caps.vsc_fc_cookie = NULL;
+ /* Time spent flow controlled, measured from the latched timestamp. */
+ diff = gethrtime() - vsp->vns_fclatch;
+ vsp->vns_fclatch = 0;
+ DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
+ vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+ /*
+ * If someone has asked to flush the squeue and thus inserted a barrier,
+ * then we shouldn't schedule a drain.
+ */
+ if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) {
+ vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
+ vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
+ VND_SQUEUE_TAG_MAC_FLOW_CONTROL);
+ }
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * The latency buckets are cumulative: a single long stall bumps every
+ * bucket whose threshold it meets.
+ */
+ VND_STAT_INC(vsp, vks_nmacflow, 1);
+ VND_STAT_INC(vsp, vks_tmacflow, diff);
+ if (diff >= VND_LATENCY_1MS)
+ VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
+ if (diff >= VND_LATENCY_10MS)
+ VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
+ if (diff >= VND_LATENCY_100MS)
+ VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
+ if (diff >= VND_LATENCY_1S)
+ VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
+ if (diff >= VND_LATENCY_10S)
+ VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
+}
+
+/*
+ * Enable the DLD_CAPAB_PERIM capability through the dld capability function,
+ * storing the resulting handle in *mphp for the matching vnd_mac_exit().
+ * The caller must hold vns_lock; failure of the capability call is fatal.
+ */
+static void
+vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0);
+}
+
+/*
+ * Counterpart of vnd_mac_enter(): disable the DLD_CAPAB_PERIM capability
+ * using the handle obtained there. The caller must hold vns_lock; failure of
+ * the capability call is fatal.
+ */
+static void
+vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0);
+}
+
+/*
+ * Enable the DLD_CAPAB_DIRECT capability, registering vnd_mac_input() as the
+ * receive callback and recording the transmit and flow-control entry points
+ * that dld hands back. The work is bracketed by vnd_mac_enter() and
+ * vnd_mac_exit().
+ *
+ * Returns 0 on success. On failure vns_errno is set to VND_E_DIRECTFAIL and
+ * 1 is returned.
+ */
+static int
+vnd_dld_cap_enable(vnd_str_t *vsp)
+{
+	vnd_str_capab_t *capp = &vsp->vns_caps;
+	dld_capab_direct_t direct;
+	mac_perim_handle_t perim;
+	int failed;
+
+	bzero(&direct, sizeof (direct));
+	direct.di_rx_cf = (uintptr_t)vnd_mac_input;
+	direct.di_rx_ch = vsp;
+	direct.di_flags = DI_DIRECT_RAW;
+
+	vnd_mac_enter(vsp, &perim);
+
+	failed = (capp->vsc_capab_f(capp->vsc_capab_hdl, DLD_CAPAB_DIRECT,
+	    &direct, DLD_ENABLE) != 0);
+	if (failed) {
+		vsp->vns_errno = VND_E_DIRECTFAIL;
+	} else {
+		/* Remember the entry points dld filled in for us. */
+		capp->vsc_tx_f = (vnd_dld_tx_t)direct.di_tx_df;
+		capp->vsc_tx_hdl = direct.di_tx_dh;
+		capp->vsc_set_fcb_f = (vnd_dld_set_fcb_t)direct.di_tx_cb_df;
+		capp->vsc_set_fcb_hdl = direct.di_tx_cb_dh;
+		capp->vsc_is_fc_f = (vnd_dld_is_fc_t)direct.di_tx_fctl_df;
+		capp->vsc_is_fc_hdl = direct.di_tx_fctl_dh;
+		/* Ask to be called back when flow control is lifted. */
+		capp->vsc_tx_fc_hdl = capp->vsc_set_fcb_f(
+		    capp->vsc_set_fcb_hdl, vnd_mac_flow_control, vsp);
+		capp->vsc_flags |= VNS_C_DIRECT;
+	}
+
+	vnd_mac_exit(vsp, perim);
+	return (failed);
+}
+
+/*
+ * Consume the reply to our DL_CAPABILITY_REQ and walk the sub-capabilities
+ * that it advertises. Must be called with vns_lock held.
+ *
+ * DL_CAPAB_HCKSUM records the hardware checksum flags (or notes a version
+ * mismatch); DL_CAPAB_DLD records the dld capability entry point and then
+ * enables direct callbacks via vnd_dld_cap_enable(). All other
+ * sub-capabilities are ignored.
+ *
+ * Returns 0 only if direct callbacks ended up enabled. Otherwise returns 1
+ * with vns_errno describing the failure. The message is always freed.
+ */
+static int
+vnd_st_capabq(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_capability_ack_t *cap;
+	dl_capability_sub_t *subp;
+	dl_capab_hcksum_t *hck;
+	dl_capab_dld_t *dld;
+	unsigned char *rp;
+	size_t msglen;
+	int ret = 0;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+
+	/*
+	 * The receive side only guarantees that the primitive itself is
+	 * present, so make sure the whole ack header is there before we
+	 * read its other fields.
+	 */
+	msglen = MBLKL(mp);
+	if (msglen < sizeof (dl_capability_ack_t)) {
+		VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+		VND_STAT_INC(vsp, vks_tdrops, 1);
+		vsp->vns_errno = VND_E_CAPACKINVAL;
+		ret = 1;
+		goto done;
+	}
+
+	rp = mp->b_rptr;
+	cap = (dl_capability_ack_t *)rp;
+	if (cap->dl_sub_length == 0)
+		goto done;
+
+	/*
+	 * Don't try to process something whose sub-capability area does not
+	 * lie entirely inside the message, after the ack header. The checks
+	 * are written as subtractions so that they cannot wrap.
+	 */
+	if (cap->dl_sub_offset < sizeof (dl_capability_ack_t) ||
+	    cap->dl_sub_offset > msglen ||
+	    cap->dl_sub_length > msglen - cap->dl_sub_offset) {
+		VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+		VND_STAT_INC(vsp, vks_tdrops, 1);
+		vsp->vns_errno = VND_E_CAPACKINVAL;
+		ret = 1;
+		goto done;
+	}
+
+	rp += cap->dl_sub_offset;
+
+	/* dl_sub_length is consumed in place as each sub-capability is read */
+	while (cap->dl_sub_length > 0) {
+		subp = (dl_capability_sub_t *)rp;
+		/*
+		 * Sanity check something crazy from down below: there must be
+		 * room left for the sub-capability header and for the payload
+		 * that it claims to carry.
+		 */
+		if (cap->dl_sub_length < sizeof (dl_capability_sub_t) ||
+		    subp->dl_length >
+		    cap->dl_sub_length - sizeof (dl_capability_sub_t)) {
+			VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+			VND_STAT_INC(vsp, vks_tdrops, 1);
+			vsp->vns_errno = VND_E_SUBCAPINVAL;
+			ret = 1;
+			goto done;
+		}
+
+		switch (subp->dl_cap) {
+		case DL_CAPAB_HCKSUM:
+			hck = (dl_capab_hcksum_t *)(rp +
+			    sizeof (dl_capability_sub_t));
+			if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) {
+				vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS;
+				break;
+			}
+			if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) !=
+			    B_TRUE) {
+				vsp->vns_errno = VND_E_CAPABPASS;
+				ret = 1;
+				goto done;
+			}
+			vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM;
+			vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags;
+			break;
+		case DL_CAPAB_DLD:
+			dld = (dl_capab_dld_t *)(rp +
+			    sizeof (dl_capability_sub_t));
+			if (dld->dld_version != DLD_CURRENT_VERSION) {
+				vsp->vns_errno = VND_E_DLDBADVERS;
+				ret = 1;
+				goto done;
+			}
+			if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) !=
+			    B_TRUE) {
+				vsp->vns_errno = VND_E_CAPABPASS;
+				ret = 1;
+				goto done;
+			}
+			vsp->vns_caps.vsc_flags |= VNS_C_DLD;
+			vsp->vns_caps.vsc_capab_f =
+			    (vnd_dld_cap_t)dld->dld_capab;
+			vsp->vns_caps.vsc_capab_hdl =
+			    (void *)dld->dld_capab_handle;
+			if (vnd_dld_cap_enable(vsp) != 0) {
+				/* vns_errno set by vnd_dld_cap_enable */
+				ret = 1;
+				goto done;
+			}
+			break;
+		default:
+			/* Ignore unsupported cap */
+			break;
+		}
+
+		rp += sizeof (dl_capability_sub_t) + subp->dl_length;
+		cap->dl_sub_length -= sizeof (dl_capability_sub_t) +
+		    subp->dl_length;
+	}
+
+done:
+	/* Make sure we enabled direct callbacks */
+	if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) {
+		vsp->vns_errno = VND_E_DIRECTNOTSUP;
+		ret = 1;
+	}
+
+	freemsg(mp);
+	return (ret);
+}
+
+/*
+ * Mark the stream VNS_S_ONLINE and wake anyone waiting on vns_stcv for a
+ * state change. Must be called with vns_lock held.
+ */
+static void
+vnd_st_sonline(vnd_str_t *vsp)
+{
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ vsp->vns_state = VNS_S_ONLINE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Tear down the direct-callback capability, if it was enabled, and move the
+ * stream to VNS_S_ZOMBIE, waking anyone waiting on vns_stcv. Must be called
+ * with vns_lock held.
+ */
+static void
+vnd_st_shutdown(vnd_str_t *vsp)
+{
+ mac_perim_handle_t mph;
+ vnd_str_capab_t *vsc = &vsp->vns_caps;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ /*
+ * At this point in time we know that there is no one transmitting as
+ * our final reference has been torn down and that vnd_s_close inserted
+ * a barrier to validate that everything is flushed.
+ */
+ if (vsc->vsc_flags & VNS_C_DIRECT) {
+ vnd_mac_enter(vsp, &mph);
+ vsc->vsc_flags &= ~VNS_C_DIRECT;
+ /* Remove the flow-control callback registered at enable time. */
+ vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL,
+ vsc->vsc_tx_fc_hdl);
+ vsc->vsc_tx_fc_hdl = NULL;
+ /* The result of disabling DLD_CAPAB_DIRECT is deliberately ignored. */
+ (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT,
+ NULL, DLD_DISABLE);
+ vnd_mac_exit(vsp, mph);
+ }
+
+ /*
+ * We could send an unbind, but dld also does that for us. As we add
+ * more capabilities and the like, we should revisit this.
+ */
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Advance the stream's DLPI state machine by one step. Each *_SENT state
+ * consumes the pending reply and, if that went well, sends the next request;
+ * any failure along the way turns the stream into a zombie, remembering the
+ * state it died in. This is a one way shot down the flow chart described in
+ * the big theory statement.
+ *
+ * Nothing is done when no DLPI message is pending, unless the stream is in
+ * VNS_S_INITIAL or VNS_S_SHUTTING_DOWN, neither of which consumes a reply.
+ */
+static void
+vnd_str_state_transition(void *arg)
+{
+	vnd_str_t *vsp = arg;
+	boolean_t died = B_FALSE;
+	mblk_t *dropmp;
+
+	mutex_enter(&vsp->vns_lock);
+	if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
+	    vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
+		mutex_exit(&vsp->vns_lock);
+		return;
+	}
+	DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp,
+	    vnd_str_state_t, vsp->vns_state);
+
+	/*
+	 * In the cases below the second call in each "||" is only made when
+	 * the first one succeeded, i.e. returned zero.
+	 */
+	switch (vsp->vns_state) {
+	case VNS_S_INITIAL:
+		VERIFY(vsp->vns_dlpi_inc == NULL);
+		if (vnd_st_sinfo(vsp) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_INFO_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_info(vsp) != 0 || vnd_st_sexclusive(vsp) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_EXCLUSIVE_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_exclusive(vsp) != 0) {
+			died = B_TRUE;
+		} else if (vsp->vns_dlpi_style == DL_STYLE2) {
+			/* Style 2 providers need an attach before the bind. */
+			if (vnd_st_sattach(vsp) != 0)
+				died = B_TRUE;
+		} else if (vnd_st_sbind(vsp) != 0) {
+			died = B_TRUE;
+		}
+		break;
+	case VNS_S_ATTACH_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_attach(vsp) != 0 || vnd_st_sbind(vsp) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_BIND_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_bind(vsp) != 0 ||
+		    vnd_st_spromisc(vsp, DL_PROMISC_SAP,
+		    VNS_S_SAP_PROMISC_SENT) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_SAP_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) != 0 ||
+		    vnd_st_spromisc(vsp, DL_PROMISC_MULTI,
+		    VNS_S_MULTI_PROMISC_SENT) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_MULTI_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) != 0 ||
+		    vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY,
+		    VNS_S_RX_ONLY_PROMISC_SENT) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_RX_ONLY_PROMISC_SENT:
+		VERIFY(vsp->vns_dlpi_inc != NULL);
+		if (vnd_st_promisc(vsp) != 0 || vnd_st_scapabq(vsp) != 0)
+			died = B_TRUE;
+		break;
+	case VNS_S_CAPAB_Q_SENT:
+		if (vnd_st_capabq(vsp) == 0)
+			vnd_st_sonline(vsp);
+		else
+			died = B_TRUE;
+		break;
+	case VNS_S_SHUTTING_DOWN:
+		vnd_st_shutdown(vsp);
+		break;
+	case VNS_S_ZOMBIE:
+		/* Nothing will ever read these; discard whatever is queued. */
+		for (dropmp = vnd_dlpi_inc_pop(vsp); dropmp != NULL;
+		    dropmp = vnd_dlpi_inc_pop(vsp))
+			vnd_drop_ctl(vsp, dropmp, "vsp committed suicide");
+		break;
+	default:
+		panic("vnd_str_t entered an unknown state");
+	}
+
+	if (died) {
+		ASSERT(vsp->vns_errno != VND_E_SUCCESS);
+		vsp->vns_laststate = vsp->vns_state;
+		vsp->vns_state = VNS_S_ZOMBIE;
+		cv_broadcast(&vsp->vns_stcv);
+	}
+
+	mutex_exit(&vsp->vns_lock);
+}
+
+/*
+ * Taskq entry point that drives the DLPI state machine. It keeps calling
+ * vnd_str_state_transition() until the stream is condemned or there are no
+ * more incoming DLPI messages queued, at which point it clears
+ * VNS_F_TASKQ_DISPATCHED so that a later message can dispatch it again.
+ * If the stream is condemned, vns_cancelcv is signalled.
+ */
+static void
+vnd_dlpi_taskq_dispatch(void *arg)
+{
+ vnd_str_t *vsp = arg;
+ int run = 1;
+
+ while (run != 0) {
+ vnd_str_state_transition(vsp);
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_flags & VNS_F_CONDEMNED ||
+ vsp->vns_dlpi_inc == NULL) {
+ run = 0;
+ vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED;
+ }
+ if (vsp->vns_flags & VNS_F_CONDEMNED)
+ cv_signal(&vsp->vns_cancelcv);
+ mutex_exit(&vsp->vns_lock);
+ }
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: interface names are not
+ * provided, so this ignores its arguments and always returns -1.
+ */
+static int
+vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: ignores its arguments and
+ * always returns -1.
+ */
+static int
+vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: ignores its argument and
+ * always returns -1.
+ */
+static int
+vnd_neti_getptmue(net_handle_t neti)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: no addresses are returned;
+ * ignores its arguments, leaves 'storage' untouched and always returns -1.
+ */
+static int
+vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ size_t nelem, net_ifaddr_t type[], void *storage)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: ignores its arguments,
+ * leaves *zid untouched and always returns -1.
+ */
+static int
+vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ zoneid_t *zid)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: ignores its arguments,
+ * leaves *flags untouched and always returns -1.
+ */
+static int
+vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ uint64_t *flags)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: there is no interface list
+ * to walk, so this ignores its arguments and always returns -1 (converted to
+ * phy_if_t).
+ */
+static phy_if_t
+vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: lookups by name never match;
+ * ignores its arguments and always returns -1 (converted to phy_if_t).
+ */
+static phy_if_t
+vnd_neti_phylookup(net_handle_t neti, const char *name)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: ignores its arguments and
+ * always returns -1 (converted to lif_if_t).
+ */
+static lif_if_t
+vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: packet injection is not
+ * performed; ignores its arguments and always returns -1.
+ */
+static int
+vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: no routing lookup is done;
+ * ignores its arguments and always returns (phy_if_t)-1.
+ */
+static phy_if_t
+vnd_neti_route(net_handle_t neti, struct sockaddr *address,
+ struct sockaddr *next)
+{
+ return ((phy_if_t)-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: the message is not examined;
+ * always returns -1.
+ */
+static int
+vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+/*
+ * Stub entry for the vnd net_protocol_t tables: the message is not examined;
+ * always returns -1.
+ */
+static int
+vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+/*
+ * Protocol description registered for the NHF_VND_INET family by
+ * vnd_netinfo_init(). Every callback is one of the vnd_neti_* stubs above,
+ * all of which simply return -1.
+ */
+static net_protocol_t vnd_neti_info_v4 = {
+ NETINFO_VERSION,
+ NHF_VND_INET,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+/*
+ * Protocol description registered for the NHF_VND_INET6 family by
+ * vnd_netinfo_init(). It uses the same stub callbacks as vnd_neti_info_v4;
+ * only the family name differs.
+ */
+static net_protocol_t vnd_neti_info_v6 = {
+ NETINFO_VERSION,
+ NHF_VND_INET6,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+
+/*
+ * Register the vnd IPv4 and IPv6 protocols, hook families and the physical
+ * in/out hook events for the netstack described by nsp.
+ *
+ * Returns 0 when everything has been registered. If any family or event
+ * registration fails, a note is logged, everything registered so far is
+ * unwound in reverse order and 1 is returned.
+ */
+static int
+vnd_netinfo_init(vnd_pnsd_t *nsp)
+{
+	nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid,
+	    &vnd_neti_info_v4);
+	ASSERT(nsp->vpnd_neti_v4 != NULL);
+
+	nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid,
+	    &vnd_neti_info_v6);
+	ASSERT(nsp->vpnd_neti_v6 != NULL);
+
+	nsp->vpnd_family_v4.hf_version = HOOK_VERSION;
+	nsp->vpnd_family_v4.hf_name = "vnd_inet";
+
+	if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_protocols;
+	}
+
+	nsp->vpnd_family_v6.hf_version = HOOK_VERSION;
+	nsp->vpnd_family_v6.hf_name = "vnd_inet6";
+
+	if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_family_v4;
+	}
+
+	nsp->vpnd_event_in_v4.he_version = HOOK_VERSION;
+	nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN;
+	nsp->vpnd_event_in_v4.he_flags = 0;
+	nsp->vpnd_event_in_v4.he_interested = B_FALSE;
+
+	nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4,
+	    &nsp->vpnd_event_in_v4);
+	if (nsp->vpnd_token_in_v4 == NULL) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_family_v6;
+	}
+
+	nsp->vpnd_event_in_v6.he_version = HOOK_VERSION;
+	nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN;
+	nsp->vpnd_event_in_v6.he_flags = 0;
+	nsp->vpnd_event_in_v6.he_interested = B_FALSE;
+
+	nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6,
+	    &nsp->vpnd_event_in_v6);
+	if (nsp->vpnd_token_in_v6 == NULL) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_event_in_v4;
+	}
+
+	nsp->vpnd_event_out_v4.he_version = HOOK_VERSION;
+	nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT;
+	nsp->vpnd_event_out_v4.he_flags = 0;
+	nsp->vpnd_event_out_v4.he_interested = B_FALSE;
+
+	nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4,
+	    &nsp->vpnd_event_out_v4);
+	if (nsp->vpnd_token_out_v4 == NULL) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_event_in_v6;
+	}
+
+	nsp->vpnd_event_out_v6.he_version = HOOK_VERSION;
+	nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT;
+	nsp->vpnd_event_out_v6.he_flags = 0;
+	nsp->vpnd_event_out_v6.he_interested = B_FALSE;
+
+	nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6,
+	    &nsp->vpnd_event_out_v6);
+	if (nsp->vpnd_token_out_v6 == NULL) {
+		cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+		    "failed for stack %d", nsp->vpnd_nsid);
+		goto unreg_event_out_v4;
+	}
+
+	return (0);
+
+	/*
+	 * Failure unwinding. Each label undoes exactly one registration and
+	 * falls through to the ones made before it, so every failure point
+	 * tears down precisely what had been set up. In particular, the v4
+	 * outbound event is released when the v6 outbound event fails.
+	 */
+unreg_event_out_v4:
+	(void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+	(void) net_event_unregister(nsp->vpnd_neti_v4,
+	    &nsp->vpnd_event_out_v4);
+unreg_event_in_v6:
+	(void) net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+	(void) net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+unreg_event_in_v4:
+	(void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+	(void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+unreg_family_v6:
+	(void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+unreg_family_v4:
+	(void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+unreg_protocols:
+	(void) net_protocol_unregister(nsp->vpnd_neti_v4);
+	(void) net_protocol_unregister(nsp->vpnd_neti_v6);
+	return (1);
+}
+
+/*
+ * Shut down the four hook events (in/out for v4 and v6) registered by
+ * vnd_netinfo_init(). Any failure is fatal. The events themselves are
+ * unregistered separately by vnd_netinfo_fini().
+ */
+static void
+vnd_netinfo_shutdown(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+}
+
+/*
+ * Undo vnd_netinfo_init(): unregister the four hook events, then the two
+ * hook families, then the two protocols. Any failure is fatal.
+ */
+static void
+vnd_netinfo_fini(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v4);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v6);
+ VERIFY(ret == 0);
+}
+
+/*
+ * Squeue callback for the barrier block queued by vnd_strbarrier(). It marks
+ * the barrier as done and wakes everyone waiting on vns_barriercv. The
+ * 'gsp' and 'dummy' arguments are not used.
+ */
+static void
+vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy)
+{
+ vnd_str_t *vsp = arg;
+
+ VERIFY(bmp == &vsp->vns_barrierblk);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vsp->vns_flags & VNS_F_BARRIER);
+ VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE));
+ vsp->vns_flags |= VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * For better or worse, we have to broadcast here as we could have a
+ * thread that's blocked for completion as well as one that's blocked
+ * waiting to do a barrier itself.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * This is a data barrier for the stream while it is in fastpath mode. It blocks
+ * and ensures that there is nothing else in the squeue.
+ *
+ * Only one barrier may be outstanding at a time: a caller first waits for any
+ * existing barrier to clear, then queues its own barrier block and waits for
+ * vnd_strbarrier_cb() to mark it done.
+ */
+static void
+vnd_strbarrier(vnd_str_t *vsp)
+{
+ /* Wait our turn, then claim the barrier. */
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_flags & VNS_F_BARRIER)
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags |= VNS_F_BARRIER;
+ mutex_exit(&vsp->vns_lock);
+
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
+ vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER);
+
+ /* Wait for the callback to run, then release the barrier. */
+ mutex_enter(&vsp->vns_lock);
+ while (!(vsp->vns_flags & VNS_F_BARRIER_DONE))
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags &= ~VNS_F_BARRIER;
+ vsp->vns_flags &= ~VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * We have to broadcast in case anyone is waiting for the barrier
+ * themselves.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * Based on the type of message that we're dealing with we're going to want to
+ * do one of several things. Basically if it looks like it's something we know
+ * about, we should probably handle it in one of our transition threads.
+ * Otherwise, we should just simply putnext.
+ *
+ * M_PROTO and M_PCPROTO messages are queued on the incoming DLPI list and the
+ * state machine taskq is dispatched if it is not already running; messages
+ * too short to hold a primitive and unitdata primitives are dropped. M_DATA
+ * is dropped. Everything else is passed up the read queue. Always returns 0.
+ */
+static int
+vnd_s_rput(queue_t *q, mblk_t *mp)
+{
+ t_uscalar_t prim;
+ int dispatch = 0;
+ vnd_str_t *vsp = q->q_ptr;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ /* We need at least the primitive to classify the message. */
+ if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+ vnd_drop_ctl(vsp, mp, "PROTO message too short");
+ break;
+ }
+
+ prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
+ if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
+ vnd_drop_ctl(vsp, mp,
+ "recieved an unsupported dlpi DATA req");
+ break;
+ }
+
+ /*
+ * Enqueue the entry and fire off a taskq dispatch.
+ */
+ mutex_enter(&vsp->vns_lock);
+ vnd_dlpi_inc_push(vsp, mp);
+ /* Only one dispatch may be outstanding; the flag guards vns_tqe. */
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ dispatch = 1;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ }
+ mutex_exit(&vsp->vns_lock);
+ if (dispatch != 0)
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
+ vsp, 0, &vsp->vns_tqe);
+ break;
+ case M_DATA:
+ vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
+ break;
+ default:
+ putnext(vsp->vns_rq, mp);
+ }
+ return (0);
+}
+
+/*
+ * Handle an M_IOCTL on the write side. Only a transparent
+ * VND_STRIOC_ASSOCIATE issued with kcred is accepted; for it we remember the
+ * user address in a freshly allocated vnd_strioc_t and reply with a copyin
+ * request for a vnd_strioc_associate_t. The vnd_strioc_t travels as the
+ * request's private data and is released by the M_IOCDATA handlers.
+ *
+ * Anything else is nak'd: EINVAL for an unknown command or a non-transparent
+ * ioctl, EPERM for a non-kernel credential and EAGAIN when there is no data
+ * block.
+ */
+static void
+vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
+{
+ int error;
+ vnd_strioc_t *visp;
+
+ if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
+ iocp->ioc_count != TRANSPARENT) {
+ error = EINVAL;
+ goto nak;
+ }
+
+ /*
+ * All streams ioctls that we support must use kcred as a means to
+ * distinguish that this is a layered open by the kernel as opposed to
+ * one by a user who has done an I_PUSH of the module.
+ */
+ if (iocp->ioc_cr != kcred) {
+ error = EPERM;
+ goto nak;
+ }
+
+ if (mp->b_cont == NULL) {
+ error = EAGAIN;
+ goto nak;
+ }
+
+ visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
+ /* NOTE(review): the size is only checked on DEBUG kernels. */
+ ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
+ /* For a transparent ioctl the data block holds the user address. */
+ visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
+ visp->vs_state = VSS_COPYIN;
+
+ mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
+ qreply(q, mp);
+
+ return;
+
+nak:
+ if (mp->b_cont != NULL) {
+ freemsg(mp->b_cont);
+ mp->b_cont = NULL;
+ }
+
+ iocp->ioc_error = error;
+ mp->b_datap->db_type = M_IOCNAK;
+ iocp->ioc_count = 0;
+ qreply(q, mp);
+}
+
+/*
+ * Handle the M_IOCDATA that answers the copyin requested by vnd_strioctl()
+ * for VND_STRIOC_ASSOCIATE.
+ *
+ * The copied-in vnd_strioc_associate_t names a vnd device (by minor) and a
+ * netstack. We look both up, kick off the DLPI state machine, block until the
+ * stream is either ONLINE or a ZOMBIE, and on success tie the stream and the
+ * device together and create and install the stream's kstat. Whatever the
+ * outcome, once the request has been parsed the result is reported by
+ * turning the message into an M_COPYOUT of the same structure with vsa_errno
+ * filled in; vnd_stroutdata() finishes the ioctl.
+ *
+ * The vnd_strioc_t allocated by vnd_strioctl() arrives as cp_private. It is
+ * freed on the early-exit paths and otherwise handed on to the copyout.
+ */
+static void
+vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+	int error;
+	vnd_str_state_t state;
+	struct copyreq *crp;
+	vnd_strioc_associate_t *vss;
+	vnd_dev_t *vdp = NULL;
+	vnd_pnsd_t *nsp = NULL;
+	char iname[2*VND_NAMELEN];
+	zone_t *zone;
+	vnd_strioc_t *visp;
+
+	visp = (vnd_strioc_t *)csp->cp_private;
+
+	/* If it's not ours, it's not our problem */
+	if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+		if (q->q_next != NULL) {
+			putnext(q, mp);
+		} else {
+			VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+			VND_STAT_INC(vsp, vks_tdrops, 1);
+			vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+		}
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		return;
+	}
+
+	/* The nak is already sent for us */
+	if (csp->cp_rval != 0) {
+		vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		return;
+	}
+
+	/* Data is sitting for us in b_cont */
+	if (mp->b_cont == NULL ||
+	    MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
+		kmem_free(visp, sizeof (vnd_strioc_t));
+		/*
+		 * miocnak() sends the nak upstream itself, after which the
+		 * message is no longer ours and must not be touched again.
+		 */
+		miocnak(q, mp, 0, EINVAL);
+		return;
+	}
+
+	vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
+	vdp = vnd_dev_lookup(vss->vsa_minor);
+	if (vdp == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_NODEV;
+		goto nak;
+	}
+
+	nsp = vnd_nsd_lookup(vss->vsa_nsid);
+	if (nsp == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_NONETSTACK;
+		goto nak;
+	}
+
+	/* A stream may only be associated once. */
+	mutex_enter(&vsp->vns_lock);
+	if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
+		mutex_exit(&vsp->vns_lock);
+		error = EEXIST;
+		vss->vsa_errno = VND_E_ASSOCIATED;
+		goto nak;
+	}
+
+	vsp->vns_nsd = nsp;
+	vsp->vns_flags &= ~VNS_F_NEED_ZONE;
+	vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+	mutex_exit(&vsp->vns_lock);
+
+	taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0,
+	    &vsp->vns_tqe);
+
+
+	/* At this point we need to wait until we have transitioned to ONLINE */
+	mutex_enter(&vsp->vns_lock);
+	while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE)
+		cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+	state = vsp->vns_state;
+	mutex_exit(&vsp->vns_lock);
+
+	if (state == VNS_S_ZOMBIE) {
+		/* The state machine recorded why it gave up in vns_errno. */
+		vss->vsa_errno = vsp->vns_errno;
+		error = EIO;
+		goto nak;
+	}
+
+	mutex_enter(&vdp->vdd_lock);
+	mutex_enter(&vsp->vns_lock);
+	VERIFY(vdp->vdd_str == NULL);
+	/*
+	 * Now initialize the remaining kstat properties and let's go ahead and
+	 * create it.
+	 */
+	(void) snprintf(iname, sizeof (iname), "z%d_%d",
+	    vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor);
+	vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net",
+	    KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+	if (vsp->vns_kstat == NULL) {
+		error = EIO;
+		vss->vsa_errno = VND_E_KSTATCREATE;
+		mutex_exit(&vsp->vns_lock);
+		mutex_exit(&vdp->vdd_lock);
+		goto nak;
+	}
+	vdp->vdd_str = vsp;
+	vsp->vns_dev = vdp;
+
+	zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid);
+	ASSERT(zone != NULL);
+	vsp->vns_kstat->ks_data = &vsp->vns_ksdata;
+	/* Account for zone name */
+	vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1;
+	/* Account for eventual link name */
+	vsp->vns_kstat->ks_data_size += VND_NAMELEN;
+	kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name);
+	kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+	    vdp->vdd_lname);
+	zone_rele(zone);
+	kstat_install(vsp->vns_kstat);
+
+	mutex_exit(&vsp->vns_lock);
+	mutex_exit(&vdp->vdd_lock);
+
+	/*
+	 * Note that the vnd_str_t does not keep a permanent hold on the
+	 * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what
+	 * the nestack goes through to take care of everything.
+	 */
+	vss->vsa_errno = VND_E_SUCCESS;
+nak:
+	/* Drop the holds taken by the lookups above, if we got that far. */
+	if (vdp != NULL)
+		vnd_dev_rele(vdp);
+	if (nsp != NULL)
+		vnd_nsd_rele(nsp);
+	/*
+	 * Change the copyin request to a copyout. Note that we can't use
+	 * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's
+	 * okay, as the copyin vs. copyout is basically the same.
+	 */
+	DB_TYPE(mp) = M_COPYOUT;
+	visp->vs_state = VSS_COPYOUT;
+	crp = (struct copyreq *)mp->b_rptr;
+	crp->cq_private = (void *)visp;
+	crp->cq_addr = visp->vs_addr;
+	crp->cq_size = sizeof (vnd_strioc_associate_t);
+	qreply(q, mp);
+}
+
+/*
+ * Handle the M_IOCDATA that answers the copyout issued by vnd_striocdata().
+ * The vnd_strioc_t carried in cp_private is freed unconditionally; the ioctl
+ * is then acknowledged, unless the copyout failed or the message turns out
+ * not to be for VND_STRIOC_ASSOCIATE.
+ */
+static void
+vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ ASSERT(csp->cp_private != NULL);
+ /* Nothing below needs the private state any more. */
+ kmem_free(csp->cp_private, sizeof (vnd_strioc_t));
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+ }
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYOUT failed");
+ return;
+ }
+
+ /* Ack and let's be done with it all */
+ miocack(q, mp, 0, 0);
+}
+
+/*
+ * Write-side put procedure. M_IOCTL messages go to vnd_strioctl(). M_IOCDATA
+ * messages are routed by the state saved in their private vnd_strioc_t:
+ * VSS_COPYIN to vnd_striocdata() and anything else to vnd_stroutdata().
+ * Every other message type is passed to the next queue if there is one and
+ * dropped otherwise. Always returns 0.
+ */
+static int
+vnd_s_wput(queue_t *q, mblk_t *mp)
+{
+ vnd_str_t *vsp = q->q_ptr;
+ struct copyresp *crp;
+ vnd_strioc_state_t vstate;
+ vnd_strioc_t *visp;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr);
+ return (0);
+ case M_IOCDATA:
+ crp = (struct copyresp *)mp->b_rptr;
+ /*
+ * NOTE(review): cp_private is treated as one of our vnd_strioc_t
+ * structures before cp_cmd has been checked -- confirm that no other
+ * M_IOCDATA can reach this queue.
+ */
+ ASSERT(crp->cp_private != NULL);
+ visp = (vnd_strioc_t *)crp->cp_private;
+ vstate = visp->vs_state;
+ ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT);
+ if (vstate == VSS_COPYIN)
+ vnd_striocdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ else
+ vnd_stroutdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ return (0);
+ default:
+ break;
+ }
+ if (q->q_next != NULL)
+ putnext(q, mp);
+ else
+ vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput");
+
+ return (0);
+}
+
+ /*
+  * STREAMS module open entry point: allocate and initialize the per-stream
+  * vnd_str_t and hang it off both halves of the queue pair.
+  *
+  * Returns EINVAL if the queue already carries private data, ENXIO if this is
+  * not a module open (MODOPEN), EPERM if the credential is not kcred, and 0 on
+  * success.
+  */
+ static int
+ vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp)
+ {
+ 	vnd_str_t *vsp;
+ 	vnd_data_queue_t *rdq, *wdq;
+ 	uint_t seed;
+
+ 	if (q->q_ptr != NULL)
+ 		return (EINVAL);
+
+ 	if ((sflag & MODOPEN) == 0)
+ 		return (ENXIO);
+
+ 	if (credp != kcred)
+ 		return (EPERM);
+
+ 	vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP);
+ 	bzero(vsp, sizeof (*vsp));
+ 	mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL);
+ 	cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL);
+ 	cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL);
+ 	cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL);
+ 	vsp->vns_state = VNS_S_INITIAL;
+
+ 	rdq = &vsp->vns_dq_read;
+ 	wdq = &vsp->vns_dq_write;
+ 	mutex_init(&rdq->vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ 	mutex_init(&wdq->vdq_lock, NULL, MUTEX_DRIVER, NULL);
+
+ 	/* The default data queue size is read while holding vnd_dev_lock. */
+ 	mutex_enter(&vnd_dev_lock);
+ 	rdq->vdq_max = vnd_vdq_default_size;
+ 	rdq->vdq_vns = vsp;
+ 	wdq->vdq_max = vnd_vdq_default_size;
+ 	wdq->vdq_vns = vsp;
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	vsp->vns_rq = q;
+ 	vsp->vns_wq = WR(q);
+ 	WR(q)->q_ptr = vsp;
+ 	q->q_ptr = vsp;
+ 	vsp->vns_flags = VNS_F_NEED_ZONE;
+ 	vsp->vns_nflush = vnd_flush_nburst;
+ 	vsp->vns_bsize = vnd_flush_burst_size;
+
+ 	/* Pick an squeue from the set using a pseudo-random index. */
+ 	(void) random_get_pseudo_bytes((uint8_t *)&seed, sizeof (seed));
+ 	vsp->vns_squeue = gsqueue_set_get(vnd_sqset, seed);
+
+ 	/*
+ 	 * Initialize every named kstat field now. The kstat itself is not
+ 	 * installed until the zone association happens, so that everything is
+ 	 * available at that point.
+ 	 */
+ 	kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname",
+ 	    KSTAT_DATA_STRING);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename",
+ 	    KSTAT_DATA_STRING);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms,
+ 	    "flowcontrol_100ms", KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s",
+ 	    KSTAT_DATA_UINT64);
+ 	kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s",
+ 	    KSTAT_DATA_UINT64);
+
+ 	qprocson(q);
+
+ 	/*
+ 	 * With qprocson done, remember the read side of the module below us
+ 	 * so we can later make sure there are no pass through modules.
+ 	 */
+ 	vsp->vns_lrq = RD(vsp->vns_wq->q_next);
+
+ 	return (0);
+ }
+
+ /*
+  * STREAMS module close entry point: shut the stream down and free the
+  * vnd_str_t. Always returns 0.
+  */
+ static int
+ vnd_s_close(queue_t *q, int flag, cred_t *credp)
+ {
+ 	vnd_str_t *vsp = q->q_ptr;
+ 	mblk_t *ctl_mp;
+
+ 	VERIFY(WR(q)->q_next != NULL);
+ 	ASSERT(vsp != NULL);
+
+ 	/*
+ 	 * Tearing down means emptying our input and output buffers, draining
+ 	 * the current squeue, and disabling the fast path, with the squeue
+ 	 * drained before the fast path goes away. Since this is streams
+ 	 * close, userland can no longer send us packets, but we can still
+ 	 * receive them. The order is therefore exactly:
+ 	 *
+ 	 *  1) flush vns_dq_read
+ 	 *  2) insert the drain mblk
+ 	 *  3) once it has been received, tear down the fast path by kicking
+ 	 *     off the state machine
+ 	 *  4) one last flush of both vns_dq_read and vns_dq_write
+ 	 */
+ 	vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ 	vnd_strbarrier(vsp);
+
+ 	mutex_enter(&vsp->vns_lock);
+ 	vsp->vns_state = VNS_S_SHUTTING_DOWN;
+ 	if ((vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) == 0) {
+ 		vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ 		taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp,
+ 		    0, &vsp->vns_tqe);
+ 	}
+ 	/* Wait for the state machine to reach VNS_S_ZOMBIE. */
+ 	while (vsp->vns_state != VNS_S_ZOMBIE)
+ 		cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ 	mutex_exit(&vsp->vns_lock);
+
+ 	qprocsoff(q);
+
+ 	mutex_enter(&vsp->vns_lock);
+ 	vsp->vns_flags |= VNS_F_CONDEMNED;
+ 	/* Wait until no taskq dispatch is outstanding. */
+ 	while ((vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) != 0)
+ 		cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock);
+
+ 	/* Drop whatever control messages are still queued. */
+ 	for (ctl_mp = vnd_dlpi_inc_pop(vsp); ctl_mp != NULL;
+ 	    ctl_mp = vnd_dlpi_inc_pop(vsp))
+ 		vnd_drop_ctl(vsp, ctl_mp, "vnd_s_close");
+ 	mutex_exit(&vsp->vns_lock);
+
+ 	q->q_ptr = NULL;
+ 	vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ 	vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out);
+ 	mutex_destroy(&vsp->vns_dq_read.vdq_lock);
+ 	mutex_destroy(&vsp->vns_dq_write.vdq_lock);
+
+ 	if (vsp->vns_kstat != NULL)
+ 		kstat_delete(vsp->vns_kstat);
+
+ 	mutex_destroy(&vsp->vns_lock);
+ 	cv_destroy(&vsp->vns_stcv);
+ 	cv_destroy(&vsp->vns_barriercv);
+ 	cv_destroy(&vsp->vns_cancelcv);
+ 	kmem_cache_free(vnd_str_cache, vsp);
+
+ 	return (0);
+ }
+
+ /*
+  * Transmit a single message through the stream's tx function.
+  *
+  * The output packet and byte counters are bumped before the message is handed
+  * off. If the tx function returns a non-NULL cookie, the stream is marked flow
+  * controlled, the cookie is saved, and the time is latched, all under
+  * vns_lock. The cookie is returned to the caller either way.
+  */
+ static vnd_mac_cookie_t
+ vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp)
+ {
+ 	vnd_mac_cookie_t cookie;
+
+ 	VND_STAT_INC(vsp, vks_opackets, 1);
+ 	VND_STAT_INC(vsp, vks_obytes, msgsize(mp));
+ 	DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL,
+ 	    vnd_str_t *, vsp, mblk_t *, mp);
+
+ 	/* Actually tx now */
+ 	cookie = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl,
+ 	    mp, 0, MAC_DROP_ON_NO_DESC);
+ 	if (cookie == NULL)
+ 		return (cookie);
+
+ 	mutex_enter(&vsp->vns_lock);
+ 	vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
+ 	vsp->vns_caps.vsc_fc_cookie = cookie;
+ 	vsp->vns_fclatch = gethrtime();
+ 	DTRACE_VND3(flow__blocked, vnd_str_t *, vsp,
+ 	    uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+ 	mutex_exit(&vsp->vns_lock);
+
+ 	return (cookie);
+ }
+
+ /*
+  * Drain up to one burst of messages from the write data queue.
+  *
+  * arg is the vnd_str_t. drain_mp is the stream's drain block when we are
+  * entered through the squeue, and NULL when called directly from
+  * vnd_squeue_tx_append().
+  */
+ static void
+ vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
+ {
+ 	vnd_str_t *vsp = arg;
+ 	vnd_data_queue_t *vqp = &vsp->vns_dq_write;
+ 	mblk_t *mp;
+ 	int nsent = 0;
+ 	size_t nbytes = 0;
+ 	size_t nflush, bsize;
+ 	boolean_t blocked = B_FALSE;
+ 	boolean_t empty;
+
+ 	mutex_enter(&vsp->vns_lock);
+
+ 	/*
+ 	 * Arriving via the squeue means the drain block was in use; record
+ 	 * that it no longer has an active user.
+ 	 */
+ 	if (drain_mp != NULL) {
+ 		VERIFY(drain_mp == &vsp->vns_drainblk);
+ 		VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED);
+ 		vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED;
+ 	}
+
+ 	/*
+ 	 * Nothing to do while we are still flow controlled or under a flush
+ 	 * barrier.
+ 	 */
+ 	if ((vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) != 0) {
+ 		mutex_exit(&vsp->vns_lock);
+ 		return;
+ 	}
+
+ 	nflush = vsp->vns_nflush;
+ 	bsize = vsp->vns_bsize;
+ 	mutex_exit(&vsp->vns_lock);
+
+ 	/* Send until the burst limits are hit, the queue empties, or tx blocks. */
+ 	while (nsent < nflush && nbytes <= bsize) {
+ 		mutex_enter(&vqp->vdq_lock);
+ 		if (vnd_dq_pop(vqp, &mp) == 0) {
+ 			mutex_exit(&vqp->vdq_lock);
+ 			break;
+ 		}
+ 		mutex_exit(&vqp->vdq_lock);
+
+ 		nsent++;
+ 		nbytes += msgsize(mp);
+ 		if (vnd_squeue_tx_one(vsp, mp) != NULL) {
+ 			blocked = B_TRUE;
+ 			break;
+ 		}
+ 	}
+
+ 	empty = vnd_dq_is_empty(vqp);
+
+ 	/*
+ 	 * Data remains, we are not blocked, and no drain is scheduled: put
+ 	 * the drain block into the squeue with GSQUEUE_FILL.
+ 	 */
+ 	if (!blocked && !empty) {
+ 		mutex_enter(&vsp->vns_lock);
+ 		if ((vsp->vns_flags & VNS_F_DRAIN_SCHEDULED) == 0) {
+ 			vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ 			gsqueue_enter_one(vsp->vns_squeue,
+ 			    &vsp->vns_drainblk, vnd_squeue_tx_drain, vsp,
+ 			    GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
+ 		}
+ 		mutex_exit(&vsp->vns_lock);
+ 	}
+
+ 	/* Anything drained means waiters on the data queue must be signalled. */
+ 	if (nsent > 0) {
+ 		cv_broadcast(&vqp->vdq_ready);
+ 		pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT);
+ 	}
+ }
+
+ /*
+  * Squeue callback that appends one outbound message to the write data queue
+  * and then runs the drain. arg is the vnd_str_t.
+  */
+ static void
+ vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
+ {
+ 	vnd_str_t *vsp = arg;
+ 	vnd_pnsd_t *nsp = vsp->vns_nsd;
+ 	vnd_data_queue_t *wdq = &vsp->vns_dq_write;
+
+ 	/*
+ 	 * Run the packet through the firewall rules before appending it. A
+ 	 * non-zero result from vnd_hook() means there is nothing to append.
+ 	 */
+ 	if (nsp->vpnd_hooked) {
+ 		if (vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ 		    nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4,
+ 		    nsp->vpnd_neti_v6, nsp->vpnd_event_out_v6,
+ 		    nsp->vpnd_token_out_v6, vnd_drop_hook_out,
+ 		    vnd_drop_out) != 0)
+ 			return;
+ 	}
+
+ 	/*
+ 	 * Space for this packet was reserved earlier, so simply append it and
+ 	 * call drain. The squeue guarantees that no other drain is running.
+ 	 */
+ 	mutex_enter(&wdq->vdq_lock);
+ 	(void) vnd_dq_push(wdq, mp, B_TRUE, vnd_drop_panic);
+ 	mutex_exit(&wdq->vdq_lock);
+
+ 	vnd_squeue_tx_drain(vsp, NULL, NULL, NULL);
+ }
+
+/*
+ * We need to see if this is a valid name of sorts for us. That means a few
+ * things. First off, we can't assume that what we've been given has actually
+ * been null terminated. More importantly, that it's a valid name as far as
+ * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We
+ * further constrain ourselves to simply alphanumeric characters and a few
+ * additional ones, ':', '-', and '_'.
+ */
+static int
+vnd_validate_name(const char *buf, size_t buflen)
+{
+ int i, len;
+
+ /* First make sure a null terminator exists */
+ for (i = 0; i < buflen; i++)
+ if (buf[i] == '\0')
+ break;
+ len = i;
+ if (i == 0 || i == buflen)
+ return (0);
+
+ for (i = 0; i < len; i++)
+ if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' &&
+ buf[i] != '_')
+ return (0);
+
+ return (1);
+}
+
+ /*
+  * VND_IOC_ATTACH: attach this vnd device to the datalink named in the
+  * caller's vnd_ioc_attach_t.
+  *
+  * The caller must pass both the net_config and net_rawaccess policy checks.
+  * On failure the request structure is copied back with via_errno describing
+  * the problem. Returns 0 on success, EPERM or EFAULT for policy and copy
+  * failures, and otherwise EIO or the error returned by the LDI calls.
+  */
+ static int
+ vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag)
+ {
+ 	vnd_ioc_attach_t via;
+ 	vnd_strioc_associate_t vss;
+ 	vnd_pnsd_t *nsp;
+ 	zone_t *zonep;
+ 	zoneid_t zid;
+ 	char path[2*VND_NAMELEN];
+ 	size_t pathlen;
+ 	int ret, rval;
+
+ 	if (secpolicy_net_config(credp, B_FALSE) != 0 ||
+ 	    secpolicy_net_rawaccess(credp) != 0)
+ 		return (EPERM);
+
+ 	if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0)
+ 		return (EFAULT);
+ 	via.via_errno = VND_E_SUCCESS;
+
+ 	/* Every failure before the LDI open is reported as EIO. */
+ 	ret = EIO;
+
+ 	if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) {
+ 		via.via_errno = VND_E_BADNAME;
+ 		goto errcopyout;
+ 	}
+
+ 	/*
+ 	 * Only the global zone can request to create a device in a different
+ 	 * zone.
+ 	 */
+ 	zid = crgetzoneid(credp);
+ 	if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 &&
+ 	    zid != via.via_zoneid) {
+ 		via.via_errno = VND_E_PERM;
+ 		goto errcopyout;
+ 	}
+
+ 	/* A zone id of -1 means "the caller's own zone". */
+ 	if (via.via_zoneid == -1)
+ 		via.via_zoneid = zid;
+
+ 	/*
+ 	 * Build the path of the device we will open. We want to be extra
+ 	 * paranoid about what we open, so a path that does not fit in the
+ 	 * buffer is rejected.
+ 	 */
+ 	if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) {
+ 		zonep = zone_find_by_id(via.via_zoneid);
+ 		if (zonep == NULL) {
+ 			via.via_errno = VND_E_NOZONE;
+ 			goto errcopyout;
+ 		}
+ 		pathlen = snprintf(path, sizeof (path), "/dev/net/zone/%s/%s",
+ 		    zonep->zone_name, via.via_name);
+ 		zone_rele(zonep);
+ 	} else {
+ 		pathlen = snprintf(path, sizeof (path), "/dev/net/%s",
+ 		    via.via_name);
+ 	}
+ 	if (pathlen >= sizeof (path)) {
+ 		via.via_errno = VND_E_BADNAME;
+ 		goto errcopyout;
+ 	}
+
+ 	/*
+ 	 * If our zone is dying then the netstack will have been removed from
+ 	 * this list.
+ 	 */
+ 	nsp = vnd_nsd_lookup_by_zid(via.via_zoneid);
+ 	if (nsp == NULL) {
+ 		via.via_errno = VND_E_NOZONE;
+ 		goto errcopyout;
+ 	}
+
+ 	/*
+ 	 * Note we set the attached handle even though we haven't actually
+ 	 * finished the process of attaching the ldi handle.
+ 	 */
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if ((vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) != 0) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vnd_nsd_rele(nsp);
+ 		via.via_errno = VND_E_ATTACHED;
+ 		goto errcopyout;
+ 	}
+ 	vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
+ 	ASSERT(vdp->vdd_cr == NULL);
+ 	crhold(credp);
+ 	vdp->vdd_cr = credp;
+ 	ASSERT(vdp->vdd_nsd == NULL);
+ 	vdp->vdd_nsd = nsp;
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	/*
+ 	 * Take a second hold on the vnd_pnsd_t for the rest of the work. This
+ 	 * is the hold that is kept for as long as the device is attached.
+ 	 */
+ 	vnd_nsd_ref(nsp);
+
+ 	ret = ldi_open_by_name(path, FREAD | FWRITE, vdp->vdd_cr,
+ 	    &vdp->vdd_ldih, vdp->vdd_ldiid);
+ 	if (ret != 0) {
+ 		if (ret == ENODEV)
+ 			via.via_errno = VND_E_NODATALINK;
+ 		goto err;
+ 	}
+
+ 	/*
+ 	 * Unfortunately the I_PUSH interface doesn't allow us a way to detect
+ 	 * whether or not we're coming in from a layered device. We really want
+ 	 * to make sure that a normal user can't push on our streams module.
+ 	 * Currently the only idea I have for this is to make sure that the
+ 	 * credp is kcred which is really terrible.
+ 	 */
+ 	ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL,
+ 	    kcred, &rval);
+ 	if (ret != 0) {
+ 		rval = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ 		VERIFY(rval == 0);
+ 		via.via_errno = VND_E_STRINIT;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+
+ 	vss.vsa_minor = vdp->vdd_minor;
+ 	vss.vsa_nsid = nsp->vpnd_nsid;
+
+ 	ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss,
+ 	    FKIOCTL, kcred, &rval);
+ 	if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) {
+ 		rval = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ 		VERIFY(rval == 0);
+ 		/* The ioctl itself worked, so report the module's error. */
+ 		if (ret == 0) {
+ 			via.via_errno = vss.vsa_errno;
+ 			ret = EIO;
+ 		}
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+
+ 	/*
+ 	 * There's a chance that our netstack was condemned while we've had a
+ 	 * hold on it. As such we need to check and if so, error out.
+ 	 */
+ 	if ((vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) != 0) {
+ 		mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ 		rval = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ 		VERIFY(rval == 0);
+ 		ret = EIO;
+ 		via.via_errno = VND_E_NOZONE;
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	VERIFY(vdp->vdd_str != NULL);
+ 	vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ 	vdp->vdd_flags |= VND_D_ATTACHED;
+ 	(void) strlcpy(vdp->vdd_datalink, via.via_name,
+ 	    sizeof (vdp->vdd_datalink));
+ 	list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+ 	mutex_exit(&vdp->vdd_lock);
+ 	mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+
+ 	/* Drop the hold taken by the lookup; the attach hold remains. */
+ 	vnd_nsd_rele(nsp);
+
+ 	return (0);
+
+ err:
+ 	mutex_enter(&vdp->vdd_lock);
+ 	vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ 	crfree(vdp->vdd_cr);
+ 	vdp->vdd_cr = NULL;
+ 	vdp->vdd_nsd = NULL;
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	/*
+ 	 * We have two holds to drop here. One for our original reference and
+ 	 * one for the hold this operation would have represented.
+ 	 */
+ 	vnd_nsd_rele(nsp);
+ 	vnd_nsd_rele(nsp);
+ errcopyout:
+ 	if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0)
+ 		ret = EFAULT;
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_LINK: give an attached vnd device a name by creating the minor
+  * node "z<zoneid>:<name>".
+  *
+  * The caller must pass the net_config policy check. The name must pass
+  * vnd_validate_name(), must not be one of vnd_reserved_names, and must not
+  * already be in use by a linked device in the same zone. On failure the
+  * request is copied back with vil_errno set and EIO is returned (EFAULT if
+  * that copyout fails). On success 0 is returned and an extra reference is
+  * taken on the device.
+  */
+ static int
+ vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+ {
+ 	int ret = 0;
+ 	vnd_ioc_link_t vil;
+ 	char mname[2*VND_NAMELEN];
+ 	char **c;
+ 	vnd_dev_t *v;
+ 	zoneid_t zid;
+
+ 	/* Not anyone can link something */
+ 	if (secpolicy_net_config(credp, B_FALSE) != 0)
+ 		return (EPERM);
+
+ 	if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) {
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_BADNAME;
+ 		goto errcopyout;
+ 	}
+
+ 	for (c = vnd_reserved_names; *c != NULL; c++) {
+ 		if (strcmp(vil.vil_name, *c) == 0) {
+ 			ret = EIO;
+ 			vil.vil_errno = VND_E_BADNAME;
+ 			goto errcopyout;
+ 		}
+ 	}
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_NOTATTACHED;
+ 		goto errcopyout;
+ 	}
+
+ 	if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_NOZONE;
+ 		goto errcopyout;
+ 	}
+
+ 	if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_LINKED;
+ 		goto errcopyout;
+ 	}
+ 	vdp->vdd_flags |= VND_D_LINK_INFLIGHT;
+ 	zid = vdp->vdd_nsd->vpnd_zid;
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	/*
+ 	 * VND_D_LINK_INFLIGHT is set from here on, so every failure has to
+ 	 * leave through the error label, which clears it. Leaving through
+ 	 * errcopyout would keep the flag set and make every later link
+ 	 * attempt on this device fail with VND_E_LINKED.
+ 	 */
+ 	if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >=
+ 	    sizeof (mname)) {
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_BADNAME;
+ 		goto error;
+ 	}
+
+ 	mutex_enter(&vnd_dev_lock);
+ 	for (v = list_head(&vnd_dev_list); v != NULL;
+ 	    v = list_next(&vnd_dev_list, v)) {
+ 		if (!(v->vdd_flags & VND_D_LINKED))
+ 			continue;
+
+ 		if (v->vdd_nsd->vpnd_zid == zid &&
+ 		    strcmp(v->vdd_lname, vil.vil_name) == 0) {
+ 			mutex_exit(&vnd_dev_lock);
+ 			ret = EIO;
+ 			vil.vil_errno = VND_E_LINKEXISTS;
+ 			goto error;
+ 		}
+ 	}
+
+ 	/*
+ 	 * We set the name and mark ourselves linked while holding the list
+ 	 * lock to ensure that no other user can mistakenly find our name.
+ 	 */
+ 	(void) snprintf(mname, sizeof (mname), "z%d:%s", zid,
+ 	    vil.vil_name);
+ 	mutex_enter(&vdp->vdd_lock);
+
+ 	/*
+ 	 * Because we dropped our lock, we need to double check whether or not
+ 	 * the zone was marked as dying while we were here. If it hasn't, then
+ 	 * it's safe for us to link it in.
+ 	 */
+ 	if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		mutex_exit(&vnd_dev_lock);
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_NOZONE;
+ 		goto error;
+ 	}
+
+ 	(void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname));
+ 	if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor,
+ 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ 		ret = EIO;
+ 		vil.vil_errno = VND_E_MINORNODE;
+ 	} else {
+ 		vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ 		vdp->vdd_flags |= VND_D_LINKED;
+ 		kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ 		    vdp->vdd_lname);
+ 		ret = 0;
+ 	}
+ 	mutex_exit(&vdp->vdd_lock);
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	if (ret == 0) {
+ 		/*
+ 		 * Add a reference to represent that this device is linked into
+ 		 * the file system name space to ensure that it doesn't
+ 		 * disappear.
+ 		 */
+ 		vnd_dev_ref(vdp);
+ 		return (0);
+ 	}
+
+ error:
+ 	/* Undo the in-flight marker and any name we recorded. */
+ 	mutex_enter(&vdp->vdd_lock);
+ 	vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ 	vdp->vdd_lname[0] = '\0';
+ 	mutex_exit(&vdp->vdd_lock);
+
+ errcopyout:
+ 	if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0)
+ 		ret = EFAULT;
+ 	return (ret);
+ }
+
+/*
+ * Common unlink function. This is used both from the ioctl path and from the
+ * netstack shutdown path. The caller is required to hold the mutex on the
+ * vnd_dev_t, but they basically will have it relinquished for them. The only
+ * thing the caller is allowed to do afterward is to potentially rele the
+ * vnd_dev_t if they have their own hold. Note that only the ioctl path has its
+ * own hold.
+ */
+static void
+vnd_dev_unlink(vnd_dev_t *vdp)
+{
+ char mname[2*VND_NAMELEN];
+
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ (void) snprintf(mname, sizeof (mname), "z%d:%s",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname);
+ ddi_remove_minor_node(vnd_dip, mname);
+ vdp->vdd_lname[0] = '\0';
+ vdp->vdd_flags &= ~VND_D_LINKED;
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * This rele corresponds to the reference that we took in
+ * vnd_ioctl_link.
+ */
+ vnd_dev_rele(vdp);
+}
+
+ /*
+  * VND_IOC_UNLINK: remove the link name from a linked vnd device.
+  *
+  * The caller must pass the net_config policy check and be either in the
+  * global zone or in the zone the device belongs to. The request is always
+  * copied back with viu_errno set. Returns 0 on success, EIO when the device is
+  * not linked or the zone check fails, EPERM for a policy failure, and EFAULT
+  * when a copy fails.
+  */
+ static int
+ vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+ {
+ 	vnd_ioc_unlink_t viu;
+ 	zoneid_t zid;
+ 	int ret = 0;
+
+ 	/* Not anyone can unlink something */
+ 	if (secpolicy_net_config(credp, B_FALSE) != 0)
+ 		return (EPERM);
+
+ 	zid = crgetzoneid(credp);
+
+ 	if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	viu.viu_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if ((vdp->vdd_flags & VND_D_LINKED) == 0) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		viu.viu_errno = VND_E_NOTLINKED;
+ 		ret = EIO;
+ 	} else {
+ 		VERIFY(vdp->vdd_flags & VND_D_ATTACHED);
+
+ 		if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+ 			mutex_exit(&vdp->vdd_lock);
+ 			viu.viu_errno = VND_E_PERM;
+ 			ret = EIO;
+ 		} else {
+ 			/* vnd_dev_unlink releases the vdp mutex for us */
+ 			vnd_dev_unlink(vdp);
+ 		}
+ 	}
+
+ 	if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_SETRXBUF: set the maximum size of the read data queue.
+  *
+  * The requested size may not exceed vnd_vdq_hard_max and may not be smaller
+  * than the stream's vns_minwrite, and the device must be attached. The
+  * request is copied back with vib_errno set. Returns 0 on success, EIO when
+  * a check fails, and EFAULT when a copy fails.
+  */
+ static int
+ vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+ {
+ 	int ret;
+ 	vnd_ioc_buf_t vib;
+
+ 	if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	/*
+ 	 * Start from a clean error code so that a successful call does not
+ 	 * echo back whatever vib_errno the caller happened to pass in.
+ 	 */
+ 	vib.vib_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vnd_dev_lock);
+ 	if (vib.vib_size > vnd_vdq_hard_max) {
+ 		mutex_exit(&vnd_dev_lock);
+ 		vib.vib_errno = VND_E_BUFTOOBIG;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_NOTATTACHED;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_str->vns_lock);
+ 	if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ 		mutex_exit(&vdp->vdd_str->vns_lock);
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_BUFTOOSMALL;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+ 	mutex_exit(&vdp->vdd_str->vns_lock);
+
+ 	mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ 	vdp->vdd_str->vns_dq_read.vdq_max = vib.vib_size;
+ 	mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ 	mutex_exit(&vdp->vdd_lock);
+ 	ret = 0;
+
+ err:
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_GETRXBUF: report the maximum size of the read data queue.
+  *
+  * A vnd_ioc_buf_t is copied out with vib_size filled in on success, or with
+  * vib_errno set to VND_E_NOTATTACHED when the device is not attached.
+  * Returns 0 on success, EIO when not attached, and EFAULT when the copyout
+  * fails.
+  */
+ static int
+ vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+ {
+ 	int ret;
+ 	vnd_ioc_buf_t vib;
+
+ 	/*
+ 	 * Nothing is copied in, so zero the whole structure first. Otherwise
+ 	 * the fields we do not set (and any padding) would hand uninitialized
+ 	 * kernel stack to the caller.
+ 	 */
+ 	bzero(&vib, sizeof (vib));
+ 	vib.vib_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_NOTATTACHED;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ 	vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max;
+ 	mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ 	mutex_exit(&vdp->vdd_lock);
+ 	ret = 0;
+
+ err:
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_GETMAXBUF: report vnd_vdq_hard_max, the upper bound enforced on
+  * data queue sizes. Returns 0 on success and EFAULT when the copyout fails.
+  */
+ static int
+ vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+ {
+ 	vnd_ioc_buf_t vib;
+
+ 	/*
+ 	 * Nothing is copied in, so zero the structure before filling it;
+ 	 * otherwise vib_errno and any padding would leak uninitialized
+ 	 * kernel stack to the caller.
+ 	 */
+ 	bzero(&vib, sizeof (vib));
+ 	vib.vib_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vnd_dev_lock);
+ 	vib.vib_size = vnd_vdq_hard_max;
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (0);
+ }
+
+ /*
+  * VND_IOC_GETTXBUF: report the maximum size of the write data queue.
+  *
+  * A vnd_ioc_buf_t is copied out with vib_size filled in on success, or with
+  * vib_errno set to VND_E_NOTATTACHED when the device is not attached.
+  * Returns 0 on success, EIO when not attached, and EFAULT when the copyout
+  * fails.
+  */
+ static int
+ vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+ {
+ 	int ret;
+ 	vnd_ioc_buf_t vib;
+
+ 	/*
+ 	 * Nothing is copied in, so zero the whole structure first. Otherwise
+ 	 * the fields we do not set (and any padding) would hand uninitialized
+ 	 * kernel stack to the caller.
+ 	 */
+ 	bzero(&vib, sizeof (vib));
+ 	vib.vib_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_NOTATTACHED;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ 	vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max;
+ 	mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ 	mutex_exit(&vdp->vdd_lock);
+ 	ret = 0;
+
+ err:
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_SETTXBUF: set the maximum size of the write data queue.
+  *
+  * The requested size may not exceed vnd_vdq_hard_max and may not be smaller
+  * than the stream's vns_minwrite, and the device must be attached. The
+  * request is copied back with vib_errno set. Returns 0 on success, EIO when
+  * a check fails, and EFAULT when a copy fails.
+  */
+ static int
+ vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+ {
+ 	int ret;
+ 	vnd_ioc_buf_t vib;
+
+ 	if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	/*
+ 	 * Start from a clean error code so that a successful call does not
+ 	 * echo back whatever vib_errno the caller happened to pass in.
+ 	 */
+ 	vib.vib_errno = VND_E_SUCCESS;
+
+ 	mutex_enter(&vnd_dev_lock);
+ 	if (vib.vib_size > vnd_vdq_hard_max) {
+ 		mutex_exit(&vnd_dev_lock);
+ 		vib.vib_errno = VND_E_BUFTOOBIG;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_NOTATTACHED;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+
+ 	mutex_enter(&vdp->vdd_str->vns_lock);
+ 	if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ 		mutex_exit(&vdp->vdd_str->vns_lock);
+ 		mutex_exit(&vdp->vdd_lock);
+ 		vib.vib_errno = VND_E_BUFTOOSMALL;
+ 		ret = EIO;
+ 		goto err;
+ 	}
+ 	mutex_exit(&vdp->vdd_str->vns_lock);
+
+ 	mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ 	vdp->vdd_str->vns_dq_write.vdq_max = vib.vib_size;
+ 	mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ 	mutex_exit(&vdp->vdd_lock);
+ 	ret = 0;
+
+ err:
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ 		return (EFAULT);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_GETMINTU / VND_IOC_GETMAXTU: report the stream's vns_minwrite (when
+  * min is B_TRUE) or vns_maxwrite (otherwise).
+  *
+  * When the device is not attached, vib_errno is set to VND_E_NOTATTACHED and
+  * vib_size is reported as zero. Returns 0 in both cases, and EFAULT when the
+  * copyout fails.
+  */
+ static int
+ vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min)
+ {
+ 	vnd_ioc_buf_t vib;
+
+ 	/*
+ 	 * Zero the whole structure: this leaves vib_errno at 0 and makes sure
+ 	 * vib_size (on the unattached path) and any padding never carry
+ 	 * uninitialized kernel stack out to the caller.
+ 	 */
+ 	bzero(&vib, sizeof (vib));
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (vdp->vdd_flags & VND_D_ATTACHED) {
+ 		mutex_enter(&vdp->vdd_str->vns_lock);
+ 		if (min == B_TRUE)
+ 			vib.vib_size = vdp->vdd_str->vns_minwrite;
+ 		else
+ 			vib.vib_size = vdp->vdd_str->vns_maxwrite;
+ 		mutex_exit(&vdp->vdd_str->vns_lock);
+ 	} else {
+ 		vib.vib_errno = VND_E_NOTATTACHED;
+ 	}
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	return (0);
+ }
+
+ /*
+  * VND_IOC_FRAMEIO_READ: copy queued inbound frames out to the caller's
+  * frameio_t.
+  *
+  * Returns EAGAIN if a frameio_t cannot be allocated, ENXIO if the device is
+  * not attached, EWOULDBLOCK if the queue is empty and the descriptor is
+  * non-blocking, EINTR if the wait for data is interrupted, and otherwise the
+  * result of the frameio copy routines (0 on success).
+  */
+ static int
+ vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode)
+ {
+ 	frameio_t *fio;
+ 	vnd_data_queue_t *vqp;
+ 	mblk_t *mp;
+ 	int ret, nonblock, nframes;
+
+ 	fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ 	if (fio == NULL)
+ 		return (EAGAIN);
+
+ 	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr,
+ 	    mode);
+ 	if (ret != 0) {
+ 		frameio_free(fio);
+ 		return (ret);
+ 	}
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if ((vdp->vdd_flags & VND_D_ATTACHED) == 0) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		frameio_free(fio);
+ 		return (ENXIO);
+ 	}
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ 	vqp = &vdp->vdd_str->vns_dq_read;
+ 	mutex_enter(&vqp->vdq_lock);
+
+ 	/* Nothing queued: fail fast when non-blocking, otherwise wait. */
+ 	while (vqp->vdq_cur == 0) {
+ 		if (nonblock != 0) {
+ 			mutex_exit(&vqp->vdq_lock);
+ 			frameio_free(fio);
+ 			return (EWOULDBLOCK);
+ 		}
+ 		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ 			mutex_exit(&vqp->vdq_lock);
+ 			frameio_free(fio);
+ 			return (EINTR);
+ 		}
+ 	}
+
+ 	/*
+ 	 * Copy the frame data out, then the updated header. Messages are only
+ 	 * removed from the queue once both copies have succeeded.
+ 	 */
+ 	ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+ 	    &nframes, mode & FKIOCTL);
+ 	if (ret == 0)
+ 		ret = frameio_hdr_copyout(fio, nframes, (void *)addr, mode);
+
+ 	if (ret == 0) {
+ 		for (; nframes > 0; nframes--) {
+ 			(void) vnd_dq_pop(vqp, &mp);
+ 			freemsg(mp);
+ 		}
+ 	}
+
+ 	mutex_exit(&vqp->vdq_lock);
+ 	frameio_free(fio);
+
+ 	return (ret);
+ }
+
+ /*
+  * VND_IOC_FRAMEIO_WRITE: copy frames in from the caller's frameio_t and queue
+  * them for transmit.
+  *
+  * Returns EAGAIN if a frameio_t cannot be allocated or if space cannot be
+  * reserved on a non-blocking descriptor, ENXIO if the device is not attached,
+  * EINVAL if the header describes zero vectors per frame, ERANGE if any frame
+  * is shorter than vns_minwrite or longer than vns_maxwrite, EINTR if the wait
+  * for space is interrupted, and otherwise the result of the frameio copy
+  * routines (0 on success).
+  */
+ static int
+ vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+ {
+ 	frameio_t *fio;
+ 	int ret, nonblock, nframes, i, nread;
+ 	size_t maxwrite, minwrite, total, flen;
+ 	mblk_t *mp_chain, *mp, *nmp;
+ 	vnd_data_queue_t *vqp;
+
+ 	fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ 	if (fio == NULL)
+ 		return (EAGAIN);
+
+ 	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+ 	if (ret != 0) {
+ 		frameio_free(fio);
+ 		return (ret);
+ 	}
+
+ 	mutex_enter(&vdp->vdd_lock);
+ 	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ 		mutex_exit(&vdp->vdd_lock);
+ 		frameio_free(fio);
+ 		return (ENXIO);
+ 	}
+ 	mutex_exit(&vdp->vdd_lock);
+
+ 	nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ 	/*
+ 	 * Make sure no single frame is larger than we can accept.
+ 	 */
+ 	mutex_enter(&vdp->vdd_str->vns_lock);
+ 	minwrite = vdp->vdd_str->vns_minwrite;
+ 	maxwrite = vdp->vdd_str->vns_maxwrite;
+ 	mutex_exit(&vdp->vdd_str->vns_lock);
+
+ 	/* Guard the division below against a zero vectors-per-frame count. */
+ 	if (fio->fio_nvpf == 0) {
+ 		frameio_free(fio);
+ 		return (EINVAL);
+ 	}
+
+ 	/*
+ 	 * Frame i starts at vector i * fio_nvpf, so the number of frames is
+ 	 * the total vector count divided by the vectors per frame. Every
+ 	 * frame must be checked and counted, or the reservation below would
+ 	 * be too small.
+ 	 */
+ 	nframes = fio->fio_nvecs / fio->fio_nvpf;
+ 	total = 0;
+ 	for (i = 0; i < nframes; i++) {
+ 		flen = frameio_frame_length(fio,
+ 		    &fio->fio_vecs[i*fio->fio_nvpf]);
+ 		if (flen < minwrite || flen > maxwrite) {
+ 			frameio_free(fio);
+ 			return (ERANGE);
+ 		}
+ 		total += flen;
+ 	}
+
+ 	/* Reserve room for all of the frames, waiting if we are allowed to. */
+ 	vqp = &vdp->vdd_str->vns_dq_write;
+ 	mutex_enter(&vqp->vdq_lock);
+ 	while (vnd_dq_reserve(vqp, total) == 0) {
+ 		if (nonblock != 0) {
+ 			frameio_free(fio);
+ 			mutex_exit(&vqp->vdq_lock);
+ 			return (EAGAIN);
+ 		}
+ 		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ 			mutex_exit(&vqp->vdq_lock);
+ 			frameio_free(fio);
+ 			return (EINTR);
+ 		}
+ 	}
+ 	mutex_exit(&vqp->vdq_lock);
+
+ 	/*
+ 	 * We've reserved our space, let's copyin and go from here. If the
+ 	 * copyin fails, give the reservation back and wake any waiters.
+ 	 */
+ 	ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+ 	if (ret != 0) {
+ 		frameio_free(fio);
+ 		vnd_dq_unreserve(vqp, total);
+ 		cv_broadcast(&vqp->vdq_ready);
+ 		pollwakeup(&vdp->vdd_ph, POLLOUT);
+ 		return (ret);
+ 	}
+
+ 	/* Unchain each message and hand it to the squeue on its own. */
+ 	for (mp = mp_chain; mp != NULL; mp = nmp) {
+ 		nmp = mp->b_next;
+ 		mp->b_next = NULL;
+ 		gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ 		    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ 		    VND_SQUEUE_TAG_VND_WRITE);
+ 	}
+
+ 	/*
+ 	 * Update the frameio structure to indicate that we wrote those frames.
+ 	 */
+ 	frameio_mark_consumed(fio, nread);
+ 	ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+ 	frameio_free(fio);
+
+ 	return (ret);
+ }
+
+ /*
+  * Copy one device's listing entry out to the vnd_ioc_info_t at arg.
+  *
+  * The caller must hold vdp->vdd_lock. The version, link name ("<anonymous>"
+  * when the device is not linked), datalink name, and zone id are copied out
+  * one member at a time. Returns 0 on success and EFAULT if any copyout fails.
+  */
+ static int
+ vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+ {
+ 	char name[sizeof (arg->vii_name)];
+ 	char datalink[sizeof (arg->vii_datalink)];
+ 	uint32_t vers = 1;
+
+ 	ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ 	/*
+ 	 * Stage both strings in zeroed buffers that are exactly as large as
+ 	 * the destination fields. Copying sizeof (arg->vii_name) bytes
+ 	 * straight from the source would read past the end of the short
+ 	 * "<anonymous>" literal (or of any source buffer smaller than the
+ 	 * destination) and hand that memory to the caller.
+ 	 */
+ 	bzero(name, sizeof (name));
+ 	if (vdp->vdd_flags & VND_D_LINKED)
+ 		(void) strlcpy(name, vdp->vdd_lname, sizeof (name));
+ 	else
+ 		(void) strlcpy(name, "<anonymous>", sizeof (name));
+
+ 	bzero(datalink, sizeof (datalink));
+ 	(void) strlcpy(datalink, vdp->vdd_datalink, sizeof (datalink));
+
+ 	/*
+ 	 * Copy all of the members out to userland.
+ 	 */
+ 	if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t),
+ 	    mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	if (ddi_copyout(name, arg->vii_name, sizeof (arg->vii_name),
+ 	    mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	if (ddi_copyout(datalink, arg->vii_datalink,
+ 	    sizeof (arg->vii_datalink), mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone,
+ 	    sizeof (zoneid_t), mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	return (0);
+ }
+
+ /*
+  * VND_IOC_LIST: enumerate the attached devices visible to the caller.
+  *
+  * A device is listed when it is attached, is neither condemned nor in a dying
+  * zone, and the caller is in the global zone or in the device's zone. Up to
+  * vl_nents entries are copied to vl_ents, and the total number of matching
+  * devices is written back to vl_actents. Returns 0 on success and EFAULT (or
+  * the per-entry copy error) on failure.
+  */
+ static int
+ vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode)
+ {
+ 	vnd_ioc_list_t vl;
+ 	vnd_ioc_list32_t vl32;
+ 	vnd_ioc_info_t *dst;
+ 	vnd_dev_t *vdp;
+ 	zoneid_t zid;
+ 	int found, room, ret;
+
+ 	/* A 32-bit caller hands us the ILP32 layout; widen it. */
+ 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
+ 		if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t),
+ 		    mode & FKIOCTL) != 0)
+ 			return (EFAULT);
+ 		vl.vl_nents = vl32.vl_nents;
+ 		vl.vl_actents = vl32.vl_actents;
+ 		vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents;
+ 	} else {
+ 		if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t),
+ 		    mode & FKIOCTL) != 0)
+ 			return (EFAULT);
+ 	}
+
+ 	room = vl.vl_nents;
+ 	dst = vl.vl_ents;
+ 	found = 0;
+ 	zid = crgetzoneid(credp);
+
+ 	mutex_enter(&vnd_dev_lock);
+ 	for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+ 	    vdp = list_next(&vnd_dev_list, vdp)) {
+ 		mutex_enter(&vdp->vdd_lock);
+
+ 		/* Skip anything the caller is not supposed to see. */
+ 		if ((vdp->vdd_flags & VND_D_ATTACHED) == 0 ||
+ 		    (vdp->vdd_flags &
+ 		    (VND_D_CONDEMNED | VND_D_ZONE_DYING)) != 0 ||
+ 		    (zid != GLOBAL_ZONEID &&
+ 		    zid != vdp->vdd_nsd->vpnd_zid)) {
+ 			mutex_exit(&vdp->vdd_lock);
+ 			continue;
+ 		}
+
+ 		/* Every match is counted, but only the first ones are copied. */
+ 		found++;
+ 		if (room > 0) {
+ 			ret = vnd_ioctl_list_copy_info(vdp, dst, mode);
+ 			if (ret != 0) {
+ 				mutex_exit(&vdp->vdd_lock);
+ 				mutex_exit(&vnd_dev_lock);
+ 				return (ret);
+ 			}
+ 			room--;
+ 			dst++;
+ 		}
+ 		mutex_exit(&vdp->vdd_lock);
+ 	}
+ 	mutex_exit(&vnd_dev_lock);
+
+ 	if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents,
+ 	    sizeof (uint_t), mode & FKIOCTL) != 0)
+ 		return (EFAULT);
+
+ 	return (0);
+ }
+
+
+ /*
+  * Character device ioctl entry point.
+  *
+  * Commands with the VND_STRIOC bits set are rejected with ENOTTY. Each known
+  * command requires the descriptor to be open for reading or for writing
+  * (EBADF otherwise) and is then handed to its handler. Unknown commands
+  * return ENOTTY, and ENXIO is returned when the minor cannot be found.
+  */
+ static int
+ vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+     int *rvalp)
+ {
+ 	vnd_dev_t *vdp;
+ 	minor_t m;
+ 	int ret, need;
+
+ 	m = getminor(dev);
+ 	ASSERT(m != 0);
+
+ 	/*
+ 	 * Make sure no one has come in on an ioctl from the strioc case.
+ 	 */
+ 	if ((cmd & VND_STRIOC) == VND_STRIOC)
+ 		return (ENOTTY);
+
+ 	/*
+ 	 * Like close, seems like if this minor isn't found, it's a programmer
+ 	 * error somehow.
+ 	 */
+ 	vdp = vnd_dev_lookup(m);
+ 	if (vdp == NULL)
+ 		return (ENXIO);
+
+ 	/* Work out which open mode the command needs, if we know it at all. */
+ 	switch (cmd) {
+ 	case VND_IOC_ATTACH:
+ 	case VND_IOC_LINK:
+ 	case VND_IOC_UNLINK:
+ 	case VND_IOC_SETRXBUF:
+ 	case VND_IOC_SETTXBUF:
+ 	case VND_IOC_FRAMEIO_WRITE:
+ 		need = FWRITE;
+ 		break;
+ 	case VND_IOC_GETRXBUF:
+ 	case VND_IOC_GETTXBUF:
+ 	case VND_IOC_GETMAXBUF:
+ 	case VND_IOC_GETMINTU:
+ 	case VND_IOC_GETMAXTU:
+ 	case VND_IOC_FRAMEIO_READ:
+ 	case VND_IOC_LIST:
+ 		need = FREAD;
+ 		break;
+ 	default:
+ 		need = 0;
+ 		break;
+ 	}
+
+ 	if (need != 0 && (mode & need) == 0) {
+ 		ret = EBADF;
+ 	} else {
+ 		switch (cmd) {
+ 		case VND_IOC_ATTACH:
+ 			ret = vnd_ioctl_attach(vdp, arg, credp, mode);
+ 			break;
+ 		case VND_IOC_LINK:
+ 			ret = vnd_ioctl_link(vdp, arg, credp, mode);
+ 			break;
+ 		case VND_IOC_UNLINK:
+ 			ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
+ 			break;
+ 		case VND_IOC_GETRXBUF:
+ 			ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_SETRXBUF:
+ 			ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_GETTXBUF:
+ 			ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_SETTXBUF:
+ 			ret = vnd_ioctl_settxbuf(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_GETMAXBUF:
+ 			/* Only the global zone may ask for the hard maximum. */
+ 			if (crgetzoneid(credp) != GLOBAL_ZONEID)
+ 				ret = EPERM;
+ 			else
+ 				ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_GETMINTU:
+ 			ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
+ 			break;
+ 		case VND_IOC_GETMAXTU:
+ 			ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
+ 			break;
+ 		case VND_IOC_FRAMEIO_READ:
+ 			ret = vnd_frameio_read(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_FRAMEIO_WRITE:
+ 			ret = vnd_frameio_write(vdp, arg, mode);
+ 			break;
+ 		case VND_IOC_LIST:
+ 			ret = vnd_ioctl_list(arg, credp, mode);
+ 			break;
+ 		default:
+ 			ret = ENOTTY;
+ 			break;
+ 		}
+ 	}
+
+ 	/* Done with the device handle obtained from vnd_dev_lookup(). */
+ 	vnd_dev_rele(vdp);
+ 	return (ret);
+ }
+
+/*
+ * open entry point (see vnd_cb_ops).
+ *
+ * Opening an existing, non-zero minor requires the net rawaccess privilege
+ * and succeeds only if the device is linked, its zone is not being torn down,
+ * and the caller is in the global zone or in the zone that owns the device.
+ * The first such open sets VND_D_OPENED and takes the hold that vnd_close()
+ * later drops.
+ *
+ * Opening minor 0 (the control node) clones: a fresh vnd_dev_t with a newly
+ * allocated minor is created, marked opened with a single hold, inserted on
+ * vnd_dev_list, and *devp is rewritten to name the new minor.
+ *
+ * Returns 0 on success; ENOTSUP for FEXCL, FNDELAY or block opens; EPERM
+ * without the rawaccess privilege; ENOENT when the minor cannot be found or
+ * may not be opened by the caller; EINVAL if no LDI identifier can be
+ * obtained for the clone.
+ */
+static int
+vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	vnd_dev_t *vdp;
+	minor_t m;
+	zoneid_t zid;
+
+	/*
+	 * Exclusive and FNDELAY opens are not supported on any node. Because
+	 * FEXCL is refused here, nothing below needs to consider it again.
+	 */
+	if (flag & (FEXCL | FNDELAY))
+		return (ENOTSUP);
+
+	/*
+	 * The OTYP_* values are an enumeration rather than a bit mask and
+	 * OTYP_BLK is zero, so it has to be compared against; masking with it
+	 * can never be true.
+	 */
+	if (otyp == OTYP_BLK)
+		return (ENOTSUP);
+
+	zid = crgetzoneid(credp);
+	m = getminor(*devp);
+
+	/*
+	 * If we have an open of a non-zero instance then we need to look that
+	 * up in our list of entries.
+	 */
+	if (m != 0) {
+		/*
+		 * We don't check for rawaccess globally as a user could be
+		 * doing a list ioctl on the control node which doesn't require
+		 * this privilege.
+		 */
+		if (secpolicy_net_rawaccess(credp) != 0)
+			return (EPERM);
+
+		vdp = vnd_dev_lookup(m);
+		if (vdp == NULL)
+			return (ENOENT);
+
+		/*
+		 * We need to check to make sure that the user is allowed to
+		 * open this node. Only linked devices whose zone is not being
+		 * torn down may be opened, and only from the global zone or
+		 * from the zone that owns the device.
+		 */
+		mutex_enter(&vdp->vdd_lock);
+		if (!(vdp->vdd_flags & VND_D_LINKED) ||
+		    (vdp->vdd_flags & VND_D_ZONE_DYING) ||
+		    (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid)) {
+			mutex_exit(&vdp->vdd_lock);
+			vnd_dev_rele(vdp);
+			return (ENOENT);
+		}
+
+		/* The first open takes the hold that vnd_close() drops. */
+		if (!(vdp->vdd_flags & VND_D_OPENED)) {
+			vdp->vdd_flags |= VND_D_OPENED;
+			vdp->vdd_ref++;
+			DTRACE_VND_REFINC(vdp);
+		}
+		mutex_exit(&vdp->vdd_lock);
+
+		/* Drop the hold from vnd_dev_lookup(). */
+		vnd_dev_rele(vdp);
+
+		return (0);
+	}
+
+	/*
+	 * We need to clone ourselves and set up a new state.
+	 */
+	vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP);
+	bzero(vdp, sizeof (vnd_dev_t));
+
+	if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) {
+		kmem_cache_free(vnd_dev_cache, vdp);
+		return (EINVAL);
+	}
+
+	vdp->vdd_minor = id_alloc(vnd_minors);
+	mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL);
+	list_link_init(&vdp->vdd_link);
+	vdp->vdd_ref = 1;
+	*devp = makedevice(getmajor(*devp), vdp->vdd_minor);
+	vdp->vdd_devid = *devp;
+	DTRACE_VND_REFINC(vdp);
+	vdp->vdd_flags |= VND_D_OPENED;
+
+	mutex_enter(&vnd_dev_lock);
+	list_insert_head(&vnd_dev_list, vdp);
+	mutex_exit(&vnd_dev_lock);
+
+	return (0);
+}
+
+/*
+ * close entry point (see vnd_cb_ops). Clears VND_D_OPENED on the device named
+ * by dev and drops two holds: the one taken when the device was opened and
+ * the one taken by the lookup here. Returns ENXIO for minor 0 or for a minor
+ * with no device, and 0 otherwise.
+ */
+static int
+vnd_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+	vnd_dev_t *vdp;
+	minor_t minor = getminor(dev);
+
+	if (minor == 0 || (vdp = vnd_dev_lookup(minor)) == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_flags & VND_D_OPENED);
+	vdp->vdd_flags &= ~VND_D_OPENED;
+	mutex_exit(&vdp->vdd_lock);
+
+	/* One rele for the hold taken at open time ... */
+	vnd_dev_rele(vdp);
+
+	/* ... and one for the hold vnd_dev_lookup() just gave us. */
+	vnd_dev_rele(vdp);
+
+	return (0);
+}
+/*
+ * read entry point (see vnd_cb_ops). Copies the message at the head of the
+ * device's read data queue into the caller's single iovec and then pops and
+ * frees it. If the queue is empty the call waits on vdq_ready, or fails with
+ * EWOULDBLOCK when FNONBLOCK or FNDELAY is set in uio_fmode. A message is
+ * never split: if it is larger than uio_resid the read fails with EOVERFLOW
+ * and the message stays queued. Other failures are EINVAL for more than one
+ * iovec, ENXIO for an unknown or unattached device, EINTR when cv_wait_sig()
+ * is interrupted, and EFAULT when uiomove() fails.
+ */
+static int
+vnd_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int nonblock, error = 0;
+ size_t mpsize;
+ vnd_dev_t *vdp;
+ vnd_data_queue_t *vqp;
+ mblk_t *mp = NULL;
+ offset_t u_loffset;
+
+ /*
+ * If we have more than one uio we refuse to do anything. That's for
+ * frameio.
+ */
+ if (uiop->uio_iovcnt > 1)
+ return (EINVAL);
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+
+ /* Check empty case */
+ if (vqp->vdq_cur == 0) {
+ if (nonblock != 0) {
+ error = EWOULDBLOCK;
+ goto err;
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ error = EINTR;
+ goto err;
+ }
+ }
+ }
+
+ /* Ensure our buffer is big enough */
+ mp = vqp->vdq_head;
+ ASSERT(mp != NULL);
+ mpsize = msgsize(mp);
+ if (mpsize > uiop->uio_resid) {
+ error = EOVERFLOW;
+ goto err;
+ }
+
+ u_loffset = uiop->uio_loffset;
+ while (mp != NULL) {
+ if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) {
+ error = EFAULT;
+ uiop->uio_loffset = u_loffset; /* restore the offset saved before copying */
+ mp = NULL;
+ goto err;
+ }
+ mpsize -= MBLKL(mp);
+ mp = mp->b_cont;
+ }
+ ASSERT(mpsize == 0);
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+err:
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+
+ return (error);
+}
+
+/*
+ * write entry point (see vnd_cb_ops). Sends the caller's single iovec as one
+ * message: the size must lie between the stream's vns_minwrite and
+ * vns_maxwrite (ERANGE otherwise), space for it is reserved in the write data
+ * queue (waiting on vdq_ready, or failing with EAGAIN for FNONBLOCK/FNDELAY),
+ * the data is copied into an mblk chain by strmakedata(), and the chain is
+ * handed to vnd_squeue_tx_append through the stream's squeue.
+ *
+ * Returns 0 on success, EINVAL for more than one iovec, ENXIO for an unknown
+ * or unattached device, EINTR if the wait for queue space is interrupted, and
+ * ENOSR if strmakedata() fails or does not consume all of the data.
+ */
+static int
+vnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	int nonblock, error;
+	vnd_dev_t *vdp;
+	mblk_t *mp = NULL;
+	ssize_t iosize, origsize;
+	vnd_data_queue_t *vqp;
+
+	if (uiop->uio_iovcnt > 1)
+		return (EINVAL);
+
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+	nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+	/*
+	 * Everything below goes through vdd_str, so verify it before the
+	 * first dereference rather than after.
+	 */
+	VERIFY(vdp->vdd_str != NULL);
+
+	mutex_enter(&vdp->vdd_str->vns_lock);
+	if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite ||
+	    uiop->uio_resid < vdp->vdd_str->vns_minwrite) {
+		mutex_exit(&vdp->vdd_str->vns_lock);
+		vnd_dev_rele(vdp);
+		return (ERANGE);
+	}
+	mutex_exit(&vdp->vdd_str->vns_lock);
+
+	/*
+	 * Reserve space in the data queue if we can. If we can't, block or
+	 * return EAGAIN. If we can, go and squeue_enter.
+	 */
+	vqp = &vdp->vdd_str->vns_dq_write;
+	mutex_enter(&vqp->vdq_lock);
+	while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) {
+		if (nonblock != 0) {
+			mutex_exit(&vqp->vdq_lock);
+			vnd_dev_rele(vdp);
+			return (EAGAIN);
+		}
+		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+			mutex_exit(&vqp->vdq_lock);
+			vnd_dev_rele(vdp);
+			return (EINTR);
+		}
+	}
+	mutex_exit(&vqp->vdq_lock);
+
+	/*
+	 * Now that we've reserved the space, try to allocate kernel space for
+	 * and copy in the block. To take care of all this we use the
+	 * strmakedata subroutine for now.
+	 */
+	origsize = iosize = uiop->uio_resid;
+	error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0,
+	    &mp);
+
+	/*
+	 * strmakedata() will return an error or it may only consume a portion
+	 * of the data.
+	 */
+	if (error != 0 || uiop->uio_resid != 0) {
+		/*
+		 * If strmakedata() reported success but consumed only part of
+		 * the data, the message it built is never going to be sent;
+		 * free it here so that it is not leaked.
+		 */
+		if (error == 0 && mp != NULL)
+			freemsg(mp);
+
+		/*
+		 * NOTE(review): vnd_dq_reserve() above runs with vdq_lock
+		 * held while this unreserve does not; confirm the locking
+		 * that vnd_dq_unreserve() expects.
+		 */
+		vnd_dq_unreserve(vqp, origsize);
+		cv_broadcast(&vqp->vdq_ready);
+		pollwakeup(&vdp->vdd_ph, POLLOUT);
+		vnd_dev_rele(vdp);
+		return (ENOSR);
+	}
+
+	gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+	    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+	    VND_SQUEUE_TAG_VND_WRITE);
+
+	vnd_dev_rele(vdp);
+	return (0);
+}
+
+/*
+ * poll entry point (see vnd_cb_ops). Reports POLLIN/POLLRDNORM (whichever of
+ * the two were asked for) when the read data queue has a message at its head,
+ * and POLLOUT when the write data queue is not at its maximum. When nothing
+ * is ready and anyyet is zero, the device's pollhead is handed back through
+ * phpp. Returns ENXIO for an unknown or unattached device, and 0 otherwise.
+ */
+static int
+vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	int revents = 0;
+	vnd_dev_t *vdp;
+	vnd_data_queue_t *dq;
+
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (ENXIO);
+
+	mutex_enter(&vdp->vdd_lock);
+	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+		mutex_exit(&vdp->vdd_lock);
+		vnd_dev_rele(vdp);
+		return (ENXIO);
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	/* Readable: something is sitting at the head of the read queue. */
+	if (events & (POLLIN | POLLRDNORM)) {
+		dq = &vdp->vdd_str->vns_dq_read;
+		mutex_enter(&dq->vdq_lock);
+		if (dq->vdq_head != NULL)
+			revents |= events & (POLLIN | POLLRDNORM);
+		mutex_exit(&dq->vdq_lock);
+	}
+
+	/* Writable: the write queue has not reached its maximum. */
+	if (events & POLLOUT) {
+		dq = &vdp->vdd_str->vns_dq_write;
+		mutex_enter(&dq->vdq_lock);
+		if (dq->vdq_cur != dq->vdq_max)
+			revents |= POLLOUT;
+		mutex_exit(&dq->vdq_lock);
+	}
+
+	*reventsp = revents;
+	if (revents == 0 && !anyyet)
+		*phpp = &vdp->vdd_ph;
+
+	vnd_dev_rele(vdp);
+	return (0);
+}
+
+/*
+ * netstack creation callback (registered in vnd_ddi_init()). Allocates and
+ * zeroes the per-netstack state, records the stack id and its zone id,
+ * initializes the lock, the reference-change condition variable and the
+ * device list, and attempts to set up the netinfo hooks; vpnd_hooked records
+ * whether that worked, and failure there is not treated as fatal. The state
+ * is then published on vnd_nsd_list under vnd_dev_lock and returned.
+ */
+static void *
+vnd_stack_init(netstackid_t stackid, netstack_t *ns)
+{
+	vnd_pnsd_t *nsp;
+
+	nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP);
+	bzero(nsp, sizeof (*nsp));
+	nsp->vpnd_nsid = stackid;
+	nsp->vpnd_zid = netstackid_to_zoneid(stackid);
+	nsp->vpnd_flags = 0;
+	mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/*
+	 * vnd_stack_destroy() waits on this condition variable, and the cache
+	 * has no constructor, so it has to be initialized here rather than
+	 * relying on the bzero() above.
+	 */
+	cv_init(&nsp->vpnd_ref_change, NULL, CV_DRIVER, NULL);
+	list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t),
+	    offsetof(vnd_dev_t, vdd_nslink));
+	if (vnd_netinfo_init(nsp) == 0)
+		nsp->vpnd_hooked = B_TRUE;
+
+	mutex_enter(&vnd_dev_lock);
+	list_insert_tail(&vnd_nsd_list, nsp);
+	mutex_exit(&vnd_dev_lock);
+
+	return (nsp);
+}
+/*
+ * netstack shutdown callback (registered in vnd_ddi_init()). Works in three
+ * steps: take the netstack off vnd_nsd_list so it can no longer be found,
+ * shut down the netinfo hooks if they were set up, and then mark the netstack
+ * condemned, flag every device on it that is not already condemned as
+ * VND_D_ZONE_DYING, and forcibly unlink every device that is still linked.
+ */
+static void
+vnd_stack_shutdown(netstackid_t stackid, void *arg)
+{
+ vnd_pnsd_t *nsp = arg;
+ vnd_dev_t *vdp;
+
+ ASSERT(nsp != NULL);
+ /*
+ * After shut down no one should be able to find their way to this
+ * netstack again.
+ */
+ mutex_enter(&vnd_dev_lock);
+ list_remove(&vnd_nsd_list, nsp);
+ mutex_exit(&vnd_dev_lock);
+
+ /*
+ * Make sure hooks know that they're going away.
+ */
+ if (nsp->vpnd_hooked == B_TRUE)
+ vnd_netinfo_shutdown(nsp);
+
+ /*
+ * Now we need to go through and mark each device in this netstack as
+ * being in the zone teardown phase. See the big theory statement
+ * section on vnd, zones, netstacks, and sdev for more information
+ * about this.
+ */
+ mutex_enter(&nsp->vpnd_lock);
+ nsp->vpnd_flags |= VND_NS_CONDEMNED;
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_CONDEMNED))
+ vdp->vdd_flags |= VND_D_ZONE_DYING;
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&nsp->vpnd_lock);
+
+ /*
+ * Next we remove all the links as we know nothing new can be added to
+ * the list and that none of the extant devices can obtain additional
+ * links.
+ */
+restart:
+ mutex_enter(&nsp->vpnd_lock);
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
+ !(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ continue;
+ }
+
+ /*
+ * We drop the netstack lock here and restart the scan
+ * afterwards. Note that as part of unlinking we end up doing a
+ * rele of the vnd_dev_t. If this is the final hold on the
+ * vnd_dev_t then it might try and remove itself. Our locking
+ * rules require us not to be holding any locks when we call any
+ * of the rele functions.
+ *
+ * Note that the unlink function requires callers to call into
+ * it with the vnd_dev_t->vdd_lock held and will take care of
+ * dropping it for us. Because we don't have a hold on the
+ * device, we're done with it at this point.
+ */
+ mutex_exit(&nsp->vpnd_lock);
+ /* Forcibly unlink */
+ vnd_dev_unlink(vdp);
+ goto restart;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+}
+
+/*
+ * netstack destroy callback (registered in vnd_ddi_init()). Blocks until the
+ * netstack's reference count reaches zero and then tears down the netinfo
+ * hooks (if they were set up), the condition variable, the lock and the
+ * device list before returning the structure to its cache.
+ */
+static void
+vnd_stack_destroy(netstackid_t stackid, void *arg)
+{
+	vnd_pnsd_t *nsp = arg;
+
+	ASSERT(nsp != NULL);
+
+	/*
+	 * Now that we've unlinked everything we just have to hang out for
+	 * it to finish exiting. Now that it's no longer the kernel itself
+	 * that's doing this we just need to wait for our reference count to
+	 * equal zero and then we're free. If the global zone is holding open a
+	 * reference to a vnd device for another zone, that's bad, but there's
+	 * nothing much we can do. See the section on 'vnd, zones, netstacks' in
+	 * the big theory statement for more information.
+	 */
+	mutex_enter(&nsp->vpnd_lock);
+	while (nsp->vpnd_ref != 0)
+		cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
+	mutex_exit(&nsp->vpnd_lock);
+
+	/*
+	 * During shutdown we removed ourselves from the list and now we have no
+	 * more references so we can safely say that there is nothing left and
+	 * destroy everything that we had sitting around.
+	 */
+	if (nsp->vpnd_hooked == B_TRUE)
+		vnd_netinfo_fini(nsp);
+
+	/* Nobody can be waiting any more; retire the cv along with the lock. */
+	cv_destroy(&nsp->vpnd_ref_change);
+	mutex_destroy(&nsp->vpnd_lock);
+	list_destroy(&nsp->vpnd_dev_list);
+	kmem_cache_free(vnd_pnsd_cache, nsp);
+}
+
+/*
+ * Convert a node with a name of the form /dev/vnd/zone/%zonename and
+ * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
+ *
+ * For a directory the zone name is the last path component; for a character
+ * device it is the second to last. Returns the result of
+ * vnd_nsd_lookup_by_zonename() for that name; callers in this file release a
+ * non-NULL result with vnd_nsd_rele().
+ */
+static vnd_pnsd_t *
+vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
+{
+	enum vtype vt;
+	const char *path = sdev_ctx_path(ctx);
+	char *zstart, *dup;
+	size_t duplen;
+	vnd_pnsd_t *nsp;
+
+	vt = sdev_ctx_vtype(ctx);
+	ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);
+
+	if (vt == VDIR) {
+		zstart = strrchr(path, '/');
+		ASSERT(zstart != NULL);
+		zstart++;
+		return (vnd_nsd_lookup_by_zonename(zstart));
+	}
+
+	ASSERT(vt == VCHR);
+
+	/*
+	 * Work on a copy so that the link name can be cut off. The size of
+	 * the copy is recorded before it is truncated because it is freed by
+	 * size below.
+	 */
+	dup = strdup(path);
+	duplen = strlen(dup) + 1;
+
+	/* Chop off "/%linkname" ... */
+	zstart = strrchr(dup, '/');
+	ASSERT(zstart != NULL);
+	*zstart = '\0';
+
+	/* ... so that the last component left is the zone name. */
+	zstart = strrchr(dup, '/');
+	ASSERT(zstart != NULL);
+	zstart++;
+	nsp = vnd_nsd_lookup_by_zonename(zstart);
+	kmem_free(dup, duplen);
+
+	return (nsp);
+}
+
+/*
+ * Validate a directory node. The vnd root and the zone root are always valid
+ * (the zone root is only expected to be seen from the global zone's /dev).
+ * Any other directory is valid only while its path still maps to a netstack.
+ */
+static sdev_plugin_validate_t
+vnd_sdev_validate_dir(sdev_ctx_t ctx)
+{
+	const char *path = sdev_ctx_path(ctx);
+	vnd_pnsd_t *nsp;
+
+	if (strcmp(path, VND_SDEV_ROOT) == 0)
+		return (SDEV_VTOR_VALID);
+
+	if (strcmp(path, VND_SDEV_ZROOT) == 0) {
+		ASSERT(getzoneid() == GLOBAL_ZONEID);
+		ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+		return (SDEV_VTOR_VALID);
+	}
+
+	/* A per-zone directory: it lives exactly as long as its netstack. */
+	if ((nsp = vnd_sdev_ctx_to_ns(ctx)) == NULL)
+		return (SDEV_VTOR_INVALID);
+
+	vnd_nsd_rele(nsp);
+	return (SDEV_VTOR_VALID);
+}
+
+/*
+ * sdev plugin validate callback (see vnd_sdev_ops). Directories are handed to
+ * vnd_sdev_validate_dir() and the "ctl" node is always valid. Any other
+ * character node is valid only if its minor still names a device that is
+ * linked, is neither condemned nor in a dying zone, and whose link name still
+ * matches the node's name; otherwise it is reported as stale.
+ */
+static sdev_plugin_validate_t
+vnd_sdev_validate(sdev_ctx_t ctx)
+{
+	sdev_plugin_validate_t verdict;
+	enum vtype vt;
+	vnd_dev_t *vdp;
+	dev_t dev;
+
+	vt = sdev_ctx_vtype(ctx);
+	if (vt == VDIR)
+		return (vnd_sdev_validate_dir(ctx));
+	ASSERT(vt == VCHR);
+
+	if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
+		return (SDEV_VTOR_VALID);
+
+	dev = (uintptr_t)sdev_ctx_vtype_data(ctx);
+	vdp = vnd_dev_lookup(getminor(dev));
+	if (vdp == NULL)
+		return (SDEV_VTOR_STALE);
+
+	mutex_enter(&vdp->vdd_lock);
+	if ((vdp->vdd_flags & VND_D_LINKED) &&
+	    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) &&
+	    strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) == 0) {
+		verdict = SDEV_VTOR_VALID;
+	} else {
+		verdict = SDEV_VTOR_STALE;
+	}
+	mutex_exit(&vdp->vdd_lock);
+
+	vnd_dev_rele(vdp);
+	return (verdict);
+}
+
+/*
+ * sdev plugin inactive callback (see vnd_sdev_ops). This function is a no-op.
+ * sdev never has holds on our devices as they can go away at any time and
+ * specfs has to deal with that fact. The ctx argument is unused.
+ */
+static void
+vnd_sdev_inactive(sdev_ctx_t ctx)
+{
+}
+
+/*
+ * Create a character device node under ctx for every device on nsp's list
+ * that is linked and is neither condemned nor in a dying zone. A node that
+ * already exists (EEXIST) is not treated as an error.
+ *
+ * The caller owns its hold on nsp and remains responsible for releasing it
+ * whether or not this function succeeds; no hold is dropped here.
+ *
+ * Returns 0 on success, or the first other error from sdev_plugin_mknod().
+ */
+static int
+vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
+{
+	int ret;
+	vnd_dev_t *vdp;
+
+	mutex_enter(&nsp->vpnd_lock);
+	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+		mutex_enter(&vdp->vdd_lock);
+		if ((vdp->vdd_flags & VND_D_LINKED) &&
+		    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+			ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, S_IFCHR,
+			    vdp->vdd_devid);
+			if (ret != 0 && ret != EEXIST) {
+				mutex_exit(&vdp->vdd_lock);
+				mutex_exit(&nsp->vpnd_lock);
+				/*
+				 * Both callers release nsp themselves once we
+				 * return, so releasing it here as well would
+				 * drop the same hold twice.
+				 */
+				return (ret);
+			}
+		}
+		mutex_exit(&vdp->vdd_lock);
+	}
+	mutex_exit(&nsp->vpnd_lock);
+
+	return (0);
+}
+
+/*
+ * Populate the top-level vnd directory: the links that belong to the caller's
+ * own zone, a "zone" subdirectory when this is the global zone looking at the
+ * global /dev, and the "ctl" control node. Entries that already exist are not
+ * an error.
+ */
+static int
+vnd_sdev_filldir_root(sdev_ctx_t ctx)
+{
+	int err;
+	vnd_pnsd_t *nsp;
+	zoneid_t zid = getzoneid();
+
+	nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
+	ASSERT(nsp != NULL);
+	err = vnd_sdev_fillzone(nsp, ctx);
+	vnd_nsd_rele(nsp);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * The zone id alone is not enough to decide this: the global zone may
+	 * be reaching down into a non-global zone's mounted /dev.
+	 */
+	if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
+		err = sdev_plugin_mkdir(ctx, "zone");
+		if (err != 0 && err != EEXIST)
+			return (err);
+	}
+
+	/*
+	 * The control node always exists and is what every open clones from,
+	 * so it is always listed and there is nothing to take a hold on.
+	 */
+	err = sdev_plugin_mknod(ctx, "ctl", S_IFCHR,
+	    makedevice(ddi_driver_major(vnd_dip), 0));
+
+	return ((err == EEXIST) ? 0 : err);
+}
+/*
+ * Populate the zone root directory with one subdirectory, named after the
+ * zone, for every netstack on vnd_nsd_list whose device list is not empty.
+ * Netstacks whose zone can no longer be found are skipped, and a directory
+ * that already exists is not an error. vnd_dev_lock is held for the whole
+ * walk. Returns 0, or the first other error from sdev_plugin_mkdir().
+ */
+static int
+vnd_sdev_filldir_zroot(sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_pnsd_t *nsp;
+ zone_t *zonep;
+
+ ASSERT(getzoneid() == GLOBAL_ZONEID);
+ ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+
+ mutex_enter(&vnd_dev_lock);
+ for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+ nsp = list_next(&vnd_nsd_list, nsp)) {
+ mutex_enter(&nsp->vpnd_lock);
+ if (list_is_empty(&nsp->vpnd_dev_list)) {
+ mutex_exit(&nsp->vpnd_lock);
+ continue;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+ zonep = zone_find_by_id(nsp->vpnd_zid);
+ /*
+ * This zone must be being torn down, so skip it.
+ */
+ if (zonep == NULL)
+ continue;
+ ret = sdev_plugin_mkdir(ctx, zonep->zone_name);
+ zone_rele(zonep); /* done with the zone name */
+ if (ret != 0 && ret != EEXIST) {
+ mutex_exit(&vnd_dev_lock);
+ return (ret);
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (0);
+}
+
+/*
+ * sdev plugin filldir callback (see vnd_sdev_ops). Dispatches on the path of
+ * the directory being filled: the vnd root, the zone root, or a per-zone
+ * directory underneath the zone root. A per-zone directory whose netstack can
+ * no longer be found is simply left empty.
+ */
+static int
+vnd_sdev_filldir(sdev_ctx_t ctx)
+{
+	const char *path = sdev_ctx_path(ctx);
+	vnd_pnsd_t *nsp;
+	int err;
+
+	ASSERT(sdev_ctx_vtype(ctx) == VDIR);
+
+	if (strcmp(VND_SDEV_ROOT, path) == 0)
+		return (vnd_sdev_filldir_root(ctx));
+	if (strcmp(VND_SDEV_ZROOT, path) == 0)
+		return (vnd_sdev_filldir_zroot(ctx));
+
+	/* Anything else has to live below the zone root. */
+	ASSERT(strncmp(VND_SDEV_ZROOT, path, strlen(VND_SDEV_ZROOT)) == 0);
+
+	if ((nsp = vnd_sdev_ctx_to_ns(ctx)) == NULL)
+		return (0);
+
+	err = vnd_sdev_fillzone(nsp, ctx);
+	vnd_nsd_rele(nsp);
+
+	return (err);
+}
+/*
+ * Plugin operations vector handed to sdev_plugin_register() by vnd_attach().
+ * It carries the plugin version and flags followed by the validate, filldir
+ * and inactive callbacks defined above.
+ */
+static sdev_plugin_ops_t vnd_sdev_ops = {
+ SDEV_PLUGIN_VERSION,
+ SDEV_PLUGIN_SUBDIR,
+ vnd_sdev_validate,
+ vnd_sdev_filldir,
+ vnd_sdev_inactive
+};
+/*
+ * attach entry point (see vnd_dev_ops). Only DDI_ATTACH is handled and only a
+ * single instance may attach. Creates the "vnd" minor node (minor 0), sets
+ * the DDI_KERNEL_IOCTL property, registers the sdev plugin and creates the
+ * squeue set. Each failure undoes the steps that came before it, clears
+ * vnd_dip and returns DDI_FAILURE.
+ */
+static int
+vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int errp = 0; /* filled in by sdev_plugin_register(); not examined here */
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ /*
+ * Only allow one instance.
+ */
+ if (vnd_dip != NULL)
+ return (DDI_FAILURE);
+
+ vnd_dip = dip;
+ if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) !=
+ DDI_SUCCESS) {
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
+ DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops,
+ &errp);
+ if (vnd_sdev_hdl == NULL) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ ddi_prop_remove_all(vnd_dip);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_WAIT,
+ GSQUEUE_DEFAULT_PRIORITY); /* NOTE(review): result unchecked; confirm it cannot fail */
+
+ return (DDI_SUCCESS);
+}
+/*
+ * detach entry point (see vnd_dev_ops). Every path through this function
+ * returns DDI_FAILURE, so the driver can never be detached.
+ * NOTE(review): because of that, the vnd_dev_list check below has no effect
+ * on the result; presumably it is a placeholder for a real teardown path --
+ * confirm before relying on it.
+ */
+static int
+vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ mutex_enter(&vnd_dev_lock);
+ if (!list_is_empty(&vnd_dev_list)) {
+ mutex_exit(&vnd_dev_lock);
+ return (DDI_FAILURE);
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * getinfo entry point (see vnd_dev_ops). There is only ever one instance, so
+ * every dev_t maps to vnd_dip and to instance number 0. Any other request
+ * fails with DDI_FAILURE.
+ */
+static int
+vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+	if (cmd == DDI_INFO_DEVT2DEVINFO) {
+		*result = (void *)vnd_dip;
+		return (DDI_SUCCESS);
+	}
+
+	if (cmd == DDI_INFO_DEVT2INSTANCE) {
+		*result = (void *)0;
+		return (DDI_SUCCESS);
+	}
+
+	return (DDI_FAILURE);
+}
+
+
+/*
+ * Tear down the global state set up by vnd_ddi_init(): unregister the
+ * netstack callbacks, then destroy the taskq, the three kmem caches and the
+ * minor id space (each only if its pointer is not NULL), the global lists and
+ * their lock (only if vnd_list_init is set), and finally call frameio_fini().
+ */
+static void
+vnd_ddi_fini(void)
+{
+ netstack_unregister(NS_VND);
+ if (vnd_taskq != NULL)
+ taskq_destroy(vnd_taskq);
+ if (vnd_str_cache != NULL)
+ kmem_cache_destroy(vnd_str_cache);
+ if (vnd_dev_cache != NULL)
+ kmem_cache_destroy(vnd_dev_cache);
+ if (vnd_pnsd_cache != NULL)
+ kmem_cache_destroy(vnd_pnsd_cache);
+ if (vnd_minors != NULL)
+ id_space_destroy(vnd_minors);
+ if (vnd_list_init != 0) {
+ list_destroy(&vnd_nsd_list);
+ list_destroy(&vnd_dev_list);
+ mutex_destroy(&vnd_dev_lock);
+ vnd_list_init = 0;
+ }
+ frameio_fini();
+}
+
+/*
+ * Set up the driver's global state: frameio, the kmem caches for streams,
+ * devices and per-netstack data, the taskq, the minor id space, the global
+ * lists and their lock, and finally the netstack callbacks. If any step
+ * fails, everything created before it is destroyed again in reverse order
+ * and DDI_FAILURE is returned.
+ */
+static int
+vnd_ddi_init(void)
+{
+	if (frameio_init() != 0)
+		return (DDI_FAILURE);
+
+	vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_str_cache == NULL)
+		goto fail_frameio;
+
+	vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_dev_cache == NULL)
+		goto fail_str_cache;
+
+	vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
+	    sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (vnd_pnsd_cache == NULL)
+		goto fail_dev_cache;
+
+	vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
+	if (vnd_taskq == NULL)
+		goto fail_pnsd_cache;
+
+	vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
+	if (vnd_minors == NULL)
+		goto fail_taskq;
+
+	mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
+	list_create(&vnd_dev_list, sizeof (vnd_dev_t),
+	    offsetof(vnd_dev_t, vdd_link));
+	list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
+	    offsetof(vnd_pnsd_t, vpnd_link));
+	vnd_list_init = 1;
+
+	netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
+	    vnd_stack_destroy);
+
+	return (DDI_SUCCESS);
+
+	/* Unwind in the reverse order of creation. */
+fail_taskq:
+	taskq_destroy(vnd_taskq);
+fail_pnsd_cache:
+	kmem_cache_destroy(vnd_pnsd_cache);
+fail_dev_cache:
+	kmem_cache_destroy(vnd_dev_cache);
+fail_str_cache:
+	kmem_cache_destroy(vnd_str_cache);
+fail_frameio:
+	frameio_fini();
+	return (DDI_FAILURE);
+}
+/*
+ * STREAMS module information, shared by the read-side and write-side qinit
+ * structures below.
+ */
+static struct module_info vnd_minfo = {
+ 0, /* module id */
+ "vnd", /* module name */
+ 1, /* smallest packet size */
+ INFPSZ, /* largest packet size (infinite) */
+ 1, /* high watermark */
+ 0 /* low watermark */
+};
+/*
+ * STREAMS queue initialization for the vnd module. Only the read side
+ * supplies open and close routines; both sides share vnd_minfo, and neither
+ * has a service procedure. The streamtab has no multiplexor entries.
+ */
+static struct qinit vnd_r_qinit = {
+ vnd_s_rput, /* qi_putp */
+ NULL, /* qi_srvp */
+ vnd_s_open, /* qi_qopen */
+ vnd_s_close, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &vnd_minfo, /* qi_minfo */
+ NULL /* qi_mstat */
+};
+
+static struct qinit vnd_w_qinit = {
+ vnd_s_wput, /* qi_putp */
+ NULL, /* qi_srvp */
+ NULL, /* qi_qopen */
+ NULL, /* qi_qclose */
+ NULL, /* qi_qadmin */
+ &vnd_minfo, /* qi_minfo */
+ NULL /* qi_mstat */
+};
+
+static struct streamtab vnd_strtab = {
+ &vnd_r_qinit, /* st_rdinit */
+ &vnd_w_qinit, /* st_wrinit */
+ NULL, /* st_muxrinit */
+ NULL /* st_muxwinit */
+};
+
+/*
+ * Character device entry points. The streamtab slot is NULL here: the
+ * STREAMS side of vnd is installed separately through vnd_fmodfsw.
+ */
+static struct cb_ops vnd_cb_ops = {
+ vnd_open, /* open */
+ vnd_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ vnd_read, /* read */
+ vnd_write, /* write */
+ vnd_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ vnd_chpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* streamtab */
+ D_MP /* Driver compatibility flag */
+};
+/*
+ * Device operations: getinfo, attach and detach are implemented above, and
+ * the character entry points come from vnd_cb_ops.
+ */
+static struct dev_ops vnd_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ vnd_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ vnd_attach, /* attach */
+ vnd_detach, /* detach */
+ nodev, /* reset */
+ &vnd_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+/*
+ * Module linkage. vnd is installed both as a device driver (vnd_modldrv) and
+ * as a STREAMS module (vnd_modlstrmod); vnd_modlinkage ties the two together
+ * for mod_install(), mod_info() and mod_remove().
+ */
+static struct modldrv vnd_modldrv = {
+ &mod_driverops, /* drv_modops */
+ "Virtual Networking Datapath Driver", /* drv_linkinfo */
+ &vnd_dev_ops /* drv_dev_ops */
+};
+
+static struct fmodsw vnd_fmodfsw = {
+ "vnd", /* f_name */
+ &vnd_strtab, /* f_str */
+ D_NEW | D_MP /* f_flag */
+};
+
+static struct modlstrmod vnd_modlstrmod = {
+ &mod_strmodops, /* strmod_modops */
+ "Virtual Networking Datapath Driver", /* strmod_linkinfo */
+ &vnd_fmodfsw /* strmod_fmodsw */
+};
+
+static struct modlinkage vnd_modlinkage = {
+ MODREV_1, /* ml_rev */
+ &vnd_modldrv, /* ml_linkage[0]: the driver */
+ &vnd_modlstrmod, /* ml_linkage[1]: the STREAMS module */
+ NULL /* terminator */
+};
+
+/*
+ * Module load entry point. Global state is set up first and the module is
+ * then installed; if installation fails the global state is torn down again.
+ * Returns the failing step's result, or 0 on success.
+ */
+int
+_init(void)
+{
+	int error;
+
+	/*
+	 * All of our global initialization has to happen here rather than in
+	 * attach and detach. vnd can be used from a stream context while the
+	 * driver is detached, so we cannot rely on attach having run to create
+	 * everything. So it goes in _init, just like our friend ip.
+	 */
+	error = vnd_ddi_init();
+	if (error != DDI_SUCCESS)
+		return (error);
+
+	if ((error = mod_install(&vnd_modlinkage)) != 0)
+		vnd_ddi_fini();
+
+	return (error);
+}
+
+/*
+ * Module information entry point; reports on vnd_modlinkage via mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+	int ret = mod_info(&vnd_modlinkage, modinfop);
+
+	return (ret);
+}
+
+/*
+ * Module unload entry point. The global state is torn down only once the
+ * module has actually been removed; otherwise mod_remove()'s error is
+ * returned and nothing is touched.
+ */
+int
+_fini(void)
+{
+	int error = mod_remove(&vnd_modlinkage);
+
+	if (error != 0)
+		return (error);
+
+	vnd_ddi_fini();
+	return (0);
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf
new file mode 100644
index 0000000000..65872e1ddf
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
+#
+
+name="vnd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 08d82726d6..0a72d3d882 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -243,6 +243,7 @@ CHKHDRS= \
flock.h \
flock_impl.h \
fork.h \
+ frameio.h \
fss.h \
fsspriocntl.h \
fsid.h \
@@ -635,6 +636,8 @@ CHKHDRS= \
vmem.h \
vmem_impl.h \
vmsystm.h \
+ vnd.h \
+ vnd_errno.h \
vnic.h \
vnic_impl.h \
vnode.h \
@@ -852,7 +855,6 @@ FSHDRS= \
cachefs_log.h \
decomp.h \
dv_node.h \
- sdev_impl.h \
fifonode.h \
hsfs_isospec.h \
hsfs_node.h \
@@ -870,6 +872,8 @@ FSHDRS= \
pc_label.h \
pc_node.h \
pxfs_ki.h \
+ sdev_impl.h \
+ sdev_plugin.h \
snode.h \
swapnode.h \
tmp.h \
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index 303a9c7e45..4cd93be56e 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -353,6 +353,7 @@ typedef struct dld_hwgrpinfo {
*/
typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t);
+#define DI_DIRECT_RAW 0x1
/*
* Direct Tx/Rx capability.
*/
@@ -377,6 +378,9 @@ typedef struct dld_capab_direct_s {
/* flow control "can I put on a ring" callback */
uintptr_t di_tx_fctl_df; /* canput-like callback */
void *di_tx_fctl_dh;
+
+ /* flags that control our behavior */
+ uint_t di_flags;
} dld_capab_direct_t;
/*
diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h
index a76a927e59..81708aad38 100644
--- a/usr/src/uts/common/sys/dld_impl.h
+++ b/usr/src/uts/common/sys/dld_impl.h
@@ -53,7 +53,8 @@ typedef enum {
typedef enum {
DLD_UNINITIALIZED,
DLD_PASSIVE,
- DLD_ACTIVE
+ DLD_ACTIVE,
+ DLD_EXCLUSIVE
} dld_passivestate_t;
/*
@@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t,
extern void dld_str_notify_ind(dld_str_t *);
extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *,
uintptr_t, uint16_t);
+extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *,
+ uintptr_t, uint16_t);
extern int dld_flow_ctl_callb(dld_str_t *, uint64_t,
int (*func)(), void *);
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 5bc2bd41c5..dddac5b878 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -107,6 +107,7 @@ typedef struct dl_ipnetinfo {
#define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */
#define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */
#define DL_NOTIFY_CONF 0x116 /* Notification from upstream */
+#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */
/*
* Primitives used for Connectionless Service
@@ -388,6 +389,7 @@ typedef struct dl_ipnetinfo {
#define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */
#define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */
#define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */
+#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */
/*
* DLPI notification codes for DL_NOTIFY_REQ primitives.
@@ -1107,6 +1109,13 @@ typedef struct {
} dl_intr_mode_req_t;
/*
+ * DL_EXCLUSIVE_REQ, M_PROTO type
+ */
+typedef struct {
+ t_uscalar_t dl_primitive;
+} dl_exclusive_req_t;
+
+/*
* CONNECTION-ORIENTED SERVICE PRIMITIVES
*/
@@ -1528,6 +1537,7 @@ union DL_primitives {
dl_control_ack_t control_ack;
dl_passive_req_t passive_req;
dl_intr_mode_req_t intr_mode_req;
+ dl_exclusive_req_t exclusive_req;
};
#define DL_INFO_REQ_SIZE sizeof (dl_info_req_t)
@@ -1596,6 +1606,7 @@ union DL_primitives {
#define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t)
#define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t)
#define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t)
+#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t)
#ifdef _KERNEL
/*
diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h
index adcfe76c08..8e99d0e9d8 100644
--- a/usr/src/uts/common/sys/dls.h
+++ b/usr/src/uts/common/sys/dls.h
@@ -86,6 +86,7 @@ typedef struct dls_link_s dls_link_t;
#define DLS_PROMISC_SAP 0x00000001
#define DLS_PROMISC_MULTI 0x00000002
#define DLS_PROMISC_PHYS 0x00000004
+#define DLS_PROMISC_RX_ONLY 0x00000008
extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *);
extern void dls_close(dld_str_t *);
@@ -107,6 +108,8 @@ extern void str_notify(void *, mac_notify_type_t);
extern int dls_devnet_open(const char *,
dls_dl_handle_t *, dev_t *);
+extern int dls_devnet_open_in_zone(const char *,
+ dls_dl_handle_t *, dev_t *, zoneid_t);
extern void dls_devnet_close(dls_dl_handle_t);
extern boolean_t dls_devnet_rebuild();
@@ -142,6 +145,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t,
extern int dls_mgmt_get_linkinfo(datalink_id_t, char *,
datalink_class_t *, uint32_t *, uint32_t *);
extern int dls_mgmt_get_linkid(const char *, datalink_id_t *);
+extern int dls_mgmt_get_linkid_in_zone(const char *,
+ datalink_id_t *, zoneid_t);
extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t,
datalink_media_t, uint32_t);
extern int dls_devnet_macname2linkid(const char *,
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 8f7af6856c..d502b36a2d 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -62,6 +62,7 @@ struct dls_link_s { /* Protected by */
uint_t dl_zone_ref;
link_tagmode_t dl_tagmode; /* atomic */
uint_t dl_nonip_cnt; /* SL */
+ uint_t dl_exclusive; /* SL */
};
typedef struct dls_head_s {
@@ -128,6 +129,7 @@ extern void dls_mgmt_init(void);
extern void dls_mgmt_fini(void);
extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *);
+extern int dls_exclusive_set(dld_str_t *, boolean_t);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h
new file mode 100644
index 0000000000..54e6dbeedf
--- /dev/null
+++ b/usr/src/uts/common/sys/frameio.h
@@ -0,0 +1,107 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FRAMEIO_H
+#define _SYS_FRAMEIO_H
+
+/*
+ * Frame I/O definitions
+ */
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+/* Kernel only headers */
+#include <sys/stream.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * An individual frame vector component. Collections of these are used to make
+ * ioctls.
+ */
+typedef struct framevec {
+ void *fv_buf; /* Buffer with data */
+ size_t fv_buflen; /* Size of the buffer */
+ size_t fv_actlen; /* Amount of buffer consumed, ignore on error */
+} framevec_t;
+
+/*
+ * The base unit used with frameio.
+ */
+typedef struct frameio {
+ uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */
+ uint_t fio_nvpf; /* How many vectors make up one frame */
+ uint_t fio_nvecs; /* The total number of vectors */
+ framevec_t fio_vecs[]; /* C99 flexible array member */
+} frameio_t;
+
+
+#define FRAMEIO_VERSION_ONE 1
+#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE
+
+#define FRAMEIO_NVECS_MAX 32
+
+/*
+ * Definitions for kernel modules to include as helpers. These are consolidation
+ * private.
+ */
+#ifdef _KERNEL
+
+/*
+ * 32-bit versions for 64-bit kernels
+ */
+typedef struct framevec32 {
+ caddr32_t fv_buf;
+ size32_t fv_buflen;
+ size32_t fv_actlen;
+} framevec32_t;
+
+typedef struct frameio32 {
+ uint_t fio_version;
+ uint_t fio_vecspframe;
+ uint_t fio_nvecs;
+ framevec32_t fio_vecs[];
+} frameio32_t;
+
+/*
+ * Describe the different ways that vectors should map to frames.
+ */
+typedef enum frameio_write_mblk_map {
+ MAP_BLK_FRAME
+} frameio_write_mblk_map_t;
+
+int frameio_init(void);
+void frameio_fini(void);
+frameio_t *frameio_alloc(int);
+void frameio_free(frameio_t *);
+int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t);
+int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int);
+int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *,
+ int *, int);
+int frameio_hdr_copyout(frameio_t *, int, void *, uint_t);
+size_t frameio_frame_length(frameio_t *, framevec_t *);
+void frameio_mark_consumed(frameio_t *, int);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FRAMEIO_H */
diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h
index 19a147d88a..d5bd0162c7 100644
--- a/usr/src/uts/common/sys/fs/sdev_impl.h
+++ b/usr/src/uts/common/sys/fs/sdev_impl.h
@@ -35,6 +35,7 @@ extern "C" {
#include <sys/vfs_opreg.h>
#include <sys/list.h>
#include <sys/nvpair.h>
+#include <sys/fs/sdev_plugin.h>
/*
* sdev_nodes are the file-system specific part of the
@@ -126,6 +127,21 @@ typedef struct sdev_local_data {
struct sdev_dprof sdev_lprof; /* profile for multi-inst */
} sdev_local_data_t;
+/* sdev_flags */
+typedef enum sdev_flags {
+ SDEV_BUILD = 0x0001, /* directory cache out-of-date */
+ SDEV_GLOBAL = 0x0002, /* global /dev nodes */
+ SDEV_PERSIST = 0x0004, /* backing store persisted node */
+ SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */
+ SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */
+ /* (ex: pts) */
+ SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */
+ SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */
+ /* need update */
+ SDEV_SUBDIR = 0x0080, /* match all subdirs under here */
+ SDEV_ZONED = 0x0100 /* zoned subdir */
+} sdev_flags_t;
+
/*
* /dev filesystem sdev_node defines
*/
@@ -148,7 +164,7 @@ typedef struct sdev_node {
ino64_t sdev_ino; /* inode */
uint_t sdev_nlink; /* link count */
int sdev_state; /* state of this node */
- int sdev_flags; /* flags bit */
+ sdev_flags_t sdev_flags; /* flags bit */
kmutex_t sdev_lookup_lock; /* node creation synch lock */
kcondvar_t sdev_lookup_cv; /* node creation sync cv */
@@ -159,7 +175,7 @@ typedef struct sdev_node {
struct sdev_global_data sdev_globaldata;
struct sdev_local_data sdev_localdata;
} sdev_instance_data;
-
+ list_node_t sdev_plist; /* link on plugin list */
void *sdev_private;
} sdev_node_t;
@@ -190,29 +206,11 @@ typedef enum {
SDEV_READY
} sdev_node_state_t;
-/* sdev_flags */
-#define SDEV_BUILD 0x0001 /* directory cache out-of-date */
-#define SDEV_GLOBAL 0x0002 /* global /dev nodes */
-#define SDEV_PERSIST 0x0004 /* backing store persisted node */
-#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. cache */
-#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */
- /* (ex: pts) */
-#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */
-#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */
- /* need update */
-#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */
-#define SDEV_ZONED 0x0100 /* zoned subdir */
-
/* sdev_lookup_flags */
#define SDEV_LOOKUP 0x0001 /* node creation in progress */
#define SDEV_READDIR 0x0002 /* VDIR readdir in progress */
#define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */
-#define SDEV_VTOR_INVALID -1
-#define SDEV_VTOR_SKIP 0
-#define SDEV_VTOR_VALID 1
-#define SDEV_VTOR_STALE 2
-
/* convenient macros */
#define SDEV_IS_GLOBAL(dv) \
(dv->sdev_flags & SDEV_GLOBAL)
@@ -364,8 +362,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *,
extern int devname_profile_update(char *, size_t);
extern struct sdev_data *sdev_find_mntinfo(char *);
void sdev_mntinfo_rele(struct sdev_data *);
+typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *);
+void sdev_mnt_walk(sdev_mnt_walk_f, void *);
extern struct vnodeops *devpts_getvnodeops(void);
extern struct vnodeops *devvt_getvnodeops(void);
+extern void sdev_plugin_nodeready(struct sdev_node *);
+extern int sdev_plugin_init(void);
+extern int sdev_plugin_fini(void);
/*
* boot states - warning, the ordering here is significant
@@ -510,6 +513,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *);
extern void sdev_modctl_dump_files(void);
/*
+ * plugin and legacy vtab stuff
+ */
+/* directory dependent vop table */
+typedef struct sdev_vop_table {
+ char *vt_name; /* subdirectory name */
+ const fs_operation_def_t *vt_service; /* vnodeops table */
+ struct vnodeops **vt_global_vops; /* global container for vop */
+ int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */
+ int vt_flags;
+} sdev_vop_table_t;
+
+extern struct sdev_vop_table vtab[];
+extern struct vnodeops *sdev_get_vop(struct sdev_node *);
+extern void sdev_set_no_negcache(struct sdev_node *);
+extern void *sdev_get_vtor(struct sdev_node *dv);
+
+/*
* globals
*/
extern kmutex_t sdev_lock;
@@ -522,6 +542,7 @@ extern struct vnodeops *devipnet_vnodeops;
extern struct vnodeops *devvt_vnodeops;
extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */
extern struct vnodeops *devzvol_vnodeops;
+extern int sdev_vnodeops_tbl_size;
extern const fs_operation_def_t sdev_vnodeops_tbl[];
extern const fs_operation_def_t devpts_vnodeops_tbl[];
diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h
new file mode 100644
index 0000000000..8783df58e6
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/sdev_plugin.h
@@ -0,0 +1,106 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_SDEV_PLUGIN_H
+#define _SYS_SDEV_PLUGIN_H
+
+/*
+ * Kernel sdev plugin interface
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef uintptr_t sdev_plugin_hdl_t;
+typedef uintptr_t sdev_ctx_t;
+
+/*
+ * Valid return values for sdev_plugin_validate_t.
+ */
+typedef enum sdev_plugin_validate {
+ SDEV_VTOR_INVALID = -1,
+ SDEV_VTOR_SKIP = 0,
+ SDEV_VTOR_VALID = 1,
+ SDEV_VTOR_STALE = 2
+} sdev_plugin_validate_t;
+
+/*
+ * Valid flags
+ */
+typedef enum sdev_plugin_flags {
+ SDEV_PLUGIN_NO_NCACHE = 0x1,
+ SDEV_PLUGIN_SUBDIR = 0x2
+} sdev_plugin_flags_t;
+
+#define SDEV_PLUGIN_FLAGS_MASK 0x3
+
+/*
+ * Functions a module must implement
+ */
+typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t);
+typedef int (*sp_filldir_f)(sdev_ctx_t);
+typedef void (*sp_inactive_f)(sdev_ctx_t);
+
+#define SDEV_PLUGIN_VERSION 1
+
+typedef struct sdev_plugin_ops {
+ int spo_version;
+ sdev_plugin_flags_t spo_flags;
+ sp_valid_f spo_validate;
+ sp_filldir_f spo_filldir;
+ sp_inactive_f spo_inactive;
+} sdev_plugin_ops_t;
+
+extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *,
+ int *);
+extern int sdev_plugin_unregister(sdev_plugin_hdl_t);
+
+typedef enum sdev_ctx_flags {
+ SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */
+} sdev_ctx_flags_t;
+
+/*
+ * Context helper functions
+ */
+extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t);
+extern const char *sdev_ctx_name(sdev_ctx_t);
+extern const char *sdev_ctx_path(sdev_ctx_t);
+extern enum vtype sdev_ctx_vtype(sdev_ctx_t);
+extern const void *sdev_ctx_vtype_data(sdev_ctx_t);
+
+/*
+ * Callbacks to manipulate nodes
+ */
+extern int sdev_plugin_mkdir(sdev_ctx_t, char *);
+extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SDEV_PLUGIN_H */
diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h
new file mode 100644
index 0000000000..40ef4ce982
--- /dev/null
+++ b/usr/src/uts/common/sys/gsqueue.h
@@ -0,0 +1,65 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_GSQUEUE_H
+#define _SYS_GSQUEUE_H
+
+/*
+ * Standard interfaces to serialization queues for everyone (except IP).
+ */
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct gsqueue gsqueue_t;
+typedef struct gsqueue_set gsqueue_set_t;
+
+typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t);
+typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *);
+
+extern gsqueue_set_t *gsqueue_set_create(uint_t, pri_t);
+extern void gsqueue_set_destroy(gsqueue_set_t *);
+extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t);
+
+extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *);
+extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t);
+
+#define GSQUEUE_FILL 0x0001
+#define GSQUEUE_NODRAIN 0x0002
+#define GSQUEUE_PROCESS 0x0004
+
+extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *,
+ int, uint8_t);
+
+/*
+ * The default wait is inherited from IP. This determines the amount of time
+ * that must pass after queuing work, before we wake up the worker thread. This
+ * value is in milliseconds.
+ */
+#define GSQUEUE_DEFAULT_WAIT 10
+#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_GSQUEUE_H */
diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h
index 93b5fc3e01..ea85c78f6b 100644
--- a/usr/src/uts/common/sys/neti.h
+++ b/usr/src/uts/common/sys/neti.h
@@ -44,6 +44,8 @@ extern "C" {
#define NHF_INET "NHF_INET"
#define NHF_INET6 "NHF_INET6"
#define NHF_ARP "NHF_ARP"
+#define NHF_VND_INET "NHF_VND_INET"
+#define NHF_VND_INET6 "NHF_VND_INET6"
/*
* Event identification
diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h
index 2c77e1be96..73f29d1e63 100644
--- a/usr/src/uts/common/sys/netstack.h
+++ b/usr/src/uts/common/sys/netstack.h
@@ -81,7 +81,8 @@ typedef id_t netstackid_t;
#define NS_IPSECESP 16
#define NS_IPNET 17
#define NS_ILB 18
-#define NS_MAX (NS_ILB+1)
+#define NS_VND 19
+#define NS_MAX (NS_VND+1)
/*
* State maintained for each module which tracks the state of
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index f1bd429815..35e1cf64c7 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -29,6 +29,17 @@
extern "C" {
#endif
+/*
+ * Originally in illumos, we had an IP-centric view of the serialization queue
+ * abstraction. While that has useful properties, the implementation of squeues
+ * hardcodes various parts of the implementation of IP into it which makes it
+ * unsuitable for other consumers. To enable them, we created another interface,
+ * but opted not to port all of the functionality that IP uses in the form of
+ * ip_squeue.c. As other consumers need the functionality that IP has in
+ * squeues, we'll come up with more genericized methods and add that
+ * functionality to <sys/gsqueue.h>. Please do not continue to use this header.
+ */
+
#include <sys/types.h>
#include <sys/processor.h>
#include <sys/stream.h>
@@ -76,12 +87,13 @@ typedef enum {
struct ip_recv_attr_s;
extern void squeue_init(void);
-extern squeue_t *squeue_create(clock_t, pri_t);
+extern squeue_t *squeue_create(clock_t, pri_t, boolean_t);
extern void squeue_bind(squeue_t *, processorid_t);
extern void squeue_unbind(squeue_t *);
extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
uint32_t, struct ip_recv_attr_s *, int, uint8_t);
extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
+extern void squeue_destroy(squeue_t *);
struct conn_s;
extern int squeue_synch_enter(struct conn_s *, mblk_t *);
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 22550886eb..d2418bbc15 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -117,6 +117,7 @@ struct squeue_s {
squeue_set_t *sq_set; /* managed by squeue creator */
pri_t sq_priority; /* squeue thread priority */
+ boolean_t sq_isip; /* use IP-centric features */
/* Keep the debug-only fields at the end of the structure */
#ifdef DEBUG
@@ -165,6 +166,7 @@ struct squeue_s {
#define SQS_POLL_RESTART_DONE 0x01000000
#define SQS_POLL_THR_QUIESCE 0x02000000
#define SQS_PAUSE 0x04000000 /* The squeue has been paused */
+#define SQS_EXIT 0x08000000 /* squeue is being torn down */
#define SQS_WORKER_THR_CONTROL \
(SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP)
diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h
new file mode 100644
index 0000000000..bc7c9c3122
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd.h
@@ -0,0 +1,141 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VND_H
+#define _SYS_VND_H
+
+#include <sys/types.h>
+#include <sys/vnd_errno.h>
+#include <sys/frameio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * We distinguish between normal ioctls and private ioctls we issue to our
+ * streams version. Streams ioctls have the upper bit set in the lowest byte.
+ * Note that there are no STREAMs ioctls for userland and all definitions
+ * related to them are not present in this file.
+ */
+#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8))
+
+/*
+ * Attach the current minor instance to a given dlpi datalink identified by a
+ * vnd_ioc_attach_t argument. This fails if it's already been attached. Note
+ * that unlike the other ioctls, this is passed directly as opposed to every
+ * other function which is passed as a pointer to the value.
+ */
+#define VND_IOC_ATTACH (VND_IOC | 0x1)
+
+#define VND_NAMELEN 32
+
+typedef struct vnd_ioc_attach {
+ char via_name[VND_NAMELEN];
+ zoneid_t via_zoneid;
+ uint32_t via_errno;
+} vnd_ioc_attach_t;
+
+/*
+ * Link the current minor instance into the /devices name space.
+ *
+ * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid,
+ * vil_name. The device will be namespaced to the zone. The global zone will be
+ * able to see all minor nodes. In the zone, only the /dev entries will exist.
+ * At this time, a given device can only have one link at a time. Note that a
+ * user cannot specify the zone to pass in, rather it is the zone that the
+ * device was attached in.
+ */
+#define VND_IOC_LINK (VND_IOC | 0x2)
+
+typedef struct vnd_ioc_link {
+ char vil_name[VND_NAMELEN];
+ uint32_t vil_errno;
+} vnd_ioc_link_t;
+
+/*
+ * Unlink the opened minor instance from the /devices name space. A zone may use
+ * this to unlink an existing entry in /dev; however, they will not be able to
+ * link it in again.
+ */
+#define VND_IOC_UNLINK (VND_IOC | 0x3)
+typedef struct vnd_ioc_unlink {
+ uint32_t viu_errno;
+} vnd_ioc_unlink_t;
+
+/*
+ * Controls to get and set the current receive buffer size.
+ */
+typedef struct vnd_ioc_buf {
+ uint64_t vib_size;
+ uint32_t vib_filler;
+ uint32_t vib_errno;
+} vnd_ioc_buf_t;
+
+#define VND_IOC_GETRXBUF (VND_IOC | 0x04)
+#define VND_IOC_SETRXBUF (VND_IOC | 0x05)
+#define VND_IOC_GETMAXBUF (VND_IOC | 0x06)
+#define VND_IOC_GETTXBUF (VND_IOC | 0x07)
+#define VND_IOC_SETTXBUF (VND_IOC | 0x08)
+#define VND_IOC_GETMINTU (VND_IOC | 0x09)
+#define VND_IOC_GETMAXTU (VND_IOC | 0x0a)
+
+/*
+ * Information and listing ioctls
+ *
+ * This gets information about all of the active vnd instances. vl_actents is
+ * always updated to the number of instances that exist and vl_nents is the
+ * number of vnd_ioc_info_t elements that are allocated in vl_ents.
+ */
+typedef struct vnd_ioc_info {
+ uint32_t vii_version;
+ zoneid_t vii_zone;
+ char vii_name[VND_NAMELEN];
+ char vii_datalink[VND_NAMELEN];
+} vnd_ioc_info_t;
+
+typedef struct vnd_ioc_list {
+ uint_t vl_nents;
+ uint_t vl_actents;
+ vnd_ioc_info_t *vl_ents;
+} vnd_ioc_list_t;
+
+#ifdef _KERNEL
+
+typedef struct vnd_ioc_list32 {
+ uint_t vl_nents;
+ uint_t vl_actents;
+ caddr32_t vl_ents;
+} vnd_ioc_list32_t;
+
+#endif /* _KERNEL */
+
+#define VND_IOC_LIST (VND_IOC | 0x20)
+
+/*
+ * Framed I/O ioctls
+ *
+ * Users should use the standard frameio_t as opposed to a vnd specific type.
+ * This is a consolidation private ioctl pending further stability in the form of
+ * specific system work.
+ */
+#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30)
+#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_H */
diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h
new file mode 100644
index 0000000000..89e5fc2543
--- /dev/null
+++ b/usr/src/uts/common/sys/vnd_errno.h
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VND_ERRNO_H
+#define _SYS_VND_ERRNO_H
+
+/*
+ * This header contains all of the available vnd errors.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum vnd_errno {
+ VND_E_SUCCESS = 0, /* no error */
+ VND_E_NOMEM, /* no memory */
+ VND_E_NODATALINK, /* no such datalink */
+ VND_E_NOTETHER, /* not DL_ETHER */
+ VND_E_DLPIINVAL, /* Unknown DLPI failures */
+ VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */
+ VND_E_BINDFAIL, /* DL_BIND_REQ failed */
+ VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */
+ VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */
+ VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */
+ VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */
+ VND_E_DLDBADVERS, /* bad dld version */
+ VND_E_KSTATCREATE, /* failed to create kstats */
+ VND_E_NODEV, /* no such vnd link */
+ VND_E_NONETSTACK, /* netstack doesn't exist */
+ VND_E_ASSOCIATED, /* device already associated */
+ VND_E_ATTACHED, /* device already attached */
+ VND_E_LINKED, /* device already linked */
+ VND_E_BADNAME, /* invalid name */
+ VND_E_PERM, /* can't touch this */
+ VND_E_NOZONE, /* no such zone */
+ VND_E_STRINIT, /* failed to initialize vnd stream module */
+ VND_E_NOTATTACHED, /* device not attached */
+ VND_E_NOTLINKED, /* device not linked */
+ VND_E_LINKEXISTS, /* another device has the same link name */
+ VND_E_MINORNODE, /* failed to create minor node */
+ VND_E_BUFTOOBIG, /* requested buffer size is too large */
+ VND_E_BUFTOOSMALL, /* requested buffer size is too small */
+ VND_E_DLEXCL, /* unable to get dlpi excl access */
+ VND_E_DIRECTNOTSUP,
+ /* DLD direct capability not supported over data link */
+ VND_E_BADPROPSIZE, /* invalid property size */
+ VND_E_BADPROP, /* invalid property */
+ VND_E_PROPRDONLY, /* property is read only */
+ VND_E_SYS, /* unexpected system error */
+ VND_E_CAPABPASS,
+ /* capabilities invalid, pass-through module detected */
+ VND_E_UNKNOWN /* unknown error */
+} vnd_errno_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VND_ERRNO_H */
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index 1d4b8e790d..7e95f5fa23 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -347,7 +347,8 @@ DRV_KMODS += uath
DRV_KMODS += urtw
DRV_KMODS += vgatext
DRV_KMODS += heci
-DRV_KMODS += vmxnet
+DRV_KMODS += vmxnet
+DRV_KMODS += vnd
DRV_KMODS += vnic
DRV_KMODS += vscan
DRV_KMODS += wc
@@ -590,6 +591,7 @@ MISC_KMODS += drm
MISC_KMODS += fssnap_if
MISC_KMODS += gda
MISC_KMODS += gld
+MISC_KMODS += gsqueue
MISC_KMODS += hidparser
MISC_KMODS += hook
MISC_KMODS += hpcsvc
diff --git a/usr/src/uts/intel/dev/Makefile b/usr/src/uts/intel/dev/Makefile
index b5c7c1a9c8..e7ae468c05 100644
--- a/usr/src/uts/intel/dev/Makefile
+++ b/usr/src/uts/intel/dev/Makefile
@@ -66,6 +66,7 @@ INC_PATH += -I$(UTSBASE)/common/io/bpf
CERRWARN += -_gcc=-Wno-parentheses
CERRWARN += -_gcc=-Wno-unused-label
CERRWARN += -_gcc=-Wno-uninitialized
+CERRWARN += -_gcc=-Wno-unused-function
#
# Default build targets.
diff --git a/usr/src/uts/intel/gsqueue/Makefile b/usr/src/uts/intel/gsqueue/Makefile
new file mode 100644
index 0000000000..411e384309
--- /dev/null
+++ b/usr/src/uts/intel/gsqueue/Makefile
@@ -0,0 +1,49 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+UTSBASE = ../..
+
+MODULE = gsqueue
+OBJECTS = $(GSQUEUE_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(GSQUEUE_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+include $(UTSBASE)/intel/Makefile.intel
+
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+LDFLAGS += -dy -Ndrv/ip
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/ipf/Makefile b/usr/src/uts/intel/ipf/Makefile
index 046a6c223d..e18158a43d 100644
--- a/usr/src/uts/intel/ipf/Makefile
+++ b/usr/src/uts/intel/ipf/Makefile
@@ -58,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
CPPFLAGS += -DIPFILTER_LKM -DIPFILTER_LOG -DIPFILTER_LOOKUP -DUSE_INET6
CPPFLAGS += -DSUNDDI -DSOLARIS2=$(RELEASE_MINOR) -DIRE_ILL_CN
-LDFLAGS += -dy -Ndrv/ip -Nmisc/md5 -Nmisc/neti -Nmisc/hook -Nmisc/kcf
+LDFLAGS += -dy -Ndrv/ip -Nmisc/md5 -Nmisc/neti -Nmisc/hook -Nmisc/kcf -Ndrv/vnd
INC_PATH += -I$(UTSBASE)/common/inet/ipf
diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
index 663613cee3..966d546b1a 100644
--- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64
+++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64
@@ -39,6 +39,8 @@ hook4_nicevents
hook4_nicevents_gz
hook4_out
hook4_out_gz
+hook4_vnd_in
+hook4_vnd_out
hook6_in
hook6_in_gz
hook6_loop_in
@@ -49,6 +51,8 @@ hook6_nicevents
hook6_nicevents_gz
hook6_out
hook6_out_gz
+hook6_vnd_in
+hook6_vnd_out
icmpreplytype4
icmpreplytype6
icmptoicmp6types
diff --git a/usr/src/uts/intel/vnd/Makefile b/usr/src/uts/intel/vnd/Makefile
new file mode 100644
index 0000000000..f66f89062c
--- /dev/null
+++ b/usr/src/uts/intel/vnd/Makefile
@@ -0,0 +1,56 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+UTSBASE = ../..
+
+MODULE = vnd
+OBJECTS = $(VND_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(VND_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+
+include $(UTSBASE)/intel/Makefile.intel
+
+ALL_TARGET = $(BINARY) $(SRC_CONFFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+CONF_SRCDIR = $(UTSBASE)/common/io/vnd
+
+LDFLAGS += -dy -Nmisc/neti -Nmisc/hook -Nfs/dev -Nmisc/gsqueue
+
+#
+# We use <sys/ctype.h> which causes gcc to think that all of its inline
+# functions are defined and unused.
+#
+CERRWARN += -_gcc=-Wno-unused-function
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/sparc/ipf/ipf.global-objs.debug64 b/usr/src/uts/sparc/ipf/ipf.global-objs.debug64
index 61566e8b65..26a0777190 100644
--- a/usr/src/uts/sparc/ipf/ipf.global-objs.debug64
+++ b/usr/src/uts/sparc/ipf/ipf.global-objs.debug64
@@ -23,6 +23,10 @@
# Use is subject to license terms.
#
+hook4_vnd_in
+hook4_vnd_out
+hook6_vnd_in
+hook6_vnd_out
fr_availfuncs
fr_features
fr_objbytes