author	Anders Persson <Anders.Persson@Sun.COM>	2010-06-17 17:22:09 -0700
committer	Anders Persson <Anders.Persson@Sun.COM>	2010-06-17 17:22:09 -0700
commit	3e95bd4ab92abca814bd28e854607d1975c7dc88 (patch)
tree	9f3088b26f62207198f0d44feca65b701fafb8dc
parent	8e51227711fb29b69b8f42a3953e759963432065 (diff)
PSARC/2009/590 Socket Filter Framework
6939085 Socket Filter Framework
6802067 connect_failed kernel socket callback is not triggered
6776450 time spent in tcp_close could be reduced/deferred to a worker thread
6828586 assertion failed: family == 26, file: ../../common/fs/sockfs/socksyscalls.c, line: 1608
6802078 kernel socket 'newconn' callback is passing rcv queue size as an argument
-rw-r--r--  exception_lists/packaging  4
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/Makefile  9
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/soconfig.c  212
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter  55
-rw-r--r--  usr/src/cmd/ptools/pfiles/pfiles.c  68
-rw-r--r--  usr/src/cmd/truss/expound.c  133
-rw-r--r--  usr/src/cmd/truss/print.c  26
-rw-r--r--  usr/src/cmd/truss/print.h  6
-rw-r--r--  usr/src/cmd/truss/systable.c  23
-rw-r--r--  usr/src/lib/libc/common/sys/_sockconfig.s  6
-rw-r--r--  usr/src/pkg/manifests/SUNWcs.mf  1
-rw-r--r--  usr/src/uts/common/Makefile.files  2
-rw-r--r--  usr/src/uts/common/c2/audit_event.c  64
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon.c  66
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon.h  14
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_sops.c  352
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_subr.c  317
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter.c  1770
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter_impl.h  213
-rw-r--r--  usr/src/uts/common/fs/sockfs/socknotify.c  34
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockparams.c  242
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksubr.c  15
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksyscalls.c  422
-rw-r--r--  usr/src/uts/common/fs/sockfs/socktpi.c  40
-rw-r--r--  usr/src/uts/common/fs/sockfs/socktpi.h  9
-rw-r--r--  usr/src/uts/common/fs/sockfs/sodirect.c  5
-rw-r--r--  usr/src/uts/common/inet/inetddi.c  13
-rw-r--r--  usr/src/uts/common/inet/ip/icmp.c  13
-rw-r--r--  usr/src/uts/common/inet/ip/icmpddi.c  9
-rw-r--r--  usr/src/uts/common/inet/rawip_impl.h  5
-rw-r--r--  usr/src/uts/common/inet/sockmods/socksctp.c  11
-rw-r--r--  usr/src/uts/common/inet/sockmods/socksdp.c  13
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c  455
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_fusion.c  6
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_input.c  186
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_output.c  74
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_socket.c  442
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_tpi.c  492
-rw-r--r--  usr/src/uts/common/inet/tcp/tcpddi.c  5
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h  13
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c  17
-rw-r--r--  usr/src/uts/common/inet/udp/udpddi.c  5
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h  5
-rw-r--r--  usr/src/uts/common/io/ksocket/ksocket.c  5
-rw-r--r--  usr/src/uts/common/io/sock_conf.c  7
-rw-r--r--  usr/src/uts/common/os/sysent.c  7
-rw-r--r--  usr/src/uts/common/sys/Makefile  1
-rw-r--r--  usr/src/uts/common/sys/ksocket.h  5
-rw-r--r--  usr/src/uts/common/sys/socket.h  32
-rw-r--r--  usr/src/uts/common/sys/socket_proto.h  16
-rw-r--r--  usr/src/uts/common/sys/socketvar.h  147
-rw-r--r--  usr/src/uts/common/sys/sockfilter.h  151
-rw-r--r--  usr/src/uts/common/syscall/sendfile.c  28
53 files changed, 4872 insertions, 1399 deletions
diff --git a/exception_lists/packaging b/exception_lists/packaging
index ba452253e7..58c70ec5c0 100644
--- a/exception_lists/packaging
+++ b/exception_lists/packaging
@@ -926,3 +926,7 @@ usr/lib/sparcv9/llib-lvrrpadm.ln sparc
#
opt/onbld/bin/i386/elfsign i386
opt/onbld/bin/sparc/elfsign sparc
+#
+# Private socket filter API
+#
+usr/include/sys/sockfilter.h
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile
index f7ae749660..014e4511a5 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile
@@ -20,8 +20,7 @@
#
#
-# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
#
SYNCPROG= syncinit syncloop syncstat
@@ -38,6 +37,7 @@ PROG= 6to4relay arp gettable if_mpadm \
MANIFEST= rarp.xml telnet.xml comsat.xml finger.xml \
login.xml shell.xml rexec.xml
+SVCMETHOD= svc-sockfilter
ROOTFS_PROG= hostconfig route soconfig
SBINLINKS= hostconfig route
@@ -106,7 +106,8 @@ SRCS+= $(COMMONSRCS)
#
# Message catalog
#
-POFILES= 6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po
+POFILES= 6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po \
+ soconfig.po
POFILE= usr.sbin.po
all:= TARGET= all
@@ -199,7 +200,7 @@ $(ROOTUSRSBINLINKS):
install: $(PROG) $(ROOTFS_PROG) $(SUBDIRS) .WAIT $(ROOTUSRSBINPROG) \
$(ROOTSBINPROG) $(ROOTUSRSBINLINKS) $(ROOTETCDEFAULTFILES) \
- $(ROOTMANIFEST) THIRDPARTYLICENSE.arp
+ $(ROOTMANIFEST) $(ROOTSVCMETHOD) THIRDPARTYLICENSE.arp
THIRDPARTYLICENSE.arp: arp.c
$(SED) -n '/University of California/,/SUCH DAMAGE/p' arp.c > $@
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
index b5c45f7b6f..a47d455ce3 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <stdio.h>
@@ -30,6 +29,9 @@
#include <string.h>
#include <ctype.h>
#include <locale.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <errno.h>
#define MAXLINELEN 4096
@@ -47,6 +49,15 @@
*
* soconfig <fam> <type> <protocol>
* deregisters
+ *
+ * Filter Operations (Consolidation Private):
+ *
+ * soconfig -F <name> <modname> {auto [top | bottom | before:filter |
+ * after:filter] | prog} <fam>:<type>:<proto>,...
+ * configure filter
+ *
+ * soconfig -F <name>
+ * unconfigures filter
*/
static int parse_file(char *filename);
@@ -60,6 +71,8 @@ static int parse_int(char *str);
static void usage(void);
+static int parse_filter_params(int argc, char **argv);
+
int
main(argc, argv)
int argc;
@@ -75,6 +88,11 @@ main(argc, argv)
#endif
(void) textdomain(TEXT_DOMAIN);
+ if (argc >= 2 && strcmp(argv[0], "-F") == 0) {
+ argc--; argv++;
+ ret = parse_filter_params(argc, argv);
+ exit(ret);
+ }
if (argc == 2 && strcmp(argv[0], "-f") == 0) {
ret = parse_file(argv[1]);
exit(ret);
@@ -213,7 +231,7 @@ split_line(char *line, char *argvec[], int maxargvec)
static int
parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
{
- int fam, type, protocol;
+ int cmd, fam, type, protocol;
fam = parse_int(famstr);
if (fam == -1) {
@@ -272,13 +290,17 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
}
return (1);
}
+
+ cmd = SOCKCONFIG_ADD_SOCK;
+ } else {
+ cmd = SOCKCONFIG_REMOVE_SOCK;
}
#ifdef DEBUG
- printf("not calling sockconfig(%d, %d, %d, %s)\n",
- fam, type, protocol, path == NULL ? "(null)" : path);
+ printf("not calling sockconfig(%d, %d, %d, %d, %s)\n",
+ cmd, fam, type, protocol, path == NULL ? "(null)" : path);
#else
- if (_sockconfig(fam, type, protocol, path) == -1) {
+ if (_sockconfig(cmd, fam, type, protocol, path) == -1) {
perror("sockconfig");
return (1);
}
@@ -297,3 +319,181 @@ parse_int(char *str)
return (-1);
return (res);
}
+
+/*
+ * Add and remove socket filters.
+ */
+static int
+parse_filter_params(int argc, char **argv)
+{
+ struct sockconfig_filter_props filprop;
+ sof_socktuple_t *socktuples;
+ size_t tupcnt, nalloc;
+ char *hintarg, *socktup, *tupstr;
+ int i;
+
+ if (argc == 1) {
+ if (_sockconfig(SOCKCONFIG_REMOVE_FILTER, argv[0], 0,
+ 0, 0) < 0) {
+ switch (errno) {
+ case ENXIO:
+ fprintf(stderr,
+ gettext("socket filter is not configured "
+ "'%s'\n"), argv[0]);
+ break;
+ default:
+ perror("sockconfig");
+ break;
+ }
+ return (1);
+ }
+ return (0);
+ }
+
+ if (argc < 4 || argc > 5)
+ return (1);
+
+
+ if (strlen(argv[1]) >= MODMAXNAMELEN) {
+ fprintf(stderr,
+ gettext("invalid module name '%s': name too long\n"),
+ argv[1]);
+ return (1);
+ }
+ filprop.sfp_modname = argv[1];
+
+ /* Check the attach semantics */
+ if (strcmp(argv[2], "auto") == 0) {
+ filprop.sfp_autoattach = B_TRUE;
+ if (argc == 5) {
+ /* placement hint */
+ if (strcmp(argv[3], "top") == 0) {
+ filprop.sfp_hint = SOF_HINT_TOP;
+ } else if (strcmp(argv[3], "bottom") == 0) {
+ filprop.sfp_hint = SOF_HINT_BOTTOM;
+ } else {
+ if (strncmp(argv[3], "before", 6) == 0) {
+ filprop.sfp_hint = SOF_HINT_BEFORE;
+ } else if (strncmp(argv[3], "after", 5) == 0) {
+ filprop.sfp_hint = SOF_HINT_AFTER;
+ } else {
+ fprintf(stderr,
+ gettext("invalid placement hint "
+ "'%s'\n"), argv[3]);
+ return (1);
+ }
+
+ hintarg = strchr(argv[3], ':');
+ if (hintarg == NULL ||
+ (strlen(++hintarg) == 0) ||
+ (strlen(hintarg) >= FILNAME_MAX)) {
+ fprintf(stderr,
+ gettext("invalid placement hint "
+ "argument '%s': name too long\n"),
+ argv[3]);
+ return (1);
+ }
+
+ filprop.sfp_hintarg = hintarg;
+ }
+ } else {
+ filprop.sfp_hint = SOF_HINT_NONE;
+ }
+ } else if (strcmp(argv[2], "prog") == 0) {
+ filprop.sfp_autoattach = B_FALSE;
+ filprop.sfp_hint = SOF_HINT_NONE;
+ /* cannot specify placement hint for programmatic filter */
+ if (argc == 5) {
+ fprintf(stderr,
+ gettext("placement hint specified for programmatic "
+ "filter\n"));
+ return (1);
+ }
+ } else {
+ fprintf(stderr, gettext("invalid attach semantic '%s'\n"),
+ argv[2]);
+ return (1);
+ }
+
+ /* parse the socket tuples */
+ nalloc = 4;
+ socktuples = calloc(nalloc, sizeof (sof_socktuple_t));
+ if (socktuples == NULL) {
+ perror("calloc");
+ return (1);
+ }
+
+ tupcnt = 0;
+ tupstr = argv[(argc == 4) ? 3 : 4];
+ while ((socktup = strsep(&tupstr, ",")) != NULL) {
+ int val;
+ char *valstr;
+
+ if (tupcnt == nalloc) {
+ sof_socktuple_t *new;
+
+ nalloc *= 2;
+ new = realloc(socktuples,
+ nalloc * sizeof (sof_socktuple_t));
+ if (new == NULL) {
+ perror("realloc");
+ free(socktuples);
+ return (1);
+ }
+ socktuples = new;
+ }
+ i = 0;
+ while ((valstr = strsep(&socktup, ":")) != NULL && i < 3) {
+ val = parse_int(valstr);
+ if (val == -1) {
+ fprintf(stderr, gettext("bad socket tuple\n"));
+ free(socktuples);
+ return (1);
+ }
+ switch (i) {
+ case 0: socktuples[tupcnt].sofst_family = val; break;
+ case 1: socktuples[tupcnt].sofst_type = val; break;
+ case 2: socktuples[tupcnt].sofst_protocol = val; break;
+ }
+ i++;
+ }
+ if (i != 3) {
+ fprintf(stderr, gettext("bad socket tuple\n"));
+ free(socktuples);
+ return (1);
+ }
+ tupcnt++;
+ }
+ if (tupcnt == 0) {
+ fprintf(stderr, gettext("no socket tuples specified\n"));
+ free(socktuples);
+ return (1);
+ }
+ filprop.sfp_socktuple_cnt = tupcnt;
+ filprop.sfp_socktuple = socktuples;
+
+ if (_sockconfig(SOCKCONFIG_ADD_FILTER, argv[0], &filprop, 0, 0) < 0) {
+ switch (errno) {
+ case EINVAL:
+ fprintf(stderr,
+ gettext("invalid socket filter configuration\n"));
+ break;
+ case EEXIST:
+ fprintf(stderr,
+ gettext("socket filter is already configured "
+ "'%s'\n"), argv[0]);
+ break;
+ case ENOSPC:
+ fprintf(stderr, gettext("unable to satisfy placement "
+ "constraint\n"));
+ break;
+ default:
+ perror("sockconfig");
+ break;
+ }
+ free(socktuples);
+ return (1);
+ }
+ free(socktuples);
+ return (0);
+}
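For reference, the filter configuration syntax parsed by parse_filter_params() above corresponds to invocations along these lines (a rough sketch; the filter and module names are hypothetical, and the numeric tuples are <family>:<type>:<protocol>, e.g. 2:2:6 for AF_INET/SOCK_STREAM/IPPROTO_TCP and 26:2:6 for the AF_INET6 equivalent):

soconfig -F httpfilt httpf auto top 2:2:6,26:2:6
soconfig -F httpfilt httpf prog 2:2:6
soconfig -F httpfilt

The first form configures an automatically attached filter at the top of the filter stack, the second a programmatic filter that is only attached on demand by an application, and the last unconfigures the filter.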
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter b/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter
new file mode 100644
index 0000000000..8df5ab52d8
--- /dev/null
+++ b/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter
@@ -0,0 +1,55 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+. /lib/svc/share/smf_include.sh
+
+filter_name=`svcprop -p socket-filter/name $SMF_FMRI 2>/dev/null`
+if [ -z "$filter_name" ]; then
+ echo "socket-filter/name is missing"
+ exit $SMF_EXIT_ERR_CONFIG
+fi
+
+case "$1" in
+start)
+ mod_name=`svcprop -p socket-filter/module_name $SMF_FMRI 2>/dev/null`
+ type=`svcprop -p socket-filter/attach_semantics $SMF_FMRI 2>/dev/null`
+ order=`svcprop -p socket-filter/order_hint $SMF_FMRI 2>/dev/null`
+ socktups=`svcprop -p socket-filter/socket_tuples $SMF_FMRI 2>/dev/null`
+
+ /sbin/soconfig -F $filter_name $mod_name $type $order $socktups
+ if [ $? -ne 0 ]; then
+ exit $SMF_EXIT_ERR_FATAL
+ fi
+ ;;
+stop)
+ /sbin/soconfig -F $filter_name
+ ;;
+*)
+ echo "Usage: $0 { start | stop }"
+ exit 1
+ ;;
+esac
+
+exit $SMF_EXIT_OK
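A minimal sketch of how an SMF instance driving this method script might be populated (the FMRI, instance name, and property values are assumptions for illustration only):

svccfg -s svc:/network/socket-filter:httpfilt addpg socket-filter application
svccfg -s svc:/network/socket-filter:httpfilt setprop socket-filter/name = astring: httpfilt
svccfg -s svc:/network/socket-filter:httpfilt setprop socket-filter/module_name = astring: httpf
svccfg -s svc:/network/socket-filter:httpfilt setprop socket-filter/attach_semantics = astring: auto
svccfg -s svc:/network/socket-filter:httpfilt setprop socket-filter/order_hint = astring: top
svccfg -s svc:/network/socket-filter:httpfilt setprop socket-filter/socket_tuples = astring: 2:2:6
svcadm refresh svc:/network/socket-filter:httpfilt
svcadm enable svc:/network/socket-filter:httpfilt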
diff --git a/usr/src/cmd/ptools/pfiles/pfiles.c b/usr/src/cmd/ptools/pfiles/pfiles.c
index 50e82ad34a..5bd1373a1d 100644
--- a/usr/src/cmd/ptools/pfiles/pfiles.c
+++ b/usr/src/cmd/ptools/pfiles/pfiles.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <stdio.h>
@@ -650,6 +649,70 @@ show_sockopts(struct ps_prochandle *Pr, int fd)
(void) printf("\t%s\n", buf+1);
}
+#define MAXNALLOC 32
+static void
+show_sockfilters(struct ps_prochandle *Pr, int fd)
+{
+ struct fil_info *fi;
+ int i = 0, nalloc = 2, len = nalloc * sizeof (*fi);
+ boolean_t printhdr = B_TRUE;
+
+ fi = calloc(nalloc, sizeof (*fi));
+ if (fi == NULL) {
+ perror("calloc");
+ return;
+ }
+ /* CONSTCOND */
+ while (1) {
+ if (pr_getsockopt(Pr, fd, SOL_FILTER, FIL_LIST, fi, &len) != 0)
+ break;
+ /* No filters */
+ if (len == 0)
+ break;
+ /* Make sure buffer was large enough */
+ if (fi->fi_pos >= nalloc) {
+ struct fil_info *new;
+
+ nalloc = fi->fi_pos + 1;
+ if (nalloc > MAXNALLOC)
+ break;
+ len = nalloc * sizeof (*fi);
+ new = realloc(fi, nalloc * sizeof (*fi));
+ if (new == NULL) {
+ perror("realloc");
+ break;
+ }
+ fi = new;
+ continue;
+ }
+
+ for (i = 0; (i + 1) * sizeof (*fi) <= len; i++) {
+ if (fi[i].fi_flags & FILF_BYPASS)
+ continue;
+ if (printhdr) {
+ (void) printf("\tfilters: ");
+ printhdr = B_FALSE;
+ }
+ (void) printf("%s", fi[i].fi_name);
+ if (fi[i].fi_flags != 0) {
+ (void) printf("(");
+ if (fi[i].fi_flags & FILF_AUTO)
+ (void) printf("auto,");
+ if (fi[i].fi_flags & FILF_PROG)
+ (void) printf("prog,");
+ (void) printf("\b)");
+ }
+ if (fi[i].fi_pos == 0) /* last one */
+ break;
+ (void) printf(",");
+ }
+ if (!printhdr)
+ (void) printf("\n");
+ break;
+ }
+ free(fi);
+}
+
/* the file is a socket */
static void
dosocket(struct ps_prochandle *Pr, int fd)
@@ -666,6 +729,7 @@ dosocket(struct ps_prochandle *Pr, int fd)
show_socktype((uint_t)type);
show_sockopts(Pr, fd);
+ show_sockfilters(Pr, fd);
len = sizeof (buf);
if (pr_getsockname(Pr, fd, sa, &len) == 0)
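Given the printf calls in show_sockfilters() above, the extra pfiles output for a socket with filters attached would look roughly like this (filter names are made up; entries with FILF_BYPASS set are skipped):

	filters: httpfilt(auto),sslfilt(prog)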
diff --git a/usr/src/cmd/truss/expound.c b/usr/src/cmd/truss/expound.c
index d78cbecad5..78c13fc4dc 100644
--- a/usr/src/cmd/truss/expound.c
+++ b/usr/src/cmd/truss/expound.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -90,6 +89,7 @@
#include <sys/nvpair.h>
#include <libnvpair.h>
#include <sys/rctl_impl.h>
+#include <sys/socketvar.h>
#include "ramdata.h"
#include "systable.h"
@@ -4721,6 +4721,132 @@ show_utimesys(private_t *pri)
}
}
+#ifdef _LP64
+static void
+show_sockconfig_filter_prop32(private_t *pri, long addr)
+{
+ struct sockconfig_filter_props32 props;
+ const char *s = NULL;
+ char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+ sof_socktuple32_t *tup;
+ size_t sz;
+ int i;
+
+ if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+ if (Pread_string(Proc, buf, sizeof (buf),
+ (uintptr_t)props.sfp_modname) == -1)
+ (void) strcpy(buf, "<?>");
+ (void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+ (void) printf("%s\tattach semantics: %s", pri->pname,
+ props.sfp_autoattach ? "automatic" : "programmatic");
+ if (props.sfp_autoattach) {
+ buf[0] = '\0';
+ switch (props.sfp_hint) {
+ case SOF_HINT_TOP: s = "top"; break;
+ case SOF_HINT_BOTTOM: s = "bottom"; break;
+ case SOF_HINT_BEFORE:
+ case SOF_HINT_AFTER:
+ s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+ "before" : "after";
+ if (Pread_string(Proc, buf, sizeof (buf),
+ (uintptr_t)props.sfp_hintarg) == -1)
+ (void) strcpy(buf, "<?>");
+ }
+ if (s != NULL) {
+ (void) printf(", placement: %s %s", s, buf);
+ }
+ }
+ (void) printf("\n");
+ (void) printf("%s\tsocket tuples:\n", pri->pname);
+ if (props.sfp_socktuple_cnt == 0) {
+ (void) printf("\t\t<empty>\n");
+ return;
+ }
+ sz = props.sfp_socktuple_cnt * sizeof (*tup);
+ tup = my_malloc(sz, "socket tuple buffer");
+ if (Pread(Proc, tup, sz, (uintptr_t)props.sfp_socktuple) == sz)
+ for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+ (void) printf(
+ "\t\tfamily: %d, type: %d, proto: %d\n",
+ tup[i].sofst_family, tup[i].sofst_type,
+ tup[i].sofst_protocol);
+ }
+ }
+}
+#endif /* _LP64 */
+static void
+show_sockconfig_filter_prop(private_t *pri, long addr)
+{
+ struct sockconfig_filter_props props;
+ const char *s = NULL;
+ char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+ sof_socktuple_t *tup;
+ size_t sz;
+ int i;
+
+ if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+ if (Pread_string(Proc, buf, sizeof (buf),
+ (uintptr_t)props.sfp_modname) == -1)
+ (void) strcpy(buf, "<?>");
+ (void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+ (void) printf("%s\tattach semantics: %s", pri->pname,
+ props.sfp_autoattach ? "automatic" : "programmatic");
+ if (props.sfp_autoattach) {
+ buf[0] = '\0';
+ switch (props.sfp_hint) {
+ case SOF_HINT_TOP: s = "top"; break;
+ case SOF_HINT_BOTTOM: s = "bottom"; break;
+ case SOF_HINT_BEFORE:
+ case SOF_HINT_AFTER:
+ s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+ "before" : "after";
+ if (Pread_string(Proc, buf, sizeof (buf),
+ (uintptr_t)props.sfp_hintarg) == -1)
+ (void) strcpy(buf, "<?>");
+ }
+ if (s != NULL) {
+ (void) printf(", placement: %s", s);
+ }
+ }
+ (void) printf("\n");
+ (void) printf("%s\tsocket tuples:\n", pri->pname);
+ if (props.sfp_socktuple_cnt == 0) {
+ (void) printf("\t\t<empty>\n");
+ return;
+ }
+ sz = props.sfp_socktuple_cnt * sizeof (*tup);
+ tup = my_malloc(sz, "socket tuple buffer");
+ if (Pread(Proc, tup, sz, (uintptr_t)props.sfp_socktuple) == sz)
+ for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+ (void) printf(
+ "\t\tfamily: %d, type: %d, proto: %d\n",
+ tup[i].sofst_family, tup[i].sofst_type,
+ tup[i].sofst_protocol);
+ }
+ }
+}
+
+void
+show_sockconfig(private_t *pri)
+{
+ switch (pri->sys_args[0]) {
+ case SOCKCONFIG_ADD_FILTER:
+#ifdef _LP64
+ if (data_model == PR_MODEL_LP64)
+ show_sockconfig_filter_prop(pri,
+ (long)pri->sys_args[2]);
+ else
+ show_sockconfig_filter_prop32(pri,
+ (long)pri->sys_args[2]);
+#else
+ show_sockconfig_filter_prop(pri, (long)pri->sys_args[2]);
+#endif
+ break;
+ default:
+ break;
+ }
+}
+
/* expound verbosely upon syscall arguments */
/*ARGSUSED*/
void
@@ -5199,5 +5325,8 @@ expound(private_t *pri, long r0, int raw)
case SYS_utimesys:
show_utimesys(pri);
break;
+ case SYS_sockconfig:
+ show_sockconfig(pri);
+ break;
}
}
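Derived from the format strings in show_sockconfig_filter_prop() above, the verbose truss output for a SOCKCONFIG_ADD_FILTER call would look roughly as follows (module name, placement, and tuple values are hypothetical):

	module name: httpf
	attach semantics: automatic, placement: top
	socket tuples:
		family: 2, type: 2, proto: 6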
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 5de1342c0e..1a92777c28 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -1649,7 +1649,32 @@ prt_pfm(private_t *pri, int raw, long val)
}
}
+/*
+ * Print sockconfig() subcode.
+ */
+/*ARGSUSED*/
+void
+prt_skc(private_t *pri, int raw, long val)
+{
+ const char *s = NULL;
+ if (!raw) {
+ switch (val) {
+ case SOCKCONFIG_ADD_SOCK:
+ s = "SOCKCONFIG_ADD_SOCK"; break;
+ case SOCKCONFIG_REMOVE_SOCK:
+ s = "SOCKCONFIG_REMOVE_SOCK"; break;
+ case SOCKCONFIG_ADD_FILTER:
+ s = "SOCKCONFIG_ADD_FILTER"; break;
+ case SOCKCONFIG_REMOVE_FILTER:
+ s = "SOCKCONFIG_REMOVE_FILTER"; break;
+ }
+ }
+ if (s == NULL)
+ prt_dec(pri, 0, val);
+ else
+ outstring(pri, s);
+}
/*
* Print so_socket() 2nd argument.
*/
@@ -2709,5 +2734,6 @@ void (* const Print[])() = {
prt_un1, /* UN1 -- as prt_uns except for -1 */
prt_mob, /* MOB -- print mmapobj() flags */
prt_utf, /* UTF -- print utimensat() flag */
+ prt_skc, /* SKC -- print sockconfig() subcode */
prt_dec, /* HID -- hidden argument, make this the last one */
};
diff --git a/usr/src/cmd/truss/print.h b/usr/src/cmd/truss/print.h
index 159b2fbe58..7a190f9cab 100644
--- a/usr/src/cmd/truss/print.h
+++ b/usr/src/cmd/truss/print.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -135,7 +134,8 @@ extern "C" {
#define UN1 95 /* unsigned except for -1 */
#define MOB 96 /* print mmapobj() flags */
#define UTF 97 /* print utimensat() flag */
-#define HID 98 /* hidden argument, don't print */
+#define SKC 98 /* print sockconfig subcode */
+#define HID 99 /* hidden argument, don't print */
/* make sure HID is always the last member */
/*
diff --git a/usr/src/cmd/truss/systable.c b/usr/src/cmd/truss/systable.c
index fe49984a29..b8bdbe6af5 100644
--- a/usr/src/cmd/truss/systable.c
+++ b/usr/src/cmd/truss/systable.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -464,7 +463,7 @@ const struct systable systable[] = {
{"getsockname", 4, DEC, NOV, DEC, HEX, HEX, SKV}, /* 244 */
{"getsockopt", 6, DEC, NOV, DEC, SOL, SON, HEX, HEX, SKV}, /* 245 */
{"setsockopt", 6, DEC, NOV, DEC, SOL, SON, HEX, DEC, SKV}, /* 246 */
-{"sockconfig", 4, DEC, NOV, DEC, DEC, DEC, STG}, /* 247 */
+{"sockconfig", 5, DEC, NOV, DEC, HEX, HEX, HEX, HEX}, /* 247 */
{"ntp_gettime", 1, DEC, NOV, HEX}, /* 248 */
{"ntp_adjtime", 1, DEC, NOV, HEX}, /* 249 */
{"lwp_mutex_unlock", 1, DEC, NOV, HEX}, /* 250 */
@@ -873,6 +872,14 @@ const struct systable utimesystable[] = {
};
#define NUTIMESYSCODE (sizeof (utimesystable) / sizeof (struct systable))
+const struct systable sockconfigtable[] = {
+{"sockconfig", 5, DEC, NOV, SKC, DEC, DEC, DEC, STG}, /* 0 */
+{"sockconfig", 4, DEC, NOV, SKC, DEC, DEC, DEC}, /* 1 */
+{"sockconfig", 3, DEC, NOV, SKC, STG, HEX }, /* 2 */
+{"sockconfig", 2, DEC, NOV, SKC, STG }, /* 3 */
+};
+#define NSOCKCONFIGCODE (sizeof (sockconfigtable) / sizeof (struct systable))
+
const struct sysalias sysalias[] = {
{ "exit", SYS_exit },
{ "fork", SYS_forksys },
@@ -1204,6 +1211,10 @@ subsys(int syscall, int subcode)
if ((unsigned)subcode < NUTIMESYSCODE)
stp = &utimesystable[subcode];
break;
+ case SYS_sockconfig: /* sockconfig family */
+ if ((unsigned)subcode < NSOCKCONFIGCODE)
+ stp = &sockconfigtable[subcode];
+ break;
}
}
@@ -1383,6 +1394,7 @@ getsubcode(private_t *pri)
case SYS_rctlsys: /* rctlsys */
case SYS_sidsys: /* sidsys */
case SYS_utimesys: /* utimesys */
+ case SYS_sockconfig: /* sockconfig */
subcode = arg0;
break;
case SYS_fcntl: /* fcntl() */
@@ -1453,7 +1465,8 @@ maxsyscalls()
+ NRCTLCODE - 1
+ NFORKCODE - 1
+ NSIDSYSCODE - 1
- + NUTIMESYSCODE - 1);
+ + NUTIMESYSCODE - 1
+ + NSOCKCONFIGCODE - 1);
}
/*
@@ -1545,6 +1558,8 @@ nsubcodes(int syscall)
return (NSIDSYSCODE);
case SYS_utimesys:
return (NUTIMESYSCODE);
+ case SYS_sockconfig:
+ return (NSOCKCONFIGCODE);
default:
return (1);
}
diff --git a/usr/src/lib/libc/common/sys/_sockconfig.s b/usr/src/lib/libc/common/sys/_sockconfig.s
index 4ee709ee1b..9c939f0d52 100644
--- a/usr/src/lib/libc/common/sys/_sockconfig.s
+++ b/usr/src/lib/libc/common/sys/_sockconfig.s
@@ -23,16 +23,14 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
*/
.file "_sockconfig.s"
/* C library -- _sockconfig */
/*
- * int _sockconfig (int domain, int type, int protocol,
- * dev_t dev, int version);
+ * int _sockconfig (int cmd, void *arg1, void *arg2, void *arg3, void *arg4);
*/
#include "SYS.h"
diff --git a/usr/src/pkg/manifests/SUNWcs.mf b/usr/src/pkg/manifests/SUNWcs.mf
index 6a92f9b5d8..4035949864 100644
--- a/usr/src/pkg/manifests/SUNWcs.mf
+++ b/usr/src/pkg/manifests/SUNWcs.mf
@@ -629,6 +629,7 @@ file path=lib/svc/method/svc-hotplug mode=0555
file path=lib/svc/method/svc-legacy-routing mode=0555
file path=lib/svc/method/svc-nscd mode=0555
file path=lib/svc/method/svc-rbac mode=0555
+file path=lib/svc/method/svc-sockfilter mode=0555
file path=lib/svc/method/svc-utmpd mode=0555
file path=lib/svc/method/system-log mode=0555
file path=lib/svc/method/vtdaemon mode=0555
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 265ad592b7..7ba9696a61 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1263,7 +1263,7 @@ SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \
sockcommon_sops.o sockcommon.o \
sock_notsupp.o socknotify.o \
nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \
- nl7cnca.o sodirect.o
+ nl7cnca.o sodirect.o sockfilter.o
TMPFS_OBJS += tmp_dir.o tmp_subr.o tmp_tnode.o tmp_vfsops.o \
tmp_vnops.o
diff --git a/usr/src/uts/common/c2/audit_event.c b/usr/src/uts/common/c2/audit_event.c
index 440db9dd3f..68975f00aa 100644
--- a/usr/src/uts/common/c2/audit_event.c
+++ b/usr/src/uts/common/c2/audit_event.c
@@ -3822,38 +3822,60 @@ aus_sockconfig(tad)
struct t_audit_data *tad;
{
struct a {
- long domain;
- long type;
- long protocol;
- long devpath;
+ long cmd;
+ long arg1;
+ long arg2;
+ long arg3;
+ long arg4;
} *uap = (struct a *)ttolwp(curthread)->lwp_ap;
- char *kdevpath;
- int kdevpathlen = MAXPATHLEN + 1;
+ char *buf;
+ int buflen;
size_t size;
- au_uwrite(au_to_arg32(1, "domain", (uint32_t)uap->domain));
- au_uwrite(au_to_arg32(2, "type", (uint32_t)uap->type));
- au_uwrite(au_to_arg32(3, "protocol", (uint32_t)uap->protocol));
+ au_uwrite(au_to_arg32(1, "cmd", (uint_t)uap->cmd));
+ switch (uap->cmd) {
+ case SOCKCONFIG_ADD_SOCK:
+ case SOCKCONFIG_REMOVE_SOCK:
+ au_uwrite(au_to_arg32(2, "domain", (uint32_t)uap->arg1));
+ au_uwrite(au_to_arg32(3, "type", (uint32_t)uap->arg2));
+ au_uwrite(au_to_arg32(4, "protocol", (uint32_t)uap->arg3));
+
+ if (uap->arg4 == 0) {
+ au_uwrite(au_to_arg32(5, "devpath", (uint32_t)0));
+ } else {
+ buflen = MAXPATHLEN + 1;
+ buf = kmem_alloc(buflen, KM_SLEEP);
+ if (copyinstr((caddr_t)uap->arg4, buf, buflen,
+ &size)) {
+ kmem_free(buf, buflen);
+ return;
+ }
- if (uap->devpath == 0) {
- au_uwrite(au_to_arg32(3, "devpath", (uint32_t)0));
- } else {
- kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP);
+ if (size > MAXPATHLEN) {
+ kmem_free(buf, buflen);
+ return;
+ }
- if (copyinstr((caddr_t)uap->devpath, kdevpath, kdevpathlen,
- &size)) {
- kmem_free(kdevpath, kdevpathlen);
- return;
+ au_uwrite(au_to_text(buf));
+ kmem_free(buf, buflen);
}
+ break;
+ case SOCKCONFIG_ADD_FILTER:
+ case SOCKCONFIG_REMOVE_FILTER:
+ buflen = FILNAME_MAX;
+ buf = kmem_alloc(buflen, KM_SLEEP);
- if (size > MAXPATHLEN) {
- kmem_free(kdevpath, kdevpathlen);
+ if (copyinstr((caddr_t)uap->arg1, buf, buflen, &size)) {
+ kmem_free(buf, buflen);
return;
}
- au_uwrite(au_to_text(kdevpath));
- kmem_free(kdevpath, kdevpathlen);
+ au_uwrite(au_to_text(buf));
+ kmem_free(buf, buflen);
+ break;
+ default:
+ break;
}
}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
index e92e72f8dc..703e26ea61 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -45,6 +44,7 @@
#include <inet/ipclassifier.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
@@ -216,7 +216,7 @@ socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
* Active open.
*/
int
-socket_connect(struct sonode *so, const struct sockaddr *name,
+socket_connect(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int fflag, int flags, cred_t *cr)
{
int error;
@@ -471,14 +471,23 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags)
so->so_rcv_timer_tid = 0;
so->so_rcv_thresh = 0;
- so->so_acceptq_head = NULL;
- so->so_acceptq_tail = &so->so_acceptq_head;
- so->so_acceptq_next = NULL;
+ list_create(&so->so_acceptq_list, sizeof (struct sonode),
+ offsetof(struct sonode, so_acceptq_node));
+ list_create(&so->so_acceptq_defer, sizeof (struct sonode),
+ offsetof(struct sonode, so_acceptq_node));
+ list_link_init(&so->so_acceptq_node);
so->so_acceptq_len = 0;
so->so_backlog = 0;
+ so->so_listener = NULL;
so->so_snd_qfull = B_FALSE;
+ so->so_filter_active = 0;
+ so->so_filter_tx = 0;
+ so->so_filter_defertime = 0;
+ so->so_filter_top = NULL;
+ so->so_filter_bottom = NULL;
+
mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
@@ -509,9 +518,15 @@ sonode_destructor(void *buf, void *cdrarg)
ASSERT(so->so_rcv_q_head == NULL);
- ASSERT(so->so_acceptq_head == NULL);
- ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
- ASSERT(so->so_acceptq_next == NULL);
+ list_destroy(&so->so_acceptq_list);
+ list_destroy(&so->so_acceptq_defer);
+ ASSERT(!list_link_active(&so->so_acceptq_node));
+ ASSERT(so->so_listener == NULL);
+
+ ASSERT(so->so_filter_active == 0);
+ ASSERT(so->so_filter_tx == 0);
+ ASSERT(so->so_filter_top == NULL);
+ ASSERT(so->so_filter_bottom == NULL);
ASSERT(vp->v_data == so);
ASSERT(vn_matchops(vp, socket_vnodeops));
@@ -581,21 +596,11 @@ sonode_init(struct sonode *so, struct sockparams *sp, int family,
so->so_copyflag = 0;
- ASSERT(so->so_acceptq_head == NULL);
- ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
- ASSERT(so->so_acceptq_next == NULL);
-
vn_reinit(vp);
vp->v_vfsp = rootvfs;
vp->v_type = VSOCK;
vp->v_rdev = sockdev;
- so->so_rcv_queued = 0;
- so->so_rcv_q_head = NULL;
- so->so_rcv_q_last_head = NULL;
- so->so_rcv_head = NULL;
- so->so_rcv_last_head = NULL;
-
so->so_snd_qfull = B_FALSE;
so->so_minpsz = 0;
@@ -620,7 +625,6 @@ sonode_init(struct sonode *so, struct sockparams *sp, int family,
void
sonode_fini(struct sonode *so)
{
- mblk_t *mp;
vnode_t *vp;
ASSERT(so->so_count == 0);
@@ -631,15 +635,6 @@ sonode_fini(struct sonode *so)
so->so_rcv_timer_tid = 0;
}
- so_acceptq_flush(so, B_FALSE);
-
- if ((mp = so->so_oobmsg) != NULL) {
- freemsg(mp);
- so->so_oobmsg = NULL;
- so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
- SS_RCVATMARK);
- }
-
if (so->so_poll_list.ph_list != NULL) {
pollwakeup(&so->so_poll_list, POLLERR);
pollhead_clean(&so->so_poll_list);
@@ -655,4 +650,17 @@ sonode_fini(struct sonode *so)
crfree(so->so_peercred);
so->so_peercred = NULL;
}
+ /* Detach and destroy filters */
+ if (so->so_filter_top != NULL)
+ sof_sonode_cleanup(so);
+
+ ASSERT(list_is_empty(&so->so_acceptq_list));
+ ASSERT(list_is_empty(&so->so_acceptq_defer));
+ ASSERT(!list_link_active(&so->so_acceptq_node));
+
+ ASSERT(so->so_rcv_queued == 0);
+ ASSERT(so->so_rcv_q_head == NULL);
+ ASSERT(so->so_rcv_q_last_head == NULL);
+ ASSERT(so->so_rcv_head == NULL);
+ ASSERT(so->so_rcv_last_head == NULL);
}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h
index fac10a8935..d4e1883b1d 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.h
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SOCKCOMMON_H_
@@ -54,7 +53,7 @@ extern int socket_bind(struct sonode *, struct sockaddr *, socklen_t, int,
struct cred *);
extern int socket_accept(struct sonode *, int, struct cred *, struct sonode **);
extern int socket_listen(struct sonode *, int, struct cred *);
-extern int socket_connect(struct sonode *, const struct sockaddr *,
+extern int socket_connect(struct sonode *, struct sockaddr *,
socklen_t, int, int, struct cred *);
extern int socket_getpeername(struct sonode *, struct sockaddr *, socklen_t *,
boolean_t, struct cred *);
@@ -120,7 +119,7 @@ extern int so_accept(struct sonode *, int, struct cred *, struct sonode **);
extern int so_bind(struct sonode *, struct sockaddr *, socklen_t, int,
struct cred *);
extern int so_listen(struct sonode *, int, struct cred *);
-extern int so_connect(struct sonode *, const struct sockaddr *,
+extern int so_connect(struct sonode *, struct sockaddr *,
socklen_t, int, int, struct cred *);
extern int so_getsockopt(struct sonode *, int, int, void *,
socklen_t *, int, struct cred *);
@@ -136,6 +135,8 @@ extern int so_poll(struct sonode *, short, int, short *,
struct pollhead **);
extern int so_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
struct cred *);
+extern int so_sendmblk_impl(struct sonode *, struct nmsghdr *, int,
+ struct cred *, mblk_t **, struct sof_instance *, boolean_t);
extern int so_sendmblk(struct sonode *, struct nmsghdr *, int,
struct cred *, mblk_t **);
extern int so_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
@@ -153,6 +154,8 @@ extern void so_set_prop(sock_upper_handle_t,
struct sock_proto_props *);
extern ssize_t so_queue_msg(sock_upper_handle_t, mblk_t *, size_t, int,
int *, boolean_t *);
+extern ssize_t so_queue_msg_impl(struct sonode *, mblk_t *, size_t, int,
+ int *, boolean_t *, struct sof_instance *);
extern void so_signal_oob(sock_upper_handle_t, ssize_t);
extern void so_connected(sock_upper_handle_t, sock_connid_t, struct cred *,
@@ -183,6 +186,7 @@ extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *,
rval_t *, int);
extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t);
extern void so_process_new_message(struct sonode *, mblk_t *, mblk_t *);
+extern void so_check_flow_control(struct sonode *);
extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *);
extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *);
@@ -213,7 +217,7 @@ extern int so_get_mod_version(struct sockparams *);
/* Notification functions */
extern void so_notify_connected(struct sonode *);
extern void so_notify_disconnecting(struct sonode *);
-extern void so_notify_disconnected(struct sonode *, int);
+extern void so_notify_disconnected(struct sonode *, boolean_t, int);
extern void so_notify_writable(struct sonode *);
extern void so_notify_data(struct sonode *, size_t);
extern void so_notify_oobsig(struct sonode *);
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 64ea59c4b5..bf5fcdeb08 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -46,6 +46,7 @@
#include <inet/ip.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <sys/socket_proto.h>
@@ -59,7 +60,7 @@
extern int xnet_skip_checks;
extern int xnet_check_print;
-static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
+static void so_queue_oob(struct sonode *, mblk_t *, size_t);
/*ARGSUSED*/
@@ -291,8 +292,11 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
}
dobind:
- error = (*so->so_downcalls->sd_bind)
- (so->so_proto_handle, name, namelen, cr);
+ if (so->so_filter_active == 0 ||
+ (error = sof_filter_bind(so, name, &namelen, cr)) < 0) {
+ error = (*so->so_downcalls->sd_bind)
+ (so->so_proto_handle, name, namelen, cr);
+ }
done:
SO_UNBLOCK_FALLBACK(so);
@@ -307,8 +311,10 @@ so_listen(struct sonode *so, int backlog, struct cred *cr)
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
- error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
- cr);
+ if ((so)->so_filter_active == 0 ||
+ (error = sof_filter_listen(so, &backlog, cr)) < 0)
+ error = (*so->so_downcalls->sd_listen)(so->so_proto_handle,
+ backlog, cr);
SO_UNBLOCK_FALLBACK(so);
@@ -317,7 +323,7 @@ so_listen(struct sonode *so, int backlog, struct cred *cr)
int
-so_connect(struct sonode *so, const struct sockaddr *name,
+so_connect(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int fflag, int flags, struct cred *cr)
{
int error = 0;
@@ -339,12 +345,16 @@ so_connect(struct sonode *so, const struct sockaddr *name,
goto done;
}
- error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
- name, namelen, &id, cr);
-
- if (error == EINPROGRESS)
- error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
+ if (so->so_filter_active == 0 ||
+ (error = sof_filter_connect(so, (struct sockaddr *)name,
+ &namelen, cr)) < 0) {
+ error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
+ name, namelen, &id, cr);
+ if (error == EINPROGRESS)
+ error = so_wait_connected(so,
+ fflag & (FNONBLOCK|FNDELAY), id);
+ }
done:
SO_UNBLOCK_FALLBACK(so);
return (error);
@@ -371,9 +381,10 @@ so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
ASSERT(nso != NULL);
/* finish the accept */
- error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
- nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
- if (error != 0) {
+ if ((so->so_filter_active > 0 &&
+ (error = sof_filter_accept(nso, cr)) > 0) ||
+ (error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
+ nso->so_proto_handle, (sock_upper_handle_t)nso, cr)) != 0) {
(void) socket_close(nso, 0, cr);
socket_destroy(nso);
} else {
@@ -442,7 +453,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
error = EOPNOTSUPP;
break;
}
- } else if (so->so_snd_qfull) {
+ } else if (SO_SND_FLOWCTRLD(so)) {
/*
* Need to wait until the protocol is ready to receive
* more data for transmission.
@@ -474,6 +485,13 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
}
ASSERT(uiop->uio_resid >= 0);
+ if (so->so_filter_active > 0 &&
+ ((mp = SOF_FILTER_DATA_OUT(so, mp, msg, cr,
+ &error)) == NULL)) {
+ if (error != 0)
+ break;
+ continue;
+ }
error = (*so->so_downcalls->sd_send)
(so->so_proto_handle, mp, msg, cr);
if (error != 0) {
@@ -495,27 +513,23 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
}
int
-so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
- struct cred *cr, mblk_t **mpp)
+so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag,
+ struct cred *cr, mblk_t **mpp, sof_instance_t *fil,
+ boolean_t fil_inject)
{
int error;
boolean_t dontblock;
size_t size;
mblk_t *mp = *mpp;
- SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
+ if (so->so_downcalls->sd_send == NULL)
+ return (EOPNOTSUPP);
error = 0;
dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
(fflag & (FNONBLOCK|FNDELAY));
size = msgdsize(mp);
- if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
- so->so_downcalls->sd_send == NULL) {
- SO_UNBLOCK_FALLBACK(so);
- return (EOPNOTSUPP);
- }
-
if ((so->so_mode & SM_ATOMIC) &&
size > so->so_proto_props.sopp_maxpsz &&
so->so_proto_props.sopp_maxpsz != -1) {
@@ -538,7 +552,8 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
if (error != 0)
break;
}
- if (so->so_snd_qfull) {
+ /* Socket filters are not flow controlled */
+ if (SO_SND_FLOWCTRLD(so) && !fil_inject) {
/*
* Need to wait until the protocol is ready to receive
* more data for transmission.
@@ -564,6 +579,14 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
nmp = nmp->b_cont;
}
+ if (so->so_filter_active > 0 &&
+ (mp = SOF_FILTER_DATA_OUT_FROM(so, fil, mp, msg,
+ cr, &error)) == NULL) {
+ *mpp = mp = nmp;
+ if (error != 0)
+ break;
+ continue;
+ }
error = (*so->so_downcalls->sd_send)
(so->so_proto_handle, mp, msg, cr);
if (error != 0) {
@@ -578,6 +601,30 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
*mpp = mp = nmp;
}
+ /* Let the filter know whether the protocol is flow controlled */
+ if (fil_inject && error == 0 && SO_SND_FLOWCTRLD(so))
+ error = ENOSPC;
+
+ return (error);
+}
+
+#pragma inline(so_sendmblk_impl)
+
+int
+so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
+ struct cred *cr, mblk_t **mpp)
+{
+ int error;
+
+ SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
+
+ if ((so->so_mode & SM_SENDFILESUPP) == 0) {
+ SO_UNBLOCK_FALLBACK(so);
+ return (EOPNOTSUPP);
+ }
+
+ error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top,
+ B_FALSE);
SO_UNBLOCK_FALLBACK(so);
@@ -607,8 +654,10 @@ so_shutdown(struct sonode *so, int how, struct cred *cr)
goto done;
}
- error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
- how, cr));
+ if (so->so_filter_active == 0 ||
+ (error = sof_filter_shutdown(so, &how, cr)) < 0)
+ error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
+ how, cr));
/*
* Protocol agreed to shutdown. We need to flush the
@@ -638,8 +687,10 @@ so_getsockname(struct sonode *so, struct sockaddr *addr,
SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
- error = (*so->so_downcalls->sd_getsockname)
- (so->so_proto_handle, addr, addrlen, cr);
+ if (so->so_filter_active == 0 ||
+ (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0)
+ error = (*so->so_downcalls->sd_getsockname)
+ (so->so_proto_handle, addr, addrlen, cr);
SO_UNBLOCK_FALLBACK(so);
return (error);
@@ -664,7 +715,8 @@ so_getpeername(struct sonode *so, struct sockaddr *addr,
if (xnet_check_print) {
printf("sockfs: X/Open getpeername check => EINVAL\n");
}
- } else {
+ } else if (so->so_filter_active == 0 ||
+ (error = sof_filter_getpeername(so, addr, addrlen, cr)) < 0) {
error = (*so->so_downcalls->sd_getpeername)
(so->so_proto_handle, addr, addrlen, cr);
}
@@ -679,13 +731,17 @@ so_getsockopt(struct sonode *so, int level, int option_name,
{
int error = 0;
- ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+ if (level == SOL_FILTER)
+ return (sof_getsockopt(so, option_name, optval, optlenp, cr));
+
SO_BLOCK_FALLBACK(so,
SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
- error = socket_getopt_common(so, level, option_name, optval, optlenp,
- flags);
- if (error < 0) {
+ if ((so->so_filter_active == 0 ||
+ (error = sof_filter_getsockopt(so, level, option_name, optval,
+ optlenp, cr)) < 0) &&
+ (error = socket_getopt_common(so, level, option_name, optval,
+ optlenp, flags)) < 0) {
error = (*so->so_downcalls->sd_getsockopt)
(so->so_proto_handle, level, option_name, optval, optlenp,
cr);
@@ -764,6 +820,9 @@ so_setsockopt(struct sonode *so, int level, int option_name,
struct timeval tl;
const void *opt = optval;
+ if (level == SOL_FILTER)
+ return (sof_setsockopt(so, option_name, optval, optlen, cr));
+
SO_BLOCK_FALLBACK(so,
SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
@@ -775,6 +834,11 @@ so_setsockopt(struct sonode *so, int level, int option_name,
return (EINVAL);
}
+ if (so->so_filter_active > 0 &&
+ (error = sof_filter_setsockopt(so, level, option_name,
+ (void *)optval, &optlen, cr)) >= 0)
+ goto done;
+
if (level == SOL_SOCKET) {
switch (option_name) {
case SO_RCVTIMEO:
@@ -856,7 +920,10 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
* calling strioc can result in the socket falling back to TPI,
* if that is supported.
*/
- if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
+ if ((so->so_filter_active == 0 ||
+ (error = sof_filter_ioctl(so, cmd, arg, mode,
+ rvalp, cr)) < 0) &&
+ (error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
(error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
cmd, arg, mode, rvalp, cr);
@@ -894,7 +961,7 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
* is flow controlled
*/
*reventsp |= POLLWRBAND & events;
- if (!so->so_snd_qfull) {
+ if (!SO_SND_FLOWCTRLD(so)) {
/*
* As long as there is buffer to send data
* turn on POLLOUT events
@@ -915,7 +982,7 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
*/
/* Pending connections */
- if (so->so_acceptq_len > 0)
+ if (!list_is_empty(&so->so_acceptq_list))
*reventsp |= (POLLIN|POLLRDNORM) & events;
/* Data */
@@ -941,7 +1008,8 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
/* Check for read events again, but this time under lock */
if (events & (POLLIN|POLLRDNORM)) {
mutex_enter(&so->so_lock);
- if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
+ if (SO_HAVE_DATA(so) ||
+ !list_is_empty(&so->so_acceptq_list)) {
mutex_exit(&so->so_lock);
*reventsp |= (POLLIN|POLLRDNORM) & events;
return (0);
@@ -987,12 +1055,13 @@ int
so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
{
struct sonode *so = (struct sonode *)sock_handle;
+ boolean_t connect_failed;
mutex_enter(&so->so_lock);
-
+ connect_failed = so->so_state & SS_ISCONNECTED;
so->so_proto_connid = id;
soisdisconnected(so, error);
- so_notify_disconnected(so, error);
+ so_notify_disconnected(so, connect_failed, error);
return (0);
}
@@ -1019,6 +1088,16 @@ so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
mutex_enter(&so->so_lock);
so->so_state |= SS_ACCEPTCONN;
so->so_backlog = (unsigned int)arg;
+ /*
+ * The protocol can stop generating newconn upcalls when
+ * the backlog is full, so to make sure the listener does
+ * not end up with a queue full of deferred connections
+ * we reduce the backlog by one. Thus the listener will
+ * start closing deferred connections before the backlog
+ * is full.
+ */
+ if (so->so_filter_active > 0)
+ so->so_backlog = MAX(1, so->so_backlog - 1);
mutex_exit(&so->so_lock);
break;
default:
@@ -1037,6 +1116,7 @@ so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
} else {
so_snd_qnotfull(so);
mutex_enter(&so->so_lock);
+ /* so_notify_writable drops so_lock */
so_notify_writable(so);
}
}
@@ -1053,8 +1133,10 @@ so_newconn(sock_upper_handle_t parenthandle,
ASSERT(proto_handle != NULL);
if ((so->so_state & SS_ACCEPTCONN) == 0 ||
- so->so_acceptq_len >= so->so_backlog)
- return (NULL);
+ (so->so_acceptq_len >= so->so_backlog &&
+ (so->so_filter_active == 0 || !sof_sonode_drop_deferred(so)))) {
+ return (NULL);
+ }
nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
&error);
@@ -1066,6 +1148,7 @@ so_newconn(sock_upper_handle_t parenthandle,
nso->so_peercred = peer_cred;
nso->so_cpid = peer_cpid;
}
+ nso->so_listener = so;
/*
* The new socket (nso), proto_handle and sock_upcallsp are all
@@ -1075,12 +1158,30 @@ so_newconn(sock_upper_handle_t parenthandle,
*/
*sock_upcallsp = &so_upcalls;
- (void) so_acceptq_enqueue(so, nso);
-
- mutex_enter(&so->so_lock);
- so_notify_newconn(so);
+ mutex_enter(&so->so_acceptq_lock);
+ if (so->so_state & (SS_CLOSING|SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) {
+ mutex_exit(&so->so_acceptq_lock);
+ ASSERT(nso->so_count == 1);
+ nso->so_count--;
+ /* drop proto ref */
+ VN_RELE(SOTOV(nso));
+ socket_destroy(nso);
+ return (NULL);
+ } else {
+ so->so_acceptq_len++;
+ if (nso->so_state & SS_FIL_DEFER) {
+ list_insert_tail(&so->so_acceptq_defer, nso);
+ mutex_exit(&so->so_acceptq_lock);
+ } else {
+ list_insert_tail(&so->so_acceptq_list, nso);
+ cv_signal(&so->so_acceptq_cv);
+ mutex_exit(&so->so_acceptq_lock);
+ mutex_enter(&so->so_lock);
+ so_notify_newconn(so);
+ }
- return ((sock_upper_handle_t)nso);
+ return ((sock_upper_handle_t)nso);
+ }
}
void
@@ -1132,6 +1233,27 @@ so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
mutex_exit(&so->so_lock);
+ if (so->so_filter_active > 0) {
+ sof_instance_t *inst;
+ ssize_t maxblk;
+ ushort_t wroff, tail;
+ maxblk = so->so_proto_props.sopp_maxblk;
+ wroff = so->so_proto_props.sopp_wroff;
+ tail = so->so_proto_props.sopp_tail;
+ for (inst = so->so_filter_bottom; inst != NULL;
+ inst = inst->sofi_prev) {
+ if (SOF_INTERESTED(inst, mblk_prop)) {
+ (*inst->sofi_ops->sofop_mblk_prop)(
+ (sof_handle_t)inst, inst->sofi_cookie,
+ &maxblk, &wroff, &tail);
+ }
+ }
+ mutex_enter(&so->so_lock);
+ so->so_proto_props.sopp_maxblk = maxblk;
+ so->so_proto_props.sopp_wroff = wroff;
+ so->so_proto_props.sopp_tail = tail;
+ mutex_exit(&so->so_lock);
+ }
#ifdef DEBUG
soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
@@ -1144,10 +1266,10 @@ so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
/* ARGSUSED */
ssize_t
-so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
- size_t msg_size, int flags, int *errorp, boolean_t *force_pushp)
+so_queue_msg_impl(struct sonode *so, mblk_t *mp,
+ size_t msg_size, int flags, int *errorp, boolean_t *force_pushp,
+ sof_instance_t *filter)
{
- struct sonode *so = (struct sonode *)sock_handle;
boolean_t force_push = B_TRUE;
int space_left;
sodirect_t *sodp = so->so_direct;
@@ -1165,31 +1287,14 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
return (0);
}
ASSERT(msg_size == 0);
- /*
- * recv space check
- */
mutex_enter(&so->so_lock);
- space_left = so->so_rcvbuf - so->so_rcv_queued;
- if (space_left <= 0) {
- so->so_flowctrld = B_TRUE;
- *errorp = ENOSPC;
- space_left = -1;
- }
- goto done_unlock;
+ goto space_check;
}
ASSERT(mp->b_next == NULL);
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
ASSERT(msg_size == msgdsize(mp));
- if (flags & MSG_OOB) {
- so_queue_oob(sock_handle, mp, msg_size);
- return (0);
- }
-
- if (force_pushp != NULL)
- force_push = *force_pushp;
-
if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
/* The read pointer is not aligned correctly for TPI */
zcmn_err(getzoneid(), CE_WARN,
@@ -1199,11 +1304,36 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
mutex_enter(&so->so_lock);
if (sodp != NULL)
SOD_UIOAFINI(sodp);
- mutex_exit(&so->so_lock);
+ goto space_check;
+ }
+
+ if (so->so_filter_active > 0) {
+ for (; filter != NULL; filter = filter->sofi_prev) {
+ if (!SOF_INTERESTED(filter, data_in))
+ continue;
+ mp = (*filter->sofi_ops->sofop_data_in)(
+ (sof_handle_t)filter, filter->sofi_cookie, mp,
+ flags, &msg_size);
+ ASSERT(msgdsize(mp) == msg_size);
+ DTRACE_PROBE2(filter__data, (sof_instance_t), filter,
+ (mblk_t *), mp);
+ /* Data was consumed/dropped, just do space check */
+ if (msg_size == 0) {
+ mutex_enter(&so->so_lock);
+ goto space_check;
+ }
+ }
+ }
- return (so->so_rcvbuf - so->so_rcv_queued);
+ if (flags & MSG_OOB) {
+ so_queue_oob(so, mp, msg_size);
+ mutex_enter(&so->so_lock);
+ goto space_check;
}
+ if (force_pushp != NULL)
+ force_push = *force_pushp;
+
mutex_enter(&so->so_lock);
if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
if (sodp != NULL)
@@ -1212,7 +1342,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
*errorp = EOPNOTSUPP;
return (-1);
}
- if (so->so_state & SS_CANTRCVMORE) {
+ if (so->so_state & (SS_CANTRCVMORE | SS_CLOSING)) {
freemsg(mp);
if (sodp != NULL)
SOD_DISABLE(sodp);
@@ -1270,6 +1400,27 @@ done_unlock:
mutex_exit(&so->so_lock);
done:
return (space_left);
+
+space_check:
+ space_left = so->so_rcvbuf - so->so_rcv_queued;
+ if (space_left <= 0) {
+ so->so_flowctrld = B_TRUE;
+ *errorp = ENOSPC;
+ space_left = -1;
+ }
+ goto done_unlock;
+}
+
+#pragma inline(so_queue_msg_impl)
+
+ssize_t
+so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
+ size_t msg_size, int flags, int *errorp, boolean_t *force_pushp)
+{
+ struct sonode *so = (struct sonode *)sock_handle;
+
+ return (so_queue_msg_impl(so, mp, msg_size, flags, errorp, force_pushp,
+ so->so_filter_bottom));
}
/*
@@ -1320,11 +1471,8 @@ so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
* Queue the OOB byte
*/
static void
-so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
+so_queue_oob(struct sonode *so, mblk_t *mp, size_t len)
{
- struct sonode *so;
-
- so = (struct sonode *)sock_handle;
mutex_enter(&so->so_lock);
if (so->so_direct != NULL)
SOD_UIOAFINI(so->so_direct);
@@ -1345,21 +1493,62 @@ so_close(struct sonode *so, int flag, struct cred *cr)
{
int error;
- error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
-
/*
- * At this point there will be no more upcalls from the protocol
+ * No new data will be enqueued once the CLOSING flag is set.
*/
mutex_enter(&so->so_lock);
-
+ so->so_state |= SS_CLOSING;
ASSERT(so_verify_oobstate(so));
-
so_rcv_flush(so);
mutex_exit(&so->so_lock);
+ if (so->so_state & SS_ACCEPTCONN) {
+ /*
+ * We grab and release the accept lock to ensure that any
+ * thread about to insert a socket in so_newconn completes
+ * before we flush the queue. Any thread calling so_newconn
+ * after we drop the lock will observe the SS_CLOSING flag,
+ * which will stop it from inserting the socket in the queue.
+ */
+ mutex_enter(&so->so_acceptq_lock);
+ mutex_exit(&so->so_acceptq_lock);
+
+ so_acceptq_flush(so, B_TRUE);
+ }
+
+ if (so->so_filter_active > 0)
+ sof_sonode_closing(so);
+
+ error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
+ switch (error) {
+ default:
+ /* Protocol made a synchronous close; remove proto ref */
+ VN_RELE(SOTOV(so));
+ break;
+ case EINPROGRESS:
+ /*
+ * Protocol is in the process of closing, it will make a
+ * 'closed' upcall to remove the reference.
+ */
+ error = 0;
+ break;
+ }
+
return (error);
}
+/*
+ * Upcall made by the protocol when it's doing an asynchronous close. It
+ * will drop the protocol's reference on the socket.
+ */
+void
+so_closed(sock_upper_handle_t sock_handle)
+{
+ struct sonode *so = (struct sonode *)sock_handle;
+
+ VN_RELE(SOTOV(so));
+}
+
void
so_zcopy_notify(sock_upper_handle_t sock_handle)
{
@@ -1759,5 +1948,6 @@ sock_upcalls_t so_upcalls = {
so_txq_full,
so_signal_oob,
so_zcopy_notify,
- so_set_error
+ so_set_error,
+ so_closed
};
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index 2e3442e879..a44d389855 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -39,6 +38,7 @@
#include <sys/tihdr.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
@@ -59,46 +59,6 @@ boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif
-int
-so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
-{
- ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
- ASSERT(nso->so_acceptq_next == NULL);
-
- *so->so_acceptq_tail = nso;
- so->so_acceptq_tail = &nso->so_acceptq_next;
- so->so_acceptq_len++;
-
- /* Wakeup a single consumer */
- cv_signal(&so->so_acceptq_cv);
-
- return (so->so_acceptq_len);
-}
-
-/*
- * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
- *
- * Enqueue an incoming connection on a listening socket.
- *
- * Arguments:
- * so - listening socket
- * nso - new connection
- *
- * Returns:
- * Number of queued connections, including the new connection
- */
-int
-so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
-{
- int conns;
-
- mutex_enter(&so->so_acceptq_lock);
- conns = so_acceptq_enqueue_locked(so, nso);
- mutex_exit(&so->so_acceptq_lock);
-
- return (conns);
-}
-
static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
struct sonode **nsop)
@@ -107,7 +67,7 @@ so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
*nsop = NULL;
ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
- while ((nso = so->so_acceptq_head) == NULL) {
+ while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
/*
* No need to check so_error here, because it is not
* possible for a listening socket to be reset or otherwise
@@ -126,15 +86,9 @@ so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
}
ASSERT(nso != NULL);
- so->so_acceptq_head = nso->so_acceptq_next;
- nso->so_acceptq_next = NULL;
-
- if (so->so_acceptq_head == NULL) {
- ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
- so->so_acceptq_tail = &so->so_acceptq_head;
- }
ASSERT(so->so_acceptq_len > 0);
- --so->so_acceptq_len;
+ so->so_acceptq_len--;
+ nso->so_listener = NULL;
*nsop = nso;
@@ -174,8 +128,36 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
return (error);
}
+static void
+so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
+{
+ struct sonode *nso;
+
+ while ((nso = list_remove_head(list)) != NULL) {
+ nso->so_listener = NULL;
+ if (doclose) {
+ (void) socket_close(nso, 0, CRED());
+ } else {
+ /*
+ * Only used for fallback - not possible when filters
+ * are present.
+ */
+ ASSERT(so->so_filter_active == 0);
+ /*
+ * Since the socket is on the accept queue, there can
+ * only be one reference. We drop the reference and
+ * just blow off the socket.
+ */
+ ASSERT(nso->so_count == 1);
+ nso->so_count--;
+ /* drop the proto ref */
+ VN_RELE(SOTOV(nso));
+ }
+ socket_destroy(nso);
+ }
+}
/*
- * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
+ * void so_acceptq_flush(struct sonode *so)
*
* Removes all pending connections from a listening socket, and
* frees the associated resources.
@@ -183,7 +165,6 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
* Arguments
* so - listening socket
* doclose - make a close downcall for each socket on the accept queue
- * (Note, only SCTP and SDP sockets rely on this)
*
* Return values:
* None.
@@ -197,28 +178,9 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
void
so_acceptq_flush(struct sonode *so, boolean_t doclose)
{
- struct sonode *nso;
+ so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
+ so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);
- while ((nso = so->so_acceptq_head) != NULL) {
- so->so_acceptq_head = nso->so_acceptq_next;
- nso->so_acceptq_next = NULL;
-
- if (doclose) {
- (void) socket_close(nso, 0, CRED());
- } else {
- /*
- * Since the socket is on the accept queue, there can
- * only be one reference. We drop the reference and
- * just blow off the socket.
- */
- ASSERT(nso->so_count == 1);
- nso->so_count--;
- }
- socket_destroy(nso);
- }
-
- so->so_acceptq_head = NULL;
- so->so_acceptq_tail = &so->so_acceptq_head;
so->so_acceptq_len = 0;
}
@@ -296,7 +258,7 @@ so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
int error;
ASSERT(MUTEX_HELD(&so->so_lock));
- while (so->so_snd_qfull) {
+ while (SO_SND_FLOWCTRLD(so)) {
if (so->so_state & SS_CANTSENDMORE)
return (EPIPE);
if (dontblock)
@@ -334,11 +296,9 @@ so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
int error = 0;
mutex_enter(&so->so_lock);
- if (so->so_snd_qfull) {
- so->so_snd_wakeup = B_TRUE;
- error = so_snd_wait_qnotfull_locked(so, dontblock);
- so->so_snd_wakeup = B_FALSE;
- }
+ so->so_snd_wakeup = B_TRUE;
+ error = so_snd_wait_qnotfull_locked(so, dontblock);
+ so->so_snd_wakeup = B_FALSE;
mutex_exit(&so->so_lock);
return (error);
@@ -601,8 +561,13 @@ so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
+ if (so->so_filter_active > 0 &&
+ (mp_head = sof_filter_data_in_proc(so, mp_head,
+ &mp_last_head)) == NULL)
+ return;
+
ASSERT(mp_head->b_prev != NULL);
- if (so->so_rcv_q_head == NULL) {
+ if (so->so_rcv_q_head == NULL) {
so->so_rcv_q_head = mp_head;
so->so_rcv_q_last_head = mp_last_head;
ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
@@ -650,13 +615,13 @@ so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
* Check flow control on a given sonode. Must have so_lock held, and
* this function will release the hold.
*/
-
-static void
+void
so_check_flow_control(struct sonode *so)
{
ASSERT(MUTEX_HELD(&so->so_lock));
- if (so->so_flowctrld && so->so_rcv_queued < so->so_rcvlowat) {
+ if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
+ !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
so->so_flowctrld = B_FALSE;
mutex_exit(&so->so_lock);
/*
@@ -668,6 +633,8 @@ so_check_flow_control(struct sonode *so)
(*so->so_downcalls->sd_clr_flowctrl)
(so->so_proto_handle);
}
+ /* filters can start injecting data */
+ sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
} else {
mutex_exit(&so->so_lock);
}
@@ -1116,7 +1083,7 @@ so_rcv_flush(struct sonode *so)
}
/*
- * Free messages sitting in the send and recv queue
+ * Free messages sitting in the recv queues
*/
while (so->so_rcv_q_head != NULL) {
mp = so->so_rcv_q_head;
@@ -1313,11 +1280,29 @@ socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
mutex_exit(&pso->so_lock);
+
+ /*
+ * If the parent has any filters, try to inherit them.
+ */
+ if (pso->so_filter_active > 0 &&
+ (error = sof_sonode_inherit_filters(so, pso)) != 0)
+ return (error);
+
} else {
struct sockparams *sp = so->so_sockparams;
sock_upcalls_t *upcalls_to_use;
/*
+ * Attach automatic filters, if there are any.
+ */
+ if (!list_is_empty(&sp->sp_auto_filters) &&
+ (error = sof_sonode_autoattach_filters(so, cr)) != 0)
+ return (error);
+
+ /* OK to attach filters */
+ so->so_state |= SS_FILOP_OK;
+
+ /*
* Based on the version number select the right upcalls to
* pass down. Currently we only have one version so choose
* default
@@ -1384,6 +1369,9 @@ socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
if (uioasync.enabled)
sod_sock_init(so);
+ /* put an extra reference on the socket for the protocol */
+ VN_HOLD(SOTOV(so));
+
return (0);
}
@@ -1812,6 +1800,22 @@ socket_getopt_common(struct sonode *so, int level, int option_name,
*optlenp = sizeof (struct so_snd_bufinfo);
return (0);
}
+ case SO_SND_COPYAVOID: {
+ sof_instance_t *inst;
+
+ /*
+ * Avoid zero-copy if there is a filter with a data_out
+ * callback. We could let the operation succeed, but then
+ * the filter would have to copy the data anyway.
+ */
+ for (inst = so->so_filter_top; inst != NULL;
+ inst = inst->sofi_next) {
+ if (SOF_INTERESTED(inst, data_out))
+ return (EOPNOTSUPP);
+ }
+ break;
+ }
+
default:
break;
}
@@ -1982,15 +1986,19 @@ so_end_fallback(struct sonode *so)
* We do not need to hold so_lock, since there can be only one thread
* operating on the sonode.
*/
-static void
-so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
- struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
+static mblk_t *
+so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
+ struct T_capability_ack *tcap,
+ struct sockaddr *laddr, socklen_t laddrlen,
struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
struct sonode *so = (struct sonode *)sock_handle;
boolean_t atmark;
+ mblk_t *retmp = NULL, **tailmpp = &retmp;
- sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
+ if (tcap != NULL)
+ sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
+ opts);
/*
 * Some protocols do not quiesce the data path during fallback. Once
@@ -2038,9 +2046,9 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
*/
if (atmark) {
struct T_exdata_ind *tei;
- mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;
+ mblk_t *mp1 = arg->soqa_exdata_mp;
- SOTOTPI(so)->sti_exdata_mp = NULL;
+ arg->soqa_exdata_mp = NULL;
ASSERT(mp1 != NULL);
mp1->b_datap->db_type = M_PROTO;
tei = (struct T_exdata_ind *)mp1->b_rptr;
@@ -2101,7 +2109,8 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
* Queue data on the STREAM head.
*/
so->so_rcv_queued -= mlen;
- putnext(q, mp);
+ *tailmpp = mp;
+ tailmpp = &mp->b_next;
}
so->so_rcv_head = NULL;
so->so_rcv_last_head = NULL;
@@ -2121,8 +2130,8 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
if (atmark && so->so_oobmsg != NULL) {
struct T_exdata_ind *tei;
- mp = SOTOTPI(so)->sti_exdata_mp;
- SOTOTPI(so)->sti_exdata_mp = NULL;
+ mp = arg->soqa_exdata_mp;
+ arg->soqa_exdata_mp = NULL;
ASSERT(mp != NULL);
mp->b_datap->db_type = M_PROTO;
tei = (struct T_exdata_ind *)mp->b_rptr;
@@ -2133,38 +2142,32 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
mp->b_cont = so->so_oobmsg;
so->so_oobmsg = NULL;
- putnext(q, mp);
+ *tailmpp = mp;
+ tailmpp = &mp->b_next;
} else {
/* Send up the signal */
- mp = SOTOTPI(so)->sti_exdata_mp;
- SOTOTPI(so)->sti_exdata_mp = NULL;
+ mp = arg->soqa_exdata_mp;
+ arg->soqa_exdata_mp = NULL;
ASSERT(mp != NULL);
DB_TYPE(mp) = M_PCSIG;
*mp->b_wptr++ = (uchar_t)SIGURG;
- putnext(q, mp);
+ *tailmpp = mp;
+ tailmpp = &mp->b_next;
/* Send up the mark indicator */
- mp = SOTOTPI(so)->sti_urgmark_mp;
- SOTOTPI(so)->sti_urgmark_mp = NULL;
+ mp = arg->soqa_urgmark_mp;
+ arg->soqa_urgmark_mp = NULL;
mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
- putnext(q, mp);
+ *tailmpp = mp;
+ tailmpp = &mp->b_next;
so->so_oobmark = 0;
}
}
-
- if (SOTOTPI(so)->sti_exdata_mp != NULL) {
- freeb(SOTOTPI(so)->sti_exdata_mp);
- SOTOTPI(so)->sti_exdata_mp = NULL;
- }
-
- if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
- freeb(SOTOTPI(so)->sti_urgmark_mp);
- SOTOTPI(so)->sti_urgmark_mp = NULL;
- }
-
ASSERT(so->so_oobmark == 0);
ASSERT(so->so_rcv_queued == 0);
+
+ return (retmp);
}
#ifdef DEBUG
@@ -2203,7 +2206,8 @@ so_integrity_check(struct sonode *cur, struct sonode *orig)
VERIFY(cur->so_version == orig->so_version);
/* New conns might have arrived, but none should have been lost */
VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
- VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
+ VERIFY(list_head(&cur->so_acceptq_list) ==
+ list_head(&orig->so_acceptq_list));
VERIFY(cur->so_backlog == orig->so_backlog);
 /* New OOB might have arrived, but the mark should not have been lost */
VERIFY(cur->so_oobmark >= orig->so_oobmark);
@@ -2243,8 +2247,10 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
struct sockparams *sp;
struct sockparams *newsp = NULL;
so_proto_fallback_func_t fbfunc;
+ const char *devpath;
boolean_t direct;
struct sonode *nso;
+ sock_quiesce_arg_t arg = { NULL, NULL };
#ifdef DEBUG
struct sonode origso;
#endif
@@ -2253,10 +2259,27 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
/*
- * Fallback can only happen if there is a device associated
- * with the sonode, and the socket module has a fallback function.
+ * Cannot fallback if the socket has active filters
+ */
+ if (so->so_filter_active > 0)
+ return (EINVAL);
+
+ switch (so->so_family) {
+ case AF_INET:
+ devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
+ break;
+ case AF_INET6:
+ devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Fallback can only happen if the socket module has a TPI device
+ * and fallback function.
*/
- if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
+ if (devpath == NULL || fbfunc == NULL)
return (EINVAL);
/*
@@ -2276,8 +2299,7 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
sp->sp_stats.sps_nfallback.value.ui64++;
newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
- so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
- KM_SLEEP, &error);
+ so->so_protocol, devpath, KM_SLEEP, &error);
if (error != 0)
goto out;
@@ -2295,14 +2317,30 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
if (error != 0)
goto out;
-
+ /*
+ * When it comes to urgent data we have two cases to deal with:
+ * (1) the oob byte has already arrived, or (2) the protocol has
+ * notified us that oob data is pending, but it has not yet arrived.
+ *
+ * For (1) all we need to do is send a T_EXDATA_IND to indicate where
+ * in the byte stream the oob byte is. For (2) we have to send a
+ * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
+ * the oob byte will be the next byte from the protocol.
+ *
+ * So in the worst case we need two mblks, one for the signal, another
+ * for mark indication. In that case we use the exdata_mp for the sig.
+ */
+ arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
+ BPRI_MED, STR_NOSIG, NULL);
+ arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
/*
 * Now tell the protocol to start using TPI. so_quiesced_cb will be
 * called once it's safe to synchronize state.
*/
DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
- error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+ error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
+ &arg);
DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
if (error != 0) {
@@ -2315,19 +2353,40 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
* Walk the accept queue and notify the proto that they should
* fall back to TPI. The protocol will send up the T_CONN_IND.
*/
- nso = so->so_acceptq_head;
+ nso = list_head(&so->so_acceptq_list);
while (nso != NULL) {
int rval;
+ struct sonode *next;
+
+ if (arg.soqa_exdata_mp == NULL) {
+ arg.soqa_exdata_mp =
+ allocb_wait(sizeof (struct T_exdata_ind),
+ BPRI_MED, STR_NOSIG, NULL);
+ }
+ if (arg.soqa_urgmark_mp == NULL) {
+ arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
+ STR_NOSIG, NULL);
+ }
DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
- rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
+ rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
+ so_quiesced_cb, &arg);
DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
if (rval != 0) {
+ /* Abort the connection */
zcmn_err(getzoneid(), CE_WARN,
"Failed to convert socket in accept queue to TPI. "
"Pid = %d\n", curproc->p_pid);
+ next = list_next(&so->so_acceptq_list, nso);
+ list_remove(&so->so_acceptq_list, nso);
+ so->so_acceptq_len--;
+
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
+ nso = next;
+ } else {
+ nso = list_next(&so->so_acceptq_list, nso);
}
- nso = nso->so_acceptq_next;
}
/*
@@ -2352,6 +2411,14 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
* the STREAMS head).
*/
pollwakeup(&so->so_poll_list, POLLERR);
+
+ /*
+ * When this non-STREAM socket was created we placed an extra ref on
+ * the associated vnode to support asynchronous close. Drop that ref
+ * here.
+ */
+ ASSERT(SOTOV(so)->v_count >= 2);
+ VN_RELE(SOTOV(so));
out:
so_end_fallback(so);
@@ -2365,6 +2432,10 @@ out:
if (newsp != NULL)
SOCKPARAMS_DEC_REF(newsp);
}
+ if (arg.soqa_exdata_mp != NULL)
+ freemsg(arg.soqa_exdata_mp);
+ if (arg.soqa_urgmark_mp != NULL)
+ freemsg(arg.soqa_urgmark_mp);
return (error);
}
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c
new file mode 100644
index 0000000000..f4d4f9e922
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c
@@ -0,0 +1,1770 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/note.h>
+#include <sys/rwlock.h>
+#include <sys/stropts.h>
+#include <sys/taskq.h>
+#include <sys/socketvar.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
+
+/*
+ * Socket Filter Framework
+ *
+ * Socket filter entry (sof_entry_t):
+ *
+ * There exists one entry for each configured filter (done via soconfig(1M)),
+ * and they are all in sof_entry_list. In addition to the global list, each
+ * sockparams entry maintains a list of filters that are interested in that
+ * particular socket type, so a filter entry may be referenced by multiple
+ * sockparams. The set of sockparams referencing a filter may change as
+ * socket types are added and/or removed from the system. Both sof_entry_list
+ * and the per-sockparams lists are protected by sockconf_lock.
+ *
+ * Each filter entry has a ref count which is incremented whenever a filter
+ * is attached to a socket. An entry is marked SOFEF_CONDEMED when it is
+ * unconfigured, which will result in the entry being freed when its ref
+ * count reaches zero.
+ *
+ * Socket filter module (sof_module_t):
+ *
+ * Modules are created by sof_register() and placed in sof_module_list,
+ * which is protected by sof_module_lock. Each module has a reference count
+ * that is incremented when a filter entry is using the module. A module
+ * can be destroyed by sof_unregister() only when its ref count is zero.
+ *
+ * Socket filter instance (sof_instance_t):
+ *
+ * Whenever a filter is attached to a socket (sonode), a new instance is
+ * created. The socket is guaranteed to be single threaded when filters are
+ * being attached/detached. The instance uses the sonode's so_lock for
+ * protection.
+ *
+ * The lifetime of an instance is the same as the socket it's attached to.
+ *
+ * How things link together:
+ *
+ * sockparams.sp_{auto,prog}_filters -> sp_filter_t -> sp_filter_t
+ * ^ | |
+ * | | |
+ * sonode.so_filter_top -> sof_instance_t | |
+ * | | |
+ * v v v
+ * sof_entry_list -> sof_entry_t -> sof_entry -> ... -> sof_entry_t
+ * |
+ * v
+ * sof_module_list -> sof_module_t -> ... -> sof_module_t
+ */
+
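As an illustration of the lifecycle described above, a filter module built on this framework might look roughly like the sketch below. It is only a sketch: the module and filter name "examplefilt" is hypothetical, the sof_ops_t member names (sofop_attach_active, sofop_detach) and the callback parameter types are inferred from how sockfs invokes the ops vector later in this file, and error values greater than SOF_RVAL_CONTINUE are assumed to be errnos as sof_rval2errno() implies.

        #include <sys/modctl.h>
        #include <sys/errno.h>
        #include <sys/kmem.h>
        #include <sys/cred.h>
        #include <sys/sockfilter.h>

        /* Hypothetical per-socket state, tracked via the filter cookie. */
        typedef struct examplef_state {
                size_t  exs_bytes;
        } examplef_state_t;

        static sof_rval_t
        examplef_attach_active(sof_handle_t handle, int family, int type,
            int proto, cred_t *cr, void **cookiep)
        {
                examplef_state_t *st;

                if ((st = kmem_zalloc(sizeof (*st), KM_NOSLEEP)) == NULL)
                        return ((sof_rval_t)ENOMEM); /* > CONTINUE == errno */
                *cookiep = st;
                return (SOF_RVAL_CONTINUE);
        }

        static void
        examplef_detach(sof_handle_t handle, void *cookie, cred_t *cr)
        {
                if (cookie != NULL)
                        kmem_free(cookie, sizeof (examplef_state_t));
        }

        static sof_ops_t examplef_ops = {
                .sofop_attach_active = examplef_attach_active,
                .sofop_detach = examplef_detach
        };

        static struct modlmisc examplef_modlmisc = {
                &mod_miscops, "example socket filter"
        };

        static struct modlinkage examplef_modlinkage = {
                MODREV_1, &examplef_modlmisc, NULL
        };

        int
        _init(void)
        {
                int error;

                /* Make the ops vector known to sockfs under "examplefilt". */
                if ((error = sof_register(SOF_VERSION, "examplefilt",
                    &examplef_ops, 0)) != 0)
                        return (error);
                if ((error = mod_install(&examplef_modlinkage)) != 0)
                        (void) sof_unregister("examplefilt");
                return (error);
        }

        int
        _fini(void)
        {
                int error;

                /* Fails with EBUSY while a filter entry still holds the module. */
                if ((error = sof_unregister("examplefilt")) != 0)
                        return (error);
                return (mod_remove(&examplef_modlinkage));
        }

        int
        _info(struct modinfo *modinfop)
        {
                return (mod_info(&examplef_modlinkage, modinfop));
        }

Registering only makes the ops vector available; the filter itself still has to be configured (via soconfig(1M), which ends up in sof_entry_add() below) before sockets can attach it.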
+static list_t sof_entry_list; /* list of configured filters */
+
+static list_t sof_module_list; /* list of loaded filter modules */
+static kmutex_t sof_module_lock; /* protect the module list */
+
+static sof_kstat_t sof_stat;
+static kstat_t *sof_stat_ksp;
+
+#ifdef DEBUG
+static int socket_filter_debug = 0;
+#endif
+
+/*
+ * A connection that has been deferred for more than `sof_defer_drop_time'
+ * ticks can be dropped to make room for new connections. A connection that
+ * is to be dropped is moved over to `sof_close_deferred_list' where it will
+ * be closed by sof_close_deferred() (which runs on a taskq). Connections
+ * will not be moved over to the close list if that list has grown larger
+ * than `sof_close_deferred_max_backlog'.
+ */
+clock_t sof_defer_drop_time = 3000;
+uint_t sof_close_deferred_max_backlog = 1000;
+
+taskq_t *sof_close_deferred_taskq;
+boolean_t sof_close_deferred_running;
+uint_t sof_close_deferred_backlog;
+list_t sof_close_deferred_list;
+kmutex_t sof_close_deferred_lock;
+
+static void sof_close_deferred(void *);
+
+static void sof_module_rele(sof_module_t *);
+static sof_module_t *sof_module_hold_by_name(const char *, const char *);
+
+static int sof_entry_load_module(sof_entry_t *);
+static void sof_entry_hold(sof_entry_t *);
+static void sof_entry_rele(sof_entry_t *);
+static int sof_entry_kstat_create(sof_entry_t *);
+static void sof_entry_kstat_destroy(sof_entry_t *);
+
+static sof_instance_t *sof_instance_create(sof_entry_t *, struct sonode *);
+static void sof_instance_destroy(sof_instance_t *);
+
+static int
+sof_kstat_update(kstat_t *ksp, int rw)
+{
+ _NOTE(ARGUNUSED(ksp));
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ sof_stat.sofks_defer_close_backlog.value.ui64 =
+ sof_close_deferred_backlog;
+
+ return (0);
+}
+
+void
+sof_init(void)
+{
+ list_create(&sof_entry_list, sizeof (sof_entry_t),
+ offsetof(sof_entry_t, sofe_node));
+ list_create(&sof_module_list, sizeof (sof_module_t),
+ offsetof(sof_module_t, sofm_node));
+ list_create(&sof_close_deferred_list, sizeof (struct sonode),
+ offsetof(struct sonode, so_acceptq_node));
+
+ sof_close_deferred_taskq = taskq_create("sof_close_deferred_taskq",
+ 1, minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE);
+ sof_close_deferred_running = B_FALSE;
+ sof_close_deferred_backlog = 0;
+
+ mutex_init(&sof_close_deferred_lock, NULL, MUTEX_DEFAULT, 0);
+ mutex_init(&sof_module_lock, NULL, MUTEX_DEFAULT, 0);
+
+ sof_stat_ksp = kstat_create("sockfs", 0, "sockfilter", "misc",
+ KSTAT_TYPE_NAMED, sizeof (sof_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (sof_stat_ksp == NULL)
+ return;
+
+ kstat_named_init(&sof_stat.sofks_defer_closed, "defer_closed",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&sof_stat.sofks_defer_close_backlog,
+ "defer_close_backlog", KSTAT_DATA_UINT64);
+ kstat_named_init(&sof_stat.sofks_defer_close_failed_backlog_too_big,
+ "defer_close_failed_backlog_too_big", KSTAT_DATA_UINT64);
+
+ sof_stat_ksp->ks_data = &sof_stat;
+ sof_stat_ksp->ks_update = sof_kstat_update;
+ kstat_install(sof_stat_ksp);
+}
+
+/*
+ * Process filter options.
+ */
+static int
+sof_setsockopt_impl(struct sonode *so, int option_name,
+ const void *optval, socklen_t optlen, struct cred *cr)
+{
+ struct sockparams *sp = so->so_sockparams;
+ sof_entry_t *ent = NULL;
+ sp_filter_t *fil;
+ sof_instance_t *inst;
+ sof_rval_t rval;
+ int error;
+
+ _NOTE(ARGUNUSED(optlen));
+
+ /*
+ * Is the socket in a state where filters can be attached?
+ */
+ if (!(so->so_state & SS_FILOP_OK))
+ return (EINVAL);
+
+ if (option_name == FIL_ATTACH) {
+ /*
+ * Make sure there isn't already another instance of the
+ * same filter attached to the socket.
+ */
+ for (inst = so->so_filter_top; inst != NULL;
+ inst = inst->sofi_next) {
+ if (strncmp(inst->sofi_filter->sofe_name,
+ (const char *)optval, SOF_MAXNAMELEN) == 0)
+ return (EEXIST);
+ }
+ /* Look up the filter. */
+ rw_enter(&sockconf_lock, RW_READER);
+ for (fil = list_head(&sp->sp_prog_filters); fil != NULL;
+ fil = list_next(&sp->sp_prog_filters, fil)) {
+ ent = fil->spf_filter;
+ ASSERT(ent->sofe_flags & SOFEF_PROG);
+
+ if (strncmp(ent->sofe_name, (const char *)optval,
+ SOF_MAXNAMELEN) == 0)
+ break;
+ }
+ /* No such filter */
+ if (fil == NULL) {
+ rw_exit(&sockconf_lock);
+ return (ENOENT);
+ }
+ inst = sof_instance_create(ent, so);
+ rw_exit(&sockconf_lock);
+
+ /* Failed to create an instance; must be out of memory */
+ if (inst == NULL)
+ return (ENOMEM);
+
+ /*
+ * This might be the first time the filter is being used,
+ * so try to load the module if it's not already registered.
+ */
+ if (ent->sofe_mod == NULL &&
+ (error = sof_entry_load_module(ent)) != 0) {
+ sof_instance_destroy(inst);
+ return (error);
+ }
+
+ /* Module loaded OK, so there must be an ops vector */
+ ASSERT(ent->sofe_mod != NULL);
+ inst->sofi_ops = &ent->sofe_mod->sofm_ops;
+
+ SOF_STAT_ADD(inst, tot_active_attach, 1);
+ if (inst->sofi_ops->sofop_attach_active != NULL) {
+ rval = inst->sofi_ops->sofop_attach_active(
+ (sof_handle_t)inst, so->so_family, so->so_type,
+ so->so_protocol, cr, &inst->sofi_cookie);
+ if (rval != SOF_RVAL_CONTINUE) {
+ sof_instance_destroy(inst);
+ switch (rval) {
+ case SOF_RVAL_DETACH:
+ /*
+ * Filter does not want to attach.
+ * An error is returned so the user
+ * knows the request did not go
+ * through.
+ */
+ error = EINVAL;
+ break;
+ default:
+ SOF_STAT_ADD(inst, attach_failures, 1);
+ /* Not a valid rval for active attach */
+ ASSERT(rval != SOF_RVAL_DEFER);
+ error = sof_rval2errno(rval);
+ break;
+ }
+ return (error);
+ }
+ }
+ return (0);
+ } else if (option_name == FIL_DETACH) {
+ for (inst = so->so_filter_top; inst != NULL;
+ inst = inst->sofi_next) {
+
+ ent = inst->sofi_filter;
+ if (strncmp(ent->sofe_name, (const char *)optval,
+ SOF_MAXNAMELEN) == 0)
+ break;
+ }
+ if (inst == NULL)
+ return (ENXIO);
+
+ /* automatic filters cannot be detached */
+ if (inst->sofi_filter->sofe_flags & SOFEF_AUTO)
+ return (EINVAL);
+
+ if (inst->sofi_ops->sofop_detach != NULL)
+ inst->sofi_ops->sofop_detach((sof_handle_t)inst,
+ inst->sofi_cookie, cr);
+ sof_instance_destroy(inst);
+
+ return (0);
+ } else {
+ return (EINVAL);
+ }
+}
+
+int
+sof_setsockopt(struct sonode *so, int option_name,
+ const void *optval, socklen_t optlen, struct cred *cr)
+{
+ int error;
+
+ /*
+ * By grabbing the lock as a writer we ensure that no other socket
+ * operations can start while the filter stack is being manipulated.
+ *
+ * We do a tryenter so that if there is an active thread we ask the
+ * caller to try again instead of blocking here until the other thread
+ * is done (which could take arbitrarily long in the case of recv).
+ */
+ if (!rw_tryenter(&so->so_fallback_rwlock, RW_WRITER)) {
+ return (EAGAIN);
+ }
+
+ /* Bail out if a fallback has taken place */
+ if (so->so_state & SS_FALLBACK_COMP)
+ error = EINVAL;
+ else
+ error = sof_setsockopt_impl(so, option_name, optval,
+ optlen, cr);
+ rw_exit(&so->so_fallback_rwlock);
+
+ return (error);
+}
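From user space, the FIL_ATTACH/FIL_DETACH path above is driven by setsockopt(3SOCKET). A minimal sketch, assuming the SOL_FILTER option level and FIL_* constants are the ones this change exposes through <sys/socket.h>, and using a placeholder filter name:

        #include <sys/types.h>
        #include <sys/socket.h>
        #include <string.h>
        #include <stdio.h>

        /* Attach the programmatic filter `name' (placeholder) to socket `fd'. */
        static int
        attach_filter(int fd, const char *name)
        {
                if (setsockopt(fd, SOL_FILTER, FIL_ATTACH, name,
                    strlen(name) + 1) != 0) {
                        perror("FIL_ATTACH");
                        return (-1);
                }
                return (0);
        }

As sof_setsockopt_impl() shows, the call fails with EEXIST if the filter is already attached, ENOENT if no such programmatic filter is configured for the socket type, and EAGAIN if another thread is currently active on the socket.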
+
+/*
+ * Get filter socket options.
+ */
+static int
+sof_getsockopt_impl(struct sonode *so, int option_name,
+ void *optval, socklen_t *optlenp, struct cred *cr)
+{
+ sof_instance_t *inst;
+ struct fil_info *fi;
+ socklen_t maxsz = *optlenp;
+ int i;
+ uint_t cnt;
+
+ _NOTE(ARGUNUSED(cr));
+
+ if (option_name == FIL_LIST) {
+ fi = (struct fil_info *)optval;
+
+ if (maxsz < sizeof (*fi))
+ return (EINVAL);
+
+ for (inst = so->so_filter_top, cnt = 0; inst != NULL;
+ inst = inst->sofi_next)
+ cnt++;
+ for (inst = so->so_filter_top, i = 0;
+ inst != NULL && (i+1) * sizeof (*fi) <= maxsz;
+ inst = inst->sofi_next, i++) {
+ fi[i].fi_flags =
+ (inst->sofi_filter->sofe_flags & SOFEF_AUTO) ?
+ FILF_AUTO : FILF_PROG;
+ if (inst->sofi_flags & SOFIF_BYPASS)
+ fi[i].fi_flags |= FILF_BYPASS;
+ (void) strncpy(fi[i].fi_name,
+ inst->sofi_filter->sofe_name, FILNAME_MAX);
+ ASSERT(cnt > 0);
+ fi[i].fi_pos = --cnt;
+ }
+ *optlenp = i * sizeof (*fi);
+ return (0);
+ } else {
+ return (EINVAL);
+ }
+}
+
+int
+sof_getsockopt(struct sonode *so, int option_name,
+ void *optval, socklen_t *optlenp, struct cred *cr)
+{
+ int error;
+
+ /*
+ * The fallback lock is used here to serialize set and get
+ * filter operations.
+ */
+ rw_enter(&so->so_fallback_rwlock, RW_READER);
+ if (so->so_state & SS_FALLBACK_COMP)
+ error = EINVAL;
+ else
+ error = sof_getsockopt_impl(so, option_name, optval, optlenp,
+ cr);
+ rw_exit(&so->so_fallback_rwlock);
+
+ return (error);
+}
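The FIL_LIST branch above fills in an array of fil_info records, with fi_pos counting up from the bottom of the filter stack. A corresponding user-space query might look like the sketch below (again assuming SOL_FILTER and the fil_info definition are exported via <sys/socket.h>):

        #include <sys/types.h>
        #include <sys/socket.h>
        #include <stdio.h>

        static void
        print_filters(int fd)
        {
                struct fil_info fi[8];
                socklen_t len = sizeof (fi);
                unsigned int i;

                if (getsockopt(fd, SOL_FILTER, FIL_LIST, fi, &len) != 0) {
                        perror("FIL_LIST");
                        return;
                }
                /* sockfs sets *optlenp to the size of the records returned. */
                for (i = 0; i < len / sizeof (fi[0]); i++) {
                        (void) printf("%u: %s%s%s\n",
                            (unsigned int)fi[i].fi_pos, fi[i].fi_name,
                            (fi[i].fi_flags & FILF_AUTO) ? " (automatic)" : "",
                            (fi[i].fi_flags & FILF_BYPASS) ? " (bypassed)" : "");
                }
        }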
+
+/*
+ * The socket `so' wants to inherit the filter stack from `pso'.
+ * Returns 0 if all went well or an errno otherwise.
+ */
+int
+sof_sonode_inherit_filters(struct sonode *so, struct sonode *pso)
+{
+ sof_instance_t *inst, *pinst;
+ sof_rval_t rval;
+ int error;
+ struct sockaddr_in6 laddrbuf, faddrbuf;
+ struct sockaddr_in6 *laddr, *faddr;
+ socklen_t laddrlen, faddrlen;
+
+ /*
+ * Make sure there is enough room to retrieve the addresses
+ */
+ if (so->so_proto_props.sopp_maxaddrlen > sizeof (laddrbuf)) {
+ laddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen,
+ KM_NOSLEEP);
+ if (laddr == NULL)
+ return (ENOMEM);
+ faddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen,
+ KM_NOSLEEP);
+ if (faddr == NULL) {
+ kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen);
+ return (ENOMEM);
+ }
+ laddrlen = faddrlen = so->so_proto_props.sopp_maxaddrlen;
+ } else {
+ laddrlen = faddrlen = sizeof (laddrbuf);
+ laddr = &laddrbuf;
+ faddr = &faddrbuf;
+ }
+
+ error = (*so->so_downcalls->sd_getpeername)
+ (so->so_proto_handle, (struct sockaddr *)faddr, &faddrlen, kcred);
+ if (error != 0)
+ goto out;
+ error = (*so->so_downcalls->sd_getsockname)
+ (so->so_proto_handle, (struct sockaddr *)laddr, &laddrlen, kcred);
+ if (error != 0)
+ goto out;
+
+ /*
+ * The stack is built bottom up. Filters are allowed to modify the
+ * foreign and local addresses during attach.
+ */
+ for (pinst = pso->so_filter_bottom;
+ pinst != NULL && !(pinst->sofi_flags & SOFIF_BYPASS);
+ pinst = pinst->sofi_prev) {
+ inst = sof_instance_create(pinst->sofi_filter, so);
+ if (inst == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ /*
+ * The filter module must be loaded since it's already
+ * attached to the listener.
+ */
+ ASSERT(pinst->sofi_ops != NULL);
+ inst->sofi_ops = pinst->sofi_ops;
+
+ SOF_STAT_ADD(inst, tot_passive_attach, 1);
+ if (inst->sofi_ops->sofop_attach_passive != NULL) {
+ rval = inst->sofi_ops->sofop_attach_passive(
+ (sof_handle_t)inst,
+ (sof_handle_t)pinst, pinst->sofi_cookie,
+ (struct sockaddr *)laddr, laddrlen,
+ (struct sockaddr *)faddr, faddrlen,
+ &inst->sofi_cookie);
+ if (rval != SOF_RVAL_CONTINUE) {
+ if (rval == SOF_RVAL_DEFER) {
+ mutex_enter(&so->so_lock);
+ inst->sofi_flags |= SOFIF_DEFER;
+ so->so_state |= SS_FIL_DEFER;
+ mutex_exit(&so->so_lock);
+ so->so_filter_defertime =
+ ddi_get_lbolt();
+ SOF_STAT_ADD(inst, ndeferred, 1);
+ } else if (rval == SOF_RVAL_DETACH) {
+ sof_instance_destroy(inst);
+ } else {
+ SOF_STAT_ADD(inst, attach_failures, 1);
+ error = sof_rval2errno(rval);
+ /*
+ * Filters that called attached will be
+ * destroyed when the socket goes away,
+ * after detach is called.
+ */
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ if (laddr != &laddrbuf) {
+ kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen);
+ kmem_free(faddr, so->so_proto_props.sopp_maxaddrlen);
+ }
+ return (error);
+}
+
+/*
+ * Attach any automatic filters to sonode `so'. Returns 0 if all went well
+ * or an errno otherwise.
+ */
+int
+sof_sonode_autoattach_filters(struct sonode *so, cred_t *cr)
+{
+ struct sockparams *sp = so->so_sockparams;
+ sp_filter_t *fil;
+ sof_instance_t *inst;
+ sof_rval_t rval;
+ int error;
+
+ /*
+ * A created instance is added to the top of the sonode's filter
+ * stack, so traverse the config list in reverse order.
+ */
+ rw_enter(&sockconf_lock, RW_READER);
+ for (fil = list_tail(&sp->sp_auto_filters);
+ fil != NULL; fil = list_prev(&sp->sp_auto_filters, fil)) {
+ ASSERT(fil->spf_filter->sofe_flags & SOFEF_AUTO);
+ if (!sof_instance_create(fil->spf_filter, so)) {
+ rw_exit(&sockconf_lock);
+ error = ENOMEM; /* must have run out of memory */
+ goto free_all;
+ }
+ }
+ rw_exit(&sockconf_lock);
+
+ /*
+ * Notify each filter that it's being attached.
+ */
+ inst = so->so_filter_top;
+ while (inst != NULL) {
+ sof_entry_t *ent = inst->sofi_filter;
+ sof_instance_t *ninst = inst->sofi_next;
+
+ /*
+ * This might be the first time the filter is being used,
+ * so try to load the module if it's not already registered.
+ */
+ if (ent->sofe_mod == NULL &&
+ (error = sof_entry_load_module(ent)) != 0)
+ goto free_detached;
+
+ /* Module loaded OK, so there must be an ops vector */
+ ASSERT(ent->sofe_mod != NULL);
+ inst->sofi_ops = &ent->sofe_mod->sofm_ops;
+
+ SOF_STAT_ADD(inst, tot_active_attach, 1);
+ if (inst->sofi_ops->sofop_attach_active != NULL) {
+ rval = inst->sofi_ops->sofop_attach_active(
+ (sof_handle_t)inst, so->so_family, so->so_type,
+ so->so_protocol, cr, &inst->sofi_cookie);
+ if (rval != SOF_RVAL_CONTINUE) {
+ switch (rval) {
+ case SOF_RVAL_DETACH:
+ /* filter does not want to attach */
+ sof_instance_destroy(inst);
+ break;
+ default:
+ SOF_STAT_ADD(inst, attach_failures, 1);
+ /* Not a valid rval for active attach */
+ ASSERT(rval != SOF_RVAL_DEFER);
+ error = sof_rval2errno(rval);
+ goto free_detached;
+ }
+ }
+ }
+ inst = ninst;
+ }
+ return (0);
+
+free_all:
+ inst = so->so_filter_top;
+free_detached:
+ ASSERT(inst != NULL);
+ /*
+ * Destroy all filters for which attach was not called. The other
+ * filters will be destroyed (and detach called) when the socket
+ * is freed.
+ */
+ do {
+ sof_instance_t *t = inst->sofi_next;
+ sof_instance_destroy(inst);
+ inst = t;
+ } while (inst != NULL);
+
+ return (error);
+}
+
+/*
+ * Detaches and frees all filters attached to sonode `so'.
+ */
+void
+sof_sonode_cleanup(struct sonode *so)
+{
+ sof_instance_t *inst;
+
+ while ((inst = so->so_filter_top) != NULL) {
+ (inst->sofi_ops->sofop_detach)((sof_handle_t)inst,
+ inst->sofi_cookie, kcred);
+ sof_instance_destroy(inst);
+ }
+}
+
+/*
+ * Notifies all active filters attached to `so' about `event', where
+ * `arg' is an event-specific argument.
+ */
+void
+sof_sonode_notify_filters(struct sonode *so, sof_event_t event, uintptr_t arg)
+{
+ sof_instance_t *inst;
+
+ for (inst = so->so_filter_bottom; inst != NULL;
+ inst = inst->sofi_prev) {
+ if (SOF_INTERESTED(inst, notify))
+ (inst->sofi_ops->sofop_notify)((sof_handle_t)inst,
+ inst->sofi_cookie, event, arg);
+ }
+}
+
+/*
+ * The socket `so' is closing. Notify filters and make sure that there
+ * are no pending tx operations.
+ */
+void
+sof_sonode_closing(struct sonode *so)
+{
+ /*
+ * Notify filters that the socket is being closed. It's OK for
+ * filters to inject data.
+ */
+ sof_sonode_notify_filters(so, SOF_EV_CLOSING, (uintptr_t)B_TRUE);
+
+ /* wait for filters that are sending out data */
+ mutex_enter(&so->so_lock);
+ while (so->so_filter_tx > 0)
+ cv_wait(&so->so_closing_cv, &so->so_lock);
+ mutex_exit(&so->so_lock);
+}
+
+/*
+ * Called when socket `so' wants to get rid of a deferred connection.
+ * Returns TRUE if a connection was dropped.
+ */
+boolean_t
+sof_sonode_drop_deferred(struct sonode *so)
+{
+ struct sonode *def;
+ clock_t now = ddi_get_lbolt();
+
+ if (sof_close_deferred_backlog > sof_close_deferred_max_backlog) {
+ SOF_GLOBAL_STAT_BUMP(defer_close_failed_backlog_too_big);
+ return (B_FALSE);
+ }
+ mutex_enter(&so->so_acceptq_lock);
+ if ((def = list_head(&so->so_acceptq_defer)) != NULL &&
+ (now - def->so_filter_defertime) > sof_defer_drop_time) {
+ list_remove(&so->so_acceptq_defer, def);
+ so->so_acceptq_len--;
+ mutex_exit(&so->so_acceptq_lock);
+ def->so_listener = NULL;
+ } else {
+ mutex_exit(&so->so_acceptq_lock);
+ return (B_FALSE);
+ }
+
+ mutex_enter(&sof_close_deferred_lock);
+ list_insert_tail(&sof_close_deferred_list, def);
+ sof_close_deferred_backlog++;
+ if (!sof_close_deferred_running) {
+ mutex_exit(&sof_close_deferred_lock);
+ (void) taskq_dispatch(sof_close_deferred_taskq,
+ sof_close_deferred, NULL, TQ_NOSLEEP);
+ } else {
+ mutex_exit(&sof_close_deferred_lock);
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Called from a taskq to close connections that have been deferred for
+ * too long.
+ */
+void
+sof_close_deferred(void *unused)
+{
+ struct sonode *drop;
+
+ _NOTE(ARGUNUSED(unused));
+
+ mutex_enter(&sof_close_deferred_lock);
+ if (!sof_close_deferred_running) {
+ sof_close_deferred_running = B_TRUE;
+ while ((drop =
+ list_remove_head(&sof_close_deferred_list)) != NULL) {
+ sof_close_deferred_backlog--;
+ mutex_exit(&sof_close_deferred_lock);
+
+ SOF_GLOBAL_STAT_BUMP(defer_closed);
+ (void) socket_close(drop, 0, kcred);
+ socket_destroy(drop);
+
+ mutex_enter(&sof_close_deferred_lock);
+ }
+ sof_close_deferred_running = B_FALSE;
+ ASSERT(sof_close_deferred_backlog == 0);
+ }
+ mutex_exit(&sof_close_deferred_lock);
+}
+
+/*
+ * Creates a new filter instance from the entry `ent' and attaches
+ * it to the sonode `so'. On success, return a pointer to the created
+ * instance.
+ *
+ * The new instance will be placed on the top of the filter stack.
+ *
+ * The caller is responsible for assigning the instance's ops vector and
+ * calling the filter's attach callback.
+ *
+ * No locks are held while manipulating the sonode fields because we are
+ * guaranteed that this operation is serialized.
+ *
+ * We can be sure that the entry `ent' will not disappear, because the
+ * caller is either holding sockconf_lock (in case of an active open), or is
+ * already holding a reference (in case of a passive open, the listener has
+ * one).
+ */
+static sof_instance_t *
+sof_instance_create(sof_entry_t *ent, struct sonode *so)
+{
+ sof_instance_t *inst;
+
+ inst = kmem_zalloc(sizeof (sof_instance_t), KM_NOSLEEP);
+ if (inst == NULL)
+ return (NULL);
+ sof_entry_hold(ent);
+ inst->sofi_filter = ent;
+ inst->sofi_sonode = so;
+
+ inst->sofi_next = so->so_filter_top;
+ if (so->so_filter_top != NULL)
+ so->so_filter_top->sofi_prev = inst;
+ else
+ so->so_filter_bottom = inst;
+ so->so_filter_top = inst;
+ so->so_filter_active++;
+
+ return (inst);
+}
+/*
+ * Destroys the filter instance `inst' and unlinks it from the sonode.
+ *
+ * Any filter private state must be destroyed (via the detach callback)
+ * before the instance is destroyed.
+ */
+static void
+sof_instance_destroy(sof_instance_t *inst)
+{
+ struct sonode *so = inst->sofi_sonode;
+
+ ASSERT(inst->sofi_sonode != NULL);
+ ASSERT(inst->sofi_filter != NULL);
+ ASSERT(inst->sofi_prev != NULL || so->so_filter_top == inst);
+ ASSERT(inst->sofi_next != NULL || so->so_filter_bottom == inst);
+
+ if (inst->sofi_prev != NULL)
+ inst->sofi_prev->sofi_next = inst->sofi_next;
+ else
+ so->so_filter_top = inst->sofi_next;
+
+ if (inst->sofi_next != NULL)
+ inst->sofi_next->sofi_prev = inst->sofi_prev;
+ else
+ so->so_filter_bottom = inst->sofi_prev;
+
+ if (!(inst->sofi_flags & SOFIF_BYPASS)) {
+ ASSERT(so->so_filter_active > 0);
+ so->so_filter_active--;
+ }
+ if (inst->sofi_flags & SOFIF_DEFER)
+ SOF_STAT_ADD(inst, ndeferred, -1);
+ sof_entry_rele(inst->sofi_filter);
+ kmem_free(inst, sizeof (sof_instance_t));
+}
+
+static sof_entry_t *
+sof_entry_find(const char *name)
+{
+ sof_entry_t *ent;
+
+ for (ent = list_head(&sof_entry_list); ent != NULL;
+ ent = list_next(&sof_entry_list, ent)) {
+ if (strncmp(ent->sofe_name, name, SOF_MAXNAMELEN) == 0)
+ return (ent);
+ }
+ return (NULL);
+}
+
+void
+sof_entry_free(sof_entry_t *ent)
+{
+ ASSERT(ent->sofe_refcnt == 0);
+ ASSERT(!list_link_active(&ent->sofe_node));
+
+ if (ent->sofe_hintarg != NULL) {
+ ASSERT(ent->sofe_hint == SOF_HINT_BEFORE ||
+ ent->sofe_hint == SOF_HINT_AFTER);
+ kmem_free(ent->sofe_hintarg, strlen(ent->sofe_hintarg) + 1);
+ ent->sofe_hintarg = NULL;
+ }
+ if (ent->sofe_socktuple_cnt > 0) {
+ ASSERT(ent->sofe_socktuple != NULL);
+ kmem_free(ent->sofe_socktuple,
+ sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt);
+ ent->sofe_socktuple = NULL;
+ ent->sofe_socktuple_cnt = 0;
+ }
+ sof_entry_kstat_destroy(ent);
+
+ mutex_destroy(&ent->sofe_lock);
+ kmem_free(ent, sizeof (sof_entry_t));
+}
+
+static int
+sof_entry_kstat_update(kstat_t *ksp, int rw)
+{
+ sof_entry_t *ent = ksp->ks_private;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ ent->sofe_kstat.sofek_nactive.value.ui64 = ent->sofe_refcnt;
+
+ return (0);
+}
+
+/*
+ * Create the kstat for filter entry `ent'.
+ */
+static int
+sof_entry_kstat_create(sof_entry_t *ent)
+{
+ char name[SOF_MAXNAMELEN + 7];
+
+ (void) snprintf(name, sizeof (name), "filter_%s", ent->sofe_name);
+ ent->sofe_ksp = kstat_create("sockfs", 0, name, "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof (sof_entry_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ent->sofe_ksp == NULL)
+ return (ENOMEM);
+
+ kstat_named_init(&ent->sofe_kstat.sofek_nactive, "nactive",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ent->sofe_kstat.sofek_tot_active_attach,
+ "tot_active_attach", KSTAT_DATA_UINT64);
+ kstat_named_init(&ent->sofe_kstat.sofek_tot_passive_attach,
+ "tot_passive_attach", KSTAT_DATA_UINT64);
+ kstat_named_init(&ent->sofe_kstat.sofek_ndeferred, "ndeferred",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ent->sofe_kstat.sofek_attach_failures,
+ "attach_failures", KSTAT_DATA_UINT64);
+
+ ent->sofe_ksp->ks_data = &ent->sofe_kstat;
+ ent->sofe_ksp->ks_update = sof_entry_kstat_update;
+ ent->sofe_ksp->ks_private = ent;
+ kstat_install(ent->sofe_ksp);
+
+ return (0);
+}
+
+/*
+ * Destroys the kstat for filter entry `ent'.
+ */
+static void
+sof_entry_kstat_destroy(sof_entry_t *ent)
+{
+ if (ent->sofe_ksp != NULL) {
+ kstat_delete(ent->sofe_ksp);
+ ent->sofe_ksp = NULL;
+ }
+}
+
+static void
+sof_entry_hold(sof_entry_t *ent)
+{
+ mutex_enter(&ent->sofe_lock);
+ ent->sofe_refcnt++;
+ mutex_exit(&ent->sofe_lock);
+}
+
+/*
+ * Decrement the reference count for `ent'. The entry will
+ * drop its reference on the filter module when its
+ * ref count reaches zero.
+ */
+static void
+sof_entry_rele(sof_entry_t *ent)
+{
+ mutex_enter(&ent->sofe_lock);
+ if (--ent->sofe_refcnt == 0) {
+ sof_module_t *mod = ent->sofe_mod;
+ ent->sofe_mod = NULL;
+ if (ent->sofe_flags & SOFEF_CONDEMED) {
+ mutex_exit(&ent->sofe_lock);
+ sof_entry_free(ent);
+ } else {
+ mutex_exit(&ent->sofe_lock);
+ }
+ if (mod != NULL)
+ sof_module_rele(mod);
+ } else {
+ mutex_exit(&ent->sofe_lock);
+ }
+}
+
+/*
+ * Loads the module used by `ent'
+ */
+static int
+sof_entry_load_module(sof_entry_t *ent)
+{
+ sof_module_t *mod = sof_module_hold_by_name(ent->sofe_name,
+ ent->sofe_modname);
+
+ if (mod == NULL)
+ return (EINVAL);
+
+ mutex_enter(&ent->sofe_lock);
+ /* Another thread might have already loaded the module */
+ ASSERT(ent->sofe_mod == mod || ent->sofe_mod == NULL);
+ if (ent->sofe_mod != NULL) {
+ mutex_exit(&ent->sofe_lock);
+ sof_module_rele(mod);
+ } else {
+ ent->sofe_mod = mod;
+ mutex_exit(&ent->sofe_lock);
+ }
+
+ return (0);
+}
+
+/*
+ * Add filter entry `ent' to the global list and attach it to all sockparams
+ * entries the filter is interested in. Upon successful return the filter
+ * will be available for applications to use.
+ */
+int
+sof_entry_add(sof_entry_t *ent)
+{
+ int error;
+
+ /*
+ * We hold sockconf_lock as a WRITER for the whole operation,
+ * so all operations must be non-blocking.
+ */
+ rw_enter(&sockconf_lock, RW_WRITER);
+ if (sof_entry_find(ent->sofe_name) != NULL) {
+ rw_exit(&sockconf_lock);
+ return (EEXIST);
+ }
+
+ /* The entry is unique; create the kstats */
+ if (sof_entry_kstat_create(ent) != 0) {
+ rw_exit(&sockconf_lock);
+ return (ENOMEM);
+ }
+
+ /*
+ * Attach the filter to sockparams of interest.
+ */
+ if ((error = sockparams_new_filter(ent)) != 0) {
+ sof_entry_kstat_destroy(ent);
+ rw_exit(&sockconf_lock);
+ return (error);
+ }
+ /*
+ * Everything is OK; insert in global list.
+ */
+ list_insert_tail(&sof_entry_list, ent);
+ rw_exit(&sockconf_lock);
+
+ return (0);
+}
+
+/*
+ * Removes the filter entry identified by `name' from the global list and
+ * all sockparams. Returns the removed entry, or NULL if no such filter
+ * exists.
+ */
+sof_entry_t *
+sof_entry_remove_by_name(const char *name)
+{
+ sof_entry_t *ent;
+
+ rw_enter(&sockconf_lock, RW_WRITER);
+ if ((ent = sof_entry_find(name)) == NULL) {
+ rw_exit(&sockconf_lock);
+ return (NULL);
+ }
+ list_remove(&sof_entry_list, ent);
+ sockparams_filter_cleanup(ent);
+ sof_entry_kstat_destroy(ent);
+ rw_exit(&sockconf_lock);
+
+ return (ent);
+}
+
+/*
+ * Filter entry `ent' will process sockparams entry `sp' to determine whether
+ * it should be attached to the sockparams. It should be called whenever a new
+ * filter or sockparams is being added. Returns zero either if the filter is
+ * not interested in the sockparams or if it successfully attached to the
+ * sockparams. On failure an errno is returned.
+ */
+int
+sof_entry_proc_sockparams(sof_entry_t *ent, struct sockparams *sp)
+{
+ uint_t i;
+ sof_socktuple_t *t = ent->sofe_socktuple;
+ sp_filter_t *new, *fil;
+
+ /* Only interested in non-TPI sockets */
+ if (strcmp(sp->sp_smod_name, SOTPI_SMOD_NAME) == 0)
+ return (0);
+
+ for (i = 0; i < ent->sofe_socktuple_cnt; i++) {
+ if (t[i].sofst_family == sp->sp_family &&
+ t[i].sofst_type == sp->sp_type &&
+ t[i].sofst_protocol == sp->sp_protocol)
+ break;
+ }
+ /* This filter is not interested in the sockparams entry */
+ if (i == ent->sofe_socktuple_cnt)
+ return (0);
+
+ new = kmem_zalloc(sizeof (sp_filter_t), KM_NOSLEEP);
+ if (new == NULL)
+ return (ENOMEM);
+
+ new->spf_filter = ent;
+ if (ent->sofe_flags & SOFEF_PROG) {
+ /* placement is irrelevant for programmatic filters */
+ list_insert_head(&sp->sp_prog_filters, new);
+ return (0);
+ } else {
+ ASSERT(ent->sofe_flags & SOFEF_AUTO);
+ /*
+ * If the filter specifies a placement hint, then make sure
+ * it can be satisfied.
+ */
+ switch (ent->sofe_hint) {
+ case SOF_HINT_TOP:
+ if ((fil = list_head(&sp->sp_auto_filters)) != NULL &&
+ fil->spf_filter->sofe_hint == SOF_HINT_TOP)
+ break;
+ list_insert_head(&sp->sp_auto_filters, new);
+ return (0);
+ case SOF_HINT_BOTTOM:
+ if ((fil = list_tail(&sp->sp_auto_filters)) != NULL &&
+ fil->spf_filter->sofe_hint == SOF_HINT_BOTTOM)
+ break;
+ list_insert_tail(&sp->sp_auto_filters, new);
+ return (0);
+ case SOF_HINT_BEFORE:
+ case SOF_HINT_AFTER:
+ for (fil = list_head(&sp->sp_auto_filters);
+ fil != NULL;
+ fil = list_next(&sp->sp_auto_filters, fil)) {
+ if (strncmp(ent->sofe_hintarg,
+ fil->spf_filter->sofe_name,
+ SOF_MAXNAMELEN) == 0)
+ break;
+ }
+
+ if (fil != NULL) {
+ if (ent->sofe_hint == SOF_HINT_BEFORE) {
+ if (fil->spf_filter->sofe_hint ==
+ SOF_HINT_TOP)
+ break;
+ list_insert_before(&sp->sp_auto_filters,
+ fil, new);
+ } else {
+ if (fil->spf_filter->sofe_hint ==
+ SOF_HINT_BOTTOM)
+ break;
+ list_insert_after(&sp->sp_auto_filters,
+ fil, new);
+ }
+ return (0);
+ }
+ /*FALLTHRU*/
+ case SOF_HINT_NONE:
+ /*
+ * Insert the new filter at the beginning as long as it
+ * does not violate a TOP hint, otherwise insert in the
+ * next suitable location.
+ */
+ if ((fil = list_head(&sp->sp_auto_filters)) != NULL &&
+ fil->spf_filter->sofe_hint == SOF_HINT_TOP) {
+ list_insert_after(&sp->sp_auto_filters, fil,
+ new);
+ } else {
+ list_insert_head(&sp->sp_auto_filters, new);
+ }
+ return (0);
+ }
+ /* Failed to insert the filter */
+ kmem_free(new, sizeof (sp_filter_t));
+ return (ENOSPC);
+ }
+}
+
+/*
+ * Remove all filter entries attached to the sockparams entry `sp'.
+ */
+void
+sof_sockparams_fini(struct sockparams *sp)
+{
+ sp_filter_t *fil;
+
+ ASSERT(!list_link_active(&sp->sp_node));
+
+ while ((fil = list_remove_head(&sp->sp_auto_filters)) != NULL)
+ kmem_free(fil, sizeof (sp_filter_t));
+ while ((fil = list_remove_head(&sp->sp_prog_filters)) != NULL)
+ kmem_free(fil, sizeof (sp_filter_t));
+}
+
+/*
+ * A new sockparams is being added. Walk all filters and attach those that
+ * are interested in the entry.
+ *
+ * It should be called when the sockparams entry is about to be made available
+ * for use and while holding the sockconf_lock.
+ */
+int
+sof_sockparams_init(struct sockparams *sp)
+{
+ sof_entry_t *ent;
+
+ ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+ for (ent = list_head(&sof_entry_list); ent != NULL;
+ ent = list_next(&sof_entry_list, ent)) {
+ if (sof_entry_proc_sockparams(ent, sp) != 0) {
+ sof_sockparams_fini(sp);
+ return (ENOMEM);
+ }
+ }
+ return (0);
+}
+
+static sof_module_t *
+sof_module_find(const char *name)
+{
+ sof_module_t *ent;
+
+ ASSERT(MUTEX_HELD(&sof_module_lock));
+
+ for (ent = list_head(&sof_module_list); ent != NULL;
+ ent = list_next(&sof_module_list, ent))
+ if (strcmp(ent->sofm_name, name) == 0)
+ return (ent);
+ return (NULL);
+}
+
+/*
+ * Returns a pointer to a module identified by `name' with its ref count
+ * bumped. An attempt is made to load the module if it is not found in the
+ * global list.
+ */
+sof_module_t *
+sof_module_hold_by_name(const char *name, const char *modname)
+{
+ ddi_modhandle_t handle = NULL;
+ sof_module_t *mod = NULL;
+ char *modpath;
+ int error;
+
+ /*
+ * We'll go through the loop at most two times, which will only
+ * happen if the module needs to be loaded.
+ */
+ for (;;) {
+ mutex_enter(&sof_module_lock);
+ mod = sof_module_find(name);
+ if (mod != NULL || handle != NULL)
+ break;
+ mutex_exit(&sof_module_lock);
+
+ modpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(modpath, MAXPATHLEN, "%s/%s", SOF_MODPATH,
+ modname);
+ handle = ddi_modopen(modpath, KRTLD_MODE_FIRST, &error);
+ kmem_free(modpath, MAXPATHLEN);
+ /* Failed to load, then bail */
+ if (handle == NULL) {
+ cmn_err(CE_WARN,
+ "Failed to load socket filter module: %s (err %d)",
+ modname, error);
+ return (NULL);
+ }
+ }
+ if (mod != NULL)
+ mod->sofm_refcnt++;
+ mutex_exit(&sof_module_lock);
+
+ if (handle != NULL) {
+ (void) ddi_modclose(handle);
+ /*
+ * The module was loaded, but the filter module could not be
+ * found. It's likely a misconfigured filter.
+ */
+ if (mod == NULL) {
+ cmn_err(CE_WARN,
+ "Socket filter module %s was loaded, but did not" \
+ "register. Filter %s is likely misconfigured.",
+ modname, name);
+ }
+ }
+
+ return (mod);
+}
+
+void
+sof_module_rele(sof_module_t *mod)
+{
+ mutex_enter(&sof_module_lock);
+ mod->sofm_refcnt--;
+ mutex_exit(&sof_module_lock);
+}
+
+int
+sof_rval2errno(sof_rval_t rval)
+{
+ if (rval > SOF_RVAL_CONTINUE) {
+ return ((int)rval);
+ } else {
+#ifdef DEBUG
+ if (socket_filter_debug)
+ printf("sof_rval2errno: invalid rval '%d'\n", rval);
+#endif
+ return (EINVAL);
+ }
+}
+
+/*
+ * Walk through the filters attached to `so', starting with `start', and allow
+ * each filter to process the data using its data_out callback. `mp' is a
+ * b_cont chain.
+ *
+ * Returns the processed mblk, or NULL if the mblk was consumed. The mblk might
+ * have been consumed as a result of an error, in which case `errp' is set to
+ * the appropriate errno.
+ */
+mblk_t *
+sof_filter_data_out_from(struct sonode *so, sof_instance_t *start,
+ mblk_t *mp, struct nmsghdr *msg, cred_t *cr, int *errp)
+{
+ sof_instance_t *inst;
+ sof_rval_t rval;
+
+ _NOTE(ARGUNUSED(so));
+
+ for (inst = start; inst != NULL; inst = inst->sofi_next) {
+ if (!SOF_INTERESTED(inst, data_out))
+ continue;
+ mp = (inst->sofi_ops->sofop_data_out)((sof_handle_t)inst,
+ inst->sofi_cookie, mp, msg, cr, &rval);
+ DTRACE_PROBE2(filter__data, (sof_instance_t), inst,
+ (mblk_t *), mp);
+ if (mp == NULL) {
+ *errp = sof_rval2errno(rval);
+ break;
+ }
+ }
+ return (mp);
+}
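For reference, a data_out callback compatible with the loop above can be as simple as the pass-through sketch below. The filter and probe names are hypothetical, and the signature is inferred from the call site in sof_filter_data_out_from().

        #include <sys/stream.h>
        #include <sys/strsun.h>
        #include <sys/socket.h>
        #include <sys/cred.h>
        #include <sys/sdt.h>
        #include <sys/sockfilter.h>

        /*
         * Pass outbound data through unchanged, noting its size. Returning
         * NULL instead would mean the chain was consumed, and *rvalp would
         * then determine the errno reported to the sender (see
         * sof_rval2errno()).
         */
        static mblk_t *
        examplef_data_out(sof_handle_t handle, void *cookie, mblk_t *mp,
            struct nmsghdr *msg, cred_t *cr, sof_rval_t *rvalp)
        {
                DTRACE_PROBE1(examplef__data__out, size_t, msgdsize(mp));
                *rvalp = SOF_RVAL_CONTINUE;
                return (mp);
        }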
+
+/*
+ * Walk through all the filters attached to `so' and allow each filter
+ * to process the data using its data_in_proc callback. `mp' is the start of
+ * a possible b_next chain, and `lastmp' points to the last mblk in the chain.
+ *
+ * Returns the processed mblk, or NULL if all mblks in the chain were
+ * consumed. `lastmp' is updated to point to the last mblk in the processed
+ * chain.
+ */
+mblk_t *
+sof_filter_data_in_proc(struct sonode *so, mblk_t *mp, mblk_t **lastmp)
+{
+ sof_instance_t *inst;
+ size_t len = 0, orig = 0;
+ ssize_t diff = 0;
+ mblk_t *retmp = NULL, *tailmp, *nextmp;
+
+ *lastmp = NULL;
+ do {
+ nextmp = mp->b_next;
+ mp->b_next = mp->b_prev = NULL;
+ len = orig = msgdsize(mp);
+ for (inst = so->so_filter_bottom; inst != NULL;
+ inst = inst->sofi_prev) {
+ if (!SOF_INTERESTED(inst, data_in_proc))
+ continue;
+ mp = (inst->sofi_ops->sofop_data_in_proc)(
+ (sof_handle_t)inst, inst->sofi_cookie, mp,
+ kcred, &len);
+ if (mp == NULL)
+ break;
+ }
+ DTRACE_PROBE2(filter__data, (sof_instance_t), inst,
+ (mblk_t *), mp);
+ diff += len - orig;
+ if (mp == NULL)
+ continue;
+
+ for (tailmp = mp; tailmp->b_cont != NULL;
+ tailmp = tailmp->b_cont)
+ ;
+ mp->b_prev = tailmp;
+
+ if (*lastmp == NULL)
+ retmp = mp;
+ else
+ (*lastmp)->b_next = mp;
+ *lastmp = mp;
+ } while ((mp = nextmp) != NULL);
+
+ /*
+ * The size of the chain has changed; make sure the rcv queue
+ * stays consistent and check if the flow control state should
+ * change.
+ */
+ if (diff != 0) {
+ DTRACE_PROBE2(filter__data__adjust__qlen,
+ (struct sonode *), so, (size_t), diff);
+ mutex_enter(&so->so_lock);
+ so->so_rcv_queued += diff;
+ /* so_check_flow_control drops so_lock */
+ so_check_flow_control(so);
+ }
+
+ return (retmp);
+}
+
+int
+sof_filter_bind(struct sonode *so, struct sockaddr *addr,
+ socklen_t *addrlen, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, bind, cr, addr, addrlen)
+}
+
+int
+sof_filter_listen(struct sonode *so, int *backlogp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, listen, cr, backlogp)
+}
+
+int
+sof_filter_connect(struct sonode *so, struct sockaddr *addr,
+ socklen_t *addrlen, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, connect, cr, addr, addrlen)
+}
+
+int
+sof_filter_accept(struct sonode *so, cred_t *cr)
+{
+ sof_instance_t *inst;
+ sof_rval_t rval;
+
+ for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) {
+ if (!SOF_INTERESTED(inst, accept))
+ continue;
+ rval = (inst->sofi_ops->sofop_accept)((sof_handle_t)inst,
+ inst->sofi_cookie, cr);
+ DTRACE_PROBE2(filter__action, (sof_instance_t), inst,
+ (sof_rval_t), rval);
+ if (rval != SOF_RVAL_CONTINUE) {
+ ASSERT(rval != SOF_RVAL_RETURN);
+ return (sof_rval2errno(rval));
+ }
+ }
+ return (-1);
+}
+
+int
+sof_filter_shutdown(struct sonode *so, int *howp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, shutdown, cr, howp)
+}
+
+int
+sof_filter_getsockname(struct sonode *so, struct sockaddr *addr,
+ socklen_t *addrlenp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, getsockname, cr, addr, addrlenp)
+}
+
+int
+sof_filter_getpeername(struct sonode *so, struct sockaddr *addr,
+ socklen_t *addrlenp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, getpeername, cr, addr, addrlenp)
+}
+
+int
+sof_filter_setsockopt(struct sonode *so, int level, int option_name,
+ void *optval, socklen_t *optlenp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, setsockopt, cr, level, option_name,
+ optval, optlenp)
+}
+
+int
+sof_filter_getsockopt(struct sonode *so, int level, int option_name,
+ void *optval, socklen_t *optlenp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, getsockopt, cr, level, option_name,
+ optval, optlenp)
+}
+
+int
+sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+ int32_t *rvalp, cred_t *cr)
+{
+ __SOF_FILTER_OP(so, ioctl, cr, cmd, arg, mode, rvalp)
+}
+
+/*
+ * sof_register(version, name, ops, flags)
+ *
+ * Register a socket filter identified by `name' that will use the ops
+ * vector `ops' for event notification. `flags' should be set to 0.
+ * On success 0 is returned, otherwise an errno is returned.
+ */
+int
+sof_register(int version, const char *name, const sof_ops_t *ops, int flags)
+{
+ sof_module_t *mod;
+
+ _NOTE(ARGUNUSED(flags));
+
+ if (version != SOF_VERSION)
+ return (EINVAL);
+
+ mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP);
+ mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+ (void) strcpy(mod->sofm_name, name);
+ mod->sofm_ops = *ops;
+
+ mutex_enter(&sof_module_lock);
+ if (sof_module_find(name) != NULL) {
+ mutex_exit(&sof_module_lock);
+ kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1);
+ kmem_free(mod, sizeof (sof_module_t));
+ return (EEXIST);
+ }
+ list_insert_tail(&sof_module_list, mod);
+ mutex_exit(&sof_module_lock);
+
+ return (0);
+}
+
+/*
+ * sof_unregister(name)
+ *
+ * Try to unregister the socket filter identified by `name'. If the filter
+ * is successfully unregistered, then 0 is returned, otherwise an errno is
+ * returned.
+ */
+int
+sof_unregister(const char *name)
+{
+ sof_module_t *mod;
+
+ mutex_enter(&sof_module_lock);
+ mod = sof_module_find(name);
+ if (mod != NULL) {
+ if (mod->sofm_refcnt == 0) {
+ list_remove(&sof_module_list, mod);
+ mutex_exit(&sof_module_lock);
+
+ kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1);
+ kmem_free(mod, sizeof (sof_module_t));
+ return (0);
+ } else {
+ mutex_exit(&sof_module_lock);
+ return (EBUSY);
+ }
+ }
+ mutex_exit(&sof_module_lock);
+
+ return (ENXIO);
+}
+
+/*
+ * sof_newconn_ready(handle)
+ *
+ * The filter `handle' no longer wants to defer the socket it is attached
+ * to. A newconn notification will be generated if there is no other filter
+ * that wants the socket deferred.
+ */
+void
+sof_newconn_ready(sof_handle_t handle)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ struct sonode *so = inst->sofi_sonode;
+ struct sonode *pso = so->so_listener;
+
+ mutex_enter(&so->so_lock);
+ if (!(inst->sofi_flags & SOFIF_DEFER)) {
+ mutex_exit(&so->so_lock);
+ return;
+ }
+ ASSERT(so->so_state & SS_FIL_DEFER);
+ inst->sofi_flags &= ~SOFIF_DEFER;
+ SOF_STAT_ADD(inst, ndeferred, -1);
+
+ /*
+ * Check if any other filter has deferred the socket. The last
+ * filter to remove its DEFER flag will be the one generating the
+ * wakeup.
+ */
+ for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) {
+ /* Still deferred; nothing to do */
+ if (inst->sofi_flags & SOFIF_DEFER) {
+ mutex_exit(&so->so_lock);
+ return;
+ }
+ }
+ so->so_state &= ~SS_FIL_DEFER;
+ mutex_exit(&so->so_lock);
+
+ /*
+ * The socket is no longer deferred; move it over to the regular
+ * accept list and notify the user. However, it is possible that
+ * the socket is being dropped by sof_sonode_drop_deferred(), so
+ * first make sure the socket is on the deferred list.
+ */
+ mutex_enter(&pso->so_acceptq_lock);
+ if (!list_link_active(&so->so_acceptq_node)) {
+ mutex_exit(&pso->so_acceptq_lock);
+ return;
+ }
+ list_remove(&pso->so_acceptq_defer, so);
+ list_insert_tail(&pso->so_acceptq_list, so);
+ cv_signal(&pso->so_acceptq_cv);
+ mutex_exit(&pso->so_acceptq_lock);
+
+ mutex_enter(&pso->so_lock);
+ so_notify_newconn(pso); /* so_notify_newconn drops the lock */
+}
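Seen from a filter, the defer handshake is: return SOF_RVAL_DEFER from the passive attach callback to keep the connection off the listener's accept list, then call sof_newconn_ready() once the connection may be delivered. A hedged sketch; the attach_passive signature mirrors the call in sof_sonode_inherit_filters(), and the deferred work itself is not shown.

        #include <sys/socket.h>
        #include <sys/cred.h>
        #include <sys/sockfilter.h>

        static sof_rval_t
        examplef_attach_passive(sof_handle_t handle, sof_handle_t parent,
            void *parent_cookie, struct sockaddr *laddr, socklen_t laddrlen,
            struct sockaddr *faddr, socklen_t faddrlen, void **cookiep)
        {
                *cookiep = NULL;

                /*
                 * Keep the new connection on so_acceptq_defer until some
                 * asynchronous check (not shown) has completed, or until
                 * the listener drops it via sof_sonode_drop_deferred().
                 */
                return (SOF_RVAL_DEFER);
        }

        /* Invoked by the filter once its check has completed. */
        static void
        examplef_check_done(sof_handle_t handle)
        {
                sof_newconn_ready(handle);
        }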
+
+/*
+ * sof_bypass(handle)
+ *
+ * Stop generating callbacks for `handle'.
+ */
+void
+sof_bypass(sof_handle_t handle)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ struct sonode *so = inst->sofi_sonode;
+
+ mutex_enter(&so->so_lock);
+ if (!(inst->sofi_flags & SOFIF_BYPASS)) {
+ inst->sofi_flags |= SOFIF_BYPASS;
+ ASSERT(so->so_filter_active > 0);
+ so->so_filter_active--;
+ }
+ mutex_exit(&so->so_lock);
+}
+
+/*
+ * sof_rcv_flowctrl(handle, enable)
+ *
+ * If `enable' is TRUE, then recv side flow control will be asserted for
+ * the socket associated with `handle'. When `enable' is FALSE the filter
+ * indicates that it no longer wants to assert flow control, however, the
+ * condition will not be removed until there are no other filters asserting
+ * flow control and there is space available in the receive buffer.
+ */
+void
+sof_rcv_flowctrl(sof_handle_t handle, boolean_t enable)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ struct sonode *so = inst->sofi_sonode;
+
+ mutex_enter(&so->so_lock);
+ if (enable) {
+ inst->sofi_flags |= SOFIF_RCV_FLOWCTRL;
+ so->so_flowctrld = B_TRUE;
+ so->so_state |= SS_FIL_RCV_FLOWCTRL;
+ mutex_exit(&so->so_lock);
+ } else {
+ inst->sofi_flags &= ~SOFIF_RCV_FLOWCTRL;
+ for (inst = so->so_filter_top; inst != NULL;
+ inst = inst->sofi_next) {
+ /* another filter is asserting flow control */
+ if (inst->sofi_flags & SOFIF_RCV_FLOWCTRL) {
+ mutex_exit(&so->so_lock);
+ return;
+ }
+ }
+ so->so_state &= ~SS_FIL_RCV_FLOWCTRL;
+ /* so_check_flow_control drops so_lock */
+ so_check_flow_control(so);
+ }
+ ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * sof_snd_flowctrl(handle, enable)
+ *
+ * If `enable' is TRUE, then send side flow control will be asserted for
+ * the socket associated with `handle'. When `enable' is FALSE the filter
+ * indicates that it no longer wants to assert flow control, however, the
+ * condition will not be removed until there are no other filters asserting
+ * flow control and there are tx buffers available.
+ */
+void
+sof_snd_flowctrl(sof_handle_t handle, boolean_t enable)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ struct sonode *so = inst->sofi_sonode;
+
+ mutex_enter(&so->so_lock);
+ if (enable) {
+ inst->sofi_flags |= SOFIF_SND_FLOWCTRL;
+ so->so_state |= SS_FIL_SND_FLOWCTRL;
+ } else {
+ inst->sofi_flags &= ~SOFIF_SND_FLOWCTRL;
+ for (inst = so->so_filter_top; inst != NULL;
+ inst = inst->sofi_next) {
+ if (inst->sofi_flags & SOFIF_SND_FLOWCTRL) {
+ mutex_exit(&so->so_lock);
+ return;
+ }
+ }
+ so->so_state &= ~SS_FIL_SND_FLOWCTRL;
+ /*
+ * Wake up writer if the socket is no longer flow controlled.
+ */
+ if (!SO_SND_FLOWCTRLD(so)) {
+ /* so_notify_writable drops so_lock */
+ so_notify_writable(so);
+ return;
+ }
+ }
+ mutex_exit(&so->so_lock);
+}
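As a usage sketch for the flow-control hooks: a filter that is privately buffering inbound data would assert receive-side flow control when its backlog grows and release it once drained. Only sof_rcv_flowctrl()/sof_snd_flowctrl() come from this change; the threshold and helper names are assumptions.

#include <sys/types.h>
#include <sys/sockfilter.h>

#define	DEMO_FILT_HIWAT	(64 * 1024)	/* assumed backlog threshold */

static void
demo_filt_backlog_grew(sof_handle_t handle, size_t queued)
{
	if (queued >= DEMO_FILT_HIWAT)
		sof_rcv_flowctrl(handle, B_TRUE);
}

static void
demo_filt_backlog_drained(sof_handle_t handle)
{
	/*
	 * The condition is only fully lifted once no other filter asserts
	 * flow control and the receive buffer has space again.
	 */
	sof_rcv_flowctrl(handle, B_FALSE);
}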
+
+/*
+ * sof_get_cookie(handle)
+ *
+ * Returns the cookie used by `handle'.
+ */
+void *
+sof_get_cookie(sof_handle_t handle)
+{
+ return (((sof_instance_t *)handle)->sofi_cookie);
+}
+
+/*
+ * sof_cas_cookie(handle, old, new)
+ *
+ * Compare-and-swap the cookie used by `handle'.
+ */
+void *
+sof_cas_cookie(sof_handle_t handle, void *old, void *new)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+
+ return (atomic_cas_ptr(&inst->sofi_cookie, old, new));
+}
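The cookie accessors make it easy to attach per-socket state lazily; the sketch below uses sof_cas_cookie() so that exactly one allocation survives if several threads race. The state structure is hypothetical; the accessors are the ones defined above.

#include <sys/kmem.h>
#include <sys/sockfilter.h>

/* Hypothetical per-socket state. */
typedef struct demo_state {
	uint64_t	ds_bytes_seen;
} demo_state_t;

static demo_state_t *
demo_filt_state(sof_handle_t handle)
{
	demo_state_t *st = sof_get_cookie(handle);

	if (st == NULL) {
		demo_state_t *nst = kmem_zalloc(sizeof (*nst), KM_SLEEP);

		/* Returns the previous cookie; NULL means we won the race. */
		st = sof_cas_cookie(handle, NULL, nst);
		if (st == NULL)
			st = nst;
		else
			kmem_free(nst, sizeof (*nst));
	}
	return (st);
}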
+
+/*
+ * sof_inject_data_out(handle, mp, msg, flowctrld)
+ *
+ * Submit `mp' for transmission. `msg' cannot be NULL, and may contain
+ * ancillary data and a destination address. Returns 0 when successful,
+ * in which case `flowctrld' is updated. If flow controlled, no new data
+ * should be injected until a SOF_EV_INJECT_DATA_OUT_OK event is observed.
+ * In case of failure, an errno is returned.
+ *
+ * Filters that are lower in the stack than `handle' will see the data
+ * before it is transmitted and may end up modifying or freeing the data.
+ */
+int
+sof_inject_data_out(sof_handle_t handle, mblk_t *mp, struct nmsghdr *msg,
+ boolean_t *flowctrld)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ struct sonode *so = inst->sofi_sonode;
+ int error;
+
+ /*
+ * Data cannot be sent down to the protocol once the socket has
+ * started the process of closing.
+ */
+ mutex_enter(&so->so_lock);
+ if (so->so_state & SS_CLOSING) {
+ mutex_exit(&so->so_lock);
+ freemsg(mp);
+ return (EPIPE);
+ }
+ so->so_filter_tx++;
+ mutex_exit(&so->so_lock);
+
+ error = so_sendmblk_impl(inst->sofi_sonode, msg, FNONBLOCK,
+ kcred, &mp, inst->sofi_next, B_TRUE);
+
+ mutex_enter(&so->so_lock);
+ ASSERT(so->so_filter_tx > 0);
+ so->so_filter_tx--;
+ if (so->so_state & SS_CLOSING)
+ cv_signal(&so->so_closing_cv);
+ mutex_exit(&so->so_lock);
+
+ if (mp != NULL)
+ freemsg(mp);
+
+ if (error == ENOSPC) {
+ *flowctrld = B_TRUE;
+ error = 0;
+ } else {
+ *flowctrld = B_FALSE;
+ }
+
+ return (error);
+}
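A minimal transmit-side injection might look like the sketch below: build an mblk, pass a zeroed nmsghdr (it may not be NULL), and let the caller back off until SOF_EV_INJECT_DATA_OUT_OK if flow control is reported. The marker payload and helper name are assumptions.

#include <sys/systm.h>
#include <sys/stream.h>
#include <sys/socket.h>
#include <sys/sockfilter.h>

static int
demo_filt_send_marker(sof_handle_t handle, boolean_t *flowctrld)
{
	struct nmsghdr	msg;
	mblk_t		*mp;
	uint32_t	marker = 0x534f4621;	/* arbitrary payload */

	if ((mp = allocb(sizeof (marker), BPRI_MED)) == NULL)
		return (ENOMEM);
	bcopy(&marker, mp->b_wptr, sizeof (marker));
	mp->b_wptr += sizeof (marker);

	bzero(&msg, sizeof (msg));	/* `msg' cannot be NULL */

	/*
	 * On success *flowctrld tells the caller whether to pause injection
	 * until a SOF_EV_INJECT_DATA_OUT_OK event arrives for this socket.
	 */
	return (sof_inject_data_out(handle, mp, &msg, flowctrld));
}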
+
+/*
+ * sof_inject_data_in(handle, mp, len, flags, flowctrld)
+ *
+ * Enqueue `mp' which contains `len' bytes of M_DATA onto the socket
+ * associated with `handle'. `flags' should be set to 0. Returns 0 when
+ * successful, in which case `flowctrld' is updated. If flow controlled,
+ * no new data should be injected until a SOF_EV_INJECT_DATA_IN_OK event
+ * is observed. In case of failure, an errno is returned.
+ *
+ * Filters that are higher in the stack than `handle' will see the data
+ * before it is enqueued on the receive queue and may end up modifying or
+ * freeing the data.
+ */
+int
+sof_inject_data_in(sof_handle_t handle, mblk_t *mp, size_t len, int flags,
+ boolean_t *flowctrld)
+{
+ sof_instance_t *inst = (sof_instance_t *)handle;
+ ssize_t avail;
+ int error = 0;
+
+ ASSERT(flags == 0);
+ avail = so_queue_msg_impl(inst->sofi_sonode, mp, len, flags, &error,
+ NULL, inst->sofi_prev);
+ /* fallback should never happen when there is an active filter */
+ ASSERT(error != EOPNOTSUPP);
+
+ *flowctrld = (avail > 0) ? B_FALSE : B_TRUE;
+ return (error);
+}
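The receive-side counterpart follows the same pattern; `flags' is always 0 and the flow-control indication tells the filter to hold further injections until SOF_EV_INJECT_DATA_IN_OK. Everything except sof_inject_data_in() itself is an assumed wrapper.

#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/sockfilter.h>

static int
demo_filt_release_buffered(sof_handle_t handle, mblk_t *mp,
    boolean_t *flowctrld)
{
	size_t len = msgdsize(mp);	/* number of M_DATA bytes in `mp' */

	/* flags must be 0; *flowctrld is only meaningful when 0 is returned */
	return (sof_inject_data_in(handle, mp, len, 0, flowctrld));
}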
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
new file mode 100644
index 0000000000..d37410a0d1
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
@@ -0,0 +1,213 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SOCKFS_SOCKFILTER_H
+#define _SOCKFS_SOCKFILTER_H
+
+#include <sys/kstat.h>
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sonode;
+struct sockparams;
+
+typedef struct sof_module sof_module_t;
+typedef struct sof_entry_kstat sof_entry_kstat_t;
+typedef struct sof_entry sof_entry_t;
+typedef struct sof_instance sof_instance_t;
+typedef struct sof_kstat sof_kstat_t;
+
+#define SOF_MAXNAMELEN FILNAME_MAX
+#define SOF_MAXSOCKTUPLECNT 32
+#define SOF_MODPATH SOCKMOD_PATH
+
+struct sof_module {
+ char *sofm_name;
+ sof_ops_t sofm_ops;
+ uint_t sofm_refcnt;
+ list_node_t sofm_node;
+};
+
+struct sof_kstat {
+ kstat_named_t sofks_defer_closed;
+ kstat_named_t sofks_defer_close_backlog;
+ kstat_named_t sofks_defer_close_failed_backlog_too_big;
+};
+
+#define SOF_GLOBAL_STAT_BUMP(s) \
+ atomic_add_64(&sof_stat.sofks_##s.value.ui64, 1)
+
+/*
+ * Per filter statistics.
+ */
+struct sof_entry_kstat {
+ kstat_named_t sofek_nactive; /* # of consumers */
+ kstat_named_t sofek_tot_active_attach;
+ kstat_named_t sofek_tot_passive_attach;
+ kstat_named_t sofek_ndeferred; /* # of deferred conns */
+ kstat_named_t sofek_attach_failures;
+};
+
+/*
+ * Socket filter entry - one for each configured filter (added and
+ * removed by soconfig(1M)).
+ *
+ * sofe_flags, sofe_refcnt and sofe_mod are protected by sofe_lock, and all
+ * other fields are write once.
+ */
+struct sof_entry {
+ char sofe_name[SOF_MAXNAMELEN]; /* filter name */
+ char sofe_modname[MODMAXNAMELEN]; /* filter module */
+ sof_hint_t sofe_hint; /* order hint */
+ char *sofe_hintarg; /* hint argument */
+ list_node_t sofe_node; /* global list node */
+ uint_t sofe_socktuple_cnt; /* # of socket tuples */
+ sof_socktuple_t *sofe_socktuple; /* socket tuple list */
+
+ sof_entry_kstat_t sofe_kstat; /* filter stats */
+ kstat_t *sofe_ksp;
+
+ kmutex_t sofe_lock;
+ char sofe_flags; /* SOFEF_* flags */
+ uint_t sofe_refcnt; /* # of instances */
+ sof_module_t *sofe_mod; /* filter module */
+};
+
+/* Filter entry flags */
+#define SOFEF_AUTO 0x1 /* automatic filter */
+#define SOFEF_PROG 0x2 /* programmatic filter */
+#define SOFEF_CONDEMED 0x4 /* removed by soconfig(1M) */
+
+/*
+ * Socket filter instance - one for each socket using a sof_entry_t
+ */
+struct sof_instance {
+ sof_ops_t *sofi_ops; /* filter ops */
+ void *sofi_cookie; /* filter cookie (from attach) */
+ char sofi_flags; /* instance flags (SOFIF_*) */
+ sof_instance_t *sofi_prev; /* up the stack */
+ sof_instance_t *sofi_next; /* down the stack */
+ struct sonode *sofi_sonode; /* socket instance is attached to */
+ sof_entry_t *sofi_filter; /* filter this is an instance of */
+};
+
+/* Filter instance flags */
+#define SOFIF_BYPASS 0x1 /* filter does not want any callbacks */
+#define SOFIF_DEFER 0x2 /* defer notification of socket */
+#define SOFIF_RCV_FLOWCTRL 0x4 /* flow control recv path */
+#define SOFIF_SND_FLOWCTRL 0x8 /* flow control send path */
+
+#define SOF_STAT_ADD(i, s, v) \
+ atomic_add_64(&(i)->sofi_filter->sofe_kstat.sofek_##s.value.ui64, (v))
+
+extern void sof_init(void);
+
+extern void sof_entry_free(sof_entry_t *);
+extern int sof_entry_add(sof_entry_t *);
+extern sof_entry_t *sof_entry_remove_by_name(const char *);
+extern int sof_entry_proc_sockparams(sof_entry_t *, struct sockparams *);
+
+extern int sof_sockparams_init(struct sockparams *);
+extern void sof_sockparams_fini(struct sockparams *);
+
+extern int sof_sonode_autoattach_filters(struct sonode *, cred_t *);
+extern int sof_sonode_inherit_filters(struct sonode *, struct sonode *);
+extern void sof_sonode_closing(struct sonode *);
+extern void sof_sonode_cleanup(struct sonode *);
+extern void sof_sonode_notify_filters(struct sonode *, sof_event_t,
+ uintptr_t);
+extern boolean_t sof_sonode_drop_deferred(struct sonode *);
+
+extern int sof_setsockopt(struct sonode *, int, const void *, socklen_t,
+ struct cred *);
+extern int sof_getsockopt(struct sonode *, int, void *, socklen_t *,
+ struct cred *);
+
+extern int sof_rval2errno(sof_rval_t);
+
+#define SOF_INTERESTED(inst, op) \
+ (!((inst)->sofi_flags & SOFIF_BYPASS) && \
+ (inst)->sofi_ops->sofop_##op != NULL)
+
+/*
+ * SOF_FILTER_OP traverses the filter stack for sonode `so' top-down,
+ * calling `op' for each filter with the supplied `args'. A non-negative
+ * return value indicates that a filter action was taken.
+ */
+#define __SOF_FILTER_OP(so, op, cr, ...) \
+ sof_instance_t *__inst; \
+ sof_rval_t __rval; \
+ \
+ for (__inst = (so)->so_filter_top; __inst != NULL; \
+ __inst = __inst->sofi_next) { \
+ if (!SOF_INTERESTED(__inst, op)) \
+ continue; \
+ __rval = (__inst->sofi_ops->sofop_##op)((sof_handle_t)__inst,\
+ __inst->sofi_cookie, __VA_ARGS__, cr); \
+ DTRACE_PROBE2(filter__action, (sof_instance_t), __inst,\
+ (sof_rval_t), __rval); \
+ if (__rval != SOF_RVAL_CONTINUE) \
+ return (sof_rval2errno(__rval)); \
+ } \
+ return (-1);
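For orientation, each sof_filter_*() wrapper declared below is expected to be little more than an expansion of this macro; the sketch shows a plausible shape for sof_filter_listen(), not the actual definition from sockfilter.c.

/*
 * Plausible expansion of __SOF_FILTER_OP for the listen entry point.
 * A return value >= 0 means a filter consumed the operation (0 or an
 * errno); -1 tells the caller to continue with the normal listen path.
 */
int
sof_filter_listen(struct sonode *so, int *backlogp, cred_t *cr)
{
	__SOF_FILTER_OP(so, listen, cr, backlogp)
}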
+
+extern mblk_t *sof_filter_data_out_from(struct sonode *so,
+ sof_instance_t *, mblk_t *, struct nmsghdr *, cred_t *, int *);
+extern mblk_t *sof_filter_data_in_proc(struct sonode *so,
+ mblk_t *, mblk_t **);
+extern int sof_filter_bind(struct sonode *, struct sockaddr *,
+ socklen_t *, cred_t *);
+extern int sof_filter_listen(struct sonode *, int *, cred_t *);
+extern int sof_filter_connect(struct sonode *, struct sockaddr *,
+ socklen_t *, cred_t *);
+extern int sof_filter_accept(struct sonode *, cred_t *);
+extern int sof_filter_shutdown(struct sonode *, int *, cred_t *);
+extern int sof_filter_getsockname(struct sonode *, struct sockaddr *,
+ socklen_t *, cred_t *);
+extern int sof_filter_getpeername(struct sonode *, struct sockaddr *,
+ socklen_t *, cred_t *);
+extern int sof_filter_setsockopt(struct sonode *, int, int, void *,
+ socklen_t *, cred_t *);
+extern int sof_filter_getsockopt(struct sonode *, int, int, void *,
+ socklen_t *, cred_t *);
+extern int sof_filter_ioctl(struct sonode *, int, intptr_t, int,
+ int32_t *, cred_t *);
+
+#define SOF_FILTER_DATA_OUT(so, mp, msg, cr, errp) \
+ sof_filter_data_out_from(so, (so)->so_filter_top, mp, msg, cr, errp)
+#define SOF_FILTER_DATA_OUT_FROM(so, inst, mp, msg, cr, errp) \
+ sof_filter_data_out_from(so, inst, mp, msg, cr, errp)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SOCKFS_SOCKFILTER_H */
diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c
index 2bb564288b..d6c1f9ea85 100644
--- a/usr/src/uts/common/fs/sockfs/socknotify.c
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -33,6 +32,7 @@
#include <io/ksocket/ksocket_impl.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sodirect.h>
+#include <fs/sockfs/sockfilter_impl.h>
/*
* There can only be a single thread waiting for data (enforced by
@@ -78,6 +78,7 @@ so_notify_connected(struct sonode *so)
mutex_exit(&so->so_lock);
pollwakeup(&so->so_poll_list, POLLOUT);
}
+ sof_sonode_notify_filters(so, SOF_EV_CONNECTED, 0);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
}
@@ -93,18 +94,19 @@ so_notify_disconnecting(struct sonode *so)
int sigev = 0;
ASSERT(MUTEX_HELD(&so->so_lock));
+ (void) i_so_notify_last_tx(so, &pollev, &sigev);
if (IS_KERNEL_SOCKET(so)) {
- SO_WAKEUP_WRITER(so);
KSOCKET_CALLBACK(so, cantsendmore, 0);
mutex_exit(&so->so_lock);
- } else if (i_so_notify_last_tx(so, &pollev, &sigev)) {
- socket_sendsig(so, sigev);
- mutex_exit(&so->so_lock);
- pollwakeup(&so->so_poll_list, pollev);
} else {
+ if (sigev != 0)
+ socket_sendsig(so, sigev);
mutex_exit(&so->so_lock);
+ if (pollev != 0)
+ pollwakeup(&so->so_poll_list, pollev);
}
+ sof_sonode_notify_filters(so, SOF_EV_CANTSENDMORE, 0);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
}
@@ -114,7 +116,7 @@ so_notify_disconnecting(struct sonode *so)
* Wake up anyone that is waiting to send or receive data.
*/
void
-so_notify_disconnected(struct sonode *so, int error)
+so_notify_disconnected(struct sonode *so, boolean_t connfailed, int error)
{
int pollev = 0;
int sigev = 0;
@@ -125,7 +127,11 @@ so_notify_disconnected(struct sonode *so, int error)
(void) i_so_notify_last_rx(so, &pollev, &sigev);
if (IS_KERNEL_SOCKET(so)) {
- KSOCKET_CALLBACK(so, disconnected, error);
+ if (connfailed) {
+ KSOCKET_CALLBACK(so, disconnected, error);
+ } else {
+ KSOCKET_CALLBACK(so, connectfailed, error);
+ }
mutex_exit(&so->so_lock);
} else {
if (sigev != 0)
@@ -134,6 +140,8 @@ so_notify_disconnected(struct sonode *so, int error)
if (pollev != 0)
pollwakeup(&so->so_poll_list, pollev);
}
+ sof_sonode_notify_filters(so, (connfailed) ? SOF_EV_CONNECTFAILED :
+ SOF_EV_DISCONNECTED, error);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
}
@@ -158,6 +166,10 @@ so_notify_writable(struct sonode *so)
}
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+
+ /* filters can start injecting data */
+ if (so->so_filter_active > 0)
+ sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_OUT_OK, 0);
}
/*
@@ -270,7 +282,6 @@ so_notify_eof(struct sonode *so)
(void) i_so_notify_last_rx(so, &pollev, &sigev);
if (IS_KERNEL_SOCKET(so)) {
- SO_WAKEUP_READER(so);
KSOCKET_CALLBACK(so, cantrecvmore, 0);
mutex_exit(&so->so_lock);
} else {
@@ -281,6 +292,7 @@ so_notify_eof(struct sonode *so)
pollwakeup(&so->so_poll_list, pollev);
}
+ sof_sonode_notify_filters(so, SOF_EV_CANTRECVMORE, 0);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
}
@@ -294,7 +306,7 @@ so_notify_newconn(struct sonode *so)
ASSERT(MUTEX_HELD(&so->so_lock));
if (IS_KERNEL_SOCKET(so)) {
- KSOCKET_CALLBACK(so, newconn, so->so_rcv_queued);
+ KSOCKET_CALLBACK(so, newconn, 0);
mutex_exit(&so->so_lock);
} else {
socket_sendsig(so, SOCKETSIG_READ);
diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c
index 5c4872f090..60a1a1580c 100644
--- a/usr/src/uts/common/fs/sockfs/sockparams.c
+++ b/usr/src/uts/common/fs/sockfs/sockparams.c
@@ -36,6 +36,7 @@
#include <sys/socketvar.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
/*
@@ -53,12 +54,9 @@
* supplied device path, or when a socket is falling back to TPI.
*
* Lock order:
- * The lock order is splist_lock -> sp_lock.
- * The lock order is sp_ephem_lock -> sp_lock.
+ * The lock order is sockconf_lock -> sp_lock.
*/
extern int kobj_path_exists(char *, int);
-extern void nl7c_init(void);
-extern int sockfs_defer_nl7c_init;
static int sockparams_sdev_init(struct sockparams *, char *, int);
static void sockparams_sdev_fini(struct sockparams *);
@@ -67,13 +65,11 @@ static void sockparams_sdev_fini(struct sockparams *);
* Global sockparams list (populated via soconfig(1M)).
*/
static list_t sphead;
-static krwlock_t splist_lock;
/*
* List of ephemeral sockparams.
*/
static list_t sp_ephem_list;
-static krwlock_t sp_ephem_lock;
/* Global kstats for sockparams */
typedef struct sockparams_g_stats {
@@ -93,9 +89,6 @@ sockparams_init(void)
list_create(&sp_ephem_list, sizeof (struct sockparams),
offsetof(struct sockparams, sp_node));
- rw_init(&splist_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&sp_ephem_lock, NULL, RW_DEFAULT, NULL);
-
kstat_named_init(&sp_g_stats.spgs_ephem_nalloc, "ephemeral_nalloc",
KSTAT_DATA_UINT64);
kstat_named_init(&sp_g_stats.spgs_ephem_nreuse, "ephemeral_nreuse",
@@ -170,9 +163,8 @@ sockparams_kstat_fini(struct sockparams *sp)
* modname: Name of the module associated with the socket type. The
* module can be NULL if a device path is given, in which
* case the TPI module is used.
- * devpath: Path to the STREAMS device. May be NULL for non-STREAMS
- * based transports, or those transports that do not provide
- * the capability to fallback to STREAMS.
+ * devpath: Path to the STREAMS device. Must be NULL for non-STREAMS
+ * based transports.
* devpathlen: Length of the devpath string. The argument can be 0,
* indicating that devpath was allocated statically, and should
* not be freed when the sockparams entry is destroyed.
@@ -202,7 +194,7 @@ sockparams_create(int family, int type, int protocol, char *modname,
goto error;
}
- /* either a module or device must be given */
+ /* either a module or device must be given, but not both */
if (modname == NULL && devpath == NULL) {
*errorp = EINVAL;
goto error;
@@ -219,6 +211,11 @@ sockparams_create(int family, int type, int protocol, char *modname,
sp->sp_refcnt = 0;
sp->sp_flags = flags;
+ list_create(&sp->sp_auto_filters, sizeof (sp_filter_t),
+ offsetof(sp_filter_t, spf_node));
+ list_create(&sp->sp_prog_filters, sizeof (sp_filter_t),
+ offsetof(sp_filter_t, spf_node));
+
kstat_named_init(&sp->sp_stats.sps_nfallback, "nfallback",
KSTAT_DATA_UINT64);
kstat_named_init(&sp->sp_stats.sps_nactive, "nactive",
@@ -322,6 +319,10 @@ sockparams_destroy(struct sockparams *sp)
mutex_destroy(&sp->sp_lock);
sockparams_kstat_fini(sp);
+ sof_sockparams_fini(sp);
+ list_destroy(&sp->sp_auto_filters);
+ list_destroy(&sp->sp_prog_filters);
+
kmem_free(sp, sizeof (*sp));
}
@@ -404,12 +405,12 @@ sockparams_hold_ephemeral(int family, int type, int protocol,
/*
* First look for an existing entry
*/
- rw_enter(&sp_ephem_lock, RW_READER);
+ rw_enter(&sockconf_lock, RW_READER);
sp = sockparams_find(&sp_ephem_list, family, type, protocol,
by_devpath, name);
if (sp != NULL) {
SOCKPARAMS_INC_REF(sp);
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
sp_g_stats.spgs_ephem_nreuse.value.ui64++;
return (sp);
@@ -418,7 +419,7 @@ sockparams_hold_ephemeral(int family, int type, int protocol,
char *namebuf = NULL;
int namelen = 0;
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
namelen = strlen(name) + 1;
namebuf = kmem_alloc(namelen, kmflag);
@@ -460,7 +461,7 @@ sockparams_hold_ephemeral(int family, int type, int protocol,
* The sockparams entry was created, now try to add it
* to the list. We need to hold the lock as a WRITER.
*/
- rw_enter(&sp_ephem_lock, RW_WRITER);
+ rw_enter(&sockconf_lock, RW_WRITER);
sp = sockparams_find(&sp_ephem_list, family, type, protocol,
by_devpath, name);
if (sp != NULL) {
@@ -469,13 +470,19 @@ sockparams_hold_ephemeral(int family, int type, int protocol,
* place a hold on it and release the entry we alloc'ed.
*/
SOCKPARAMS_INC_REF(sp);
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
sockparams_destroy(newsp);
} else {
+ *errorp = sof_sockparams_init(newsp);
+ if (*errorp != 0) {
+ rw_exit(&sockconf_lock);
+ sockparams_destroy(newsp);
+ return (NULL);
+ }
SOCKPARAMS_INC_REF(newsp);
list_insert_tail(&sp_ephem_list, newsp);
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
sp = newsp;
}
@@ -514,18 +521,18 @@ sockparams_ephemeral_drop_last_ref(struct sockparams *sp)
ASSERT(sp->sp_flags & SOCKPARAMS_EPHEMERAL);
ASSERT(MUTEX_NOT_HELD(&sp->sp_lock));
- rw_enter(&sp_ephem_lock, RW_WRITER);
+ rw_enter(&sockconf_lock, RW_WRITER);
mutex_enter(&sp->sp_lock);
if (--sp->sp_refcnt == 0) {
list_remove(&sp_ephem_list, sp);
mutex_exit(&sp->sp_lock);
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
sockparams_destroy(sp);
} else {
mutex_exit(&sp->sp_lock);
- rw_exit(&sp_ephem_lock);
+ rw_exit(&sockconf_lock);
}
}
@@ -542,21 +549,37 @@ sockparams_ephemeral_drop_last_ref(struct sockparams *sp)
* is returned.
*
* Locking:
- * The caller can not be holding splist_lock.
+ * The caller can not be holding sockconf_lock.
*/
-static int
+int
sockparams_add(struct sockparams *sp)
{
+ int error;
+
ASSERT(!(sp->sp_flags & SOCKPARAMS_EPHEMERAL));
- rw_enter(&splist_lock, RW_WRITER);
+ rw_enter(&sockconf_lock, RW_WRITER);
if (sockparams_find(&sphead, sp->sp_family, sp->sp_type,
sp->sp_protocol, B_TRUE, NULL) != 0) {
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
return (EEXIST);
} else {
+ /*
+ * Unique sockparams entry, so init the kstats.
+ */
+ sockparams_kstat_init(sp);
+
+ /*
+ * Before making the socket type available we must make
+ * sure that interested socket filters are aware of it.
+ */
+ error = sof_sockparams_init(sp);
+ if (error != 0) {
+ rw_exit(&sockconf_lock);
+ return (error);
+ }
list_insert_tail(&sphead, sp);
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
return (0);
}
}
@@ -575,15 +598,15 @@ sockparams_add(struct sockparams *sp)
* On success 0, otherwise ENXIO.
*
* Locking:
- * Caller can not be holding splist_lock or the sp_lock of
+ * Caller can not be holding sockconf_lock or the sp_lock of
* any sockparams entry.
*/
-static int
+int
sockparams_delete(int family, int type, int protocol)
{
struct sockparams *sp;
- rw_enter(&splist_lock, RW_WRITER);
+ rw_enter(&sockconf_lock, RW_WRITER);
sp = sockparams_find(&sphead, family, type, protocol, B_TRUE, NULL);
if (sp != NULL) {
@@ -595,97 +618,22 @@ sockparams_delete(int family, int type, int protocol)
mutex_enter(&sp->sp_lock);
if (sp->sp_refcnt != 0) {
mutex_exit(&sp->sp_lock);
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
return (EBUSY);
}
mutex_exit(&sp->sp_lock);
/* Delete the sockparams entry. */
list_remove(&sphead, sp);
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
sockparams_destroy(sp);
return (0);
} else {
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
return (ENXIO);
}
}
-/*
- * soconfig(int family, int type, int protocol,
- * char *devpath, int devpathlen, char *module)
- *
- * Add or delete an entry to the sockparams table.
- * When devpath and module both are NULL, it will delete an entry.
- *
- * Arguments:
- * family, type, protocol: the tuple in question
- * devpath: STREAMS device path. Can be NULL for module based sockets.
- * module : Name of the socket module. Can be NULL for STREAMS
- * based sockets.
- * devpathlen: length of the devpath string, or 0 if devpath
- * was statically allocated.
- *
- * Note:
- * This routine assumes that the caller has kmem_alloced
- * devpath (if devpathlen > 0) and module for this routine to
- * consume.
- */
-int
-soconfig(int family, int type, int protocol,
- char *devpath, int devpathlen, char *module)
-{
- struct sockparams *sp;
- int error = 0;
-
- dprint(0, ("soconfig(%d,%d,%d,%s,%d,%s)\n",
- family, type, protocol, devpath, devpathlen,
- module == NULL ? "NULL" : module));
-
- if (sockfs_defer_nl7c_init) {
- nl7c_init();
- sockfs_defer_nl7c_init = 0;
- }
-
- if (devpath == NULL && module == NULL) {
- /*
- * Delete existing entry,
- * both socket module and STEAMS device.
- */
- ASSERT(module == NULL);
- error = sockparams_delete(family, type, protocol);
- } else {
- /*
- * Adding an entry
- * sockparams_create frees mod name and devpath upon failure.
- */
- sp = sockparams_create(family, type, protocol, module,
- devpath, devpathlen, 0, KM_SLEEP, &error);
-
- if (sp != NULL) {
- /*
- * The sockparams entry becomes globally visible once
- * we call sockparams_add(). So we add a reference so
- * we do not have to worry about the entry being
- * immediately deleted.
- */
- SOCKPARAMS_INC_REF(sp);
- error = sockparams_add(sp);
- if (error != 0) {
- SOCKPARAMS_DEC_REF(sp);
- sockparams_destroy(sp);
- } else {
- /*
- * Unique sockparams entry, so init the kstats.
- */
- sockparams_kstat_init(sp);
- SOCKPARAMS_DEC_REF(sp);
- }
- }
- }
-
- return (error);
-}
/*
* solookup(int family, int type, int protocol, struct sockparams **spp)
@@ -716,7 +664,7 @@ solookup(int family, int type, int protocol, struct sockparams **spp)
int error = 0;
*spp = NULL;
- rw_enter(&splist_lock, RW_READER);
+ rw_enter(&sockconf_lock, RW_READER);
/*
* Search the sockparams list for an appropriate entry.
@@ -740,7 +688,7 @@ solookup(int family, int type, int protocol, struct sockparams **spp)
sp->sp_protocol == protocol && found < 2)
found = 2;
}
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
switch (found) {
case 0:
error = EAFNOSUPPORT;
@@ -760,13 +708,13 @@ solookup(int family, int type, int protocol, struct sockparams **spp)
*
* We put a hold on the entry early on, so if the
* sockmod is not loaded, and we have to exit
- * splist_lock to call modload(), we know that the
+ * sockconf_lock to call modload(), we know that the
* sockparams entry wont go away. That way we don't
* have to look up the entry once we come back from
* modload().
*/
SOCKPARAMS_INC_REF(sp);
- rw_exit(&splist_lock);
+ rw_exit(&sockconf_lock);
if (sp->sp_smod_info == NULL) {
smod_info_t *smod = smod_lookup_byname(sp->sp_smod_name);
@@ -807,3 +755,73 @@ solookup(int family, int type, int protocol, struct sockparams **spp)
*spp = sp;
return (0);
}
+
+/*
+ * Called when filter entry `ent' is going away. All sockparams remove
+ * their references to `ent'.
+ */
+static void
+sockparams_filter_cleanup_impl(sof_entry_t *ent, list_t *list)
+{
+ struct sockparams *sp;
+ sp_filter_t *fil;
+ list_t *flist;
+
+ ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+ for (sp = list_head(list); sp != NULL;
+ sp = list_next(list, sp)) {
+ flist = (ent->sofe_flags & SOFEF_AUTO) ?
+ &sp->sp_auto_filters : &sp->sp_prog_filters;
+ fil = list_head(flist);
+ for (fil = list_head(flist); fil != NULL;
+ fil = list_next(flist, fil)) {
+ if (fil->spf_filter == ent) {
+ list_remove(flist, fil);
+ kmem_free(fil, sizeof (sp_filter_t));
+ break;
+ }
+ }
+ }
+}
+void
+sockparams_filter_cleanup(sof_entry_t *ent)
+{
+ sockparams_filter_cleanup_impl(ent, &sphead);
+ sockparams_filter_cleanup_impl(ent, &sp_ephem_list);
+}
+
+/*
+ * New filter is being added; walk the list of sockparams to see if
+ * the filter is interested in any of the sockparams.
+ */
+static int
+sockparams_new_filter_impl(sof_entry_t *ent, list_t *list)
+{
+ struct sockparams *sp;
+ int err;
+
+ ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+ for (sp = list_head(list); sp != NULL;
+ sp = list_next(list, sp)) {
+ if ((err = sof_entry_proc_sockparams(ent, sp)) != 0) {
+ sockparams_filter_cleanup(ent);
+ return (err);
+ }
+ }
+ return (0);
+}
+
+int
+sockparams_new_filter(sof_entry_t *ent)
+{
+ int error;
+
+ if ((error = sockparams_new_filter_impl(ent, &sphead)) != 0)
+ return (error);
+
+ if ((error = sockparams_new_filter_impl(ent, &sp_ephem_list)) != 0)
+ sockparams_filter_cleanup_impl(ent, &sphead);
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 2a329da653..06d76044e5 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -72,6 +71,7 @@
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/socktpi_impl.h>
#include <fs/sockfs/sodirect.h>
@@ -97,6 +97,12 @@ struct socklist socklist;
struct kmem_cache *socket_cache;
+/*
+ * sockconf_lock protects the socket configuration (socket types and
+ * socket filters) which is changed via the sockconfig system call.
+ */
+krwlock_t sockconf_lock;
+
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
extern smod_info_t *sotpi_smod_create(void);
@@ -239,6 +245,8 @@ sockinit(int fstype, char *name)
sizeof (struct sonode), 0, sonode_constructor,
sonode_destructor, NULL, NULL, NULL, 0);
+ rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
+
error = socktpi_init();
if (error != 0) {
err_str = NULL;
@@ -288,6 +296,9 @@ sockinit(int fstype, char *name)
nl7c_init();
}
+ /* Initialize socket filters */
+ sof_init();
+
return (0);
failure:
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index 6ce3fac8e6..4b518e632b 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -67,6 +66,7 @@
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#ifdef SOCK_TEST
@@ -75,7 +75,10 @@ int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
#define do_useracc 1
#endif /* SOCK_TEST */
-extern int xnet_truncate_print;
+extern int xnet_truncate_print;
+
+extern void nl7c_init(void);
+extern int sockfs_defer_nl7c_init;
/*
* Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
@@ -1519,143 +1522,291 @@ done2:
return (0);
}
-/*
- * Add config info when name is non-NULL; delete info when name is NULL.
- * name could be a device name or a module name and are user address.
- */
-int
-sockconfig(int family, int type, int protocol, char *name)
+static int
+sockconf_add_sock(int family, int type, int protocol, char *name)
{
- char *kdevpath = NULL; /* Copied in devpath string */
+ int error = 0;
+ char *kdevpath = NULL;
char *kmodule = NULL;
+ char *buf = NULL;
size_t pathlen = 0;
- int error = 0;
-
- dprint(1, ("sockconfig(%d, %d, %d, %p)\n",
- family, type, protocol, (void *)name));
-
- if (secpolicy_net_config(CRED(), B_FALSE) != 0)
- return (set_errno(EPERM));
+ struct sockparams *sp;
+ if (name == NULL)
+ return (EINVAL);
/*
- * By default set the kdevpath and kmodule to NULL to delete an entry.
- * Otherwise when name is not NULL, set the kdevpath or kmodule
- * value to add an entry.
+ * Copyin the name.
+ * This also makes it possible to check for too long pathnames.
+ * Compress the space needed for the name before passing it
+ * to soconfig - soconfig will store the string until
+ * the configuration is removed.
*/
- if (name != NULL) {
+ buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
+ kmem_free(buf, MAXPATHLEN);
+ return (error);
+ }
+ if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
+ /* For device */
+
/*
- * Adding an entry.
- * Copyin the name.
- * This also makes it possible to check for too long pathnames.
- * Compress the space needed for the name before passing it
- * to soconfig - soconfig will store the string until
- * the configuration is removed.
+ * Special handling for NCA:
+ *
+ * DEV_NCA is never opened even if an application
+ * requests for AF_NCA. The device opened is instead a
+ * predefined AF_INET transport (NCA_INET_DEV).
+ *
+ * Prior to Volo (PSARC/2007/587) NCA would determine
+ * the device using a lookup, which worked then because
+ * all protocols were based on TPI. Since TPI is no
+ * longer the default, we have to explicitly state
+ * which device to use.
*/
- char *buf;
- buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
- kmem_free(buf, MAXPATHLEN);
- goto done;
+ if (strcmp(buf, NCA_DEV) == 0) {
+ /* only support entry <28, 2, 0> */
+ if (family != AF_NCA || type != SOCK_STREAM ||
+ protocol != 0) {
+ kmem_free(buf, MAXPATHLEN);
+ return (EINVAL);
+ }
+
+ pathlen = strlen(NCA_INET_DEV) + 1;
+ kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(NCA_INET_DEV, kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ } else {
+ kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(buf, kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
}
- if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
- /* For device */
+ } else {
+ /* For socket module */
+ kmodule = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(buf, kmodule, pathlen);
+ kmodule[pathlen - 1] = '\0';
+ pathlen = 0;
+ }
+ kmem_free(buf, MAXPATHLEN);
- /*
- * Special handling for NCA:
- *
- * DEV_NCA is never opened even if an application
- * requests for AF_NCA. The device opened is instead a
- * predefined AF_INET transport (NCA_INET_DEV).
- *
- * Prior to Volo (PSARC/2007/587) NCA would determine
- * the device using a lookup, which worked then because
- * all protocols were based on TPI. Since TPI is no
- * longer the default, we have to explicitly state
- * which device to use.
- */
- if (strcmp(buf, NCA_DEV) == 0) {
- /* only support entry <28, 2, 0> */
- if (family != AF_NCA || type != SOCK_STREAM ||
- protocol != 0) {
- kmem_free(buf, MAXPATHLEN);
- error = EINVAL;
- goto done;
- }
+ /* sockparams_create frees mod name and devpath upon failure */
+ sp = sockparams_create(family, type, protocol, kmodule,
+ kdevpath, pathlen, 0, KM_SLEEP, &error);
+ if (sp != NULL) {
+ error = sockparams_add(sp);
+ if (error != 0)
+ sockparams_destroy(sp);
+ }
- pathlen = strlen(NCA_INET_DEV) + 1;
- kdevpath = kmem_alloc(pathlen, KM_SLEEP);
- bcopy(NCA_INET_DEV, kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- } else {
- kdevpath = kmem_alloc(pathlen, KM_SLEEP);
- bcopy(buf, kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- }
- } else {
- /* For socket module */
- kmodule = kmem_alloc(pathlen, KM_SLEEP);
- bcopy(buf, kmodule, pathlen);
- kmodule[pathlen - 1] = '\0';
-
- pathlen = 0;
- if (strcmp(kmodule, "tcp") == 0) {
- /* Get the tcp device name for fallback */
- if (family == 2) {
- pathlen = strlen("/dev/tcp") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/tcp", kdevpath,
- pathlen);
- kdevpath[pathlen - 1] = '\0';
- } else {
- ASSERT(family == 26);
- pathlen = strlen("/dev/tcp6") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/tcp6", kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- }
- } else if (strcmp(kmodule, "udp") == 0) {
- /* Get the udp device name for fallback */
- if (family == 2) {
- pathlen = strlen("/dev/udp") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/udp", kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- } else {
- ASSERT(family == 26);
- pathlen = strlen("/dev/udp6") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/udp6", kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- }
- } else if (strcmp(kmodule, "icmp") == 0) {
- /* Get the icmp device name for fallback */
- if (family == 2) {
- pathlen = strlen("/dev/rawip") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/rawip", kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- } else {
- ASSERT(family == 26);
- pathlen = strlen("/dev/rawip6") + 1;
- kdevpath = kmem_alloc(pathlen,
- KM_SLEEP);
- bcopy("/dev/rawip6", kdevpath, pathlen);
- kdevpath[pathlen - 1] = '\0';
- }
+ return (error);
+}
+
+static int
+sockconf_remove_sock(int family, int type, int protocol)
+{
+ return (sockparams_delete(family, type, protocol));
+}
+
+static int
+sockconfig_remove_filter(const char *uname)
+{
+ char kname[SOF_MAXNAMELEN];
+ size_t len;
+ int error;
+ sof_entry_t *ent;
+
+ if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
+ return (error);
+
+ ent = sof_entry_remove_by_name(kname);
+ if (ent == NULL)
+ return (ENXIO);
+
+ mutex_enter(&ent->sofe_lock);
+ ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
+ if (ent->sofe_refcnt == 0) {
+ mutex_exit(&ent->sofe_lock);
+ sof_entry_free(ent);
+ } else {
+ /* let the last socket free the filter */
+ ent->sofe_flags |= SOFEF_CONDEMED;
+ mutex_exit(&ent->sofe_lock);
+ }
+
+ return (0);
+}
+
+static int
+sockconfig_add_filter(const char *uname, void *ufilpropp)
+{
+ struct sockconfig_filter_props filprop;
+ sof_entry_t *ent;
+ int error;
+ size_t tuplesz, len;
+ char hintbuf[SOF_MAXNAMELEN];
+
+ ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
+ mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
+ &len)) != 0) {
+ sof_entry_free(ent);
+ return (error);
+ }
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
+ sof_entry_free(ent);
+ return (EFAULT);
+ }
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ struct sockconfig_filter_props32 filprop32;
+
+ if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
+ sof_entry_free(ent);
+ return (EFAULT);
+ }
+ filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
+ filprop.sfp_autoattach = filprop32.sfp_autoattach;
+ filprop.sfp_hint = filprop32.sfp_hint;
+ filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
+ filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
+ filprop.sfp_socktuple =
+ (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
+ sizeof (ent->sofe_modname), &len)) != 0) {
+ sof_entry_free(ent);
+ return (error);
+ }
+
+ /*
+ * A filter must specify at least one socket tuple.
+ */
+ if (filprop.sfp_socktuple_cnt == 0 ||
+ filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
+ sof_entry_free(ent);
+ return (EINVAL);
+ }
+ ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
+ ent->sofe_hint = filprop.sfp_hint;
+
+ /*
+ * Verify the hint, and copy in the hint argument, if necessary.
+ */
+ switch (ent->sofe_hint) {
+ case SOF_HINT_BEFORE:
+ case SOF_HINT_AFTER:
+ if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
+ sizeof (hintbuf), &len)) != 0) {
+ sof_entry_free(ent);
+ return (error);
+ }
+ ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
+ bcopy(hintbuf, ent->sofe_hintarg, len);
+ /* FALLTHRU */
+ case SOF_HINT_TOP:
+ case SOF_HINT_BOTTOM:
+ /* hints cannot be used with programmatic filters */
+ if (ent->sofe_flags & SOFEF_PROG) {
+ sof_entry_free(ent);
+ return (EINVAL);
+ }
+ break;
+ case SOF_HINT_NONE:
+ break;
+ default:
+ /* bad hint value */
+ sof_entry_free(ent);
+ return (EINVAL);
+ }
+
+ ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
+ tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
+ ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
+ tuplesz)) {
+ sof_entry_free(ent);
+ return (EFAULT);
+ }
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ int i;
+ caddr_t data = (caddr_t)filprop.sfp_socktuple;
+ sof_socktuple_t *tup = ent->sofe_socktuple;
+ sof_socktuple32_t tup32;
+
+ tup = ent->sofe_socktuple;
+ for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
+ ASSERT(tup < ent->sofe_socktuple + tuplesz);
+
+ if (copyin(data, &tup32, sizeof (tup32)) != 0) {
+ sof_entry_free(ent);
+ return (EFAULT);
}
+ tup->sofst_family = tup32.sofst_family;
+ tup->sofst_type = tup32.sofst_type;
+ tup->sofst_protocol = tup32.sofst_protocol;
+
+ data += sizeof (tup32);
}
+ }
+#endif /* _SYSCALL32_IMPL */
- kmem_free(buf, MAXPATHLEN);
+ /* Sockets can start using the filter as soon as the filter is added */
+ if ((error = sof_entry_add(ent)) != 0)
+ sof_entry_free(ent);
+
+ return (error);
+}
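From userland, this path is normally driven by soconfig(1M). The sketch below shows how the properties structure consumed here might be filled in for a filter that auto-attaches to IPv4/IPv6 TCP sockets; the libc wrapper name used to issue the private sockconfig system call, and the header locations, are hypothetical, while the field and constant names are the ones copied in above.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockfilter.h>
#include <netinet/in.h>

extern int _demo_sockconfig(int, ...);	/* assumed private syscall wrapper */

static int
demo_add_filter(void)
{
	static sof_socktuple_t tuples[] = {
		{ .sofst_family = AF_INET,  .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
		{ .sofst_family = AF_INET6, .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
	};
	struct sockconfig_filter_props props;

	props.sfp_modname = "demo_filt";	/* kernel module with the sof_ops */
	props.sfp_autoattach = B_TRUE;		/* becomes SOFEF_AUTO */
	props.sfp_hint = SOF_HINT_TOP;		/* placement hint (AUTO only) */
	props.sfp_hintarg = NULL;
	props.sfp_socktuple_cnt = 2;
	props.sfp_socktuple = tuples;

	return (_demo_sockconfig(SOCKCONFIG_ADD_FILTER, "demo_filt", &props));
}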
+
+/*
+ * Socket configuration system call. It is used to add and remove
+ * socket types and socket filters.
+ */
+int
+sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
+{
+ int error = 0;
+
+ if (secpolicy_net_config(CRED(), B_FALSE) != 0)
+ return (set_errno(EPERM));
+
+ if (sockfs_defer_nl7c_init) {
+ nl7c_init();
+ sockfs_defer_nl7c_init = 0;
}
- error = soconfig(family, type, protocol, kdevpath, (int)pathlen,
- kmodule);
-done:
- if (error) {
+
+ switch (cmd) {
+ case SOCKCONFIG_ADD_SOCK:
+ error = sockconf_add_sock((int)(uintptr_t)arg1,
+ (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
+ break;
+ case SOCKCONFIG_REMOVE_SOCK:
+ error = sockconf_remove_sock((int)(uintptr_t)arg1,
+ (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
+ break;
+ case SOCKCONFIG_ADD_FILTER:
+ error = sockconfig_add_filter((const char *)arg1, arg2);
+ break;
+ case SOCKCONFIG_REMOVE_FILTER:
+ error = sockconfig_remove_filter((const char *)arg1);
+ break;
+ default:
+#ifdef DEBUG
+ cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
+#endif
+ error = EINVAL;
+ break;
+ }
+
+ if (error != 0) {
eprintline(error);
return (set_errno(error));
}
@@ -1943,9 +2094,15 @@ snf_async_read(snf_req_t *sr)
* For sockets acting as an SSL proxy, we
* need to adjust the size to the maximum
* SSL record size set in the stream head.
+ *
+ * Socket filters can limit the mblk size,
+ * so limit reads to maxblk if there are
+ * filters present.
*/
- if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
- SOTOTPI(so)->sti_kssl_ctx != NULL)
+ if (vp->v_type == VSOCK &&
+ (!SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+ (so->so_filter_active > 0 && maxblk != INFPSZ))
iosize = (int)MIN(iosize, maxblk);
if (is_system_labeled()) {
@@ -2550,9 +2707,14 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
* For sockets acting as an SSL proxy, we
* need to adjust the size to the maximum
* SSL record size set in the stream head.
+ *
+ * Socket filters can limit the mblk size,
+ * so limit reads to maxblk if there are
+ * filters present.
*/
- if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
- SOTOTPI(so)->sti_kssl_ctx != NULL)
+ if (vp->v_type == VSOCK &&
+ (!SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+ so->so_filter_active > 0 && maxblk != INFPSZ)
iosize = (int)MIN(iosize, maxblk);
if (is_system_labeled()) {
@@ -2804,7 +2966,7 @@ solisten(struct sonode *so, int backlog)
}
int
-soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen,
+soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
int fflag, int flags)
{
return (socket_connect(so, name, namelen, fflag, flags, CRED()));
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index de0293e710..a4f9f90a4a 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -214,7 +213,7 @@ static int sotpi_accept(struct sonode *, int, struct cred *,
static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
int, struct cred *);
static int sotpi_listen(struct sonode *, int, struct cred *);
-static int sotpi_connect(struct sonode *, const struct sockaddr *,
+static int sotpi_connect(struct sonode *, struct sockaddr *,
socklen_t, int, int, struct cred *);
extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
struct uio *, struct cred *);
@@ -2231,7 +2230,7 @@ e_bad:
*/
int
sotpi_connect(struct sonode *so,
- const struct sockaddr *name,
+ struct sockaddr *name,
socklen_t namelen,
int fflag,
int flags,
@@ -6484,23 +6483,6 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
*direct = B_TRUE;
/*
- * When it comes to urgent data we have two cases to deal with;
- * (1) The oob byte has already arrived, or (2) the protocol has
- * notified that oob data is pending, but it has not yet arrived.
- *
- * For (1) all we need to do is send a T_EXDATA_IND to indicate were
- * in the byte stream the oob byte is. For (2) we have to send a
- * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
- * the oob byte will be the next byte from the protocol.
- *
- * So in the worst case we need two mblks, one for the signal, another
- * for mark indication. In that case we use the exdata_mp for the sig.
- */
- sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
- STR_NOSIG, NULL);
- sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
-
- /*
* Keep the original sp around so we can properly dispose of the
* sonode when the socket is being closed.
*/
@@ -6560,16 +6542,6 @@ sotpi_revert_sonode(struct sonode *so, struct cred *cr)
ASSERT(!SOCK_IS_NONSTR(so));
ASSERT(vp->v_stream != NULL);
- if (SOTOTPI(so)->sti_exdata_mp != NULL) {
- freeb(SOTOTPI(so)->sti_exdata_mp);
- SOTOTPI(so)->sti_exdata_mp = NULL;
- }
-
- if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
- freeb(SOTOTPI(so)->sti_urgmark_mp);
- SOTOTPI(so)->sti_urgmark_mp = NULL;
- }
-
strclean(vp);
(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
@@ -6677,9 +6649,6 @@ i_sotpi_info_constructor(sotpi_info_t *sti)
sti->sti_nl7c_uri = NULL;
sti->sti_nl7c_rcv_mp = NULL;
- sti->sti_exdata_mp = NULL;
- sti->sti_urgmark_mp = NULL;
-
mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
@@ -6705,9 +6674,6 @@ i_sotpi_info_destructor(sotpi_info_t *sti)
ASSERT(sti->sti_nl7c_uri == NULL);
ASSERT(sti->sti_nl7c_rcv_mp == NULL);
- ASSERT(sti->sti_exdata_mp == NULL);
- ASSERT(sti->sti_urgmark_mp == NULL);
-
mutex_destroy(&sti->sti_plumb_lock);
cv_destroy(&sti->sti_ack_cv);
}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h
index c8dc101bdd..8044973377 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.h
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SOCKFS_SOCKTPI_H
@@ -251,12 +250,6 @@ typedef struct sotpi_info {
kssl_endpt_type_t sti_kssl_type; /* is proxy/is proxied/none */
kssl_ent_t sti_kssl_ent; /* SSL config entry */
kssl_ctx_t sti_kssl_ctx; /* SSL session context */
-
- /*
- * The mblks below are only allocated and used during fallback.
- */
- mblk_t *sti_exdata_mp; /* T_EXDATA_IND or SIGURG */
- mblk_t *sti_urgmark_mp; /* mark indication */
} sotpi_info_t;
struct T_capability_ack;
diff --git a/usr/src/uts/common/fs/sockfs/sodirect.c b/usr/src/uts/common/fs/sockfs/sodirect.c
index e64fca9de6..f30681fdc7 100644
--- a/usr/src/uts/common/fs/sockfs/sodirect.c
+++ b/usr/src/uts/common/fs/sockfs/sodirect.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -78,7 +77,7 @@ sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
if (uiop->uio_resid >= uioasync.mincnt &&
sodp != NULL && sodp->sod_enabled &&
uioasync.enabled && !(flags & MSG_PEEK) &&
- !so->so_proto_props.sopp_loopback &&
+ !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 &&
!(so->so_state & SS_CANTRCVMORE)) {
/*
* Big enough I/O for uioa min setup and an sodirect socket
diff --git a/usr/src/uts/common/inet/inetddi.c b/usr/src/uts/common/inet/inetddi.c
index 6b0cd5839a..a64bf7e978 100644
--- a/usr/src/uts/common/inet/inetddi.c
+++ b/usr/src/uts/common/inet/inetddi.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -88,6 +87,12 @@ INET_SOCKDESC must be defined!
#elif defined(INET_SOCKDESC) && !defined(INET_SOCK_PROTO_CREATE_FUNC)
#error inetddi.c: INET_SOCKDESC is defined but INET_SOCK_PROTO_CREATE_FUNC \
is not!
+#elif defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V4)
+#error inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \
+INET_SOCK_FALLBACK_DEV_V4 is not!
+#elif defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V6)
+#error inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \
+INET_SOCK_FALLBACK_DEV_V6 is not!
#endif
#ifdef INET_DEVDESC
@@ -216,7 +221,9 @@ static struct modlstrmod modlstrmod = {
static __smod_priv_t smodpriv = {
NULL,
NULL,
- INET_SOCK_PROTO_FB_FUNC
+ INET_SOCK_PROTO_FB_FUNC,
+ INET_SOCK_FALLBACK_DEV_V4,
+ INET_SOCK_FALLBACK_DEV_V6
};
#endif /* INET_SOCK_PROTO_FB_FUNC */
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 7c5ae628be..d67bf624dd 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -5215,7 +5215,8 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
- boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+ boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
+ sock_quiesce_arg_t *arg)
{
conn_t *connp = (conn_t *)proto_handle;
icmp_t *icmp;
@@ -5224,7 +5225,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
socklen_t laddrlen, faddrlen;
short opts;
struct stroptions *stropt;
- mblk_t *stropt_mp;
+ mblk_t *mp, *stropt_mp;
int error;
icmp = connp->conn_icmp;
@@ -5276,7 +5277,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
- (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+ mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
(struct sockaddr *)&laddr, laddrlen,
(struct sockaddr *)&faddr, faddrlen, opts);
@@ -5285,9 +5286,11 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
* queued in icmp_t. Now we push up any queued packets.
*/
mutex_enter(&icmp->icmp_recv_lock);
+ if (mp != NULL) {
+ mp->b_next = icmp->icmp_fallback_queue_head;
+ icmp->icmp_fallback_queue_head = mp;
+ }
while (icmp->icmp_fallback_queue_head != NULL) {
- mblk_t *mp;
-
mp = icmp->icmp_fallback_queue_head;
icmp->icmp_fallback_queue_head = mp->b_next;
mp->b_next = NULL;
diff --git a/usr/src/uts/common/inet/ip/icmpddi.c b/usr/src/uts/common/inet/ip/icmpddi.c
index dd0023c0c8..0caa9c7f6c 100644
--- a/usr/src/uts/common/inet/ip/icmpddi.c
+++ b/usr/src/uts/common/inet/ip/icmpddi.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -40,8 +39,10 @@
#define INET_DEVSTRTAB icmpinfov4
#define INET_MODSTRTAB dummymodinfo
#define INET_SOCKDESC "Rawip socket module"
-#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create)
-#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback)
+#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create)
+#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback)
+#define INET_SOCK_FALLBACK_DEV_V4 "/dev/icmp"
+#define INET_SOCK_FALLBACK_DEV_V6 "/dev/icmp6"
#define INET_DEVMTFLAGS D_MP
#define INET_MODMTFLAGS D_MP
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 71b4f3f228..6fb72d1d08 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -107,7 +106,7 @@ extern void icmp_ddi_g_destroy(void);
extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **,
uint_t *, int *, int, cred_t *);
extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
- so_proto_quiesced_cb_t);
+ so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
extern sock_downcalls_t sock_rawip_downcalls;
diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c
index 0f277be716..871e9f71e5 100644
--- a/usr/src/uts/common/inet/sockmods/socksctp.c
+++ b/usr/src/uts/common/inet/sockmods/socksctp.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -63,7 +62,7 @@ static int sosctp_accept(struct sonode *, int, struct cred *, struct sonode **);
static int sosctp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
struct cred *);
static int sosctp_listen(struct sonode *, int, struct cred *);
-static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t,
+static int sosctp_connect(struct sonode *, struct sockaddr *, socklen_t,
int, int, struct cred *);
static int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
struct cred *);
@@ -86,7 +85,7 @@ void sosctp_fini(struct sonode *, struct cred *);
/*
* SCTP sockfs sonode operations, 1-N socket
*/
-static int sosctp_seq_connect(struct sonode *, const struct sockaddr *,
+static int sosctp_seq_connect(struct sonode *, struct sockaddr *,
socklen_t, int, int, struct cred *);
static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
struct cred *);
@@ -352,7 +351,7 @@ done:
*/
/*ARGSUSED*/
static int
-sosctp_connect(struct sonode *so, const struct sockaddr *name,
+sosctp_connect(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int fflag, int flags, struct cred *cr)
{
int error = 0;
@@ -433,7 +432,7 @@ done:
* make it so.
*/
static int
-sosctp_seq_connect(struct sonode *so, const struct sockaddr *name,
+sosctp_seq_connect(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int fflag, int flags, struct cred *cr)
{
struct sctp_soassoc *ssa;
diff --git a/usr/src/uts/common/inet/sockmods/socksdp.c b/usr/src/uts/common/inet/sockmods/socksdp.c
index 3ec9ff5cfb..8841bce55c 100644
--- a/usr/src/uts/common/inet/sockmods/socksdp.c
+++ b/usr/src/uts/common/inet/sockmods/socksdp.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -63,7 +62,7 @@ static int sosdp_accept(struct sonode *, int, struct cred *, struct sonode **);
static int sosdp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
struct cred *);
static int sosdp_listen(struct sonode *, int, struct cred *);
-static int sosdp_connect(struct sonode *, const struct sockaddr *, socklen_t,
+static int sosdp_connect(struct sonode *, struct sockaddr *, socklen_t,
int, int, struct cred *);
static int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
struct cred *);
@@ -325,7 +324,7 @@ done:
*/
/*ARGSUSED*/
static int
-sosdp_connect(struct sonode *so, const struct sockaddr *name,
+sosdp_connect(struct sonode *so, struct sockaddr *name,
socklen_t namelen, int fflag, int flags, struct cred *cr)
{
int error = 0;
@@ -1120,7 +1119,7 @@ sosdp_poll(struct sonode *so, short events, int anyyet, short *reventsp,
*reventsp |= (POLLIN|POLLRDNORM) & events;
}
- if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_head != NULL)) {
+ if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_len > 0)) {
*reventsp |= (POLLIN|POLLRDNORM) & events;
}
@@ -1158,7 +1157,7 @@ sosdp_close(struct sonode *so, int flag, struct cred *cr)
mutex_enter(&so->so_lock);
so_unlock_single(so, SOLOCKED);
- so_notify_disconnected(so, error);
+ so_notify_disconnected(so, B_FALSE, error);
return (error);
}
@@ -1266,7 +1265,7 @@ sdp_sock_disconnected(void *handle, int error)
ASSERT(so->so_proto_handle != NULL); /* closed conn */
soisdisconnected(so, error);
- so_notify_disconnected(so, error);
+ so_notify_disconnected(so, B_FALSE, error);
}
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 441722acd4..40e78141c9 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -952,6 +952,18 @@ tcp_clean_death(tcp_t *tcp, int err)
}
}
+ /*
+ * ESTABLISHED non-STREAMS eagers are not 'detached' because
+	 * an upper handle is obtained when the SYN-ACK comes in. Such an
+	 * eager should receive the 'disconnected' upcall, but tcp_reinit
+	 * should not be called on an eager.
+ */
+ if (tcp->tcp_listener != NULL && IPCL_IS_NONSTR(connp)) {
+ tcp_closei_local(tcp);
+ tcp->tcp_state = TCPS_BOUND;
+ return (0);
+ }
+
tcp_reinit(tcp);
if (IPCL_IS_NONSTR(connp))
(void) tcp_do_unbind(connp);
@@ -1014,15 +1026,23 @@ tcp_stop_lingering(tcp_t *tcp)
CONN_DEC_REF(connp);
}
finish:
- /* Signal closing thread that it can complete close */
- mutex_enter(&tcp->tcp_closelock);
tcp->tcp_detached = B_TRUE;
connp->conn_rq = NULL;
connp->conn_wq = NULL;
+ /* Signal closing thread that it can complete close */
+ mutex_enter(&tcp->tcp_closelock);
tcp->tcp_closed = 1;
cv_signal(&tcp->tcp_closecv);
mutex_exit(&tcp->tcp_closelock);
+
+ /* If we have an upper handle (socket), release it */
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(connp->conn_upper_handle != NULL);
+ (*connp->conn_upcalls->su_closed)(connp->conn_upper_handle);
+ connp->conn_upper_handle = NULL;
+ connp->conn_upcalls = NULL;
+ }
}
void
@@ -1088,6 +1108,15 @@ tcp_close_common(conn_t *connp, int flags)
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
+ /*
+ * For non-STREAMS sockets, the normal case is that the conn makes
+ * an upcall when it's finally closed, so there is no need to wait
+ * in the protocol. But in case of SO_LINGER the thread sleeps here
+ * so it can properly deal with the thread being interrupted.
+ */
+ if (IPCL_IS_NONSTR(connp) && connp->conn_linger == 0)
+ goto nowait;
+
mutex_enter(&tcp->tcp_closelock);
while (!tcp->tcp_closed) {
if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
@@ -1129,8 +1158,12 @@ tcp_close_common(conn_t *connp, int flags)
* conn_wq of the eagers point to our queues. By waiting for the
* refcnt to drop to 1, we are sure that the eagers have cleaned
* up their queue pointers and also dropped their references to us.
+ *
+ * For non-STREAMS sockets we do not have to wait here; the
+ * listener will instead make a su_closed upcall when the last
+ * reference is dropped.
*/
- if (tcp->tcp_wait_for_eagers) {
+ if (tcp->tcp_wait_for_eagers && !IPCL_IS_NONSTR(connp)) {
mutex_enter(&connp->conn_lock);
while (connp->conn_ref != 1) {
cv_wait(&connp->conn_cv, &connp->conn_lock);
@@ -1138,6 +1171,7 @@ tcp_close_common(conn_t *connp, int flags)
mutex_exit(&connp->conn_lock);
}
+nowait:
connp->conn_cpid = NOPID;
}
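
The tcp_close_common() change above lets a non-STREAMS close skip the wait (the new 'nowait' label) and rely on the su_closed upcall, except when SO_LINGER is set, in which case the closing thread still sleeps in the protocol so an interrupt can be handled. A user-level sketch of the linger behavior being preserved, not part of the patch, assuming POSIX sockets; it compiles as part of a larger program:

	#include <netinet/in.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/*
	 * With l_onoff set, close() blocks for up to l_linger seconds while
	 * unsent data drains; without it, close() returns immediately and
	 * the protocol finishes the shutdown asynchronously (the case the
	 * patch short-circuits via the 'nowait' label).
	 */
	static int
	close_with_linger(int fd, int seconds)
	{
		struct linger lg;

		lg.l_onoff = 1;
		lg.l_linger = seconds;
		if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof (lg)) != 0)
			return (-1);
		return (close(fd));
	}
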
@@ -1410,6 +1444,22 @@ tcp_free(tcp_t *tcp)
* the following code is enough.
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+
+ /*
+ * If this is a non-STREAM socket still holding on to an upper
+ * handle, release it. As a result of fallback we might also see
+ * STREAMS based conns with upper handles, in which case there is
+ * nothing to do other than clearing the field.
+ */
+ if (connp->conn_upper_handle != NULL) {
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_closed)(
+ connp->conn_upper_handle);
+ tcp->tcp_detached = B_TRUE;
+ }
+ connp->conn_upper_handle = NULL;
+ connp->conn_upcalls = NULL;
+ }
}
/*
@@ -3092,103 +3142,19 @@ tcp_do_unbind(conn_t *connp)
}
/*
- * This runs at the tail end of accept processing on the squeue of the
- * new connection.
+ * Collect protocol properties to send to the upper handle.
*/
-/* ARGSUSED */
void
-tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+tcp_get_proto_props(tcp_t *tcp, struct sock_proto_props *sopp)
{
- conn_t *connp = (conn_t *)arg;
- tcp_t *tcp = connp->conn_tcp;
- queue_t *q = connp->conn_rq;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- /* socket options */
- struct sock_proto_props sopp;
-
- /* We should just receive a single mblk that fits a T_discon_ind */
- ASSERT(mp->b_cont == NULL);
-
- /*
- * Drop the eager's ref on the listener, that was placed when
- * this eager began life in tcp_input_listener.
- */
- CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
- if (IPCL_IS_NONSTR(connp)) {
- /* Safe to free conn_ind message */
- freemsg(tcp->tcp_conn.tcp_eager_conn_ind);
- tcp->tcp_conn.tcp_eager_conn_ind = NULL;
- }
-
- tcp->tcp_detached = B_FALSE;
+ conn_t *connp = tcp->tcp_connp;
- if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
- /*
- * Someone blewoff the eager before we could finish
- * the accept.
- *
- * The only reason eager exists it because we put in
- * a ref on it when conn ind went up. We need to send
- * a disconnect indication up while the last reference
- * on the eager will be dropped by the squeue when we
- * return.
- */
- ASSERT(tcp->tcp_listener == NULL);
- if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
- if (IPCL_IS_NONSTR(connp)) {
- ASSERT(tcp->tcp_issocket);
- (*connp->conn_upcalls->su_disconnected)(
- connp->conn_upper_handle, tcp->tcp_connid,
- ECONNREFUSED);
- freemsg(mp);
- } else {
- struct T_discon_ind *tdi;
-
- (void) putnextctl1(q, M_FLUSH, FLUSHRW);
- /*
- * Let us reuse the incoming mblk to avoid
- * memory allocation failure problems. We know
- * that the size of the incoming mblk i.e.
- * stroptions is greater than sizeof
- * T_discon_ind.
- */
- ASSERT(DB_REF(mp) == 1);
- ASSERT(MBLKSIZE(mp) >=
- sizeof (struct T_discon_ind));
-
- DB_TYPE(mp) = M_PROTO;
- ((union T_primitives *)mp->b_rptr)->type =
- T_DISCON_IND;
- tdi = (struct T_discon_ind *)mp->b_rptr;
- if (tcp->tcp_issocket) {
- tdi->DISCON_reason = ECONNREFUSED;
- tdi->SEQ_number = 0;
- } else {
- tdi->DISCON_reason = ENOPROTOOPT;
- tdi->SEQ_number =
- tcp->tcp_conn_req_seqnum;
- }
- mp->b_wptr = mp->b_rptr +
- sizeof (struct T_discon_ind);
- putnext(q, mp);
- }
- }
- tcp->tcp_hard_binding = B_FALSE;
- return;
- }
-
- /*
- * This is the first time we run on the correct
- * queue after tcp_accept. So fix all the q parameters
- * here.
- */
- sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
- sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+ sopp->sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
+ sopp->sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
- sopp.sopp_rxhiwat = tcp->tcp_fused ?
+ sopp->sopp_rxhiwat = tcp->tcp_fused ?
tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) :
connp->conn_rcvbuf;
-
/*
* Determine what write offset value to use depending on SACK and
* whether the endpoint is fused or not.
@@ -3203,18 +3169,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
* since it would reduce the amount of work done by kmem.
* Non-fused tcp loopback case is handled separately below.
*/
- sopp.sopp_wroff = 0;
+ sopp->sopp_wroff = 0;
/*
* Update the peer's transmit parameters according to
* our recently calculated high water mark value.
*/
(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
} else if (tcp->tcp_snd_sack_ok) {
- sopp.sopp_wroff = connp->conn_ht_iphc_allocated +
- (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
+ sopp->sopp_wroff = connp->conn_ht_iphc_allocated +
+ (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra);
} else {
- sopp.sopp_wroff = connp->conn_ht_iphc_len +
- (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
+ sopp->sopp_wroff = connp->conn_ht_iphc_len +
+ (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra);
}
/*
@@ -3239,297 +3205,10 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN;
}
-
- /* Send the options up */
- if (IPCL_IS_NONSTR(connp)) {
- if (sopp.sopp_flags & SOCKOPT_TAIL) {
- ASSERT(tcp->tcp_kssl_ctx != NULL);
- ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY);
- }
- if (tcp->tcp_loopback) {
- sopp.sopp_flags |= SOCKOPT_LOOPBACK;
- sopp.sopp_loopback = B_TRUE;
- }
- (*connp->conn_upcalls->su_set_proto_props)
- (connp->conn_upper_handle, &sopp);
- freemsg(mp);
- } else {
- /*
- * Let us reuse the incoming mblk to avoid
- * memory allocation failure problems. We know
- * that the size of the incoming mblk is at least
- * stroptions
- */
- struct stroptions *stropt;
-
- ASSERT(DB_REF(mp) == 1);
- ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
-
- DB_TYPE(mp) = M_SETOPTS;
- stropt = (struct stroptions *)mp->b_rptr;
- mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
- stropt = (struct stroptions *)mp->b_rptr;
- stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
- stropt->so_hiwat = sopp.sopp_rxhiwat;
- stropt->so_wroff = sopp.sopp_wroff;
- stropt->so_maxblk = sopp.sopp_maxblk;
-
- if (sopp.sopp_flags & SOCKOPT_TAIL) {
- ASSERT(tcp->tcp_kssl_ctx != NULL);
-
- stropt->so_flags |= SO_TAIL | SO_COPYOPT;
- stropt->so_tail = sopp.sopp_tail;
- stropt->so_copyopt = sopp.sopp_zcopyflag;
- }
-
- /* Send the options up */
- putnext(q, mp);
- }
-
- /*
- * Pass up any data and/or a fin that has been received.
- *
- * Adjust receive window in case it had decreased
- * (because there is data <=> tcp_rcv_list != NULL)
- * while the connection was detached. Note that
- * in case the eager was flow-controlled, w/o this
- * code, the rwnd may never open up again!
- */
- if (tcp->tcp_rcv_list != NULL) {
- if (IPCL_IS_NONSTR(connp)) {
- mblk_t *mp;
- int space_left;
- int error;
- boolean_t push = B_TRUE;
-
- if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, NULL, 0, 0, &error,
- &push) >= 0) {
- tcp->tcp_rwnd = connp->conn_rcvbuf;
- if (tcp->tcp_state >= TCPS_ESTABLISHED &&
- tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
- tcp_xmit_ctl(NULL,
- tcp, (tcp->tcp_swnd == 0) ?
- tcp->tcp_suna : tcp->tcp_snxt,
- tcp->tcp_rnxt, TH_ACK);
- }
- }
- while ((mp = tcp->tcp_rcv_list) != NULL) {
- push = B_TRUE;
- tcp->tcp_rcv_list = mp->b_next;
- mp->b_next = NULL;
- space_left = (*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, mp, msgdsize(mp),
- 0, &error, &push);
- if (space_left < 0) {
- /*
- * We should never be in middle of a
- * fallback, the squeue guarantees that.
- */
- ASSERT(error != EOPNOTSUPP);
- }
- }
- tcp->tcp_rcv_last_head = NULL;
- tcp->tcp_rcv_last_tail = NULL;
- tcp->tcp_rcv_cnt = 0;
- } else {
- /* We drain directly in case of fused tcp loopback */
-
- if (!tcp->tcp_fused && canputnext(q)) {
- tcp->tcp_rwnd = connp->conn_rcvbuf;
- if (tcp->tcp_state >= TCPS_ESTABLISHED &&
- tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
- tcp_xmit_ctl(NULL,
- tcp, (tcp->tcp_swnd == 0) ?
- tcp->tcp_suna : tcp->tcp_snxt,
- tcp->tcp_rnxt, TH_ACK);
- }
- }
-
- (void) tcp_rcv_drain(tcp);
- }
-
- /*
- * For fused tcp loopback, back-enable peer endpoint
- * if it's currently flow-controlled.
- */
- if (tcp->tcp_fused) {
- tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-
- ASSERT(peer_tcp != NULL);
- ASSERT(peer_tcp->tcp_fused);
-
- mutex_enter(&peer_tcp->tcp_non_sq_lock);
- if (peer_tcp->tcp_flow_stopped) {
- tcp_clrqfull(peer_tcp);
- TCP_STAT(tcps, tcp_fusion_backenabled);
- }
- mutex_exit(&peer_tcp->tcp_non_sq_lock);
- }
- }
- ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
- if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
- tcp->tcp_ordrel_done = B_TRUE;
- if (IPCL_IS_NONSTR(connp)) {
- ASSERT(tcp->tcp_ordrel_mp == NULL);
- (*connp->conn_upcalls->su_opctl)(
- connp->conn_upper_handle,
- SOCK_OPCTL_SHUT_RECV, 0);
- } else {
- mp = tcp->tcp_ordrel_mp;
- tcp->tcp_ordrel_mp = NULL;
- putnext(q, mp);
- }
- }
- tcp->tcp_hard_binding = B_FALSE;
-
- if (connp->conn_keepalive) {
- tcp->tcp_ka_last_intrvl = 0;
- tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
- tcp->tcp_ka_interval);
- }
-
- /*
- * At this point, eager is fully established and will
- * have the following references -
- *
- * 2 references for connection to exist (1 for TCP and 1 for IP).
- * 1 reference for the squeue which will be dropped by the squeue as
- * soon as this function returns.
- * There will be 1 additonal reference for being in classifier
- * hash list provided something bad hasn't happened.
- */
- ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
- (connp->conn_fanout == NULL && connp->conn_ref >= 3));
-}
-
-/*
- * Common to TPI and sockfs accept code.
- */
-/* ARGSUSED2 */
-int
-tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr)
-{
- tcp_t *listener, *eager;
- mblk_t *discon_mp;
-
- listener = lconnp->conn_tcp;
- ASSERT(listener->tcp_state == TCPS_LISTEN);
- eager = econnp->conn_tcp;
- ASSERT(eager->tcp_listener != NULL);
-
- /*
- * Pre allocate the discon_ind mblk also. tcp_accept_finish will
- * use it if something failed.
- */
- discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
- sizeof (struct stroptions)), BPRI_HI);
-
- if (discon_mp == NULL) {
- return (-TPROTO);
- }
- eager->tcp_issocket = B_TRUE;
-
- econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
- econnp->conn_allzones = listener->tcp_connp->conn_allzones;
- ASSERT(econnp->conn_netstack ==
- listener->tcp_connp->conn_netstack);
- ASSERT(eager->tcp_tcps == listener->tcp_tcps);
-
- /* Put the ref for IP */
- CONN_INC_REF(econnp);
-
- /*
- * We should have minimum of 3 references on the conn
- * at this point. One each for TCP and IP and one for
- * the T_conn_ind that was sent up when the 3-way handshake
- * completed. In the normal case we would also have another
- * reference (making a total of 4) for the conn being in the
- * classifier hash list. However the eager could have received
- * an RST subsequently and tcp_closei_local could have removed
- * the eager from the classifier hash list, hence we can't
- * assert that reference.
- */
- ASSERT(econnp->conn_ref >= 3);
-
- mutex_enter(&listener->tcp_eager_lock);
- if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
-
- tcp_t *tail;
- tcp_t *tcp;
- mblk_t *mp1;
-
- tcp = listener->tcp_eager_prev_q0;
- /*
- * listener->tcp_eager_prev_q0 points to the TAIL of the
- * deferred T_conn_ind queue. We need to get to the head
- * of the queue in order to send up T_conn_ind the same
- * order as how the 3WHS is completed.
- */
- while (tcp != listener) {
- if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
- !tcp->tcp_kssl_pending)
- break;
- else
- tcp = tcp->tcp_eager_prev_q0;
- }
- /* None of the pending eagers can be sent up now */
- if (tcp == listener)
- goto no_more_eagers;
-
- mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
- tcp->tcp_conn.tcp_eager_conn_ind = NULL;
- /* Move from q0 to q */
- ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
- listener->tcp_conn_req_cnt_q0--;
- listener->tcp_conn_req_cnt_q++;
- tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
- tcp->tcp_eager_prev_q0;
- tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
- tcp->tcp_eager_next_q0;
- tcp->tcp_eager_prev_q0 = NULL;
- tcp->tcp_eager_next_q0 = NULL;
- tcp->tcp_conn_def_q0 = B_FALSE;
-
- /* Make sure the tcp isn't in the list of droppables */
- ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
- tcp->tcp_eager_prev_drop_q0 == NULL);
-
- /*
- * Insert at end of the queue because sockfs sends
- * down T_CONN_RES in chronological order. Leaving
- * the older conn indications at front of the queue
- * helps reducing search time.
- */
- tail = listener->tcp_eager_last_q;
- if (tail != NULL) {
- tail->tcp_eager_next_q = tcp;
- } else {
- listener->tcp_eager_next_q = tcp;
- }
- listener->tcp_eager_last_q = tcp;
- tcp->tcp_eager_next_q = NULL;
-
- /* Need to get inside the listener perimeter */
- CONN_INC_REF(listener->tcp_connp);
- SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
- tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL,
- SQTAG_TCP_SEND_PENDING);
+ if (tcp->tcp_loopback) {
+ sopp->sopp_flags |= SOCKOPT_LOOPBACK;
+ sopp->sopp_loopback = B_TRUE;
}
-no_more_eagers:
- tcp_eager_unlink(eager);
- mutex_exit(&listener->tcp_eager_lock);
-
- /*
- * At this point, the eager is detached from the listener
- * but we still have an extra refs on eager (apart from the
- * usual tcp references). The ref was placed in tcp_input_data
- * before sending the conn_ind in tcp_send_conn_ind.
- * The ref will be dropped in tcp_accept_finish().
- */
- SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
- econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
- return (0);
}
/*
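
tcp_accept_finish() is removed from tcp.c here and the property computation it used to do inline now lives in tcp_get_proto_props() above, so the same code can serve both the TPI accept path and the non-STREAMS su_newconn notification. The resulting collect-then-push shape, as a simplified, self-contained sketch; the struct and upcall table below are hypothetical stand-ins for sock_proto_props and conn_upcalls, not the kernel definitions:

	#include <stdio.h>

	/* Hypothetical, simplified stand-ins for the kernel structures. */
	struct proto_props {
		int	rxhiwat;	/* receive high-water mark */
		int	wroff;		/* write offset for headers */
		int	maxblk;		/* maximum block size */
	};

	struct upcalls {
		void	(*set_proto_props)(void *handle, const struct proto_props *);
	};

	static void
	sock_set_props(void *handle, const struct proto_props *pp)
	{
		(void) handle;
		(void) printf("rxhiwat=%d wroff=%d maxblk=%d\n",
		    pp->rxhiwat, pp->wroff, pp->maxblk);
	}

	/* Analogue of tcp_get_proto_props(): collect the values, don't send them. */
	static void
	get_proto_props(struct proto_props *pp)
	{
		pp->rxhiwat = 128 * 1024;
		pp->wroff = 120;
		pp->maxblk = 1460;
	}

	int
	main(void)
	{
		struct upcalls uc = { sock_set_props };
		struct proto_props pp;
		void *upper_handle = NULL;

		/* Collect the properties... */
		get_proto_props(&pp);
		/* ...then hand them to the socket layer in a single upcall. */
		uc.set_proto_props(upper_handle, &pp);
		return (0);
	}
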
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index c8f50cee8f..81640b8329 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -233,8 +233,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha)
mp->b_wptr += sizeof (*stropt);
stropt = (struct stroptions *)mp->b_rptr;
- stropt->so_flags = SO_WROFF;
+ stropt->so_flags = SO_WROFF | SO_MAXBLK;
stropt->so_wroff = 0;
+ stropt->so_maxblk = INFPSZ;
/* Send the options up */
putnext(peer_rq, mp);
@@ -244,8 +245,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha)
/* The peer is a non-STREAMS end point */
ASSERT(IPCL_IS_TCP(peer_connp));
- sopp.sopp_flags = SOCKOPT_WROFF;
+ sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK;
sopp.sopp_wroff = 0;
+ sopp.sopp_maxblk = INFPSZ;
(*peer_connp->conn_upcalls->su_set_proto_props)
(peer_connp->conn_upper_handle, &sopp);
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index ce00372741..4b5a37a13c 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -1542,14 +1542,14 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
eager->tcp_kssl_pending = B_TRUE;
}
+ ASSERT(eager->tcp_ordrel_mp == NULL);
+
/* Inherit the listener's non-STREAMS flag */
if (IPCL_IS_NONSTR(lconnp)) {
econnp->conn_flags |= IPCL_NONSTR;
- }
-
- ASSERT(eager->tcp_ordrel_mp == NULL);
-
- if (!IPCL_IS_NONSTR(econnp)) {
+ /* All non-STREAMS tcp_ts are sockets */
+ eager->tcp_issocket = B_TRUE;
+ } else {
/*
* Pre-allocate the T_ordrel_ind mblk for TPI socket so that
* at close time, we will always have that to send up.
@@ -1632,7 +1632,7 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
/*
* Since we will clear tcp_listener before we clear tcp_detached
* in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
- * so we can tell a TCP_DETACHED_NONEAGER apart.
+ * so we can tell a TCP_IS_DETACHED_NONEAGER apart.
*/
eager->tcp_hard_binding = B_TRUE;
@@ -2003,8 +2003,6 @@ tcp_rcv_drain(tcp_t *tcp)
* some work.
*/
if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
- ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) ||
- tcp->tcp_fused_sigurg_mp != NULL);
if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
&tcp->tcp_fused_sigurg_mp))
return (ret);
@@ -3588,14 +3586,79 @@ process_ack:
if (bytes_acked > 0)
tcp->tcp_ip_forward_progress = B_TRUE;
if (tcp->tcp_state == TCPS_SYN_RCVD) {
- if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) &&
- ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) {
- /* 3-way handshake complete - pass up the T_CONN_IND */
+ /*
+ * tcp_sendmsg() checks tcp_state without entering
+ * the squeue so tcp_state should be updated before
+ * sending up a connection confirmation or a new
+ * connection indication.
+ */
+ tcp->tcp_state = TCPS_ESTABLISHED;
+
+ /*
+	 * We are seeing the final ACK in the three-way handshake of an
+	 * actively opened connection, so we must send up a T_CONN_CON.
+ */
+ if (tcp->tcp_active_open) {
+ if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
+ freemsg(mp);
+ tcp->tcp_state = TCPS_SYN_RCVD;
+ return;
+ }
+ /*
+ * Don't fuse the loopback endpoints for
+ * simultaneous active opens.
+ */
+ if (tcp->tcp_loopback) {
+ TCP_STAT(tcps, tcp_fusion_unfusable);
+ tcp->tcp_unfusable = B_TRUE;
+ }
+ /*
+ * For simultaneous active open, trace receipt of final
+ * ACK as tcp:::connect-established.
+ */
+ DTRACE_TCP5(connect__established, mblk_t *, NULL,
+ ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
+ iphdr, tcp_t *, tcp, tcph_t *, tcpha);
+ } else if (IPCL_IS_NONSTR(connp)) {
+ /*
+ * 3-way handshake has completed, so notify socket
+ * of the new connection.
+ *
+ * We are here means eager is fine but it can
+ * get a TH_RST at any point between now and till
+			 * Being here means the eager is fine, but it can
+			 * still get a TH_RST at any point between now and
+			 * when accept completes, and disappear. We need to
+			 * ensure that the reference to the eager is valid
+			 * after we get out of the eager's perimeter, so we
+			 * take an extra refhold.
+
+ if (!tcp_newconn_notify(tcp, ira)) {
+ freemsg(mp);
+ /* notification did not go up, so drop ref */
+ CONN_DEC_REF(connp);
+ return;
+ }
+ /*
+ * For passive open, trace receipt of final ACK as
+ * tcp:::accept-established.
+ */
+			DTRACE_TCP5(accept__established, mblk_t *, NULL,
+ ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
+ iphdr, tcp_t *, tcp, tcph_t *, tcpha);
+ } else if (((tcp->tcp_kssl_ent == NULL) ||
+ !tcp->tcp_kssl_pending)) {
+ /*
+ * 3-way handshake complete - this is a STREAMS based
+ * socket, so pass up the T_CONN_IND.
+ */
tcp_t *listener = tcp->tcp_listener;
mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind;
tcp->tcp_tconnind_started = B_TRUE;
tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ ASSERT(mp != NULL);
/*
* We are here means eager is fine but it can
* get a TH_RST at any point between now and till
@@ -3638,43 +3701,6 @@ process_ack:
listener->tcp_connp, NULL, SQ_NODRAIN,
SQTAG_TCP_CONN_IND);
}
- }
-
- /*
- * We are seeing the final ack in the three way
- * hand shake of a active open'ed connection
- * so we must send up a T_CONN_CON
- *
- * tcp_sendmsg() checks tcp_state without entering
- * the squeue so tcp_state should be updated before
- * sending up connection confirmation. Probe the state
- * change below when we are sure sending of the confirmation
- * has succeeded.
- */
- tcp->tcp_state = TCPS_ESTABLISHED;
-
- if (tcp->tcp_active_open) {
- if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
- freemsg(mp);
- tcp->tcp_state = TCPS_SYN_RCVD;
- return;
- }
- /*
- * Don't fuse the loopback endpoints for
- * simultaneous active opens.
- */
- if (tcp->tcp_loopback) {
- TCP_STAT(tcps, tcp_fusion_unfusable);
- tcp->tcp_unfusable = B_TRUE;
- }
- /*
- * For simultaneous active open, trace receipt of final
- * ACK as tcp:::connect-established.
- */
- DTRACE_TCP5(connect__established, mblk_t *, NULL,
- ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
- iphdr, tcp_t *, tcp, tcph_t *, tcpha);
- } else {
/*
* For passive open, trace receipt of final ACK as
* tcp:::accept-established.
@@ -4454,13 +4480,14 @@ est:
tcpha->tha_ack = htonl(tcp->tcp_rnxt);
/*
- * Generate the ordrel_ind at the end unless we
- * are an eager guy.
- * In the eager case tcp_rsrv will do this when run
- * after tcp_accept is done.
+ * Generate the ordrel_ind at the end unless the
+ * conn is detached or it is a STREAMS based eager.
+ * In the eager case we defer the notification until
+ * tcp_accept_finish has run.
*/
- if (tcp->tcp_listener == NULL &&
- !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding)
+ if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) ||
+ (tcp->tcp_listener == NULL &&
+ !tcp->tcp_hard_binding)))
flags |= TH_ORDREL_NEEDED;
switch (tcp->tcp_state) {
case TCPS_SYN_RCVD:
@@ -4599,25 +4626,7 @@ update_ack:
return;
}
- if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
- /*
- * Side queue inbound data until the accept happens.
- * tcp_accept/tcp_rput drains this when the accept happens.
- * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
- * T_EXDATA_IND) it is queued on b_next.
- * XXX Make urgent data use this. Requires:
- * Removing tcp_listener check for TH_URG
- * Making M_PCPROTO and MARK messages skip the eager case
- */
-
- if (tcp->tcp_kssl_pending) {
- DTRACE_PROBE1(kssl_mblk__ksslinput_pending,
- mblk_t *, mp);
- tcp_kssl_input(tcp, mp, ira->ira_cred);
- } else {
- tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
- }
- } else if (IPCL_IS_NONSTR(connp)) {
+ if (IPCL_IS_NONSTR(connp)) {
/*
* Non-STREAMS socket
*
@@ -4641,8 +4650,26 @@ update_ack:
/* PUSH bit set and sockfs is not flow controlled */
flags |= tcp_rwnd_reopen(tcp);
}
+ } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
+ /*
+ * Side queue inbound data until the accept happens.
+ * tcp_accept/tcp_rput drains this when the accept happens.
+ * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
+ * T_EXDATA_IND) it is queued on b_next.
+ * XXX Make urgent data use this. Requires:
+ * Removing tcp_listener check for TH_URG
+ * Making M_PCPROTO and MARK messages skip the eager case
+ */
+
+ if (tcp->tcp_kssl_pending) {
+ DTRACE_PROBE1(kssl_mblk__ksslinput_pending,
+ mblk_t *, mp);
+ tcp_kssl_input(tcp, mp, ira->ira_cred);
+ } else {
+ tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
+ }
} else {
- /* STREAMS socket */
+ /* Active STREAMS socket */
if (mp->b_datap->db_type != M_DATA ||
(flags & TH_MARKNEXT_NEEDED)) {
if (tcp->tcp_rcv_list != NULL) {
@@ -4858,11 +4885,14 @@ ack_check:
}
if (flags & TH_ORDREL_NEEDED) {
/*
- * Send up the ordrel_ind unless we are an eager guy.
- * In the eager case tcp_rsrv will do this when run
- * after tcp_accept is done.
+ * Notify upper layer about an orderly release. If this is
+ * a non-STREAMS socket, then just make an upcall. For STREAMS
+ * we send up an ordrel_ind, unless this is an eager, in which
+ * case the ordrel will be sent when tcp_accept_finish runs.
+ * Note that for non-STREAMS we make an upcall even if it is an
+ * eager, because we have an upper handle to send it to.
*/
- ASSERT(tcp->tcp_listener == NULL);
+ ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
ASSERT(!tcp->tcp_detached);
if (IPCL_IS_NONSTR(connp)) {
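
The restructured SYN_RCVD handling above moves the transition to TCPS_ESTABLISHED ahead of the confirmation (because tcp_sendmsg() checks tcp_state without entering the squeue) and lets a non-STREAMS eager notify the socket directly via tcp_newconn_notify(). From user level the observable event is simply that the connection completes; a minimal non-blocking connect completion check, offered as an illustration only, assuming POSIX sockets:

	#include <errno.h>
	#include <poll.h>
	#include <sys/socket.h>

	/*
	 * Returns 0 once the three-way handshake has completed on a
	 * non-blocking socket, or -1 (with errno set) on failure/timeout.
	 */
	static int
	await_connect(int fd, int timeout_ms)
	{
		struct pollfd pfd;
		socklen_t len;
		int err = 0;
		int n;

		pfd.fd = fd;
		pfd.events = POLLOUT;	/* writable once established (or failed) */

		n = poll(&pfd, 1, timeout_ms);
		if (n == 0) {
			errno = ETIMEDOUT;
			return (-1);
		}
		if (n < 0)
			return (-1);

		len = sizeof (err);
		if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) != 0)
			return (-1);
		if (err != 0) {
			errno = err;
			return (-1);
		}
		return (0);
	}
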
diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c
index a93c5bce9e..249df69de5 100644
--- a/usr/src/uts/common/inet/tcp/tcp_output.c
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c
@@ -1465,13 +1465,24 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
clock_t delta = 0;
tcp_stack_t *tcps = tcp->tcp_tcps;
- ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
- (connp->conn_fanout == NULL && connp->conn_ref >= 3));
+ /*
+ * When a non-STREAMS socket is being closed, it does not always
+ * stick around waiting for tcp_close_output to run and can therefore
+ * have dropped a reference already. So adjust the asserts accordingly.
+ */
+ ASSERT((connp->conn_fanout != NULL &&
+ connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
+ (connp->conn_fanout == NULL &&
+ connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
mutex_enter(&tcp->tcp_eager_lock);
if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
- /* Cleanup for listener */
- tcp_eager_cleanup(tcp, 0);
+ /*
+ * Cleanup for listener. For non-STREAM sockets sockfs will
+ * close all the eagers on 'q', so in that case only deal
+ * with 'q0'.
+ */
+ tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
tcp->tcp_wait_for_eagers = 1;
}
mutex_exit(&tcp->tcp_eager_lock);
@@ -1516,14 +1527,37 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
msg = "tcp_close, unread data";
break;
}
+
/*
- * We have done a qwait() above which could have possibly
- * drained more messages in turn causing transition to a
- * different state. Check whether we have to do the rest
- * of the processing or not.
+ * Abort connection if it is being closed without first
+ * being accepted. This can happen if a listening non-STREAM
+ * socket wants to get rid of the socket, for example, if the
+ * listener is closing.
*/
- if (tcp->tcp_state <= TCPS_LISTEN)
+ if (tcp->tcp_listener != NULL) {
+ ASSERT(IPCL_IS_NONSTR(connp));
+ msg = "tcp_close, close before accept";
+
+ /*
+ * Unlink from the listener and drop the reference
+ * put on it by the eager. tcp_closei_local will not
+ * do it because tcp_tconnind_started is TRUE.
+ */
+ mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
+ tcp_eager_unlink(tcp);
+ mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
+ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+
+ /*
+ * If the conn has received a RST, the only thing
+ * left to do is to drop the ref.
+ */
+ if (tcp->tcp_state <= TCPS_BOUND) {
+ CONN_DEC_REF(tcp->tcp_connp);
+ return;
+ }
break;
+ }
/*
* Transmit the FIN before detaching the tcp_t.
@@ -1593,7 +1627,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_append(tcp);
TCP_DBGSTAT(tcps, tcp_detach_time_wait);
- ASSERT(connp->conn_ref >= 3);
+ ASSERT(connp->conn_ref >=
+ (IPCL_IS_NONSTR(connp) ? 2 : 3));
goto finish;
}
@@ -1606,7 +1641,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
delta ? delta : 1);
- ASSERT(connp->conn_ref >= 3);
+ ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
goto finish;
}
@@ -1623,22 +1658,35 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
tcp_closei_local(tcp);
CONN_DEC_REF(connp);
- ASSERT(connp->conn_ref >= 2);
+ ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
finish:
- mutex_enter(&tcp->tcp_closelock);
/*
* Don't change the queues in the case of a listener that has
* eagers in its q or q0. It could surprise the eagers.
* Instead wait for the eagers outside the squeue.
+ *
+ * For non-STREAMS sockets tcp_wait_for_eagers implies that
+ * we should delay the su_closed upcall until all eagers have
+ * dropped their references.
*/
if (!tcp->tcp_wait_for_eagers) {
tcp->tcp_detached = B_TRUE;
connp->conn_rq = NULL;
connp->conn_wq = NULL;
+
+ /* non-STREAM socket, release the upper handle */
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(connp->conn_upper_handle != NULL);
+ (*connp->conn_upcalls->su_closed)
+ (connp->conn_upper_handle);
+ connp->conn_upper_handle = NULL;
+ connp->conn_upcalls = NULL;
+ }
}
/* Signal tcp_close() to finish closing. */
+ mutex_enter(&tcp->tcp_closelock);
tcp->tcp_closed = 1;
cv_signal(&tcp->tcp_closecv);
mutex_exit(&tcp->tcp_closelock);
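
The reordering under 'finish:' defers taking tcp_closelock until the detach work (including the su_closed upcall) is done, so the waiter in tcp_close_common() only wakes once the tcp is fully torn down. The underlying wait/signal pattern, as a user-level pthreads sketch; this is an analogue of the tcp_closelock/tcp_closecv handshake, not the kernel code:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t	close_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t	close_cv = PTHREAD_COND_INITIALIZER;
	static int		closed;

	/* Analogue of the closing path: finish teardown first, then signal. */
	static void *
	teardown(void *arg)
	{
		(void) arg;
		/* ... detach work happens before the lock is taken ... */
		(void) pthread_mutex_lock(&close_lock);
		closed = 1;
		(void) pthread_cond_signal(&close_cv);
		(void) pthread_mutex_unlock(&close_lock);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		(void) pthread_create(&tid, NULL, teardown, NULL);

		/* Analogue of tcp_close_common() waiting for tcp_closed. */
		(void) pthread_mutex_lock(&close_lock);
		while (!closed)
			(void) pthread_cond_wait(&close_cv, &close_lock);
		(void) pthread_mutex_unlock(&close_lock);

		(void) pthread_join(tid, NULL);
		(void) printf("close complete\n");
		return (0);
	}
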
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index 4b50c65cc6..f5df6b156c 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -33,6 +33,7 @@
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
+#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/tpicommon.h>
@@ -121,6 +122,7 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
}
+/*ARGSUSED*/
static int
tcp_accept(sock_lower_handle_t lproto_handle,
sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
@@ -135,18 +137,59 @@ tcp_accept(sock_lower_handle_t lproto_handle,
econnp = (conn_t *)eproto_handle;
eager = econnp->conn_tcp;
ASSERT(eager->tcp_listener != NULL);
+ ASSERT(IPCL_IS_NONSTR(econnp));
+ ASSERT(lconnp->conn_upper_handle != NULL);
/*
- * It is OK to manipulate these fields outside the eager's squeue
- * because they will not start being used until tcp_accept_finish
- * has been called.
+ * It is possible for the accept thread to race with the thread that
+ * made the su_newconn upcall in tcp_newconn_notify. Both
+ * tcp_newconn_notify and tcp_accept require that conn_upper_handle
+ * and conn_upcalls be set before returning, so they both write to
+ * them. However, we're guaranteed that the value written is the same
+ * for both threads.
*/
- ASSERT(lconnp->conn_upper_handle != NULL);
- ASSERT(econnp->conn_upper_handle == NULL);
+ ASSERT(econnp->conn_upper_handle == NULL ||
+ econnp->conn_upper_handle == sock_handle);
+ ASSERT(econnp->conn_upcalls == NULL ||
+ econnp->conn_upcalls == lconnp->conn_upcalls);
econnp->conn_upper_handle = sock_handle;
econnp->conn_upcalls = lconnp->conn_upcalls;
- ASSERT(IPCL_IS_NONSTR(econnp));
- return (tcp_accept_common(lconnp, econnp, cr));
+
+ ASSERT(econnp->conn_netstack ==
+ listener->tcp_connp->conn_netstack);
+ ASSERT(eager->tcp_tcps == listener->tcp_tcps);
+
+ /*
+ * We should have a minimum of 2 references on the conn at this
+ * point. One for TCP and one for the newconn notification
+ * (which is now taken over by IP). In the normal case we would
+ * also have another reference (making a total of 3) for the conn
+ * being in the classifier hash list. However the eager could have
+ * received an RST subsequently and tcp_closei_local could have
+ * removed the eager from the classifier hash list, hence we can't
+ * assert that reference.
+ */
+ ASSERT(econnp->conn_ref >= 2);
+
+ /*
+ * An error is returned if this conn has been reset, which will
+ * cause the socket to be closed immediately. The eager will be
+ * unlinked from the listener during close.
+ */
+ if (eager->tcp_state < TCPS_ESTABLISHED)
+ return (ECONNABORTED);
+
+ mutex_enter(&listener->tcp_eager_lock);
+ /*
+ * Non-STREAMS listeners never defer the notification of new
+ * connections.
+ */
+ ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
+ tcp_eager_unlink(eager);
+ mutex_exit(&listener->tcp_eager_lock);
+ CONN_DEC_REF(listener->tcp_connp);
+
+ return (0);
}
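
With this change tcp_accept() returns ECONNABORTED when the eager was reset before the accept could complete, and sockfs surfaces the failed accept to the caller. A portable user-level accept loop already tolerates that outcome; a small illustration, not part of the patch:

	#include <errno.h>
	#include <sys/socket.h>

	/*
	 * Accept one connection, retrying when a queued connection was
	 * aborted by the peer before we got to it.
	 */
	static int
	accept_one(int lfd)
	{
		for (;;) {
			int cfd = accept(lfd, NULL, NULL);

			if (cfd >= 0)
				return (cfd);
			if (errno == ECONNABORTED || errno == EINTR)
				continue;	/* that eager is gone; try the next */
			return (-1);
		}
	}
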
static int
@@ -188,14 +231,12 @@ tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
return (error);
}
-/*
- * SOP_LISTEN() calls into tcp_listen().
- */
/* ARGSUSED */
static int
tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
+ tcp_t *tcp = connp->conn_tcp;
int error;
ASSERT(connp->conn_upper_handle != NULL);
@@ -211,8 +252,14 @@ tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
if (error == 0) {
+ /*
+		 * sockfs needs to know the maximum number of sockets that
+		 * can be queued on the listener.
+ */
(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
- SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog);
+ SOCK_OPCTL_ENAB_ACCEPT,
+ (uintptr_t)(tcp->tcp_conn_req_max +
+ tcp->tcp_tcps->tcps_conn_req_max_q0));
} else if (error < 0) {
if (error == -TOUTSTATE)
error = EINVAL;
@@ -296,7 +343,6 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
conn_t *connp = (conn_t *)proto_handle;
tcp_t *tcp = connp->conn_tcp;
- ASSERT(connp->conn_upper_handle != NULL);
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
@@ -317,7 +363,6 @@ tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- ASSERT(connp->conn_upper_handle != NULL);
return (conn_getsockname(connp, addr, addrlenp));
}
@@ -694,7 +739,12 @@ tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
* packets in squeue for the timewait state.
*/
CONN_DEC_REF(connp);
- return (0);
+
+ /*
+ * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
+ * freeing the socket.
+ */
+ return (EINPROGRESS);
}
/* ARGSUSED */
@@ -737,9 +787,206 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
return ((sock_lower_handle_t)connp);
}
+/*
+ * tcp_fallback
+ *
+ * A direct socket is falling back to using STREAMS. The queue
+ * that is being passed down was created using tcp_open() with
+ * the SO_FALLBACK flag set. As a result, the queue is not
+ * associated with a conn, and the q_ptrs instead contain the
+ * dev and minor area that should be used.
+ *
+ * The 'issocket' flag indicates whether the FireEngine
+ * optimizations should be used. The common case would be that
+ * optimizations are enabled, and they might be subsequently
+ * disabled using the _SIOCSOCKFALLBACK ioctl.
+ */
+
+/*
+ * An active connection is falling back to TPI. Gather all the information
+ * required by the STREAM head and TPI sonode and send it up.
+ */
+static void
+tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
+ boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
+ sock_quiesce_arg_t *arg)
+{
+ conn_t *connp = tcp->tcp_connp;
+ struct stroptions *stropt;
+ struct T_capability_ack tca;
+ struct sockaddr_in6 laddr, faddr;
+ socklen_t laddrlen, faddrlen;
+ short opts;
+ int error;
+ mblk_t *mp, *mpnext;
+
+ connp->conn_dev = (dev_t)RD(q)->q_ptr;
+ connp->conn_minor_arena = WR(q)->q_ptr;
+
+ RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
+
+ WR(q)->q_qinfo = &tcp_sock_winit;
+
+ if (!issocket)
+ tcp_use_pure_tpi(tcp);
+
+ /*
+ * free the helper stream
+ */
+ ip_free_helper_stream(connp);
+
+ /*
+ * Notify the STREAM head about options
+ */
+ DB_TYPE(stropt_mp) = M_SETOPTS;
+ stropt = (struct stroptions *)stropt_mp->b_rptr;
+ stropt_mp->b_wptr += sizeof (struct stroptions);
+ stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+
+ stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
+ tcp->tcp_tcps->tcps_wroff_xtra);
+ if (tcp->tcp_snd_sack_ok)
+ stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
+ stropt->so_hiwat = connp->conn_rcvbuf;
+ stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+
+ putnext(RD(q), stropt_mp);
+
+ /*
+ * Collect the information needed to sync with the sonode
+ */
+ tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
+
+ laddrlen = faddrlen = sizeof (sin6_t);
+ (void) tcp_getsockname((sock_lower_handle_t)connp,
+ (struct sockaddr *)&laddr, &laddrlen, CRED());
+ error = tcp_getpeername((sock_lower_handle_t)connp,
+ (struct sockaddr *)&faddr, &faddrlen, CRED());
+ if (error != 0)
+ faddrlen = 0;
+
+ opts = 0;
+ if (connp->conn_oobinline)
+ opts |= SO_OOBINLINE;
+ if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
+ opts |= SO_DONTROUTE;
+
+ /*
+ * Notify the socket that the protocol is now quiescent,
+	 * and it's therefore safe to move data from the socket
+ * to the stream head.
+ */
+ mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
+ (struct sockaddr *)&laddr, laddrlen,
+ (struct sockaddr *)&faddr, faddrlen, opts);
+
+ while (mp != NULL) {
+ mpnext = mp->b_next;
+ tcp->tcp_rcv_list = mp->b_next;
+ mp->b_next = NULL;
+ putnext(q, mp);
+ mp = mpnext;
+ }
+ ASSERT(tcp->tcp_rcv_last_head == NULL);
+ ASSERT(tcp->tcp_rcv_last_tail == NULL);
+ ASSERT(tcp->tcp_rcv_cnt == 0);
+
+ /*
+ * All eagers in q0 are marked as being non-STREAM, so they will
+ * make su_newconn upcalls when the handshake completes, which
+ * will fail (resulting in the conn being closed). So we just blow
+ * off everything in q0 instead of waiting for the inevitable.
+ */
+ if (tcp->tcp_conn_req_cnt_q0 != 0)
+ tcp_eager_cleanup(tcp, B_TRUE);
+}
+
+/*
+ * An eager is falling back to TPI. All we have to do is send
+ * up a T_CONN_IND.
+ */
+static void
+tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
+ so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
+{
+ conn_t *connp = eager->tcp_connp;
+ tcp_t *listener = eager->tcp_listener;
+ mblk_t *mp;
+
+ ASSERT(listener != NULL);
+
+ /*
+ * Notify the socket that the protocol is now quiescent,
+	 * and it's therefore safe to move data from the socket
+ * to tcp's rcv queue.
+ */
+ mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
+ NULL, 0, 0);
+
+ if (mp != NULL) {
+ ASSERT(eager->tcp_rcv_cnt == 0);
+
+ eager->tcp_rcv_list = mp;
+ eager->tcp_rcv_cnt = msgdsize(mp);
+ while (mp->b_next != NULL) {
+ mp = mp->b_next;
+ eager->tcp_rcv_cnt += msgdsize(mp);
+ }
+ eager->tcp_rcv_last_head = mp;
+ while (mp->b_cont)
+ mp = mp->b_cont;
+ eager->tcp_rcv_last_tail = mp;
+ if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
+ eager->tcp_rwnd = 0;
+ else
+ eager->tcp_rwnd -= eager->tcp_rcv_cnt;
+ }
+
+ if (!issocket)
+ eager->tcp_issocket = B_FALSE;
+ /*
+ * The stream for this eager does not yet exist, so mark it as
+ * being detached.
+ */
+ eager->tcp_detached = B_TRUE;
+ eager->tcp_hard_binding = B_TRUE;
+ connp->conn_rq = listener->tcp_connp->conn_rq;
+ connp->conn_wq = listener->tcp_connp->conn_wq;
+
+ /* Send up the connection indication */
+ mp = eager->tcp_conn.tcp_eager_conn_ind;
+ ASSERT(mp != NULL);
+ eager->tcp_conn.tcp_eager_conn_ind = NULL;
+
+ /*
+ * TLI/XTI applications will get confused by
+ * sending eager as an option since it violates
+ * the option semantics. So remove the eager as
+ * option since TLI/XTI app doesn't need it anyway.
+ */
+ if (!issocket) {
+ struct T_conn_ind *conn_ind;
+
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ conn_ind->OPT_length = 0;
+ conn_ind->OPT_offset = 0;
+ }
+
+ /*
+ * Sockfs guarantees that the listener will not be closed
+ * during fallback. So we can safely use the listener's queue.
+ */
+ putnext(listener->tcp_connp->conn_rq, mp);
+}
+
+
int
tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
- boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+ boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
+ sock_quiesce_arg_t *arg)
{
tcp_t *tcp;
conn_t *connp = (conn_t *)proto_handle;
@@ -768,14 +1015,6 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/* failed to enter, free all the pre-allocated messages. */
freeb(stropt_mp);
freeb(ordrel_mp);
- /*
- * We cannot process the eager, so at least send out a
- * RST so the peer can reconnect.
- */
- if (tcp->tcp_listener != NULL) {
- (void) tcp_eager_blowoff(tcp->tcp_listener,
- tcp->tcp_conn_req_seqnum);
- }
return (ENOMEM);
}
@@ -787,22 +1026,25 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
if (tcp->tcp_fused)
tcp_unfuse(tcp);
- /*
- * No longer a direct socket
- */
- connp->conn_flags &= ~IPCL_NONSTR;
- tcp->tcp_ordrel_mp = ordrel_mp;
-
if (tcp->tcp_listener != NULL) {
/* The eager will deal with opts when accept() is called */
freeb(stropt_mp);
- tcp_fallback_eager(tcp, direct_sockfs);
+ tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
} else {
tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
- quiesced_cb);
+ quiesced_cb, arg);
}
/*
+ * No longer a direct socket
+ *
+ * Note that we intentionally leave the upper_handle and upcalls
+ * intact, since eagers may still be using them.
+ */
+ connp->conn_flags &= ~IPCL_NONSTR;
+ tcp->tcp_ordrel_mp = ordrel_mp;
+
+ /*
* There should be atleast two ref's (IP + TCP)
*/
ASSERT(connp->conn_ref >= 2);
@@ -810,3 +1052,141 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
return (0);
}
+
+/*
+ * Notifies a non-STREAMS based listener about a new connection. This
+ * function is executed on the *eager*'s squeue once the 3 way handshake
+ * has completed. Note that the behavior differs from STREAMS, where the
+ * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
+ * squeue.
+ *
+ * Returns B_TRUE if the notification succeeded, in which case `tcp' will
+ * be moved over to the ESTABLISHED list (q) of the listener. Otherwise,
+ * B_FALSE is returned and `tcp' is killed.
+ */
+boolean_t
+tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
+{
+ tcp_t *listener = tcp->tcp_listener;
+ conn_t *lconnp = listener->tcp_connp;
+ conn_t *econnp = tcp->tcp_connp;
+ tcp_t *tail;
+ ipaddr_t *addr_cache;
+ sock_upper_handle_t upper;
+ struct sock_proto_props sopp;
+ mblk_t *mp;
+
+ mutex_enter(&listener->tcp_eager_lock);
+ /*
+ * Take the eager out, if it is in the list of droppable eagers
+ * as we are here because the 3W handshake is over.
+ */
+ MAKE_UNDROPPABLE(tcp);
+ /*
+ * The eager already has an extra ref put in tcp_input_data
+ * so that it stays till accept comes back even though it
+ * might get into TCPS_CLOSED as a result of a TH_RST etc.
+ */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+
+ /* Move from SYN_RCVD to ESTABLISHED list */
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+
+ /*
+ * Insert at end of the queue because connections are accepted
+ * in chronological order. Leaving the older connections at front
+ * of the queue helps reducing search time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL)
+ tail->tcp_eager_next_q = tcp;
+ else
+ listener->tcp_eager_next_q = tcp;
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+
+ /* we have timed out before */
+ if (tcp->tcp_syn_rcvd_timeout != 0) {
+ tcp->tcp_syn_rcvd_timeout = 0;
+ listener->tcp_syn_rcvd_timeout--;
+ if (listener->tcp_syn_defense &&
+ listener->tcp_syn_rcvd_timeout <=
+ (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
+ 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
+ listener->tcp_last_rcv_lbolt)) {
+ /*
+ * Turn off the defense mode if we
+ * believe the SYN attack is over.
+ */
+ listener->tcp_syn_defense = B_FALSE;
+ if (listener->tcp_ip_addr_cache) {
+ kmem_free((void *)listener->tcp_ip_addr_cache,
+ IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
+ listener->tcp_ip_addr_cache = NULL;
+ }
+ }
+ }
+ addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
+ if (addr_cache != NULL) {
+ /*
+ * We have finished a 3-way handshake with this
+ * remote host. This proves the IP addr is good.
+ * Cache it!
+ */
+ addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
+ tcp->tcp_connp->conn_faddr_v4;
+ }
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /*
+ * Notify the ULP about the newconn. It is guaranteed that no
+ * tcp_accept() call will be made for the eager if the
+ * notification fails.
+ */
+ if ((upper = (*lconnp->conn_upcalls->su_newconn)
+ (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
+ &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
+ &econnp->conn_upcalls)) == NULL) {
+ /*
+ * Normally this should not happen, but the listener might
+ * have done a fallback to TPI followed by a close(), in
+ * which case tcp_closemp for this conn might have been
+ * used by tcp_eager_cleanup().
+ */
+ mutex_enter(&listener->tcp_eager_lock);
+ if (tcp->tcp_closemp_used) {
+ mutex_exit(&listener->tcp_eager_lock);
+ return (B_FALSE);
+ }
+ tcp->tcp_closemp_used = B_TRUE;
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+ mp = &tcp->tcp_closemp;
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_eager_kill(econnp, mp, NULL, NULL);
+ return (B_FALSE);
+ }
+ econnp->conn_upper_handle = upper;
+
+ tcp->tcp_detached = B_FALSE;
+ tcp->tcp_hard_binding = B_FALSE;
+ tcp->tcp_tconnind_started = B_TRUE;
+
+ if (econnp->conn_keepalive) {
+ tcp->tcp_ka_last_intrvl = 0;
+ tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+ tcp->tcp_ka_interval);
+ }
+
+ /* Update the necessary parameters */
+ tcp_get_proto_props(tcp, &sopp);
+
+ (*econnp->conn_upcalls->su_set_proto_props)
+ (econnp->conn_upper_handle, &sopp);
+
+ return (B_TRUE);
+}
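
tcp_newconn_notify() is the non-STREAMS replacement for sending a T_CONN_IND: the protocol makes the su_newconn upcall from the eager's squeue, gets back an upper handle (or NULL, in which case the eager is killed), and then pushes the protocol properties up. The notify-then-configure contract, as a simplified, self-contained sketch; the types and names below are illustrative stand-ins, not the sockfs definitions:

	#include <stdio.h>

	/* Illustrative stand-ins for the sockfs upcall interface. */
	typedef void *upper_handle_t;

	struct upcalls {
		upper_handle_t	(*newconn)(upper_handle_t listener, void *lower);
		void		(*set_proto_props)(upper_handle_t h, int rcvbuf);
	};

	/* The "socket layer": may refuse the connection by returning NULL. */
	static upper_handle_t
	so_newconn(upper_handle_t listener, void *lower)
	{
		(void) listener;
		return (lower);		/* accept: reuse the lower handle */
	}

	static void
	so_set_props(upper_handle_t h, int rcvbuf)
	{
		(void) printf("handle %p: rcvbuf=%d\n", h, rcvbuf);
	}

	/* Analogue of tcp_newconn_notify(): 0 on success, -1 if refused. */
	static int
	newconn_notify(struct upcalls *uc, upper_handle_t listener, void *eager)
	{
		upper_handle_t upper = uc->newconn(listener, eager);

		if (upper == NULL)
			return (-1);	/* caller must tear the eager down */
		uc->set_proto_props(upper, 128 * 1024);
		return (0);
	}

	int
	main(void)
	{
		struct upcalls uc = { so_newconn, so_set_props };
		int eager;		/* stand-in for the new connection */

		if (newconn_notify(&uc, NULL, &eager) != 0)
			return (1);
		return (0);
	}
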
diff --git a/usr/src/uts/common/inet/tcp/tcp_tpi.c b/usr/src/uts/common/inet/tcp/tcp_tpi.c
index bcaa1595ec..8c645425b7 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tpi.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* This files contains all TCP TLI/TPI related functions */
@@ -47,7 +46,6 @@
static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
-static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
void
tcp_use_pure_tpi(tcp_t *tcp)
@@ -823,7 +821,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
/* TODO: Default ETSDU is 1. Is that correct for tcp? */
}
-static void
+void
tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
t_uscalar_t cap_bits1)
{
@@ -950,148 +948,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp)
}
/*
- * tcp_fallback
- *
- * A direct socket is falling back to using STREAMS. The queue
- * that is being passed down was created using tcp_open() with
- * the SO_FALLBACK flag set. As a result, the queue is not
- * associated with a conn, and the q_ptrs instead contain the
- * dev and minor area that should be used.
- *
- * The 'issocket' flag indicates whether the FireEngine
- * optimizations should be used. The common case would be that
- * optimizations are enabled, and they might be subsequently
- * disabled using the _SIOCSOCKFALLBACK ioctl.
- */
-
-/*
- * An active connection is falling back to TPI. Gather all the information
- * required by the STREAM head and TPI sonode and send it up.
- */
-void
-tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
- boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
-{
- conn_t *connp = tcp->tcp_connp;
- struct stroptions *stropt;
- struct T_capability_ack tca;
- struct sockaddr_in6 laddr, faddr;
- socklen_t laddrlen, faddrlen;
- short opts;
- int error;
- mblk_t *mp;
-
- connp->conn_dev = (dev_t)RD(q)->q_ptr;
- connp->conn_minor_arena = WR(q)->q_ptr;
-
- RD(q)->q_ptr = WR(q)->q_ptr = connp;
-
- connp->conn_rq = RD(q);
- connp->conn_wq = WR(q);
-
- WR(q)->q_qinfo = &tcp_sock_winit;
-
- if (!issocket)
- tcp_use_pure_tpi(tcp);
-
- /*
- * free the helper stream
- */
- ip_free_helper_stream(connp);
-
- /*
- * Notify the STREAM head about options
- */
- DB_TYPE(stropt_mp) = M_SETOPTS;
- stropt = (struct stroptions *)stropt_mp->b_rptr;
- stropt_mp->b_wptr += sizeof (struct stroptions);
- stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
-
- stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
- tcp->tcp_tcps->tcps_wroff_xtra);
- if (tcp->tcp_snd_sack_ok)
- stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
- stropt->so_hiwat = connp->conn_rcvbuf;
- stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
-
- putnext(RD(q), stropt_mp);
-
- /*
- * Collect the information needed to sync with the sonode
- */
- tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
-
- laddrlen = faddrlen = sizeof (sin6_t);
- (void) tcp_getsockname((sock_lower_handle_t)connp,
- (struct sockaddr *)&laddr, &laddrlen, CRED());
- error = tcp_getpeername((sock_lower_handle_t)connp,
- (struct sockaddr *)&faddr, &faddrlen, CRED());
- if (error != 0)
- faddrlen = 0;
-
- opts = 0;
- if (connp->conn_oobinline)
- opts |= SO_OOBINLINE;
- if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
- opts |= SO_DONTROUTE;
-
- /*
- * Notify the socket that the protocol is now quiescent,
- * and it's therefore safe move data from the socket
- * to the stream head.
- */
- (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
- (struct sockaddr *)&laddr, laddrlen,
- (struct sockaddr *)&faddr, faddrlen, opts);
-
- while ((mp = tcp->tcp_rcv_list) != NULL) {
- tcp->tcp_rcv_list = mp->b_next;
- mp->b_next = NULL;
- /* We never do fallback for kernel RPC */
- putnext(q, mp);
- }
- tcp->tcp_rcv_last_head = NULL;
- tcp->tcp_rcv_last_tail = NULL;
- tcp->tcp_rcv_cnt = 0;
-}
-
-/*
- * An eager is falling back to TPI. All we have to do is send
- * up a T_CONN_IND.
- */
-void
-tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
-{
- tcp_t *listener = eager->tcp_listener;
- mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
-
- ASSERT(listener != NULL);
- ASSERT(mp != NULL);
-
- eager->tcp_conn.tcp_eager_conn_ind = NULL;
-
- /*
- * TLI/XTI applications will get confused by
- * sending eager as an option since it violates
- * the option semantics. So remove the eager as
- * option since TLI/XTI app doesn't need it anyway.
- */
- if (!direct_sockfs) {
- struct T_conn_ind *conn_ind;
-
- conn_ind = (struct T_conn_ind *)mp->b_rptr;
- conn_ind->OPT_length = 0;
- conn_ind->OPT_offset = 0;
- }
-
- /*
- * Sockfs guarantees that the listener will not be closed
- * during fallback. So we can safely use the listener's queue.
- */
- putnext(listener->tcp_connp->conn_rq, mp);
-}
-
-/*
* Swap information between the eager and acceptor for a TLI/XTI client.
* The sockfs accept is done on the acceptor stream and control goes
* through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
@@ -1185,6 +1041,191 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
}
/*
+ * This runs at the tail end of accept processing on the squeue of the
+ * new connection.
+ */
+/* ARGSUSED */
+static void
+tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ queue_t *q = connp->conn_rq;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ struct stroptions *stropt;
+ struct sock_proto_props sopp;
+
+ /* Should never be called for non-STREAMS sockets */
+ ASSERT(!IPCL_IS_NONSTR(connp));
+
+ /* We should just receive a single mblk that fits a T_discon_ind */
+ ASSERT(mp->b_cont == NULL);
+
+ /*
+ * Drop the eager's ref on the listener, that was placed when
+ * this eager began life in tcp_input_listener.
+ */
+ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+
+ tcp->tcp_detached = B_FALSE;
+
+ if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
+ /*
+		 * Someone blew off the eager before we could finish
+		 * the accept.
+		 *
+		 * The only reason the eager still exists is that we put
+		 * a ref on it when the conn ind went up. We need to send
+ * a disconnect indication up while the last reference
+ * on the eager will be dropped by the squeue when we
+ * return.
+ */
+ ASSERT(tcp->tcp_listener == NULL);
+ if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
+ struct T_discon_ind *tdi;
+
+ (void) putnextctl1(q, M_FLUSH, FLUSHRW);
+ /*
+ * Let us reuse the incoming mblk to avoid
+ * memory allocation failure problems. We know
+ * that the size of the incoming mblk i.e.
+ * stroptions is greater than sizeof
+ * T_discon_ind.
+ */
+ ASSERT(DB_REF(mp) == 1);
+ ASSERT(MBLKSIZE(mp) >=
+ sizeof (struct T_discon_ind));
+
+ DB_TYPE(mp) = M_PROTO;
+ ((union T_primitives *)mp->b_rptr)->type =
+ T_DISCON_IND;
+ tdi = (struct T_discon_ind *)mp->b_rptr;
+ if (tcp->tcp_issocket) {
+ tdi->DISCON_reason = ECONNREFUSED;
+ tdi->SEQ_number = 0;
+ } else {
+ tdi->DISCON_reason = ENOPROTOOPT;
+ tdi->SEQ_number =
+ tcp->tcp_conn_req_seqnum;
+ }
+ mp->b_wptr = mp->b_rptr +
+ sizeof (struct T_discon_ind);
+ putnext(q, mp);
+ }
+ tcp->tcp_hard_binding = B_FALSE;
+ return;
+ }
+
+ /*
+ * This is the first time we run on the correct
+ * queue after tcp_accept. So fix all the q parameters
+ * here.
+ *
+ * Let us reuse the incoming mblk to avoid
+ * memory allocation failure problems. We know
+ * that the size of the incoming mblk is at least
+ * stroptions
+ */
+ tcp_get_proto_props(tcp, &sopp);
+
+ ASSERT(DB_REF(mp) == 1);
+ ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
+
+ DB_TYPE(mp) = M_SETOPTS;
+ stropt = (struct stroptions *)mp->b_rptr;
+ mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
+ stropt = (struct stroptions *)mp->b_rptr;
+ ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
+ stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+ stropt->so_hiwat = sopp.sopp_rxhiwat;
+ stropt->so_wroff = sopp.sopp_wroff;
+ stropt->so_maxblk = sopp.sopp_maxblk;
+
+ if (sopp.sopp_flags & SOCKOPT_TAIL) {
+ ASSERT(tcp->tcp_kssl_ctx != NULL);
+
+ stropt->so_flags |= SO_TAIL | SO_COPYOPT;
+ stropt->so_tail = sopp.sopp_tail;
+ stropt->so_copyopt = sopp.sopp_zcopyflag;
+ }
+
+ /* Send the options up */
+ putnext(q, mp);
+
+ /*
+ * Pass up any data and/or a fin that has been received.
+ *
+ * Adjust receive window in case it had decreased
+ * (because there is data <=> tcp_rcv_list != NULL)
+ * while the connection was detached. Note that
+ * in case the eager was flow-controlled, w/o this
+ * code, the rwnd may never open up again!
+ */
+ if (tcp->tcp_rcv_list != NULL) {
+ /* We drain directly in case of fused tcp loopback */
+
+ if (!tcp->tcp_fused && canputnext(q)) {
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
+ if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+ tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
+ tcp_xmit_ctl(NULL,
+ tcp, (tcp->tcp_swnd == 0) ?
+ tcp->tcp_suna : tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_ACK);
+ }
+ }
+
+ (void) tcp_rcv_drain(tcp);
+
+ /*
+ * For fused tcp loopback, back-enable peer endpoint
+ * if it's currently flow-controlled.
+ */
+ if (tcp->tcp_fused) {
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(peer_tcp != NULL);
+ ASSERT(peer_tcp->tcp_fused);
+
+ mutex_enter(&peer_tcp->tcp_non_sq_lock);
+ if (peer_tcp->tcp_flow_stopped) {
+ tcp_clrqfull(peer_tcp);
+ TCP_STAT(tcps, tcp_fusion_backenabled);
+ }
+ mutex_exit(&peer_tcp->tcp_non_sq_lock);
+ }
+ }
+ ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+ if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
+ tcp->tcp_ordrel_done = B_TRUE;
+ mp = tcp->tcp_ordrel_mp;
+ tcp->tcp_ordrel_mp = NULL;
+ putnext(q, mp);
+ }
+ tcp->tcp_hard_binding = B_FALSE;
+
+ if (connp->conn_keepalive) {
+ tcp->tcp_ka_last_intrvl = 0;
+ tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+ tcp->tcp_ka_interval);
+ }
+
+ /*
+ * At this point, eager is fully established and will
+ * have the following references -
+ *
+ * 2 references for connection to exist (1 for TCP and 1 for IP).
+ * 1 reference for the squeue which will be dropped by the squeue as
+ * soon as this function returns.
+	 * There will be 1 additional reference for being in classifier
+ * hash list provided something bad hasn't happened.
+ */
+ ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
+ (connp->conn_fanout == NULL && connp->conn_ref >= 3));
+}
+
+
+/*
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listeners. Sockfs sends T_CONN_RES
* on the acceptor STREAM and processed in tcp_accept_common().
@@ -1643,6 +1684,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
tcp_t *listener;
struct T_ok_ack *ok;
t_scalar_t PRIM_type;
+ mblk_t *discon_mp;
conn_t *econnp;
cred_t *cr;
@@ -1703,14 +1745,120 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
q->q_qinfo = &tcp_winit;
listener = eager->tcp_listener;
- if (tcp_accept_common(listener->tcp_connp,
- econnp, cr) < 0) {
+ /*
+ * Preallocate the discon_ind mblk as well. tcp_accept_finish will
+ * use it if something fails.
+ */
+ discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+ sizeof (struct stroptions)), BPRI_HI);
+
+ if (discon_mp == NULL) {
mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
if (mp != NULL)
putnext(rq, mp);
return;
}
+ eager->tcp_issocket = B_TRUE;
+
+ ASSERT(econnp->conn_netstack ==
+ listener->tcp_connp->conn_netstack);
+ ASSERT(eager->tcp_tcps == listener->tcp_tcps);
+
+ /* Put the ref for IP */
+ CONN_INC_REF(econnp);
+
+ /*
+ * We should have a minimum of 3 references on the conn
+ * at this point: one each for TCP and IP and one for
+ * the T_conn_ind that was sent up when the 3-way handshake
+ * completed. In the normal case we would also have another
+ * reference (making a total of 4) for the conn being in the
+ * classifier hash list. However, the eager could have received
+ * an RST subsequently and tcp_closei_local could have removed
+ * the eager from the classifier hash list, hence we can't
+ * assert that reference.
+ */
+ ASSERT(econnp->conn_ref >= 3);
+
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
+
+ tcp_t *tail;
+ tcp_t *tcp;
+ mblk_t *mp1;
+
+ tcp = listener->tcp_eager_prev_q0;
+ /*
+ * listener->tcp_eager_prev_q0 points to the TAIL of the
+ * deferred T_conn_ind queue. We need to get to the head
+ * of the queue in order to send the T_conn_ind messages up
+ * in the same order in which the 3WHS completed.
+ */
+ while (tcp != listener) {
+ if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
+ !tcp->tcp_kssl_pending)
+ break;
+ else
+ tcp = tcp->tcp_eager_prev_q0;
+ }
+ /* None of the pending eagers can be sent up now */
+ if (tcp == listener)
+ goto no_more_eagers;
+
+ mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ /* Move from q0 to q */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+ tcp->tcp_conn_def_q0 = B_FALSE;
+
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
+ /*
+ * Insert at the end of the queue because sockfs sends
+ * down T_CONN_RES in chronological order. Leaving
+ * the older conn indications at the front of the queue
+ * helps reduce search time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL) {
+ tail->tcp_eager_next_q = tcp;
+ } else {
+ listener->tcp_eager_next_q = tcp;
+ }
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+
+ /* Need to get inside the listener perimeter */
+ CONN_INC_REF(listener->tcp_connp);
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
+ tcp_send_pending, listener->tcp_connp, NULL,
+ SQ_FILL, SQTAG_TCP_SEND_PENDING);
+ }
+no_more_eagers:
+ tcp_eager_unlink(eager);
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /*
+ * At this point, the eager is detached from the listener
+ * but we still have an extra ref on the eager (apart from the
+ * usual tcp references). The ref was placed in tcp_input_data
+ * before sending the conn_ind in tcp_send_conn_ind.
+ * The ref will be dropped in tcp_accept_finish().
+ */
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
+ econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
+
/*
* Send the new local address also up to sockfs. There
* should already be enough space in the mp that came
@@ -1761,50 +1909,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
}
/*
- * Send the newconn notification to ulp. The eager is blown off if the
- * notification fails.
- */
-static void
-tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
-{
- if (IPCL_IS_NONSTR(lconnp)) {
- cred_t *cr;
- pid_t cpid = NOPID;
-
- ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
- ASSERT(econnp->conn_tcp->tcp_saved_listener ==
- lconnp->conn_tcp);
-
- cr = msg_getcred(mp, &cpid);
-
- /* Keep the message around in case of a fallback to TPI */
- econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
- /*
- * Notify the ULP about the newconn. It is guaranteed that no
- * tcp_accept() call will be made for the eager if the
- * notification fails, so it's safe to blow it off in that
- * case.
- *
- * The upper handle will be assigned when tcp_accept() is
- * called.
- */
- if ((*lconnp->conn_upcalls->su_newconn)
- (lconnp->conn_upper_handle,
- (sock_lower_handle_t)econnp,
- &sock_tcp_downcalls, cr, cpid,
- &econnp->conn_upcalls) == NULL) {
- /* Failed to allocate a socket */
- TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps,
- tcpEstabResets);
- (void) tcp_eager_blowoff(lconnp->conn_tcp,
- econnp->conn_tcp->tcp_conn_req_seqnum);
- }
- } else {
- putnext(lconnp->conn_rq, mp);
- }
-}
-
-/*
* The function called through squeue to get behind listener's perimeter to
* send a deferred conn_ind.
*/
@@ -1831,7 +1935,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
return;
}
- tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+ putnext(lconnp->conn_rq, mp);
}
/*
@@ -1989,5 +2093,5 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
}
mutex_exit(&listener->tcp_eager_lock);
if (need_send_conn_ind)
- tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+ putnext(lconnp->conn_rq, mp);
}
diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c
index 0d6fc8acc8..1984580efa 100644
--- a/usr/src/uts/common/inet/tcp/tcpddi.c
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -41,6 +40,8 @@
#define INET_SOCKDESC "TCP socket module"
#define INET_SOCK_PROTO_CREATE_FUNC (*tcp_create)
#define INET_SOCK_PROTO_FB_FUNC (*tcp_fallback)
+#define INET_SOCK_FALLBACK_DEV_V4 "/dev/tcp"
+#define INET_SOCK_FALLBACK_DEV_V6 "/dev/tcp6"
#define INET_DEVMINOR 0
#define INET_MODMTFLAGS D_MP
#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 46b12b27f0..5b0dfc6c3b 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -515,8 +515,6 @@ extern uint_t tcp_free_list_max_cnt;
/*
* Functions in tcp.c.
*/
-extern int tcp_accept_common(conn_t *, conn_t *, cred_t *);
-extern void tcp_accept_finish(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
extern tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);
extern void tcp_acceptor_hash_remove(tcp_t *);
@@ -565,6 +563,7 @@ extern void tcp_update_pmtu(tcp_t *, boolean_t);
extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
extern boolean_t tcp_zcopy_check(tcp_t *);
extern void tcp_zcopy_notify(tcp_t *);
+extern void tcp_get_proto_props(tcp_t *, struct sock_proto_props *);
/*
* Bind related functions in tcp_bind.c
@@ -630,8 +629,9 @@ extern boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
/*
* Kernel socket related functions in tcp_socket.c.
*/
-extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
- so_proto_quiesced_cb_t);
+extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+ so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
+extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *);
/*
* Timer related functions in tcp_timers.c.
@@ -657,9 +657,6 @@ extern boolean_t tcp_conn_con(tcp_t *, uchar_t *, mblk_t *,
mblk_t **, ip_recv_attr_t *);
extern void tcp_err_ack(tcp_t *, mblk_t *, int, int);
extern void tcp_err_ack_prim(tcp_t *, mblk_t *, int, int, int);
-extern void tcp_fallback_eager(tcp_t *, boolean_t);
-extern void tcp_fallback_noneager(tcp_t *, mblk_t *, queue_t *,
- boolean_t, so_proto_quiesced_cb_t);
extern void tcp_info_req(tcp_t *, mblk_t *);
extern void tcp_send_conn_ind(void *, mblk_t *, void *);
extern void tcp_send_pending(void *, mblk_t *, void *, ip_recv_attr_t *);
@@ -674,6 +671,8 @@ extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
extern void tcp_tpi_unbind(tcp_t *, mblk_t *);
extern void tcp_tli_accept(tcp_t *, mblk_t *);
extern void tcp_use_pure_tpi(tcp_t *);
+extern void tcp_do_capability_ack(tcp_t *, struct T_capability_ack *,
+ t_uscalar_t);
/*
* TCP option processing related functions in tcp_opt_data.c
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index adcd0652d7..d3f6f0dc7e 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -6498,7 +6498,8 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
int
udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
- boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
+ boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
+ sock_quiesce_arg_t *arg)
{
conn_t *connp = (conn_t *)proto_handle;
udp_t *udp;
@@ -6507,7 +6508,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
socklen_t laddrlen, faddrlen;
short opts;
struct stroptions *stropt;
- mblk_t *stropt_mp;
+ mblk_t *mp, *stropt_mp;
int error;
udp = connp->conn_udp;
@@ -6563,17 +6564,21 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
- (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+ mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
(struct sockaddr *)&laddr, laddrlen,
(struct sockaddr *)&faddr, faddrlen, opts);
mutex_enter(&udp->udp_recv_lock);
/*
* Attempts to send data up during fallback will result in it being
- * queued in udp_t. Now we push up any queued packets.
+ * queued in udp_t. First push up the datagrams obtained from the
+ * socket, then any packets queued in udp_t.
*/
+ if (mp != NULL) {
+ mp->b_next = udp->udp_fallback_queue_head;
+ udp->udp_fallback_queue_head = mp;
+ }
while (udp->udp_fallback_queue_head != NULL) {
- mblk_t *mp;
mp = udp->udp_fallback_queue_head;
udp->udp_fallback_queue_head = mp->b_next;
mutex_exit(&udp->udp_recv_lock);
@@ -6598,7 +6603,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/* ARGSUSED3 */
int
-udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t *salenp, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c
index 144af2192f..6d1b110cec 100644
--- a/usr/src/uts/common/inet/udp/udpddi.c
+++ b/usr/src/uts/common/inet/udp/udpddi.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -43,6 +42,8 @@
#define INET_SOCKDESC "UDP socket module"
#define INET_SOCK_PROTO_CREATE_FUNC (*udp_create)
#define INET_SOCK_PROTO_FB_FUNC (*udp_fallback)
+#define INET_SOCK_FALLBACK_DEV_V4 "/dev/udp"
+#define INET_SOCK_FALLBACK_DEV_V6 "/dev/udp6"
#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
#include "../inetddi.c"
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 11ca9f9810..4fbcbb5323 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _UDP_IMPL_H
@@ -227,7 +226,7 @@ extern uint_t udp_max_optsize;
extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **,
uint_t *, int *, int, cred_t *);
extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
- so_proto_quiesced_cb_t);
+ so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
extern sock_downcalls_t sock_udp_downcalls;
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
index 561188a388..4100f049d7 100644
--- a/usr/src/uts/common/io/ksocket/ksocket.c
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/file.h>
@@ -166,7 +165,7 @@ ksocket_accept(ksocket_t ks, struct sockaddr *addr,
}
int
-ksocket_connect(ksocket_t ks, const struct sockaddr *addr, socklen_t addrlen,
+ksocket_connect(ksocket_t ks, struct sockaddr *addr, socklen_t addrlen,
struct cred *cr)
{
/* All Solaris components should pass a cred for this operation. */
diff --git a/usr/src/uts/common/io/sock_conf.c b/usr/src/uts/common/io/sock_conf.c
index b6d31de8ea..964175b6cd 100644
--- a/usr/src/uts/common/io/sock_conf.c
+++ b/usr/src/uts/common/io/sock_conf.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/sysmacros.h>
@@ -127,6 +126,10 @@ smod_register(const smod_reg_t *reg)
if (reg->__smod_priv != NULL) {
smodp->smod_proto_fallback_func =
reg->__smod_priv->smodp_proto_fallback_func;
+ smodp->smod_fallback_devpath_v4 =
+ reg->__smod_priv->smodp_fallback_devpath_v4;
+ smodp->smod_fallback_devpath_v6 =
+ reg->__smod_priv->smodp_fallback_devpath_v6;
}
}
smod_add(smodp);
diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c
index 70160e318d..b956756758 100644
--- a/usr/src/uts/common/os/sysent.c
+++ b/usr/src/uts/common/os/sysent.c
@@ -21,8 +21,7 @@
/* ONC_PLUS EXTRACT START */
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -731,7 +730,7 @@ struct sysent sysent[NSYSCALL] =
/* 244 */ SYSENT_CI("getsockname", getsockname, 4),
/* 245 */ SYSENT_CI("getsockopt", getsockopt, 6),
/* 246 */ SYSENT_CI("setsockopt", setsockopt, 6),
- /* 247 */ SYSENT_CI("sockconfig", sockconfig, 4),
+ /* 247 */ SYSENT_CI("sockconfig", sockconfig, 5),
/* 248 */ SYSENT_CI("ntp_gettime", ntp_gettime, 1),
/* 249 */ SYSENT_CI("ntp_adjtime", ntp_adjtime, 1),
/* 250 */ SYSENT_CI("lwp_mutex_unlock", lwp_mutex_unlock, 1),
@@ -1057,7 +1056,7 @@ struct sysent sysent32[NSYSCALL] =
/* 244 */ SYSENT_CI("getsockname", getsockname, 4),
/* 245 */ SYSENT_CI("getsockopt", getsockopt, 6),
/* 246 */ SYSENT_CI("setsockopt", setsockopt, 6),
- /* 247 */ SYSENT_CI("sockconfig", sockconfig, 4),
+ /* 247 */ SYSENT_CI("sockconfig", sockconfig, 5),
/* 248 */ SYSENT_CI("ntp_gettime", ntp_gettime, 1),
/* 249 */ SYSENT_CI("ntp_adjtime", ntp_adjtime, 1),
/* 250 */ SYSENT_CI("lwp_mutex_unlock", lwp_mutex_unlock, 1),
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index acebf3dc1e..8d56de5adc 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -506,6 +506,7 @@ CHKHDRS= \
socket_impl.h \
socket_proto.h \
socketvar.h \
+ sockfilter.h \
sockio.h \
soundcard.h \
squeue.h \
diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h
index fb834b027f..df15b12c08 100644
--- a/usr/src/uts/common/sys/ksocket.h
+++ b/usr/src/uts/common/sys/ksocket.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_KSOCKET_H_
@@ -88,7 +87,7 @@ extern int ksocket_bind(ksocket_t, struct sockaddr *, socklen_t,
extern int ksocket_listen(ksocket_t, int, struct cred *);
extern int ksocket_accept(ksocket_t, struct sockaddr *, socklen_t *,
ksocket_t *, struct cred *);
-extern int ksocket_connect(ksocket_t, const struct sockaddr *, socklen_t,
+extern int ksocket_connect(ksocket_t, struct sockaddr *, socklen_t,
struct cred *);
extern int ksocket_send(ksocket_t, void *, size_t, int, size_t *,
struct cred *);
diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h
index 803f7a07b0..5d4648234c 100644
--- a/usr/src/uts/common/sys/socket.h
+++ b/usr/src/uts/common/sys/socket.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -185,6 +184,27 @@ struct so_snd_bufinfo {
#define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */
#endif /* _KERNEL */
+/*
+ * Socket filter options
+ */
+#define FIL_ATTACH 0x1 /* attach filter */
+#define FIL_DETACH 0x2 /* detach filter */
+#define FIL_LIST 0x3 /* list attached filters */
+
+#define FILNAME_MAX 32
+/*
+ * Structure returned by FIL_LIST
+ */
+struct fil_info {
+ int fi_flags; /* see below (FILF_*) */
+ int fi_pos; /* position (0 is bottom) */
+ char fi_name[FILNAME_MAX]; /* filter name */
+};
+
+#define FILF_PROG 0x1 /* programmatic attach */
+#define FILF_AUTO 0x2 /* automatic attach */
+#define FILF_BYPASS 0x4 /* filter is not active */
+
#ifdef _KERNEL
/*
* new socket open flags to identify socket and acceptor streams
@@ -199,13 +219,6 @@ struct so_snd_bufinfo {
#define SOCKET_SLEEP KM_SLEEP
#define SOCKET_NOSLEEP KM_NOSLEEP
-
-/*
- * flags used by sockfs when falling back to tpi socket
- */
-#define SO_FB_START 0x1
-#define SO_FB_FINISH 0x2
-
#endif /* _KERNEL */
/*
@@ -224,6 +237,7 @@ struct linger {
#define SOL_ROUTE 0xfffe /* options for routing socket level */
#endif
#define SOL_PACKET 0xfffd /* options for packet level */
+#define SOL_FILTER 0xfffc /* options for socket filter level */
/*
* Address families.
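The FIL_ATTACH/FIL_DETACH/FIL_LIST options and struct fil_info above, together with the new SOL_FILTER level, form the per-socket filter management interface exercised through setsockopt()/getsockopt(). A minimal sketch of that usage follows; the filter name "httpf" is hypothetical and error handling is abbreviated.

#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

static void
attach_and_list(int fd)
{
	struct fil_info fi[8];
	socklen_t len = sizeof (fi);
	uint_t i;

	/* FIL_ATTACH takes the filter name as the option value */
	if (setsockopt(fd, SOL_FILTER, FIL_ATTACH, "httpf",
	    strlen("httpf")) == -1)
		perror("FIL_ATTACH");

	/* FIL_LIST fills an array of fil_info entries; fi_pos 0 is bottom */
	if (getsockopt(fd, SOL_FILTER, FIL_LIST, fi, &len) == 0) {
		for (i = 0; i < len / sizeof (struct fil_info); i++)
			(void) printf("%d: %s%s\n", fi[i].fi_pos,
			    fi[i].fi_name,
			    (fi[i].fi_flags & FILF_BYPASS) ?
			    " (bypass)" : "");
	}
}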
diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h
index 56e312930b..6bc968be1b 100644
--- a/usr/src/uts/common/sys/socket_proto.h
+++ b/usr/src/uts/common/sys/socket_proto.h
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_SOCKET_PROTO_H_
@@ -128,11 +127,15 @@ struct sock_downcalls_s {
typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int,
sock_downcalls_t **, uint_t *, int *, int, cred_t *);
-typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *,
- struct T_capability_ack *, struct sockaddr *, socklen_t,
- struct sockaddr *, socklen_t, short);
+typedef struct sock_quiesce_arg {
+ mblk_t *soqa_exdata_mp;
+ mblk_t *soqa_urgmark_mp;
+} sock_quiesce_arg_t;
+typedef mblk_t *(*so_proto_quiesced_cb_t)(sock_upper_handle_t,
+ sock_quiesce_arg_t *, struct T_capability_ack *, struct sockaddr *,
+ socklen_t, struct sockaddr *, socklen_t, short);
typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
- boolean_t, so_proto_quiesced_cb_t);
+ boolean_t, so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
/*
* These functions return EOPNOTSUPP and are intended for the sockfs
@@ -196,6 +199,7 @@ struct sock_upcalls_s {
void (*su_signal_oob)(sock_upper_handle_t, ssize_t);
void (*su_zcopy_notify)(sock_upper_handle_t);
void (*su_set_error)(sock_upper_handle_t, int);
+ void (*su_closed)(sock_upper_handle_t);
};
#define SOCK_UC_VERSION sizeof (sock_upcalls_t)
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index 268adc6103..75b1626bcb 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -162,12 +162,13 @@ struct sonode {
/* Accept queue */
kmutex_t so_acceptq_lock; /* protects accept queue */
- struct sonode *so_acceptq_next; /* acceptq list node */
- struct sonode *so_acceptq_head;
- struct sonode **so_acceptq_tail;
- unsigned int so_acceptq_len;
+ list_t so_acceptq_list; /* pending conns */
+ list_t so_acceptq_defer; /* deferred conns */
+ list_node_t so_acceptq_node; /* acceptq list node */
+ unsigned int so_acceptq_len; /* # of conns (both lists) */
unsigned int so_backlog; /* Listen backlog */
kcondvar_t so_acceptq_cv; /* wait for new conn. */
+ struct sonode *so_listener; /* parent socket */
/* Options */
short so_options; /* From socket call, see socket.h */
@@ -233,6 +234,13 @@ struct sonode {
/* != NULL for sodirect enabled socket */
struct sodirect_s *so_direct;
+
+ /* socket filters */
+ uint_t so_filter_active; /* # of active fil */
+ uint_t so_filter_tx; /* pending tx ops */
+ struct sof_instance *so_filter_top; /* top of stack */
+ struct sof_instance *so_filter_bottom; /* bottom of stack */
+ clock_t so_filter_defertime; /* time when deferred */
};
#define SO_HAVE_DATA(so) \
@@ -288,10 +296,10 @@ struct sonode {
#define SS_HADOOBDATA 0x00008000 /* OOB data consumed */
#define SS_CLOSING 0x00010000 /* in process of closing */
-/* unused 0x00020000 */ /* was SS_FADDR_NOXLATE */
-/* unused 0x00040000 */ /* was SS_HASDATA */
-/* unused 0x00080000 */ /* was SS_DONEREAD */
-/* unused 0x00100000 */ /* was SS_MOREDATA */
+#define SS_FIL_DEFER 0x00020000 /* filter deferred notification */
+#define SS_FILOP_OK 0x00040000 /* socket can attach filters */
+#define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */
+#define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */
/* unused 0x00200000 */ /* was SS_DIRECT */
#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
@@ -312,19 +320,27 @@ struct sonode {
* Sockets that can fall back to TPI must ensure that fall back is not
* initiated while a thread is using a socket.
*/
-#define SO_BLOCK_FALLBACK(so, fn) { \
- ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \
- rw_enter(&(so)->so_fallback_rwlock, RW_READER); \
- if ((so)->so_state & SS_FALLBACK_COMP) { \
- rw_exit(&(so)->so_fallback_rwlock); \
- return (fn); \
- } \
-}
+#define SO_BLOCK_FALLBACK(so, fn) \
+ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \
+ rw_enter(&(so)->so_fallback_rwlock, RW_READER); \
+ if ((so)->so_state & (SS_FALLBACK_COMP|SS_FILOP_OK)) { \
+ if ((so)->so_state & SS_FALLBACK_COMP) { \
+ rw_exit(&(so)->so_fallback_rwlock); \
+ return (fn); \
+ } else { \
+ mutex_enter(&(so)->so_lock); \
+ (so)->so_state &= ~SS_FILOP_OK; \
+ mutex_exit(&(so)->so_lock); \
+ } \
+ }
#define SO_UNBLOCK_FALLBACK(so) { \
rw_exit(&(so)->so_fallback_rwlock); \
}
+#define SO_SND_FLOWCTRLD(so) \
+ ((so)->so_snd_qfull || (so)->so_state & SS_FIL_SND_FLOWCTRL)
+
/* Poll events */
#define SO_POLLEV_IN 0x1 /* POLLIN wakeup needed */
#define SO_POLLEV_ALWAYS 0x2 /* wakeups */
@@ -375,7 +391,9 @@ typedef struct sdev_info {
vnode_t *sd_vnode;
} sdev_info_t;
-#define SOCKMOD_VERSION 1
+#define SOCKMOD_VERSION_1 1
+#define SOCKMOD_VERSION 2
+
/* name of the TPI pseudo socket module */
#define SOTPI_SMOD_NAME "socktpi"
@@ -383,6 +401,8 @@ typedef struct __smod_priv_s {
so_create_func_t smodp_sock_create_func;
so_destroy_func_t smodp_sock_destroy_func;
so_proto_fallback_func_t smodp_proto_fallback_func;
+ const char *smodp_fallback_devpath_v4;
+ const char *smodp_fallback_devpath_v6;
} __smod_priv_t;
/*
@@ -410,6 +430,8 @@ typedef struct smod_info {
size_t smod_dc_version; /* down call version */
so_proto_create_func_t smod_proto_create_func;
so_proto_fallback_func_t smod_proto_fallback_func;
+ const char *smod_fallback_devpath_v4;
+ const char *smod_fallback_devpath_v6;
so_create_func_t smod_sock_create_func;
so_destroy_func_t smod_sock_destroy_func;
list_node_t smod_node;
@@ -448,12 +470,22 @@ struct sockparams {
/*
* The entries below are only modified while holding
- * splist_lock as a writer.
+ * sockconf_lock as a writer.
*/
int sp_flags; /* see below */
list_node_t sp_node;
+
+ list_t sp_auto_filters; /* list of automatic filters */
+ list_t sp_prog_filters; /* list of programmatic filters */
};
+struct sof_entry;
+
+typedef struct sp_filter {
+ struct sof_entry *spf_filter;
+ list_node_t spf_node;
+} sp_filter_t;
+
/*
* sockparams flags
@@ -467,6 +499,14 @@ extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int,
const char *, int, int *);
extern void sockparams_ephemeral_drop_last_ref(struct sockparams *);
+extern struct sockparams *sockparams_create(int, int, int, char *, char *, int,
+ int, int, int *);
+extern void sockparams_destroy(struct sockparams *);
+extern int sockparams_add(struct sockparams *);
+extern int sockparams_delete(int, int, int);
+extern int sockparams_new_filter(struct sof_entry *);
+extern void sockparams_filter_cleanup(struct sof_entry *);
+
extern void smod_init(void);
extern void smod_add(smod_info_t *);
extern int smod_register(const smod_reg_t *);
@@ -614,7 +654,7 @@ struct sonodeops {
int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
int, cred_t *);
int (*sop_listen)(struct sonode *, int, cred_t *);
- int (*sop_connect)(struct sonode *, const struct sockaddr *,
+ int (*sop_connect)(struct sonode *, struct sockaddr *,
socklen_t, int, int, cred_t *);
int (*sop_recvmsg)(struct sonode *, struct msghdr *,
struct uio *, cred_t *);
@@ -833,6 +873,8 @@ extern const struct fs_operation_def socket_vnodeops_template[];
extern dev_t sockdev;
+extern krwlock_t sockconf_lock;
+
/*
* sockfs functions
*/
@@ -842,7 +884,6 @@ extern int sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *,
uchar_t, int, int);
extern int sogetvp(char *, vnode_t **, int);
extern int sockinit(int, char *);
-extern int soconfig(int, int, int, char *, int, char *);
extern int solookup(int, int, int, struct sockparams **);
extern void so_lock_single(struct sonode *);
extern void so_unlock_single(struct sonode *, int);
@@ -885,7 +926,7 @@ extern int soaccept(struct sonode *, int, struct sonode **);
extern int sobind(struct sonode *, struct sockaddr *, socklen_t,
int, int);
extern int solisten(struct sonode *, int);
-extern int soconnect(struct sonode *, const struct sockaddr *, socklen_t,
+extern int soconnect(struct sonode *, struct sockaddr *, socklen_t,
int, int);
extern int sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *);
extern int sosendmsg(struct sonode *, struct nmsghdr *, struct uio *);
@@ -927,6 +968,70 @@ struct sockinfo {
zoneid_t si_szoneid;
};
+/*
+ * Subcodes for sockconf() system call
+ */
+#define SOCKCONFIG_ADD_SOCK 0
+#define SOCKCONFIG_REMOVE_SOCK 1
+#define SOCKCONFIG_ADD_FILTER 2
+#define SOCKCONFIG_REMOVE_FILTER 3
+
+/*
+ * Data structures for configuring socket filters.
+ */
+
+/*
+ * Placement hint for automatic filters
+ */
+typedef enum {
+ SOF_HINT_NONE,
+ SOF_HINT_TOP,
+ SOF_HINT_BOTTOM,
+ SOF_HINT_BEFORE,
+ SOF_HINT_AFTER
+} sof_hint_t;
+
+/*
+ * Socket tuple. Used by sockconfig_filter_props to list socket
+ * types of interest.
+ */
+typedef struct sof_socktuple {
+ int sofst_family;
+ int sofst_type;
+ int sofst_protocol;
+} sof_socktuple_t;
+
+/*
+ * Socket filter properties used by sockconfig() system call.
+ */
+struct sockconfig_filter_props {
+ char *sfp_modname;
+ boolean_t sfp_autoattach;
+ sof_hint_t sfp_hint;
+ char *sfp_hintarg;
+ uint_t sfp_socktuple_cnt;
+ sof_socktuple_t *sfp_socktuple;
+};
+
+#ifdef _SYSCALL32
+
+typedef struct sof_socktuple32 {
+ int32_t sofst_family;
+ int32_t sofst_type;
+ int32_t sofst_protocol;
+} sof_socktuple32_t;
+
+struct sockconfig_filter_props32 {
+ caddr32_t sfp_modname;
+ boolean_t sfp_autoattach;
+ sof_hint_t sfp_hint;
+ caddr32_t sfp_hintarg;
+ uint32_t sfp_socktuple_cnt;
+ caddr32_t sfp_socktuple;
+};
+
+#endif /* _SYSCALL32 */
+
#define SOCKMOD_PATH "socketmod" /* dir where sockmods are stored */
#ifdef __cplusplus
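The SOCKCONFIG_ADD_FILTER/SOCKCONFIG_REMOVE_FILTER subcodes and sockconfig_filter_props above carry system-wide filter configuration from userland (soconfig/svc-sockfilter) down through the now five-argument sockconfig() system call. A minimal sketch of assembling such a description; the module name "httpf" is hypothetical, and the assumption that <sys/socketvar.h> is directly usable by the configuring utility may not hold.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>

static struct sockconfig_filter_props *
describe_tcp_filter(void)
{
	/* apply the filter to IPv4 and IPv6 stream sockets only */
	static sof_socktuple_t tuples[] = {
		{ AF_INET,  SOCK_STREAM, IPPROTO_TCP },
		{ AF_INET6, SOCK_STREAM, IPPROTO_TCP }
	};
	static struct sockconfig_filter_props props;

	props.sfp_modname = "httpf";		/* hypothetical module */
	props.sfp_autoattach = B_TRUE;		/* attach automatically */
	props.sfp_hint = SOF_HINT_TOP;		/* top of the filter stack */
	props.sfp_hintarg = NULL;
	props.sfp_socktuple_cnt = 2;
	props.sfp_socktuple = tuples;

	/*
	 * The result would then be handed to the kernel via the
	 * SOCKCONFIG_ADD_FILTER subcode of sockconfig().
	 */
	return (&props);
}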
diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h
new file mode 100644
index 0000000000..9f6d8b499b
--- /dev/null
+++ b/usr/src/uts/common/sys/sockfilter.h
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_SOCKFILTER_H
+#define _SYS_SOCKFILTER_H
+
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/socket.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Opaque socket filter handle
+ */
+typedef struct __sof_handle *sof_handle_t;
+
+/*
+ * Return values for callback functions.
+ *
+ * A - Attach (passive/active) only
+ * P - Passive attach only
+ */
+typedef enum {
+ SOF_RVAL_DEFER = -3, /* defer notification (P) */
+ SOF_RVAL_DETACH = -2, /* detach filter, continue proc. (A) */
+ SOF_RVAL_CONTINUE = -1, /* continue processing */
+ SOF_RVAL_RETURN = 0, /* stop proc, does not return error */
+ SOF_RVAL_EINVAL = EINVAL, /* stop proc., returns error */
+ SOF_RVAL_EACCES = EACCES, /* stop proc., returns error */
+ SOF_RVAL_ENOMEM = ENOMEM, /* stop proc., returns error */
+ SOF_RVAL_ECONNABORTED = ECONNABORTED /* stop proc, returns error */
+} sof_rval_t;
+
+/*
+ * Events generated by the sofop_notify callback.
+ */
+typedef enum { /* socket ... */
+ SOF_EV_CLOSING, /* ... is closing */
+ SOF_EV_CONNECTED, /* ... is connected */
+ SOF_EV_CONNECTFAILED, /* ... failed to connect */
+ SOF_EV_DISCONNECTED, /* ... was disconnected */
+ SOF_EV_CANTRECVMORE, /* ... cannot receive any more data */
+ SOF_EV_CANTSENDMORE, /* ... cannot send any more data */
+ SOF_EV_INJECT_DATA_IN_OK, /* ... has cleared rcv flow ctrl */
+ SOF_EV_INJECT_DATA_OUT_OK, /* ... has cleared snd flow ctrl */
+} sof_event_t;
+
+/* Filter callbacks */
+typedef sof_rval_t (*sof_attach_active_fn_t)(sof_handle_t, int, int, int,
+ cred_t *, void **);
+typedef sof_rval_t (*sof_attach_passive_fn_t)(sof_handle_t, sof_handle_t,
+ void *, struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
+ void **);
+typedef void (*sof_detach_fn_t)(sof_handle_t, void *, cred_t *);
+typedef mblk_t *(*sof_data_in_fn_t)(sof_handle_t, void *, mblk_t *,
+ int, size_t *);
+typedef mblk_t *(*sof_data_in_proc_fn_t)(sof_handle_t, void *,
+ mblk_t *, cred_t *, size_t *);
+typedef mblk_t *(*sof_data_out_fn_t)(sof_handle_t, void *, mblk_t *,
+ struct nmsghdr *, cred_t *, sof_rval_t *);
+typedef sof_rval_t (*sof_bind_fn_t)(sof_handle_t, void *,
+ struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_listen_fn_t)(sof_handle_t, void *, int *,
+ cred_t *);
+typedef sof_rval_t (*sof_accept_fn_t)(sof_handle_t, void *, cred_t *);
+typedef sof_rval_t (*sof_connect_fn_t)(sof_handle_t, void *,
+ struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_shutdown_fn_t)(sof_handle_t, void *, int *,
+ cred_t *);
+typedef sof_rval_t (*sof_getsockname_fn_t)(sof_handle_t, void *,
+ struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_getpeername_fn_t)(sof_handle_t, void *,
+ struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_setsockopt_fn_t)(sof_handle_t, void *,
+ int, int, void *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_getsockopt_fn_t)(sof_handle_t, void *,
+ int, int, void *, socklen_t *, cred_t *);
+typedef sof_rval_t (*sof_ioctl_fn_t)(sof_handle_t, void *, int, intptr_t,
+ int, int32_t *, cred_t *);
+typedef void (*sof_mblk_prop_fn_t)(sof_handle_t, void *, ssize_t *,
+ ushort_t *, ushort_t *);
+typedef void (*sof_notify_fn_t)(sof_handle_t, void *, sof_event_t,
+ uintptr_t);
+
+typedef struct sof_ops {
+ sof_attach_active_fn_t sofop_attach_active;
+ sof_attach_passive_fn_t sofop_attach_passive;
+ sof_detach_fn_t sofop_detach;
+ sof_data_in_fn_t sofop_data_in;
+ sof_data_in_proc_fn_t sofop_data_in_proc;
+ sof_data_out_fn_t sofop_data_out;
+ sof_bind_fn_t sofop_bind;
+ sof_listen_fn_t sofop_listen;
+ sof_connect_fn_t sofop_connect;
+ sof_accept_fn_t sofop_accept;
+ sof_shutdown_fn_t sofop_shutdown;
+ sof_getsockname_fn_t sofop_getsockname;
+ sof_getpeername_fn_t sofop_getpeername;
+ sof_setsockopt_fn_t sofop_setsockopt;
+ sof_getsockopt_fn_t sofop_getsockopt;
+ sof_ioctl_fn_t sofop_ioctl;
+ sof_mblk_prop_fn_t sofop_mblk_prop;
+ sof_notify_fn_t sofop_notify;
+} sof_ops_t;
+
+#define SOF_VERSION 1
+
+extern int sof_register(int, const char *, const sof_ops_t *, int);
+extern int sof_unregister(const char *);
+
+extern void sof_newconn_ready(sof_handle_t);
+extern void sof_bypass(sof_handle_t);
+extern void *sof_get_cookie(sof_handle_t);
+extern void *sof_cas_cookie(sof_handle_t, void *, void *);
+extern int sof_inject_data_out(sof_handle_t, mblk_t *, struct nmsghdr *,
+ boolean_t *);
+extern int sof_inject_data_in(sof_handle_t, mblk_t *, size_t, int,
+ boolean_t *);
+extern void sof_rcv_flowctrl(sof_handle_t, boolean_t);
+extern void sof_snd_flowctrl(sof_handle_t, boolean_t);
+extern boolean_t sof_newconn_move(sof_handle_t, sof_handle_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SOCKFILTER_H */
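The header above is the complete interface a kernel socket filter builds against: it implements some subset of sof_ops_t and registers with sof_register(). A minimal sketch of a pass-through filter that only counts inbound bytes, assuming a made-up name "demofilter" and a zero flags argument, with the usual _init/_fini module linkage omitted.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/atomic.h>
#include <sys/sockfilter.h>

static volatile uint64_t demo_bytes_in;

/* data_in callback: account for the data, then let it continue upward */
static mblk_t *
demo_data_in(sof_handle_t handle, void *cookie, mblk_t *mp, int flags,
    size_t *lenp)
{
	atomic_add_64(&demo_bytes_in, (int64_t)*lenp);
	return (mp);		/* returning the mblk continues processing */
}

static sof_ops_t demo_ops = {
	.sofop_data_in = demo_data_in
};

int
demo_filter_attach(void)
{
	/* the 0 flags argument is an assumption */
	return (sof_register(SOF_VERSION, "demofilter", &demo_ops, 0));
}

int
demo_filter_detach(void)
{
	return (sof_unregister("demofilter"));
}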
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
index d279593b0f..29d8c5b564 100644
--- a/usr/src/uts/common/syscall/sendfile.c
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/types.h>
@@ -781,8 +780,16 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
size_t iov_len;
iov_len = sfv_len;
- if (!SOCK_IS_NONSTR(so) &&
- SOTOTPI(so)->sti_kssl_ctx != NULL)
+ /*
+ * Socket filters can limit the mblk
+ * size, so limit reads to maxblk if
+ * there are filters present.
+ */
+ if ((!SOCK_IS_NONSTR(so) &&
+ _SOTOTPI(so)->sti_kssl_ctx
+ != NULL) ||
+ (so->so_filter_active > 0 &&
+ maxblk != INFPSZ))
iov_len = MIN(iov_len, maxblk);
aiov.iov_len = iov_len;
@@ -928,13 +935,16 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
copyflag = stp != NULL ? stp->sd_copyflag :
so->so_proto_props.sopp_zcopyflag;
+
/*
- * For sockets acting as an SSL proxy, we
- * need to adjust the size to the maximum
- * SSL record size set in the stream head.
+ * Socket filters can limit the mblk size,
+ * so limit reads to maxblk if there are
+ * filters present.
*/
- if (!SOCK_IS_NONSTR(so) &&
- _SOTOTPI(so)->sti_kssl_ctx != NULL)
+ if ((!SOCK_IS_NONSTR(so) &&
+ _SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+ (so->so_filter_active > 0 &&
+ maxblk != INFPSZ))
size = MIN(size, maxblk);
if (vn_has_flocks(readvp) ||