From 3e95bd4ab92abca814bd28e854607d1975c7dc88 Mon Sep 17 00:00:00 2001 From: Anders Persson Date: Thu, 17 Jun 2010 17:22:09 -0700 Subject: PSARC/2009/590 Socket Filter Framework 6939085 Socket Filter Framework 6802067 connect_failed kernel socket callback is not triggered 6776450 time spent in tcp_close could be reduced/deferred to a worker thread 6828586 assertion failed: family == 26, file: ../../common/fs/sockfs/socksyscalls.c, line: 1608 6802078 kernel socket 'newconn' callback is passing rcv queue size as an argument --- exception_lists/packaging | 4 + usr/src/cmd/cmd-inet/usr.sbin/Makefile | 9 +- usr/src/cmd/cmd-inet/usr.sbin/soconfig.c | 212 ++- usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter | 55 + usr/src/cmd/ptools/pfiles/pfiles.c | 68 +- usr/src/cmd/truss/expound.c | 133 +- usr/src/cmd/truss/print.c | 26 + usr/src/cmd/truss/print.h | 6 +- usr/src/cmd/truss/systable.c | 23 +- usr/src/lib/libc/common/sys/_sockconfig.s | 6 +- usr/src/pkg/manifests/SUNWcs.mf | 1 + usr/src/uts/common/Makefile.files | 2 +- usr/src/uts/common/c2/audit_event.c | 64 +- usr/src/uts/common/fs/sockfs/sockcommon.c | 66 +- usr/src/uts/common/fs/sockfs/sockcommon.h | 14 +- usr/src/uts/common/fs/sockfs/sockcommon_sops.c | 352 +++-- usr/src/uts/common/fs/sockfs/sockcommon_subr.c | 317 +++-- usr/src/uts/common/fs/sockfs/sockfilter.c | 1770 ++++++++++++++++++++++++ usr/src/uts/common/fs/sockfs/sockfilter_impl.h | 213 +++ usr/src/uts/common/fs/sockfs/socknotify.c | 34 +- usr/src/uts/common/fs/sockfs/sockparams.c | 242 ++-- usr/src/uts/common/fs/sockfs/socksubr.c | 15 +- usr/src/uts/common/fs/sockfs/socksyscalls.c | 422 ++++-- usr/src/uts/common/fs/sockfs/socktpi.c | 40 +- usr/src/uts/common/fs/sockfs/socktpi.h | 9 +- usr/src/uts/common/fs/sockfs/sodirect.c | 5 +- usr/src/uts/common/inet/inetddi.c | 13 +- usr/src/uts/common/inet/ip/icmp.c | 13 +- usr/src/uts/common/inet/ip/icmpddi.c | 9 +- usr/src/uts/common/inet/rawip_impl.h | 5 +- usr/src/uts/common/inet/sockmods/socksctp.c | 11 +- 
usr/src/uts/common/inet/sockmods/socksdp.c | 13 +- usr/src/uts/common/inet/tcp/tcp.c | 455 +----- usr/src/uts/common/inet/tcp/tcp_fusion.c | 6 +- usr/src/uts/common/inet/tcp/tcp_input.c | 186 +-- usr/src/uts/common/inet/tcp/tcp_output.c | 74 +- usr/src/uts/common/inet/tcp/tcp_socket.c | 442 +++++- usr/src/uts/common/inet/tcp/tcp_tpi.c | 492 ++++--- usr/src/uts/common/inet/tcp/tcpddi.c | 5 +- usr/src/uts/common/inet/tcp_impl.h | 13 +- usr/src/uts/common/inet/udp/udp.c | 17 +- usr/src/uts/common/inet/udp/udpddi.c | 5 +- usr/src/uts/common/inet/udp_impl.h | 5 +- usr/src/uts/common/io/ksocket/ksocket.c | 5 +- usr/src/uts/common/io/sock_conf.c | 7 +- usr/src/uts/common/os/sysent.c | 7 +- usr/src/uts/common/sys/Makefile | 1 + usr/src/uts/common/sys/ksocket.h | 5 +- usr/src/uts/common/sys/socket.h | 32 +- usr/src/uts/common/sys/socket_proto.h | 16 +- usr/src/uts/common/sys/socketvar.h | 147 +- usr/src/uts/common/sys/sockfilter.h | 151 ++ usr/src/uts/common/syscall/sendfile.c | 28 +- 53 files changed, 4872 insertions(+), 1399 deletions(-) create mode 100644 usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter create mode 100644 usr/src/uts/common/fs/sockfs/sockfilter.c create mode 100644 usr/src/uts/common/fs/sockfs/sockfilter_impl.h create mode 100644 usr/src/uts/common/sys/sockfilter.h diff --git a/exception_lists/packaging b/exception_lists/packaging index ba452253e7..58c70ec5c0 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -926,3 +926,7 @@ usr/lib/sparcv9/llib-lvrrpadm.ln sparc # opt/onbld/bin/i386/elfsign i386 opt/onbld/bin/sparc/elfsign sparc +# +# Private socket filter API +# +usr/include/sys/sockfilter.h diff --git a/usr/src/cmd/cmd-inet/usr.sbin/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/Makefile index f7ae749660..014e4511a5 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile @@ -20,8 +20,7 @@ # # -# Copyright 2010 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. 
+# Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. # SYNCPROG= syncinit syncloop syncstat @@ -38,6 +37,7 @@ PROG= 6to4relay arp gettable if_mpadm \ MANIFEST= rarp.xml telnet.xml comsat.xml finger.xml \ login.xml shell.xml rexec.xml +SVCMETHOD= svc-sockfilter ROOTFS_PROG= hostconfig route soconfig SBINLINKS= hostconfig route @@ -106,7 +106,8 @@ SRCS+= $(COMMONSRCS) # # Message catalog # -POFILES= 6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po +POFILES= 6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po \ + soconfig.po POFILE= usr.sbin.po all:= TARGET= all @@ -199,7 +200,7 @@ $(ROOTUSRSBINLINKS): install: $(PROG) $(ROOTFS_PROG) $(SUBDIRS) .WAIT $(ROOTUSRSBINPROG) \ $(ROOTSBINPROG) $(ROOTUSRSBINLINKS) $(ROOTETCDEFAULTFILES) \ - $(ROOTMANIFEST) THIRDPARTYLICENSE.arp + $(ROOTMANIFEST) $(ROOTSVCMETHOD) THIRDPARTYLICENSE.arp THIRDPARTYLICENSE.arp: arp.c $(SED) -n '/University of California/,/SUCH DAMAGE/p' arp.c > $@ diff --git a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c index b5c45f7b6f..a47d455ce3 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -30,6 +29,9 @@ #include #include #include +#include +#include +#include #define MAXLINELEN 4096 @@ -47,6 +49,15 @@ * * soconfig * deregisters + * + * Filter Operations (Consolidation Private): + * + * soconfig -F {auto [top | bottom | before:filter | + * after:filter] | prog} ::,... 
+ *	configure filter
+ *
+ * soconfig -F
+ *	unconfigures filter
  */
 
 static int	parse_file(char *filename);
@@ -60,6 +71,8 @@ static int	parse_int(char *str);
 
 static void	usage(void);
 
+static int	parse_filter_params(int argc, char **argv);
+
 int
 main(argc, argv)
 	int argc;
@@ -75,6 +88,11 @@ main(argc, argv)
 #endif
 	(void) textdomain(TEXT_DOMAIN);
 
+	if (argc >= 2 && strcmp(argv[0], "-F") == 0) {
+		argc--; argv++;
+		ret = parse_filter_params(argc, argv);
+		exit(ret);
+	}
 	if (argc == 2 && strcmp(argv[0], "-f") == 0) {
 		ret = parse_file(argv[1]);
 		exit(ret);
@@ -213,7 +231,7 @@ split_line(char *line, char *argvec[], int maxargvec)
 static int
 parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
 {
-	int fam, type, protocol;
+	int cmd, fam, type, protocol;
 
 	fam = parse_int(famstr);
 	if (fam == -1) {
@@ -272,13 +290,17 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
 			}
 			return (1);
 		}
+
+		cmd = SOCKCONFIG_ADD_SOCK;
+	} else {
+		cmd = SOCKCONFIG_REMOVE_SOCK;
 	}
 
 #ifdef DEBUG
-	printf("not calling sockconfig(%d, %d, %d, %s)\n",
-	    fam, type, protocol, path == NULL ? "(null)" : path);
+	printf("not calling sockconfig(%d, %d, %d, %d, %s)\n",
+	    cmd, fam, type, protocol, path == NULL ? "(null)" : path);
 #else
-	if (_sockconfig(fam, type, protocol, path) == -1) {
+	if (_sockconfig(cmd, fam, type, protocol, path) == -1) {
 		perror("sockconfig");
 		return (1);
 	}
@@ -297,3 +319,191 @@ parse_int(char *str)
 		return (-1);
 	return (res);
 }
+
+/*
+ * Add and remove socket filters.
+ *
+ * argv[0] is the filter name. With no further arguments the filter is
+ * unconfigured; otherwise argv[1] is the module name, argv[2] the attach
+ * semantics ("auto" or "prog"), followed by an optional placement hint
+ * (auto only) and the comma separated family:type:protocol tuples.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+parse_filter_params(int argc, char **argv)
+{
+	struct sockconfig_filter_props filprop;
+	sof_socktuple_t *socktuples;
+	size_t tupcnt, nalloc;
+	char *hintarg, *socktup, *tupstr;
+	int i;
+
+	if (argc == 1) {
+		if (_sockconfig(SOCKCONFIG_REMOVE_FILTER, argv[0], 0,
+		    0, 0) < 0) {
+			switch (errno) {
+			case ENXIO:
+				fprintf(stderr,
+				    gettext("socket filter is not configured "
+				    "'%s'\n"), argv[0]);
+				break;
+			default:
+				perror("sockconfig");
+				break;
+			}
+			return (1);
+		}
+		return (0);
+	}
+
+	if (argc < 4 || argc > 5) {
+		usage();
+		return (1);
+	}
+
+	if (strlen(argv[1]) >= MODMAXNAMELEN) {
+		fprintf(stderr,
+		    gettext("invalid module name '%s': name too long\n"),
+		    argv[1]);
+		return (1);
+	}
+	filprop.sfp_modname = argv[1];
+
+	/* Check the attach semantics */
+	if (strcmp(argv[2], "auto") == 0) {
+		filprop.sfp_autoattach = B_TRUE;
+		if (argc == 5) {
+			/* placement hint */
+			if (strcmp(argv[3], "top") == 0) {
+				filprop.sfp_hint = SOF_HINT_TOP;
+			} else if (strcmp(argv[3], "bottom") == 0) {
+				filprop.sfp_hint = SOF_HINT_BOTTOM;
+			} else {
+				/* only before:filter and after:filter */
+				if (strncmp(argv[3], "before:", 7) == 0) {
+					filprop.sfp_hint = SOF_HINT_BEFORE;
+				} else if (strncmp(argv[3], "after:", 6) == 0) {
+					filprop.sfp_hint = SOF_HINT_AFTER;
+				} else {
+					fprintf(stderr,
+					    gettext("invalid placement hint "
+					    "'%s'\n"), argv[3]);
+					return (1);
+				}
+
+				hintarg = strchr(argv[3], ':');
+				if (hintarg == NULL ||
+				    (strlen(++hintarg) == 0) ||
+				    (strlen(hintarg) >= FILNAME_MAX)) {
+					fprintf(stderr,
+					    gettext("invalid placement hint "
+					    "argument '%s': name too long\n"),
+					    argv[3]);
+					return (1);
+				}
+
+				filprop.sfp_hintarg = hintarg;
+			}
+		} else {
+			filprop.sfp_hint = SOF_HINT_NONE;
+		}
+	} else if (strcmp(argv[2], "prog") == 0) {
+		filprop.sfp_autoattach = B_FALSE;
+		filprop.sfp_hint = SOF_HINT_NONE;
+		/* cannot specify placement hint for programmatic filter */
+		if (argc == 5) {
+			fprintf(stderr,
+			    gettext("placement hint specified for programmatic "
+			    "filter\n"));
+			return (1);
+		}
+	} else {
+		fprintf(stderr, gettext("invalid attach semantic '%s'\n"),
+		    argv[2]);
+		return (1);
+	}
+
+	/* parse the socket tuples */
+	nalloc = 4;
+	socktuples = calloc(nalloc, sizeof (sof_socktuple_t));
+	if (socktuples == NULL) {
+		perror("calloc");
+		return (1);
+	}
+
+	tupcnt = 0;
+	tupstr = argv[(argc == 4) ? 3 : 4];
+	while ((socktup = strsep(&tupstr, ",")) != NULL) {
+		int val;
+		char *valstr;
+
+		if (tupcnt == nalloc) {
+			sof_socktuple_t *new;
+
+			nalloc *= 2;
+			new = realloc(socktuples,
+			    nalloc * sizeof (sof_socktuple_t));
+			if (new == NULL) {
+				perror("realloc");
+				free(socktuples);
+				return (1);
+			}
+			socktuples = new;
+		}
+		i = 0;
+		while ((valstr = strsep(&socktup, ":")) != NULL && i < 3) {
+			val = parse_int(valstr);
+			if (val == -1) {
+				fprintf(stderr, gettext("bad socket tuple\n"));
+				free(socktuples);
+				return (1);
+			}
+			switch (i) {
+			case 0:	socktuples[tupcnt].sofst_family = val; break;
+			case 1:	socktuples[tupcnt].sofst_type = val; break;
+			case 2:	socktuples[tupcnt].sofst_protocol = val; break;
+			}
+			i++;
+		}
+		/* exactly three fields; valstr != NULL means a fourth */
+		if (i != 3 || valstr != NULL) {
+			fprintf(stderr, gettext("bad socket tuple\n"));
+			free(socktuples);
+			return (1);
+		}
+		tupcnt++;
+	}
+	if (tupcnt == 0) {
+		fprintf(stderr, gettext("no socket tuples specified\n"));
+		free(socktuples);
+		return (1);
+	}
+	filprop.sfp_socktuple_cnt = tupcnt;
+	filprop.sfp_socktuple = socktuples;
+
+	if (_sockconfig(SOCKCONFIG_ADD_FILTER, argv[0], &filprop, 0, 0) < 0) {
+		switch (errno) {
+		case EINVAL:
+			fprintf(stderr,
+			    gettext("invalid socket filter configuration\n"));
+			break;
+		case EEXIST:
+			fprintf(stderr,
+			    gettext("socket filter is already configured "
+			    "'%s'\n"), argv[0]);
+			break;
+		case ENOSPC:
+			fprintf(stderr, gettext("unable to satisfy placement "
+			    "constraint\n"));
+			break;
+		default:
+			perror("sockconfig");
+			break;
+		}
+		free(socktuples);
+		return (1);
+	}
+	free(socktuples);
+	return (0);
+}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter b/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter
new file mode
100644
index 0000000000..8df5ab52d8
--- /dev/null
+++ b/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter
@@ -0,0 +1,55 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+. /lib/svc/share/smf_include.sh
+
+filter_name=`svcprop -p socket-filter/name $SMF_FMRI 2>/dev/null`
+if [ -z "$filter_name" ]; then
+	echo "socket-filter/name is missing"
+	exit $SMF_EXIT_ERR_CONFIG
+fi
+
+case "$1" in
+start)
+	mod_name=`svcprop -p socket-filter/module_name $SMF_FMRI 2>/dev/null`
+	type=`svcprop -p socket-filter/attach_semantics $SMF_FMRI 2>/dev/null`
+	order=`svcprop -p socket-filter/order_hint $SMF_FMRI 2>/dev/null`
+	socktups=`svcprop -p socket-filter/socket_tuples $SMF_FMRI 2>/dev/null`
+
+	/sbin/soconfig -F $filter_name $mod_name $type $order $socktups
+	if [ $? -ne 0 ]; then
+		exit $SMF_EXIT_ERR_FATAL
+	fi
+	;;
+stop)
+	/sbin/soconfig -F $filter_name
+	;;
+*)
+	echo "Usage: $0 { start | stop }"
+	exit 1
+	;;
+esac
+
+exit $SMF_EXIT_OK
diff --git a/usr/src/cmd/ptools/pfiles/pfiles.c b/usr/src/cmd/ptools/pfiles/pfiles.c
index 50e82ad34a..5bd1373a1d 100644
--- a/usr/src/cmd/ptools/pfiles/pfiles.c
+++ b/usr/src/cmd/ptools/pfiles/pfiles.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include
@@ -650,6 +649,70 @@ show_sockopts(struct ps_prochandle *Pr, int fd)
 	(void) printf("\t%s\n", buf+1);
 }
 
+#define	MAXNALLOC	32
+static void
+show_sockfilters(struct ps_prochandle *Pr, int fd)
+{
+	struct fil_info *fi;
+	int i = 0, nalloc = 2, len = nalloc * sizeof (*fi);
+	boolean_t printhdr = B_TRUE;
+
+	fi = calloc(nalloc, sizeof (*fi));
+	if (fi == NULL) {
+		perror("calloc");
+		return;
+	}
+	/* CONSTCOND */
+	while (1) {
+		if (pr_getsockopt(Pr, fd, SOL_FILTER, FIL_LIST, fi, &len) != 0)
+			break;
+		/* No filters */
+		if (len == 0)
+			break;
+		/* Make sure buffer was large enough */
+		if (fi->fi_pos >= nalloc) {
+			struct fil_info *new;
+
+			nalloc = fi->fi_pos + 1;
+			if (nalloc > MAXNALLOC)
+				break;
+			len = nalloc * sizeof (*fi);
+			new = realloc(fi, nalloc * sizeof (*fi));
+			if (new == NULL) {
+				perror("realloc");
+				break;
+			}
+			fi = new;
+			continue;
+		}
+
+		for (i = 0; (i + 1) * sizeof (*fi) <= len; i++) {
+			int fl = fi[i].fi_flags;
+
+			if (fl & FILF_BYPASS)
+				continue;
+			/*
+			 * Separator goes first: no trailing "," or "\b".
+			 */
+			(void) printf("%s%s", printhdr ? "\tfilters: " : ",",
+			    fi[i].fi_name);
+			printhdr = B_FALSE;
+			if (fl & (FILF_AUTO | FILF_PROG)) {
+				(void) printf("(%s%s%s)",
+				    (fl & FILF_AUTO) ? "auto" : "",
+				    (fl & FILF_AUTO) && (fl & FILF_PROG) ?
+				    "," : "", (fl & FILF_PROG) ? "prog" : "");
+			}
+			if (fi[i].fi_pos == 0) /* last one */
+				break;
+		}
+		if (!printhdr)
+			(void) printf("\n");
+		break;
+	}
+	free(fi);
+}
+
 /* the file is a socket */
 static void
 dosocket(struct ps_prochandle *Pr, int fd)
@@ -666,6 +729,7 @@ dosocket(struct ps_prochandle *Pr, int fd)
 
 	show_socktype((uint_t)type);
 	show_sockopts(Pr, fd);
+	show_sockfilters(Pr, fd);
 
 	len = sizeof (buf);
 	if (pr_getsockname(Pr, fd, sa, &len) == 0)
diff --git a/usr/src/cmd/truss/expound.c b/usr/src/cmd/truss/expound.c
index d78cbecad5..78c13fc4dc 100644
--- a/usr/src/cmd/truss/expound.c
+++ b/usr/src/cmd/truss/expound.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -90,6 +89,7 @@
 #include
 #include
 #include
+#include
 
 #include "ramdata.h"
 #include "systable.h"
@@ -4721,6 +4721,155 @@ show_utimesys(private_t *pri)
 	}
 }
 
+#ifdef _LP64
+/*
+ * Print the 32-bit sockconfig_filter_props structure found at 'addr' in
+ * the traced process: module name, attach semantics, placement hint and
+ * the list of socket tuples.
+ */
+static void
+show_sockconfig_filter_prop32(private_t *pri, long addr)
+{
+	struct sockconfig_filter_props32 props;
+	const char *s = NULL;
+	char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+	sof_socktuple32_t *tup;
+	size_t sz;
+	int i;
+
+	if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+		if (Pread_string(Proc, buf, sizeof (buf),
+		    (uintptr_t)props.sfp_modname) == -1)
+			(void) strcpy(buf, "");
+		(void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+		(void) printf("%s\tattach semantics: %s", pri->pname,
+		    props.sfp_autoattach ? "automatic" : "programmatic");
+		if (props.sfp_autoattach) {
+			buf[0] = '\0';
+			switch (props.sfp_hint) {
+			case SOF_HINT_TOP:	s = "top"; break;
+			case SOF_HINT_BOTTOM:	s = "bottom"; break;
+			case SOF_HINT_BEFORE:
+			case SOF_HINT_AFTER:
+				s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+				    "before" : "after";
+				if (Pread_string(Proc, buf, sizeof (buf),
+				    (uintptr_t)props.sfp_hintarg) == -1)
+					(void) strcpy(buf, "");
+			}
+			if (s != NULL) {
+				(void) printf(", placement: %s %s", s, buf);
+			}
+		}
+		(void) printf("\n");
+		(void) printf("%s\tsocket tuples:\n", pri->pname);
+		if (props.sfp_socktuple_cnt == 0) {
+			(void) printf("\t\t\n");
+			return;
+		}
+		sz = props.sfp_socktuple_cnt * sizeof (*tup);
+		tup = my_malloc(sz, "socket tuple buffer");
+		if (Pread(Proc, tup, sz,
+		    (uintptr_t)props.sfp_socktuple) == sz) {
+			for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+				(void) printf(
+				    "\t\tfamily: %d, type: %d, proto: %d\n",
+				    tup[i].sofst_family, tup[i].sofst_type,
+				    tup[i].sofst_protocol);
+			}
+		}
+		/* release the buffer whether or not the read succeeded */
+		free(tup);
+	}
+}
+#endif	/* _LP64 */
+
+/*
+ * Print the native sockconfig_filter_props structure found at 'addr' in
+ * the traced process: module name, attach semantics, placement hint and
+ * the list of socket tuples.
+ */
+static void
+show_sockconfig_filter_prop(private_t *pri, long addr)
+{
+	struct sockconfig_filter_props props;
+	const char *s = NULL;
+	char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+	sof_socktuple_t *tup;
+	size_t sz;
+	int i;
+
+	if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+		if (Pread_string(Proc, buf, sizeof (buf),
+		    (uintptr_t)props.sfp_modname) == -1)
+			(void) strcpy(buf, "");
+		(void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+		(void) printf("%s\tattach semantics: %s", pri->pname,
+		    props.sfp_autoattach ? "automatic" : "programmatic");
+		if (props.sfp_autoattach) {
+			buf[0] = '\0';
+			switch (props.sfp_hint) {
+			case SOF_HINT_TOP:	s = "top"; break;
+			case SOF_HINT_BOTTOM:	s = "bottom"; break;
+			case SOF_HINT_BEFORE:
+			case SOF_HINT_AFTER:
+				s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+				    "before" : "after";
+				if (Pread_string(Proc, buf, sizeof (buf),
+				    (uintptr_t)props.sfp_hintarg) == -1)
+					(void) strcpy(buf, "");
+			}
+			if (s != NULL) {
+				(void) printf(", placement: %s %s", s, buf);
+			}
+		}
+		(void) printf("\n");
+		(void) printf("%s\tsocket tuples:\n", pri->pname);
+		if (props.sfp_socktuple_cnt == 0) {
+			(void) printf("\t\t\n");
+			return;
+		}
+		sz = props.sfp_socktuple_cnt * sizeof (*tup);
+		tup = my_malloc(sz, "socket tuple buffer");
+		if (Pread(Proc, tup, sz,
+		    (uintptr_t)props.sfp_socktuple) == sz) {
+			for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+				(void) printf(
+				    "\t\tfamily: %d, type: %d, proto: %d\n",
+				    tup[i].sofst_family, tup[i].sofst_type,
+				    tup[i].sofst_protocol);
+			}
+		}
+		/* release the buffer whether or not the read succeeded */
+		free(tup);
+	}
+}
+
+/*
+ * Expound on sockconfig(): only the SOCKCONFIG_ADD_FILTER subcode is
+ * decoded, using the structure addressed by the third argument.
+ */
+void
+show_sockconfig(private_t *pri)
+{
+	switch (pri->sys_args[0]) {
+	case SOCKCONFIG_ADD_FILTER:
+#ifdef _LP64
+		if (data_model == PR_MODEL_LP64)
+			show_sockconfig_filter_prop(pri,
+			    (long)pri->sys_args[2]);
+		else
+			show_sockconfig_filter_prop32(pri,
+			    (long)pri->sys_args[2]);
+#else
+		show_sockconfig_filter_prop(pri, (long)pri->sys_args[2]);
+#endif
+		break;
+	default:
+		break;
+	}
+}
+
 /* expound verbosely upon syscall arguments */
 /*ARGSUSED*/
 void
@@ -5199,5 +5348,8 @@ expound(private_t *pri, long r0, int raw)
 	case SYS_utimesys:
 		show_utimesys(pri);
 		break;
+	case SYS_sockconfig:
+		show_sockconfig(pri);
+		break;
 	}
 }
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 5de1342c0e..1a92777c28 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -1649,7 +1649,32 @@ prt_pfm(private_t *pri, int raw, long val)
 	}
 }
 
+/*
+ * Print sockconfig() subcode.
+ */ +/*ARGSUSED*/ +void +prt_skc(private_t *pri, int raw, long val) +{ + const char *s = NULL; + if (!raw) { + switch (val) { + case SOCKCONFIG_ADD_SOCK: + s = "SOCKCONFIG_ADD_SOCK"; break; + case SOCKCONFIG_REMOVE_SOCK: + s = "SOCKCONFIG_REMOVE_SOCK"; break; + case SOCKCONFIG_ADD_FILTER: + s = "SOCKCONFIG_ADD_FILTER"; break; + case SOCKCONFIG_REMOVE_FILTER: + s = "SOCKCONFIG_REMOVE_FILTER"; break; + } + } + if (s == NULL) + prt_dec(pri, 0, val); + else + outstring(pri, s); +} /* * Print so_socket() 2nd argument. */ @@ -2709,5 +2734,6 @@ void (* const Print[])() = { prt_un1, /* UN1 -- as prt_uns except for -1 */ prt_mob, /* MOB -- print mmapobj() flags */ prt_utf, /* UTF -- print utimensat() flag */ + prt_skc, /* SKC -- print sockconfig() subcode */ prt_dec, /* HID -- hidden argument, make this the last one */ }; diff --git a/usr/src/cmd/truss/print.h b/usr/src/cmd/truss/print.h index 159b2fbe58..7a190f9cab 100644 --- a/usr/src/cmd/truss/print.h +++ b/usr/src/cmd/truss/print.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -135,7 +134,8 @@ extern "C" { #define UN1 95 /* unsigned except for -1 */ #define MOB 96 /* print mmapobj() flags */ #define UTF 97 /* print utimensat() flag */ -#define HID 98 /* hidden argument, don't print */ +#define SKC 98 /* print sockconfig subcode */ +#define HID 99 /* hidden argument, don't print */ /* make sure HID is always the last member */ /* diff --git a/usr/src/cmd/truss/systable.c b/usr/src/cmd/truss/systable.c index fe49984a29..b8bdbe6af5 100644 --- a/usr/src/cmd/truss/systable.c +++ b/usr/src/cmd/truss/systable.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. 
All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -464,7 +463,7 @@ const struct systable systable[] = { {"getsockname", 4, DEC, NOV, DEC, HEX, HEX, SKV}, /* 244 */ {"getsockopt", 6, DEC, NOV, DEC, SOL, SON, HEX, HEX, SKV}, /* 245 */ {"setsockopt", 6, DEC, NOV, DEC, SOL, SON, HEX, DEC, SKV}, /* 246 */ -{"sockconfig", 4, DEC, NOV, DEC, DEC, DEC, STG}, /* 247 */ +{"sockconfig", 5, DEC, NOV, DEC, HEX, HEX, HEX, HEX}, /* 247 */ {"ntp_gettime", 1, DEC, NOV, HEX}, /* 248 */ {"ntp_adjtime", 1, DEC, NOV, HEX}, /* 249 */ {"lwp_mutex_unlock", 1, DEC, NOV, HEX}, /* 250 */ @@ -873,6 +872,14 @@ const struct systable utimesystable[] = { }; #define NUTIMESYSCODE (sizeof (utimesystable) / sizeof (struct systable)) +const struct systable sockconfigtable[] = { +{"sockconfig", 5, DEC, NOV, SKC, DEC, DEC, DEC, STG}, /* 0 */ +{"sockconfig", 4, DEC, NOV, SKC, DEC, DEC, DEC}, /* 1 */ +{"sockconfig", 3, DEC, NOV, SKC, STG, HEX }, /* 2 */ +{"sockconfig", 2, DEC, NOV, SKC, STG }, /* 3 */ +}; +#define NSOCKCONFIGCODE (sizeof (sockconfigtable) / sizeof (struct systable)) + const struct sysalias sysalias[] = { { "exit", SYS_exit }, { "fork", SYS_forksys }, @@ -1204,6 +1211,10 @@ subsys(int syscall, int subcode) if ((unsigned)subcode < NUTIMESYSCODE) stp = &utimesystable[subcode]; break; + case SYS_sockconfig: /* sockconfig family */ + if ((unsigned)subcode < NSOCKCONFIGCODE) + stp = &sockconfigtable[subcode]; + break; } } @@ -1383,6 +1394,7 @@ getsubcode(private_t *pri) case SYS_rctlsys: /* rctlsys */ case SYS_sidsys: /* sidsys */ case SYS_utimesys: /* utimesys */ + case SYS_sockconfig: /* sockconfig */ subcode = arg0; break; case SYS_fcntl: /* fcntl() */ @@ -1453,7 +1465,8 @@ maxsyscalls() + NRCTLCODE - 1 + NFORKCODE - 1 + NSIDSYSCODE - 1 - + NUTIMESYSCODE - 1); + + NUTIMESYSCODE - 1 + + NSOCKCONFIGCODE - 1); } /* @@ -1545,6 +1558,8 @@ nsubcodes(int syscall) return (NSIDSYSCODE); case SYS_utimesys: return (NUTIMESYSCODE); + case SYS_sockconfig: + return 
(NSOCKCONFIGCODE); default: return (1); } diff --git a/usr/src/lib/libc/common/sys/_sockconfig.s b/usr/src/lib/libc/common/sys/_sockconfig.s index 4ee709ee1b..9c939f0d52 100644 --- a/usr/src/lib/libc/common/sys/_sockconfig.s +++ b/usr/src/lib/libc/common/sys/_sockconfig.s @@ -23,16 +23,14 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. */ .file "_sockconfig.s" /* C library -- _sockconfig */ /* - * int _sockconfig (int domain, int type, int protocol, - * dev_t dev, int version); + * int _sockconfig (int cmd, void *arg1, void *arg2, void *arg3, void *arg4); */ #include "SYS.h" diff --git a/usr/src/pkg/manifests/SUNWcs.mf b/usr/src/pkg/manifests/SUNWcs.mf index 6a92f9b5d8..4035949864 100644 --- a/usr/src/pkg/manifests/SUNWcs.mf +++ b/usr/src/pkg/manifests/SUNWcs.mf @@ -629,6 +629,7 @@ file path=lib/svc/method/svc-hotplug mode=0555 file path=lib/svc/method/svc-legacy-routing mode=0555 file path=lib/svc/method/svc-nscd mode=0555 file path=lib/svc/method/svc-rbac mode=0555 +file path=lib/svc/method/svc-sockfilter mode=0555 file path=lib/svc/method/svc-utmpd mode=0555 file path=lib/svc/method/system-log mode=0555 file path=lib/svc/method/vtdaemon mode=0555 diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 265ad592b7..7ba9696a61 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1263,7 +1263,7 @@ SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \ sockcommon_sops.o sockcommon.o \ sock_notsupp.o socknotify.o \ nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \ - nl7cnca.o sodirect.o + nl7cnca.o sodirect.o sockfilter.o TMPFS_OBJS += tmp_dir.o tmp_subr.o tmp_tnode.o tmp_vfsops.o \ tmp_vnops.o diff --git a/usr/src/uts/common/c2/audit_event.c b/usr/src/uts/common/c2/audit_event.c index 440db9dd3f..68975f00aa 100644 --- a/usr/src/uts/common/c2/audit_event.c +++ 
b/usr/src/uts/common/c2/audit_event.c @@ -3822,38 +3822,60 @@ aus_sockconfig(tad) struct t_audit_data *tad; { struct a { - long domain; - long type; - long protocol; - long devpath; + long cmd; + long arg1; + long arg2; + long arg3; + long arg4; } *uap = (struct a *)ttolwp(curthread)->lwp_ap; - char *kdevpath; - int kdevpathlen = MAXPATHLEN + 1; + char *buf; + int buflen; size_t size; - au_uwrite(au_to_arg32(1, "domain", (uint32_t)uap->domain)); - au_uwrite(au_to_arg32(2, "type", (uint32_t)uap->type)); - au_uwrite(au_to_arg32(3, "protocol", (uint32_t)uap->protocol)); + au_uwrite(au_to_arg32(1, "cmd", (uint_t)uap->cmd)); + switch (uap->cmd) { + case SOCKCONFIG_ADD_SOCK: + case SOCKCONFIG_REMOVE_SOCK: + au_uwrite(au_to_arg32(2, "domain", (uint32_t)uap->arg1)); + au_uwrite(au_to_arg32(3, "type", (uint32_t)uap->arg2)); + au_uwrite(au_to_arg32(4, "protocol", (uint32_t)uap->arg3)); + + if (uap->arg4 == 0) { + au_uwrite(au_to_arg32(5, "devpath", (uint32_t)0)); + } else { + buflen = MAXPATHLEN + 1; + buf = kmem_alloc(buflen, KM_SLEEP); + if (copyinstr((caddr_t)uap->arg4, buf, buflen, + &size)) { + kmem_free(buf, buflen); + return; + } - if (uap->devpath == 0) { - au_uwrite(au_to_arg32(3, "devpath", (uint32_t)0)); - } else { - kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP); + if (size > MAXPATHLEN) { + kmem_free(buf, buflen); + return; + } - if (copyinstr((caddr_t)uap->devpath, kdevpath, kdevpathlen, - &size)) { - kmem_free(kdevpath, kdevpathlen); - return; + au_uwrite(au_to_text(buf)); + kmem_free(buf, buflen); } + break; + case SOCKCONFIG_ADD_FILTER: + case SOCKCONFIG_REMOVE_FILTER: + buflen = FILNAME_MAX; + buf = kmem_alloc(buflen, KM_SLEEP); - if (size > MAXPATHLEN) { - kmem_free(kdevpath, kdevpathlen); + if (copyinstr((caddr_t)uap->arg1, buf, buflen, &size)) { + kmem_free(buf, buflen); return; } - au_uwrite(au_to_text(kdevpath)); - kmem_free(kdevpath, kdevpathlen); + au_uwrite(au_to_text(buf)); + kmem_free(buf, buflen); + break; + default: + break; } } diff --git 
a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index e92e72f8dc..703e26ea61 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -45,6 +44,7 @@ #include #include +#include #include #include #include @@ -216,7 +216,7 @@ socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop) * Active open. */ int -socket_connect(struct sonode *so, const struct sockaddr *name, +socket_connect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags, cred_t *cr) { int error; @@ -471,14 +471,23 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) so->so_rcv_timer_tid = 0; so->so_rcv_thresh = 0; - so->so_acceptq_head = NULL; - so->so_acceptq_tail = &so->so_acceptq_head; - so->so_acceptq_next = NULL; + list_create(&so->so_acceptq_list, sizeof (struct sonode), + offsetof(struct sonode, so_acceptq_node)); + list_create(&so->so_acceptq_defer, sizeof (struct sonode), + offsetof(struct sonode, so_acceptq_node)); + list_link_init(&so->so_acceptq_node); so->so_acceptq_len = 0; so->so_backlog = 0; + so->so_listener = NULL; so->so_snd_qfull = B_FALSE; + so->so_filter_active = 0; + so->so_filter_tx = 0; + so->so_filter_defertime = 0; + so->so_filter_top = NULL; + so->so_filter_bottom = NULL; + mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); @@ -509,9 +518,15 @@ sonode_destructor(void *buf, void *cdrarg) ASSERT(so->so_rcv_q_head == NULL); - ASSERT(so->so_acceptq_head == NULL); - ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); - ASSERT(so->so_acceptq_next == NULL); + list_destroy(&so->so_acceptq_list); + 
list_destroy(&so->so_acceptq_defer); + ASSERT(!list_link_active(&so->so_acceptq_node)); + ASSERT(so->so_listener == NULL); + + ASSERT(so->so_filter_active == 0); + ASSERT(so->so_filter_tx == 0); + ASSERT(so->so_filter_top == NULL); + ASSERT(so->so_filter_bottom == NULL); ASSERT(vp->v_data == so); ASSERT(vn_matchops(vp, socket_vnodeops)); @@ -581,21 +596,11 @@ sonode_init(struct sonode *so, struct sockparams *sp, int family, so->so_copyflag = 0; - ASSERT(so->so_acceptq_head == NULL); - ASSERT(so->so_acceptq_tail == &so->so_acceptq_head); - ASSERT(so->so_acceptq_next == NULL); - vn_reinit(vp); vp->v_vfsp = rootvfs; vp->v_type = VSOCK; vp->v_rdev = sockdev; - so->so_rcv_queued = 0; - so->so_rcv_q_head = NULL; - so->so_rcv_q_last_head = NULL; - so->so_rcv_head = NULL; - so->so_rcv_last_head = NULL; - so->so_snd_qfull = B_FALSE; so->so_minpsz = 0; @@ -620,7 +625,6 @@ sonode_init(struct sonode *so, struct sockparams *sp, int family, void sonode_fini(struct sonode *so) { - mblk_t *mp; vnode_t *vp; ASSERT(so->so_count == 0); @@ -631,15 +635,6 @@ sonode_fini(struct sonode *so) so->so_rcv_timer_tid = 0; } - so_acceptq_flush(so, B_FALSE); - - if ((mp = so->so_oobmsg) != NULL) { - freemsg(mp); - so->so_oobmsg = NULL; - so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| - SS_RCVATMARK); - } - if (so->so_poll_list.ph_list != NULL) { pollwakeup(&so->so_poll_list, POLLERR); pollhead_clean(&so->so_poll_list); @@ -655,4 +650,17 @@ sonode_fini(struct sonode *so) crfree(so->so_peercred); so->so_peercred = NULL; } + /* Detach and destroy filters */ + if (so->so_filter_top != NULL) + sof_sonode_cleanup(so); + + ASSERT(list_is_empty(&so->so_acceptq_list)); + ASSERT(list_is_empty(&so->so_acceptq_defer)); + ASSERT(!list_link_active(&so->so_acceptq_node)); + + ASSERT(so->so_rcv_queued == 0); + ASSERT(so->so_rcv_q_head == NULL); + ASSERT(so->so_rcv_q_last_head == NULL); + ASSERT(so->so_rcv_head == NULL); + ASSERT(so->so_rcv_last_head == NULL); } diff --git 
a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h index fac10a8935..d4e1883b1d 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.h +++ b/usr/src/uts/common/fs/sockfs/sockcommon.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SOCKCOMMON_H_ @@ -54,7 +53,7 @@ extern int socket_bind(struct sonode *, struct sockaddr *, socklen_t, int, struct cred *); extern int socket_accept(struct sonode *, int, struct cred *, struct sonode **); extern int socket_listen(struct sonode *, int, struct cred *); -extern int socket_connect(struct sonode *, const struct sockaddr *, +extern int socket_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); extern int socket_getpeername(struct sonode *, struct sockaddr *, socklen_t *, boolean_t, struct cred *); @@ -120,7 +119,7 @@ extern int so_accept(struct sonode *, int, struct cred *, struct sonode **); extern int so_bind(struct sonode *, struct sockaddr *, socklen_t, int, struct cred *); extern int so_listen(struct sonode *, int, struct cred *); -extern int so_connect(struct sonode *, const struct sockaddr *, +extern int so_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); extern int so_getsockopt(struct sonode *, int, int, void *, socklen_t *, int, struct cred *); @@ -136,6 +135,8 @@ extern int so_poll(struct sonode *, short, int, short *, struct pollhead **); extern int so_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, struct cred *); +extern int so_sendmblk_impl(struct sonode *, struct nmsghdr *, int, + struct cred *, mblk_t **, struct sof_instance *, boolean_t); extern int so_sendmblk(struct sonode *, struct nmsghdr *, int, struct cred *, mblk_t **); extern int so_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, @@ -153,6 +154,8 @@ extern void 
so_set_prop(sock_upper_handle_t, struct sock_proto_props *); extern ssize_t so_queue_msg(sock_upper_handle_t, mblk_t *, size_t, int, int *, boolean_t *); +extern ssize_t so_queue_msg_impl(struct sonode *, mblk_t *, size_t, int, + int *, boolean_t *, struct sof_instance *); extern void so_signal_oob(sock_upper_handle_t, ssize_t); extern void so_connected(sock_upper_handle_t, sock_connid_t, struct cred *, @@ -183,6 +186,7 @@ extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *, rval_t *, int); extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t); extern void so_process_new_message(struct sonode *, mblk_t *, mblk_t *); +extern void so_check_flow_control(struct sonode *); extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *); extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *); @@ -213,7 +217,7 @@ extern int so_get_mod_version(struct sockparams *); /* Notification functions */ extern void so_notify_connected(struct sonode *); extern void so_notify_disconnecting(struct sonode *); -extern void so_notify_disconnected(struct sonode *, int); +extern void so_notify_disconnected(struct sonode *, boolean_t, int); extern void so_notify_writable(struct sonode *); extern void so_notify_data(struct sonode *, size_t); extern void so_notify_oobsig(struct sonode *); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 64ea59c4b5..bf5fcdeb08 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -46,6 +46,7 @@ #include #include +#include #include @@ -59,7 +60,7 @@ extern int xnet_skip_checks; extern int xnet_check_print; -static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t); +static void so_queue_oob(struct sonode *, mblk_t *, size_t); /*ARGSUSED*/ @@ -291,8 +292,11 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, } dobind: - error = (*so->so_downcalls->sd_bind) - 
(so->so_proto_handle, name, namelen, cr); + if (so->so_filter_active == 0 || + (error = sof_filter_bind(so, name, &namelen, cr)) < 0) { + error = (*so->so_downcalls->sd_bind) + (so->so_proto_handle, name, namelen, cr); + } done: SO_UNBLOCK_FALLBACK(so); @@ -307,8 +311,10 @@ so_listen(struct sonode *so, int backlog, struct cred *cr) ASSERT(MUTEX_NOT_HELD(&so->so_lock)); SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr)); - error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog, - cr); + if ((so)->so_filter_active == 0 || + (error = sof_filter_listen(so, &backlog, cr)) < 0) + error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, + backlog, cr); SO_UNBLOCK_FALLBACK(so); @@ -317,7 +323,7 @@ so_listen(struct sonode *so, int backlog, struct cred *cr) int -so_connect(struct sonode *so, const struct sockaddr *name, +so_connect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { int error = 0; @@ -339,12 +345,16 @@ so_connect(struct sonode *so, const struct sockaddr *name, goto done; } - error = (*so->so_downcalls->sd_connect)(so->so_proto_handle, - name, namelen, &id, cr); - - if (error == EINPROGRESS) - error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id); + if (so->so_filter_active == 0 || + (error = sof_filter_connect(so, (struct sockaddr *)name, + &namelen, cr)) < 0) { + error = (*so->so_downcalls->sd_connect)(so->so_proto_handle, + name, namelen, &id, cr); + if (error == EINPROGRESS) + error = so_wait_connected(so, + fflag & (FNONBLOCK|FNDELAY), id); + } done: SO_UNBLOCK_FALLBACK(so); return (error); @@ -371,9 +381,10 @@ so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop) ASSERT(nso != NULL); /* finish the accept */ - error = (*so->so_downcalls->sd_accept)(so->so_proto_handle, - nso->so_proto_handle, (sock_upper_handle_t)nso, cr); - if (error != 0) { + if ((so->so_filter_active > 0 && + (error = sof_filter_accept(nso, cr)) > 0) || + (error = 
(*so->so_downcalls->sd_accept)(so->so_proto_handle, + nso->so_proto_handle, (sock_upper_handle_t)nso, cr)) != 0) { (void) socket_close(nso, 0, cr); socket_destroy(nso); } else { @@ -442,7 +453,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, error = EOPNOTSUPP; break; } - } else if (so->so_snd_qfull) { + } else if (SO_SND_FLOWCTRLD(so)) { /* * Need to wait until the protocol is ready to receive * more data for transmission. @@ -474,6 +485,13 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, } ASSERT(uiop->uio_resid >= 0); + if (so->so_filter_active > 0 && + ((mp = SOF_FILTER_DATA_OUT(so, mp, msg, cr, + &error)) == NULL)) { + if (error != 0) + break; + continue; + } error = (*so->so_downcalls->sd_send) (so->so_proto_handle, mp, msg, cr); if (error != 0) { @@ -495,27 +513,23 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, } int -so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, - struct cred *cr, mblk_t **mpp) +so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag, + struct cred *cr, mblk_t **mpp, sof_instance_t *fil, + boolean_t fil_inject) { int error; boolean_t dontblock; size_t size; mblk_t *mp = *mpp; - SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); + if (so->so_downcalls->sd_send == NULL) + return (EOPNOTSUPP); error = 0; dontblock = (msg->msg_flags & MSG_DONTWAIT) || (fflag & (FNONBLOCK|FNDELAY)); size = msgdsize(mp); - if ((so->so_mode & SM_SENDFILESUPP) == 0 || - so->so_downcalls->sd_send == NULL) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - if ((so->so_mode & SM_ATOMIC) && size > so->so_proto_props.sopp_maxpsz && so->so_proto_props.sopp_maxpsz != -1) { @@ -538,7 +552,8 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, if (error != 0) break; } - if (so->so_snd_qfull) { + /* Socket filters are not flow controlled */ + if (SO_SND_FLOWCTRLD(so) && !fil_inject) { /* * Need to wait until the protocol is ready to receive * 
more data for transmission. @@ -564,6 +579,14 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, nmp = nmp->b_cont; } + if (so->so_filter_active > 0 && + (mp = SOF_FILTER_DATA_OUT_FROM(so, fil, mp, msg, + cr, &error)) == NULL) { + *mpp = mp = nmp; + if (error != 0) + break; + continue; + } error = (*so->so_downcalls->sd_send) (so->so_proto_handle, mp, msg, cr); if (error != 0) { @@ -578,6 +601,30 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, *mpp = mp = nmp; } + /* Let the filter know whether the protocol is flow controlled */ + if (fil_inject && error == 0 && SO_SND_FLOWCTRLD(so)) + error = ENOSPC; + + return (error); +} + +#pragma inline(so_sendmblk_impl) + +int +so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, + struct cred *cr, mblk_t **mpp) +{ + int error; + + SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); + + if ((so->so_mode & SM_SENDFILESUPP) == 0) { + SO_UNBLOCK_FALLBACK(so); + return (EOPNOTSUPP); + } + + error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, + B_FALSE); SO_UNBLOCK_FALLBACK(so); @@ -607,8 +654,10 @@ so_shutdown(struct sonode *so, int how, struct cred *cr) goto done; } - error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle, - how, cr)); + if (so->so_filter_active == 0 || + (error = sof_filter_shutdown(so, &how, cr)) < 0) + error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle, + how, cr)); /* * Protocol agreed to shutdown. 
We need to flush the @@ -638,8 +687,10 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); - error = (*so->so_downcalls->sd_getsockname) - (so->so_proto_handle, addr, addrlen, cr); + if (so->so_filter_active == 0 || + (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) + error = (*so->so_downcalls->sd_getsockname) + (so->so_proto_handle, addr, addrlen, cr); SO_UNBLOCK_FALLBACK(so); return (error); @@ -664,7 +715,8 @@ so_getpeername(struct sonode *so, struct sockaddr *addr, if (xnet_check_print) { printf("sockfs: X/Open getpeername check => EINVAL\n"); } - } else { + } else if (so->so_filter_active == 0 || + (error = sof_filter_getpeername(so, addr, addrlen, cr)) < 0) { error = (*so->so_downcalls->sd_getpeername) (so->so_proto_handle, addr, addrlen, cr); } @@ -679,13 +731,17 @@ so_getsockopt(struct sonode *so, int level, int option_name, { int error = 0; - ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + if (level == SOL_FILTER) + return (sof_getsockopt(so, option_name, optval, optlenp, cr)); + SO_BLOCK_FALLBACK(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); - error = socket_getopt_common(so, level, option_name, optval, optlenp, - flags); - if (error < 0) { + if ((so->so_filter_active == 0 || + (error = sof_filter_getsockopt(so, level, option_name, optval, + optlenp, cr)) < 0) && + (error = socket_getopt_common(so, level, option_name, optval, + optlenp, flags)) < 0) { error = (*so->so_downcalls->sd_getsockopt) (so->so_proto_handle, level, option_name, optval, optlenp, cr); @@ -764,6 +820,9 @@ so_setsockopt(struct sonode *so, int level, int option_name, struct timeval tl; const void *opt = optval; + if (level == SOL_FILTER) + return (sof_setsockopt(so, option_name, optval, optlen, cr)); + SO_BLOCK_FALLBACK(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); @@ -775,6 +834,11 @@ so_setsockopt(struct sonode *so, int level, int option_name, return (EINVAL); } + 
if (so->so_filter_active > 0 && + (error = sof_filter_setsockopt(so, level, option_name, + (void *)optval, &optlen, cr)) >= 0) + goto done; + if (level == SOL_SOCKET) { switch (option_name) { case SO_RCVTIMEO: @@ -856,7 +920,10 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * calling strioc can result in the socket falling back to TPI, * if that is supported. */ - if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 && + if ((so->so_filter_active == 0 || + (error = sof_filter_ioctl(so, cmd, arg, mode, + rvalp, cr)) < 0) && + (error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 && (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) { error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle, cmd, arg, mode, rvalp, cr); @@ -894,7 +961,7 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, * is flow controlled */ *reventsp |= POLLWRBAND & events; - if (!so->so_snd_qfull) { + if (!SO_SND_FLOWCTRLD(so)) { /* * As long as there is buffer to send data * turn on POLLOUT events @@ -915,7 +982,7 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, */ /* Pending connections */ - if (so->so_acceptq_len > 0) + if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; /* Data */ @@ -941,7 +1008,8 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); - if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) { + if (SO_HAVE_DATA(so) || + !list_is_empty(&so->so_acceptq_list)) { mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; return (0); @@ -987,12 +1055,13 @@ int so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error) { struct sonode *so = (struct sonode *)sock_handle; + boolean_t connect_failed; mutex_enter(&so->so_lock); - + connect_failed = !(so->so_state & SS_ISCONNECTED);
so->so_proto_connid = id; soisdisconnected(so, error); - so_notify_disconnected(so, error); + so_notify_disconnected(so, connect_failed, error); return (0); } @@ -1019,6 +1088,16 @@ so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action, mutex_enter(&so->so_lock); so->so_state |= SS_ACCEPTCONN; so->so_backlog = (unsigned int)arg; + /* + * The protocol can stop generating newconn upcalls when + * the backlog is full, so to make sure the listener does + * not end up with a queue full of deferred connections + * we reduce the backlog by one. Thus the listener will + * start closing deferred connections before the backlog + * is full. + */ + if (so->so_filter_active > 0) + so->so_backlog = MAX(1, so->so_backlog - 1); mutex_exit(&so->so_lock); break; default: @@ -1037,6 +1116,7 @@ so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull) } else { so_snd_qnotfull(so); mutex_enter(&so->so_lock); + /* so_notify_writable drops so_lock */ so_notify_writable(so); } } @@ -1053,8 +1133,10 @@ so_newconn(sock_upper_handle_t parenthandle, ASSERT(proto_handle != NULL); if ((so->so_state & SS_ACCEPTCONN) == 0 || - so->so_acceptq_len >= so->so_backlog) - return (NULL); + (so->so_acceptq_len >= so->so_backlog && + (so->so_filter_active == 0 || !sof_sonode_drop_deferred(so)))) { + return (NULL); + } nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP, &error); @@ -1066,6 +1148,7 @@ so_newconn(sock_upper_handle_t parenthandle, nso->so_peercred = peer_cred; nso->so_cpid = peer_cpid; } + nso->so_listener = so; /* * The new socket (nso), proto_handle and sock_upcallsp are all @@ -1075,12 +1158,30 @@ so_newconn(sock_upper_handle_t parenthandle, */ *sock_upcallsp = &so_upcalls; - (void) so_acceptq_enqueue(so, nso); - - mutex_enter(&so->so_lock); - so_notify_newconn(so); + mutex_enter(&so->so_acceptq_lock); + if (so->so_state & (SS_CLOSING|SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) { + mutex_exit(&so->so_acceptq_lock); + ASSERT(nso->so_count == 1); + 
nso->so_count--; + /* drop proto ref */ + VN_RELE(SOTOV(nso)); + socket_destroy(nso); + return (NULL); + } else { + so->so_acceptq_len++; + if (nso->so_state & SS_FIL_DEFER) { + list_insert_tail(&so->so_acceptq_defer, nso); + mutex_exit(&so->so_acceptq_lock); + } else { + list_insert_tail(&so->so_acceptq_list, nso); + cv_signal(&so->so_acceptq_cv); + mutex_exit(&so->so_acceptq_lock); + mutex_enter(&so->so_lock); + so_notify_newconn(so); + } - return ((sock_upper_handle_t)nso); + return ((sock_upper_handle_t)nso); + } } void @@ -1132,6 +1233,27 @@ so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp) mutex_exit(&so->so_lock); + if (so->so_filter_active > 0) { + sof_instance_t *inst; + ssize_t maxblk; + ushort_t wroff, tail; + maxblk = so->so_proto_props.sopp_maxblk; + wroff = so->so_proto_props.sopp_wroff; + tail = so->so_proto_props.sopp_tail; + for (inst = so->so_filter_bottom; inst != NULL; + inst = inst->sofi_prev) { + if (SOF_INTERESTED(inst, mblk_prop)) { + (*inst->sofi_ops->sofop_mblk_prop)( + (sof_handle_t)inst, inst->sofi_cookie, + &maxblk, &wroff, &tail); + } + } + mutex_enter(&so->so_lock); + so->so_proto_props.sopp_maxblk = maxblk; + so->so_proto_props.sopp_wroff = wroff; + so->so_proto_props.sopp_tail = tail; + mutex_exit(&so->so_lock); + } #ifdef DEBUG soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ | @@ -1144,10 +1266,10 @@ so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp) /* ARGSUSED */ ssize_t -so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, - size_t msg_size, int flags, int *errorp, boolean_t *force_pushp) +so_queue_msg_impl(struct sonode *so, mblk_t *mp, + size_t msg_size, int flags, int *errorp, boolean_t *force_pushp, + sof_instance_t *filter) { - struct sonode *so = (struct sonode *)sock_handle; boolean_t force_push = B_TRUE; int space_left; sodirect_t *sodp = so->so_direct; @@ -1165,31 +1287,14 @@ 
so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, return (0); } ASSERT(msg_size == 0); - /* - * recv space check - */ mutex_enter(&so->so_lock); - space_left = so->so_rcvbuf - so->so_rcv_queued; - if (space_left <= 0) { - so->so_flowctrld = B_TRUE; - *errorp = ENOSPC; - space_left = -1; - } - goto done_unlock; + goto space_check; } ASSERT(mp->b_next == NULL); ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO); ASSERT(msg_size == msgdsize(mp)); - if (flags & MSG_OOB) { - so_queue_oob(sock_handle, mp, msg_size); - return (0); - } - - if (force_pushp != NULL) - force_push = *force_pushp; - if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) { /* The read pointer is not aligned correctly for TPI */ zcmn_err(getzoneid(), CE_WARN, @@ -1199,11 +1304,36 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, mutex_enter(&so->so_lock); if (sodp != NULL) SOD_UIOAFINI(sodp); - mutex_exit(&so->so_lock); + goto space_check; + } + + if (so->so_filter_active > 0) { + for (; filter != NULL; filter = filter->sofi_prev) { + if (!SOF_INTERESTED(filter, data_in)) + continue; + mp = (*filter->sofi_ops->sofop_data_in)( + (sof_handle_t)filter, filter->sofi_cookie, mp, + flags, &msg_size); + ASSERT(msgdsize(mp) == msg_size); + DTRACE_PROBE2(filter__data, (sof_instance_t), filter, + (mblk_t *), mp); + /* Data was consumed/dropped, just do space check */ + if (msg_size == 0) { + mutex_enter(&so->so_lock); + goto space_check; + } + } + } - return (so->so_rcvbuf - so->so_rcv_queued); + if (flags & MSG_OOB) { + so_queue_oob(so, mp, msg_size); + mutex_enter(&so->so_lock); + goto space_check; } + if (force_pushp != NULL) + force_push = *force_pushp; + mutex_enter(&so->so_lock); if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) { if (sodp != NULL) @@ -1212,7 +1342,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, *errorp = EOPNOTSUPP; return (-1); } - if (so->so_state & SS_CANTRCVMORE) { + if (so->so_state & (SS_CANTRCVMORE | SS_CLOSING)) { 
freemsg(mp); if (sodp != NULL) SOD_DISABLE(sodp); @@ -1270,6 +1400,27 @@ done_unlock: mutex_exit(&so->so_lock); done: return (space_left); + +space_check: + space_left = so->so_rcvbuf - so->so_rcv_queued; + if (space_left <= 0) { + so->so_flowctrld = B_TRUE; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; +} + +#pragma inline(so_queue_msg_impl) + +ssize_t +so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, + size_t msg_size, int flags, int *errorp, boolean_t *force_pushp) +{ + struct sonode *so = (struct sonode *)sock_handle; + + return (so_queue_msg_impl(so, mp, msg_size, flags, errorp, force_pushp, + so->so_filter_bottom)); } /* @@ -1320,11 +1471,8 @@ so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset) * Queue the OOB byte */ static void -so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len) +so_queue_oob(struct sonode *so, mblk_t *mp, size_t len) { - struct sonode *so; - - so = (struct sonode *)sock_handle; mutex_enter(&so->so_lock); if (so->so_direct != NULL) SOD_UIOAFINI(so->so_direct); @@ -1345,21 +1493,62 @@ so_close(struct sonode *so, int flag, struct cred *cr) { int error; - error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr); - /* - * At this point there will be no more upcalls from the protocol + * No new data will be enqueued once the CLOSING flag is set. */ mutex_enter(&so->so_lock); - + so->so_state |= SS_CLOSING; ASSERT(so_verify_oobstate(so)); - so_rcv_flush(so); mutex_exit(&so->so_lock); + if (so->so_state & SS_ACCEPTCONN) { + /* + * We grab and release the accept lock to ensure that any + * thread about to insert a socket in so_newconn completes + * before we flush the queue. Any thread calling so_newconn + * after we drop the lock will observe the SS_CLOSING flag, + * which will stop it from inserting the socket in the queue. 
+ */ + mutex_enter(&so->so_acceptq_lock); + mutex_exit(&so->so_acceptq_lock); + + so_acceptq_flush(so, B_TRUE); + } + + if (so->so_filter_active > 0) + sof_sonode_closing(so); + + error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr); + switch (error) { + default: + /* Protocol made a synchronous close; remove proto ref */ + VN_RELE(SOTOV(so)); + break; + case EINPROGRESS: + /* + * Protocol is in the process of closing, it will make a + * 'closed' upcall to remove the reference. + */ + error = 0; + break; + } + return (error); } +/* + * Upcall made by the protocol when it's doing an asynchronous close. It + * will drop the protocol's reference on the socket. + */ +void +so_closed(sock_upper_handle_t sock_handle) +{ + struct sonode *so = (struct sonode *)sock_handle; + + VN_RELE(SOTOV(so)); +} + void so_zcopy_notify(sock_upper_handle_t sock_handle) { @@ -1759,5 +1948,6 @@ sock_upcalls_t so_upcalls = { so_txq_full, so_signal_oob, so_zcopy_notify, - so_set_error + so_set_error, + so_closed }; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 2e3442e879..a44d389855 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -39,6 +38,7 @@ #include #include +#include #include #include #include @@ -59,46 +59,6 @@ boolean_t so_debug_length = B_FALSE; static boolean_t so_check_length(sonode_t *so); #endif -int -so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso) -{ - ASSERT(MUTEX_HELD(&so->so_acceptq_lock)); - ASSERT(nso->so_acceptq_next == NULL); - - *so->so_acceptq_tail = nso; - so->so_acceptq_tail = &nso->so_acceptq_next; - so->so_acceptq_len++; - - /* Wakeup a single consumer */ - cv_signal(&so->so_acceptq_cv); - - return (so->so_acceptq_len); -} - -/* - * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso) - * - * Enqueue an incoming connection on a listening socket. - * - * Arguments: - * so - listening socket - * nso - new connection - * - * Returns: - * Number of queued connections, including the new connection - */ -int -so_acceptq_enqueue(struct sonode *so, struct sonode *nso) -{ - int conns; - - mutex_enter(&so->so_acceptq_lock); - conns = so_acceptq_enqueue_locked(so, nso); - mutex_exit(&so->so_acceptq_lock); - - return (conns); -} - static int so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock, struct sonode **nsop) @@ -107,7 +67,7 @@ so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock, *nsop = NULL; ASSERT(MUTEX_HELD(&so->so_acceptq_lock)); - while ((nso = so->so_acceptq_head) == NULL) { + while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) { /* * No need to check so_error here, because it is not * possible for a listening socket to be reset or otherwise @@ -126,15 +86,9 @@ so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock, } ASSERT(nso != NULL); - so->so_acceptq_head = nso->so_acceptq_next; - nso->so_acceptq_next = NULL; - - if (so->so_acceptq_head == NULL) { - ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next); - so->so_acceptq_tail = &so->so_acceptq_head; - } ASSERT(so->so_acceptq_len > 0); - --so->so_acceptq_len; + so->so_acceptq_len--; + nso->so_listener = NULL; *nsop = nso; @@ 
-174,8 +128,36 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock, return (error); } +static void +so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose) +{ + struct sonode *nso; + + while ((nso = list_remove_head(list)) != NULL) { + nso->so_listener = NULL; + if (doclose) { + (void) socket_close(nso, 0, CRED()); + } else { + /* + * Only used for fallback - not possible when filters + * are present. + */ + ASSERT(so->so_filter_active == 0); + /* + * Since the socket is on the accept queue, there can + * only be one reference. We drop the reference and + * just blow off the socket. + */ + ASSERT(nso->so_count == 1); + nso->so_count--; + /* drop the proto ref */ + VN_RELE(SOTOV(nso)); + } + socket_destroy(nso); + } +} /* * void so_acceptq_flush(struct sonode *so, boolean_t doclose) * * Removes all pending connections from a listening socket, and * frees the associated resources. @@ -183,7 +165,6 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock, * Arguments * so - listening socket * doclose - make a close downcall for each socket on the accept queue - * (Note, only SCTP and SDP sockets rely on this) * * Return values: * None. @@ -197,28 +178,9 @@ so_acceptq_dequeue(struct sonode *so, boolean_t dontblock, void so_acceptq_flush(struct sonode *so, boolean_t doclose) { - struct sonode *nso; + so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose); + so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose); - while ((nso = so->so_acceptq_head) != NULL) { - so->so_acceptq_head = nso->so_acceptq_next; - nso->so_acceptq_next = NULL; - - if (doclose) { - (void) socket_close(nso, 0, CRED()); - } else { - /* - * Since the socket is on the accept queue, there can - * only be one reference. We drop the reference and - * just blow off the socket.
- */ - ASSERT(nso->so_count == 1); - nso->so_count--; - } - socket_destroy(nso); - } - - so->so_acceptq_head = NULL; - so->so_acceptq_tail = &so->so_acceptq_head; so->so_acceptq_len = 0; } @@ -296,7 +258,7 @@ so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock) int error; ASSERT(MUTEX_HELD(&so->so_lock)); - while (so->so_snd_qfull) { + while (SO_SND_FLOWCTRLD(so)) { if (so->so_state & SS_CANTSENDMORE) return (EPIPE); if (dontblock) @@ -334,11 +296,9 @@ so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock) int error = 0; mutex_enter(&so->so_lock); - if (so->so_snd_qfull) { - so->so_snd_wakeup = B_TRUE; - error = so_snd_wait_qnotfull_locked(so, dontblock); - so->so_snd_wakeup = B_FALSE; - } + so->so_snd_wakeup = B_TRUE; + error = so_snd_wait_qnotfull_locked(so, dontblock); + so->so_snd_wakeup = B_FALSE; mutex_exit(&so->so_lock); return (error); @@ -601,8 +561,13 @@ so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail) void so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head) { + if (so->so_filter_active > 0 && + (mp_head = sof_filter_data_in_proc(so, mp_head, + &mp_last_head)) == NULL) + return; + ASSERT(mp_head->b_prev != NULL); - if (so->so_rcv_q_head == NULL) { + if (so->so_rcv_q_head == NULL) { so->so_rcv_q_head = mp_head; so->so_rcv_q_last_head = mp_last_head; ASSERT(so->so_rcv_q_last_head->b_prev != NULL); @@ -650,13 +615,13 @@ so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head) * Check flow control on a given sonode. Must have so_lock held, and * this function will release the hold. 
*/ - -static void +void so_check_flow_control(struct sonode *so) { ASSERT(MUTEX_HELD(&so->so_lock)); - if (so->so_flowctrld && so->so_rcv_queued < so->so_rcvlowat) { + if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat && + !(so->so_state & SS_FIL_RCV_FLOWCTRL))) { so->so_flowctrld = B_FALSE; mutex_exit(&so->so_lock); /* @@ -668,6 +633,8 @@ so_check_flow_control(struct sonode *so) (*so->so_downcalls->sd_clr_flowctrl) (so->so_proto_handle); } + /* filters can start injecting data */ + sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0); } else { mutex_exit(&so->so_lock); } @@ -1116,7 +1083,7 @@ so_rcv_flush(struct sonode *so) } /* - * Free messages sitting in the send and recv queue + * Free messages sitting in the recv queues */ while (so->so_rcv_q_head != NULL) { mp = so->so_rcv_q_head; @@ -1313,10 +1280,28 @@ socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr) so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS; mutex_exit(&pso->so_lock); + + /* + * If the parent has any filters, try to inherit them. + */ + if (pso->so_filter_active > 0 && + (error = sof_sonode_inherit_filters(so, pso)) != 0) + return (error); + } else { struct sockparams *sp = so->so_sockparams; sock_upcalls_t *upcalls_to_use; + /* + * Attach automatic filters, if there are any. + */ + if (!list_is_empty(&sp->sp_auto_filters) && + (error = sof_sonode_autoattach_filters(so, cr)) != 0) + return (error); + + /* OK to attach filters */ + so->so_state |= SS_FILOP_OK; + /* * Based on the version number select the right upcalls to * pass down. 
Currently we only have one version so choose @@ -1384,6 +1369,9 @@ socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr) if (uioasync.enabled) sod_sock_init(so); + /* put an extra reference on the socket for the protocol */ + VN_HOLD(SOTOV(so)); + return (0); } @@ -1812,6 +1800,22 @@ socket_getopt_common(struct sonode *so, int level, int option_name, *optlenp = sizeof (struct so_snd_bufinfo); return (0); } + case SO_SND_COPYAVOID: { + sof_instance_t *inst; + + /* + * Avoid zero-copy if there is a filter with a data_out + * callback. We could let the operation succeed, but then + * the filter would have to copy the data anyway. + */ + for (inst = so->so_filter_top; inst != NULL; + inst = inst->sofi_next) { + if (SOF_INTERESTED(inst, data_out)) + return (EOPNOTSUPP); + } + break; + } + default: break; } @@ -1982,15 +1986,19 @@ so_end_fallback(struct sonode *so) * We do not need to hold so_lock, since there can be only one thread * operating on the sonode. */ -static void -so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, - struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen, +static mblk_t * +so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg, + struct T_capability_ack *tcap, + struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr, socklen_t faddrlen, short opts) { struct sonode *so = (struct sonode *)sock_handle; boolean_t atmark; + mblk_t *retmp = NULL, **tailmpp = &retmp; - sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts); + if (tcap != NULL) + sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, + opts); /* * Some protocols do not quiece the data path during fallback. 
Once @@ -2038,9 +2046,9 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, */ if (atmark) { struct T_exdata_ind *tei; - mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp; + mblk_t *mp1 = arg->soqa_exdata_mp; - SOTOTPI(so)->sti_exdata_mp = NULL; + arg->soqa_exdata_mp = NULL; ASSERT(mp1 != NULL); mp1->b_datap->db_type = M_PROTO; tei = (struct T_exdata_ind *)mp1->b_rptr; @@ -2101,7 +2109,8 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, * Queue data on the STREAM head. */ so->so_rcv_queued -= mlen; - putnext(q, mp); + *tailmpp = mp; + tailmpp = &mp->b_next; } so->so_rcv_head = NULL; so->so_rcv_last_head = NULL; @@ -2121,8 +2130,8 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, if (atmark && so->so_oobmsg != NULL) { struct T_exdata_ind *tei; - mp = SOTOTPI(so)->sti_exdata_mp; - SOTOTPI(so)->sti_exdata_mp = NULL; + mp = arg->soqa_exdata_mp; + arg->soqa_exdata_mp = NULL; ASSERT(mp != NULL); mp->b_datap->db_type = M_PROTO; tei = (struct T_exdata_ind *)mp->b_rptr; @@ -2133,38 +2142,32 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, mp->b_cont = so->so_oobmsg; so->so_oobmsg = NULL; - putnext(q, mp); + *tailmpp = mp; + tailmpp = &mp->b_next; } else { /* Send up the signal */ - mp = SOTOTPI(so)->sti_exdata_mp; - SOTOTPI(so)->sti_exdata_mp = NULL; + mp = arg->soqa_exdata_mp; + arg->soqa_exdata_mp = NULL; ASSERT(mp != NULL); DB_TYPE(mp) = M_PCSIG; *mp->b_wptr++ = (uchar_t)SIGURG; - putnext(q, mp); + *tailmpp = mp; + tailmpp = &mp->b_next; /* Send up the mark indicator */ - mp = SOTOTPI(so)->sti_urgmark_mp; - SOTOTPI(so)->sti_urgmark_mp = NULL; + mp = arg->soqa_urgmark_mp; + arg->soqa_urgmark_mp = NULL; mp->b_flag = atmark ? 
MSGMARKNEXT : MSGNOTMARKNEXT; - putnext(q, mp); + *tailmpp = mp; + tailmpp = &mp->b_next; so->so_oobmark = 0; } } - - if (SOTOTPI(so)->sti_exdata_mp != NULL) { - freeb(SOTOTPI(so)->sti_exdata_mp); - SOTOTPI(so)->sti_exdata_mp = NULL; - } - - if (SOTOTPI(so)->sti_urgmark_mp != NULL) { - freeb(SOTOTPI(so)->sti_urgmark_mp); - SOTOTPI(so)->sti_urgmark_mp = NULL; - } - ASSERT(so->so_oobmark == 0); ASSERT(so->so_rcv_queued == 0); + + return (retmp); } #ifdef DEBUG @@ -2203,7 +2206,8 @@ so_integrity_check(struct sonode *cur, struct sonode *orig) VERIFY(cur->so_version == orig->so_version); /* New conns might have arrived, but none should have been lost */ VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len); - VERIFY(cur->so_acceptq_head == orig->so_acceptq_head); + VERIFY(list_head(&cur->so_acceptq_list) == + list_head(&orig->so_acceptq_list)); VERIFY(cur->so_backlog == orig->so_backlog); /* New OOB migth have arrived, but mark should not have been lost */ VERIFY(cur->so_oobmark >= orig->so_oobmark); @@ -2243,8 +2247,10 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) struct sockparams *sp; struct sockparams *newsp = NULL; so_proto_fallback_func_t fbfunc; + const char *devpath; boolean_t direct; struct sonode *nso; + sock_quiesce_arg_t arg = { NULL, NULL }; #ifdef DEBUG struct sonode origso; #endif @@ -2253,10 +2259,27 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Fallback can only happen if there is a device associated - * with the sonode, and the socket module has a fallback function. 
+ * Cannot fallback if the socket has active filters + */ + if (so->so_filter_active > 0) + return (EINVAL); + + switch (so->so_family) { + case AF_INET: + devpath = sp->sp_smod_info->smod_fallback_devpath_v4; + break; + case AF_INET6: + devpath = sp->sp_smod_info->smod_fallback_devpath_v6; + break; + default: + return (EINVAL); + } + + /* + * Fallback can only happen if the socket module has a TPI device + * and fallback function. */ - if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL) + if (devpath == NULL || fbfunc == NULL) return (EINVAL); /* @@ -2276,8 +2299,7 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) sp->sp_stats.sps_nfallback.value.ui64++; newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type, - so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath, - KM_SLEEP, &error); + so->so_protocol, devpath, KM_SLEEP, &error); if (error != 0) goto out; @@ -2295,14 +2317,30 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) error = sotpi_convert_sonode(so, newsp, &direct, &q, cr); if (error != 0) goto out; - + /* + * When it comes to urgent data we have two cases to deal with; + * (1) The oob byte has already arrived, or (2) the protocol has + * notified that oob data is pending, but it has not yet arrived. + * + * For (1) all we need to do is send a T_EXDATA_IND to indicate were + * in the byte stream the oob byte is. For (2) we have to send a + * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether + * the oob byte will be the next byte from the protocol. + * + * So in the worst case we need two mblks, one for the signal, another + * for mark indication. In that case we use the exdata_mp for the sig. + */ + arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), + BPRI_MED, STR_NOSIG, NULL); + arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); /* * Now tell the protocol to start using TPI. so_quiesced_cb be * called once it's safe to synchronize state. 
*/ DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so); - error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb); + error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb, + &arg); DTRACE_PROBE1(proto__fallback__end, struct sonode *, so); if (error != 0) { @@ -2315,19 +2353,40 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) * Walk the accept queue and notify the proto that they should * fall back to TPI. The protocol will send up the T_CONN_IND. */ - nso = so->so_acceptq_head; + nso = list_head(&so->so_acceptq_list); while (nso != NULL) { int rval; + struct sonode *next; + + if (arg.soqa_exdata_mp == NULL) { + arg.soqa_exdata_mp = + allocb_wait(sizeof (struct T_exdata_ind), + BPRI_MED, STR_NOSIG, NULL); + } + if (arg.soqa_urgmark_mp == NULL) { + arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, + STR_NOSIG, NULL); + } DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso); - rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL); + rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, + so_quiesced_cb, &arg); DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso); if (rval != 0) { + /* Abort the connection */ zcmn_err(getzoneid(), CE_WARN, "Failed to convert socket in accept queue to TPI. " "Pid = %d\n", curproc->p_pid); + next = list_next(&so->so_acceptq_list, nso); + list_remove(&so->so_acceptq_list, nso); + so->so_acceptq_len--; + + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + nso = next; + } else { + nso = list_next(&so->so_acceptq_list, nso); } - nso = nso->so_acceptq_next; } /* @@ -2352,6 +2411,14 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) * the STREAMS head). */ pollwakeup(&so->so_poll_list, POLLERR); + + /* + * When this non-STREAM socket was created we placed an extra ref on + * the associated vnode to support asynchronous close. Drop that ref + * here. 
+ */ + ASSERT(SOTOV(so)->v_count >= 2); + VN_RELE(SOTOV(so)); out: so_end_fallback(so); @@ -2365,6 +2432,10 @@ out: if (newsp != NULL) SOCKPARAMS_DEC_REF(newsp); } + if (arg.soqa_exdata_mp != NULL) + freemsg(arg.soqa_exdata_mp); + if (arg.soqa_urgmark_mp != NULL) + freemsg(arg.soqa_urgmark_mp); return (error); } diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c new file mode 100644 index 0000000000..f4d4f9e922 --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -0,0 +1,1770 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Socket Filter Framework + * + * Socket filter entry (sof_entry_t): + * + * There exists one entry for each configured filter (done via soconfig(1M)), + * and they are all in sof_entry_list. In addition to the global list, each + * sockparams entry maintains a list of filters that is interested in that + * particular socket type. 
So the filter entry may be referenced by multiple + * sockparams. The set of sockparams referencing a filter may change as + * socket types are added and/or removed from the system. Both sof_entry_list + * and the sockparams list are protected by sockconf_lock. + * + * Each filter entry has a ref count which is incremented whenever a filter + * is attached to a socket. An entry is marked SOFEF_CONDEMED when it is + * unconfigured, which will result in the entry being freed when its ref + * count reaches zero. + * + * Socket filter module (sof_module_t): + * + * Modules are created by sof_register() and placed in sof_module_list, + * which is protected by sof_module_lock. Each module has a reference count + * that is incremented when a filter entry is using the module. A module + * can be destroyed by sof_register() only when its ref count is zero. + * + * Socket filter instance (sof_instance_t): + * + * Whenever a filter is attached to a socket (sonode), a new instance is + * created. The socket is guaranteed to be single threaded when filters are + * being attached/detached. The instance uses the sonode's so_lock for + * protection. + * + * The lifetime of an instance is the same as the socket it's attached to. + * + * How things link together: + * + * sockparams.sp_{auto,prog}_filters -> sp_filter_t -> sp_filter_t + * ^ | | + * | | | + * sonode.so_filter_top -> sof_instance_t | | + * | | | + * v v v + * sof_entry_list -> sof_entry_t -> sof_entry -> ... -> sof_entry_t + * | + * v + * sof_module_list -> sof_module_t -> ... 
-> sof_module_t + */ + +static list_t sof_entry_list; /* list of configured filters */ + +static list_t sof_module_list; /* list of loaded filter modules */ +static kmutex_t sof_module_lock; /* protect the module list */ + +static sof_kstat_t sof_stat; +static kstat_t *sof_stat_ksp; + +#ifdef DEBUG +static int socket_filter_debug = 0; +#endif + +/* + * A connection that has been deferred for more than `sof_defer_drop_time' + * ticks can be dropped to make room for new connections. A connection that + * is to be dropped is moved over to `sof_close_deferred_list' where it will + * be closed by sof_close_deferred() (which is running on a taskq). Connections + * will not be moved over to the close list if it grows larger than + * `sof_close_deferred_max_backlog'. + */ +clock_t sof_defer_drop_time = 3000; +uint_t sof_close_deferred_max_backlog = 1000; + +taskq_t *sof_close_deferred_taskq; +boolean_t sof_close_deferred_running; +uint_t sof_close_deferred_backlog; +list_t sof_close_deferred_list; +kmutex_t sof_close_deferred_lock; + +static void sof_close_deferred(void *); + +static void sof_module_rele(sof_module_t *); +static sof_module_t *sof_module_hold_by_name(const char *, const char *); + +static int sof_entry_load_module(sof_entry_t *); +static void sof_entry_hold(sof_entry_t *); +static void sof_entry_rele(sof_entry_t *); +static int sof_entry_kstat_create(sof_entry_t *); +static void sof_entry_kstat_destroy(sof_entry_t *); + +static sof_instance_t *sof_instance_create(sof_entry_t *, struct sonode *); +static void sof_instance_destroy(sof_instance_t *); + +static int +sof_kstat_update(kstat_t *ksp, int rw) +{ + _NOTE(ARGUNUSED(ksp)); + + if (rw == KSTAT_WRITE) + return (EACCES); + + sof_stat.sofks_defer_close_backlog.value.ui64 = + sof_close_deferred_backlog; + + return (0); +} + +void +sof_init(void) +{ + list_create(&sof_entry_list, sizeof (sof_entry_t), + offsetof(sof_entry_t, sofe_node)); + list_create(&sof_module_list, sizeof (sof_module_t), + 
offsetof(sof_module_t, sofm_node)); + list_create(&sof_close_deferred_list, sizeof (struct sonode), + offsetof(struct sonode, so_acceptq_node)); + + sof_close_deferred_taskq = taskq_create("sof_close_deferred_taskq", + 1, minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE); + sof_close_deferred_running = B_FALSE; + sof_close_deferred_backlog = 0; + + mutex_init(&sof_close_deferred_lock, NULL, MUTEX_DEFAULT, 0); + mutex_init(&sof_module_lock, NULL, MUTEX_DEFAULT, 0); + + sof_stat_ksp = kstat_create("sockfs", 0, "sockfilter", "misc", + KSTAT_TYPE_NAMED, sizeof (sof_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (sof_stat_ksp == NULL) + return; + + kstat_named_init(&sof_stat.sofks_defer_closed, "defer_closed", + KSTAT_DATA_UINT64); + kstat_named_init(&sof_stat.sofks_defer_close_backlog, + "defer_close_backlog", KSTAT_DATA_UINT64); + kstat_named_init(&sof_stat.sofks_defer_close_failed_backlog_too_big, + "defer_close_failed_backlog_too_big", KSTAT_DATA_UINT64); + + sof_stat_ksp->ks_data = &sof_stat; + sof_stat_ksp->ks_update = sof_kstat_update; + kstat_install(sof_stat_ksp); +} + +/* + * Process filter options. + */ +static int +sof_setsockopt_impl(struct sonode *so, int option_name, + const void *optval, socklen_t optlen, struct cred *cr) +{ + struct sockparams *sp = so->so_sockparams; + sof_entry_t *ent = NULL; + sp_filter_t *fil; + sof_instance_t *inst; + sof_rval_t rval; + int error; + + _NOTE(ARGUNUSED(optlen)); + + /* + * Is the filter in a state where filters can be attached? + */ + if (!(so->so_state & SS_FILOP_OK)) + return (EINVAL); + + if (option_name == FIL_ATTACH) { + /* + * Make sure there isn't already another instance of the + * same filter attached to the socket. + */ + for (inst = so->so_filter_top; inst != NULL; + inst = inst->sofi_next) { + if (strncmp(inst->sofi_filter->sofe_name, + (const char *)optval, SOF_MAXNAMELEN) == 0) + return (EEXIST); + } + /* Look up the filter. 
*/ + rw_enter(&sockconf_lock, RW_READER); + for (fil = list_head(&sp->sp_prog_filters); fil != NULL; + fil = list_next(&sp->sp_prog_filters, fil)) { + ent = fil->spf_filter; + ASSERT(ent->sofe_flags & SOFEF_PROG); + + if (strncmp(ent->sofe_name, (const char *)optval, + SOF_MAXNAMELEN) == 0) + break; + } + /* No such filter */ + if (fil == NULL) { + rw_exit(&sockconf_lock); + return (ENOENT); + } + inst = sof_instance_create(ent, so); + rw_exit(&sockconf_lock); + + /* Failed to create an instance; must be out of memory */ + if (inst == NULL) + return (ENOMEM); + + /* + * This might be the first time the filter is being used, + * so try to load the module if it's not already registered. + */ + if (ent->sofe_mod == NULL && + (error = sof_entry_load_module(ent)) != 0) { + sof_instance_destroy(inst); + return (error); + } + + /* Module loaded OK, so there must be an ops vector */ + ASSERT(ent->sofe_mod != NULL); + inst->sofi_ops = &ent->sofe_mod->sofm_ops; + + SOF_STAT_ADD(inst, tot_active_attach, 1); + if (inst->sofi_ops->sofop_attach_active != NULL) { + rval = inst->sofi_ops->sofop_attach_active( + (sof_handle_t)inst, so->so_family, so->so_type, + so->so_protocol, cr, &inst->sofi_cookie); + if (rval != SOF_RVAL_CONTINUE) { + sof_instance_destroy(inst); + switch (rval) { + case SOF_RVAL_DETACH: + /* + * Filter does not want to to attach. + * An error is returned so the user + * knows the request did not go + * through. 
+ */ + error = EINVAL; + break; + default: + SOF_STAT_ADD(inst, attach_failures, 1); + /* Not a valid rval for active attach */ + ASSERT(rval != SOF_RVAL_DEFER); + error = sof_rval2errno(rval); + break; + } + return (error); + } + } + return (0); + } else if (option_name == FIL_DETACH) { + for (inst = so->so_filter_top; inst != NULL; + inst = inst->sofi_next) { + + ent = inst->sofi_filter; + if (strncmp(ent->sofe_name, (const char *)optval, + SOF_MAXNAMELEN) == 0) + break; + } + if (inst == NULL) + return (ENXIO); + + /* automatic filters cannot be detached */ + if (inst->sofi_filter->sofe_flags & SOFEF_AUTO) + return (EINVAL); + + if (inst->sofi_ops->sofop_detach != NULL) + inst->sofi_ops->sofop_detach((sof_handle_t)inst, + inst->sofi_cookie, cr); + sof_instance_destroy(inst); + + return (0); + } else { + return (EINVAL); + } +} + +int +sof_setsockopt(struct sonode *so, int option_name, + const void *optval, socklen_t optlen, struct cred *cr) +{ + int error; + + /* + * By grabbing the lock as a writer we ensure that no other socket + * operations can start while the filter stack is being manipulated. + * + * We do a tryenter so that in case there is an active thread we + * ask the caller to try again instead of blocking here until the + * other thread is done (which could be indefinitely in case of recv). + */ + if (!rw_tryenter(&so->so_fallback_rwlock, RW_WRITER)) { + return (EAGAIN); + } + + /* Bail out if a fallback has taken place */ + if (so->so_state & SS_FALLBACK_COMP) + error = EINVAL; + else + error = sof_setsockopt_impl(so, option_name, optval, + optlen, cr); + rw_exit(&so->so_fallback_rwlock); + + return (error); +} + +/* + * Get filter socket options. 
+ */ +static int +sof_getsockopt_impl(struct sonode *so, int option_name, + void *optval, socklen_t *optlenp, struct cred *cr) +{ + sof_instance_t *inst; + struct fil_info *fi; + socklen_t maxsz = *optlenp; + int i; + uint_t cnt; + + _NOTE(ARGUNUSED(cr)); + + if (option_name == FIL_LIST) { + fi = (struct fil_info *)optval; + + if (maxsz < sizeof (*fi)) + return (EINVAL); + + for (inst = so->so_filter_top, cnt = 0; inst != NULL; + inst = inst->sofi_next) + cnt++; + for (inst = so->so_filter_top, i = 0; + inst != NULL && (i+1) * sizeof (*fi) <= maxsz; + inst = inst->sofi_next, i++) { + fi[i].fi_flags = + (inst->sofi_filter->sofe_flags & SOFEF_AUTO) ? + FILF_AUTO : FILF_PROG; + if (inst->sofi_flags & SOFIF_BYPASS) + fi[i].fi_flags |= FILF_BYPASS; + (void) strncpy(fi[i].fi_name, + inst->sofi_filter->sofe_name, FILNAME_MAX); + ASSERT(cnt > 0); + fi[i].fi_pos = --cnt; + } + *optlenp = i * sizeof (*fi); + return (0); + } else { + return (EINVAL); + } +} + +int +sof_getsockopt(struct sonode *so, int option_name, + void *optval, socklen_t *optlenp, struct cred *cr) +{ + int error; + + /* + * The fallback lock is used here to serialize set and get + * filter operations. + */ + rw_enter(&so->so_fallback_rwlock, RW_READER); + if (so->so_state & SS_FALLBACK_COMP) + error = EINVAL; + else + error = sof_getsockopt_impl(so, option_name, optval, optlenp, + cr); + rw_exit(&so->so_fallback_rwlock); + + return (error); +} + +/* + * The socket `so' wants to inherit the filter stack from `pso'. + * Returns 0 if all went well or an errno otherwise. 
+ */ +int +sof_sonode_inherit_filters(struct sonode *so, struct sonode *pso) +{ + sof_instance_t *inst, *pinst; + sof_rval_t rval; + int error; + struct sockaddr_in6 laddrbuf, faddrbuf; + struct sockaddr_in6 *laddr, *faddr; + socklen_t laddrlen, faddrlen; + + /* + * Make sure there is enough room to retrieve the addresses + */ + if (so->so_proto_props.sopp_maxaddrlen > sizeof (laddrbuf)) { + laddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen, + KM_NOSLEEP); + if (laddr == NULL) + return (ENOMEM); + faddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen, + KM_NOSLEEP); + if (faddr == NULL) { + kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen); + return (ENOMEM); + } + laddrlen = faddrlen = so->so_proto_props.sopp_maxaddrlen; + } else { + laddrlen = faddrlen = sizeof (laddrbuf); + laddr = &laddrbuf; + faddr = &faddrbuf; + } + + error = (*so->so_downcalls->sd_getpeername) + (so->so_proto_handle, (struct sockaddr *)faddr, &faddrlen, kcred); + if (error != 0) + goto out; + error = (*so->so_downcalls->sd_getsockname) + (so->so_proto_handle, (struct sockaddr *)laddr, &laddrlen, kcred); + if (error != 0) + goto out; + + /* + * The stack is built bottom up. Filters are allowed to modify the + * the foreign and local addresses during attach. + */ + for (pinst = pso->so_filter_bottom; + pinst != NULL && !(pinst->sofi_flags & SOFIF_BYPASS); + pinst = pinst->sofi_prev) { + inst = sof_instance_create(pinst->sofi_filter, so); + if (inst == NULL) { + error = ENOMEM; + goto out; + } + /* + * The filter module must be loaded since it's already + * attached to the listener. 
+ */ + ASSERT(pinst->sofi_ops != NULL); + inst->sofi_ops = pinst->sofi_ops; + + SOF_STAT_ADD(inst, tot_passive_attach, 1); + if (inst->sofi_ops->sofop_attach_passive != NULL) { + rval = inst->sofi_ops->sofop_attach_passive( + (sof_handle_t)inst, + (sof_handle_t)pinst, pinst->sofi_cookie, + (struct sockaddr *)laddr, laddrlen, + (struct sockaddr *)faddr, faddrlen, + &inst->sofi_cookie); + if (rval != SOF_RVAL_CONTINUE) { + if (rval == SOF_RVAL_DEFER) { + mutex_enter(&so->so_lock); + inst->sofi_flags |= SOFIF_DEFER; + so->so_state |= SS_FIL_DEFER; + mutex_exit(&so->so_lock); + so->so_filter_defertime = + ddi_get_lbolt(); + SOF_STAT_ADD(inst, ndeferred, 1); + } else if (rval == SOF_RVAL_DETACH) { + sof_instance_destroy(inst); + } else { + SOF_STAT_ADD(inst, attach_failures, 1); + error = sof_rval2errno(rval); + /* + * Filters that called attached will be + * destroyed when the socket goes away, + * after detach is called. + */ + goto out; + } + } + } + } + +out: + if (laddr != &laddrbuf) { + kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen); + kmem_free(faddr, so->so_proto_props.sopp_maxaddrlen); + } + return (error); +} + +/* + * Attach any automatic filters to sonode `so'. Returns 0 if all went well + * and an errno otherwise. + */ +int +sof_sonode_autoattach_filters(struct sonode *so, cred_t *cr) +{ + struct sockparams *sp = so->so_sockparams; + sp_filter_t *fil; + sof_instance_t *inst; + sof_rval_t rval; + int error; + + /* + * A created instance is added to the top of the sonode's filter + * stack, so traverse the config list in reverse order. 
+ */ + rw_enter(&sockconf_lock, RW_READER); + for (fil = list_tail(&sp->sp_auto_filters); + fil != NULL; fil = list_prev(&sp->sp_auto_filters, fil)) { + ASSERT(fil->spf_filter->sofe_flags & SOFEF_AUTO); + if (!sof_instance_create(fil->spf_filter, so)) { + rw_exit(&sockconf_lock); + error = ENOMEM; /* must have run out of memory */ + goto free_all; + } + } + rw_exit(&sockconf_lock); + + /* + * Notify each filter that it's being attached. + */ + inst = so->so_filter_top; + while (inst != NULL) { + sof_entry_t *ent = inst->sofi_filter; + sof_instance_t *ninst = inst->sofi_next; + + /* + * This might be the first time the filter is being used, + * so try to load the module if it's not already registered. + */ + if (ent->sofe_mod == NULL && + (error = sof_entry_load_module(ent)) != 0) + goto free_detached; + + /* Module loaded OK, so there must be an ops vector */ + ASSERT(ent->sofe_mod != NULL); + inst->sofi_ops = &ent->sofe_mod->sofm_ops; + + SOF_STAT_ADD(inst, tot_active_attach, 1); + if (inst->sofi_ops->sofop_attach_active != NULL) { + rval = inst->sofi_ops->sofop_attach_active( + (sof_handle_t)inst, so->so_family, so->so_type, + so->so_protocol, cr, &inst->sofi_cookie); + if (rval != SOF_RVAL_CONTINUE) { + switch (rval) { + case SOF_RVAL_DETACH: + /* filter does not want to attach */ + sof_instance_destroy(inst); + break; + default: + SOF_STAT_ADD(inst, attach_failures, 1); + /* Not a valid rval for active attach */ + ASSERT(rval != SOF_RVAL_DEFER); + error = sof_rval2errno(rval); + goto free_detached; + } + } + } + inst = ninst; + } + return (0); + +free_all: + inst = so->so_filter_top; +free_detached: + ASSERT(inst != NULL); + /* + * Destroy all filters for which attach was not called. The other + * filters will be destroyed (and detach called) when the socket + * is freed. 
+ */ + do { + sof_instance_t *t = inst->sofi_next; + sof_instance_destroy(inst); + inst = t; + } while (inst != NULL); + + return (error); +} + +/* + * Detaches and frees all filters attached to sonode `so'. + */ +void +sof_sonode_cleanup(struct sonode *so) +{ + sof_instance_t *inst; + + while ((inst = so->so_filter_top) != NULL) { + (inst->sofi_ops->sofop_detach)((sof_handle_t)inst, + inst->sofi_cookie, kcred); + sof_instance_destroy(inst); + } +} + +/* + * Notifies all active filters attached to `so' about the `event' and + * where `arg' is an event specific argument. + */ +void +sof_sonode_notify_filters(struct sonode *so, sof_event_t event, uintptr_t arg) +{ + sof_instance_t *inst; + + for (inst = so->so_filter_bottom; inst != NULL; + inst = inst->sofi_prev) { + if (SOF_INTERESTED(inst, notify)) + (inst->sofi_ops->sofop_notify)((sof_handle_t)inst, + inst->sofi_cookie, event, arg); + } +} + +/* + * The socket `so' is closing. Notify filters and make sure that there + * are no pending tx operations. + */ +void +sof_sonode_closing(struct sonode *so) +{ + /* + * Notify filters that the socket is being closed. It's OK for + * filters to inject data. + */ + sof_sonode_notify_filters(so, SOF_EV_CLOSING, (uintptr_t)B_TRUE); + + /* wait for filters that are sending out data */ + mutex_enter(&so->so_lock); + while (so->so_filter_tx > 0) + cv_wait(&so->so_closing_cv, &so->so_lock); + mutex_exit(&so->so_lock); +} + +/* + * Called when socket `so' wants to get rid of a deferred connection. + * Returns TRUE if a connection was dropped. 
+ */ +boolean_t +sof_sonode_drop_deferred(struct sonode *so) +{ + struct sonode *def; + clock_t now = ddi_get_lbolt(); + + if (sof_close_deferred_backlog > sof_close_deferred_max_backlog) { + SOF_GLOBAL_STAT_BUMP(defer_close_failed_backlog_too_big); + return (B_FALSE); + } + mutex_enter(&so->so_acceptq_lock); + if ((def = list_head(&so->so_acceptq_defer)) != NULL && + (now - def->so_filter_defertime) > sof_defer_drop_time) { + list_remove(&so->so_acceptq_defer, def); + so->so_acceptq_len--; + mutex_exit(&so->so_acceptq_lock); + def->so_listener = NULL; + } else { + mutex_exit(&so->so_acceptq_lock); + return (B_FALSE); + } + + mutex_enter(&sof_close_deferred_lock); + list_insert_tail(&sof_close_deferred_list, def); + sof_close_deferred_backlog++; + if (!sof_close_deferred_running) { + mutex_exit(&sof_close_deferred_lock); + (void) taskq_dispatch(sof_close_deferred_taskq, + sof_close_deferred, NULL, TQ_NOSLEEP); + } else { + mutex_exit(&sof_close_deferred_lock); + } + return (B_TRUE); +} + +/* + * Called from a taskq to close connections that have been deferred for + * too long. + */ +void +sof_close_deferred(void *unused) +{ + struct sonode *drop; + + _NOTE(ARGUNUSED(unused)); + + mutex_enter(&sof_close_deferred_lock); + if (!sof_close_deferred_running) { + sof_close_deferred_running = B_TRUE; + while ((drop = + list_remove_head(&sof_close_deferred_list)) != NULL) { + sof_close_deferred_backlog--; + mutex_exit(&sof_close_deferred_lock); + + SOF_GLOBAL_STAT_BUMP(defer_closed); + (void) socket_close(drop, 0, kcred); + socket_destroy(drop); + + mutex_enter(&sof_close_deferred_lock); + } + sof_close_deferred_running = B_FALSE; + ASSERT(sof_close_deferred_backlog == 0); + } + mutex_exit(&sof_close_deferred_lock); +} + +/* + * Creates a new filter instance from the entry `ent' and attaches + * it to the sonode `so'. On success, return a pointer to the created + * instance. + * + * The new instance will be placed on the top of the filter stack. 
+ * + * The caller is responsible for assigning the instance's ops vector and + * calling the filter's attach callback. + * + * No locks are held while manipulating the sonode fields because we are + * guaranteed that this operation is serialized. + * + * We can be sure that the entry `ent' will not disappear, because the + * caller is either holding sockconf_lock (in case of an active open), or is + * already holding a reference (in case of a passive open, the listener has + * one). + */ +static sof_instance_t * +sof_instance_create(sof_entry_t *ent, struct sonode *so) +{ + sof_instance_t *inst; + + inst = kmem_zalloc(sizeof (sof_instance_t), KM_NOSLEEP); + if (inst == NULL) + return (NULL); + sof_entry_hold(ent); + inst->sofi_filter = ent; + inst->sofi_sonode = so; + + inst->sofi_next = so->so_filter_top; + if (so->so_filter_top != NULL) + so->so_filter_top->sofi_prev = inst; + else + so->so_filter_bottom = inst; + so->so_filter_top = inst; + so->so_filter_active++; + + return (inst); +} +/* + * Destroys the filter instance `inst' and unlinks it from the sonode. + * + * Any filter private state must be destroyed (via the detach callback) + * before the instance is destroyed. 
+ */ +static void +sof_instance_destroy(sof_instance_t *inst) +{ + struct sonode *so = inst->sofi_sonode; + + ASSERT(inst->sofi_sonode != NULL); + ASSERT(inst->sofi_filter != NULL); + ASSERT(inst->sofi_prev != NULL || so->so_filter_top == inst); + ASSERT(inst->sofi_next != NULL || so->so_filter_bottom == inst); + + if (inst->sofi_prev != NULL) + inst->sofi_prev->sofi_next = inst->sofi_next; + else + so->so_filter_top = inst->sofi_next; + + if (inst->sofi_next != NULL) + inst->sofi_next->sofi_prev = inst->sofi_prev; + else + so->so_filter_bottom = inst->sofi_prev; + + if (!(inst->sofi_flags & SOFIF_BYPASS)) { + ASSERT(so->so_filter_active > 0); + so->so_filter_active--; + } + if (inst->sofi_flags & SOFIF_DEFER) + SOF_STAT_ADD(inst, ndeferred, -1); + sof_entry_rele(inst->sofi_filter); + kmem_free(inst, sizeof (sof_instance_t)); +} + +static sof_entry_t * +sof_entry_find(const char *name) +{ + sof_entry_t *ent; + + for (ent = list_head(&sof_entry_list); ent != NULL; + ent = list_next(&sof_entry_list, ent)) { + if (strncmp(ent->sofe_name, name, SOF_MAXNAMELEN) == 0) + return (ent); + } + return (NULL); +} + +void +sof_entry_free(sof_entry_t *ent) +{ + ASSERT(ent->sofe_refcnt == 0); + ASSERT(!list_link_active(&ent->sofe_node)); + + if (ent->sofe_hintarg != NULL) { + ASSERT(ent->sofe_hint == SOF_HINT_BEFORE || + ent->sofe_hint == SOF_HINT_AFTER); + kmem_free(ent->sofe_hintarg, strlen(ent->sofe_hintarg) + 1); + ent->sofe_hintarg = NULL; + } + if (ent->sofe_socktuple_cnt > 0) { + ASSERT(ent->sofe_socktuple != NULL); + kmem_free(ent->sofe_socktuple, + sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt); + ent->sofe_socktuple = NULL; + ent->sofe_socktuple_cnt = 0; + } + sof_entry_kstat_destroy(ent); + + mutex_destroy(&ent->sofe_lock); + kmem_free(ent, sizeof (sof_entry_t)); +} + +static int +sof_entry_kstat_update(kstat_t *ksp, int rw) +{ + sof_entry_t *ent = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ent->sofe_kstat.sofek_nactive.value.ui64 = 
ent->sofe_refcnt; + + return (0); +} + +/* + * Create the kstat for filter entry `ent'. + */ +static int +sof_entry_kstat_create(sof_entry_t *ent) +{ + char name[SOF_MAXNAMELEN + 7]; + + (void) snprintf(name, sizeof (name), "filter_%s", ent->sofe_name); + ent->sofe_ksp = kstat_create("sockfs", 0, name, "misc", + KSTAT_TYPE_NAMED, + sizeof (sof_entry_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ent->sofe_ksp == NULL) + return (ENOMEM); + + kstat_named_init(&ent->sofe_kstat.sofek_nactive, "nactive", + KSTAT_DATA_UINT64); + kstat_named_init(&ent->sofe_kstat.sofek_tot_active_attach, + "tot_active_attach", KSTAT_DATA_UINT64); + kstat_named_init(&ent->sofe_kstat.sofek_tot_passive_attach, + "tot_passive_attach", KSTAT_DATA_UINT64); + kstat_named_init(&ent->sofe_kstat.sofek_ndeferred, "ndeferred", + KSTAT_DATA_UINT64); + kstat_named_init(&ent->sofe_kstat.sofek_attach_failures, + "attach_failures", KSTAT_DATA_UINT64); + + ent->sofe_ksp->ks_data = &ent->sofe_kstat; + ent->sofe_ksp->ks_update = sof_entry_kstat_update; + ent->sofe_ksp->ks_private = ent; + kstat_install(ent->sofe_ksp); + + return (0); +} + +/* + * Destroys the kstat for filter entry `ent'. + */ +static void +sof_entry_kstat_destroy(sof_entry_t *ent) +{ + if (ent->sofe_ksp != NULL) { + kstat_delete(ent->sofe_ksp); + ent->sofe_ksp = NULL; + } +} + +static void +sof_entry_hold(sof_entry_t *ent) +{ + mutex_enter(&ent->sofe_lock); + ent->sofe_refcnt++; + mutex_exit(&ent->sofe_lock); +} + +/* + * Decrement the reference count for `ent'. The entry will + * drop its' reference on the filter module whenever its' + * ref count reaches zero. 
+ */ +static void +sof_entry_rele(sof_entry_t *ent) +{ + mutex_enter(&ent->sofe_lock); + if (--ent->sofe_refcnt == 0) { + sof_module_t *mod = ent->sofe_mod; + ent->sofe_mod = NULL; + if (ent->sofe_flags & SOFEF_CONDEMED) { + mutex_exit(&ent->sofe_lock); + sof_entry_free(ent); + } else { + mutex_exit(&ent->sofe_lock); + } + if (mod != NULL) + sof_module_rele(mod); + } else { + mutex_exit(&ent->sofe_lock); + } +} + +/* + * Loads the module used by `ent' + */ +static int +sof_entry_load_module(sof_entry_t *ent) +{ + sof_module_t *mod = sof_module_hold_by_name(ent->sofe_name, + ent->sofe_modname); + + if (mod == NULL) + return (EINVAL); + + mutex_enter(&ent->sofe_lock); + /* Another thread might have already loaded the module */ + ASSERT(ent->sofe_mod == mod || ent->sofe_mod == NULL); + if (ent->sofe_mod != NULL) { + mutex_exit(&ent->sofe_lock); + sof_module_rele(mod); + } else { + ent->sofe_mod = mod; + mutex_exit(&ent->sofe_lock); + } + + return (0); +} + +/* + * Add filter entry `ent' to the global list and attach it to all sockparam + * entries which the filter is interested in. Upon successful return the filter + * will be available for applications to use. + */ +int +sof_entry_add(sof_entry_t *ent) +{ + int error; + + /* + * We hold sockconf_lock as a WRITER for the whole operation, + * so all operations must be non-blocking. + */ + rw_enter(&sockconf_lock, RW_WRITER); + if (sof_entry_find(ent->sofe_name) != NULL) { + rw_exit(&sockconf_lock); + return (EEXIST); + } + + /* The entry is unique; create the kstats */ + if (sof_entry_kstat_create(ent) != 0) { + rw_exit(&sockconf_lock); + return (ENOMEM); + } + + /* + * Attach the filter to sockparams of interest. + */ + if ((error = sockparams_new_filter(ent)) != 0) { + sof_entry_kstat_destroy(ent); + rw_exit(&sockconf_lock); + return (error); + } + /* + * Everything is OK; insert in global list. 
+ */ + list_insert_tail(&sof_entry_list, ent); + rw_exit(&sockconf_lock); + + return (0); +} + +/* + * Removes the filter entry `ent' from global list and all sockparams. + */ +sof_entry_t * +sof_entry_remove_by_name(const char *name) +{ + sof_entry_t *ent; + + rw_enter(&sockconf_lock, RW_WRITER); + if ((ent = sof_entry_find(name)) == NULL) { + rw_exit(&sockconf_lock); + return (NULL); + } + list_remove(&sof_entry_list, ent); + sockparams_filter_cleanup(ent); + sof_entry_kstat_destroy(ent); + rw_exit(&sockconf_lock); + + return (ent); +} + +/* + * Filter entry `ent' will process sockparams entry `sp' to determine whether + * it should be attached to the sockparams. It should be called whenever a new + * filter or sockparams is being added. Returns zero either if the filter is + * not interested in the sockparams or if it successfully attached to the + * sockparams. On failure an errno is returned. + */ +int +sof_entry_proc_sockparams(sof_entry_t *ent, struct sockparams *sp) +{ + uint_t i; + sof_socktuple_t *t = ent->sofe_socktuple; + sp_filter_t *new, *fil; + + /* Only interested in non-TPI sockets */ + if (strcmp(sp->sp_smod_name, SOTPI_SMOD_NAME) == 0) + return (0); + + for (i = 0; i < ent->sofe_socktuple_cnt; i++) { + if (t[i].sofst_family == sp->sp_family && + t[i].sofst_type == sp->sp_type && + t[i].sofst_protocol == sp->sp_protocol) + break; + } + /* This filter is not interested in the sockparams entry */ + if (i == ent->sofe_socktuple_cnt) + return (0); + + new = kmem_zalloc(sizeof (sp_filter_t), KM_NOSLEEP); + if (new == NULL) + return (ENOMEM); + + new->spf_filter = ent; + if (ent->sofe_flags & SOFEF_PROG) { + /* placement is irrelevant for programmatic filters */ + list_insert_head(&sp->sp_prog_filters, new); + return (0); + } else { + ASSERT(ent->sofe_flags & SOFEF_AUTO); + /* + * If the filter specifies a placement hint, then make sure + * it can be satisfied. 
+ */ + switch (ent->sofe_hint) { + case SOF_HINT_TOP: + if ((fil = list_head(&sp->sp_auto_filters)) != NULL && + fil->spf_filter->sofe_hint == SOF_HINT_TOP) + break; + list_insert_head(&sp->sp_auto_filters, new); + return (0); + case SOF_HINT_BOTTOM: + if ((fil = list_tail(&sp->sp_auto_filters)) != NULL && + fil->spf_filter->sofe_hint == SOF_HINT_BOTTOM) + break; + list_insert_tail(&sp->sp_auto_filters, new); + return (0); + case SOF_HINT_BEFORE: + case SOF_HINT_AFTER: + for (fil = list_head(&sp->sp_auto_filters); + fil != NULL; + fil = list_next(&sp->sp_auto_filters, fil)) { + if (strncmp(ent->sofe_hintarg, + fil->spf_filter->sofe_name, + SOF_MAXNAMELEN) == 0) + break; + } + + if (fil != NULL) { + if (ent->sofe_hint == SOF_HINT_BEFORE) { + if (fil->spf_filter->sofe_hint == + SOF_HINT_TOP) + break; + list_insert_before(&sp->sp_auto_filters, + fil, new); + } else { + if (fil->spf_filter->sofe_hint == + SOF_HINT_BOTTOM) + break; + list_insert_after(&sp->sp_auto_filters, + fil, new); + } + return (0); + } + /*FALLTHRU*/ + case SOF_HINT_NONE: + /* + * Insert the new filter at the beginning as long as it + * does not violate a TOP hint, otherwise insert in the + * next suitable location. + */ + if ((fil = list_head(&sp->sp_auto_filters)) != NULL && + fil->spf_filter->sofe_hint == SOF_HINT_TOP) { + list_insert_after(&sp->sp_auto_filters, fil, + new); + } else { + list_insert_head(&sp->sp_auto_filters, new); + } + return (0); + } + /* Failed to insert the filter */ + kmem_free(new, sizeof (sp_filter_t)); + return (ENOSPC); + } +} + +/* + * Remove all filter entries attached to the sockparams entry `sp'. 
+ */ +void +sof_sockparams_fini(struct sockparams *sp) +{ + sp_filter_t *fil; + + ASSERT(!list_link_active(&sp->sp_node)); + + while ((fil = list_remove_head(&sp->sp_auto_filters)) != NULL) + kmem_free(fil, sizeof (sp_filter_t)); + while ((fil = list_remove_head(&sp->sp_prog_filters)) != NULL) + kmem_free(fil, sizeof (sp_filter_t)); +} + +/* + * A new sockparams is being added. Walk all filters and attach those that + * are interested in the entry. + * + * It should be called when the sockparams entry is about to be made available + * for use and while holding the sockconf_lock as a writer. + */ +int +sof_sockparams_init(struct sockparams *sp) +{ + sof_entry_t *ent; + + ASSERT(RW_WRITE_HELD(&sockconf_lock)); + + for (ent = list_head(&sof_entry_list); ent != NULL; + ent = list_next(&sof_entry_list, ent)) { + if (sof_entry_proc_sockparams(ent, sp) != 0) { + sof_sockparams_fini(sp); + return (ENOMEM); + } + } + return (0); +} + +static sof_module_t * +sof_module_find(const char *name) +{ + sof_module_t *ent; + + ASSERT(MUTEX_HELD(&sof_module_lock)); + + for (ent = list_head(&sof_module_list); ent != NULL; + ent = list_next(&sof_module_list, ent)) + if (strcmp(ent->sofm_name, name) == 0) + return (ent); + return (NULL); +} + +/* + * Returns a pointer to a module identified by `name' with its ref count + * bumped. An attempt to load the module is done if it's not found in the + * global list. + */ +sof_module_t * +sof_module_hold_by_name(const char *name, const char *modname) +{ + ddi_modhandle_t handle = NULL; + sof_module_t *mod = NULL; + char *modpath; + int error; + + /* + * We'll go through the loop at most two times, which will only + * happen if the module needs to be loaded.
+ */ + for (;;) { + mutex_enter(&sof_module_lock); + mod = sof_module_find(name); + if (mod != NULL || handle != NULL) + break; + mutex_exit(&sof_module_lock); + + modpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(modpath, MAXPATHLEN, "%s/%s", SOF_MODPATH, + modname); + handle = ddi_modopen(modpath, KRTLD_MODE_FIRST, &error); + kmem_free(modpath, MAXPATHLEN); + /* Failed to load, then bail */ + if (handle == NULL) { + cmn_err(CE_WARN, + "Failed to load socket filter module: %s (err %d)", + modname, error); + return (NULL); + } + } + if (mod != NULL) + mod->sofm_refcnt++; + mutex_exit(&sof_module_lock); + + if (handle != NULL) { + (void) ddi_modclose(handle); + /* + * The module was loaded, but the filter module could not be + * found. It's likely a misconfigured filter. + */ + if (mod == NULL) { + cmn_err(CE_WARN, + "Socket filter module %s was loaded, but did not" \ + "register. Filter %s is likely misconfigured.", + modname, name); + } + } + + return (mod); +} + +void +sof_module_rele(sof_module_t *mod) +{ + mutex_enter(&sof_module_lock); + mod->sofm_refcnt--; + mutex_exit(&sof_module_lock); +} + +int +sof_rval2errno(sof_rval_t rval) +{ + if (rval > SOF_RVAL_CONTINUE) { + return ((int)rval); + } else { +#ifdef DEBUG + if (socket_filter_debug) + printf("sof_rval2errno: invalid rval '%d'\n", rval); +#endif + return (EINVAL); + } +} + +/* + * Walk through all the filters attached to `so' and allow each filter + * to process the data using its data_out callback. `mp' is a b_cont chain. + * + * Returns the processed mblk, or NULL if mblk was consumed. The mblk might + * have been consumed as a result of an error, in which case `errp' is set to + * the appropriate errno. 
+ */ +mblk_t * +sof_filter_data_out_from(struct sonode *so, sof_instance_t *start, + mblk_t *mp, struct nmsghdr *msg, cred_t *cr, int *errp) +{ + sof_instance_t *inst; + sof_rval_t rval; + + _NOTE(ARGUNUSED(so)); + + for (inst = start; inst != NULL; inst = inst->sofi_next) { + if (!SOF_INTERESTED(inst, data_out)) + continue; + mp = (inst->sofi_ops->sofop_data_out)((sof_handle_t)inst, + inst->sofi_cookie, mp, msg, cr, &rval); + DTRACE_PROBE2(filter__data, (sof_instance_t), inst, + (mblk_t *), mp); + if (mp == NULL) { + *errp = sof_rval2errno(rval); + break; + } + } + return (mp); +} + +/* + * Walk through all the filters attached to `so' and allow each filter + * to process the data using its data_in_proc callback. `mp' is the start of + * a possible b_next chain, and `lastmp' points to the last mblk in the chain. + * + * Returns the processed mblk, or NULL if all mblks in the chain were + * consumed. `lastmp' is updated to point to the last mblk in the processed + * chain. + */ +mblk_t * +sof_filter_data_in_proc(struct sonode *so, mblk_t *mp, mblk_t **lastmp) +{ + sof_instance_t *inst; + size_t len = 0, orig = 0; + ssize_t diff = 0; + mblk_t *retmp = NULL, *tailmp, *nextmp; + + *lastmp = NULL; + do { + nextmp = mp->b_next; + mp->b_next = mp->b_prev = NULL; + len = orig = msgdsize(mp); + for (inst = so->so_filter_bottom; inst != NULL; + inst = inst->sofi_prev) { + if (!SOF_INTERESTED(inst, data_in_proc)) + continue; + mp = (inst->sofi_ops->sofop_data_in_proc)( + (sof_handle_t)inst, inst->sofi_cookie, mp, + kcred, &len); + if (mp == NULL) + break; + } + DTRACE_PROBE2(filter__data, (sof_instance_t), inst, + (mblk_t *), mp); + diff += len - orig; + if (mp == NULL) + continue; + + for (tailmp = mp; tailmp->b_cont != NULL; + tailmp = tailmp->b_cont) + ; + mp->b_prev = tailmp; + + if (*lastmp == NULL) + retmp = mp; + else + (*lastmp)->b_next = mp; + *lastmp = mp; + } while ((mp = nextmp) != NULL); + + /* + * The size of the chain has changed; make sure the rcv queue + * 
stays consistent and check if the flow control state should + * change. + */ + if (diff != 0) { + DTRACE_PROBE2(filter__data__adjust__qlen, + (struct sonode *), so, (size_t), diff); + mutex_enter(&so->so_lock); + so->so_rcv_queued += diff; + /* so_check_flow_control drops so_lock */ + so_check_flow_control(so); + } + + return (retmp); +} + +int +sof_filter_bind(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, cred_t *cr) +{ + __SOF_FILTER_OP(so, bind, cr, addr, addrlen) +} + +int +sof_filter_listen(struct sonode *so, int *backlogp, cred_t *cr) +{ + __SOF_FILTER_OP(so, listen, cr, backlogp) +} + +int +sof_filter_connect(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, cred_t *cr) +{ + __SOF_FILTER_OP(so, connect, cr, addr, addrlen) +} + +int +sof_filter_accept(struct sonode *so, cred_t *cr) +{ + sof_instance_t *inst; + sof_rval_t rval; + + for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) { + if (!SOF_INTERESTED(inst, accept)) + continue; + rval = (inst->sofi_ops->sofop_accept)((sof_handle_t)inst, + inst->sofi_cookie, cr); + DTRACE_PROBE2(filter__action, (sof_instance_t), inst, + (sof_rval_t), rval); + if (rval != SOF_RVAL_CONTINUE) { + ASSERT(rval != SOF_RVAL_RETURN); + return (sof_rval2errno(rval)); + } + } + return (-1); +} + +int +sof_filter_shutdown(struct sonode *so, int *howp, cred_t *cr) +{ + __SOF_FILTER_OP(so, shutdown, cr, howp) +} + +int +sof_filter_getsockname(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlenp, cred_t *cr) +{ + __SOF_FILTER_OP(so, getsockname, cr, addr, addrlenp) +} + +int +sof_filter_getpeername(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlenp, cred_t *cr) +{ + __SOF_FILTER_OP(so, getpeername, cr, addr, addrlenp) +} + +int +sof_filter_setsockopt(struct sonode *so, int level, int option_name, + void *optval, socklen_t *optlenp, cred_t *cr) +{ + __SOF_FILTER_OP(so, setsockopt, cr, level, option_name, + optval, optlenp) +} + +int +sof_filter_getsockopt(struct 
sonode *so, int level, int option_name, + void *optval, socklen_t *optlenp, cred_t *cr) +{ + __SOF_FILTER_OP(so, getsockopt, cr, level, option_name, + optval, optlenp) +} + +int +sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + int32_t *rvalp, cred_t *cr) +{ + __SOF_FILTER_OP(so, ioctl, cr, cmd, arg, mode, rvalp) +} + +/* + * sof_register(version, name, ops, flags) + * + * Register a socket filter identified by name `name' and which should use + * the ops vector `ops' for event notification. `flags' should be set to 0. + * On success 0 is returned, otherwise an errno is returned. + */ +int +sof_register(int version, const char *name, const sof_ops_t *ops, int flags) +{ + sof_module_t *mod; + + _NOTE(ARGUNUSED(flags)); + + if (version != SOF_VERSION) + return (EINVAL); + + mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); + mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(mod->sofm_name, name); + mod->sofm_ops = *ops; + + mutex_enter(&sof_module_lock); + if (sof_module_find(name) != NULL) { + mutex_exit(&sof_module_lock); + kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1); + kmem_free(mod, sizeof (sof_module_t)); + return (EEXIST); + } + list_insert_tail(&sof_module_list, mod); + mutex_exit(&sof_module_lock); + + return (0); +} + +/* + * sof_unregister(name) + * + * Try to unregister the socket filter identified by `name'. If the filter + * is successfully unregistered, then 0 is returned, otherwise an errno is + * returned. 
+ */ +int +sof_unregister(const char *name) +{ + sof_module_t *mod; + + mutex_enter(&sof_module_lock); + mod = sof_module_find(name); + if (mod != NULL) { + if (mod->sofm_refcnt == 0) { + list_remove(&sof_module_list, mod); + mutex_exit(&sof_module_lock); + + kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1); + kmem_free(mod, sizeof (sof_module_t)); + return (0); + } else { + mutex_exit(&sof_module_lock); + return (EBUSY); + } + } + mutex_exit(&sof_module_lock); + + return (ENXIO); +} + +/* + * sof_newconn_ready(handle) + * + * The filter `handle` no longer wants to defer the socket it is attached + * to. A newconn notification will be generated if there is no other filter + * that wants the socket deferred. + */ +void +sof_newconn_ready(sof_handle_t handle) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + struct sonode *so = inst->sofi_sonode; + struct sonode *pso = so->so_listener; + + mutex_enter(&so->so_lock); + if (!(inst->sofi_flags & SOFIF_DEFER)) { + mutex_exit(&so->so_lock); + return; + } + ASSERT(so->so_state & SS_FIL_DEFER); + inst->sofi_flags &= ~SOFIF_DEFER; + SOF_STAT_ADD(inst, ndeferred, -1); + + /* + * Check if any other filter has deferred the socket. The last + * filter to remove its DEFER flag will be the one generating the + * wakeup. + */ + for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) { + /* Still deferred; nothing to do */ + if (inst->sofi_flags & SOFIF_DEFER) { + mutex_exit(&so->so_lock); + return; + } + } + so->so_state &= ~SS_FIL_DEFER; + mutex_exit(&so->so_lock); + + /* + * The socket is no longer deferred; move it over to the regular + * accept list and notify the user. However, it is possible that + * the socket is being dropped by sof_sonode_drop_deferred(), so + * first make sure the socket is on the deferred list. 
+ */ + mutex_enter(&pso->so_acceptq_lock); + if (!list_link_active(&so->so_acceptq_node)) { + mutex_exit(&pso->so_acceptq_lock); + return; + } + list_remove(&pso->so_acceptq_defer, so); + list_insert_tail(&pso->so_acceptq_list, so); + cv_signal(&pso->so_acceptq_cv); + mutex_exit(&pso->so_acceptq_lock); + + mutex_enter(&pso->so_lock); + so_notify_newconn(pso); /* so_notify_newconn drops the lock */ +} + +/* + * sof_bypass(handle) + * + * Stop generating callbacks for `handle'. + */ +void +sof_bypass(sof_handle_t handle) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + struct sonode *so = inst->sofi_sonode; + + mutex_enter(&so->so_lock); + if (!(inst->sofi_flags & SOFIF_BYPASS)) { + inst->sofi_flags |= SOFIF_BYPASS; + ASSERT(so->so_filter_active > 0); + so->so_filter_active--; + } + mutex_exit(&so->so_lock); +} + +/* + * sof_rcv_flowctrl(handle, enable) + * + * If `enable' is TRUE, then recv side flow control will be asserted for + * the socket associated with `handle'. When `enable' is FALSE the filter + * indicates that it no longer wants to assert flow control, however, the + * condition will not be removed until there are no other filters asserting + * flow control and there is space available in the receive buffer. 
+ */ +void +sof_rcv_flowctrl(sof_handle_t handle, boolean_t enable) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + struct sonode *so = inst->sofi_sonode; + + mutex_enter(&so->so_lock); + if (enable) { + inst->sofi_flags |= SOFIF_RCV_FLOWCTRL; + so->so_flowctrld = B_TRUE; + so->so_state |= SS_FIL_RCV_FLOWCTRL; + mutex_exit(&so->so_lock); + } else { + inst->sofi_flags &= ~SOFIF_RCV_FLOWCTRL; + for (inst = so->so_filter_top; inst != NULL; + inst = inst->sofi_next) { + /* another filter is asserting flow control */ + if (inst->sofi_flags & SOFIF_RCV_FLOWCTRL) { + mutex_exit(&so->so_lock); + return; + } + } + so->so_state &= ~SS_FIL_RCV_FLOWCTRL; + /* so_check_flow_control drops so_lock */ + so_check_flow_control(so); + } + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * sof_snd_flowctrl(handle, enable) + * + * If `enable' is TRUE, then send side flow control will be asserted for + * the socket associated with `handle'. When `enable' is FALSE the filter + * indicates that it no longer wants to assert flow control, however, the + * condition will not be removed until there are no other filters asserting + * flow control and there are tx buffers available. + */ +void +sof_snd_flowctrl(sof_handle_t handle, boolean_t enable) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + struct sonode *so = inst->sofi_sonode; + + mutex_enter(&so->so_lock); + if (enable) { + inst->sofi_flags |= SOFIF_SND_FLOWCTRL; + so->so_state |= SS_FIL_SND_FLOWCTRL; + } else { + inst->sofi_flags &= ~SOFIF_SND_FLOWCTRL; + for (inst = so->so_filter_top; inst != NULL; + inst = inst->sofi_next) { + if (inst->sofi_flags & SOFIF_SND_FLOWCTRL) { + mutex_exit(&so->so_lock); + return; + } + } + so->so_state &= ~SS_FIL_SND_FLOWCTRL; + /* + * Wake up writer if the socket is no longer flow controlled.
+ */ + if (!SO_SND_FLOWCTRLD(so)) { + /* so_notify_writable drops so_lock */ + so_notify_writable(so); + return; + } + } + mutex_exit(&so->so_lock); +} + +/* + * sof_get_cookie(handle) + * + * Returns the cookie used by `handle'. + */ +void * +sof_get_cookie(sof_handle_t handle) +{ + return (((sof_instance_t *)handle)->sofi_cookie); +} + +/* + * sof_cas_cookie(handle, old, new) + * + * Compare-and-swap the cookie used by `handle'. + */ +void * +sof_cas_cookie(sof_handle_t handle, void *old, void *new) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + + return (atomic_cas_ptr(&inst->sofi_cookie, old, new)); +} + +/* + * sof_inject_data_out(handle, mp, msg, flowctrld) + * + * Submit `mp' for transmission. `msg' cannot be NULL, and may contain + * ancillary data and destination address. Returns 0 when successful + * in which case `flowctrld' is updated. If flow controlled, no new data + * should be injected until a SOF_EV_INJECT_DATA_OUT_OK event is observed. + * In case of failure, an errno is returned. + * + * Filters that are lower in the stack than `handle' will see the data + * before it is transmitted and may end up modifying or freeing the data. + */ +int +sof_inject_data_out(sof_handle_t handle, mblk_t *mp, struct nmsghdr *msg, + boolean_t *flowctrld) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + struct sonode *so = inst->sofi_sonode; + int error; + + /* + * Data cannot be sent down to the protocol once the socket has + * started the process of closing.
+ */ + mutex_enter(&so->so_lock); + if (so->so_state & SS_CLOSING) { + mutex_exit(&so->so_lock); + freemsg(mp); + return (EPIPE); + } + so->so_filter_tx++; + mutex_exit(&so->so_lock); + + error = so_sendmblk_impl(inst->sofi_sonode, msg, FNONBLOCK, + kcred, &mp, inst->sofi_next, B_TRUE); + + mutex_enter(&so->so_lock); + ASSERT(so->so_filter_tx > 0); + so->so_filter_tx--; + if (so->so_state & SS_CLOSING) + cv_signal(&so->so_closing_cv); + mutex_exit(&so->so_lock); + + if (mp != NULL) + freemsg(mp); + + if (error == ENOSPC) { + *flowctrld = B_TRUE; + error = 0; + } else { + *flowctrld = B_FALSE; + } + + return (error); +} + +/* + * sof_inject_data_in(handle, mp, len, flags, flowctrld) + * + * Enqueue `mp' which contains `len' bytes of M_DATA onto the socket + * associated with `handle'. `flags' should be set to 0. Returns 0 when + * successful in which case `flowctrld' is updated. If flow controlled, + * no new data should be injected until a SOF_EV_INJECT_DATA_IN_OK event + * is observed. In case of failure, an errno is returned. + * + * Filters that are higher in the stack than `handle' will see the data + * before it is enqueued on the receive queue and may end up modifying or + * freeing the data. + */ +int +sof_inject_data_in(sof_handle_t handle, mblk_t *mp, size_t len, int flags, + boolean_t *flowctrld) +{ + sof_instance_t *inst = (sof_instance_t *)handle; + ssize_t avail; + int error = 0; + + ASSERT(flags == 0); + avail = so_queue_msg_impl(inst->sofi_sonode, mp, len, flags, &error, + NULL, inst->sofi_prev); + /* fallback should never happen when there is an active filter */ + ASSERT(error != EOPNOTSUPP); + + *flowctrld = (avail > 0) ? 
B_FALSE : B_TRUE; + return (error); +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h new file mode 100644 index 0000000000..d37410a0d1 --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SOCKFS_SOCKFILTER_H +#define _SOCKFS_SOCKFILTER_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct sonode; +struct sockparams; + +typedef struct sof_module sof_module_t; +typedef struct sof_entry_kstat sof_entry_kstat_t; +typedef struct sof_entry sof_entry_t; +typedef struct sof_instance sof_instance_t; +typedef struct sof_kstat sof_kstat_t; + +#define SOF_MAXNAMELEN FILNAME_MAX +#define SOF_MAXSOCKTUPLECNT 32 +#define SOF_MODPATH SOCKMOD_PATH + +struct sof_module { + char *sofm_name; + sof_ops_t sofm_ops; + uint_t sofm_refcnt; + list_node_t sofm_node; +}; + +struct sof_kstat { + kstat_named_t sofks_defer_closed; + kstat_named_t sofks_defer_close_backlog; + kstat_named_t sofks_defer_close_failed_backlog_too_big; +}; + +#define SOF_GLOBAL_STAT_BUMP(s) \ + atomic_add_64(&sof_stat.sofks_##s.value.ui64, 1) + +/* + * Per filter statistics. + */ +struct sof_entry_kstat { + kstat_named_t sofek_nactive; /* # of consumers */ + kstat_named_t sofek_tot_active_attach; + kstat_named_t sofek_tot_passive_attach; + kstat_named_t sofek_ndeferred; /* # of deferred conns */ + kstat_named_t sofek_attach_failures; +}; + +/* + * Socket filter entry - one for each configured filter (added and + * removed by soconfig(1M)). + * + * sofe_flags, sofe_refcnt and sofe_mod are protected by sofe_lock, and all + * other fields are write once. 
+ */ +struct sof_entry { + char sofe_name[SOF_MAXNAMELEN]; /* filter name */ + char sofe_modname[MODMAXNAMELEN]; /* filter module */ + sof_hint_t sofe_hint; /* order hint */ + char *sofe_hintarg; /* hint argument */ + list_node_t sofe_node; /* global list node */ + uint_t sofe_socktuple_cnt; /* # of socket tuples */ + sof_socktuple_t *sofe_socktuple; /* socket tuple list */ + + sof_entry_kstat_t sofe_kstat; /* filter stats */ + kstat_t *sofe_ksp; + + kmutex_t sofe_lock; + char sofe_flags; /* SOFEF_* flags */ + uint_t sofe_refcnt; /* # of instances */ + sof_module_t *sofe_mod; /* filter module */ +}; + +/* Filter entry flags */ +#define SOFEF_AUTO 0x1 /* automatic filter */ +#define SOFEF_PROG 0x2 /* programmatic filter */ +#define SOFEF_CONDEMED 0x4 /* removed by soconfig(1M) */ + +/* + * Socket filter instance - one for each socket using a sof_entry_t + */ +struct sof_instance { + sof_ops_t *sofi_ops; /* filter ops */ + void *sofi_cookie; /* filter cookie (from attach) */ + char sofi_flags; /* instance flags (SOFIF_*) */ + sof_instance_t *sofi_prev; /* up the stack */ + sof_instance_t *sofi_next; /* down the stack */ + struct sonode *sofi_sonode; /* socket instance is attached to */ + sof_entry_t *sofi_filter; /* filter this is an instance of */ +}; + +/* Filter instance flags */ +#define SOFIF_BYPASS 0x1 /* filter does not want any callbacks */ +#define SOFIF_DEFER 0x2 /* defer notification of socket */ +#define SOFIF_RCV_FLOWCTRL 0x4 /* flow control recv path */ +#define SOFIF_SND_FLOWCTRL 0x8 /* flow control send path */ + +#define SOF_STAT_ADD(i, s, v) \ + atomic_add_64(&(i)->sofi_filter->sofe_kstat.sofek_##s.value.ui64, (v)) + +extern void sof_init(void); + +extern void sof_entry_free(sof_entry_t *); +extern int sof_entry_add(sof_entry_t *); +extern sof_entry_t *sof_entry_remove_by_name(const char *); +extern int sof_entry_proc_sockparams(sof_entry_t *, struct sockparams *); + +extern int sof_sockparams_init(struct sockparams *); +extern void 
sof_sockparams_fini(struct sockparams *); + +extern int sof_sonode_autoattach_filters(struct sonode *, cred_t *); +extern int sof_sonode_inherit_filters(struct sonode *, struct sonode *); +extern void sof_sonode_closing(struct sonode *); +extern void sof_sonode_cleanup(struct sonode *); +extern void sof_sonode_notify_filters(struct sonode *, sof_event_t, + uintptr_t); +extern boolean_t sof_sonode_drop_deferred(struct sonode *); + +extern int sof_setsockopt(struct sonode *, int, const void *, socklen_t, + struct cred *); +extern int sof_getsockopt(struct sonode *, int, void *, socklen_t *, + struct cred *); + +extern int sof_rval2errno(sof_rval_t); + +#define SOF_INTERESTED(inst, op) \ + (!((inst)->sofi_flags & SOFIF_BYPASS) && \ + (inst)->sofi_ops->sofop_##op != NULL) + +/* + * __SOF_FILTER_OP traverses the filter stack for sonode `so' top-down, + * calling `op' for each filter with the supplied `args'. A non-negative + * return value indicates that a filter action was taken. + */ +#define __SOF_FILTER_OP(so, op, cr, ...) 
\ + sof_instance_t *__inst; \ + sof_rval_t __rval; \ + \ + for (__inst = (so)->so_filter_top; __inst != NULL; \ + __inst = __inst->sofi_next) { \ + if (!SOF_INTERESTED(__inst, op)) \ + continue; \ + __rval = (__inst->sofi_ops->sofop_##op)((sof_handle_t)__inst,\ + __inst->sofi_cookie, __VA_ARGS__, cr); \ + DTRACE_PROBE2(filter__action, (sof_instance_t), __inst,\ + (sof_rval_t), __rval); \ + if (__rval != SOF_RVAL_CONTINUE) \ + return (sof_rval2errno(__rval)); \ + } \ + return (-1); + +extern mblk_t *sof_filter_data_out_from(struct sonode *so, + sof_instance_t *, mblk_t *, struct nmsghdr *, cred_t *, int *); +extern mblk_t *sof_filter_data_in_proc(struct sonode *so, + mblk_t *, mblk_t **); +extern int sof_filter_bind(struct sonode *, struct sockaddr *, + socklen_t *, cred_t *); +extern int sof_filter_listen(struct sonode *, int *, cred_t *); +extern int sof_filter_connect(struct sonode *, struct sockaddr *, + socklen_t *, cred_t *); +extern int sof_filter_accept(struct sonode *, cred_t *); +extern int sof_filter_shutdown(struct sonode *, int *, cred_t *); +extern int sof_filter_getsockname(struct sonode *, struct sockaddr *, + socklen_t *, cred_t *); +extern int sof_filter_getpeername(struct sonode *, struct sockaddr *, + socklen_t *, cred_t *); +extern int sof_filter_setsockopt(struct sonode *, int, int, void *, + socklen_t *, cred_t *); +extern int sof_filter_getsockopt(struct sonode *, int, int, void *, + socklen_t *, cred_t *); +extern int sof_filter_ioctl(struct sonode *, int, intptr_t, int, + int32_t *, cred_t *); + +#define SOF_FILTER_DATA_OUT(so, mp, msg, cr, errp) \ + sof_filter_data_out_from(so, (so)->so_filter_top, mp, msg, cr, errp) +#define SOF_FILTER_DATA_OUT_FROM(so, inst, mp, msg, cr, errp) \ + sof_filter_data_out_from(so, inst, mp, msg, cr, errp) + +#ifdef __cplusplus +} +#endif + +#endif /* _SOCKFS_SOCKFILTER_H */ diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 2bb564288b..d6c1f9ea85 100644 
--- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -33,6 +32,7 @@ #include #include #include +#include /* * There can only be a single thread waiting for data (enforced by @@ -78,6 +78,7 @@ so_notify_connected(struct sonode *so) mutex_exit(&so->so_lock); pollwakeup(&so->so_poll_list, POLLOUT); } + sof_sonode_notify_filters(so, SOF_EV_CONNECTED, 0); ASSERT(MUTEX_NOT_HELD(&so->so_lock)); } @@ -93,18 +94,19 @@ so_notify_disconnecting(struct sonode *so) int sigev = 0; ASSERT(MUTEX_HELD(&so->so_lock)); + (void) i_so_notify_last_tx(so, &pollev, &sigev); if (IS_KERNEL_SOCKET(so)) { - SO_WAKEUP_WRITER(so); KSOCKET_CALLBACK(so, cantsendmore, 0); mutex_exit(&so->so_lock); - } else if (i_so_notify_last_tx(so, &pollev, &sigev)) { - socket_sendsig(so, sigev); - mutex_exit(&so->so_lock); - pollwakeup(&so->so_poll_list, pollev); } else { + if (sigev != 0) + socket_sendsig(so, sigev); mutex_exit(&so->so_lock); + if (pollev != 0) + pollwakeup(&so->so_poll_list, pollev); } + sof_sonode_notify_filters(so, SOF_EV_CANTSENDMORE, 0); ASSERT(MUTEX_NOT_HELD(&so->so_lock)); } @@ -114,7 +116,7 @@ so_notify_disconnecting(struct sonode *so) * Wake up anyone that is waiting to send or receive data. 
*/ void -so_notify_disconnected(struct sonode *so, int error) +so_notify_disconnected(struct sonode *so, boolean_t connfailed, int error) { int pollev = 0; int sigev = 0; @@ -125,7 +127,11 @@ so_notify_disconnected(struct sonode *so, int error) (void) i_so_notify_last_rx(so, &pollev, &sigev); if (IS_KERNEL_SOCKET(so)) { - KSOCKET_CALLBACK(so, disconnected, error); + if (connfailed) { + KSOCKET_CALLBACK(so, disconnected, error); + } else { + KSOCKET_CALLBACK(so, connectfailed, error); + } mutex_exit(&so->so_lock); } else { if (sigev != 0) @@ -134,6 +140,8 @@ so_notify_disconnected(struct sonode *so, int error) if (pollev != 0) pollwakeup(&so->so_poll_list, pollev); } + sof_sonode_notify_filters(so, (connfailed) ? SOF_EV_CONNECTFAILED : + SOF_EV_DISCONNECTED, error); ASSERT(MUTEX_NOT_HELD(&so->so_lock)); } @@ -158,6 +166,10 @@ so_notify_writable(struct sonode *so) } ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + + /* filters can start injecting data */ + if (so->so_filter_active > 0) + sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_OUT_OK, 0); } /* @@ -270,7 +282,6 @@ so_notify_eof(struct sonode *so) (void) i_so_notify_last_rx(so, &pollev, &sigev); if (IS_KERNEL_SOCKET(so)) { - SO_WAKEUP_READER(so); KSOCKET_CALLBACK(so, cantrecvmore, 0); mutex_exit(&so->so_lock); } else { @@ -281,6 +292,7 @@ so_notify_eof(struct sonode *so) pollwakeup(&so->so_poll_list, pollev); } + sof_sonode_notify_filters(so, SOF_EV_CANTRECVMORE, 0); ASSERT(MUTEX_NOT_HELD(&so->so_lock)); } @@ -294,7 +306,7 @@ so_notify_newconn(struct sonode *so) ASSERT(MUTEX_HELD(&so->so_lock)); if (IS_KERNEL_SOCKET(so)) { - KSOCKET_CALLBACK(so, newconn, so->so_rcv_queued); + KSOCKET_CALLBACK(so, newconn, 0); mutex_exit(&so->so_lock); } else { socket_sendsig(so, SOCKETSIG_READ); diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c index 5c4872f090..60a1a1580c 100644 --- a/usr/src/uts/common/fs/sockfs/sockparams.c +++ b/usr/src/uts/common/fs/sockfs/sockparams.c @@ -36,6 
+36,7 @@ #include #include +#include #include /* @@ -53,12 +54,9 @@ * supplied device path, or when a socket is falling back to TPI. * * Lock order: - * The lock order is splist_lock -> sp_lock. - * The lock order is sp_ephem_lock -> sp_lock. + * The lock order is sockconf_lock -> sp_lock. */ extern int kobj_path_exists(char *, int); -extern void nl7c_init(void); -extern int sockfs_defer_nl7c_init; static int sockparams_sdev_init(struct sockparams *, char *, int); static void sockparams_sdev_fini(struct sockparams *); @@ -67,13 +65,11 @@ static void sockparams_sdev_fini(struct sockparams *); * Global sockparams list (populated via soconfig(1M)). */ static list_t sphead; -static krwlock_t splist_lock; /* * List of ephemeral sockparams. */ static list_t sp_ephem_list; -static krwlock_t sp_ephem_lock; /* Global kstats for sockparams */ typedef struct sockparams_g_stats { @@ -93,9 +89,6 @@ sockparams_init(void) list_create(&sp_ephem_list, sizeof (struct sockparams), offsetof(struct sockparams, sp_node)); - rw_init(&splist_lock, NULL, RW_DEFAULT, NULL); - rw_init(&sp_ephem_lock, NULL, RW_DEFAULT, NULL); - kstat_named_init(&sp_g_stats.spgs_ephem_nalloc, "ephemeral_nalloc", KSTAT_DATA_UINT64); kstat_named_init(&sp_g_stats.spgs_ephem_nreuse, "ephemeral_nreuse", @@ -170,9 +163,8 @@ sockparams_kstat_fini(struct sockparams *sp) * modname: Name of the module associated with the socket type. The * module can be NULL if a device path is given, in which * case the TPI module is used. - * devpath: Path to the STREAMS device. May be NULL for non-STREAMS - * based transports, or those transports that do not provide - * the capability to fallback to STREAMS. + * devpath: Path to the STREAMS device. Must be NULL for non-STREAMS + * based transports. * devpathlen: Length of the devpath string. The argument can be 0, * indicating that devpath was allocated statically, and should * not be freed when the sockparams entry is destroyed. 
@@ -202,7 +194,7 @@ sockparams_create(int family, int type, int protocol, char *modname, goto error; } - /* either a module or device must be given */ + /* either a module or device must be given, but not both */ if (modname == NULL && devpath == NULL) { *errorp = EINVAL; goto error; @@ -219,6 +211,11 @@ sockparams_create(int family, int type, int protocol, char *modname, sp->sp_refcnt = 0; sp->sp_flags = flags; + list_create(&sp->sp_auto_filters, sizeof (sp_filter_t), + offsetof(sp_filter_t, spf_node)); + list_create(&sp->sp_prog_filters, sizeof (sp_filter_t), + offsetof(sp_filter_t, spf_node)); + kstat_named_init(&sp->sp_stats.sps_nfallback, "nfallback", KSTAT_DATA_UINT64); kstat_named_init(&sp->sp_stats.sps_nactive, "nactive", @@ -322,6 +319,10 @@ sockparams_destroy(struct sockparams *sp) mutex_destroy(&sp->sp_lock); sockparams_kstat_fini(sp); + sof_sockparams_fini(sp); + list_destroy(&sp->sp_auto_filters); + list_destroy(&sp->sp_prog_filters); + kmem_free(sp, sizeof (*sp)); } @@ -404,12 +405,12 @@ sockparams_hold_ephemeral(int family, int type, int protocol, /* * First look for an existing entry */ - rw_enter(&sp_ephem_lock, RW_READER); + rw_enter(&sockconf_lock, RW_READER); sp = sockparams_find(&sp_ephem_list, family, type, protocol, by_devpath, name); if (sp != NULL) { SOCKPARAMS_INC_REF(sp); - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); sp_g_stats.spgs_ephem_nreuse.value.ui64++; return (sp); @@ -418,7 +419,7 @@ sockparams_hold_ephemeral(int family, int type, int protocol, char *namebuf = NULL; int namelen = 0; - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); namelen = strlen(name) + 1; namebuf = kmem_alloc(namelen, kmflag); @@ -460,7 +461,7 @@ sockparams_hold_ephemeral(int family, int type, int protocol, * The sockparams entry was created, now try to add it * to the list. We need to hold the lock as a WRITER. 
*/ - rw_enter(&sp_ephem_lock, RW_WRITER); + rw_enter(&sockconf_lock, RW_WRITER); sp = sockparams_find(&sp_ephem_list, family, type, protocol, by_devpath, name); if (sp != NULL) { @@ -469,13 +470,19 @@ sockparams_hold_ephemeral(int family, int type, int protocol, * place a hold on it and release the entry we alloc'ed. */ SOCKPARAMS_INC_REF(sp); - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); sockparams_destroy(newsp); } else { + *errorp = sof_sockparams_init(newsp); + if (*errorp != 0) { + rw_exit(&sockconf_lock); + sockparams_destroy(newsp); + return (NULL); + } SOCKPARAMS_INC_REF(newsp); list_insert_tail(&sp_ephem_list, newsp); - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); sp = newsp; } @@ -514,18 +521,18 @@ sockparams_ephemeral_drop_last_ref(struct sockparams *sp) ASSERT(sp->sp_flags & SOCKPARAMS_EPHEMERAL); ASSERT(MUTEX_NOT_HELD(&sp->sp_lock)); - rw_enter(&sp_ephem_lock, RW_WRITER); + rw_enter(&sockconf_lock, RW_WRITER); mutex_enter(&sp->sp_lock); if (--sp->sp_refcnt == 0) { list_remove(&sp_ephem_list, sp); mutex_exit(&sp->sp_lock); - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); sockparams_destroy(sp); } else { mutex_exit(&sp->sp_lock); - rw_exit(&sp_ephem_lock); + rw_exit(&sockconf_lock); } } @@ -542,21 +549,37 @@ sockparams_ephemeral_drop_last_ref(struct sockparams *sp) * is returned. * * Locking: - * The caller can not be holding splist_lock. + * The caller can not be holding sockconf_lock. */ -static int +int sockparams_add(struct sockparams *sp) { + int error; + ASSERT(!(sp->sp_flags & SOCKPARAMS_EPHEMERAL)); - rw_enter(&splist_lock, RW_WRITER); + rw_enter(&sockconf_lock, RW_WRITER); if (sockparams_find(&sphead, sp->sp_family, sp->sp_type, sp->sp_protocol, B_TRUE, NULL) != 0) { - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); return (EEXIST); } else { + /* + * Unique sockparams entry, so init the kstats. 
+ */ + sockparams_kstat_init(sp); + + /* + * Before making the socket type available we must make + * sure that interested socket filters are aware of it. + */ + error = sof_sockparams_init(sp); + if (error != 0) { + rw_exit(&sockconf_lock); + return (error); + } list_insert_tail(&sphead, sp); - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); return (0); } } @@ -575,15 +598,15 @@ sockparams_add(struct sockparams *sp) * On success 0, otherwise ENXIO. * * Locking: - * Caller can not be holding splist_lock or the sp_lock of + * Caller can not be holding sockconf_lock or the sp_lock of * any sockparams entry. */ -static int +int sockparams_delete(int family, int type, int protocol) { struct sockparams *sp; - rw_enter(&splist_lock, RW_WRITER); + rw_enter(&sockconf_lock, RW_WRITER); sp = sockparams_find(&sphead, family, type, protocol, B_TRUE, NULL); if (sp != NULL) { @@ -595,97 +618,22 @@ sockparams_delete(int family, int type, int protocol) mutex_enter(&sp->sp_lock); if (sp->sp_refcnt != 0) { mutex_exit(&sp->sp_lock); - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); return (EBUSY); } mutex_exit(&sp->sp_lock); /* Delete the sockparams entry. */ list_remove(&sphead, sp); - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); sockparams_destroy(sp); return (0); } else { - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); return (ENXIO); } } -/* - * soconfig(int family, int type, int protocol, - * char *devpath, int devpathlen, char *module) - * - * Add or delete an entry to the sockparams table. - * When devpath and module both are NULL, it will delete an entry. - * - * Arguments: - * family, type, protocol: the tuple in question - * devpath: STREAMS device path. Can be NULL for module based sockets. - * module : Name of the socket module. Can be NULL for STREAMS - * based sockets. - * devpathlen: length of the devpath string, or 0 if devpath - * was statically allocated. 
- * - * Note: - * This routine assumes that the caller has kmem_alloced - * devpath (if devpathlen > 0) and module for this routine to - * consume. - */ -int -soconfig(int family, int type, int protocol, - char *devpath, int devpathlen, char *module) -{ - struct sockparams *sp; - int error = 0; - - dprint(0, ("soconfig(%d,%d,%d,%s,%d,%s)\n", - family, type, protocol, devpath, devpathlen, - module == NULL ? "NULL" : module)); - - if (sockfs_defer_nl7c_init) { - nl7c_init(); - sockfs_defer_nl7c_init = 0; - } - - if (devpath == NULL && module == NULL) { - /* - * Delete existing entry, - * both socket module and STEAMS device. - */ - ASSERT(module == NULL); - error = sockparams_delete(family, type, protocol); - } else { - /* - * Adding an entry - * sockparams_create frees mod name and devpath upon failure. - */ - sp = sockparams_create(family, type, protocol, module, - devpath, devpathlen, 0, KM_SLEEP, &error); - - if (sp != NULL) { - /* - * The sockparams entry becomes globally visible once - * we call sockparams_add(). So we add a reference so - * we do not have to worry about the entry being - * immediately deleted. - */ - SOCKPARAMS_INC_REF(sp); - error = sockparams_add(sp); - if (error != 0) { - SOCKPARAMS_DEC_REF(sp); - sockparams_destroy(sp); - } else { - /* - * Unique sockparams entry, so init the kstats. - */ - sockparams_kstat_init(sp); - SOCKPARAMS_DEC_REF(sp); - } - } - } - - return (error); -} /* * solookup(int family, int type, int protocol, struct sockparams **spp) @@ -716,7 +664,7 @@ solookup(int family, int type, int protocol, struct sockparams **spp) int error = 0; *spp = NULL; - rw_enter(&splist_lock, RW_READER); + rw_enter(&sockconf_lock, RW_READER); /* * Search the sockparams list for an appropiate entry. 
@@ -740,7 +688,7 @@ solookup(int family, int type, int protocol, struct sockparams **spp) sp->sp_protocol == protocol && found < 2) found = 2; } - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); switch (found) { case 0: error = EAFNOSUPPORT; @@ -760,13 +708,13 @@ solookup(int family, int type, int protocol, struct sockparams **spp) * * We put a hold on the entry early on, so if the * sockmod is not loaded, and we have to exit - * splist_lock to call modload(), we know that the + * sockconf_lock to call modload(), we know that the * sockparams entry wont go away. That way we don't * have to look up the entry once we come back from * modload(). */ SOCKPARAMS_INC_REF(sp); - rw_exit(&splist_lock); + rw_exit(&sockconf_lock); if (sp->sp_smod_info == NULL) { smod_info_t *smod = smod_lookup_byname(sp->sp_smod_name); @@ -807,3 +755,73 @@ solookup(int family, int type, int protocol, struct sockparams **spp) *spp = sp; return (0); } + +/* + * Called when filter entry `ent' is going away. All sockparams remove + * their references to `ent'. + */ +static void +sockparams_filter_cleanup_impl(sof_entry_t *ent, list_t *list) +{ + struct sockparams *sp; + sp_filter_t *fil; + list_t *flist; + + ASSERT(RW_WRITE_HELD(&sockconf_lock)); + + for (sp = list_head(list); sp != NULL; + sp = list_next(list, sp)) { + flist = (ent->sofe_flags & SOFEF_AUTO) ? + &sp->sp_auto_filters : &sp->sp_prog_filters; + fil = list_head(flist); + for (fil = list_head(flist); fil != NULL; + fil = list_next(flist, fil)) { + if (fil->spf_filter == ent) { + list_remove(flist, fil); + kmem_free(fil, sizeof (sp_filter_t)); + break; + } + } + } +} +void +sockparams_filter_cleanup(sof_entry_t *ent) +{ + sockparams_filter_cleanup_impl(ent, &sphead); + sockparams_filter_cleanup_impl(ent, &sp_ephem_list); +} + +/* + * New filter is being added; walk the list of sockparams to see if + * the filter is interested in any of the sockparams. 
+ */ +static int +sockparams_new_filter_impl(sof_entry_t *ent, list_t *list) +{ + struct sockparams *sp; + int err; + + ASSERT(RW_WRITE_HELD(&sockconf_lock)); + + for (sp = list_head(list); sp != NULL; + sp = list_next(list, sp)) { + if ((err = sof_entry_proc_sockparams(ent, sp)) != 0) { + sockparams_filter_cleanup(ent); + return (err); + } + } + return (0); +} + +int +sockparams_new_filter(sof_entry_t *ent) +{ + int error; + + if ((error = sockparams_new_filter_impl(ent, &sphead)) != 0) + return (error); + + if ((error = sockparams_new_filter_impl(ent, &sp_ephem_list)) != 0) + sockparams_filter_cleanup_impl(ent, &sphead); + return (error); +} diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 2a329da653..06d76044e5 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -72,6 +71,7 @@ #include #include +#include #include #include #include @@ -97,6 +97,12 @@ struct socklist socklist; struct kmem_cache *socket_cache; +/* + * sockconf_lock protects the socket configuration (socket types and + * socket filters) which is changed via the sockconfig system call. 
+ */ +krwlock_t sockconf_lock; + static int sockfs_update(kstat_t *, int); static int sockfs_snapshot(kstat_t *, void *, int); extern smod_info_t *sotpi_smod_create(void); @@ -239,6 +245,8 @@ sockinit(int fstype, char *name) sizeof (struct sonode), 0, sonode_constructor, sonode_destructor, NULL, NULL, NULL, 0); + rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL); + error = socktpi_init(); if (error != 0) { err_str = NULL; @@ -288,6 +296,9 @@ sockinit(int fstype, char *name) nl7c_init(); } + /* Initialize socket filters */ + sof_init(); + return (0); failure: diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index 6ce3fac8e6..4b518e632b 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -67,6 +66,7 @@ #include #include +#include #include #ifdef SOCK_TEST @@ -75,7 +75,10 @@ int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */ #define do_useracc 1 #endif /* SOCK_TEST */ -extern int xnet_truncate_print; +extern int xnet_truncate_print; + +extern void nl7c_init(void); +extern int sockfs_defer_nl7c_init; /* * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" @@ -1519,143 +1522,291 @@ done2: return (0); } -/* - * Add config info when name is non-NULL; delete info when name is NULL. - * name could be a device name or a module name and are user address. 
- */ -int -sockconfig(int family, int type, int protocol, char *name) +static int +sockconf_add_sock(int family, int type, int protocol, char *name) { - char *kdevpath = NULL; /* Copied in devpath string */ + int error = 0; + char *kdevpath = NULL; char *kmodule = NULL; + char *buf = NULL; size_t pathlen = 0; - int error = 0; - - dprint(1, ("sockconfig(%d, %d, %d, %p)\n", - family, type, protocol, (void *)name)); - - if (secpolicy_net_config(CRED(), B_FALSE) != 0) - return (set_errno(EPERM)); + struct sockparams *sp; + if (name == NULL) + return (EINVAL); /* - * By default set the kdevpath and kmodule to NULL to delete an entry. - * Otherwise when name is not NULL, set the kdevpath or kmodule - * value to add an entry. + * Copyin the name. + * This also makes it possible to check for too long pathnames. + * Compress the space needed for the name before passing it + * to sockparams_create - the sockparams entry will keep the string + * until the configuration is removed. */ - if (name != NULL) { + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) { + kmem_free(buf, MAXPATHLEN); + return (error); + } + if (strncmp(buf, "/dev", strlen("/dev")) == 0) { + /* For device */ + /* - * Adding an entry. - * Copyin the name. - * This also makes it possible to check for too long pathnames. - * Compress the space needed for the name before passing it - * to soconfig - soconfig will store the string until - * the configuration is removed. + * Special handling for NCA: + * + * DEV_NCA is never opened even if an application + * requests for AF_NCA. The device opened is instead a + * predefined AF_INET transport (NCA_INET_DEV). + * + * Prior to Volo (PSARC/2007/587) NCA would determine + * the device using a lookup, which worked then because + * all protocols were based on TPI. Since TPI is no + * longer the default, we have to explicitly state + * which device to use.
*/ - char *buf; - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); - if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) { - kmem_free(buf, MAXPATHLEN); - goto done; + if (strcmp(buf, NCA_DEV) == 0) { + /* only support entry <28, 2, 0> */ + if (family != AF_NCA || type != SOCK_STREAM || + protocol != 0) { + kmem_free(buf, MAXPATHLEN); + return (EINVAL); + } + + pathlen = strlen(NCA_INET_DEV) + 1; + kdevpath = kmem_alloc(pathlen, KM_SLEEP); + bcopy(NCA_INET_DEV, kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } else { + kdevpath = kmem_alloc(pathlen, KM_SLEEP); + bcopy(buf, kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; } - if (strncmp(buf, "/dev", strlen("/dev")) == 0) { - /* For device */ + } else { + /* For socket module */ + kmodule = kmem_alloc(pathlen, KM_SLEEP); + bcopy(buf, kmodule, pathlen); + kmodule[pathlen - 1] = '\0'; + pathlen = 0; + } + kmem_free(buf, MAXPATHLEN); - /* - * Special handling for NCA: - * - * DEV_NCA is never opened even if an application - * requests for AF_NCA. The device opened is instead a - * predefined AF_INET transport (NCA_INET_DEV). - * - * Prior to Volo (PSARC/2007/587) NCA would determine - * the device using a lookup, which worked then because - * all protocols were based on TPI. Since TPI is no - * longer the default, we have to explicitly state - * which device to use. 
- */ - if (strcmp(buf, NCA_DEV) == 0) { - /* only support entry <28, 2, 0> */ - if (family != AF_NCA || type != SOCK_STREAM || - protocol != 0) { - kmem_free(buf, MAXPATHLEN); - error = EINVAL; - goto done; - } + /* sockparams_create frees mod name and devpath upon failure */ + sp = sockparams_create(family, type, protocol, kmodule, + kdevpath, pathlen, 0, KM_SLEEP, &error); + if (sp != NULL) { + error = sockparams_add(sp); + if (error != 0) + sockparams_destroy(sp); + } - pathlen = strlen(NCA_INET_DEV) + 1; - kdevpath = kmem_alloc(pathlen, KM_SLEEP); - bcopy(NCA_INET_DEV, kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } else { - kdevpath = kmem_alloc(pathlen, KM_SLEEP); - bcopy(buf, kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } - } else { - /* For socket module */ - kmodule = kmem_alloc(pathlen, KM_SLEEP); - bcopy(buf, kmodule, pathlen); - kmodule[pathlen - 1] = '\0'; - - pathlen = 0; - if (strcmp(kmodule, "tcp") == 0) { - /* Get the tcp device name for fallback */ - if (family == 2) { - pathlen = strlen("/dev/tcp") + 1; - kdevpath = kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/tcp", kdevpath, - pathlen); - kdevpath[pathlen - 1] = '\0'; - } else { - ASSERT(family == 26); - pathlen = strlen("/dev/tcp6") + 1; - kdevpath = kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/tcp6", kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } - } else if (strcmp(kmodule, "udp") == 0) { - /* Get the udp device name for fallback */ - if (family == 2) { - pathlen = strlen("/dev/udp") + 1; - kdevpath = kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/udp", kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } else { - ASSERT(family == 26); - pathlen = strlen("/dev/udp6") + 1; - kdevpath = kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/udp6", kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } - } else if (strcmp(kmodule, "icmp") == 0) { - /* Get the icmp device name for fallback */ - if (family == 2) { - pathlen = strlen("/dev/rawip") + 1; - kdevpath = 
kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/rawip", kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } else { - ASSERT(family == 26); - pathlen = strlen("/dev/rawip6") + 1; - kdevpath = kmem_alloc(pathlen, - KM_SLEEP); - bcopy("/dev/rawip6", kdevpath, pathlen); - kdevpath[pathlen - 1] = '\0'; - } + return (error); +} + +static int +sockconf_remove_sock(int family, int type, int protocol) +{ + return (sockparams_delete(family, type, protocol)); +} + +static int +sockconfig_remove_filter(const char *uname) +{ + char kname[SOF_MAXNAMELEN]; + size_t len; + int error; + sof_entry_t *ent; + + if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0) + return (error); + + ent = sof_entry_remove_by_name(kname); + if (ent == NULL) + return (ENXIO); + + mutex_enter(&ent->sofe_lock); + ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED)); + if (ent->sofe_refcnt == 0) { + mutex_exit(&ent->sofe_lock); + sof_entry_free(ent); + } else { + /* let the last socket free the filter */ + ent->sofe_flags |= SOFEF_CONDEMED; + mutex_exit(&ent->sofe_lock); + } + + return (0); +} + +static int +sockconfig_add_filter(const char *uname, void *ufilpropp) +{ + struct sockconfig_filter_props filprop; + sof_entry_t *ent; + int error; + size_t tuplesz, len; + char hintbuf[SOF_MAXNAMELEN]; + + ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP); + mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL); + + if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN, + &len)) != 0) { + sof_entry_free(ent); + return (error); + } + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) { + sof_entry_free(ent); + return (EFAULT); + } + } +#ifdef _SYSCALL32_IMPL + else { + struct sockconfig_filter_props32 filprop32; + + if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) { + sof_entry_free(ent); + return (EFAULT); + } + filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname; + filprop.sfp_autoattach = filprop32.sfp_autoattach; + 
filprop.sfp_hint = filprop32.sfp_hint; + filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg; + filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt; + filprop.sfp_socktuple = + (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple; + } +#endif /* _SYSCALL32_IMPL */ + + if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname, + sizeof (ent->sofe_modname), &len)) != 0) { + sof_entry_free(ent); + return (error); + } + + /* + * A filter must specify at least one socket tuple. + */ + if (filprop.sfp_socktuple_cnt == 0 || + filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) { + sof_entry_free(ent); + return (EINVAL); + } + ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG; + ent->sofe_hint = filprop.sfp_hint; + + /* + * Verify the hint, and copy in the hint argument, if necessary. + */ + switch (ent->sofe_hint) { + case SOF_HINT_BEFORE: + case SOF_HINT_AFTER: + if ((error = copyinstr(filprop.sfp_hintarg, hintbuf, + sizeof (hintbuf), &len)) != 0) { + sof_entry_free(ent); + return (error); + } + ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP); + bcopy(hintbuf, ent->sofe_hintarg, len); + /* FALLTHRU */ + case SOF_HINT_TOP: + case SOF_HINT_BOTTOM: + /* hints cannot be used with programmatic filters */ + if (ent->sofe_flags & SOFEF_PROG) { + sof_entry_free(ent); + return (EINVAL); + } + break; + case SOF_HINT_NONE: + break; + default: + /* bad hint value */ + sof_entry_free(ent); + return (EINVAL); + } + + ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt; + tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt; + ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple, + tuplesz)) { + sof_entry_free(ent); + return (EFAULT); + } + } +#ifdef _SYSCALL32_IMPL + else { + int i; + caddr_t data = (caddr_t)filprop.sfp_socktuple; + sof_socktuple_t *tup = ent->sofe_socktuple; + sof_socktuple32_t tup32; + + tup = ent->sofe_socktuple; + for (i = 
0; i < ent->sofe_socktuple_cnt; i++, tup++) { + ASSERT(tup == &ent->sofe_socktuple[i]); + + if (copyin(data, &tup32, sizeof (tup32)) != 0) { + sof_entry_free(ent); + return (EFAULT); } + tup->sofst_family = tup32.sofst_family; + tup->sofst_type = tup32.sofst_type; + tup->sofst_protocol = tup32.sofst_protocol; + + data += sizeof (tup32); } + } +#endif /* _SYSCALL32_IMPL */ - kmem_free(buf, MAXPATHLEN); + /* Sockets can start using the filter as soon as the filter is added */ + if ((error = sof_entry_add(ent)) != 0) + sof_entry_free(ent); + + return (error); +} + +/* + * Socket configuration system call. It is used to add and remove + * socket types and socket filters. + */ +int +sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) +{ + int error = 0; + + if (secpolicy_net_config(CRED(), B_FALSE) != 0) + return (set_errno(EPERM)); + + if (sockfs_defer_nl7c_init) { + nl7c_init(); + sockfs_defer_nl7c_init = 0; } - error = soconfig(family, type, protocol, kdevpath, (int)pathlen, - kmodule); -done: - if (error) { + + switch (cmd) { + case SOCKCONFIG_ADD_SOCK: + error = sockconf_add_sock((int)(uintptr_t)arg1, + (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4); + break; + case SOCKCONFIG_REMOVE_SOCK: + error = sockconf_remove_sock((int)(uintptr_t)arg1, + (int)(uintptr_t)arg2, (int)(uintptr_t)arg3); + break; + case SOCKCONFIG_ADD_FILTER: + error = sockconfig_add_filter((const char *)arg1, arg2); + break; + case SOCKCONFIG_REMOVE_FILTER: + error = sockconfig_remove_filter((const char *)arg1); + break; + default: +#ifdef DEBUG + cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd); +#endif + error = EINVAL; + break; + } + + if (error != 0) { eprintline(error); return (set_errno(error)); } @@ -1943,9 +2094,15 @@ snf_async_read(snf_req_t *sr) * For sockets acting as an SSL proxy, we * need to adjust the size to the maximum * SSL record size set in the stream head.
+ * + * Socket filters can limit the mblk size, + * so limit reads to maxblk if there are + * filters present. */ - if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) && - SOTOTPI(so)->sti_kssl_ctx != NULL) + if (vp->v_type == VSOCK && + ((!SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_kssl_ctx != NULL) || + (so->so_filter_active > 0 && maxblk != INFPSZ))) iosize = (int)MIN(iosize, maxblk); if (is_system_labeled()) { @@ -2550,9 +2707,14 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, * For sockets acting as an SSL proxy, we * need to adjust the size to the maximum * SSL record size set in the stream head. + * + * Socket filters can limit the mblk size, + * so limit reads to maxblk if filters are present. */ - if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) && - SOTOTPI(so)->sti_kssl_ctx != NULL) + if (vp->v_type == VSOCK && + ((!SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_kssl_ctx != NULL) || + (so->so_filter_active > 0 && maxblk != INFPSZ))) iosize = (int)MIN(iosize, maxblk); if (is_system_labeled()) { @@ -2804,7 +2966,7 @@ solisten(struct sonode *so, int backlog) } int -soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, +soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags) { return (socket_connect(so, name, namelen, fflag, flags, CRED())); diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index de0293e710..a4f9f90a4a 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
*/ #include @@ -214,7 +213,7 @@ static int sotpi_accept(struct sonode *, int, struct cred *, static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t, int, struct cred *); static int sotpi_listen(struct sonode *, int, struct cred *); -static int sotpi_connect(struct sonode *, const struct sockaddr *, +static int sotpi_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, struct cred *); @@ -2231,7 +2230,7 @@ e_bad: */ int sotpi_connect(struct sonode *so, - const struct sockaddr *name, + struct sockaddr *name, socklen_t namelen, int fflag, int flags, @@ -6483,23 +6482,6 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, if (sti->sti_direct != 0) *direct = B_TRUE; - /* - * When it comes to urgent data we have two cases to deal with; - * (1) The oob byte has already arrived, or (2) the protocol has - * notified that oob data is pending, but it has not yet arrived. - * - * For (1) all we need to do is send a T_EXDATA_IND to indicate were - * in the byte stream the oob byte is. For (2) we have to send a - * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether - * the oob byte will be the next byte from the protocol. - * - * So in the worst case we need two mblks, one for the signal, another - * for mark indication. In that case we use the exdata_mp for the sig. - */ - sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED, - STR_NOSIG, NULL); - sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); - /* * Keep the original sp around so we can properly dispose of the * sonode when the socket is being closed. 
@@ -6560,16 +6542,6 @@ sotpi_revert_sonode(struct sonode *so, struct cred *cr) ASSERT(!SOCK_IS_NONSTR(so)); ASSERT(vp->v_stream != NULL); - if (SOTOTPI(so)->sti_exdata_mp != NULL) { - freeb(SOTOTPI(so)->sti_exdata_mp); - SOTOTPI(so)->sti_exdata_mp = NULL; - } - - if (SOTOTPI(so)->sti_urgmark_mp != NULL) { - freeb(SOTOTPI(so)->sti_urgmark_mp); - SOTOTPI(so)->sti_urgmark_mp = NULL; - } - strclean(vp); (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr); @@ -6677,9 +6649,6 @@ i_sotpi_info_constructor(sotpi_info_t *sti) sti->sti_nl7c_uri = NULL; sti->sti_nl7c_rcv_mp = NULL; - sti->sti_exdata_mp = NULL; - sti->sti_urgmark_mp = NULL; - mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL); @@ -6705,9 +6674,6 @@ i_sotpi_info_destructor(sotpi_info_t *sti) ASSERT(sti->sti_nl7c_uri == NULL); ASSERT(sti->sti_nl7c_rcv_mp == NULL); - ASSERT(sti->sti_exdata_mp == NULL); - ASSERT(sti->sti_urgmark_mp == NULL); - mutex_destroy(&sti->sti_plumb_lock); cv_destroy(&sti->sti_ack_cv); } diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h index c8dc101bdd..8044973377 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.h +++ b/usr/src/uts/common/fs/sockfs/socktpi.h @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SOCKFS_SOCKTPI_H @@ -251,12 +250,6 @@ typedef struct sotpi_info { kssl_endpt_type_t sti_kssl_type; /* is proxy/is proxied/none */ kssl_ent_t sti_kssl_ent; /* SSL config entry */ kssl_ctx_t sti_kssl_ctx; /* SSL session context */ - - /* - * The mblks below are only allocated and used during fallback. 
- */ - mblk_t *sti_exdata_mp; /* T_EXDATA_IND or SIGURG */ - mblk_t *sti_urgmark_mp; /* mark indication */ } sotpi_info_t; struct T_capability_ack; diff --git a/usr/src/uts/common/fs/sockfs/sodirect.c b/usr/src/uts/common/fs/sockfs/sodirect.c index e64fca9de6..f30681fdc7 100644 --- a/usr/src/uts/common/fs/sockfs/sodirect.c +++ b/usr/src/uts/common/fs/sockfs/sodirect.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -78,7 +77,7 @@ sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp) if (uiop->uio_resid >= uioasync.mincnt && sodp != NULL && sodp->sod_enabled && uioasync.enabled && !(flags & MSG_PEEK) && - !so->so_proto_props.sopp_loopback && + !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 && !(so->so_state & SS_CANTRCVMORE)) { /* * Big enough I/O for uioa min setup and an sodirect socket diff --git a/usr/src/uts/common/inet/inetddi.c b/usr/src/uts/common/inet/inetddi.c index 6b0cd5839a..a64bf7e978 100644 --- a/usr/src/uts/common/inet/inetddi.c +++ b/usr/src/uts/common/inet/inetddi.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -88,6 +87,12 @@ INET_SOCKDESC must be defined! #elif defined(INET_SOCKDESC) && !defined(INET_SOCK_PROTO_CREATE_FUNC) #error inetddi.c: INET_SOCKDESC is defined but INET_SOCK_PROTO_CREATE_FUNC \ is not! +#elif defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V4) +#error inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \ +INET_SOCK_FALLBACK_DEV_V4 is not! +#elif defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V6) +#error inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \ +INET_SOCK_FALLBACK_DEV_V6 is not! 
#endif #ifdef INET_DEVDESC @@ -216,7 +221,9 @@ static struct modlstrmod modlstrmod = { static __smod_priv_t smodpriv = { NULL, NULL, - INET_SOCK_PROTO_FB_FUNC + INET_SOCK_PROTO_FB_FUNC, + INET_SOCK_FALLBACK_DEV_V4, + INET_SOCK_FALLBACK_DEV_V6 }; #endif /* INET_SOCK_PROTO_FB_FUNC */ diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 7c5ae628be..d67bf624dd 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -5215,7 +5215,8 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* ARGSUSED2 */ int rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, - boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, + sock_quiesce_arg_t *arg) { conn_t *connp = (conn_t *)proto_handle; icmp_t *icmp; @@ -5224,7 +5225,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, socklen_t laddrlen, faddrlen; short opts; struct stroptions *stropt; - mblk_t *stropt_mp; + mblk_t *mp, *stropt_mp; int error; icmp = connp->conn_icmp; @@ -5276,7 +5277,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; - (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, (struct sockaddr *)&laddr, laddrlen, (struct sockaddr *)&faddr, faddrlen, opts); @@ -5285,9 +5286,11 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, * queued in icmp_t. Now we push up any queued packets. 
*/ mutex_enter(&icmp->icmp_recv_lock); + if (mp != NULL) { + mp->b_next = icmp->icmp_fallback_queue_head; + icmp->icmp_fallback_queue_head = mp; + } while (icmp->icmp_fallback_queue_head != NULL) { - mblk_t *mp; - mp = icmp->icmp_fallback_queue_head; icmp->icmp_fallback_queue_head = mp->b_next; mp->b_next = NULL; diff --git a/usr/src/uts/common/inet/ip/icmpddi.c b/usr/src/uts/common/inet/ip/icmpddi.c index dd0023c0c8..0caa9c7f6c 100644 --- a/usr/src/uts/common/inet/ip/icmpddi.c +++ b/usr/src/uts/common/inet/ip/icmpddi.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -40,8 +39,10 @@ #define INET_DEVSTRTAB icmpinfov4 #define INET_MODSTRTAB dummymodinfo #define INET_SOCKDESC "Rawip socket module" -#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create) -#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback) +#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create) +#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback) +#define INET_SOCK_FALLBACK_DEV_V4 "/dev/icmp" +#define INET_SOCK_FALLBACK_DEV_V6 "/dev/icmp6" #define INET_DEVMTFLAGS D_MP #define INET_MODMTFLAGS D_MP diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 71b4f3f228..6fb72d1d08 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -107,7 +106,7 @@ extern void icmp_ddi_g_destroy(void); extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t, - so_proto_quiesced_cb_t); + so_proto_quiesced_cb_t, sock_quiesce_arg_t *); extern sock_downcalls_t sock_rawip_downcalls; diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c index 0f277be716..871e9f71e5 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.c +++ b/usr/src/uts/common/inet/sockmods/socksctp.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -63,7 +62,7 @@ static int sosctp_accept(struct sonode *, int, struct cred *, struct sonode **); static int sosctp_bind(struct sonode *, struct sockaddr *, socklen_t, int, struct cred *); static int sosctp_listen(struct sonode *, int, struct cred *); -static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t, +static int sosctp_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); static int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, struct cred *); @@ -86,7 +85,7 @@ void sosctp_fini(struct sonode *, struct cred *); /* * SCTP sockfs sonode operations, 1-N socket */ -static int sosctp_seq_connect(struct sonode *, const struct sockaddr *, +static int sosctp_seq_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, struct cred *); @@ -352,7 +351,7 @@ done: */ /*ARGSUSED*/ static int -sosctp_connect(struct sonode *so, const struct sockaddr *name, +sosctp_connect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { int error = 0; @@ -433,7 +432,7 
@@ done: * make it so. */ static int -sosctp_seq_connect(struct sonode *so, const struct sockaddr *name, +sosctp_seq_connect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { struct sctp_soassoc *ssa; diff --git a/usr/src/uts/common/inet/sockmods/socksdp.c b/usr/src/uts/common/inet/sockmods/socksdp.c index 3ec9ff5cfb..8841bce55c 100644 --- a/usr/src/uts/common/inet/sockmods/socksdp.c +++ b/usr/src/uts/common/inet/sockmods/socksdp.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -63,7 +62,7 @@ static int sosdp_accept(struct sonode *, int, struct cred *, struct sonode **); static int sosdp_bind(struct sonode *, struct sockaddr *, socklen_t, int, struct cred *); static int sosdp_listen(struct sonode *, int, struct cred *); -static int sosdp_connect(struct sonode *, const struct sockaddr *, socklen_t, +static int sosdp_connect(struct sonode *, struct sockaddr *, socklen_t, int, int, struct cred *); static int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, struct cred *); @@ -325,7 +324,7 @@ done: */ /*ARGSUSED*/ static int -sosdp_connect(struct sonode *so, const struct sockaddr *name, +sosdp_connect(struct sonode *so, struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { int error = 0; @@ -1120,7 +1119,7 @@ sosdp_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= (POLLIN|POLLRDNORM) & events; } - if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_head != NULL)) { + if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_len > 0)) { *reventsp |= (POLLIN|POLLRDNORM) & events; } @@ -1158,7 +1157,7 @@ sosdp_close(struct sonode *so, int flag, struct cred *cr) mutex_enter(&so->so_lock); so_unlock_single(so, SOLOCKED); - so_notify_disconnected(so, error); + so_notify_disconnected(so, 
B_FALSE, error); return (error); } @@ -1266,7 +1265,7 @@ sdp_sock_disconnected(void *handle, int error) ASSERT(so->so_proto_handle != NULL); /* closed conn */ soisdisconnected(so, error); - so_notify_disconnected(so, error); + so_notify_disconnected(so, B_FALSE, error); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 441722acd4..40e78141c9 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -952,6 +952,18 @@ tcp_clean_death(tcp_t *tcp, int err) } } + /* + * ESTABLISHED non-STREAMS eagers are not 'detached' because + * an upper handle is obtained when the SYN-ACK comes in. So it + * should receive the 'disconnected' upcall, but tcp_reinit should + * not be called since this is an eager. + */ + if (tcp->tcp_listener != NULL && IPCL_IS_NONSTR(connp)) { + tcp_closei_local(tcp); + tcp->tcp_state = TCPS_BOUND; + return (0); + } + tcp_reinit(tcp); if (IPCL_IS_NONSTR(connp)) (void) tcp_do_unbind(connp); @@ -1014,15 +1026,23 @@ tcp_stop_lingering(tcp_t *tcp) CONN_DEC_REF(connp); } finish: - /* Signal closing thread that it can complete close */ - mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; connp->conn_rq = NULL; connp->conn_wq = NULL; + /* Signal closing thread that it can complete close */ + mutex_enter(&tcp->tcp_closelock); tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); mutex_exit(&tcp->tcp_closelock); + + /* If we have an upper handle (socket), release it */ + if (IPCL_IS_NONSTR(connp)) { + ASSERT(connp->conn_upper_handle != NULL); + (*connp->conn_upcalls->su_closed)(connp->conn_upper_handle); + connp->conn_upper_handle = NULL; + connp->conn_upcalls = NULL; + } } void @@ -1088,6 +1108,15 @@ tcp_close_common(conn_t *connp, int flags) SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); + /* + * For non-STREAMS sockets, the normal case is that the conn makes + * an upcall when it's finally closed, so there is no 
need to wait + * in the protocol. But in case of SO_LINGER the thread sleeps here + * so it can properly deal with the thread being interrupted. + */ + if (IPCL_IS_NONSTR(connp) && connp->conn_linger == 0) + goto nowait; + mutex_enter(&tcp->tcp_closelock); while (!tcp->tcp_closed) { if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) { @@ -1129,8 +1158,12 @@ tcp_close_common(conn_t *connp, int flags) * conn_wq of the eagers point to our queues. By waiting for the * refcnt to drop to 1, we are sure that the eagers have cleaned * up their queue pointers and also dropped their references to us. + * + * For non-STREAMS sockets we do not have to wait here; the + * listener will instead make a su_closed upcall when the last + * reference is dropped. */ - if (tcp->tcp_wait_for_eagers) { + if (tcp->tcp_wait_for_eagers && !IPCL_IS_NONSTR(connp)) { mutex_enter(&connp->conn_lock); while (connp->conn_ref != 1) { cv_wait(&connp->conn_cv, &connp->conn_lock); @@ -1138,6 +1171,7 @@ tcp_close_common(conn_t *connp, int flags) mutex_exit(&connp->conn_lock); } +nowait: connp->conn_cpid = NOPID; } @@ -1410,6 +1444,22 @@ tcp_free(tcp_t *tcp) * the following code is enough. */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); + + /* + * If this is a non-STREAM socket still holding on to an upper + * handle, release it. As a result of fallback we might also see + * STREAMS based conns with upper handles, in which case there is + * nothing to do other than clearing the field. + */ + if (connp->conn_upper_handle != NULL) { + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_closed)( + connp->conn_upper_handle); + tcp->tcp_detached = B_TRUE; + } + connp->conn_upper_handle = NULL; + connp->conn_upcalls = NULL; + } } /* @@ -3092,103 +3142,19 @@ tcp_do_unbind(conn_t *connp) } /* - * This runs at the tail end of accept processing on the squeue of the - * new connection. + * Collect protocol properties to send to the upper handle. 
*/ -/* ARGSUSED */ void -tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +tcp_get_proto_props(tcp_t *tcp, struct sock_proto_props *sopp) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = connp->conn_rq; - tcp_stack_t *tcps = tcp->tcp_tcps; - /* socket options */ - struct sock_proto_props sopp; - - /* We should just receive a single mblk that fits a T_discon_ind */ - ASSERT(mp->b_cont == NULL); - - /* - * Drop the eager's ref on the listener, that was placed when - * this eager began life in tcp_input_listener. - */ - CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); - if (IPCL_IS_NONSTR(connp)) { - /* Safe to free conn_ind message */ - freemsg(tcp->tcp_conn.tcp_eager_conn_ind); - tcp->tcp_conn.tcp_eager_conn_ind = NULL; - } - - tcp->tcp_detached = B_FALSE; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { - /* - * Someone blewoff the eager before we could finish - * the accept. - * - * The only reason eager exists it because we put in - * a ref on it when conn ind went up. We need to send - * a disconnect indication up while the last reference - * on the eager will be dropped by the squeue when we - * return. - */ - ASSERT(tcp->tcp_listener == NULL); - if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { - if (IPCL_IS_NONSTR(connp)) { - ASSERT(tcp->tcp_issocket); - (*connp->conn_upcalls->su_disconnected)( - connp->conn_upper_handle, tcp->tcp_connid, - ECONNREFUSED); - freemsg(mp); - } else { - struct T_discon_ind *tdi; - - (void) putnextctl1(q, M_FLUSH, FLUSHRW); - /* - * Let us reuse the incoming mblk to avoid - * memory allocation failure problems. We know - * that the size of the incoming mblk i.e. - * stroptions is greater than sizeof - * T_discon_ind. 
- */ - ASSERT(DB_REF(mp) == 1); - ASSERT(MBLKSIZE(mp) >= - sizeof (struct T_discon_ind)); - - DB_TYPE(mp) = M_PROTO; - ((union T_primitives *)mp->b_rptr)->type = - T_DISCON_IND; - tdi = (struct T_discon_ind *)mp->b_rptr; - if (tcp->tcp_issocket) { - tdi->DISCON_reason = ECONNREFUSED; - tdi->SEQ_number = 0; - } else { - tdi->DISCON_reason = ENOPROTOOPT; - tdi->SEQ_number = - tcp->tcp_conn_req_seqnum; - } - mp->b_wptr = mp->b_rptr + - sizeof (struct T_discon_ind); - putnext(q, mp); - } - } - tcp->tcp_hard_binding = B_FALSE; - return; - } - - /* - * This is the first time we run on the correct - * queue after tcp_accept. So fix all the q parameters - * here. - */ - sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; - sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + sopp->sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; + sopp->sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - sopp.sopp_rxhiwat = tcp->tcp_fused ? + sopp->sopp_rxhiwat = tcp->tcp_fused ? tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : connp->conn_rcvbuf; - /* * Determine what write offset value to use depending on SACK and * whether the endpoint is fused or not. @@ -3203,18 +3169,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) * since it would reduce the amount of work done by kmem. * Non-fused tcp loopback case is handled separately below. */ - sopp.sopp_wroff = 0; + sopp->sopp_wroff = 0; /* * Update the peer's transmit parameters according to * our recently calculated high water mark value. */ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { - sopp.sopp_wroff = connp->conn_ht_iphc_allocated + - (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); + sopp->sopp_wroff = connp->conn_ht_iphc_allocated + + (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); } else { - sopp.sopp_wroff = connp->conn_ht_iphc_len + - (tcp->tcp_loopback ? 
0 : tcps->tcps_wroff_xtra); + sopp->sopp_wroff = connp->conn_ht_iphc_len + + (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); } /* @@ -3239,297 +3205,10 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN; } - - /* Send the options up */ - if (IPCL_IS_NONSTR(connp)) { - if (sopp.sopp_flags & SOCKOPT_TAIL) { - ASSERT(tcp->tcp_kssl_ctx != NULL); - ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY); - } - if (tcp->tcp_loopback) { - sopp.sopp_flags |= SOCKOPT_LOOPBACK; - sopp.sopp_loopback = B_TRUE; - } - (*connp->conn_upcalls->su_set_proto_props) - (connp->conn_upper_handle, &sopp); - freemsg(mp); - } else { - /* - * Let us reuse the incoming mblk to avoid - * memory allocation failure problems. We know - * that the size of the incoming mblk is at least - * stroptions - */ - struct stroptions *stropt; - - ASSERT(DB_REF(mp) == 1); - ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); - - DB_TYPE(mp) = M_SETOPTS; - stropt = (struct stroptions *)mp->b_rptr; - mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); - stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_hiwat = sopp.sopp_rxhiwat; - stropt->so_wroff = sopp.sopp_wroff; - stropt->so_maxblk = sopp.sopp_maxblk; - - if (sopp.sopp_flags & SOCKOPT_TAIL) { - ASSERT(tcp->tcp_kssl_ctx != NULL); - - stropt->so_flags |= SO_TAIL | SO_COPYOPT; - stropt->so_tail = sopp.sopp_tail; - stropt->so_copyopt = sopp.sopp_zcopyflag; - } - - /* Send the options up */ - putnext(q, mp); - } - - /* - * Pass up any data and/or a fin that has been received. - * - * Adjust receive window in case it had decreased - * (because there is data <=> tcp_rcv_list != NULL) - * while the connection was detached. Note that - * in case the eager was flow-controlled, w/o this - * code, the rwnd may never open up again! 
- */ - if (tcp->tcp_rcv_list != NULL) { - if (IPCL_IS_NONSTR(connp)) { - mblk_t *mp; - int space_left; - int error; - boolean_t push = B_TRUE; - - if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, NULL, 0, 0, &error, - &push) >= 0) { - tcp->tcp_rwnd = connp->conn_rcvbuf; - if (tcp->tcp_state >= TCPS_ESTABLISHED && - tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { - tcp_xmit_ctl(NULL, - tcp, (tcp->tcp_swnd == 0) ? - tcp->tcp_suna : tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - } - } - while ((mp = tcp->tcp_rcv_list) != NULL) { - push = B_TRUE; - tcp->tcp_rcv_list = mp->b_next; - mp->b_next = NULL; - space_left = (*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), - 0, &error, &push); - if (space_left < 0) { - /* - * We should never be in middle of a - * fallback, the squeue guarantees that. - */ - ASSERT(error != EOPNOTSUPP); - } - } - tcp->tcp_rcv_last_head = NULL; - tcp->tcp_rcv_last_tail = NULL; - tcp->tcp_rcv_cnt = 0; - } else { - /* We drain directly in case of fused tcp loopback */ - - if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = connp->conn_rcvbuf; - if (tcp->tcp_state >= TCPS_ESTABLISHED && - tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { - tcp_xmit_ctl(NULL, - tcp, (tcp->tcp_swnd == 0) ? - tcp->tcp_suna : tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - } - } - - (void) tcp_rcv_drain(tcp); - } - - /* - * For fused tcp loopback, back-enable peer endpoint - * if it's currently flow-controlled. 
- */ - if (tcp->tcp_fused) { - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - - ASSERT(peer_tcp != NULL); - ASSERT(peer_tcp->tcp_fused); - - mutex_enter(&peer_tcp->tcp_non_sq_lock); - if (peer_tcp->tcp_flow_stopped) { - tcp_clrqfull(peer_tcp); - TCP_STAT(tcps, tcp_fusion_backenabled); - } - mutex_exit(&peer_tcp->tcp_non_sq_lock); - } - } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); - if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { - tcp->tcp_ordrel_done = B_TRUE; - if (IPCL_IS_NONSTR(connp)) { - ASSERT(tcp->tcp_ordrel_mp == NULL); - (*connp->conn_upcalls->su_opctl)( - connp->conn_upper_handle, - SOCK_OPCTL_SHUT_RECV, 0); - } else { - mp = tcp->tcp_ordrel_mp; - tcp->tcp_ordrel_mp = NULL; - putnext(q, mp); - } - } - tcp->tcp_hard_binding = B_FALSE; - - if (connp->conn_keepalive) { - tcp->tcp_ka_last_intrvl = 0; - tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, - tcp->tcp_ka_interval); - } - - /* - * At this point, eager is fully established and will - * have the following references - - * - * 2 references for connection to exist (1 for TCP and 1 for IP). - * 1 reference for the squeue which will be dropped by the squeue as - * soon as this function returns. - * There will be 1 additonal reference for being in classifier - * hash list provided something bad hasn't happened. - */ - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); -} - -/* - * Common to TPI and sockfs accept code. - */ -/* ARGSUSED2 */ -int -tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) -{ - tcp_t *listener, *eager; - mblk_t *discon_mp; - - listener = lconnp->conn_tcp; - ASSERT(listener->tcp_state == TCPS_LISTEN); - eager = econnp->conn_tcp; - ASSERT(eager->tcp_listener != NULL); - - /* - * Pre allocate the discon_ind mblk also. tcp_accept_finish will - * use it if something failed. 
- */ - discon_mp = allocb(MAX(sizeof (struct T_discon_ind), - sizeof (struct stroptions)), BPRI_HI); - - if (discon_mp == NULL) { - return (-TPROTO); - } - eager->tcp_issocket = B_TRUE; - - econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; - econnp->conn_allzones = listener->tcp_connp->conn_allzones; - ASSERT(econnp->conn_netstack == - listener->tcp_connp->conn_netstack); - ASSERT(eager->tcp_tcps == listener->tcp_tcps); - - /* Put the ref for IP */ - CONN_INC_REF(econnp); - - /* - * We should have minimum of 3 references on the conn - * at this point. One each for TCP and IP and one for - * the T_conn_ind that was sent up when the 3-way handshake - * completed. In the normal case we would also have another - * reference (making a total of 4) for the conn being in the - * classifier hash list. However the eager could have received - * an RST subsequently and tcp_closei_local could have removed - * the eager from the classifier hash list, hence we can't - * assert that reference. - */ - ASSERT(econnp->conn_ref >= 3); - - mutex_enter(&listener->tcp_eager_lock); - if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { - - tcp_t *tail; - tcp_t *tcp; - mblk_t *mp1; - - tcp = listener->tcp_eager_prev_q0; - /* - * listener->tcp_eager_prev_q0 points to the TAIL of the - * deferred T_conn_ind queue. We need to get to the head - * of the queue in order to send up T_conn_ind the same - * order as how the 3WHS is completed. 
- */ - while (tcp != listener) { - if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && - !tcp->tcp_kssl_pending) - break; - else - tcp = tcp->tcp_eager_prev_q0; - } - /* None of the pending eagers can be sent up now */ - if (tcp == listener) - goto no_more_eagers; - - mp1 = tcp->tcp_conn.tcp_eager_conn_ind; - tcp->tcp_conn.tcp_eager_conn_ind = NULL; - /* Move from q0 to q */ - ASSERT(listener->tcp_conn_req_cnt_q0 > 0); - listener->tcp_conn_req_cnt_q0--; - listener->tcp_conn_req_cnt_q++; - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - tcp->tcp_eager_prev_q0 = NULL; - tcp->tcp_eager_next_q0 = NULL; - tcp->tcp_conn_def_q0 = B_FALSE; - - /* Make sure the tcp isn't in the list of droppables */ - ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && - tcp->tcp_eager_prev_drop_q0 == NULL); - - /* - * Insert at end of the queue because sockfs sends - * down T_CONN_RES in chronological order. Leaving - * the older conn indications at front of the queue - * helps reducing search time. - */ - tail = listener->tcp_eager_last_q; - if (tail != NULL) { - tail->tcp_eager_next_q = tcp; - } else { - listener->tcp_eager_next_q = tcp; - } - listener->tcp_eager_last_q = tcp; - tcp->tcp_eager_next_q = NULL; - - /* Need to get inside the listener perimeter */ - CONN_INC_REF(listener->tcp_connp); - SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, - tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, - SQTAG_TCP_SEND_PENDING); + if (tcp->tcp_loopback) { + sopp->sopp_flags |= SOCKOPT_LOOPBACK; + sopp->sopp_loopback = B_TRUE; } -no_more_eagers: - tcp_eager_unlink(eager); - mutex_exit(&listener->tcp_eager_lock); - - /* - * At this point, the eager is detached from the listener - * but we still have an extra refs on eager (apart from the - * usual tcp references). The ref was placed in tcp_input_data - * before sending the conn_ind in tcp_send_conn_ind. 
- * The ref will be dropped in tcp_accept_finish(). - */ - SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, - econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); - return (0); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index c8f50cee8f..81640b8329 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -233,8 +233,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha) mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_WROFF; + stropt->so_flags = SO_WROFF | SO_MAXBLK; stropt->so_wroff = 0; + stropt->so_maxblk = INFPSZ; /* Send the options up */ putnext(peer_rq, mp); @@ -244,8 +245,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha) /* The peer is a non-STREAMS end point */ ASSERT(IPCL_IS_TCP(peer_connp)); - sopp.sopp_flags = SOCKOPT_WROFF; + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK; sopp.sopp_wroff = 0; + sopp.sopp_maxblk = INFPSZ; (*peer_connp->conn_upcalls->su_set_proto_props) (peer_connp->conn_upper_handle, &sopp); } diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index ce00372741..4b5a37a13c 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -1542,14 +1542,14 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) eager->tcp_kssl_pending = B_TRUE; } + ASSERT(eager->tcp_ordrel_mp == NULL); + /* Inherit the listener's non-STREAMS flag */ if (IPCL_IS_NONSTR(lconnp)) { econnp->conn_flags |= IPCL_NONSTR; - } - - ASSERT(eager->tcp_ordrel_mp == NULL); - - if (!IPCL_IS_NONSTR(econnp)) { + /* All non-STREAMS tcp_ts are sockets */ + eager->tcp_issocket = B_TRUE; + } else { /* * Pre-allocate the T_ordrel_ind mblk for TPI socket so that * at close time, we will always have that to send up. 
@@ -1632,7 +1632,7 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* * Since we will clear tcp_listener before we clear tcp_detached * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress - * so we can tell a TCP_DETACHED_NONEAGER apart. + * so we can tell a TCP_IS_DETACHED_NONEAGER apart. */ eager->tcp_hard_binding = B_TRUE; @@ -2003,8 +2003,6 @@ tcp_rcv_drain(tcp_t *tcp) * some work. */ if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { - ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) || - tcp->tcp_fused_sigurg_mp != NULL); if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : &tcp->tcp_fused_sigurg_mp)) return (ret); @@ -3588,14 +3586,79 @@ process_ack: if (bytes_acked > 0) tcp->tcp_ip_forward_progress = B_TRUE; if (tcp->tcp_state == TCPS_SYN_RCVD) { - if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && - ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { - /* 3-way handshake complete - pass up the T_CONN_IND */ + /* + * tcp_sendmsg() checks tcp_state without entering + * the squeue so tcp_state should be updated before + * sending up a connection confirmation or a new + * connection indication. + */ + tcp->tcp_state = TCPS_ESTABLISHED; + + /* + * We are seeing the final ack in the three way + * hand shake of an active open'ed connection + * so we must send up a T_CONN_CON + */ + if (tcp->tcp_active_open) { + if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { + freemsg(mp); + tcp->tcp_state = TCPS_SYN_RCVD; + return; + } + /* + * Don't fuse the loopback endpoints for + * simultaneous active opens. + */ + if (tcp->tcp_loopback) { + TCP_STAT(tcps, tcp_fusion_unfusable); + tcp->tcp_unfusable = B_TRUE; + } + /* + * For simultaneous active open, trace receipt of final + * ACK as tcp:::connect-established.
+ */ + DTRACE_TCP5(connect__established, mblk_t *, NULL, + ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, + iphdr, tcp_t *, tcp, tcph_t *, tcpha); + } else if (IPCL_IS_NONSTR(connp)) { + /* + * 3-way handshake has completed, so notify socket + * of the new connection. + * + * We are here means eager is fine but it can + * get a TH_RST at any point between now and till + * accept completes and disappear. We need to + * ensure that reference to eager is valid after + * we get out of eager's perimeter. So we do + * an extra refhold. + */ + CONN_INC_REF(connp); + + if (!tcp_newconn_notify(tcp, ira)) { + freemsg(mp); + /* notification did not go up, so drop ref */ + CONN_DEC_REF(connp); + return; + } + /* + * For passive open, trace receipt of final ACK as + * tcp:::accept-established. + */ + DTRACE_TCP5(accept__established, mlbk_t *, NULL, + ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, + iphdr, tcp_t *, tcp, tcph_t *, tcpha); + } else if (((tcp->tcp_kssl_ent == NULL) || + !tcp->tcp_kssl_pending)) { + /* + * 3-way handshake complete - this is a STREAMS based + * socket, so pass up the T_CONN_IND. + */ tcp_t *listener = tcp->tcp_listener; mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; tcp->tcp_tconnind_started = B_TRUE; tcp->tcp_conn.tcp_eager_conn_ind = NULL; + ASSERT(mp != NULL); /* * We are here means eager is fine but it can * get a TH_RST at any point between now and till @@ -3638,43 +3701,6 @@ process_ack: listener->tcp_connp, NULL, SQ_NODRAIN, SQTAG_TCP_CONN_IND); } - } - - /* - * We are seeing the final ack in the three way - * hand shake of a active open'ed connection - * so we must send up a T_CONN_CON - * - * tcp_sendmsg() checks tcp_state without entering - * the squeue so tcp_state should be updated before - * sending up connection confirmation. Probe the state - * change below when we are sure sending of the confirmation - * has succeeded. 
- */ - tcp->tcp_state = TCPS_ESTABLISHED; - - if (tcp->tcp_active_open) { - if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { - freemsg(mp); - tcp->tcp_state = TCPS_SYN_RCVD; - return; - } - /* - * Don't fuse the loopback endpoints for - * simultaneous active opens. - */ - if (tcp->tcp_loopback) { - TCP_STAT(tcps, tcp_fusion_unfusable); - tcp->tcp_unfusable = B_TRUE; - } - /* - * For simultaneous active open, trace receipt of final - * ACK as tcp:::connect-established. - */ - DTRACE_TCP5(connect__established, mblk_t *, NULL, - ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *, - iphdr, tcp_t *, tcp, tcph_t *, tcpha); - } else { /* * For passive open, trace receipt of final ACK as * tcp:::accept-established. @@ -4454,13 +4480,14 @@ est: tcpha->tha_ack = htonl(tcp->tcp_rnxt); /* - * Generate the ordrel_ind at the end unless we - * are an eager guy. - * In the eager case tcp_rsrv will do this when run - * after tcp_accept is done. + * Generate the ordrel_ind at the end unless the + * conn is detached or it is a STREAMS based eager. + * In the eager case we defer the notification until + * tcp_accept_finish has run. */ - if (tcp->tcp_listener == NULL && - !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) + if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) || + (tcp->tcp_listener == NULL && + !tcp->tcp_hard_binding))) flags |= TH_ORDREL_NEEDED; switch (tcp->tcp_state) { case TCPS_SYN_RCVD: @@ -4599,25 +4626,7 @@ update_ack: return; } - if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { - /* - * Side queue inbound data until the accept happens. - * tcp_accept/tcp_rput drains this when the accept happens. - * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or - * T_EXDATA_IND) it is queued on b_next. - * XXX Make urgent data use this. 
Requires: - * Removing tcp_listener check for TH_URG - * Making M_PCPROTO and MARK messages skip the eager case - */ - - if (tcp->tcp_kssl_pending) { - DTRACE_PROBE1(kssl_mblk__ksslinput_pending, - mblk_t *, mp); - tcp_kssl_input(tcp, mp, ira->ira_cred); - } else { - tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); - } - } else if (IPCL_IS_NONSTR(connp)) { + if (IPCL_IS_NONSTR(connp)) { /* * Non-STREAMS socket * @@ -4641,8 +4650,26 @@ update_ack: /* PUSH bit set and sockfs is not flow controlled */ flags |= tcp_rwnd_reopen(tcp); } + } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { + /* + * Side queue inbound data until the accept happens. + * tcp_accept/tcp_rput drains this when the accept happens. + * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or + * T_EXDATA_IND) it is queued on b_next. + * XXX Make urgent data use this. Requires: + * Removing tcp_listener check for TH_URG + * Making M_PCPROTO and MARK messages skip the eager case + */ + + if (tcp->tcp_kssl_pending) { + DTRACE_PROBE1(kssl_mblk__ksslinput_pending, + mblk_t *, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); + } else { + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); + } } else { - /* STREAMS socket */ + /* Active STREAMS socket */ if (mp->b_datap->db_type != M_DATA || (flags & TH_MARKNEXT_NEEDED)) { if (tcp->tcp_rcv_list != NULL) { @@ -4858,11 +4885,14 @@ ack_check: } if (flags & TH_ORDREL_NEEDED) { /* - * Send up the ordrel_ind unless we are an eager guy. - * In the eager case tcp_rsrv will do this when run - * after tcp_accept is done. + * Notify upper layer about an orderly release. If this is + * a non-STREAMS socket, then just make an upcall. For STREAMS + * we send up an ordrel_ind, unless this is an eager, in which + * case the ordrel will be sent when tcp_accept_finish runs. + * Note that for non-STREAMS we make an upcall even if it is an + * eager, because we have an upper handle to send it to. 
*/ - ASSERT(tcp->tcp_listener == NULL); + ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL); ASSERT(!tcp->tcp_detached); if (IPCL_IS_NONSTR(connp)) { diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c index a93c5bce9e..249df69de5 100644 --- a/usr/src/uts/common/inet/tcp/tcp_output.c +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -1465,13 +1465,24 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) clock_t delta = 0; tcp_stack_t *tcps = tcp->tcp_tcps; - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); + /* + * When a non-STREAMS socket is being closed, it does not always + * stick around waiting for tcp_close_output to run and can therefore + * have dropped a reference already. So adjust the asserts accordingly. + */ + ASSERT((connp->conn_fanout != NULL && + connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) || + (connp->conn_fanout == NULL && + connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3))); mutex_enter(&tcp->tcp_eager_lock); if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { - /* Cleanup for listener */ - tcp_eager_cleanup(tcp, 0); + /* + * Cleanup for listener. For non-STREAM sockets sockfs will + * close all the eagers on 'q', so in that case only deal + * with 'q0'. + */ + tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0); tcp->tcp_wait_for_eagers = 1; } mutex_exit(&tcp->tcp_eager_lock); @@ -1516,14 +1527,37 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) msg = "tcp_close, unread data"; break; } + /* - * We have done a qwait() above which could have possibly - * drained more messages in turn causing transition to a - * different state. Check whether we have to do the rest - * of the processing or not. + * Abort connection if it is being closed without first + * being accepted. 
This can happen if a listening non-STREAM + * socket wants to get rid of the socket, for example, if the + * listener is closing. */ - if (tcp->tcp_state <= TCPS_LISTEN) + if (tcp->tcp_listener != NULL) { + ASSERT(IPCL_IS_NONSTR(connp)); + msg = "tcp_close, close before accept"; + + /* + * Unlink from the listener and drop the reference + * put on it by the eager. tcp_closei_local will not + * do it because tcp_tconnind_started is TRUE. + */ + mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock); + tcp_eager_unlink(tcp); + mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock); + CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); + + /* + * If the conn has received a RST, the only thing + * left to do is to drop the ref. + */ + if (tcp->tcp_state <= TCPS_BOUND) { + CONN_DEC_REF(tcp->tcp_connp); + return; + } break; + } /* * Transmit the FIN before detaching the tcp_t. @@ -1593,7 +1627,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); TCP_DBGSTAT(tcps, tcp_detach_time_wait); - ASSERT(connp->conn_ref >= 3); + ASSERT(connp->conn_ref >= + (IPCL_IS_NONSTR(connp) ? 2 : 3)); goto finish; } @@ -1606,7 +1641,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, delta ? delta : 1); - ASSERT(connp->conn_ref >= 3); + ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)); goto finish; } @@ -1623,22 +1658,35 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) tcp_closei_local(tcp); CONN_DEC_REF(connp); - ASSERT(connp->conn_ref >= 2); + ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2)); finish: - mutex_enter(&tcp->tcp_closelock); /* * Don't change the queues in the case of a listener that has * eagers in its q or q0. It could surprise the eagers. * Instead wait for the eagers outside the squeue. 
+ * + * For non-STREAMS sockets tcp_wait_for_eagers implies that + * we should delay the su_closed upcall until all eagers have + * dropped their references. */ if (!tcp->tcp_wait_for_eagers) { tcp->tcp_detached = B_TRUE; connp->conn_rq = NULL; connp->conn_wq = NULL; + + /* non-STREAM socket, release the upper handle */ + if (IPCL_IS_NONSTR(connp)) { + ASSERT(connp->conn_upper_handle != NULL); + (*connp->conn_upcalls->su_closed) + (connp->conn_upper_handle); + connp->conn_upper_handle = NULL; + connp->conn_upcalls = NULL; + } } /* Signal tcp_close() to finish closing. */ + mutex_enter(&tcp->tcp_closelock); tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); mutex_exit(&tcp->tcp_closelock); diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 4b50c65cc6..f5df6b156c 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -33,6 +33,7 @@ #include #include #include +#define _SUN_TPI_VERSION 2 #include #include #include @@ -121,6 +122,7 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); } +/*ARGSUSED*/ static int tcp_accept(sock_lower_handle_t lproto_handle, sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, @@ -135,18 +137,59 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp = (conn_t *)eproto_handle; eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); + ASSERT(IPCL_IS_NONSTR(econnp)); + ASSERT(lconnp->conn_upper_handle != NULL); /* - * It is OK to manipulate these fields outside the eager's squeue - * because they will not start being used until tcp_accept_finish - * has been called. + * It is possible for the accept thread to race with the thread that + * made the su_newconn upcall in tcp_newconn_notify. 
Both + * tcp_newconn_notify and tcp_accept require that conn_upper_handle + * and conn_upcalls be set before returning, so they both write to + * them. However, we're guaranteed that the value written is the same + * for both threads. */ - ASSERT(lconnp->conn_upper_handle != NULL); - ASSERT(econnp->conn_upper_handle == NULL); + ASSERT(econnp->conn_upper_handle == NULL || + econnp->conn_upper_handle == sock_handle); + ASSERT(econnp->conn_upcalls == NULL || + econnp->conn_upcalls == lconnp->conn_upcalls); econnp->conn_upper_handle = sock_handle; econnp->conn_upcalls = lconnp->conn_upcalls; - ASSERT(IPCL_IS_NONSTR(econnp)); - return (tcp_accept_common(lconnp, econnp, cr)); + + ASSERT(econnp->conn_netstack == + listener->tcp_connp->conn_netstack); + ASSERT(eager->tcp_tcps == listener->tcp_tcps); + + /* + * We should have a minimum of 2 references on the conn at this + * point. One for TCP and one for the newconn notification + * (which is now taken over by IP). In the normal case we would + * also have another reference (making a total of 3) for the conn + * being in the classifier hash list. However the eager could have + * received an RST subsequently and tcp_closei_local could have + * removed the eager from the classifier hash list, hence we can't + * assert that reference. + */ + ASSERT(econnp->conn_ref >= 2); + + /* + * An error is returned if this conn has been reset, which will + * cause the socket to be closed immediately. The eager will be + * unlinked from the listener during close. + */ + if (eager->tcp_state < TCPS_ESTABLISHED) + return (ECONNABORTED); + + mutex_enter(&listener->tcp_eager_lock); + /* + * Non-STREAMS listeners never defer the notification of new + * connections. 
+ */ + ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); + tcp_eager_unlink(eager); + mutex_exit(&listener->tcp_eager_lock); + CONN_DEC_REF(listener->tcp_connp); + + return (0); } static int @@ -188,14 +231,12 @@ tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, return (error); } -/* - * SOP_LISTEN() calls into tcp_listen(). - */ /* ARGSUSED */ static int tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; int error; ASSERT(connp->conn_upper_handle != NULL); @@ -211,8 +252,14 @@ tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); if (error == 0) { + /* + * sockfs needs to know the maximum number of sockets + * that can be queued on the listener. + */ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, - SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog); + SOCK_OPCTL_ENAB_ACCEPT, + (uintptr_t)(tcp->tcp_conn_req_max + + tcp->tcp_tcps->tcps_conn_req_max_q0)); } else if (error < 0) { if (error == -TOUTSTATE) error = EINVAL; @@ -296,7 +343,6 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, conn_t *connp = (conn_t *)proto_handle; tcp_t *tcp = connp->conn_tcp; - ASSERT(connp->conn_upper_handle != NULL); /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); @@ -317,7 +363,6 @@ tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - ASSERT(connp->conn_upper_handle != NULL); return (conn_getsockname(connp, addr, addrlenp)); } @@ -694,7 +739,12 @@ tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) * packets in squeue for the timewait state. */ CONN_DEC_REF(connp); - return (0); + + /* + * EINPROGRESS tells sockfs to wait for a 'closed' upcall before + * freeing the socket.
+ */ + return (EINPROGRESS); } /* ARGSUSED */ @@ -737,9 +787,206 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return ((sock_lower_handle_t)connp); } +/* + * tcp_fallback + * + * A direct socket is falling back to using STREAMS. The queue + * that is being passed down was created using tcp_open() with + * the SO_FALLBACK flag set. As a result, the queue is not + * associated with a conn, and the q_ptrs instead contain the + * dev and minor area that should be used. + * + * The 'issocket' flag indicates whether the FireEngine + * optimizations should be used. The common case would be that + * optimizations are enabled, and they might be subsequently + * disabled using the _SIOCSOCKFALLBACK ioctl. + */ + +/* + * An active connection is falling back to TPI. Gather all the information + * required by the STREAM head and TPI sonode and send it up. + */ +static void +tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, + boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, + sock_quiesce_arg_t *arg) +{ + conn_t *connp = tcp->tcp_connp; + struct stroptions *stropt; + struct T_capability_ack tca; + struct sockaddr_in6 laddr, faddr; + socklen_t laddrlen, faddrlen; + short opts; + int error; + mblk_t *mp, *mpnext; + + connp->conn_dev = (dev_t)RD(q)->q_ptr; + connp->conn_minor_arena = WR(q)->q_ptr; + + RD(q)->q_ptr = WR(q)->q_ptr = connp; + + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); + + WR(q)->q_qinfo = &tcp_sock_winit; + + if (!issocket) + tcp_use_pure_tpi(tcp); + + /* + * free the helper stream + */ + ip_free_helper_stream(connp); + + /* + * Notify the STREAM head about options + */ + DB_TYPE(stropt_mp) = M_SETOPTS; + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt_mp->b_wptr += sizeof (struct stroptions); + stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; + + stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 
0 : + tcp->tcp_tcps->tcps_wroff_xtra); + if (tcp->tcp_snd_sack_ok) + stropt->so_wroff += TCPOPT_MAX_SACK_LEN; + stropt->so_hiwat = connp->conn_rcvbuf; + stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + + putnext(RD(q), stropt_mp); + + /* + * Collect the information needed to sync with the sonode + */ + tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); + + laddrlen = faddrlen = sizeof (sin6_t); + (void) tcp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, CRED()); + error = tcp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, CRED()); + if (error != 0) + faddrlen = 0; + + opts = 0; + if (connp->conn_oobinline) + opts |= SO_OOBINLINE; + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) + opts |= SO_DONTROUTE; + + /* + * Notify the socket that the protocol is now quiescent, + * and it's therefore safe move data from the socket + * to the stream head. + */ + mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, + (struct sockaddr *)&laddr, laddrlen, + (struct sockaddr *)&faddr, faddrlen, opts); + + while (mp != NULL) { + mpnext = mp->b_next; + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + putnext(q, mp); + mp = mpnext; + } + ASSERT(tcp->tcp_rcv_last_head == NULL); + ASSERT(tcp->tcp_rcv_last_tail == NULL); + ASSERT(tcp->tcp_rcv_cnt == 0); + + /* + * All eagers in q0 are marked as being non-STREAM, so they will + * make su_newconn upcalls when the handshake completes, which + * will fail (resulting in the conn being closed). So we just blow + * off everything in q0 instead of waiting for the inevitable. + */ + if (tcp->tcp_conn_req_cnt_q0 != 0) + tcp_eager_cleanup(tcp, B_TRUE); +} + +/* + * An eager is falling back to TPI. All we have to do is send + * up a T_CONN_IND. 
+ */ +static void +tcp_fallback_eager(tcp_t *eager, boolean_t issocket, + so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) +{ + conn_t *connp = eager->tcp_connp; + tcp_t *listener = eager->tcp_listener; + mblk_t *mp; + + ASSERT(listener != NULL); + + /* + * Notify the socket that the protocol is now quiescent, + * and it's therefore safe move data from the socket + * to tcp's rcv queue. + */ + mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, + NULL, 0, 0); + + if (mp != NULL) { + ASSERT(eager->tcp_rcv_cnt == 0); + + eager->tcp_rcv_list = mp; + eager->tcp_rcv_cnt = msgdsize(mp); + while (mp->b_next != NULL) { + mp = mp->b_next; + eager->tcp_rcv_cnt += msgdsize(mp); + } + eager->tcp_rcv_last_head = mp; + while (mp->b_cont) + mp = mp->b_cont; + eager->tcp_rcv_last_tail = mp; + if (eager->tcp_rcv_cnt > eager->tcp_rwnd) + eager->tcp_rwnd = 0; + else + eager->tcp_rwnd -= eager->tcp_rcv_cnt; + } + + if (!issocket) + eager->tcp_issocket = B_FALSE; + /* + * The stream for this eager does not yet exist, so mark it as + * being detached. + */ + eager->tcp_detached = B_TRUE; + eager->tcp_hard_binding = B_TRUE; + connp->conn_rq = listener->tcp_connp->conn_rq; + connp->conn_wq = listener->tcp_connp->conn_wq; + + /* Send up the connection indication */ + mp = eager->tcp_conn.tcp_eager_conn_ind; + ASSERT(mp != NULL); + eager->tcp_conn.tcp_eager_conn_ind = NULL; + + /* + * TLI/XTI applications will get confused by + * sending eager as an option since it violates + * the option semantics. So remove the eager as + * option since TLI/XTI app doesn't need it anyway. + */ + if (!issocket) { + struct T_conn_ind *conn_ind; + + conn_ind = (struct T_conn_ind *)mp->b_rptr; + conn_ind->OPT_length = 0; + conn_ind->OPT_offset = 0; + } + + /* + * Sockfs guarantees that the listener will not be closed + * during fallback. So we can safely use the listener's queue. 
+ */ + putnext(listener->tcp_connp->conn_rq, mp); +} + + int tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, - boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, + sock_quiesce_arg_t *arg) { tcp_t *tcp; conn_t *connp = (conn_t *)proto_handle; @@ -768,14 +1015,6 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* failed to enter, free all the pre-allocated messages. */ freeb(stropt_mp); freeb(ordrel_mp); - /* - * We cannot process the eager, so at least send out a - * RST so the peer can reconnect. - */ - if (tcp->tcp_listener != NULL) { - (void) tcp_eager_blowoff(tcp->tcp_listener, - tcp->tcp_conn_req_seqnum); - } return (ENOMEM); } @@ -787,21 +1026,24 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (tcp->tcp_fused) tcp_unfuse(tcp); - /* - * No longer a direct socket - */ - connp->conn_flags &= ~IPCL_NONSTR; - tcp->tcp_ordrel_mp = ordrel_mp; - if (tcp->tcp_listener != NULL) { /* The eager will deal with opts when accept() is called */ freeb(stropt_mp); - tcp_fallback_eager(tcp, direct_sockfs); + tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); } else { tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, - quiesced_cb); + quiesced_cb, arg); } + /* + * No longer a direct socket + * + * Note that we intentionally leave the upper_handle and upcalls + * intact, since eagers may still be using them. + */ + connp->conn_flags &= ~IPCL_NONSTR; + tcp->tcp_ordrel_mp = ordrel_mp; + /* * There should be atleast two ref's (IP + TCP) */ @@ -810,3 +1052,141 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, return (0); } + +/* + * Notifies a non-STREAMS based listener about a new connection. This + * function is executed on the *eager*'s squeue once the 3 way handshake + * has completed. Note that the behavior differs from STREAMS, where the + * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s + * squeue. 
+ * + * Returns B_TRUE if the notification succeeded, in which case `tcp' will + * be moved over to the ESTABLISHED list (q) of the listener. Otherwise, + * B_FALSE is returned and `tcp' is killed. + */ +boolean_t +tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) +{ + tcp_t *listener = tcp->tcp_listener; + conn_t *lconnp = listener->tcp_connp; + conn_t *econnp = tcp->tcp_connp; + tcp_t *tail; + ipaddr_t *addr_cache; + sock_upper_handle_t upper; + struct sock_proto_props sopp; + mblk_t *mp; + + mutex_enter(&listener->tcp_eager_lock); + /* + * Take the eager out, if it is in the list of droppable eagers + * as we are here because the 3W handshake is over. + */ + MAKE_UNDROPPABLE(tcp); + /* + * The eager already has an extra ref put in tcp_input_data + * so that it stays till accept comes back even though it + * might get into TCPS_CLOSED as a result of a TH_RST etc. + */ + ASSERT(listener->tcp_conn_req_cnt_q0 > 0); + listener->tcp_conn_req_cnt_q0--; + listener->tcp_conn_req_cnt_q++; + + /* Move from SYN_RCVD to ESTABLISHED list */ + tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; + tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; + tcp->tcp_eager_prev_q0 = NULL; + tcp->tcp_eager_next_q0 = NULL; + + /* + * Insert at end of the queue because connections are accepted + * in chronological order. Leaving the older connections at front + * of the queue helps reducing search time.
+ */ + tail = listener->tcp_eager_last_q; + if (tail != NULL) + tail->tcp_eager_next_q = tcp; + else + listener->tcp_eager_next_q = tcp; + listener->tcp_eager_last_q = tcp; + tcp->tcp_eager_next_q = NULL; + + /* we have timed out before */ + if (tcp->tcp_syn_rcvd_timeout != 0) { + tcp->tcp_syn_rcvd_timeout = 0; + listener->tcp_syn_rcvd_timeout--; + if (listener->tcp_syn_defense && + listener->tcp_syn_rcvd_timeout <= + (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && + 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - + listener->tcp_last_rcv_lbolt)) { + /* + * Turn off the defense mode if we + * believe the SYN attack is over. + */ + listener->tcp_syn_defense = B_FALSE; + if (listener->tcp_ip_addr_cache) { + kmem_free((void *)listener->tcp_ip_addr_cache, + IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); + listener->tcp_ip_addr_cache = NULL; + } + } + } + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL) { + /* + * We have finished a 3-way handshake with this + * remote host. This proves the IP addr is good. + * Cache it! + */ + addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = + tcp->tcp_connp->conn_faddr_v4; + } + mutex_exit(&listener->tcp_eager_lock); + + /* + * Notify the ULP about the newconn. It is guaranteed that no + * tcp_accept() call will be made for the eager if the + * notification fails. + */ + if ((upper = (*lconnp->conn_upcalls->su_newconn) + (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, + &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, + &econnp->conn_upcalls)) == NULL) { + /* + * Normally this should not happen, but the listener might + * have done a fallback to TPI followed by a close(), in + * which case tcp_closemp for this conn might have been + * used by tcp_eager_cleanup(). 
+ */ + mutex_enter(&listener->tcp_eager_lock); + if (tcp->tcp_closemp_used) { + mutex_exit(&listener->tcp_eager_lock); + return (B_FALSE); + } + tcp->tcp_closemp_used = B_TRUE; + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + mp = &tcp->tcp_closemp; + mutex_exit(&listener->tcp_eager_lock); + tcp_eager_kill(econnp, mp, NULL, NULL); + return (B_FALSE); + } + econnp->conn_upper_handle = upper; + + tcp->tcp_detached = B_FALSE; + tcp->tcp_hard_binding = B_FALSE; + tcp->tcp_tconnind_started = B_TRUE; + + if (econnp->conn_keepalive) { + tcp->tcp_ka_last_intrvl = 0; + tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, + tcp->tcp_ka_interval); + } + + /* Update the necessary parameters */ + tcp_get_proto_props(tcp, &sopp); + + (*econnp->conn_upcalls->su_set_proto_props) + (econnp->conn_upper_handle, &sopp); + + return (B_TRUE); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_tpi.c b/usr/src/uts/common/inet/tcp/tcp_tpi.c index bcaa1595ec..8c645425b7 100644 --- a/usr/src/uts/common/inet/tcp/tcp_tpi.c +++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ /* This files contains all TCP TLI/TPI related functions */ @@ -47,7 +46,6 @@ static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); -static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); void tcp_use_pure_tpi(tcp_t *tcp) @@ -823,7 +821,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) /* TODO: Default ETSDU is 1. Is that correct for tcp? */ } -static void +void tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, t_uscalar_t cap_bits1) { @@ -949,148 +947,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) putnext(tcp->tcp_connp->conn_rq, ackmp); } -/* - * tcp_fallback - * - * A direct socket is falling back to using STREAMS. 
The queue - * that is being passed down was created using tcp_open() with - * the SO_FALLBACK flag set. As a result, the queue is not - * associated with a conn, and the q_ptrs instead contain the - * dev and minor area that should be used. - * - * The 'issocket' flag indicates whether the FireEngine - * optimizations should be used. The common case would be that - * optimizations are enabled, and they might be subsequently - * disabled using the _SIOCSOCKFALLBACK ioctl. - */ - -/* - * An active connection is falling back to TPI. Gather all the information - * required by the STREAM head and TPI sonode and send it up. - */ -void -tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, - boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb) -{ - conn_t *connp = tcp->tcp_connp; - struct stroptions *stropt; - struct T_capability_ack tca; - struct sockaddr_in6 laddr, faddr; - socklen_t laddrlen, faddrlen; - short opts; - int error; - mblk_t *mp; - - connp->conn_dev = (dev_t)RD(q)->q_ptr; - connp->conn_minor_arena = WR(q)->q_ptr; - - RD(q)->q_ptr = WR(q)->q_ptr = connp; - - connp->conn_rq = RD(q); - connp->conn_wq = WR(q); - - WR(q)->q_qinfo = &tcp_sock_winit; - - if (!issocket) - tcp_use_pure_tpi(tcp); - - /* - * free the helper stream - */ - ip_free_helper_stream(connp); - - /* - * Notify the STREAM head about options - */ - DB_TYPE(stropt_mp) = M_SETOPTS; - stropt = (struct stroptions *)stropt_mp->b_rptr; - stropt_mp->b_wptr += sizeof (struct stroptions); - stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - - stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 
0 : - tcp->tcp_tcps->tcps_wroff_xtra); - if (tcp->tcp_snd_sack_ok) - stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = connp->conn_rcvbuf; - stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - - putnext(RD(q), stropt_mp); - - /* - * Collect the information needed to sync with the sonode - */ - tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); - - laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_getsockname((sock_lower_handle_t)connp, - (struct sockaddr *)&laddr, &laddrlen, CRED()); - error = tcp_getpeername((sock_lower_handle_t)connp, - (struct sockaddr *)&faddr, &faddrlen, CRED()); - if (error != 0) - faddrlen = 0; - - opts = 0; - if (connp->conn_oobinline) - opts |= SO_OOBINLINE; - if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) - opts |= SO_DONTROUTE; - - /* - * Notify the socket that the protocol is now quiescent, - * and it's therefore safe move data from the socket - * to the stream head. - */ - (*quiesced_cb)(connp->conn_upper_handle, q, &tca, - (struct sockaddr *)&laddr, laddrlen, - (struct sockaddr *)&faddr, faddrlen, opts); - - while ((mp = tcp->tcp_rcv_list) != NULL) { - tcp->tcp_rcv_list = mp->b_next; - mp->b_next = NULL; - /* We never do fallback for kernel RPC */ - putnext(q, mp); - } - tcp->tcp_rcv_last_head = NULL; - tcp->tcp_rcv_last_tail = NULL; - tcp->tcp_rcv_cnt = 0; -} - -/* - * An eager is falling back to TPI. All we have to do is send - * up a T_CONN_IND. - */ -void -tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) -{ - tcp_t *listener = eager->tcp_listener; - mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind; - - ASSERT(listener != NULL); - ASSERT(mp != NULL); - - eager->tcp_conn.tcp_eager_conn_ind = NULL; - - /* - * TLI/XTI applications will get confused by - * sending eager as an option since it violates - * the option semantics. So remove the eager as - * option since TLI/XTI app doesn't need it anyway. 
- */ - if (!direct_sockfs) { - struct T_conn_ind *conn_ind; - - conn_ind = (struct T_conn_ind *)mp->b_rptr; - conn_ind->OPT_length = 0; - conn_ind->OPT_offset = 0; - } - - /* - * Sockfs guarantees that the listener will not be closed - * during fallback. So we can safely use the listener's queue. - */ - putnext(listener->tcp_connp->conn_rq, mp); -} - /* * Swap information between the eager and acceptor for a TLI/XTI client. * The sockfs accept is done on the acceptor stream and control goes @@ -1184,6 +1040,191 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) CONN_DEC_REF(aconnp); } +/* + * This runs at the tail end of accept processing on the squeue of the + * new connection. + */ +/* ARGSUSED */ +static void +tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_rq; + tcp_stack_t *tcps = tcp->tcp_tcps; + struct stroptions *stropt; + struct sock_proto_props sopp; + + /* Should never be called for non-STREAMS sockets */ + ASSERT(!IPCL_IS_NONSTR(connp)); + + /* We should just receive a single mblk that fits a T_discon_ind */ + ASSERT(mp->b_cont == NULL); + + /* + * Drop the eager's ref on the listener, that was placed when + * this eager began life in tcp_input_listener. + */ + CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); + + tcp->tcp_detached = B_FALSE; + + if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { + /* + * Someone blew off the eager before we could finish + * the accept. + * + * The only reason eager exists is because we put in + * a ref on it when conn ind went up. We need to send + * a disconnect indication up while the last reference + * on the eager will be dropped by the squeue when we + * return.
+ */ + ASSERT(tcp->tcp_listener == NULL); + if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { + struct T_discon_ind *tdi; + + (void) putnextctl1(q, M_FLUSH, FLUSHRW); + /* + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. We know + * that the size of the incoming mblk i.e. + * stroptions is greater than sizeof + * T_discon_ind. + */ + ASSERT(DB_REF(mp) == 1); + ASSERT(MBLKSIZE(mp) >= + sizeof (struct T_discon_ind)); + + DB_TYPE(mp) = M_PROTO; + ((union T_primitives *)mp->b_rptr)->type = + T_DISCON_IND; + tdi = (struct T_discon_ind *)mp->b_rptr; + if (tcp->tcp_issocket) { + tdi->DISCON_reason = ECONNREFUSED; + tdi->SEQ_number = 0; + } else { + tdi->DISCON_reason = ENOPROTOOPT; + tdi->SEQ_number = + tcp->tcp_conn_req_seqnum; + } + mp->b_wptr = mp->b_rptr + + sizeof (struct T_discon_ind); + putnext(q, mp); + } + tcp->tcp_hard_binding = B_FALSE; + return; + } + + /* + * This is the first time we run on the correct + * queue after tcp_accept. So fix all the q parameters + * here. + * + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. 
We know + * that the size of the incoming mblk is at least + * stroptions + */ + tcp_get_proto_props(tcp, &sopp); + + ASSERT(DB_REF(mp) == 1); + ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); + + DB_TYPE(mp) = M_SETOPTS; + stropt = (struct stroptions *)mp->b_rptr; + mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); + stropt = (struct stroptions *)mp->b_rptr; + ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK)); + stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; + stropt->so_hiwat = sopp.sopp_rxhiwat; + stropt->so_wroff = sopp.sopp_wroff; + stropt->so_maxblk = sopp.sopp_maxblk; + + if (sopp.sopp_flags & SOCKOPT_TAIL) { + ASSERT(tcp->tcp_kssl_ctx != NULL); + + stropt->so_flags |= SO_TAIL | SO_COPYOPT; + stropt->so_tail = sopp.sopp_tail; + stropt->so_copyopt = sopp.sopp_zcopyflag; + } + + /* Send the options up */ + putnext(q, mp); + + /* + * Pass up any data and/or a fin that has been received. + * + * Adjust receive window in case it had decreased + * (because there is data <=> tcp_rcv_list != NULL) + * while the connection was detached. Note that + * in case the eager was flow-controlled, w/o this + * code, the rwnd may never open up again! + */ + if (tcp->tcp_rcv_list != NULL) { + /* We drain directly in case of fused tcp loopback */ + + if (!tcp->tcp_fused && canputnext(q)) { + tcp->tcp_rwnd = connp->conn_rcvbuf; + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { + tcp_xmit_ctl(NULL, + tcp, (tcp->tcp_swnd == 0) ? + tcp->tcp_suna : tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + } + } + + (void) tcp_rcv_drain(tcp); + + /* + * For fused tcp loopback, back-enable peer endpoint + * if it's currently flow-controlled. 
+ */ + if (tcp->tcp_fused) { + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + ASSERT(peer_tcp != NULL); + ASSERT(peer_tcp->tcp_fused); + + mutex_enter(&peer_tcp->tcp_non_sq_lock); + if (peer_tcp->tcp_flow_stopped) { + tcp_clrqfull(peer_tcp); + TCP_STAT(tcps, tcp_fusion_backenabled); + } + mutex_exit(&peer_tcp->tcp_non_sq_lock); + } + } + ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { + tcp->tcp_ordrel_done = B_TRUE; + mp = tcp->tcp_ordrel_mp; + tcp->tcp_ordrel_mp = NULL; + putnext(q, mp); + } + tcp->tcp_hard_binding = B_FALSE; + + if (connp->conn_keepalive) { + tcp->tcp_ka_last_intrvl = 0; + tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, + tcp->tcp_ka_interval); + } + + /* + * At this point, eager is fully established and will + * have the following references - + * + * 2 references for connection to exist (1 for TCP and 1 for IP). + * 1 reference for the squeue which will be dropped by the squeue as + * soon as this function returns. + * There will be 1 additional reference for being in classifier + * hash list provided something bad hasn't happened. + */ + ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || + (connp->conn_fanout == NULL && connp->conn_ref >= 3)); +} + + /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES @@ -1643,6 +1684,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) tcp_t *listener; struct T_ok_ack *ok; t_scalar_t PRIM_type; + mblk_t *discon_mp; conn_t *econnp; cred_t *cr; @@ -1703,14 +1745,120 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) q->q_qinfo = &tcp_winit; listener = eager->tcp_listener; - if (tcp_accept_common(listener->tcp_connp, - econnp, cr) < 0) { + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed.
+ */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); + + if (discon_mp == NULL) { mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); if (mp != NULL) putnext(rq, mp); return; } + eager->tcp_issocket = B_TRUE; + + ASSERT(econnp->conn_netstack == + listener->tcp_connp->conn_netstack); + ASSERT(eager->tcp_tcps == listener->tcp_tcps); + + /* Put the ref for IP */ + CONN_INC_REF(econnp); + + /* + * We should have a minimum of 3 references on the conn + * at this point. One each for TCP and IP and one for + * the T_conn_ind that was sent up when the 3-way handshake + * completed. In the normal case we would also have another + * reference (making a total of 4) for the conn being in the + * classifier hash list. However the eager could have received + * an RST subsequently and tcp_closei_local could have removed + * the eager from the classifier hash list, hence we can't + * assert that reference. + */ + ASSERT(econnp->conn_ref >= 3); + + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { + + tcp_t *tail; + tcp_t *tcp; + mblk_t *mp1; + + tcp = listener->tcp_eager_prev_q0; + /* + * listener->tcp_eager_prev_q0 points to the TAIL of the + * deferred T_conn_ind queue. We need to get to the head + * of the queue in order to send up T_conn_ind in the same + * order as the 3WHS completed.
+ */ + while (tcp != listener) { + if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && + !tcp->tcp_kssl_pending) + break; + else + tcp = tcp->tcp_eager_prev_q0; + } + /* None of the pending eagers can be sent up now */ + if (tcp == listener) + goto no_more_eagers; + + mp1 = tcp->tcp_conn.tcp_eager_conn_ind; + tcp->tcp_conn.tcp_eager_conn_ind = NULL; + /* Move from q0 to q */ + ASSERT(listener->tcp_conn_req_cnt_q0 > 0); + listener->tcp_conn_req_cnt_q0--; + listener->tcp_conn_req_cnt_q++; + tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = + tcp->tcp_eager_prev_q0; + tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = + tcp->tcp_eager_next_q0; + tcp->tcp_eager_prev_q0 = NULL; + tcp->tcp_eager_next_q0 = NULL; + tcp->tcp_conn_def_q0 = B_FALSE; + + /* Make sure the tcp isn't in the list of droppables */ + ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL); + + /* + * Insert at end of the queue because sockfs sends + * down T_CONN_RES in chronological order. Leaving + * the older conn indications at front of the queue + * helps reduce search time. + */ + tail = listener->tcp_eager_last_q; + if (tail != NULL) { + tail->tcp_eager_next_q = tcp; + } else { + listener->tcp_eager_next_q = tcp; + } + listener->tcp_eager_last_q = tcp; + tcp->tcp_eager_next_q = NULL; + + /* Need to get inside the listener perimeter */ + CONN_INC_REF(listener->tcp_connp); + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, + tcp_send_pending, listener->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_SEND_PENDING); + } +no_more_eagers: + tcp_eager_unlink(eager); + mutex_exit(&listener->tcp_eager_lock); + + /* + * At this point, the eager is detached from the listener + * but we still have an extra ref on eager (apart from the + * usual tcp references). The ref was placed in tcp_input_data + * before sending the conn_ind in tcp_send_conn_ind. + * The ref will be dropped in tcp_accept_finish().
+ */ + SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, + econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); + /* * Send the new local address also up to sockfs. There * should already be enough space in the mp that came @@ -1760,50 +1908,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) } } -/* - * Send the newconn notification to ulp. The eager is blown off if the - * notification fails. - */ -static void -tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) -{ - if (IPCL_IS_NONSTR(lconnp)) { - cred_t *cr; - pid_t cpid = NOPID; - - ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); - ASSERT(econnp->conn_tcp->tcp_saved_listener == - lconnp->conn_tcp); - - cr = msg_getcred(mp, &cpid); - - /* Keep the message around in case of a fallback to TPI */ - econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; - /* - * Notify the ULP about the newconn. It is guaranteed that no - * tcp_accept() call will be made for the eager if the - * notification fails, so it's safe to blow it off in that - * case. - * - * The upper handle will be assigned when tcp_accept() is - * called. - */ - if ((*lconnp->conn_upcalls->su_newconn) - (lconnp->conn_upper_handle, - (sock_lower_handle_t)econnp, - &sock_tcp_downcalls, cr, cpid, - &econnp->conn_upcalls) == NULL) { - /* Failed to allocate a socket */ - TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps, - tcpEstabResets); - (void) tcp_eager_blowoff(lconnp->conn_tcp, - econnp->conn_tcp->tcp_conn_req_seqnum); - } - } else { - putnext(lconnp->conn_rq, mp); - } -} - /* * The function called through squeue to get behind listener's perimeter to * send a deferred conn_ind. 
@@ -1831,7 +1935,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) return; } - tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); + putnext(lconnp->conn_rq, mp); } /* @@ -1989,5 +2093,5 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) } mutex_exit(&listener->tcp_eager_lock); if (need_send_conn_ind) - tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); + putnext(lconnp->conn_rq, mp); } diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c index 0d6fc8acc8..1984580efa 100644 --- a/usr/src/uts/common/inet/tcp/tcpddi.c +++ b/usr/src/uts/common/inet/tcp/tcpddi.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -41,6 +40,8 @@ #define INET_SOCKDESC "TCP socket module" #define INET_SOCK_PROTO_CREATE_FUNC (*tcp_create) #define INET_SOCK_PROTO_FB_FUNC (*tcp_fallback) +#define INET_SOCK_FALLBACK_DEV_V4 "/dev/tcp" +#define INET_SOCK_FALLBACK_DEV_V6 "/dev/tcp6" #define INET_DEVMINOR 0 #define INET_MODMTFLAGS D_MP #define INET_DEVMTFLAGS (D_MP|_D_DIRECT) diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 46b12b27f0..5b0dfc6c3b 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -515,8 +515,6 @@ extern uint_t tcp_free_list_max_cnt; /* * Functions in tcp.c. 
*/ -extern int tcp_accept_common(conn_t *, conn_t *, cred_t *); -extern void tcp_accept_finish(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *); extern tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *); extern void tcp_acceptor_hash_remove(tcp_t *); @@ -565,6 +563,7 @@ extern void tcp_update_pmtu(tcp_t *, boolean_t); extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); extern boolean_t tcp_zcopy_check(tcp_t *); extern void tcp_zcopy_notify(tcp_t *); +extern void tcp_get_proto_props(tcp_t *, struct sock_proto_props *); /* * Bind related functions in tcp_bind.c @@ -630,8 +629,9 @@ extern boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, /* * Kernel socket related functions in tcp_socket.c. */ -extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, - so_proto_quiesced_cb_t); +extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, + so_proto_quiesced_cb_t, sock_quiesce_arg_t *); +extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *); /* * Timer related functions in tcp_timers.c. 
@@ -657,9 +657,6 @@ extern boolean_t tcp_conn_con(tcp_t *, uchar_t *, mblk_t *, mblk_t **, ip_recv_attr_t *); extern void tcp_err_ack(tcp_t *, mblk_t *, int, int); extern void tcp_err_ack_prim(tcp_t *, mblk_t *, int, int, int); -extern void tcp_fallback_eager(tcp_t *, boolean_t); -extern void tcp_fallback_noneager(tcp_t *, mblk_t *, queue_t *, - boolean_t, so_proto_quiesced_cb_t); extern void tcp_info_req(tcp_t *, mblk_t *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); extern void tcp_send_pending(void *, mblk_t *, void *, ip_recv_attr_t *); @@ -674,6 +671,8 @@ extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, extern void tcp_tpi_unbind(tcp_t *, mblk_t *); extern void tcp_tli_accept(tcp_t *, mblk_t *); extern void tcp_use_pure_tpi(tcp_t *); +extern void tcp_do_capability_ack(tcp_t *, struct T_capability_ack *, + t_uscalar_t); /* * TCP option processing related functions in tcp_opt_data.c diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index adcd0652d7..d3f6f0dc7e 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -6498,7 +6498,8 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, int udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, - boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb) + boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, + sock_quiesce_arg_t *arg) { conn_t *connp = (conn_t *)proto_handle; udp_t *udp; @@ -6507,7 +6508,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, socklen_t laddrlen, faddrlen; short opts; struct stroptions *stropt; - mblk_t *stropt_mp; + mblk_t *mp, *stropt_mp; int error; udp = connp->conn_udp; @@ -6563,17 +6564,21 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; - (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, (struct 
sockaddr *)&laddr, laddrlen, (struct sockaddr *)&faddr, faddrlen, opts); mutex_enter(&udp->udp_recv_lock); /* * Attempts to send data up during fallback will result in it being - * queued in udp_t. Now we push up any queued packets. + * queued in udp_t. First push up the datagrams obtained from the + * socket, then any packets queued in udp_t. */ + if (mp != NULL) { + mp->b_next = udp->udp_fallback_queue_head; + udp->udp_fallback_queue_head = mp; + } while (udp->udp_fallback_queue_head != NULL) { - mblk_t *mp; mp = udp->udp_fallback_queue_head; udp->udp_fallback_queue_head = mp->b_next; mutex_exit(&udp->udp_recv_lock); @@ -6598,7 +6603,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* ARGSUSED3 */ int -udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, +udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c index 144af2192f..6d1b110cec 100644 --- a/usr/src/uts/common/inet/udp/udpddi.c +++ b/usr/src/uts/common/inet/udp/udpddi.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -43,6 +42,8 @@ #define INET_SOCKDESC "UDP socket module" #define INET_SOCK_PROTO_CREATE_FUNC (*udp_create) #define INET_SOCK_PROTO_FB_FUNC (*udp_fallback) +#define INET_SOCK_FALLBACK_DEV_V4 "/dev/udp" +#define INET_SOCK_FALLBACK_DEV_V6 "/dev/udp6" #define INET_DEVMTFLAGS (D_MP|_D_DIRECT) #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 11ca9f9810..4fbcbb5323 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _UDP_IMPL_H @@ -227,7 +226,7 @@ extern uint_t udp_max_optsize; extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t, - so_proto_quiesced_cb_t); + so_proto_quiesced_cb_t, sock_quiesce_arg_t *); extern sock_downcalls_t sock_udp_downcalls; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 561188a388..4100f049d7 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -166,7 +165,7 @@ ksocket_accept(ksocket_t ks, struct sockaddr *addr, } int -ksocket_connect(ksocket_t ks, const struct sockaddr *addr, socklen_t addrlen, +ksocket_connect(ksocket_t ks, struct sockaddr *addr, socklen_t addrlen, struct cred *cr) { /* All Solaris components should pass a cred for this operation. 
*/ diff --git a/usr/src/uts/common/io/sock_conf.c b/usr/src/uts/common/io/sock_conf.c index b6d31de8ea..964175b6cd 100644 --- a/usr/src/uts/common/io/sock_conf.c +++ b/usr/src/uts/common/io/sock_conf.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -127,6 +126,10 @@ smod_register(const smod_reg_t *reg) if (reg->__smod_priv != NULL) { smodp->smod_proto_fallback_func = reg->__smod_priv->smodp_proto_fallback_func; + smodp->smod_fallback_devpath_v4 = + reg->__smod_priv->smodp_fallback_devpath_v4; + smodp->smod_fallback_devpath_v6 = + reg->__smod_priv->smodp_fallback_devpath_v6; } } smod_add(smodp); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 70160e318d..b956756758 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -21,8 +21,7 @@ /* ONC_PLUS EXTRACT START */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -731,7 +730,7 @@ struct sysent sysent[NSYSCALL] = /* 244 */ SYSENT_CI("getsockname", getsockname, 4), /* 245 */ SYSENT_CI("getsockopt", getsockopt, 6), /* 246 */ SYSENT_CI("setsockopt", setsockopt, 6), - /* 247 */ SYSENT_CI("sockconfig", sockconfig, 4), + /* 247 */ SYSENT_CI("sockconfig", sockconfig, 5), /* 248 */ SYSENT_CI("ntp_gettime", ntp_gettime, 1), /* 249 */ SYSENT_CI("ntp_adjtime", ntp_adjtime, 1), /* 250 */ SYSENT_CI("lwp_mutex_unlock", lwp_mutex_unlock, 1), @@ -1057,7 +1056,7 @@ struct sysent sysent32[NSYSCALL] = /* 244 */ SYSENT_CI("getsockname", getsockname, 4), /* 245 */ SYSENT_CI("getsockopt", getsockopt, 6), /* 246 */ SYSENT_CI("setsockopt", setsockopt, 6), - /* 247 */ SYSENT_CI("sockconfig", sockconfig, 4), + /* 247 */ SYSENT_CI("sockconfig", sockconfig, 5), /* 248 */ SYSENT_CI("ntp_gettime", ntp_gettime, 1), /* 249 */ SYSENT_CI("ntp_adjtime", ntp_adjtime, 1), /* 250 */ SYSENT_CI("lwp_mutex_unlock", lwp_mutex_unlock, 1), diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index acebf3dc1e..8d56de5adc 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -506,6 +506,7 @@ CHKHDRS= \ socket_impl.h \ socket_proto.h \ socketvar.h \ + sockfilter.h \ sockio.h \ soundcard.h \ squeue.h \ diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index fb834b027f..df15b12c08 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _SYS_KSOCKET_H_ @@ -88,7 +87,7 @@ extern int ksocket_bind(ksocket_t, struct sockaddr *, socklen_t, extern int ksocket_listen(ksocket_t, int, struct cred *); extern int ksocket_accept(ksocket_t, struct sockaddr *, socklen_t *, ksocket_t *, struct cred *); -extern int ksocket_connect(ksocket_t, const struct sockaddr *, socklen_t, +extern int ksocket_connect(ksocket_t, struct sockaddr *, socklen_t, struct cred *); extern int ksocket_send(ksocket_t, void *, size_t, int, size_t *, struct cred *); diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 803f7a07b0..5d4648234c 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -185,6 +184,27 @@ struct so_snd_bufinfo { #define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */ #endif /* _KERNEL */ +/* + * Socket filter options + */ +#define FIL_ATTACH 0x1 /* attach filter */ +#define FIL_DETACH 0x2 /* detach filter */ +#define FIL_LIST 0x3 /* list attached filters */ + +#define FILNAME_MAX 32 +/* + * Structure returned by FIL_LIST + */ +struct fil_info { + int fi_flags; /* see below (FILF_*) */ + int fi_pos; /* position (0 is bottom) */ + char fi_name[FILNAME_MAX]; /* filter name */ +}; + +#define FILF_PROG 0x1 /* programmatic attach */ +#define FILF_AUTO 0x2 /* automatic attach */ +#define FILF_BYPASS 0x4 /* filter is not active */ + #ifdef _KERNEL /* * new socket open flags to identify socket and acceptor streams @@ -199,13 +219,6 @@ struct so_snd_bufinfo { #define SOCKET_SLEEP KM_SLEEP #define SOCKET_NOSLEEP KM_NOSLEEP - -/* - * flags used by sockfs when falling back to tpi socket - */ -#define SO_FB_START 0x1 -#define SO_FB_FINISH 0x2 - #endif /* _KERNEL 
*/ /* @@ -224,6 +237,7 @@ struct linger { #define SOL_ROUTE 0xfffe /* options for routing socket level */ #endif #define SOL_PACKET 0xfffd /* options for packet level */ +#define SOL_FILTER 0xfffc /* options for socket filter level */ /* * Address families. diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h index 56e312930b..6bc968be1b 100644 --- a/usr/src/uts/common/sys/socket_proto.h +++ b/usr/src/uts/common/sys/socket_proto.h @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SOCKET_PROTO_H_ @@ -128,11 +127,15 @@ struct sock_downcalls_s { typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); -typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *, - struct T_capability_ack *, struct sockaddr *, socklen_t, - struct sockaddr *, socklen_t, short); +typedef struct sock_quiesce_arg { + mblk_t *soqa_exdata_mp; + mblk_t *soqa_urgmark_mp; +} sock_quiesce_arg_t; +typedef mblk_t *(*so_proto_quiesced_cb_t)(sock_upper_handle_t, + sock_quiesce_arg_t *, struct T_capability_ack *, struct sockaddr *, + socklen_t, struct sockaddr *, socklen_t, short); typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *, - boolean_t, so_proto_quiesced_cb_t); + boolean_t, so_proto_quiesced_cb_t, sock_quiesce_arg_t *); /* * These functions return EOPNOTSUPP and are intended for the sockfs @@ -196,6 +199,7 @@ struct sock_upcalls_s { void (*su_signal_oob)(sock_upper_handle_t, ssize_t); void (*su_zcopy_notify)(sock_upper_handle_t); void (*su_set_error)(sock_upper_handle_t, int); + void (*su_closed)(sock_upper_handle_t); }; #define SOCK_UC_VERSION sizeof (sock_upcalls_t) diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 
268adc6103..75b1626bcb 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -162,12 +162,13 @@ struct sonode { /* Accept queue */ kmutex_t so_acceptq_lock; /* protects accept queue */ - struct sonode *so_acceptq_next; /* acceptq list node */ - struct sonode *so_acceptq_head; - struct sonode **so_acceptq_tail; - unsigned int so_acceptq_len; + list_t so_acceptq_list; /* pending conns */ + list_t so_acceptq_defer; /* deferred conns */ + list_node_t so_acceptq_node; /* acceptq list node */ + unsigned int so_acceptq_len; /* # of conns (both lists) */ unsigned int so_backlog; /* Listen backlog */ kcondvar_t so_acceptq_cv; /* wait for new conn. */ + struct sonode *so_listener; /* parent socket */ /* Options */ short so_options; /* From socket call, see socket.h */ @@ -233,6 +234,13 @@ struct sonode { /* != NULL for sodirect enabled socket */ struct sodirect_s *so_direct; + + /* socket filters */ + uint_t so_filter_active; /* # of active fil */ + uint_t so_filter_tx; /* pending tx ops */ + struct sof_instance *so_filter_top; /* top of stack */ + struct sof_instance *so_filter_bottom; /* bottom of stack */ + clock_t so_filter_defertime; /* time when deferred */ }; #define SO_HAVE_DATA(so) \ @@ -288,10 +296,10 @@ struct sonode { #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ #define SS_CLOSING 0x00010000 /* in process of closing */ -/* unused 0x00020000 */ /* was SS_FADDR_NOXLATE */ -/* unused 0x00040000 */ /* was SS_HASDATA */ -/* unused 0x00080000 */ /* was SS_DONEREAD */ -/* unused 0x00100000 */ /* was SS_MOREDATA */ +#define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ +#define SS_FILOP_OK 0x00040000 /* socket can attach filters */ +#define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ +#define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ /* unused 0x00200000 */ /* was SS_DIRECT */ #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ @@ -312,19 +320,27 @@ 
struct sonode { * Sockets that can fall back to TPI must ensure that fall back is not * initiated while a thread is using a socket. */ -#define SO_BLOCK_FALLBACK(so, fn) { \ - ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ - rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ - if ((so)->so_state & SS_FALLBACK_COMP) { \ - rw_exit(&(so)->so_fallback_rwlock); \ - return (fn); \ - } \ -} +#define SO_BLOCK_FALLBACK(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & (SS_FALLBACK_COMP|SS_FILOP_OK)) { \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state &= ~SS_FILOP_OK; \ + mutex_exit(&(so)->so_lock); \ + } \ + } #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } +#define SO_SND_FLOWCTRLD(so) \ + ((so)->so_snd_qfull || (so)->so_state & SS_FIL_SND_FLOWCTRL) + /* Poll events */ #define SO_POLLEV_IN 0x1 /* POLLIN wakeup needed */ #define SO_POLLEV_ALWAYS 0x2 /* wakeups */ @@ -375,7 +391,9 @@ typedef struct sdev_info { vnode_t *sd_vnode; } sdev_info_t; -#define SOCKMOD_VERSION 1 +#define SOCKMOD_VERSION_1 1 +#define SOCKMOD_VERSION 2 + /* name of the TPI pseudo socket module */ #define SOTPI_SMOD_NAME "socktpi" @@ -383,6 +401,8 @@ typedef struct __smod_priv_s { so_create_func_t smodp_sock_create_func; so_destroy_func_t smodp_sock_destroy_func; so_proto_fallback_func_t smodp_proto_fallback_func; + const char *smodp_fallback_devpath_v4; + const char *smodp_fallback_devpath_v6; } __smod_priv_t; /* @@ -410,6 +430,8 @@ typedef struct smod_info { size_t smod_dc_version; /* down call version */ so_proto_create_func_t smod_proto_create_func; so_proto_fallback_func_t smod_proto_fallback_func; + const char *smod_fallback_devpath_v4; + const char *smod_fallback_devpath_v6; so_create_func_t smod_sock_create_func; so_destroy_func_t smod_sock_destroy_func; list_node_t smod_node; @@ 
-448,12 +470,22 @@ struct sockparams { /* * The entries below are only modified while holding - * splist_lock as a writer. + * sockconf_lock as a writer. */ int sp_flags; /* see below */ list_node_t sp_node; + + list_t sp_auto_filters; /* list of automatic filters */ + list_t sp_prog_filters; /* list of programmatic filters */ }; +struct sof_entry; + +typedef struct sp_filter { + struct sof_entry *spf_filter; + list_node_t spf_node; +} sp_filter_t; + /* * sockparams flags @@ -467,6 +499,14 @@ extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int, const char *, int, int *); extern void sockparams_ephemeral_drop_last_ref(struct sockparams *); +extern struct sockparams *sockparams_create(int, int, int, char *, char *, int, + int, int, int *); +extern void sockparams_destroy(struct sockparams *); +extern int sockparams_add(struct sockparams *); +extern int sockparams_delete(int, int, int); +extern int sockparams_new_filter(struct sof_entry *); +extern void sockparams_filter_cleanup(struct sof_entry *); + extern void smod_init(void); extern void smod_add(smod_info_t *); extern int smod_register(const smod_reg_t *); @@ -614,7 +654,7 @@ struct sonodeops { int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t, int, cred_t *); int (*sop_listen)(struct sonode *, int, cred_t *); - int (*sop_connect)(struct sonode *, const struct sockaddr *, + int (*sop_connect)(struct sonode *, struct sockaddr *, socklen_t, int, int, cred_t *); int (*sop_recvmsg)(struct sonode *, struct msghdr *, struct uio *, cred_t *); @@ -833,6 +873,8 @@ extern const struct fs_operation_def socket_vnodeops_template[]; extern dev_t sockdev; +extern krwlock_t sockconf_lock; + /* * sockfs functions */ @@ -842,7 +884,6 @@ extern int sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *, uchar_t, int, int); extern int sogetvp(char *, vnode_t **, int); extern int sockinit(int, char *); -extern int soconfig(int, int, int, char *, int, char *); extern int solookup(int, int, int, struct 
sockparams **); extern void so_lock_single(struct sonode *); extern void so_unlock_single(struct sonode *, int); @@ -885,7 +926,7 @@ extern int soaccept(struct sonode *, int, struct sonode **); extern int sobind(struct sonode *, struct sockaddr *, socklen_t, int, int); extern int solisten(struct sonode *, int); -extern int soconnect(struct sonode *, const struct sockaddr *, socklen_t, +extern int soconnect(struct sonode *, struct sockaddr *, socklen_t, int, int); extern int sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *); extern int sosendmsg(struct sonode *, struct nmsghdr *, struct uio *); @@ -927,6 +968,70 @@ struct sockinfo { zoneid_t si_szoneid; }; +/* + * Subcodes for sockconfig() system call + */ +#define SOCKCONFIG_ADD_SOCK 0 +#define SOCKCONFIG_REMOVE_SOCK 1 +#define SOCKCONFIG_ADD_FILTER 2 +#define SOCKCONFIG_REMOVE_FILTER 3 + +/* + * Data structures for configuring socket filters. + */ + +/* + * Placement hint for automatic filters + */ +typedef enum { + SOF_HINT_NONE, + SOF_HINT_TOP, + SOF_HINT_BOTTOM, + SOF_HINT_BEFORE, + SOF_HINT_AFTER +} sof_hint_t; + +/* + * Socket tuple. Used by sockconfig_filter_props to list socket + * types of interest. + */ +typedef struct sof_socktuple { + int sofst_family; + int sofst_type; + int sofst_protocol; +} sof_socktuple_t; + +/* + * Socket filter properties used by sockconfig() system call.
+ */ +struct sockconfig_filter_props { + char *sfp_modname; + boolean_t sfp_autoattach; + sof_hint_t sfp_hint; + char *sfp_hintarg; + uint_t sfp_socktuple_cnt; + sof_socktuple_t *sfp_socktuple; +}; + +#ifdef _SYSCALL32 + +typedef struct sof_socktuple32 { + int32_t sofst_family; + int32_t sofst_type; + int32_t sofst_protocol; +} sof_socktuple32_t; + +struct sockconfig_filter_props32 { + caddr32_t sfp_modname; + boolean_t sfp_autoattach; + sof_hint_t sfp_hint; + caddr32_t sfp_hintarg; + uint32_t sfp_socktuple_cnt; + caddr32_t sfp_socktuple; +}; + +#endif /* _SYSCALL32 */ + #define SOCKMOD_PATH "socketmod" /* dir where sockmods are stored */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h new file mode 100644 index 0000000000..9f6d8b499b --- /dev/null +++ b/usr/src/uts/common/sys/sockfilter.h @@ -0,0 +1,151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_SOCKFILTER_H +#define _SYS_SOCKFILTER_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Opaque socket filter handle + */ +typedef struct __sof_handle *sof_handle_t; + +/* + * Return values for callback functions. + * + * A - Attach (passive/active) only + * P - Passive attach only + */ +typedef enum { + SOF_RVAL_DEFER = -3, /* defer notification (P) */ + SOF_RVAL_DETACH = -2, /* detach filter, continue proc. (A) */ + SOF_RVAL_CONTINUE = -1, /* continue processing */ + SOF_RVAL_RETURN = 0, /* stop proc, does not return error */ + SOF_RVAL_EINVAL = EINVAL, /* stop proc., returns error */ + SOF_RVAL_EACCES = EACCES, /* stop proc., returns error */ + SOF_RVAL_ENOMEM = ENOMEM, /* stop proc., returns error */ + SOF_RVAL_ECONNABORTED = ECONNABORTED /* stop proc, returns error */ +} sof_rval_t; + +/* + * Events generated by the sofop_notify callback. + */ +typedef enum { /* socket ... */ + SOF_EV_CLOSING, /* ... is closing */ + SOF_EV_CONNECTED, /* ... is connected */ + SOF_EV_CONNECTFAILED, /* ... failed to connect */ + SOF_EV_DISCONNECTED, /* ... was disconnected */ + SOF_EV_CANTRECVMORE, /* ... cannot receive any more data */ + SOF_EV_CANTSENDMORE, /* ... cannot send any more data */ + SOF_EV_INJECT_DATA_IN_OK, /* ... has cleared rcv flow ctrl */ + SOF_EV_INJECT_DATA_OUT_OK, /* ... 
has cleared snd flow ctrl */ +} sof_event_t; + +/* Filter callbacks */ +typedef sof_rval_t (*sof_attach_active_fn_t)(sof_handle_t, int, int, int, + cred_t *, void **); +typedef sof_rval_t (*sof_attach_passive_fn_t)(sof_handle_t, sof_handle_t, + void *, struct sockaddr *, socklen_t, struct sockaddr *, socklen_t, + void **); +typedef void (*sof_detach_fn_t)(sof_handle_t, void *, cred_t *); +typedef mblk_t *(*sof_data_in_fn_t)(sof_handle_t, void *, mblk_t *, + int, size_t *); +typedef mblk_t *(*sof_data_in_proc_fn_t)(sof_handle_t, void *, + mblk_t *, cred_t *, size_t *); +typedef mblk_t *(*sof_data_out_fn_t)(sof_handle_t, void *, mblk_t *, + struct nmsghdr *, cred_t *, sof_rval_t *); +typedef sof_rval_t (*sof_bind_fn_t)(sof_handle_t, void *, + struct sockaddr *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_listen_fn_t)(sof_handle_t, void *, int *, + cred_t *); +typedef sof_rval_t (*sof_accept_fn_t)(sof_handle_t, void *, cred_t *); +typedef sof_rval_t (*sof_connect_fn_t)(sof_handle_t, void *, + struct sockaddr *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_shutdown_fn_t)(sof_handle_t, void *, int *, + cred_t *); +typedef sof_rval_t (*sof_getsockname_fn_t)(sof_handle_t, void *, + struct sockaddr *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_getpeername_fn_t)(sof_handle_t, void *, + struct sockaddr *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_setsockopt_fn_t)(sof_handle_t, void *, + int, int, void *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_getsockopt_fn_t)(sof_handle_t, void *, + int, int, void *, socklen_t *, cred_t *); +typedef sof_rval_t (*sof_ioctl_fn_t)(sof_handle_t, void *, int, intptr_t, + int, int32_t *, cred_t *); +typedef void (*sof_mblk_prop_fn_t)(sof_handle_t, void *, ssize_t *, + ushort_t *, ushort_t *); +typedef void (*sof_notify_fn_t)(sof_handle_t, void *, sof_event_t, + uintptr_t); + +typedef struct sof_ops { + sof_attach_active_fn_t sofop_attach_active; + sof_attach_passive_fn_t sofop_attach_passive; + sof_detach_fn_t 
sofop_detach; + sof_data_in_fn_t sofop_data_in; + sof_data_in_proc_fn_t sofop_data_in_proc; + sof_data_out_fn_t sofop_data_out; + sof_bind_fn_t sofop_bind; + sof_listen_fn_t sofop_listen; + sof_connect_fn_t sofop_connect; + sof_accept_fn_t sofop_accept; + sof_shutdown_fn_t sofop_shutdown; + sof_getsockname_fn_t sofop_getsockname; + sof_getpeername_fn_t sofop_getpeername; + sof_setsockopt_fn_t sofop_setsockopt; + sof_getsockopt_fn_t sofop_getsockopt; + sof_ioctl_fn_t sofop_ioctl; + sof_mblk_prop_fn_t sofop_mblk_prop; + sof_notify_fn_t sofop_notify; +} sof_ops_t; + +#define SOF_VERSION 1 + +extern int sof_register(int, const char *, const sof_ops_t *, int); +extern int sof_unregister(const char *); + +extern void sof_newconn_ready(sof_handle_t); +extern void sof_bypass(sof_handle_t); +extern void *sof_get_cookie(sof_handle_t); +extern void *sof_cas_cookie(sof_handle_t, void *, void *); +extern int sof_inject_data_out(sof_handle_t, mblk_t *, struct nmsghdr *, + boolean_t *); +extern int sof_inject_data_in(sof_handle_t, mblk_t *, size_t, int, + boolean_t *); +extern void sof_rcv_flowctrl(sof_handle_t, boolean_t); +extern void sof_snd_flowctrl(sof_handle_t, boolean_t); +extern boolean_t sof_newconn_move(sof_handle_t, sof_handle_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SOCKFILTER_H */ diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index d279593b0f..29d8c5b564 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include <sys/types.h> @@ -781,8 +780,16 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, size_t iov_len; iov_len = sfv_len; - if (!SOCK_IS_NONSTR(so) && - SOTOTPI(so)->sti_kssl_ctx != NULL) + /* + * Socket filters can limit the mblk + * size, so limit reads to maxblk if + * there are filters present. + */ + if ((!SOCK_IS_NONSTR(so) && + _SOTOTPI(so)->sti_kssl_ctx + != NULL) || + (so->so_filter_active > 0 && + maxblk != INFPSZ)) iov_len = MIN(iov_len, maxblk); aiov.iov_len = iov_len; @@ -928,13 +935,16 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, copyflag = stp != NULL ? stp->sd_copyflag : so->so_proto_props.sopp_zcopyflag; + /* - * For sockets acting as an SSL proxy, we - * need to adjust the size to the maximum - * SSL record size set in the stream head. + * Socket filters can limit the mblk size, + * so limit reads to maxblk if there are + * filters present. */ - if (!SOCK_IS_NONSTR(so) && - _SOTOTPI(so)->sti_kssl_ctx != NULL) + if ((!SOCK_IS_NONSTR(so) && + _SOTOTPI(so)->sti_kssl_ctx != NULL) || + (so->so_filter_active > 0 && + maxblk != INFPSZ)) size = MIN(size, maxblk); if (vn_has_flocks(readvp) || -- cgit v1.2.3